In [336]:
#import libaries
import pandas as pd
import numpy as np
import geopandas
import pycountry
from geopy import Nominatim
import matplotlib.pyplot as plt
import plotly.express as px

### Data Loading and Preprocessing

In [337]:
#import vaccine data into dataframe
df = pd.read_csv('VaccineData.csv')

In [338]:
#change Date col to datetime
df['Date'] = pd.to_datetime(df['Date'])

In [339]:
#review dataframe info (data types, nulls, etc)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48150 entries, 0 to 48149
Data columns (total 16 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   Country                   48150 non-null  object        
 1   Date                      48150 non-null  datetime64[ns]
 2   Vaccine_Manufacturer      48150 non-null  object        
 3   Total_Vaccinations        48150 non-null  int64         
 4   Severe Disease Ancestral  48150 non-null  int64         
 5   Infection Ancestral       48150 non-null  int64         
 6   Severe Disease Alpha      48150 non-null  int64         
 7   Infection Alpha           48150 non-null  int64         
 8   Severe Disease Beta       48150 non-null  int64         
 9   Infection Beta            48150 non-null  int64         
 10  Severe Disease Gamma      48150 non-null  int64         
 11  Infection Gamma           48150 non-null  int64         
 12  Severe Disease Del

In [340]:
#drop European Union rows, since they are unneeded for this analysis
df.drop(df.loc[df['Country'] == 'European Union'].index, inplace=True, axis=0)

In [341]:
#drop columns that are not the alpha, delta, or omicron
df.drop(df.columns[[4,5,8,9,10,11]], axis=1, inplace = True)

In [342]:
#find number of unique vaccines giving by vaccine manufacturer
df.groupby('Country')['Vaccine_Manufacturer'].nunique()

Country
Argentina        6
Austria          6
Belgium          5
Bulgaria         4
Canada           6
Chile            5
Croatia          5
Cyprus           5
Czechia          8
Denmark          4
Ecuador          4
Estonia          5
Finland          5
France           5
Germany          6
Hong Kong        2
Hungary          6
Iceland          4
Ireland          5
Italy            5
Japan            4
Latvia           7
Liechtenstein    4
Lithuania        4
Luxembourg       5
Malta            4
Nepal            5
Netherlands      5
Norway           4
Peru             4
Poland           5
Portugal         8
Romania          4
Slovakia         6
Slovenia         5
South Africa     2
South Korea      6
Spain            4
Sweden           4
Switzerland      4
Ukraine          5
United States    3
Uruguay          3
Name: Vaccine_Manufacturer, dtype: int64

In [343]:
#function for finding country codes
def countrycode(column):
    CODE = []
    for country in column:
        try:
            code=pycountry.countries.get(name=country).alpha_3
            CODE.append(code)
        except:
            CODE.append('None')
    return CODE

In [344]:
#create nominatim object to obtain lat long of country
geolocator = Nominatim(user_agent = 'DSEI270_Proj1')

#function to get lat and long from country name
def latlong(column):
    loclist = []
    for country in column:
        try:
            loc = geolocator.geocode(country)
            loclist.append([country, loc.latitude, loc.longitude])
        except:
            loclist.append(['None','None','None'])
    return pd.DataFrame(loclist, columns=['code','lat','long'])

In [345]:
#create code column of 3 letter code for each country; used to merge with geopandas dataset
df['code'] = countrycode(df['Country'])

In [346]:
#import world dataset from geopandas, rename code column, and drop unneeded columns
world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres'))
world.columns = ['pop_est', 'continent', 'name', 'code', 'gdp_md_est', 'geometry']
world = world[['continent','code','geometry']]

In [347]:
#create dataframe of lat and long info for each unique country
latlongdf = latlong(df['code'].unique())

In [348]:
#merge geometry and lat/long dataframes to df
df = pd.merge(df, world, on='code')
df = pd.merge(df, latlongdf, on='code')

In [349]:
#create geopandas dataframe
gdf = geopandas.GeoDataFrame(df, geometry=df['geometry'])

In [350]:
#load efficacy data 
df_eff = pd.read_csv('Vaccine_Efficacy.csv')
df_eff.drop(df_eff.columns[[1,2,5,6,7,8]], axis=1, inplace = True)

In [351]:
#merge vaccine data and efficacy
df = pd.merge(df,df_eff,on='Vaccine_Manufacturer',how='left')
df.head()

Unnamed: 0,Country,Date,Vaccine_Manufacturer,Total_Vaccinations,Severe Disease Alpha,Infection Alpha,Severe Disease Delta,Infection Delta,Severe Disease Omicron,Infection Omicron,...,continent,geometry,lat,long,Eff Severe Disease Alpha,Eff Infection Alpha,Eff Severe Disease delta,Eff Infection Delta,Eff Severe Disease Omicron,Eff Infection Omicron
0,Argentina,2020-12-29,Oxford/AstraZeneca,1,1,1,1,1,1,0,...,South America,"MULTIPOLYGON (((-68.63401 -52.63637, -68.25000...",31.184568,-7.919683,94,63,94,69,71,36
1,Argentina,2020-12-29,Sinopharm/Beijing,1,1,1,1,1,1,0,...,South America,"MULTIPOLYGON (((-68.63401 -52.63637, -68.25000...",31.184568,-7.919683,73,68,71,67,53,35
2,Argentina,2020-12-29,Sputnik V,20488,18849,17620,18234,17415,11268,7376,...,South America,"MULTIPOLYGON (((-68.63401 -52.63637, -68.25000...",31.184568,-7.919683,92,86,89,85,67,44
3,Argentina,2020-12-30,Sputnik V,40590,37343,34907,36125,34502,22325,14612,...,South America,"MULTIPOLYGON (((-68.63401 -52.63637, -68.25000...",31.184568,-7.919683,92,86,89,85,67,44
4,Argentina,2020-12-31,Sputnik V,43396,39924,37321,38622,36887,23868,15623,...,South America,"MULTIPOLYGON (((-68.63401 -52.63637, -68.25000...",31.184568,-7.919683,92,86,89,85,67,44


In [352]:
#Find latest date for each country/manuf pair
df_latest = df.loc[df.groupby(['Country','Vaccine_Manufacturer']).Date.idxmax()]

In [353]:
'''
#added column for total vaccine of all manuf for specific date and country for proportion calculations
total_vacc = df.groupby(['Country','Date']).sum()[['Total_Vaccinations']].rename(columns={'Total_Vaccinations':'Total'})
df = pd.merge(df, total_vacc, how='left', on=['Country','Date'])
'''

"\n#added column for total vaccine of all manuf for specific date and country for proportion calculations\ntotal_vacc = df.groupby(['Country','Date']).sum()[['Total_Vaccinations']].rename(columns={'Total_Vaccinations':'Total'})\ndf = pd.merge(df, total_vacc, how='left', on=['Country','Date'])\n"

In [354]:
'''
#expanded dataframe of all dates and manuf 
mylist = ['Oxford/AstraZeneca','Sinopharm/Beijing','Sputnik V','Pfizer/BioNTech','CanSino','Moderna','Johnson&Johnson','Novavax','Valneva','Medicago','Sinovac','Covaxin']
df_expanded = pd.DataFrame({'Date':pd.date_range(start='12/4/2020', end='10/18/2022')})
df_expanded['Vaccine_Manufacturer'] = [mylist] * len(df_expanded)
df_expanded = df_expanded.explode('Vaccine_Manufacturer')
df_expanded = pd.merge(df_expanded,df,how='left',on=['Date','Vaccine_Manufacturer'])
df_expanded = df_expanded.drop(columns=['geometry'])
'''

"\n#expanded dataframe of all dates and manuf \nmylist = ['Oxford/AstraZeneca','Sinopharm/Beijing','Sputnik V','Pfizer/BioNTech','CanSino','Moderna','Johnson&Johnson','Novavax','Valneva','Medicago','Sinovac','Covaxin']\ndf_expanded = pd.DataFrame({'Date':pd.date_range(start='12/4/2020', end='10/18/2022')})\ndf_expanded['Vaccine_Manufacturer'] = [mylist] * len(df_expanded)\ndf_expanded = df_expanded.explode('Vaccine_Manufacturer')\ndf_expanded = pd.merge(df_expanded,df,how='left',on=['Date','Vaccine_Manufacturer'])\ndf_expanded = df_expanded.drop(columns=['geometry'])\n"

In [355]:
#Really bad function to fill in missing data for time series
def reallybadcode(test):
    dataframes =[]
    for country in test['Country'].unique():
        df = test[test['Country']==country]
        manuflist = list(df['Vaccine_Manufacturer'].unique())
        for manuf in manuflist:
            df_sm = df[df['Vaccine_Manufacturer']==manuf]
            df_expanded = pd.DataFrame({'Date':pd.date_range(start=df_sm.loc[df_sm.Date.idxmin()]['Date'], end=df.loc[df.Date.idxmax()]['Date'])})
            df_expanded = pd.merge(df_expanded,df_sm,how='left',on=['Date'])
            df_expanded = df_expanded.ffill()
            dataframes.append(df_expanded)
    return pd.concat(dataframes)


In [357]:
#expanded dataframe with forward filled data for all dates in timeseries 
df_expanded = reallybadcode(df.drop(columns=['geometry']))

#added column for total vaccine of all manuf for specific date and country for proportion calculations
total_vacc = df_expanded.groupby(['Country','Date']).sum()[['Total_Vaccinations']].rename(columns={'Total_Vaccinations':'Total'})
df_expanded = pd.merge(df_expanded,total_vacc,how='left',on=['Country','Date']).sort_values(by=['Country','Date'])

### Line Area Graph for breakout infection 

In [358]:
px.line(df_expanded, x='Date', y='Total',color='Country', markers=True)

In [326]:
dftester[dftester['Country']=='Argentina'].head(1)

Unnamed: 0,Date,Country,Vaccine_Manufacturer,Total_Vaccinations,Severe Disease Alpha,Infection Alpha,Severe Disease Delta,Infection Delta,Severe Disease Omicron,Infection Omicron,...,continent,lat,long,Eff Severe Disease Alpha,Eff Infection Alpha,Eff Severe Disease delta,Eff Infection Delta,Eff Severe Disease Omicron,Eff Infection Omicron,Total
0,2020-12-29,Argentina,Oxford/AstraZeneca,1.0,1.0,1.0,1.0,1.0,1.0,0.0,...,South America,31.184568,-7.919683,94.0,63.0,94.0,69.0,71.0,36.0,20490.0
