In [1]:
from functools import reduce
from pathlib import Path

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
mpl.rcParams['figure.figsize'] = (9, 5)

# Goal
My goal is to visualize various aspects of the `COVID-19` pandemic.

# Data sources

In this project I use data from the following sources:
- https://github.com/CSSEGISandData/COVID-19 - JHU CSSE COVID-19 Data.
- [GDP per capita PPP](https://data.worldbank.org/indicator/NY.GDP.PCAP.PP.CD) - The World Bank.
- [Population](https://data.worldbank.org/indicator/SP.POP.TOTL) - The World Bank.
- [Urban Population](https://data.worldbank.org/indicator/SP.URB.TOTL.IN.ZS) - The World Bank.
- [Population living in slums](https://data.worldbank.org/indicator/EN.POP.SLUM.UR.ZS) - The World Bank.
- [Rural population](https://data.worldbank.org/indicator/SP.RUR.TOTL.ZS) - The World Bank.
- [Life expectancy at birth](https://data.worldbank.org/indicator/SP.DYN.LE00.IN) - The World Bank.
- [Current healthcare expenditure](https://data.worldbank.org/indicator/SH.XPD.CHEX.GD.ZS) - The World Bank.
- https://datahub.io/JohnSnowLabs/country-and-continent-codes-list - country codes and continents.

# Data preparation

## COVID-19 data
To obtain a copy of the data, clone the repository: <br>
`git clone https://github.com/CSSEGISandData/COVID-19`

In [3]:
path = './data/COVID-19/csse_covid_19_data/csse_covid_19_time_series/'

In [4]:
# Read the confirmed / recovered / deaths global time series from `path`.
conf, recov, dead = (
    pd.read_csv(f'{path}/{file_name}')
    for file_name in (
        'time_series_covid19_confirmed_global.csv',
        'time_series_covid19_recovered_global.csv',
        'time_series_covid19_deaths_global.csv',
    )
)

In [5]:
def rename_countries(df):
    """

    Normalize country names in the `Country` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a `Country` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` with the `Country` column renamed; every other
        column (and the column order) is unchanged.

    Notes
    -----
    Names that are not listed below pass through untouched.

    """

    # Fix country names.
    # This also helps with grouping (eg. Congo): both Congo entries end up
    # under one label and are therefore aggregated together downstream.
    # NOTE(review): Congo (Brazzaville) and Congo (Kinshasa) are two
    # different states - confirm that merging them is intended.
    renames = {
        "Taiwan*": "Taiwan",
        "Korea, South": "Korea",
        "North Macedonia": "Macedonia",
        "Cabo Verde": "Cape Verde",
        "Congo (Brazzaville)": "Congo",
        "Congo (Kinshasa)": "Congo",
    }

    # `assign` works on a copy, so the caller's frame is left as it was.
    return df.assign(Country=df['Country'].replace(renames))

In [6]:
def process_data(df):
    """

    Convert data from columns to rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw time series with the columns `Province/State`,
        `Country/Region`, `Lat`, `Long` and one column per date.

    Returns
    -------
    pandas.DataFrame
        One row per date. The first column is `Date` (datetime), followed
        by one integer column per country in alphabetical order. The index
        is a fresh 0..n-1 range.

    """

    # Drop columns
    df = df.drop(['Lat', 'Long', 'Province/State'], axis=1)
    df = df.rename(columns={"Country/Region": "Country"})

    # Rename countries
    df = rename_countries(df=df)

    # Enforce countries are unique: rows sharing a country name are summed.
    # Keeping `Country` as the index (instead of a data column) means the
    # transpose below stays numeric.
    df = df.groupby('Country').sum()

    # Switch to column format: dates become rows, countries become columns
    df = df.transpose()

    # Convert to ints
    df = df.astype(int)

    # Sort the country columns and drop the leftover 'Country' axis label
    df = df.sort_index(axis=1)
    df.columns.name = None

    # Convert dates (they are the index after the transpose) and put them first
    df.insert(0, 'Date', pd.to_datetime(df.index))

    # Reset index
    df = df.reset_index(drop=True)

    return df

In [7]:
# Reshape each raw series with `process_data`, then preview the last days.
conf, recov, dead = (process_data(df=raw) for raw in (conf, recov, dead))
conf.tail()

Unnamed: 0,Date,Afghanistan,Albania,Algeria,Andorra,Angola,Antigua and Barbuda,Argentina,Armenia,Australia,...,United Kingdom,Uruguay,Uzbekistan,Venezuela,Vietnam,West Bank and Gaza,Western Sahara,Yemen,Zambia,Zimbabwe
243,2020-09-21,39074,12535,50023,1681,4117,96,640147,47552,26942,...,401122,1927,52070,67443,1068,36151,10,2028,14175,7683
244,2020-09-22,39096,12666,50214,1681,4236,96,652174,47667,26972,...,406058,1934,52685,68453,1068,36580,10,2028,14389,7711
245,2020-09-23,39145,12787,50400,1753,4363,97,664799,47877,26980,...,412245,1946,53275,69439,1069,37083,10,2029,14443,7725
246,2020-09-24,39170,12921,50579,1753,4475,97,678266,48251,27000,...,418889,1959,53834,70406,1069,37591,10,2029,14491,7752
247,2020-09-25,39186,13045,50754,1836,4590,98,691235,48643,27016,...,425767,1967,54392,71273,1069,37963,10,2029,14515,7787


In [8]:
# The ship entries listed in `boats` are removed from every series.
boats = ['Diamond Princess', 'MS Zaandam']
conf, recov, dead = (frame.drop(boats, axis=1) for frame in (conf, recov, dead))

# Active cases = confirmed - recovered - deaths. The arithmetic is done
# without 'Date', which is re-attached afterwards (as the last column).
active = (conf.drop(['Date'], axis=1)
          - recov.drop(['Date'], axis=1)
          - dead.drop(['Date'], axis=1))
active['Date'] = conf['Date']

## Helper functions

In [9]:
dataframes = [conf, recov, dead]
names = ['Confirmed', 'Recovered', 'Deaths']

In [10]:
def get_country_ts(country, dataframes, columns):
    """

    Extract data for specific country.

    Parameters
    ----------
    country : str
        Column to pick from every frame in `dataframes`.
    dataframes : iterable of pandas.DataFrame
        Frames that each contain a `Date` column and a `country` column.
    columns : iterable of str
        Output column names, one per frame, in the same order.

    Returns
    -------
    pandas.DataFrame
        Columns `Date` followed by `columns`, outer-joined on `Date`.

    Raises
    ------
    ValueError
        If `columns` and `dataframes` differ in length.

    Notes
    -----
    Apply backfill to NaN's.

    """

    dataframes = list(dataframes)
    columns = list(columns)
    if len(columns) != len(dataframes):
        raise ValueError(
            f'Expected {len(dataframes)} column names, got {len(columns)}'
        )

    ctry = list()
    for position, df in enumerate(dataframes):
        tmp = df.loc[:, ['Date', country]]
        # Label the value column by position so repeated merges never have
        # to fall back on `_x` / `_y` suffixes (which clash for 4+ frames).
        tmp.columns = ['Date', position]
        ctry.append(tmp)
    ctry = reduce(lambda x, y: pd.merge(x, y, on='Date', how='outer'), ctry)
    ctry.columns = ['Date'] + columns

    # `fillna(method='bfill')` is deprecated; `bfill()` is the equivalent.
    ctry = ctry.bfill()

    return ctry

get_country_ts(country='Poland', 
               dataframes=dataframes, 
               columns=names).tail()

Unnamed: 0,Date,Confirmed,Recovered,Deaths
243,2020-09-21,79988,64604,2298
244,2020-09-22,80699,64972,2316
245,2020-09-23,81673,65561,2344
246,2020-09-24,82809,66158,2369
247,2020-09-25,84396,66740,2392


In [11]:
def get_country_stats(dataframes, names):
    """

    Build a per-country summary table.

    For every frame the last row is taken and turned into a two-column
    table (`Country`, <name>); the tables are then outer-joined on
    `Country`.

    """

    def latest_by_country(frame, label):
        # Last row only, flipped so that countries become rows.
        latest = frame.tail(1).drop('Date', axis=1).transpose().reset_index()
        latest.columns = ['Country', label]
        return latest

    per_metric = [latest_by_country(frame, label)
                  for frame, label in zip(dataframes, names)]

    return reduce(lambda x, y: pd.merge(x, y, on='Country', how='outer'),
                  per_metric)

get_country_stats(dataframes, names).head()

Unnamed: 0,Country,Confirmed,Recovered,Deaths
0,Afghanistan,39186,32619,1451
1,Albania,13045,7309,373
2,Algeria,50754,35654,1707
3,Andorra,1836,1263,53
4,Angola,4590,1554,167


## Extract mortality rate

In [12]:
def extract_mortality(country):
    """

    Mortality rate over time for one country.

    Notes
    -----
    Mortality is `Deaths` / `Confirmed`, expressed in percent and
    rounded to two decimals. Days on which the country has no confirmed
    cases are left out. Reads the module-level `conf` and `dead` frames.
    The value column of the result is named after `country`.

    """

    ts = get_country_ts(country=country,
                        dataframes=[conf, dead],
                        columns=['Confirmed', 'Deaths'])

    # Keep only days with at least one confirmed case.
    has_cases = ts['Confirmed'] > 0
    ts = ts[has_cases]

    rate = np.round(ts['Deaths'] / ts['Confirmed'] * 100, 2)

    return pd.DataFrame({'Date': ts['Date'], country: rate})

extract_mortality('Poland').tail()

Unnamed: 0,Date,Poland
243,2020-09-21,2.87
244,2020-09-22,2.87
245,2020-09-23,2.87
246,2020-09-24,2.86
247,2020-09-25,2.83


In [13]:
all_countries = sorted(set(conf.drop('Date', axis=1).columns))

# One mortality series per country, joined on 'Date'. The default (inner)
# join keeps only the dates that are present for every country.
mort = reduce(
    lambda x, y: pd.merge(x, y, on='Date'),
    [extract_mortality(country=name) for name in all_countries],
)

mort.tail()

Unnamed: 0,Date,Afghanistan,Albania,Algeria,Andorra,Angola,Antigua and Barbuda,Argentina,Armenia,Australia,...,United Kingdom,Uruguay,Uzbekistan,Venezuela,Vietnam,West Bank and Gaza,Western Sahara,Yemen,Zambia,Zimbabwe
131,2020-09-21,3.7,2.9,3.36,3.15,3.74,3.12,2.11,1.97,3.17,...,10.44,2.39,0.84,0.82,3.28,0.73,10.0,28.9,2.34,2.93
132,2020-09-22,3.7,2.9,3.36,3.15,3.66,3.12,2.14,1.97,3.18,...,10.32,2.38,0.84,0.82,3.28,0.74,10.0,28.9,2.3,2.93
133,2020-09-23,3.69,2.89,3.37,3.02,3.64,3.09,2.16,1.97,3.19,...,10.18,2.42,0.83,0.83,3.27,0.73,10.0,28.88,2.3,2.94
134,2020-09-24,3.7,2.86,3.37,3.02,3.62,3.09,2.18,1.96,3.22,...,10.02,2.4,0.83,0.83,3.27,0.73,10.0,28.88,2.29,2.93
135,2020-09-25,3.7,2.86,3.36,2.89,3.64,3.06,2.2,1.95,3.22,...,9.87,2.39,0.83,0.83,3.27,0.73,10.0,28.93,2.29,2.92


In [14]:
print(conf.shape)
print(recov.shape)
print(dead.shape)
print(active.shape)
print(mort.shape)

(248, 186)
(248, 186)
(248, 186)
(248, 186)
(136, 186)


In [15]:
def count_na(df):
    """

    Count missing values per column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a `Date` column, which is excluded from the count.

    Returns
    -------
    pandas.DataFrame
        One row per remaining column of `df` with a single `Missing`
        column, sorted so that the columns with the most missing values
        come first.

    """
    missing = df.drop('Date', axis=1).isna().sum(axis=0)
    missing = missing.to_frame(name='Missing')

    # `sort_values` returns a new frame, so the result has to be kept.
    # A stable sort leaves ties in their original column order.
    return missing.sort_values('Missing', ascending=False, kind='mergesort')

In [16]:
count_na(conf).head()

Unnamed: 0,Missing
Afghanistan,0
Albania,0
Algeria,0
Andorra,0
Angola,0


In [17]:
count_na(recov).head()

Unnamed: 0,Missing
Afghanistan,0
Albania,0
Algeria,0
Andorra,0
Angola,0


In [18]:
count_na(dead).head()

Unnamed: 0,Missing
Afghanistan,0
Albania,0
Algeria,0
Andorra,0
Angola,0


In [19]:
count_na(active).head()

Unnamed: 0,Missing
Afghanistan,0
Albania,0
Algeria,0
Andorra,0
Angola,0


In [20]:
count_na(mort).head()

Unnamed: 0,Missing
Afghanistan,0
Albania,0
Algeria,0
Andorra,0
Angola,0


In [21]:
dataframes = [conf, recov, dead, active, mort]
names = ['Confirmed', 'Recovered', 'Deaths', 'Active', 'Mortality']
country_stats = get_country_stats(dataframes, names)
country_stats.head()

Unnamed: 0,Country,Confirmed,Recovered,Deaths,Active,Mortality
0,Afghanistan,39186,32619,1451,5116,3.7
1,Albania,13045,7309,373,5363,2.86
2,Algeria,50754,35654,1707,13393,3.36
3,Andorra,1836,1263,53,520,2.89
4,Angola,4590,1554,167,2869,3.64


In [22]:
country_stats.isna().sum(axis=0)

Country      0
Confirmed    0
Recovered    0
Deaths       0
Active       0
Mortality    0
dtype: int64

## First order differences

In [23]:
def get_daily_changes(df):
    """

    Day-over-day change of the case data, i.e. the first-order
    difference of every column except `Date`.

    Rows containing NaN (such as the first row, which has no
    predecessor) are removed. `Date` is appended as the last column.

    """
    changes = df.drop(columns='Date').diff()
    changes['Date'] = df['Date']
    return changes.dropna()

In [24]:
conf_diff = get_daily_changes(df=conf)
recov_diff = get_daily_changes(df=recov)
dead_diff = get_daily_changes(df=dead)
active_diff = get_daily_changes(df=active)
conf_diff.tail()

Unnamed: 0,Afghanistan,Albania,Algeria,Andorra,Angola,Antigua and Barbuda,Argentina,Armenia,Australia,Austria,...,Uruguay,Uzbekistan,Venezuela,Vietnam,West Bank and Gaza,Western Sahara,Yemen,Zambia,Zimbabwe,Date
243,30.0,150.0,197.0,117.0,126.0,0.0,8782.0,121.0,30.0,563.0,...,10.0,430.0,787.0,0.0,465.0,0.0,2.0,44.0,0.0,2020-09-21
244,22.0,131.0,191.0,0.0,119.0,0.0,12027.0,115.0,30.0,645.0,...,7.0,615.0,1010.0,0.0,429.0,0.0,0.0,214.0,28.0,2020-09-22
245,49.0,121.0,186.0,72.0,127.0,1.0,12625.0,210.0,8.0,681.0,...,12.0,590.0,986.0,1.0,503.0,0.0,1.0,54.0,14.0,2020-09-23
246,25.0,134.0,179.0,0.0,112.0,0.0,13467.0,374.0,20.0,832.0,...,13.0,559.0,967.0,0.0,508.0,0.0,0.0,48.0,27.0,2020-09-24
247,16.0,124.0,175.0,83.0,115.0,1.0,12969.0,392.0,16.0,684.0,...,8.0,558.0,867.0,0.0,372.0,0.0,0.0,24.0,35.0,2020-09-25


## Coordinate data

In [25]:
# Re-read the raw confirmed file (the Lat/Long columns are dropped by
# `process_data`) and average the coordinates per country.
coords = (
    pd.read_csv(f'{path}/time_series_covid19_confirmed_global.csv')
    .rename(columns={"Country/Region": "Country"})
    .loc[lambda frame: ~frame['Country'].isin(boats)]
    .pipe(rename_countries)
    [['Country', 'Lat', 'Long']]
    .groupby('Country', as_index=False)
    .mean()
    .sort_values('Country')
    .reset_index(drop=True)
)
coords.head()

Unnamed: 0,Country,Lat,Long
0,Afghanistan,33.93911,67.709953
1,Albania,41.1533,20.1683
2,Algeria,28.0339,1.6596
3,Andorra,42.5063,1.5218
4,Angola,-11.2027,17.8739


## Continent data

In [26]:
cont_path = './data/datahub/countries.csv'

# Keep continent name, three-letter code and a shortened country name
# (the part of `Country_Name` before the first ", "); one row per country.
cont_map = (
    pd.read_csv(cont_path)
    .drop(['Continent_Code', 'Two_Letter_Country_Code', 'Country_Number'], axis=1)
    .assign(Country=lambda frame: frame['Country_Name'].apply(lambda x: x.split(", ")[0]))
    .rename(columns={"Continent_Name": "Continent",
                     'Three_Letter_Country_Code': 'Country Code'})
    .drop(['Country_Name'], axis=1)
    .drop_duplicates(subset=['Country'])
)

cont_map.head()

Unnamed: 0,Continent,Country Code,Country
0,Asia,AFG,Afghanistan
1,Europe,ALB,Albania
2,Antarctica,ATA,Antarctica (the territory South of 60 deg S)
3,Africa,DZA,Algeria
4,Oceania,ASM,American Samoa


In [27]:
countries_covid = set(coords['Country'])

In [28]:
# Change values in countries.csv to match covid data.
# (name in countries.csv, name used in the covid data)
to_swap = [
    ('Russian Federation', 'Russia'),
    ('Slovakia (Slovak Republic)', 'Slovakia'),
    ('Kyrgyz Republic', 'Kyrgyzstan'),
    ('Syrian Arab Republic', 'Syria'),
    ('Libyan Arab Jamahiriya', 'Libya'),
    ('Korea, South', 'Korea'),
    ('Brunei Darussalam', 'Brunei'),
    ('Cabo Verde', 'Cape Verde'),
    ('Holy See (Vatican City State)', 'Holy See'),
    ('United States of America', 'US'),
    ('United Kingdom of Great Britain & Northern Ireland', 'United Kingdom'),
    ("Lao People's Democratic Republic", 'Laos'),
    ('Myanmar', 'Burma'),
    ('Czech Republic', 'Czechia'),
    ('Swaziland',  'Eswatini'),
]

# No replacement value is itself a key, so one mapping pass gives the same
# result as applying the swaps one after another.
cont_map['Country'] = cont_map['Country'].replace(dict(to_swap))

In [29]:
countries_cont_map = set(cont_map['Country'])

In [30]:
countries_covid.difference(countries_cont_map)

{'Kosovo', 'West Bank and Gaza'}

In [31]:
# Attach continent / country code to every country with coordinates and
# discard rows that have a missing value in any column.
ctry_to_cont = coords.merge(cont_map, how='left', on='Country').dropna()
ctry_to_cont.head()

Unnamed: 0,Country,Lat,Long,Continent,Country Code
0,Afghanistan,33.93911,67.709953,Asia,AFG
1,Albania,41.1533,20.1683,Europe,ALB
2,Algeria,28.0339,1.6596,Africa,DZA
3,Andorra,42.5063,1.5218,Europe,AND
4,Angola,-11.2027,17.8739,Africa,AGO


In [32]:
print(ctry_to_cont.shape)

(183, 5)


## World Bank data

In [33]:
wb_path = './data/world_bank/'

In [34]:
def get_world_bank_data(path, desc, year='2019'):
    """

    Get World Bank data into usable format.

    Parameters
    ----------
    path : str or file-like
        World Bank indicator CSV. The first four lines are skipped; the
        file must contain the columns `Country Name`, `Country Code`,
        `Indicator Name`, `Indicator Code` and a column named `year`.
    desc : str
        Name of the value column in the result.
    year : str or int, default '2019'
        Column whose (forward-filled) value is reported.

    Returns
    -------
    pandas.DataFrame
        Columns `Country Code` and `desc` (float, rounded to two
        decimals). Rows without any observation up to `year` are left
        out; the remaining rows keep their original index labels.

    Notes
    -----
    Forward filling is applied rowwise, so a missing `year` value is
    replaced by the closest value to its left in the file.

    """

    year = str(year)
    raw = pd.read_csv(path, skiprows=4)

    # Fill only the value columns. Including `Country Code` in the row-wise
    # fill would copy the code into empty rows and turn the data into text.
    values = raw.drop(['Country Name', 'Country Code',
                       'Indicator Name', 'Indicator Code'], axis=1)
    latest = values.ffill(axis=1)[year].astype(float).round(2)

    df = pd.DataFrame({'Country Code': raw['Country Code'], desc: latest})

    # Rows that are still NaN had no observation at all up to `year`.
    df = df.dropna(subset=[desc])

    return df

In [35]:
wb_le = 'API_SP.DYN.LE00.IN_DS2_en_csv_v2_1308162.csv'
life_expectancy = get_world_bank_data(path=f'{wb_path}/{wb_le}',
                                      desc='Life Expectancy')
life_expectancy.head()

Unnamed: 0,Country Code,Life Expectancy
0,ABW,76.15
1,AFG,64.49
2,AGO,60.78
3,ALB,78.46
5,ARB,71.81


In [36]:
wb_gdp = 'API_NY.GDP.PCAP.PP.CD_DS2_en_csv_v2_1217517.csv'
gdp_per_capita = get_world_bank_data(path=f'{wb_path}/{wb_gdp}',
                                     desc='GDP Per Capita')
gdp_per_capita.head()

Unnamed: 0,Country Code,GDP Per Capita
0,ABW,38442.41
1,AFG,2293.55
2,AGO,6929.68
3,ALB,14495.08
5,ARB,15216.54


In [37]:
wb_pop = 'API_SP.POP.TOTL_DS2_en_csv_v2_1308146.csv'
population = get_world_bank_data(path=f'{wb_path}/{wb_pop}',
                                     desc='Population')
population.head()

Unnamed: 0,Country Code,Population
0,ABW,106314.0
1,AFG,38041754.0
2,AGO,31825295.0
3,ALB,2854191.0
4,AND,77142.0


In [38]:
wb_urb = 'API_SP.URB.TOTL.IN.ZS_DS2_en_csv_v2_1219669.csv'
urban_population = get_world_bank_data(path=f'{wb_path}/{wb_urb}',
                                     desc='Urban Population %')
urban_population.head()

Unnamed: 0,Country Code,Urban Population %
0,ABW,43.55
1,AFG,25.75
2,AGO,66.18
3,ALB,61.23
4,AND,87.98


In [39]:
wb_slum = 'API_EN.POP.SLUM.UR.ZS_DS2_en_csv_v2_1221614.csv'
slum_population = get_world_bank_data(path=f'{wb_path}/{wb_slum}',
                                     desc='Slum Population %')
slum_population.head()

Unnamed: 0,Country Code,Slum Population %
1,AFG,62.7
2,AGO,55.5
5,ARB,32.52
7,ARG,16.7
8,ARM,14.4


In [40]:
wb_rur = 'API_SP.RUR.TOTL.ZS_DS2_en_csv_v2_1222914.csv'
rural_population = get_world_bank_data(path=f'{wb_path}/{wb_rur}',
                                     desc='Rural Population %')
rural_population.head()

Unnamed: 0,Country Code,Rural Population %
0,ABW,56.45
1,AFG,74.25
2,AGO,33.82
3,ALB,38.77
4,AND,12.02


In [41]:
wb_hc = 'API_SH.XPD.CHEX.GD.ZS_DS2_en_csv_v2_1217782.csv'
gdp_healthcare = get_world_bank_data(path=f'{wb_path}/{wb_hc}',
                                     desc='GDP Healthcare %')
gdp_healthcare.head()

Unnamed: 0,Country Code,GDP Healthcare %
1,AFG,11.78
2,AGO,2.79
4,AND,10.32
5,ARB,4.86
6,ARE,3.33


In [42]:
wb_indicators = [
    life_expectancy,
    gdp_per_capita,
    population,
    urban_population,
    slum_population,
    rural_population,
    gdp_healthcare,
]

# Outer-join all indicators on 'Country Code', one after another.
world_bank = wb_indicators[0]
for indicator in wb_indicators[1:]:
    world_bank = pd.merge(world_bank, indicator, on='Country Code', how='outer')
world_bank.head()

Unnamed: 0,Country Code,Life Expectancy,GDP Per Capita,Population,Urban Population %,Slum Population %,Rural Population %,GDP Healthcare %
0,ABW,76.15,38442.41,106314.0,43.55,,56.45,
1,AFG,64.49,2293.55,38041754.0,25.75,62.7,74.25,11.78
2,AGO,60.78,6929.68,31825295.0,66.18,55.5,33.82,2.79
3,ALB,78.46,14495.08,2854191.0,61.23,,38.77,
4,ARB,71.81,15216.54,427870270.0,59.2,32.52,40.8,4.86


In [43]:
world_bank.shape

(263, 8)

In [44]:
# Number of missing values per World Bank column, largest first;
# columns without missing values are not listed.
wb_missing = (
    world_bank.isnull()
    .sum(axis=0)
    .rename('Missing')
    .to_frame()
    .sort_values('Missing', ascending=False)
    .reset_index()
    .rename(columns={'index': 'Column'})
    .loc[lambda frame: frame['Missing'] > 0]
)
wb_missing

Unnamed: 0,Column,Missing
0,Slum Population %,124
1,GDP Healthcare %,29
2,GDP Per Capita,20
3,Life Expectancy,9
4,Urban Population %,2
5,Rural Population %,2


## Merge world bank with COVID-19

In [45]:
# Check missing countries
S1 = set(ctry_to_cont['Country Code'])
S2 = set(world_bank['Country Code'])
sorted(S1.difference(S2))

['ESH', 'TWN', 'VAT']

In [46]:
ctry_to_cont[ctry_to_cont['Country Code'] == 'ESH']

Unnamed: 0,Country,Lat,Long,Continent,Country Code
181,Western Sahara,24.2155,-12.8858,Africa,ESH


In [47]:
ctry_to_cont[ctry_to_cont['Country Code'] == 'TWN']

Unnamed: 0,Country,Lat,Long,Continent,Country Code
162,Taiwan,23.7,121.0,Asia,TWN


In [48]:
ctry_to_cont[ctry_to_cont['Country Code'] == 'VAT']

Unnamed: 0,Country,Lat,Long,Continent,Country Code
73,Holy See,41.9029,12.4534,Europe,VAT


In [49]:
# Outer joins keep rows that exist on one side only.
merged = (
    ctry_to_cont
    .merge(country_stats, on='Country', how='outer')
    .merge(world_bank, on='Country Code', how='outer')
)
merged.shape

(268, 17)

# Data - summary
## Remarks
After the lengthy process of preparing the data I feel obliged to summarize in short some of the key datasets.

From the original `COVID-19` data we created:
- `conf` - Confirmed cases. Timeseries, by country.
- `recov` - Recovered cases. Timeseries, by country.
- `dead` - Fatal cases. Timeseries, by country.
- `active` - Active cases. Calculation: `conf` - `recov` - `dead`. Timeseries, by country.
- `coords` - Countries with latitude and longitude data.
- `country_stats` - Case data summarized by country.

By transforming the above we obtain:
- `conf_diff` - Confirmed cases daily change. Calculated using the difference operator. Timeseries, by country.
- `active_diff` - Active cases daily change. Calculated using the difference operator. Timeseries, by country.
- `recov_diff` - Recovered cases daily change. Calculated using the difference operator. Timeseries, by country.
- `dead_diff` - Fatal cases daily change. Calculated using the difference operator. Timeseries, by country.
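- `mort` - Mortality in percent, i.e. `dead` / `conf` × 100. Timeseries, by country; it only covers the dates on which every country already had at least one confirmed case.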

From `coords` and the datahub country/continent list we create:
- `ctry_to_cont` - Map each country to continent.

Using the data above and data from the World Bank we create:
- `world_bank` - Combines all the data from the World Bank.
- `merged` - Combines COVID-19 summary statistics, continents, GDP and population data.

The `COVID-19` data used in this notebook starts on:

In [50]:
str(conf['Date'].min().date())

'2020-01-22'

and ends on:

In [51]:
str(conf['Date'].max().date())

'2020-09-25'

## Storing the data

After all this wrangling we save the data for further analysis.

In [52]:
# Make sure the target folder exists so the cell also works on a fresh clone.
cleaned_dir = Path('data/cleaned')
cleaned_dir.mkdir(parents=True, exist_ok=True)

# File stem -> frame; every frame is written to data/cleaned/<stem>.csv
# without its index.
cleaned_frames = {
    'conf': conf,
    'recov': recov,
    'dead': dead,
    'active': active,
    'coords': coords,
    'country_stats': country_stats,
    'conf_diff': conf_diff,
    'active_diff': active_diff,
    'recov_diff': recov_diff,
    'dead_diff': dead_diff,
    'mort': mort,
    'ctry_to_cont': ctry_to_cont,
    'world_bank': world_bank,
    'merged': merged,
}

for file_stem, frame in cleaned_frames.items():
    frame.to_csv(cleaned_dir / f'{file_stem}.csv', index=False)