In [2]:
import sys
sys.path.append('..')

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import timedelta, datetime, date
import os
from utils import data_paths, load_config
from pathlib import Path
from nltk.metrics import edit_distance #(Levenshtein)
import pycountry
import math

# Estimating The Infected Population From Deaths
> Estimating the number of infected people by country based on the number of deaths and case fatality rate. 

- comments: true
- author: Joao B. Duarte
- categories: [growth, compare, interactive, estimation]
- hide: false
- image: images/covid-estimate-infections.png
- permalink: /covid-infected/
- toc: true

In [4]:
LOCAL_FILES=True
#jupyter or script
IS_SCRIPT = False

In [5]:
os.getcwd()

'/mnt/963GB/Data/Python/ACode/medical/covid19/forecaster/covidforecaster/tools'

In [6]:
if IS_SCRIPT:
    RUN_PATH = Path(os.path.realpath(__file__))
    DATA_PARENT = RUN_PATH.parent.parent
else:
    #for jupyter
    cw=!pwd
    RUN_PATH = Path(cw[0])
    DATA_PARENT = RUN_PATH.parent

In [7]:
if IS_SCRIPT:
    csse_data = data_paths('tools/csse_data_paths.yml')
else:
    csse_data = data_paths('csse_data_paths.yml')

In [8]:
if LOCAL_FILES:
    confirmed_url=csse_data.get("csse_ts_local", {}).get('confirmed', {})
    deaths_url=csse_data.get("csse_ts_local", {}).get('deaths', {})
    recovered_url=csse_data.get("csse_ts_local", {}).get('recovered', {})
    
    confirmed_url = str(DATA_PARENT/confirmed_url)
    deaths_url = str(DATA_PARENT/deaths_url)
    recovered_url = str(DATA_PARENT/recovered_url)
else:
    confirmed_url=csse_data.get("csse_ts_global", {}).get('confirmed', {})
    deaths_url=csse_data.get("csse_ts_global", {}).get('deaths', {})
    recovered_url=csse_data.get("csse_ts_global", {}).get('recovered', {})

In [9]:
### UN stats

In [10]:
df_un_pop_density_info=pd.read_csv(DATA_PARENT/'data/un/df_un_pop_density_info.csv')
df_un_urban_growth_info=pd.read_csv(DATA_PARENT/'data/un/urban_growth_info.csv')
df_un_health_info=pd.read_csv(DATA_PARENT/'data/un/df_un_health_info.csv')
df_un_tourism_info=pd.read_csv(DATA_PARENT/'data/un/df_un_tourism_info.csv')
df_un_gdp_info=pd.read_csv(DATA_PARENT/'data/un/df_un_gdp_info.csv')
df_un_edu_info=pd.read_csv(DATA_PARENT/'data/un/df_un_edu_info.csv')
df_un_pop_growth_info=pd.read_csv(DATA_PARENT/'data/un/df_un_pop_growth_info.csv')
df_un_gdrp_rnd_info=pd.read_csv(DATA_PARENT/'data/un/df_un_gdrp_rnd_info.csv')
df_un_education_info=pd.read_csv(DATA_PARENT/'data/un/df_un_education_info.csv')
df_un_sanitation_info=pd.read_csv(DATA_PARENT/'data/un/df_un_sanitation_info.csv')

df_un_health_expenditure_info=pd.read_csv(DATA_PARENT/'data/un/df_un_health_expenditure_info.csv')
df_un_immigration_info=pd.read_csv(DATA_PARENT/'data/un/df_un_immigration_info.csv')
df_un_trading_info=pd.read_csv(DATA_PARENT/'data/un/df_un_trading_info.csv')
df_un_land_info=pd.read_csv(DATA_PARENT/'data/un/df_un_land_info.csv')

In [11]:
df_un_health_info.head()
#Health personnel: Pharmacists (per 1000 population)

Unnamed: 0,Region/Country/Area,Country,Year,Series,Value,Footnotes,Source
0,4,Afghanistan,2001,Health personnel: Physicians (number),4104.0,,"World Health Organisation (WHO), Geneva, WHO G..."
1,4,Afghanistan,2001,Health personnel: Physicians (per 1000 populat...,0.2,,"World Health Organisation (WHO), Geneva, WHO G..."
2,4,Afghanistan,2001,Health personnel: Pharmacists (number),525.0,,"World Health Organisation (WHO), Geneva, WHO G..."
3,4,Afghanistan,2001,Health personnel: Pharmacists (per 1000 popula...,0.0,,"World Health Organisation (WHO), Geneva, WHO G..."
4,4,Afghanistan,2005,Health personnel: Pharmacists (number),900.0,,"World Health Organisation (WHO), Geneva, WHO G..."


In [12]:
df_un_trading_info.tail(n=20)
#column Major trading partner 1 (% of exports)
#Major trading partner 1 (% of exports)
#Major trading partner 2 (% of exports)
#Major trading partner 3 (% of exports)

Unnamed: 0,Region/Country/Area,Country,Year,Series,Major trading partner 1 (% of exports),Major trading partner 1 (% of exports) footnote,Value,Footnotes,Source,Unnamed: 9
2834,894,Zambia,2018,Major trading partner 3 (% of exports),Dem.Rep. of the Congo,,9.5418,,"United Nations Statistics Division, New York, ...",
2835,894,Zambia,2005,Major trading partner 3 (% of imports),China,,3.3408,,"United Nations Statistics Division, New York, ...",
2836,894,Zambia,2010,Major trading partner 3 (% of imports),China,,5.4446,,"United Nations Statistics Division, New York, ...",
2837,894,Zambia,2018,Major trading partner 3 (% of imports),China,,13.6411,,"United Nations Statistics Division, New York, ...",
2838,716,Zimbabwe,2005,Major trading partner 1 (% of exports),South Africa,,41.4877,,"United Nations Statistics Division, New York, ...",
2839,716,Zimbabwe,2010,Major trading partner 1 (% of exports),South Africa,,54.2168,,"United Nations Statistics Division, New York, ...",
2840,716,Zimbabwe,2018,Major trading partner 1 (% of exports),South Africa,,51.4752,,"United Nations Statistics Division, New York, ...",
2841,716,Zimbabwe,2005,Major trading partner 1 (% of imports),South Africa,,14.9645,,"United Nations Statistics Division, New York, ...",
2842,716,Zimbabwe,2010,Major trading partner 1 (% of imports),South Africa,,48.0266,,"United Nations Statistics Division, New York, ...",
2843,716,Zimbabwe,2018,Major trading partner 1 (% of imports),South Africa,,39.28,,"United Nations Statistics Division, New York, ...",


In [13]:
df_population_density=df_un_pop_density_info.loc[df_un_pop_density_info['Series'] == 'Population density']

In [14]:
df_population_density.tail(n=50)
#Population aged 60+ years old (percentage)
#Population density
#Population mid-year estimates (millions)

Unnamed: 0,Region/Country/Area,Country,Year,Series,Value,Footnotes,Source,Unnamed: 7
4417,788,Tunisia,2017,Population density,73.5932,,"United Nations Population Division, New York, ...",
4425,788,Tunisia,2019,Population density,75.275,,"United Nations Population Division, New York, ...",
4432,792,Turkey,2005,Population density,88.2287,,"United Nations Population Division, New York, ...",
4439,792,Turkey,2010,Population density,93.9763,,"United Nations Population Division, New York, ...",
4446,792,Turkey,2017,Population density,105.3967,,"United Nations Population Division, New York, ...",
4454,792,Turkey,2019,Population density,108.4022,,"United Nations Population Division, New York, ...",
4461,800,Uganda,2005,Population density,138.5546,,"United Nations Population Division, New York, ...",
4468,800,Uganda,2010,Population density,162.295,,"United Nations Population Division, New York, ...",
4475,800,Uganda,2017,Population density,206.0287,,"United Nations Population Division, New York, ...",
4483,800,Uganda,2019,Population density,221.5585,,"United Nations Population Division, New York, ...",


In [15]:

df_population_density.loc[df_population_density.groupby('Country')['Year'].idxmax()]

Unnamed: 0,Region/Country/Area,Country,Year,Series,Value,Footnotes,Source,Unnamed: 7
28,4,Afghanistan,2019,Population density,58.2694,,"United Nations Population Division, New York, ...",
57,8,Albania,2019,Population density,105.1430,,"United Nations Population Division, New York, ...",
86,12,Algeria,2019,Population density,18.0763,,"United Nations Population Division, New York, ...",
106,20,Andorra,2019,Population density,164.1319,,"United Nations Population Division, New York, ...",
135,24,Angola,2019,Population density,25.5276,,"United Nations Population Division, New York, ...",
...,...,...,...,...,...,...,...,...
4686,862,Venezuela,2019,Population density,32.3290,,"United Nations Population Division, New York, ...",
4715,704,Vietnam,2019,Population density,311.0978,,"United Nations Population Division, New York, ...",
4143,275,West Bank and Gaza,2019,Population density,827.4784,Including East Jerusalem.,"United Nations Population Division, New York, ...",
4744,894,Zambia,2019,Population density,24.0265,,"United Nations Population Division, New York, ...",


In [16]:
df_population_density

Unnamed: 0,Region/Country/Area,Country,Year,Series,Value,Footnotes,Source,Unnamed: 7
6,4,Afghanistan,2005,Population density,39.2952,,"United Nations Population Division, New York, ...",
13,4,Afghanistan,2010,Population density,44.7041,,"United Nations Population Division, New York, ...",
20,4,Afghanistan,2017,Population density,55.5956,,"United Nations Population Division, New York, ...",
28,4,Afghanistan,2019,Population density,58.2694,,"United Nations Population Division, New York, ...",
35,8,Albania,2005,Population density,112.6573,,"United Nations Population Division, New York, ...",
...,...,...,...,...,...,...,...,...
4744,894,Zambia,2019,Population density,24.0265,,"United Nations Population Division, New York, ...",
4751,716,Zimbabwe,2005,Population density,31.2180,,"United Nations Population Division, New York, ...",
4758,716,Zimbabwe,2010,Population density,32.8234,,"United Nations Population Division, New York, ...",
4765,716,Zimbabwe,2017,Population density,36.8013,,"United Nations Population Division, New York, ...",


In [17]:
### Freedom House stats

In [18]:
#Freedon House stats
def country_freedom():
    global_freedom = str(DATA_PARENT/'data/freedom_house/Global_Freedom.csv')
    df_global_free = pd.read_csv(global_freedom)
    internet_freedom = str(DATA_PARENT/'data/freedom_house/Internet_Freedom.csv')
    df_internet_free = pd.read_csv(internet_freedom)
    return df_global_free, df_internet_free
df_global_freedom, df_internet_freedom = country_freedom()

In [19]:
#csse countries
df_deaths = pd.read_csv(deaths_url, error_bad_lines=False)
df_confirmed = pd.read_csv(confirmed_url, error_bad_lines=False)
df_recovered = pd.read_csv(recovered_url, error_bad_lines=False)
csse_countries = []
for df in [df_deaths, df_confirmed, df_recovered]:
    c = set(df["Country/Region"].unique())
    csse_countries.append(c)
csse_countries = [item for sublist in csse_countries for item in sublist]
csse_countries = list(set(csse_countries))

## CSSE

In [20]:
# Get data on deaths D_t
df_deaths = pd.read_csv(deaths_url, error_bad_lines=False)
df_deaths = df_deaths.drop(columns=["Lat", "Long"])
df_deaths = df_deaths.melt(id_vars= ["Province/State", "Country/Region"])
df_deaths = pd.DataFrame(df_deaths.groupby(['Country/Region', "variable"]).sum())
df_deaths.reset_index(inplace=True)  
df_deaths = df_deaths.rename(columns={"Country/Region": "location", "variable": "date", "value": "total_deaths"})
df_deaths['date'] =pd.to_datetime(df_deaths.date)
df_deaths = df_deaths.sort_values(by = "date")
df_deaths.loc[df_deaths.location == "US","location"] = "United States"
df_deaths.loc[df_deaths.location == "Korea, South","location"] = "South Korea"

In [21]:
#confirmed

In [22]:
df_confirmed = pd.read_csv(confirmed_url, error_bad_lines=False)
df_confirmed = df_confirmed.drop(columns=["Lat", "Long"])
df_confirmed = df_confirmed.melt(id_vars= ["Province/State", "Country/Region"])
df_confirmed = pd.DataFrame(df_confirmed.groupby(['Country/Region', "variable"]).sum())
df_confirmed.reset_index(inplace=True)  
df_confirmed = df_confirmed.rename(columns={"Country/Region": "location", "variable": "date", "value": "total_cases"})
df_confirmed['date'] =pd.to_datetime(df_confirmed.date)
df_confirmed = df_confirmed.sort_values(by = "date")
df_confirmed.loc[df_confirmed.location == "US","location"] = "United States"
df_confirmed.loc[df_confirmed.location == "Korea, South","location"] = "South Korea"

In [23]:
df_confirmed.head()

Unnamed: 0,location,date,total_cases
0,Afghanistan,2020-01-22,0
3685,Eswatini,2020-01-22,0
6432,Libya,2020-01-22,0
1407,Bosnia and Herzegovina,2020-01-22,0
11256,United Arab Emirates,2020-01-22,0


In [24]:
df_final = pd.merge(df_deaths,
                 df_confirmed)

In [25]:
df_final.head()

Unnamed: 0,location,date,total_deaths,total_cases
0,Afghanistan,2020-01-22,0,0
1,Eswatini,2020-01-22,0,0
2,Libya,2020-01-22,0,0
3,Bosnia and Herzegovina,2020-01-22,0,0
4,United Arab Emirates,2020-01-22,0,0


In [26]:
df_final["CFR"] = df_final["total_deaths"]/df_final["total_cases"]
df_final["total_infected"] = np.NaN
df_final = df_final.sort_values(by = ['location', 'date'])
df_final = df_final.reset_index(drop = True)

In [27]:
df_un_pop_per_country=pd.read_csv(DATA_PARENT/'data/un/df_un_pop_per_country_info.csv')

In [28]:
def get_country_list(pop_cutoff=5.0):
    pop_nmill=df_un_pop_per_country.loc[df_un_pop_per_country['Value'] >= pop_cutoff]
    countries_n_plus=pop_nmill.Country.tolist()
    return countries_n_plus

In [29]:
csse_countries.sort()
csse_countries

['Afghanistan',
 'Albania',
 'Algeria',
 'Andorra',
 'Angola',
 'Antigua and Barbuda',
 'Argentina',
 'Armenia',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bhutan',
 'Bolivia',
 'Bosnia and Herzegovina',
 'Brazil',
 'Brunei',
 'Bulgaria',
 'Burkina Faso',
 'Burma',
 'Cabo Verde',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Central African Republic',
 'Chad',
 'Chile',
 'China',
 'Colombia',
 'Congo (Brazzaville)',
 'Congo (Kinshasa)',
 'Costa Rica',
 "Cote d'Ivoire",
 'Croatia',
 'Cuba',
 'Cyprus',
 'Czechia',
 'Denmark',
 'Diamond Princess',
 'Djibouti',
 'Dominica',
 'Dominican Republic',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'Equatorial Guinea',
 'Eritrea',
 'Estonia',
 'Eswatini',
 'Ethiopia',
 'Fiji',
 'Finland',
 'France',
 'Gabon',
 'Gambia',
 'Georgia',
 'Germany',
 'Ghana',
 'Greece',
 'Grenada',
 'Guatemala',
 'Guinea',
 'Guinea-Bissau',
 'Guyana',
 'Haiti',
 'Holy See',
 'Honduras',


In [30]:
csse_countries=list(map(lambda x: x if x != 'Korea, South' else "South Kores", csse_countries))

In [31]:
countries_n_plus = get_country_list(pop_cutoff=5.0)

In [32]:
for j in countries_n_plus:
    if not j in csse_countries:
        print(j)

South Korea


In [33]:
for j in countries_n_plus:
    for i in df_final["date"].unique()[0:-8]:
        numer =  df_final.loc[(df_final.date == i + np.timedelta64(8, 'D')) & (df_final.location == j), "total_deaths"].iloc[0]
        denom = df_final.loc[(df_final.date == i + np.timedelta64(8, 'D')) & (df_final.location == j), "CFR"].iloc[0]
        df_final.loc[(df_final.date == i) & (df_final.location == j), "total_infected"] = numer/denom

  """


In [34]:
df_final.head()

Unnamed: 0,location,date,total_deaths,total_cases,CFR,total_infected
0,Afghanistan,2020-01-22,0,0,,
1,Afghanistan,2020-01-23,0,0,,
2,Afghanistan,2020-01-24,0,0,,
3,Afghanistan,2020-01-25,0,0,,
4,Afghanistan,2020-01-26,0,0,,


In [35]:
# Estimate growth rate of infected, g        
df_final['infected_g'] = np.log(df_final['total_infected'])
df_final['infected_g'] = df_final['infected_g'].diff() 

In [36]:
# Estimate number of infected given g
today = df_final.date.iloc[-1]
for j in countries_n_plus:
    for i in range(7,-1,-1):
        df_final.loc[(df_final.location == j) & (df_final.date == today - timedelta(i)), "total_infected"] = df_final.loc[df_final.location == j, "total_infected"].iloc[-i-2]*(1+df_final.loc[df_final.location == j, "infected_g"].aggregate(func = "mean"))

In [37]:
data_pc = df_final[['location', 'date', 'total_infected']].copy()

In [38]:
data_countries = []
data_countries_pc = []

In [39]:
for i in countries_n_plus:
    data_pc.loc[data_pc.location == i,"total_infected"] = data_pc.loc[data_pc.location == i,"total_infected"]

In [40]:
# Get each country time series
filter1 = data_pc["total_infected"] > 1
for i in countries_n_plus:
    filter_country = data_pc["location"]== i
    data_countries_pc.append(data_pc[filter_country & filter1])      

In [41]:
len(data_countries_pc)

105

In [42]:
data_countries_pc[0]

Unnamed: 0,location,date,total_infected
52,Afghanistan,2020-03-14,40.0
53,Afghanistan,2020-03-15,40.0
54,Afghanistan,2020-03-16,74.0
55,Afghanistan,2020-03-17,84.0
56,Afghanistan,2020-03-18,94.0
57,Afghanistan,2020-03-19,110.0
58,Afghanistan,2020-03-20,110.0
59,Afghanistan,2020-03-21,128.546017
60,Afghanistan,2020-03-22,150.218895
61,Afghanistan,2020-03-23,175.545823


## Estimated Infected Population By Country

by days since outbreak

In [43]:
# Lastest Country Estimates  
label = 'Total_Infected'
temp = pd.concat([x.copy() for x in data_countries_pc]).loc[lambda x: x.date >= '3/1/2020']

In [44]:
metric_name = f'{label}'
temp.columns = ['Country', 'Date', metric_name]
# temp.loc[:, 'month'] = temp.date.dt.strftime('%Y-%m')
temp.loc[:, "Total_Infected"] = temp.loc[:, "Total_Infected"].round(0)  
temp.groupby('Country').last()

Unnamed: 0_level_0,Date,Total_Infected
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
Afghanistan,2020-03-28,383.0
Algeria,2020-03-28,1750.0
Argentina,2020-03-28,3018.0
Australia,2020-03-28,13834.0
Austria,2020-03-28,37228.0
...,...,...
United Arab Emirates,2020-03-28,1440.0
United Kingdom,2020-03-28,83653.0
Uzbekistan,2020-03-28,358.0
Venezuela,2020-03-28,267.0


## Infected vs. number of confirmed cases
> Allows you to compare how countries have been tracking the true number of infected people. 
The smaller deviation from the dashed line (45 degree line) the better job at tracking the true number of infected people.

In [45]:
data_pc = df_final.copy()

In [46]:
data_countries = []
data_countries_pc = []

In [47]:
for i in countries_n_plus:
    data_pc.loc[data_pc.location == i,"total_infected"] = data_pc.loc[data_pc.location == i,"total_infected"]
    data_pc.loc[data_pc.location == i,"total_cases"] = data_pc.loc[data_pc.location == i,"total_cases"]
    # get each country time series
filter1 = data_pc["total_infected"] > 1
for i in countries_n_plus:
    filter_country = data_pc["location"]== i
    data_countries_pc.append(data_pc[filter_country & filter1])

In [48]:
type(data_countries_pc[0])

pandas.core.frame.DataFrame

In [49]:
data_countries_pc[0]

Unnamed: 0,location,date,total_deaths,total_cases,CFR,total_infected,infected_g
52,Afghanistan,2020-03-14,0,11,0.0,40.0,
53,Afghanistan,2020-03-15,0,16,0.0,40.0,0.0
54,Afghanistan,2020-03-16,0,21,0.0,74.0,0.615186
55,Afghanistan,2020-03-17,0,22,0.0,84.0,0.126752
56,Afghanistan,2020-03-18,0,22,0.0,94.0,0.112478
57,Afghanistan,2020-03-19,0,22,0.0,110.0,0.157186
58,Afghanistan,2020-03-20,0,24,0.0,110.0,0.0
59,Afghanistan,2020-03-21,0,24,0.0,128.546017,
60,Afghanistan,2020-03-22,1,40,0.025,150.218895,
61,Afghanistan,2020-03-23,1,40,0.025,175.545823,


In [50]:
def get_df_country(country):
    for i, df in enumerate(data_countries_pc):
        if len(df.loc[df['location'] == country]):
            print(f'country: {country}, index: {i}')
        

In [51]:
get_df_country('Italy')

country: Italy, index: 47


In [52]:
data_countries_pc[47]

Unnamed: 0,location,date,total_deaths,total_cases,CFR,total_infected,infected_g
5516,Italy,2020-02-13,0,3,0.0,20.0,
5517,Italy,2020-02-14,0,3,0.0,62.0,1.131402
5518,Italy,2020-02-15,0,3,0.0,155.0,0.916291
5519,Italy,2020-02-16,0,3,0.0,229.0,0.390297
5520,Italy,2020-02-17,0,3,0.0,322.0,0.34083
5521,Italy,2020-02-18,0,3,0.0,453.0,0.341341
5522,Italy,2020-02-19,0,3,0.0,655.0,0.368743
5523,Italy,2020-02-20,0,3,0.0,888.0,0.304337
5524,Italy,2020-02-21,1,20,0.05,1128.0,0.23923
5525,Italy,2020-02-22,2,62,0.032258,1694.0,0.406646


In [79]:
df_all_data_countries_pc=pd.concat(data_countries_pc)

In [81]:
df_all_data_countries_pc.tail()

Unnamed: 0,location,date,total_deaths,total_cases,CFR,total_infected,infected_g
11854,Zimbabwe,2020-03-24,1,3,0.333333,13.092992,
11855,Zimbabwe,2020-03-25,1,3,0.333333,15.311724,
11856,Zimbabwe,2020-03-26,1,3,0.333333,17.906443,
11857,Zimbabwe,2020-03-27,1,5,0.2,20.940861,
11858,Zimbabwe,2020-03-28,1,7,0.142857,24.48949,


In [None]:
#### save all pred as one df

In [82]:
df_all_data_countries_pc.to_csv(DATA_PARENT/'data/processed/csse/df_all_data_countries_pc.csv')

In [None]:
### Combine last day only pred with un and freedom house data

In [53]:
df_country_un_stats = pd.read_csv(DATA_PARENT/'data/un/df_un_merged_stats.csv')

In [60]:
df_country_un_stats.rename(columns={'Country': 'location'}, inplace=True)

In [61]:
idx = data_countries_pc[0].groupby(['location'])['date'].transform(max) == data_countries_pc[0]['date']
sub_df=data_countries_pc[0][idx]
sub_df

Unnamed: 0,location,date,total_deaths,total_cases,CFR,total_infected,infected_g
66,Afghanistan,2020-03-28,4,110,0.036364,382.578176,


In [62]:
sub_df.iloc[0]['location']

'Afghanistan'

In [63]:
df_country_un_stats.head()

Unnamed: 0,location,Population_million,Population_density,Population_60+,Physicians_per_1000_pop
0,Afghanistan,38.0418,58.2694,4.1655,0.3
1,Albania,2.8809,105.143,20.4779,1.2
2,Algeria,43.0531,18.0763,9.6817,1.8
3,Andorra,0.0771,164.1319,19.0387,3.3
4,Angola,31.8253,25.5276,3.6243,0.2


In [None]:
### freedom house

In [72]:
df_freedomhouse_merged = pd.read_csv(DATA_PARENT/'data/freedom_house/df_freedomhouse_merged.csv')

In [73]:
df_freedomhouse_merged.head()

Unnamed: 0,Country,Global_Score,Global_Status,Political Rights,Civil Liberties,Internet_Score,Internet_Access_Obstacles,Internet_Content_Limit,Internet_User_Rights_Violations,Internet_Status
0,Canada,98,Free,40,58,87,23,33,31,Free
1,Australia,97,Free,40,57,77,23,29,25,Free
2,Japan,96,Free,40,56,73,21,28,24,Free
3,Estonia,94,Free,38,56,94,25,32,37,Free
4,Germany,94,Free,39,55,80,22,30,28,Free


In [74]:
df_freedomhouse_merged.rename(columns={'Country': 'location'}, inplace=True)

In [76]:
frames=[]
for df in data_countries_pc:
    idx = df.groupby(['location'])['date'].transform(max) == df['date']
    sub_df=df[idx]
    if len(sub_df)>0:
        #print(f'sub_df: {sub_df}')
        country=sub_df.iloc[0]['location']
        un_df=df_country_un_stats.loc[df_country_un_stats['location'] == country]
        #print(f'un_df: {un_df}')
        df_merged=pd.merge(sub_df, un_df)
        #freedom house data
        fh_df=df_freedomhouse_merged.loc[df_freedomhouse_merged['location'] == country]
        df_merged=pd.merge(df_merged, fh_df)
        frames.append(df_merged)
df_all_un_fh=pd.concat(frames)


In [77]:
df_all_un_fh.head()

Unnamed: 0,location,date,total_deaths,total_cases,CFR,total_infected,infected_g,Population_million,Population_density,Population_60+,Physicians_per_1000_pop,Global_Score,Global_Status,Political Rights,Civil Liberties,Internet_Score,Internet_Access_Obstacles,Internet_Content_Limit,Internet_User_Rights_Violations,Internet_Status
0,Argentina,2020-03-28,18,690,0.026087,3018.475269,,44.7807,16.3631,15.3941,4.0,85,Free,35,50,72,19,26,27,Free
0,Australia,2020-03-28,14,3640,0.003846,13833.806652,,25.2032,3.2807,21.4395,3.6,97,Free,40,57,77,23,29,25,Free
0,Azerbaijan,2020-03-28,4,182,0.021978,623.509155,,10.0477,121.5577,11.0734,3.4,10,Not Free,2,8,39,11,15,13,Not Free
0,Bangladesh,2020-03-28,5,48,0.104167,121.602482,,163.0462,1252.5633,7.7418,0.5,39,Partly Free,15,24,44,13,17,14,Partly Free
0,Brazil,2020-03-28,111,3904,0.028432,20072.66376,,211.0495,25.2508,13.5897,2.1,75,Free,31,44,64,18,26,20,Partly Free


In [78]:
df_all_un_fh.to_csv(DATA_PARENT/'data/processed/csse/df_data_countries_pc_latest.csv')

## Methodology

We argue that the number of infected in the past can be infered using today's number of deaths and average fatality rate from confirmed cases in the following way:

{% raw %}
$$ I_{t-j} = \frac{D_t}{{CFR}_t}$$
{% endraw %}

where {% raw %}$I_t${% endraw %} = number of infected, {% raw %}$D_t${% endraw %} = number of deaths, and {% raw %}${CFR}_t ${% endraw %} = case fatality rate = {% raw %}$\frac{D}{C}${% endraw %}. The {% raw %}$j${% endraw %} depends on the average number of days that covid patients die after having the first symptoms.

**Assumption 1**: The case fatality rate is a good proxy for the fatality rate of the infected population


Then, in order to estimate the current number of infected {% raw %}$I_t${% endraw %} we need to estimate its growth rate from {% raw %}$t-j${% endraw %} to {% raw %}$t${% endraw %}.

{% raw %}
$$I_t = (1+\hat{g})^j I_{t-j}$$
{% endraw %}

**Assumption 2**: The growth rate of infected $\hat{g}$ is an unbiased estimate of $g$ .

For now we estimate $g$ using the average growth rate since having the first infected person.

**Assumption 3**: It takes on average 8 days to day after having the first symptoms.

This analysis was conducted by [Joao B. Duarte](https://www.jbduarte.com). Relevant sources are listed below: 


1. [2019 Novel Coronavirus COVID-19 (2019-nCoV) Data Repository by Johns Hopkins CSSE](https://systems.jhu.edu/research/public-health/ncov/) [GitHub repository](https://github.com/CSSEGISandData/COVID-19). 

2. [Feenstra, Robert C., Robert Inklaar and Marcel P. Timmer (2015), "The Next Generation of the Penn World Table" American Economic Review, 105(10), 3150-3182](https://www.rug.nl/ggdc/productivity/pwt/related-research)
