In [1]:
import sys
sys.path.append('..')

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import timedelta, datetime, date
import os
from utils import data_paths, load_config
from pathlib import Path
from nltk.metrics import edit_distance #(Levenshtein)
import pycountry

Align CSSE, UN, Freedom House country names

In [3]:
LOCAL_FILES=True
#jupyter or script
IS_SCRIPT = False

In [4]:
os.getcwd()

'/mnt/963GB/Data/Python/ACode/medical/covid19/forecaster/covidforecaster/tools'

In [5]:
if IS_SCRIPT:
    RUN_PATH = Path(os.path.realpath(__file__))
    DATA_PARENT = RUN_PATH.parent.parent
else:
    #for jupyter
    cw=!pwd
    RUN_PATH = Path(cw[0])
    DATA_PARENT = RUN_PATH.parent

In [6]:
csse_data = data_paths('csse_data_paths.yml')
#for script
#csse_data = data_paths('tools/csse_data_paths.yml')

In [7]:
if LOCAL_FILES:
    confirmed_url=csse_data.get("csse_ts_local", {}).get('confirmed', {})
    deaths_url=csse_data.get("csse_ts_local", {}).get('deaths', {})
    recovered_url=csse_data.get("csse_ts_local", {}).get('recovered', {})
    
    confirmed_url = str(DATA_PARENT/confirmed_url)
    deaths_url = str(DATA_PARENT/deaths_url)
    recovered_url = str(DATA_PARENT/recovered_url)
else:
    confirmed_url=csse_data.get("csse_ts_global", {}).get('confirmed', {})
    deaths_url=csse_data.get("csse_ts_global", {}).get('deaths', {})
    recovered_url=csse_data.get("csse_ts_global", {}).get('recovered', {})

In [8]:
#Geonames stats
def geonames_country_info():
    countryinfo = str(DATA_PARENT/'data/geonames/countryInfo_csv.csv')
    df_country_info = pd.read_csv(countryinfo)
    print(df_country_info.head())
    return df_country_info
df_country_info = geonames_country_info()

  ISO ISO3                           Country      Capital  Area(in sq km)  \
0  NG  NGA                           Nigeria        Abuja        923768.0   
1  ET  ETH                          Ethiopia  Addis Ababa       1127127.0   
2  EG  EGY                             Egypt        Cairo       1001450.0   
3  CD  COD  Democratic Republic of the Congo     Kinshasa       2345410.0   
4  ZA  ZAF                      South Africa     Pretoria       1219912.0   

   Population Continent  
0   154000000        AF  
1    88013491        AF  
2    80471869        AF  
3    70916439        AF  
4    49000000        AF  


In [9]:
#UN statistics per country
def un_stats():
    df_un_urban_growth_info = pd.read_csv(str(DATA_PARENT/'data/un/SYB61_253_PopulationGrowthRatesinUrbanareasandCapitalcities.csv'), encoding='utf8')
    df_un_pop_density_info = pd.read_csv(str(DATA_PARENT/'data/un/SYB62_1_201907_Population_SurfaceAreaandDensity.csv'), encoding='utf8')
    df_un_health_info = pd.read_csv(str(DATA_PARENT/'data/un/SYB62_154_201906_HealthPersonnel.csv'), encoding='utf8')
    df_un_tourism_info = pd.read_csv(str(DATA_PARENT/'data/un/SYB62_176_201904_Tourist-VisitorsArrivalandExpenditure.csv'), encoding='utf8')
    df_un_gdp_info = pd.read_csv(str(DATA_PARENT/'data/un/SYB62_230_201904_GDPandGDPPerCapita.csv'), encoding='utf8')
    df_un_edu_info = pd.read_csv(str(DATA_PARENT/'data/un/SYB62_245_201905_PublicExpenditureonEducation.csv'), encoding='utf8')
    df_un_pop_growth_info = pd.read_csv(str(DATA_PARENT/'data/un/SYB62_246_201907_PopulationGrowthFertilityandMortalityIndicators.csv'), encoding='utf8')
    df_un_gdrp_rnd_info = pd.read_csv(str(DATA_PARENT/'data/un/SYB62_286_201904_GDPonR&D.csv'), encoding='utf8')
    df_un_education_info = pd.read_csv(str(DATA_PARENT/'data/un/SYB62_309_201906_Education.csv'), encoding='utf8')
    df_un_sanitation_info = pd.read_csv(str(DATA_PARENT/'data/un/SYB62_315_201906_WaterandSanitationServices.csv'), encoding='utf8')
    df_un_health_expenditure_info = pd.read_csv(str(DATA_PARENT/'data/un/SYB62_325_201906_ExpenditureonHealth.csv'), encoding='utf8')
    df_un_immigration_info = pd.read_csv(str(DATA_PARENT/'data/un/SYB62_327_201907_InternationalMigrantsandRefugees.csv'), encoding='utf8')
    df_un_trading_info = pd.read_csv(str(DATA_PARENT/'data/un/SYB62_330_201907_MajorTradingPartners.csv'), encoding='utf8')
    df_un_land_info = pd.read_csv(str(DATA_PARENT/'data/un/SYB62_145_201904_Land.csv'), encoding='utf8')
    return df_un_urban_growth_info, df_un_pop_density_info, df_un_health_info, df_un_tourism_info, \
        df_un_gdp_info, df_un_edu_info, df_un_pop_growth_info, \
        df_un_gdrp_rnd_info, df_un_education_info, df_un_sanitation_info, df_un_health_expenditure_info, \
        df_un_immigration_info, df_un_trading_info, df_un_land_info

In [10]:
df_un_urban_growth_info, df_un_pop_density_info, df_un_health_info, df_un_tourism_info, df_un_gdp_info, \
df_un_edu_info, df_un_pop_growth_info, \
df_un_gdrp_rnd_info, df_un_education_info, df_un_sanitation_info, df_un_health_expenditure_info, \
df_un_immigration_info, df_un_trading_info, df_un_land_info = un_stats()

In [11]:
df_un_land_info.head()

Unnamed: 0,Region/Country/Area,Country,Year,Series,Value,Footnotes,Source
0,1,"Total, all countries or areas",2005,Land area (thousand hectares),13010426.0,,Food and Agriculture Organization of the Unite...
1,1,"Total, all countries or areas",2005,Arable land (thousand hectares),1405868.0,,Food and Agriculture Organization of the Unite...
2,1,"Total, all countries or areas",2005,Permanent crops (thousand hectares),148343.0,,Food and Agriculture Organization of the Unite...
3,1,"Total, all countries or areas",2005,Forest cover (thousand hectares),4032743.0,,Food and Agriculture Organization of the Unite...
4,1,"Total, all countries or areas",2005,Arable land (% of total land area),10.8,,Food and Agriculture Organization of the Unite...


In [12]:
df_un_pop_density_info.head()

Unnamed: 0,Region/Country/Area,Country,Year,Series,Value,Footnotes,Source,Unnamed: 7
0,1,"Total, all countries or areas",2005,Population mid-year estimates (millions),6541.907,,"United Nations Population Division, New York, ...",
1,1,"Total, all countries or areas",2005,Population mid-year estimates for males (milli...,3296.4853,,"United Nations Population Division, New York, ...",
2,1,"Total, all countries or areas",2005,Population mid-year estimates for females (mil...,3245.4217,,"United Nations Population Division, New York, ...",
3,1,"Total, all countries or areas",2005,Sex ratio (males per 100 females),101.5734,,"United Nations Population Division, New York, ...",
4,1,"Total, all countries or areas",2005,Population aged 0 to 14 years old (percentage),28.1425,,"United Nations Population Division, New York, ...",


In [18]:
#Burma = Myanmar 
#US = United States
#taiwan* = Taiwan

#Hack to use CSSE country names as default

In [19]:
#countries_n_plus=map(lambda x: x if x != 'Myanmar' else 'Burma', countries_n_plus)
#countries_n_plus=map(lambda x: x if x != 'United States' else 'US', countries_n_plus)
#countries_n_plus=map(lambda x: x if x != 'Taiwan' else 'Taiwan*', countries_n_plus)
#countries_n_plus=map(lambda x: x if x != 'Republic of Korea' else 'Korea, South', countries_n_plus)

In [20]:
un_countries = []
for df in [df_un_urban_growth_info, df_un_pop_density_info, df_un_health_info, df_un_tourism_info, df_un_gdp_info, df_un_edu_info, df_un_pop_growth_info, \
df_un_gdrp_rnd_info, df_un_education_info, df_un_sanitation_info, df_un_health_expenditure_info, \
df_un_immigration_info, df_un_trading_info]:
    c = set(df.Country.unique())
    un_countries.append(c)
un_countries = [item for sublist in un_countries for item in sublist]
un_countries = list(set(un_countries))

In [21]:
un_countries[:5]

['Czechia', 'Saint Helena', 'Nepal', 'United States Virgin Islands', 'Ghana']

In [22]:
#Freedon House stats
def country_freedom():
    global_freedom = str(DATA_PARENT/'data/freedom_house/Global_Freedom.csv')
    df_global_free = pd.read_csv(global_freedom)
    internet_freedom = str(DATA_PARENT/'data/freedom_house/Internet_Freedom.csv')
    df_internet_free = pd.read_csv(internet_freedom)
    return df_global_free, df_internet_free
df_global_freedom, df_internet_freedom = country_freedom()

In [23]:
freedom_house_countries = []
for df in [df_global_freedom, df_internet_freedom]:
    c = set(df.Country.unique())
    freedom_house_countries.append(c)
freedom_house_countries = [item for sublist in freedom_house_countries for item in sublist]
freedom_house_countries = list(set(freedom_house_countries))

In [24]:
freedom_house_countries[:5]

['Nepal', 'Ghana', 'Peru', 'Germany', 'Marshall Islands']

In [25]:
df_global_freedom.tail()

Unnamed: 0,Country,Toral Score,Status,Political Rights,Civil Liberties
205,China,10,Not Free,-1,11
206,South Ossetia*,10,Not Free,2,8
207,Uzbekistan,10,Not Free,2,8
208,Tibet*,1,Not Free,-2,3
209,Syria,0,Not Free,-3,3


In [26]:
df_internet_freedom.tail()

Unnamed: 0,Country,Total Score,Obstacles to Access,Limits on Content,Violations of User Rights,Status
60,Vietnam,24,12,7,5,Not Free
61,Cuba,22,5,10,7,Not Free
62,Syria,17,6,8,3,Not Free
63,Iran,15,7,5,3,Not Free
64,China,10,8,2,0,Not Free


In [27]:
#csse countries
df_deaths = pd.read_csv(deaths_url, error_bad_lines=False)
df_confirmed = pd.read_csv(confirmed_url, error_bad_lines=False)
df_recovered = pd.read_csv(recovered_url, error_bad_lines=False)
csse_countries = []
for df in [df_deaths, df_confirmed, df_recovered]:
    c = set(df["Country/Region"].unique())
    csse_countries.append(c)
csse_countries = [item for sublist in csse_countries for item in sublist]
csse_countries = list(set(csse_countries))

In [28]:
#Ideally use UN country names as gold standard, but easier to just work with csse countries as they are

#CSSE countries not in UN list
csse_un_country_diff = list(set(csse_countries) - set(un_countries))
print(csse_un_country_diff)

['Russia', 'Brunei', 'Laos', 'Taiwan*', 'Iran', 'Syria', 'Congo (Brazzaville)', 'Diamond Princess', 'Burma', 'Venezuela', 'Moldova', 'Tanzania', "Cote d'Ivoire", 'Korea, South', 'Vietnam', 'Bolivia', 'Congo (Kinshasa)', 'Saint Vincent and the Grenadines', 'West Bank and Gaza', 'US']


In [29]:
#CSSE countries not in freedom_house list
csse_freedom_house_country_diff = list(set(csse_countries) - set(freedom_house_countries))
print(csse_freedom_house_country_diff)

['Czechia', 'Saint Lucia', 'Taiwan*', 'Diamond Princess', 'Congo (Brazzaville)', 'Burma', "Cote d'Ivoire", 'Gambia', 'Saint Kitts and Nevis', 'Korea, South', 'Bahamas', 'Holy See', 'Congo (Kinshasa)', 'Saint Vincent and the Grenadines', 'West Bank and Gaza', 'US']


In [30]:
#Hard codings - change csse:
#Congo (Kinshasa) = Democratic Republic of the Congo' 
#Congo (Brazzaville) = Republic of the Congo 
#Burma = Myanmar 
#US = United States
#'Saint Vincent and the Grenadines'='Saint Vincent & Grenadines'

#not in UN list
#taiwan* = Taiwan

#Not a country
#N/A = diamond princess

In [31]:
#canme_change=['Congo (Kinshasa)','Congo (Brazzaville)','Burma']
#un_change=['Democratic Republic of the Congo','Republic of the Congo','Myanmar']
#for cname, uname in zip[canme_change, un_change]
#    df_deaths['Country/Region'] = df_deaths['Country/Region'].str.replace(cname,uname)
#    df_deaths['Country/Region'] = df_deaths['Country/Region'].str.replace(cname,uname)
#    df_deaths['Country/Region'] = df_deaths['Country/Region'].str.replace(cname,uname)

In [32]:
#FH countries not in CSSE list
fh_csse_country_diff = list(set(freedom_house_countries) - set(csse_countries))
print(fh_csse_country_diff)

['South Sudan', 'Burundi', 'Sierra Leone', 'Tajikistan', 'Comoros', 'Marshall Islands', 'Taiwan', 'St. Lucia', 'St. Vincent and the Grenadines', 'Czech Republic', 'Malawi', 'Solomon Islands', 'Micronesia', 'Tonga', 'The Bahamas', 'Northern Cyprus*', 'Myanmar', 'Palau', 'Vanuatu', 'Somaliland*', 'Republic of the Congo', 'Indian Kashmir*', 'Botswana', 'St. Kitts and Nevis', 'Samoa', 'United States', 'South Korea', 'Western Sahara*', 'Transnistria*', 'Nagorno-Karabakh*', 'Tibet*', 'Turkmenistan', 'West Bank*', 'Pakistani Kashmir*', 'Tuvalu', 'Gaza Strip*', "Côte d'Ivoire", 'Democratic Republic of the Congo', 'Abkhazia*', 'South Ossetia*', 'Crimea*', 'Yemen', 'Lesotho', 'The Gambia', 'São Tomé and Príncipe ', 'Kiribati', 'Eastern Donbas*', 'Nauru', 'North Korea', 'Hong Kong*']


In [33]:
#UN countries not in CSSE list
un_csse_country_diff = list(set(un_countries) - set(csse_countries))
print(un_csse_country_diff)

['Saint Helena', 'United States Virgin Islands', 'Venezuela (Boliv. Rep. of)', 'Marshall Islands', 'South America', 'Cura�ao', 'Americas', 'Melanesia', 'China, Hong Kong SAR', 'Saba', 'Vanuatu', 'Guam', 'Northern Mariana Islands', 'Europe', 'Falkland Islands (Malvinas)', 'TFYR of Macedonia', 'Iran (Islamic Republic of)', 'Congo', 'Bonaire, St. Eustatius & Saba', 'Russian Federation', 'Cayman Islands', 'Oceania', 'Tuvalu', 'Montserrat', 'Côte d’Ivoire', 'Caribbean', 'Aruba', 'Channel Islands', 'C�te d�Ivoire', 'China, Macao SAR', 'Kiribati', 'Cook Islands', 'Martinique', 'Western Europe', 'Southern Europe', 'Other non-specified areas', 'Gibraltar', 'Saint Vincent & Grenadines', 'Western Sahara', 'Viet Nam', 'Eastern Europe', 'Solomon Islands', 'British Virgin Islands', 'Greenland', 'Central Asia', 'Myanmar', 'French Polynesia', 'Sudan [former]', 'Bermuda', "Lao People's Dem. Rep.", 'Netherlands Antilles [former]', 'Syrian Arab Republic', 'United Rep. of Tanzania', 'Northern America', 'S

In [34]:
csse_countries.sort()
csse_countries

['Afghanistan',
 'Albania',
 'Algeria',
 'Andorra',
 'Angola',
 'Antigua and Barbuda',
 'Argentina',
 'Armenia',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bhutan',
 'Bolivia',
 'Bosnia and Herzegovina',
 'Brazil',
 'Brunei',
 'Bulgaria',
 'Burkina Faso',
 'Burma',
 'Cabo Verde',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Central African Republic',
 'Chad',
 'Chile',
 'China',
 'Colombia',
 'Congo (Brazzaville)',
 'Congo (Kinshasa)',
 'Costa Rica',
 "Cote d'Ivoire",
 'Croatia',
 'Cuba',
 'Cyprus',
 'Czechia',
 'Denmark',
 'Diamond Princess',
 'Djibouti',
 'Dominica',
 'Dominican Republic',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'Equatorial Guinea',
 'Eritrea',
 'Estonia',
 'Eswatini',
 'Ethiopia',
 'Fiji',
 'Finland',
 'France',
 'Gabon',
 'Gambia',
 'Georgia',
 'Germany',
 'Ghana',
 'Greece',
 'Grenada',
 'Guatemala',
 'Guinea',
 'Guinea-Bissau',
 'Guyana',
 'Haiti',
 'Holy See',
 'Honduras',


In [35]:
un_countries.sort()

In [36]:
un_countries

['Afghanistan',
 'Africa',
 'Albania',
 'Algeria',
 'American Samoa',
 'Americas',
 'Andorra',
 'Angola',
 'Anguilla',
 'Antigua and Barbuda',
 'Argentina',
 'Armenia',
 'Aruba',
 'Asia',
 'Australia',
 'Australia and New Zealand',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bermuda',
 'Bhutan',
 'Bolivia (Plurin. State of)',
 'Bonaire',
 'Bonaire, St. Eustatius & Saba',
 'Bosnia and Herzegovina',
 'Botswana',
 'Brazil',
 'British Virgin Islands',
 'Brunei Darussalam',
 'Bulgaria',
 'Burkina Faso',
 'Burundi',
 'Cabo Verde',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Caribbean',
 'Cayman Islands',
 'Central African Republic',
 'Central America',
 'Central Asia',
 'Chad',
 'Channel Islands',
 'Chile',
 'China',
 'China, Hong Kong SAR',
 'China, Macao SAR',
 'Colombia',
 'Comoros',
 'Congo',
 'Cook Islands',
 'Costa Rica',
 'Croatia',
 'Cuba',
 'Curaçao',
 'Cura�ao',
 'Cyprus',
 'Czechia',
 'Côte d’Ivoire',
 'C

In [37]:
freedom_house_countries.sort()
freedom_house_countries

['Abkhazia*',
 'Afghanistan',
 'Albania',
 'Algeria',
 'Andorra',
 'Angola',
 'Antigua and Barbuda',
 'Argentina',
 'Armenia',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bhutan',
 'Bolivia',
 'Bosnia and Herzegovina',
 'Botswana',
 'Brazil',
 'Brunei',
 'Bulgaria',
 'Burkina Faso',
 'Burundi',
 'Cabo Verde',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Central African Republic',
 'Chad',
 'Chile',
 'China',
 'Colombia',
 'Comoros',
 'Costa Rica',
 'Crimea*',
 'Croatia',
 'Cuba',
 'Cyprus',
 'Czech Republic',
 "Côte d'Ivoire",
 'Democratic Republic of the Congo',
 'Denmark',
 'Djibouti',
 'Dominica',
 'Dominican Republic',
 'Eastern Donbas*',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'Equatorial Guinea',
 'Eritrea',
 'Estonia',
 'Eswatini',
 'Ethiopia',
 'Fiji',
 'Finland',
 'France',
 'Gabon',
 'Gaza Strip*',
 'Georgia',
 'Germany',
 'Ghana',
 'Greece',
 'Grenada',
 'Guatemala',
 'Guinea',
 'Guinea-Bissau',
 'G

In [38]:
pycountry.countries.search_fuzzy('Reunion')

[Country(alpha_2='RE', alpha_3='REU', name='Réunion', numeric='638'),
 Country(alpha_2='FR', alpha_3='FRA', name='France', numeric='250', official_name='French Republic')]

In [39]:
freedom_house_un_country_diff = list(set(freedom_house_countries) - set(un_countries))
print(freedom_house_un_country_diff)

['Iran', 'Taiwan', 'St. Lucia', 'St. Vincent and the Grenadines', 'Czech Republic', 'Russia', 'The Bahamas', 'Laos', 'Northern Cyprus*', 'Syria', 'Somaliland*', 'Republic of the Congo', 'Indian Kashmir*', 'Tanzania', 'St. Kitts and Nevis', 'Bolivia', 'United States', 'South Korea', 'Western Sahara*', 'Transnistria*', 'Nagorno-Karabakh*', 'Brunei', 'Tibet*', 'West Bank*', 'Pakistani Kashmir*', 'Gaza Strip*', "Côte d'Ivoire", 'Moldova', 'Democratic Republic of the Congo', 'Abkhazia*', 'South Ossetia*', 'Crimea*', 'The Gambia', 'São Tomé and Príncipe ', 'Venezuela', 'Eastern Donbas*', 'North Korea', 'Hong Kong*', 'Vietnam']


Replace UN country names

UN	CSSE
 	 


In [40]:
un_replace = ["Bolivia (Plurin. State of)", "Brunei Darussalam", "Dem. Rep. of the Congo",'China, Hong Kong SAR', \
'Iran (Islamic Republic of)',"Lao Peoples Dem. Rep.","Dem. Peoples Rep. Korea","Russian Federation", \
"Republic of Korea","Syrian Arab Republic","United Rep. of Tanzania",'Venezuela (Boliv. Rep. of)', \
"Viet Nam", "State of Palestine"]
replace_with = ["Bolivia","Brunei","Congo (Kinshasa)","Hong Kong*","Iran","Laos",\
              "North Korea","Russia","South Korea","Syria","Tanzania","Venezuela",\
               "Vietnam","West Bank and Gaza"]

Replace CSSE names

Korea, South	 South Korea


In [41]:
df_un_urban_growth_info.head()

Unnamed: 0,Region/Country/Area,Country,Year,Series,Capital City,Capital City footnote,Value,Footnotes,Source,Unnamed: 9
0,1,"Total, all countries or areas",2005,Urban population (percent),,,49.2,,"United Nations Population Division, New York, ...",
1,1,"Total, all countries or areas",2005,Urban population (percent growth rate per annum),,,2.3,Data refers to a 5-year period preceding the r...,"United Nations Population Division, New York, ...",
2,1,"Total, all countries or areas",2005,Rural population (percent growth rate per annum),,,0.3,Data refers to a 5-year period preceding the r...,"United Nations Population Division, New York, ...",
3,1,"Total, all countries or areas",2010,Urban population (percent),,,51.7,,"United Nations Population Division, New York, ...",
4,1,"Total, all countries or areas",2010,Urban population (percent growth rate per annum),,,2.2,Data refers to a 5-year period preceding the r...,"United Nations Population Division, New York, ...",


In [42]:
for df in [df_un_urban_growth_info, df_un_pop_density_info, df_un_health_info, df_un_tourism_info, \
        df_un_gdp_info, df_un_edu_info, df_un_pop_growth_info, \
        df_un_gdrp_rnd_info, df_un_education_info, df_un_sanitation_info, df_un_health_expenditure_info, \
        df_un_immigration_info, df_un_trading_info, df_un_land_info]:
    for un_country, csse_country in zip(un_replace, replace_with):
        df['Country'] = df['Country'].replace(un_country.strip(),csse_country.strip())
    df['Country']= df['Country'].replace("Dem. People's Rep. Korea", 'South Korea') 

In [43]:
#drop not required un couries
csse_countries=list(map(lambda x: x if x != 'Korea, South' else 'South Korea', csse_countries))

In [44]:
csse_countries

['Afghanistan',
 'Albania',
 'Algeria',
 'Andorra',
 'Angola',
 'Antigua and Barbuda',
 'Argentina',
 'Armenia',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bhutan',
 'Bolivia',
 'Bosnia and Herzegovina',
 'Brazil',
 'Brunei',
 'Bulgaria',
 'Burkina Faso',
 'Burma',
 'Cabo Verde',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Central African Republic',
 'Chad',
 'Chile',
 'China',
 'Colombia',
 'Congo (Brazzaville)',
 'Congo (Kinshasa)',
 'Costa Rica',
 "Cote d'Ivoire",
 'Croatia',
 'Cuba',
 'Cyprus',
 'Czechia',
 'Denmark',
 'Diamond Princess',
 'Djibouti',
 'Dominica',
 'Dominican Republic',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'Equatorial Guinea',
 'Eritrea',
 'Estonia',
 'Eswatini',
 'Ethiopia',
 'Fiji',
 'Finland',
 'France',
 'Gabon',
 'Gambia',
 'Georgia',
 'Germany',
 'Ghana',
 'Greece',
 'Grenada',
 'Guatemala',
 'Guinea',
 'Guinea-Bissau',
 'Guyana',
 'Haiti',
 'Holy See',
 'Honduras',


In [51]:
df_un_urban_growth_info= df_un_urban_growth_info[df_un_urban_growth_info.Country.isin(csse_countries)]
df_un_pop_density_info= df_un_pop_density_info[df_un_pop_density_info.Country.isin(csse_countries)]
df_un_health_info= df_un_health_info[df_un_health_info.Country.isin(csse_countries)] 
df_un_tourism_info= df_un_tourism_info[df_un_tourism_info.Country.isin(csse_countries)]
df_un_gdp_info= df_un_gdp_info[df_un_gdp_info.Country.isin(csse_countries)]
df_un_edu_info= df_un_edu_info[df_un_edu_info.Country.isin(csse_countries)]
df_un_pop_growth_info= df_un_pop_growth_info[df_un_pop_growth_info.Country.isin(csse_countries)]
df_un_gdrp_rnd_info= df_un_gdrp_rnd_info[df_un_gdrp_rnd_info.Country.isin(csse_countries)]
df_un_education_info= df_un_education_info[df_un_education_info.Country.isin(csse_countries)] 
df_un_sanitation_info= df_un_sanitation_info[df_un_sanitation_info.Country.isin(csse_countries)]
df_un_health_expenditure_info= df_un_health_expenditure_info[df_un_health_expenditure_info.Country.isin(csse_countries)]
df_un_immigration_info= df_un_immigration_info[df_un_immigration_info.Country.isin(csse_countries)] 
df_un_trading_info= df_un_trading_info[df_un_trading_info.Country.isin(csse_countries)] 
df_un_land_info= df_un_land_info[df_un_land_info.Country.isin(csse_countries)]


In [52]:
df_un_urban_growth_info.to_csv(DATA_PARENT/'data/un/urban_growth_info.csv', index=False)

In [54]:
df_un_pop_density_info.to_csv(DATA_PARENT/'data/un/df_un_pop_density_info.csv', index=False)
df_un_urban_growth_info.to_csv(DATA_PARENT/'data/un/urban_growth_info.csv', index=False)
df_un_health_info.to_csv(DATA_PARENT/'data/un/df_un_health_info.csv', index=False)
df_un_tourism_info.to_csv(DATA_PARENT/'data/un/df_un_tourism_info.csv', index=False)
df_un_gdp_info.to_csv(DATA_PARENT/'data/un/df_un_gdp_info.csv', index=False)
df_un_edu_info.to_csv(DATA_PARENT/'data/un/df_un_edu_info.csv', index=False)
df_un_pop_growth_info.to_csv(DATA_PARENT/'data/un/df_un_pop_growth_info.csv', index=False)
df_un_gdrp_rnd_info.to_csv(DATA_PARENT/'data/un/df_un_gdrp_rnd_info.csv', index=False)
df_un_education_info.to_csv(DATA_PARENT/'data/un/df_un_education_info.csv', index=False)
df_un_sanitation_info.to_csv(DATA_PARENT/'data/un/df_un_sanitation_info.csv', index=False)

df_un_health_expenditure_info.to_csv(DATA_PARENT/'data/un/df_un_health_expenditure_info.csv', index=False)
df_un_immigration_info.to_csv(DATA_PARENT/'data/un/df_un_immigration_info.csv', index=False)
df_un_trading_info.to_csv(DATA_PARENT/'data/un/df_un_trading_info.csv', index=False)
df_un_land_info.to_csv(DATA_PARENT/'data/un/df_un_land_info.csv', index=False)


In [46]:
un_cty_l=list(df_un_urban_growth_info.Country.unique())

In [47]:
un_cty_l.sort()

In [48]:
un_cty_l

['Afghanistan',
 'Albania',
 'Algeria',
 'Andorra',
 'Angola',
 'Antigua and Barbuda',
 'Argentina',
 'Armenia',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bhutan',
 'Bolivia',
 'Bosnia and Herzegovina',
 'Brazil',
 'Brunei',
 'Bulgaria',
 'Burkina Faso',
 'Cabo Verde',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Central African Republic',
 'Chad',
 'Chile',
 'China',
 'Colombia',
 'Congo (Kinshasa)',
 'Costa Rica',
 'Croatia',
 'Cuba',
 'Cyprus',
 'Czechia',
 'Denmark',
 'Djibouti',
 'Dominica',
 'Dominican Republic',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'Equatorial Guinea',
 'Eritrea',
 'Estonia',
 'Eswatini',
 'Ethiopia',
 'Fiji',
 'Finland',
 'France',
 'Gabon',
 'Gambia',
 'Georgia',
 'Germany',
 'Ghana',
 'Greece',
 'Grenada',
 'Guatemala',
 'Guinea',
 'Guinea-Bissau',
 'Guyana',
 'Haiti',
 'Holy See',
 'Honduras',
 'Hungary',
 'Iceland',
 'India',
 'Indonesia',
 'Iran',
 'Iraq',
 'Irela

In [49]:
#Write out UN df's, keeping only countries in csse list

In [55]:
df_pop=df_un_pop_density_info.loc[df_un_pop_density_info['Series'] == 'Population mid-year estimates (millions)']
max_pop = df_pop.groupby(['Country'])['Year', 'Value'].max()

  


In [57]:
max_pop=max_pop.reset_index()

In [58]:
max_pop.head()

Unnamed: 0,Country,Year,Value
0,Afghanistan,2019,38.0418
1,Albania,2019,3.0868
2,Algeria,2019,43.0531
3,Andorra,2019,0.0844
4,Angola,2019,31.8253


In [59]:
max_pop.to_csv(DATA_PARENT/'data/un/df_un_pop_per_country_info.csv', index=False)

In [15]:
def get_country_list(pop_cutoff=5.0):
    pop_nmill=max_pop.loc[max_pop['Value'] >= pop_cutoff]
    countries_n_plus=pop_nmill.Country.tolist()
    return countries_n_plus

In [16]:
countries_n_plus = get_country_list(pop_cutoff=5.0)

In [17]:
len(countries_n_plus)

149

In [49]:
#Write out UN df's, keeping only countries in csse list