In [100]:
import pandas as pd
import sqlite3
import numpy as np
from itertools import product
# Allow up to 500 columns to be shown when a DataFrame is displayed, so wide
# frames are not truncated in cell output.
pd.set_option("display.max_columns", 500)

In [69]:
# Load the training table from the working directory; later cells read its
# 'Area' and 'YearBin' columns.
data = pd.read_csv('training_data.csv')

In [72]:
# Distinct area names found in the training data. The name-mapping cells below
# treat this array as the set of valid target names.
countries = data['Area'].unique()

In [211]:
# Read the GDP and population-growth spreadsheets and order each one by its
# country-name column (the two sheets spell that column with different case).
gdp = pd.read_excel('countries_gdp.xlsx').sort_values(by='country')
pop = pd.read_excel('population_growth.xlsx').sort_values(by='COUNTRY')

In [23]:
# Build a lookup from each GDP-sheet country name to an 'Area' name.
# A name that already appears in `countries` maps to itself. For any other
# name the user is prompted (the prompt text is the sheet's name) until the
# stored answer is either a known area or the literal string 'nan'.
# NOTE: this cell is interactive, so its result depends on what is typed.
gdp_map = {}
for row in gdp.itertuples():
    source_name = row.country
    if source_name in countries:
        gdp_map[source_name] = source_name
        continue
    while True:
        answer = gdp_map.get(source_name)
        # Stop once a valid area (or the 'nan' sentinel) has been recorded.
        if answer in countries or answer == 'nan':
            break
        gdp_map[source_name] = input(source_name)

American Samoa[note 1]nan
BoliviaBolivia (Plurinational State of)
Cape VerdeCabo Verde
Czech RepublicCzechia
Hong Kongnan
IranIran (Islamic Republic of)
Ivory Coastnan
Kiribatinan
Kosovonan
LaosLao People's Democratic Republic
Macaunan
Marshall IslandsMarshall Islands
Marshall Islandsnan
MicronesiaMicronesia (Federated States of)
Micronesianan
Moldovanan
MontenegroMontenegro
Montenegronan
Naurunan
Palaunan
Palestinenan
Puerto Rico[note 2]Puerto Rico
Republic of CongoCongo
RussiaRussian Federation
San MarinoSan Marino
San Marinonan
SeychellesSeychelles
Seychellesnan
Solomon Islandsnan
South Korea
South KoreaRepublic of Korea
São Tomé and PríncipeSao Tome and Principe
Taiwannan
TanzaniaUnited Republic of Tanzania
The BahamasBahamas
The Bahamasnan
The GambiaGambia
Timor LesteTimor-Leste
Tonganan
Tuvalunan
United StatesUnited States of America
Vanuatunan
VenezuelaVenezuela (Bolivarian Republic of)
VietnamViet Nam


In [268]:
# Attach the harmonised area name to every GDP row; names with no entry in
# gdp_map receive the sentinel string 'nan'.
gdp['mapped'] = gdp['country'].map(lambda name: gdp_map.get(name, 'nan'))
# Drop the rows that could not be mapped, then persist the mapped table.
has_area = gdp['mapped'] != 'nan'
gdp = gdp[has_area]
gdp.to_csv('gdp_mapped.csv', index=False)

In [33]:
# Same name-matching procedure as for the GDP sheet, applied to the
# population sheet's COUNTRY column: exact matches map to themselves, and
# everything else is resolved by prompting until the stored answer is a known
# area or the literal string 'nan'.
# NOTE: this cell is interactive, so its result depends on what is typed.
pop_map = {}
for record in pop.itertuples():
    sheet_name = record.COUNTRY
    if sheet_name in countries:
        pop_map[sheet_name] = sheet_name
        continue
    while True:
        chosen = pop_map.get(sheet_name)
        # Stop once a valid area (or the 'nan' sentinel) has been recorded.
        if chosen in countries or chosen == 'nan':
            break
        pop_map[sheet_name] = input(sheet_name)

Andorranan
Bahamasnan
BoliviaBolivia (Plurinational State of)
BruneiBrunei Darussalam
BurmaMyanmar
Cape VerdeCabo Verde
Czech RepublicCzechia
East TimorTimor-Leste
IranIran (Islamic Republic of)
Ivory Coastnan
Kiribatinan
LaosLao People's Democratic Republic
Liechtensteinnan
Marshall Islandsnan
MicronesiaMicronesia (Federated States of)
MicronesianN
Micronesianan
MoldovaRepublic of Moldova
Monaconan
Montenegronan
Naurunan
North KoreaDemocratic People's Republic of Korea
Palaunan
Palestine/Gaza StripOccupied Palestinian Territory
RussiaRussian Federation
Samoanan
San MarinoSan Marino
San Marinonan
SeychellesSeychelles
Seychellesnan
Solomon Islandsnan
South KoreaRepublic of Korea
SyriaSyrian Arab Republic
São Tomé and PríncipeSao Tome and Principe
Taiwannan
TanzaniaUnited Republic of Tanzania
Tonga
Tonganan
Tuvalunan
United StatesUnited States of America
Vanuatunan
Vatican Citynan
VenezuelaVenezuela (Bolivarian Republic of)
VietnamViet Nam
Western Sahara(Sahrawi)nan


In [269]:
# Attach the harmonised area name to every population row; names with no
# entry in pop_map receive the sentinel string 'nan'.
pop['mapped'] = pop['COUNTRY'].map(lambda name: pop_map.get(name, 'nan'))
# Drop the rows that could not be mapped, then persist the mapped table.
has_area = pop['mapped'] != 'nan'
pop = pop[has_area]
pop.to_csv('pop_mapped.csv', index=False)

In [214]:
# Sanity check: compare the sizes of the two mapped rate tables with the
# number of distinct areas in the training data.
pop.shape, gdp.shape, len(countries)

((177, 3), (174, 3), 167)

In [215]:
# Re-key both rate tables by the harmonised area name so that later cells can
# fetch a rate with .at[area, column].
pop = pop.set_index('mapped')
gdp = gdp.set_index('mapped')

In [234]:
# Make the growth-rate column numeric.
# Some values arrive as text that uses the Unicode minus sign (U+2212), which
# float() rejects. The sign is replaced wherever it occurs in the string, so a
# value with leading whitespace no longer breaks the conversion. Any other
# text-typed rate is converted as well instead of being left as a str, which
# would fail later in the `rate / 100` arithmetic. pd.to_numeric raises on
# text that is not a number, so bad input surfaces here rather than downstream.
pop['POPULATION'] = pd.to_numeric(
    pop['POPULATION'].map(
        lambda x: x.strip().replace("−", "-") if isinstance(x, str) else x
    )
)

In [98]:
# Year bins to cover: every bin observed in the training data followed by the
# three projection years.
observed_years = data['YearBin'].unique()
years = np.concatenate([observed_years, [2020, 2025, 2030]])

In [238]:
def forecast(start, rate, years):
    """Project a value forward under constant compound growth.

    Parameters
    ----------
    start : starting level to grow from.
    rate : growth per period expressed in percent (3.2 means 3.2 %).
    years : number of periods to compound over.

    Returns
    -------
    ``start * (1 + rate / 100) ** years``
    """
    growth_factor = 1 + rate / 100
    return start * growth_factor ** years

In [260]:
# Build the full (Area, YearBin) grid, including the projection years, and
# left-join the observed training values onto it. Grid rows without an
# observation stay in the result with missing values.
grid = pd.DataFrame(list(product(countries, years)), columns=['Area', 'YearBin'])
observed = data.set_index(['Area', 'YearBin'])
new_data = grid.set_index(['Area', 'YearBin']).join(observed, how='left')

In [261]:
All = slice(None)  # "select everything" helper for MultiIndex .loc lookups
col_pop = 'Total population'
# For every area, compound the 2015 population forward by 5, 10 and 15 years
# using that area's growth rate from the population table, and write the
# results into the 2020, 2025 and 2030 rows.
for name, group in new_data.groupby('Area'):
    base_pop = group.at[(name, 2015), col_pop]
    growth = pop.at[name, 'POPULATION']
    for horizon in (5, 10, 15):
        new_data.at[(name, 2015 + horizon), col_pop] = forecast(base_pop, growth, horizon)

In [262]:
col_gdp = 'Gross Domestic Product (GDP)'
world_gdp = 3.2  # fallback growth rate (percent) for areas absent from the GDP table
# For every area, compound the 2015 GDP forward by 5, 10 and 15 years and
# write the results into the 2020, 2025 and 2030 rows. Areas without their
# own rate are printed and projected with the fallback rate instead.
for name, group in new_data.groupby('Area'):
    base_gdp = group.at[(name, 2015), col_gdp]
    if name not in gdp.index:
        print(name)
        growth = world_gdp
    else:
        growth = gdp.at[name, 'gdp_rate']
    for horizon in (5, 10, 15):
        new_data.at[(name, 2015 + horizon), col_gdp] = forecast(base_gdp, growth, horizon)

Bhutan
Cuba
Syrian Arab Republic


In [263]:
# Spot check: show every year bin for Cuba, one of the areas that was
# projected with the fallback GDP rate.
new_data.loc[('Cuba', All), :]

Unnamed: 0_level_0,Unnamed: 1_level_0,Total population,Cultivated area (arable land + permanent crops),Gross Domestic Product (GDP),Total renewable surface water,SDG 6.4.2. Water Stress,Total freshwater withdrawal (primary and secondary)
Area,YearBin,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Cuba,1980,9926.0,3400.0,21000000000.0,31.64,17.93,5.211
Cuba,1985,10270.0,3730.0,25200000000.0,31.64,17.93,5.211
Cuba,1990,10733.0,4075.0,22100000000.0,31.64,17.93,5.211
Cuba,1995,11014.0,4184.0,25400000000.0,31.64,17.93,5.211
Cuba,2000,11218.0,4237.0,33600000000.0,31.64,17.46,5.074
Cuba,2005,11304.0,3991.0,58600000000.0,31.64,16.99,4.937
Cuba,2010,11382.0,3555.0,75100000000.0,31.64,23.94,6.959
Cuba,2015,11485.0,3472.0,75100000000.0,31.64,23.94,6.959
Cuba,2020,11519.496371,,87910030000.0,,,
Cuba,2025,11554.096355,,102905100000.0,,,


In [264]:
# Persist the extended table; the (Area, YearBin) index is written out as the
# leading columns of the CSV.
new_data.to_csv('training_data_rate.csv')