In [491]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read the four raw indicator tables. The encoding is spelled out for every
# file; 'utf-8' is also what pandas uses when no encoding is passed.
gdp_growth, gfcf_growth, inflation, unemployment = (
    pd.read_csv(csv_path, encoding='utf-8')
    for csv_path in (
        '../Data/gdp_growth.csv',
        '../Data/gfcf_growth.csv',
        '../Data/inflation.csv',
        '../Data/unemployment.csv',
    )
)

Keep only the relevant series, drop the unneeded metadata columns, and set the index to the country name

In [492]:
def _select_series(raw, series_name):
    """Return the rows of ``raw`` for one series, indexed by country name.

    Parameters
    ----------
    raw : pandas.DataFrame
        Table with 'Series Name', 'Series Code', 'Country Code' and
        'Country Name' columns.
    series_name : str
        Exact 'Series Name' value to keep; all other rows are discarded.

    Returns
    -------
    pandas.DataFrame
        The matching rows without the three metadata columns, indexed by
        'Country Name'.
    """
    matching = raw[raw['Series Name'] == series_name]
    return matching.drop(['Series Name', 'Series Code', 'Country Code'], axis=1).set_index('Country Name')


# One call per indicator instead of four copies of the same expression.
gdp_growth = _select_series(gdp_growth, 'GDP growth (annual %)')
inflation = _select_series(inflation, 'Inflation, consumer prices (annual %)')
gfcf_growth = _select_series(gfcf_growth, 'Gross fixed capital formation (annual % growth)')
unemployment = _select_series(unemployment, 'Unemployment, total (% of total labor force) (national estimate)')

In [493]:
# The raw tables use the literal string '..' for a missing observation;
# turn that placeholder into a real NaN in all four tables.
gdp_growth, gfcf_growth, inflation, unemployment = (
    table.replace(to_replace='..', value=np.nan)
    for table in (gdp_growth, gfcf_growth, inflation, unemployment)
)

Cap the years from 1996 to 2019

In [494]:
# Drop the first four remaining columns of every table (positional slice).
# NOTE(review): presumably these are the year columns before 1996 - confirm
# against the CSV headers.
# The slice is positional, so re-running this cell removes four more columns
# each time; run it once per kernel session.
gdp_growth, gfcf_growth, inflation, unemployment = (
    table.iloc[:, 4:]
    for table in (gdp_growth, gfcf_growth, inflation, unemployment)
)

Delete every country that has at least one missing GDP growth value (presumably these are mostly small countries)

In [495]:
# Flag every country that has at least one missing GDP growth observation.
has_missing_gdp = gdp_growth.isna().any(axis=1)
print(has_missing_gdp.sum(), 'countries have missing values in gdp_growth')

# Keep only the complete rows. The mask is left in the table's own row order:
# sorting it first (as was done before) changes nothing about which rows are
# selected and only forces pandas to re-align the mask to the index.
gdp_growth = gdp_growth.loc[~has_missing_gdp]

33 countries have missing values in gdp_growth


Only keep countries in gdp_growth

In [496]:
# Restrict the other three tables to the countries still present in gdp_growth.
gdp_countries = gdp_growth.index
gfcf_growth = gfcf_growth.loc[gfcf_growth.index.isin(gdp_countries)]
inflation = inflation.loc[inflation.index.isin(gdp_countries)]
unemployment = unemployment.loc[unemployment.index.isin(gdp_countries)]

Analyse inflation
- Delete countries with more than 2 inflation values missing

In [497]:
# Count the missing inflation observations per country, report how many
# countries have more than two, and keep only the countries with at most two.
inflation_gaps = inflation.isna().sum(axis=1)
print((inflation_gaps > 2).sum(), 'countries have missing values in inflation')
inflation = inflation[inflation_gaps <= 2]

31 countries have missing values in inflation


Only keep countries in inflation

In [498]:
# Propagate the inflation filter: the other tables keep only the countries
# that survived in `inflation`.
gdp_growth, gfcf_growth, unemployment = (
    table[table.index.isin(inflation.index)]
    for table in (gdp_growth, gfcf_growth, unemployment)
)

Same for gfcf 
- Delete countries with more than 2 gfcf values missing

In [499]:
# Missing GFCF observations per country; more than two disqualifies a country.
gfcf_gap_count = gfcf_growth.isna().sum(axis=1)
print((gfcf_gap_count > 2).sum(), 'countries have more than 2 missing values in gfcf_growth')
gfcf_growth = gfcf_growth.loc[gfcf_gap_count <= 2]

# Bring the other three tables down to the same set of countries.
gfcf_countries = gfcf_growth.index
gdp_growth = gdp_growth.loc[gdp_growth.index.isin(gfcf_countries)]
inflation = inflation.loc[inflation.index.isin(gfcf_countries)]
unemployment = unemployment.loc[unemployment.index.isin(gfcf_countries)]

48 countries have more than 2 missing values in gfcf_growth


Unemployment

In [500]:
# Same tolerance as for the other indicators: at most two missing years.
unemployment_nans = unemployment.isna().sum(axis=1)
print((unemployment_nans > 2).sum(), 'countries have more than 2 NaNs in unemployment')
unemployment = unemployment[unemployment_nans <= 2]

# Align the remaining tables with the countries kept in `unemployment`.
gdp_growth, gfcf_growth, inflation = (
    table[table.index.isin(unemployment.index)]
    for table in (gdp_growth, gfcf_growth, inflation)
)

40 countries have more than 2 NaNs in unemployment


Fixing NaNs: convert all values to float first

In [501]:
# Cast every table to float so the later arithmetic (diff, mean, clip) is numeric.
# NOTE(review): the cells presumably still hold strings from the CSV at this
# point - confirm the dtypes.
gfcf_growth, gdp_growth, inflation, unemployment = (
    table.astype(float)
    for table in (gfcf_growth, gdp_growth, inflation, unemployment)
)

Change the columns 

In [502]:
# Plain year labels '1996' ... '2019' (24 of them). Assigning them fails unless
# each table has exactly that many columns.
years = list(map(str, range(1996, 2020)))
gfcf_growth.columns = gdp_growth.columns = inflation.columns = unemployment.columns = years

Create differenced variables (year-over-year changes in inflation and unemployment)

In [503]:
# Fill in missing values with the previous year's value (forward fill along
# each country's row). DataFrame.ffill replaces fillna(method='ffill'), whose
# `method` argument is deprecated in recent pandas releases.
# A gap in the very first year has no earlier value to copy and stays NaN.
inflation = inflation.ffill(axis=1)
unemployment = unemployment.ffill(axis=1)

# Year-over-year change within each country; the first year column is NaN
# because there is nothing to subtract from it.
inflation_growth = inflation.diff(axis=1)
unemployment_growth = unemployment.diff(axis=1)

# Cap values of inflation growth
inflation_growth = inflation_growth.clip(-100, 250)

# Remove nan (reassigned rather than modified in place, so re-running is safe)
# NOTE(review): only the unemployment changes are zero-filled; NaNs left in
# inflation_growth are kept as they are - confirm this asymmetry is intended.
unemployment_growth = unemployment_growth.fillna(0)

Fill in missing values 

In [504]:
# The first year has no previous value to difference against, so its change
# column carries no information. Drop it by label (the first column of the
# level tables) rather than by position, so that re-running this cell does not
# strip a further year each time.
inflation_growth = inflation_growth.drop(columns=inflation.columns[0], errors='ignore')
unemployment_growth = unemployment_growth.drop(columns=unemployment.columns[0], errors='ignore')

# Fill each missing GFCF value with that year's average across all remaining
# countries: mean(axis=0) is taken down each year column, and fillna matches
# the resulting Series to the columns by label.
gfcf_growth = gfcf_growth.fillna(gfcf_growth.mean(axis=0))

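- GDP growth doesn't need to be differenced, as it is already a rate of change
- Inflation doesn't need to be differenced either, as it is already a rate of change (note: the merged table below nevertheless uses `inflation_growth`, the year-over-year change in inflation)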

In [505]:
def _wide_to_long(wide, value_name):
    """Reshape a country-by-year table into one row per (country, year).

    Parameters
    ----------
    wide : pandas.DataFrame
        Table indexed by country name with one column per year label.
    value_name : str
        Name given to the single value column of the result.

    Returns
    -------
    pandas.DataFrame
        One-column frame indexed by ('Country Name', 'Year'), where Year is
        the year label converted to int.
    """
    tidy = wide.transpose().reset_index().melt(
        id_vars='index', var_name='Country Name', value_name=value_name
    )
    # After the transpose, the year labels sit in the 'index' column as strings.
    tidy['Year'] = tidy['index'].astype(int)
    return tidy.drop('index', axis=1).set_index(['Country Name', 'Year'])


# Change the index so that every country appears once per year
melt_gdpgr = _wide_to_long(gdp_growth, 'GDP Growth')
melt_gfcf = _wide_to_long(gfcf_growth, 'GFCF')
melt_inflation = _wide_to_long(inflation_growth, 'Inflation')
melt_unemployment = _wide_to_long(unemployment_growth, 'Unemployment')

# Merge all the dataframes. pd.merge defaults to an inner join, so a
# (country, year) pair survives only if it is present in all four tables.
merge_keys = ['Country Name', 'Year']
df = pd.merge(melt_gdpgr, melt_gfcf, on=merge_keys)
df = pd.merge(df, melt_inflation, on=merge_keys)
df = pd.merge(df, melt_unemployment, on=merge_keys)

df.columns = ['GDP Growth', 'GFCF change', 'Inflation change', 'Unemployment change']

In [506]:
# Persist the merged panel; the index is written as well (pandas default).
df.to_csv(path_or_buf='../Data/economic_indicators.csv')