In [56]:
import numpy as np
import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### OECD
- "Value added in financial and insurance activities": "DV_VA"
- "General government net lending": "IV_lending"
- "Real effective exchange rates": "C_REER"
- "CPI: all items": "C_cpi"

- "Gross domestic product (output approach)": "C_gdp"

### WB
- "Foreign direct investment, net outflows (% of GDP)": "IV_fdi_outflow"
- "Net trade in goods and services (BoP, current US$)": "IV_trade_balance"
- "Broad money (% of GDP)": "C_M2"

### GDD
- "nfc_ls": "DV_nfc_ls"
- "hh_ls": "DV_hh_ls"

### Historical Puclic Finance 
- "exp": "IV_gov_exp"

In [57]:
# import OECD.csv and select features
oecd = pd.read_csv('data_source/OECD.csv')

oecd = oecd.loc[((oecd['Subject']=='Value added in financial and insurance activities')
                |(oecd['Subject']=='General government net lending')
                |oecd['Subject']=='Real effective exchange rates')
                |(oecd['Subject']=='CPI: all items')]

oecd = oecd[['Subject','Country','Year','Value']]

In [58]:
# import gdp.csv/OECD
gdp = pd.read_csv('data_source/gdp.csv')
gdp = gdp.loc[gdp['MEASURE']=='VXVOB']
gdp = gdp[['Transaction','Country','Year','Value']]
gdp = gdp.rename(columns={'Transaction':'Subject'})

In [59]:
def wb_clean(df):
    # drop unnecessary columns
    df = df.drop(['Unnamed: 0', 'Country Code', 'Indicator Code', 'V65'], axis=1)
    # change from wide to long format
    df = df.pivot_table(index='Country Name', columns='Indicator Name').unstack().reset_index()
    # rename columns
    df = df.rename(columns={'level_0':'Year', 'Indicator Name':'Subject',
                                       'Country Name':'Country', 0:'Value'})
    convert_dict = {'Year': int, 
                'Value': float
               } 
  
    df = df.astype(convert_dict)
    df['Country'] = df['Country'].str.replace('Russian Federation', 'Russia')
    df['Country'] = df['Country'].str.replace('Korea, Rep.', 'Korea')
    df['Country'] = df['Country'].str.replace('China', "China (People's Republic of)")
    
    return df

In [60]:
fdi_outflow = pd.read_csv('data_source/outflow.csv')
fdi_outflow = wb_clean(fdi_outflow)

trade_balance = pd.read_csv('data_source/netgoodservices.csv')
trade_balance = wb_clean(trade_balance)

m2 = pd.read_csv('data_source/broad_money.csv')
m2 = wb_clean(m2)

  df['Country'] = df['Country'].str.replace('Korea, Rep.', 'Korea')
  df['Country'] = df['Country'].str.replace('Korea, Rep.', 'Korea')
  df['Country'] = df['Country'].str.replace('Korea, Rep.', 'Korea')


In [61]:
oecd = oecd.append([gdp,fdi_outflow, trade_balance, m2])
oecd = oecd.sort_values(by=['Country', 'Subject', 'Year'])
oecd = oecd.pivot_table(values='Value', index=['Country', 'Year'], columns=['Subject']).reset_index()

In [62]:
gdd = pd.read_csv('data_source/global_debt_database.csv')
gdd = gdd[['country', 'year', 'nfc_ls', 'hh_ls']]

def gdd_clean(df):
    # rename columns
    df = df.rename(columns={'year':'Year', 'country':'Country'})
    # change country name
    df['Country'] = df['Country'].str.replace('Russian Federation', 'Russia')
    df['Country'] = df['Country'].str.replace('Korea, Republic of', 'Korea')
    df['Country'] = df['Country'].str.replace('China, Mainland', "China (People's Republic of)")
    
    # replace 0 with NAs
    df.replace(0, np.nan, inplace=True)
    
    return df
gdd = gdd_clean(gdd)
oecd = pd.merge(oecd, gdd, on=['Country', 'Year'], how='outer')

In [63]:
public_finance = pd.read_csv('data_source/historical_public_finance_dataset.csv')
public_finance = public_finance[['country', 'year', 'exp']]

# clean public_finance dataframe
def public_finance_clean(df):
    # rename columns
    df = df.rename(columns={'year':'Year', 'country':'Country'})
    
    # change country name
    df['Country'] = df['Country'].str.replace('Russian Federation', 'Russia')
    df['Country'] = df['Country'].str.replace('South Korea', 'Korea')
    df['Country'] = df['Country'].str.replace('China', "China (People's Republic of)")
    
    # replace 0 with NAs
    df.replace(0, np.nan, inplace=True)
    
    return df
public_finance = public_finance_clean(public_finance)
oecd = pd.merge(oecd, public_finance, on=['Country', 'Year'], how='outer')

In [64]:
gini = pd.read_csv('data_source/gini.csv')
gini = gini[['V1', 'V4', 'V5']]

def gini_clean(df):
    # rename columns
    df = df.rename(columns={'V4':'Year', 'V1':'Country', 'V5':'gini'})
    
    # change country name
    df['Country'] = df['Country'].str.replace('South Korea', 'Korea')
    df['Country'] = df['Country'].str.replace('China', "China (People's Republic of)")
    
    # replace 0 with NAs
    df.replace(0, np.nan, inplace=True)
    
    return df
gini = gini_clean(gini)
oecd = pd.merge(oecd, gini, on=['Country', 'Year'], how='outer')

In [65]:
oecd = oecd.rename(columns={
    "Value added in financial and insurance activities": "DV_VA",
    "nfc_ls": "DV_nfc_ls",
    "hh_ls": "DV_hh_ls",
    "General government net lending": "IV_lending",
    "exp": "IV_gov_exp",
    "gini": "IV_gini",
    "Net trade in goods and services (BoP, current US$)": "IV_trade_balance",
    "Foreign direct investment, net outflows (% of GDP)": "IV_fdi_outflow",
    "Real effective exchange rates": "C_REER",
    "CPI: all items": "C_cpi",
    "Gross domestic product (output approach)": "C_gdp",
    "Broad money (% of GDP)": "C_M2"
})

In [66]:
oecd.head()

Unnamed: 0,Country,Year,C_M2,C_cpi,IV_fdi_outflow,C_gdp,IV_trade_balance,DV_nfc_ls,DV_hh_ls,IV_gov_exp,IV_gini
0,Afghanistan,1960,13.450412,,,,,,,,
1,Afghanistan,1961,14.534413,,,,,,,,
2,Afghanistan,1962,17.195122,,,,,,,,
3,Afghanistan,1963,14.464496,,,,,,,,
4,Afghanistan,1964,16.688888,,,,,,,,


In [68]:
country_ls = ['Australia', 'Brazil', 'Canada', 'Chile', "China (People's Republic of)", 
            'Denmark', 'Hungary', 'Iceland', 'Israel', 'Japan', 'Korea', 'Mexico', 
            'New Zealand', 'Norway', 'Poland', 'South Africa', 'Sweden', 'Switzerland', 
            'Turkey', 'United Kingdom', 'United States']
oecd = oecd[oecd.Country.isin(country_ls)] 

In [70]:
oecd.describe()

Unnamed: 0,Year,C_M2,C_cpi,IV_fdi_outflow,C_gdp,IV_trade_balance,DV_nfc_ls,DV_hh_ls,IV_gov_exp,IV_gini
count,4618.0,1135.0,1167.0,966.0,916.0,899.0,776.0,792.0,2414.0,801.0
mean,1909.452577,65.124371,21.40561,1.79365,1347113.0,-6763908000.0,75.465373,49.774606,19.904186,0.474926
std,63.487593,42.857235,136.614265,4.9726,2833417.0,89696220000.0,47.51037,30.654107,15.092522,0.119841
min,1800.0,8.603164,-3.481539,-43.510564,9624.076,-761715000000.0,13.067228,0.084322,0.684444,0.123687
25%,1854.25,36.316614,2.143572,0.237921,172617.6,-3547675000.0,47.252332,27.634867,7.417718,0.392839
50%,1909.0,54.220514,4.271796,0.879725,359165.5,313742800.0,71.990357,44.733832,15.56845,0.445601
75%,1964.0,77.539311,9.664251,2.163012,1085692.0,8476086000.0,93.801311,69.540886,30.45795,0.549357
max,2019.0,252.081567,2947.733,52.308685,18915090.0,357870800000.0,569.084027,139.426459,71.8412,0.73863
