# Wellbeing Dashboard Python
## A - Merging Data
This Jupyter Notebook takes data from seven different data sets and merges it together for further imputation

In [None]:
# Doing all major library imports
import matplotlib.pyplot as plt
import scikitplot as skplt
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as stats
import re

from sklearn import datasets, metrics
from sklearn.linear_model import LinearRegression, LogisticRegression,LogisticRegressionCV 
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, KFold
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score
from matplotlib.colors import ListedColormap
from sklearn.pipeline import Pipeline, make_pipeline

# Apply matplotlib's built-in 'fivethirtyeight' style sheet to all figures
plt.style.use('fivethirtyeight')

# IPython magics: draw figures inline in the cell output and request the
# 'retina' (high-resolution) figure format from the inline backend
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import scikitplot as skplt
from matplotlib.colors import ListedColormap
from sklearn.metrics import classification_report, confusion_matrix

# Display settings: show up to 500 rows / 500 columns before truncating,
# and render floats with two decimal places
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.set_option('display.float_format', '{:.2f}'.format)

# World Bank has its own data wrapper, so we leverage that
import wbdata as wb
import requests

**Process & Structure:**

1) All data was collected from various sources (7 - see 'raw_data' folder).

2) Each dataset was cleaned in Python or Excel and converted to a CSV.

3) The CSVs were merged in this notebook on Country Code (`country_code`) and Date (`date`) to create a master dataset for modeling purposes.


In [None]:
# Load the cleaned UNDP extract; this frame is the left-hand base of the first merge
undp = pd.read_csv('../raw_data/undp_dataset/undp_output.csv')
print(undp.shape)
undp.head()

In [None]:
# Load the cleaned World Bank extract that will be joined onto the UNDP data
worldbank = pd.read_csv('../raw_data/wb_dataset/wb_output.csv')
print(worldbank.shape)
worldbank.head()

In [None]:
# Left join: keep every UNDP row and attach World Bank columns where
# country_code and date both match
master = undp.merge(worldbank, how='left', on=['country_code', 'date'])
print(master.shape)
master.head()

In [None]:
# Load the cleaned IMF (2019) extract
imf = pd.read_csv('../raw_data/imf_dataset/imf_2019_output.csv')
print(imf.shape)
imf.head()

In [None]:
# Left join the IMF columns onto master by country_code and date
master = master.merge(imf, how='left', on=['country_code', 'date'])
print(master.shape)
master.head()

In [None]:
# Load the High Income / Low Income classification data
hili = pd.read_csv(
    '../raw_data/high_income_low_income_dataset/high_inc_low_inc_class_output.csv'
)
print(hili.shape)
hili.head()

In [None]:
# Left join the income-classification columns onto master by country_code and date
master = master.merge(hili, how='left', on=['country_code', 'date'])
print(master.shape)
master.head()

In [None]:
# Load the IMF Export Quality data set
eq = pd.read_csv('../raw_data/imf_dataset/eq_imf_output.csv')
print(eq.shape)
eq.head()

In [None]:
# Left join the export-quality columns onto master by country_code and date
master = master.merge(eq, how='left', on=['country_code', 'date'])
print(master.shape)
master.head()

In [None]:
# Load the IMF Government Revenue data set
gr = pd.read_csv('../raw_data/imf_dataset/gov_rev_imf_output.csv')
print(gr.shape)
gr.head()

In [None]:
# Left join the government-revenue columns onto master by country_code and date
master = master.merge(gr, how='left', on=['country_code', 'date'])
print(master.shape)
master.head()

In [None]:
# Load the Global Hunger Index data set
gh = pd.read_csv('../raw_data/hunger_dataset/hunger_index_output.csv')
print(gh.shape)
gh.head()

In [None]:
# Left join the hunger-index columns onto master by country_code and date
master = master.merge(gh, how='left', on=['country_code', 'date'])
print(master.shape)
master.head()

In [None]:
# Load the Corruption Index data set
cpi = pd.read_csv('../raw_data/corruption_dataset/corruption_index_output.csv')
print(cpi.shape)
cpi.head()

In [None]:
# Left join the corruption-index columns onto master by country_code and date
master = master.merge(cpi, how='left', on=['country_code', 'date'])
print(master.shape)
master.head()

In [None]:
# Load the compiled UN data set
un = pd.read_csv('../raw_data/un_dataset/un_compiled_output.csv')
print(un.shape)
un.head()

In [None]:
# Left join the compiled UN columns onto master by country_code and date
master = master.merge(un, how='left', on=['country_code', 'date'])
print(master.shape)
master.head()

In [None]:
# Treat every literal 0 as a missing value, then count missing entries per column
# NOTE(review): this also blanks out any genuine zero measurements — confirm that is intended
master = master.replace(0, np.nan)
master.isna().sum()

In [None]:
# Collect the names of every column after the first three as the 'predictors'
# NOTE(review): assumes the first three columns are the identifier columns — confirm
predictors = master.columns[3:].tolist()
predictors[:10]

In [None]:
# Put the predictor names in sorted order so the list is easier to scan
predictors = sorted(predictors)
predictors[:10]

In [None]:
# Collapse repeated predictor names ('nod' = no duplicates) and return them in sorted order
predictors_nod = sorted(set(predictors))
predictors_nod[:10]

In [None]:
# Re-select the columns so the three identifier columns come first,
# followed by the de-duplicated predictor names
# NOTE(review): if 'country_x' ever appears in predictors_nod it would be selected twice — confirm
leading_cols = ['country_code', 'country_x', 'date']
new_cols = leading_cols + predictors_nod
master = master[new_cols]
master.head()

In [None]:
# Remove the superfluous country name / code columns.
# Each label is listed once; drop() removes every column carrying that label.
# NOTE(review): the _x/_y names are presumably suffix artefacts of the merges above;
# repeated suffixed labels are rejected by newer pandas releases — confirm the pandas version in use
redundant_cols = [
    'country',
    'country_y',
    'WEO Country Code',
    'Country Name_x',
    'Country Name_y',
    'Country_x',
    'Country_y',
]
master = master.drop(columns=redundant_cols)

print(master.shape)
master.head()

In [None]:
# Give the retained country-name column its final label.
# Renaming by label (rather than overwriting the whole header list by position)
# cannot mislabel a different column if the column order ever changes.
master = master.rename(columns={'country_x': 'country'})
print(master.shape)
master.head()

In [None]:
# Write the merged master frame to CSV (without the index) for the cleaning and modeling steps
master.to_csv('../raw_data/poverty_data.csv', index=False)