# Wellbeing Dashboard Python
## B - Imputing Missing Values
This Jupyter Notebook imputes "N/A" or missing values using scikit-learn's experimental `IterativeImputer`. Make sure your scikit-learn installation is up to date: the imputer has to be enabled through `sklearn.experimental` before it can be imported.

In [None]:
# Doing all major library imports
import matplotlib.pyplot as plt
import scikitplot as skplt
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as stats
import re

from sklearn import datasets, metrics
from sklearn.linear_model import LinearRegression, LogisticRegression,LogisticRegressionCV 
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, KFold
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score
from matplotlib.colors import ListedColormap
from sklearn.pipeline import Pipeline, make_pipeline

plt.style.use('fivethirtyeight')

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import scikitplot as skplt
from matplotlib.colors import ListedColormap
from sklearn.metrics import classification_report, confusion_matrix

# Widen pandas' display limits so long/wide frames are shown in full, and
# render floats with four decimal places.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.set_option('display.float_format', '{:.4f}'.format)

In [None]:
# Importing data that was merged in the previous step (A - Merging Data).
# The path is relative to the notebook's working directory.
master = pd.read_csv ('../raw_data/poverty_data.csv')
print (master.shape)  # (rows, columns) of the merged dataset
master.head()

In [None]:
# One-column frame ('dtype') holding the pandas dtype of every column in master
dtypes = master.dtypes.to_frame(name='dtype')
dtypes.head()

In [None]:
# Making sure all the predictors are float: show how many columns still have
# the `object` dtype, then list those columns
print (dtypes[dtypes.dtype == object].shape)
dtypes[dtypes.dtype == object]

In [None]:
# Preview the first rows of the merged data
master.head()

In [None]:
# Initiating Iterative Imputer (docs: https://scikit-learn.org/stable/modules/generated/sklearn.impute.IterativeImputer.html#sklearn.impute.IterativeImputer)
# First need to import this experimental feature
from sklearn.experimental import enable_iterative_imputer  # Make sure your Sklearn is updated - https://stackoverflow.com/questions/58332191/modulenotfounderror-no-module-named-sklearn-experimental
# Now you can import normally from sklearn.impute
from sklearn.impute import IterativeImputer, SimpleImputer

In [None]:
# Total number of non-null values each country has across all columns,
# shown in ascending order so the sparsest countries come first
vals_per_country = master.groupby('country_code').count().sum(axis=1).to_frame()
vals_per_country.sort_values(by=0)

In [None]:
# Only including countries that have at least 15000 non-null values in total
countries_to_include = vals_per_country.loc[vals_per_country[0] >= 15000].index
print(len(countries_to_include))
m2 = master[master['country_code'].isin(countries_to_include)]
print(len(m2['country_code'].unique()))

In [None]:
# Seeing before and after country exclusion
print (master.shape)  # every country
print (m2.shape)  # only rows whose country_code is in countries_to_include

In [None]:
# For every column, the average (over the retained countries) number of
# non-null values a country has; sorted ascending so the sparsest columns
# are listed first
avg_vals_per_country = m2.groupby('country_code').count().mean().to_frame()
avg_vals_per_country.sort_values(by=0).head(500)

In [None]:
# Keeping only columns that average at least 5 non-null values per country.
# NOTE(review): [2:] skips the first two surviving columns - presumably the
# remaining non-indicator identifier columns; verify against master's layout.
predictors_to_include = avg_vals_per_country.loc[avg_vals_per_country[0] >= 5.0].index[2:]
predictors_to_include = pd.DataFrame({'ind': predictors_to_include})

In [None]:
# Dropping every indicator whose name mentions "LCU" (local currency units)
predictors_to_include = predictors_to_include.loc[
    ~predictors_to_include['ind'].str.contains('LCU')
]
predictors_to_include

In [None]:
# Hand-picked indicators that are also excluded from the predictor list
remove_superfluous_cols = [
    'Agriculture, forestry, and fishing, value added (current US$)',
    'Changes in inventories (current US$)',
    'Changes in stocks (petajoules)',
    'Consumer price index: General',
    'Current health expenditure (% of GDP)_x',
    'Current health expenditure (% of GDP)_y',
    'Foreign direct investment, net inflows (% of GDP)_x',
    'GDP (current US$)',
]
predictors_to_include = predictors_to_include.loc[
    ~predictors_to_include['ind'].isin(remove_superfluous_cols)
]
predictors_to_include['ind'].tolist()

In [None]:
# Column list for imputation: master's first three columns followed by the
# retained predictors
new_columns = [*master.columns[:3], *predictors_to_include['ind']]

In [None]:
# Restrict the filtered data to the selected columns and renumber the rows
# from zero
m3 = m2[new_columns].reset_index(drop=True)
m3.head()

In [None]:
# Finally Populating / Imputing Nulls!
# Step 1 is to create all the functions that will be needed to run the imputer.
# We will be extracting each country and imputing values for each country based on real values that are available for...
# ...that country

def extractor(df, country_code):
    """Extract the imputable predictor columns for a single country.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``country_code`` column. Its first three columns are
        dropped from the extract, so only columns from position 3 onwards
        are candidates for imputation.
    country_code
        Value of ``country_code`` selecting the rows to extract.

    Returns
    -------
    pandas.DataFrame
        The country's rows (index renumbered from 0) restricted to the
        candidate columns that hold at least 5 non-null values. ``df`` is
        left untouched.
    """
    extract = df[df.country_code == country_code].reset_index(drop=True)
    extract = extract.iloc[:, 3:]
    # Pass only `thresh`: combining it with `how` raises a TypeError on
    # pandas >= 2.0, and `how` was ignored whenever `thresh` was supplied.
    return extract.dropna(axis=1, thresh=5)

def imputer(c_extract, imputer_engine):
    """Run the imputer on one country's extract and return the filled frame.

    ``imputer_engine`` only needs a ``fit_transform`` method. Its output is
    wrapped in a DataFrame that carries the column labels of ``c_extract``.
    """
    filled = pd.DataFrame(imputer_engine.fit_transform(c_extract))
    return filled.set_axis(c_extract.columns, axis=1)
    
def replacer(df, country_code, imputed):
    """Return one country's complete rows with the imputed columns written in.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``country_code`` column; every one of its columns is
        kept in the result.
    country_code
        Value of ``country_code`` selecting the rows to return.
    imputed : pandas.DataFrame
        Imputed values for that country, indexed 0..n-1 in the same row
        order as the country's rows in ``df``.

    Returns
    -------
    pandas.DataFrame
        The country's rows (index renumbered from 0) where each column
        present in ``imputed`` has been replaced by the imputed values.
    """
    # reset_index() hands back an independent frame, so the assignments below
    # never write into a slice of `df` (no SettingWithCopyWarning, and the
    # caller's data cannot be modified).
    extract_all_columns = df[df.country_code == country_code].reset_index(drop=True)
    for col in imputed:
        extract_all_columns[col] = imputed[col].copy()
    return extract_all_columns

from tqdm import tqdm_notebook
def master_imputer(df, imputer_engine):
    """Impute every country separately and return one combined dataset.

    For each distinct ``country_code`` in ``df`` the country's predictor
    columns are extracted, imputed with ``imputer_engine`` and written back
    into that country's full set of rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``country_code`` column.
    imputer_engine
        Object exposing ``fit_transform``; it is re-fitted for each country.

    Returns
    -------
    pandas.DataFrame
        All countries stacked in order of first appearance, with the index
        renumbered from 0. An empty frame when ``df`` has no countries.
    """
    per_country = []

    for country_code in tqdm_notebook(df.country_code.unique()):
        c_extract = extractor(df, country_code)
        imputed = imputer(c_extract, imputer_engine)
        per_country.append(replacer(df, country_code, imputed))

    if not per_country:
        # pd.concat refuses an empty list; keep returning an empty frame
        return pd.DataFrame()

    # One concat at the end instead of one per country: avoids re-copying the
    # accumulated rows on every iteration and avoids concatenating with an
    # empty seed frame.
    return pd.concat(per_country, axis=0, ignore_index=True)

In [None]:
# Imputing all nulls

# The model that we would be using for imputation.
# LassoCV no longer accepts `normalize` (removed in scikit-learn 1.2), so the
# features are scaled explicitly in a pipeline instead, as scikit-learn's
# deprecation notice recommends. The number of alphas is left at its default
# of 100, which is what was previously passed explicitly.
model = make_pipeline(StandardScaler(with_mean=False), LassoCV(cv=3))

# The iterator we would be using with the relevant parameters.
# NOTE(review): no random_state is set; with n_nearest_features in use the
# results may differ between runs - confirm whether reproducibility matters.
ii = IterativeImputer(estimator=model, tol=1e-11, n_nearest_features=100)

# Calling the master_imputer function created earlier
working = master_imputer(m3, ii)

# Analyzing our results: overall shape and remaining nulls per column
print(working.shape)
print(working.isnull().sum())

In [None]:
# Summary statistics of the imputed dataset
working.describe()

In [None]:
# Exporting imputed file (index=False keeps the row index out of the CSV)
working.to_csv('../raw_data/iterated.csv', index=False)

# Exporting original, un-imputed file (with same columns) for comparison
m3.to_csv ('../raw_data/wo_iterated.csv', index=False)

In [None]:
# Inspect the first 500 rows of the imputed dataset
working.head(500)