# Primary WMDA Data Imputation

In [46]:
import pandas as pd
import numpy as np

#%matplotlib notebook to make interactive within same window as JN
#%matplotlib
#%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.lines import Line2D

#SciKit Learn
from sklearn import svm
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer

from sklearn import preprocessing
from sklearn.decomposition import PCA

#Plotly
import plotly.express as px
import plotly.graph_objects as go

import seaborn as sns
import scipy as sci
import statsmodels as stats

from impyute.imputation.cs import mice
from collections import Counter
import warnings  


## Load Data

In [47]:
# Read the raw export; the file marks missing observations with '..'.
WorldData = pd.read_csv("WorldMarketData.csv", na_values='..')
df_original = pd.DataFrame(WorldData)
pd.options.display.float_format = '{:,.2f}'.format

# Rename the columns: four identifier columns, then one column per year 1994-2018.
id_columns = ['country', 'country_code', 'series', 'series_code']
year_columns = [str(yr) for yr in range(1994, 2019)]
df_original.columns = id_columns + year_columns

# Discard the code columns and every row from position 3800 onward.
df = df_original.drop(columns=['country_code', 'series_code'])
df = df.drop(index=df.index[3800:])

# Wide -> long: one row per (country, series, year).
df1 = df.melt(id_vars=['country', 'series'], var_name='year', value_name='value')

# Remove the 'World' and income-group rows, then restore a flat index.
aggregate_rows = ['World', 'Low income', 'Middle income', 'High income']
df1 = df1.set_index('country').drop(index=aggregate_rows).reset_index()

# Year labels arrive as strings (former column names); store them as integers.
df1['year'] = df1['year'].astype(int)
df1.head()

Unnamed: 0,country,series,year,value
0,United States,Agricultural raw materials exports (% of merch...,1994,3.49
1,United States,Agricultural raw materials imports (% of merch...,1994,2.04
2,United States,Average time to clear exports through customs ...,1994,
3,United States,Bribery incidence (% of firms experiencing at ...,1994,
4,United States,Changes in inventories (current US$),1994,63785000000.0


In [48]:
# Suppress all warnings for the remainder of the session.
warnings.filterwarnings('ignore')

# Index by (series, country); df_stack_all is an alias of the same frame.
df_stack = df1.set_index(['series', 'country'])
df_stack_all = df_stack

# Long -> wide: one row per (country, year), one column per series.
dfsa = df_stack_all.reset_index()
dfsa = dfsa.pivot_table(index=['country', 'year'], columns='series', values='value', dropna=False)
dfsa = dfsa.reset_index()

# Keep only the columns that have at least len(dfsa) - 1000 non-null entries.
dfsa = dfsa.dropna(thresh=len(dfsa) - 1000, axis=1)

# 2014 cross-section, indexed by country.
# 'year' is stored as an integer (cast when df1 was built), so comparing it
# with the string '2014' never matches and silently produced an empty frame.
# Casting before the comparison works for both int and numeric-string years.
dfsa14 = dfsa[dfsa['year'].astype(int) == 2014]
dfsa14 = dfsa14.drop(columns=['year']).set_index('country')

#dfsa14.head()
dfsa.head()

series,country,year,Agricultural raw materials exports (% of merchandise exports),Agricultural raw materials imports (% of merchandise imports),Changes in inventories (current US$),Commercial service exports (current US$),Commercial service imports (current US$),"Computer, communications and other services (% of commercial service exports)","Computer, communications and other services (% of commercial service imports)",Cost of business start-up procedures (% of GNI per capita),...,"Tariff rate, applied, simple mean, all products (%)","Tariff rate, most favored nation, weighted mean, all products (%)",Technicians in R&D (per million people),Time required to build a warehouse (days),Time required to start a business (days),"Time to export, border compliance (hours)","Time to export, documentary compliance (hours)","Time to import, border compliance (hours)","Time to import, documentary compliance (hours)",Trade (% of GDP)
0,Argentina,1994,3.36,1.62,68602700.0,3180600000.0,6856100000.0,10.34,15.34,,...,,,,,,,,,,18.13
1,Argentina,1995,4.32,2.03,1451852800.0,3664500000.0,6961200000.0,11.16,15.46,,...,12.7,11.41,,,,,,,,19.77
2,Argentina,1996,3.81,1.91,4195708700.0,4239554642.49,7623604066.22,12.05,15.61,,...,14.45,13.4,,,,,,,,21.51
3,Argentina,1997,2.84,1.55,4263816100.0,4430771471.36,8675186708.79,10.91,15.94,,...,14.44,13.93,159.12,,,,,,,23.34
4,Argentina,1998,2.11,1.5,3096528400.0,4694769562.61,9047117536.8,10.48,16.9,,...,16.73,15.67,169.89,,,,,,,23.35


Note the current number of NaN values

In [49]:
# Total number of missing cells across the whole wide frame
dfsa.isna().sum().sum()

27140

>Reshape the dataframe and filter out every series for which at least one country has no observed values at all, so that each remaining country-series pair has at least one data point

In [50]:
# Reshape: long form first, then one row per (country, series) with one column per year
df2 = dfsa.melt(id_vars=['country', 'year'], var_name='series', value_name='value')
df2 = df2.pivot_table(index=['country', 'series'], columns='year', values='value', dropna=False)

# Filter: count the non-null years in every (country, series) row, then keep a
# series only when each of its rows has at least one observation
m = (
    df2.notna()
    .sum(axis=1)
    .groupby(level=1)
    .transform(lambda counts: counts.ge(1).all())
)
df2 = df2[m]

# Reset shape: back to long form, one row per (country, series, year)
df2 = df2.reset_index().melt(id_vars=['country', 'series'], var_name='year', value_name='value')
df2.head(5)

Unnamed: 0,country,series,year,value
0,Argentina,Cost of business start-up procedures (% of GNI...,1994,
1,Argentina,"Cost to export, border compliance (US$)",1994,
2,Argentina,"Cost to export, documentary compliance (US$)",1994,
3,Argentina,"Cost to import, border compliance (US$)",1994,
4,Argentina,"Cost to import, documentary compliance (US$)",1994,


In [51]:
# Summarise the long-form frame: column dtypes and non-null counts
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54050 entries, 0 to 54049
Data columns (total 4 columns):
country    54050 non-null object
series     54050 non-null object
year       54050 non-null object
value      33596 non-null float64
dtypes: float64(1), object(3)
memory usage: 1.6+ MB


In [72]:
# Tally the rows per series; every series should show the same count.
# The counts become the index and the series names the single data column.
series_tally = Counter(df2.series)
series_count = pd.DataFrame(series_tally.keys(), series_tally.values())
series_count

Unnamed: 0,0
1150,Cost of business start-up procedures (% of GNI...
1150,"Cost to export, border compliance (US$)"
1150,"Cost to export, documentary compliance (US$)"
1150,"Cost to import, border compliance (US$)"
1150,"Cost to import, documentary compliance (US$)"
1150,Export value index (2000 = 100)
1150,Exports of goods and services (% of GDP)
1150,Exports of goods and services (current US$)
1150,External balance on goods and services (curren...
1150,GDP (current US$)


In [73]:
# Tally the rows per country; every country should show the same count.
# The counts become the index and the country names the single data column.
country_tally = Counter(df2.country)
country_count = pd.DataFrame(country_tally.keys(), country_tally.values())
country_count

Unnamed: 0,0
1175,Argentina
1175,Australia
1175,Bahrain
1175,Belgium
1175,Brazil
1175,Canada
1175,Chile
1175,China
1175,Colombia
1175,"Congo, Dem. Rep."


## Notes on the Imputation Loop:
> - The loop takes several minutes to complete
> - Multivariate Imputation by Chained Equations (MICE) is the imputation method used
> - To reduce compute cost and run time, store the values in a `dict` and then form the df once at the end
>> - I felt that the `append` method was more visually intuitive for the average reader
> - The imputation is done via the grouping method so as to take advantage of the time series trend, ***with respect to each country and series***
>> - This ensures that the values are more realistic

In [54]:
# Set 'year' to float so that `mice` receives an all-float matrix
df2['year'] = df2['year'].astype(float)

# Group df2 by country-series pairs
dfgrp = df2.groupby(['country', 'series'])

# Warnings can be silenced with warnings.filterwarnings('ignore') and restored
# with warnings.resetwarnings() if that has not been done earlier.

# Collect one finished frame per group and concatenate once after the loop.
# Growing a DataFrame with DataFrame.append inside the loop re-copies every
# accumulated row on each iteration, and that method was removed in pandas 2.0.
imputed_groups = []

for dfgrp_index, group in dfgrp:
    values = group['value'].values

    # Only groups with at least one missing value are passed to MICE
    if group['value'].isnull().any():
        # Impute on the two-column [year, value] matrix and keep the value column.
        # A plain float ndarray goes in, so the result is indexed positionally.
        year_value = group[['year', 'value']].values.astype(float)
        values = np.asarray(mice(year_value))[:, 1]

    # Build a fresh frame rather than assigning into the groupby slice
    imputed_groups.append(pd.DataFrame({
        'country': group['country'].values,
        'series': group['series'].values,
        'year': group['year'].values,
        'value': values,
    }))

# ignore_index=True yields a clean 0..n-1 index, so no reset_index/drop is needed
if imputed_groups:
    ImpData = pd.concat(imputed_groups, ignore_index=True)
else:
    # No groups at all: keep the expected schema instead of failing in concat
    ImpData = pd.DataFrame(columns=['country', 'series', 'year', 'value'])

# Change years back to int from float for readability
ImpData['year'] = ImpData['year'].astype(int)
# Display the total null values within the dataframe (should be 0)
ImpData.isnull().sum().sum()

0

In [55]:
# Summarise the imputed frame: column dtypes and non-null counts
ImpData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54050 entries, 0 to 54049
Data columns (total 4 columns):
country    54050 non-null object
series     54050 non-null object
year       54050 non-null int64
value      54050 non-null float64
dtypes: float64(1), int64(1), object(2)
memory usage: 1.6+ MB


In [56]:
# Rows per country after imputation (counts as the index, country names as the data column)
imputed_country_tally = Counter(ImpData.country)
pd.DataFrame(imputed_country_tally.keys(), imputed_country_tally.values())

Unnamed: 0,0
1175,Argentina
1175,Australia
1175,Bahrain
1175,Belgium
1175,Brazil
1175,Canada
1175,Chile
1175,China
1175,Colombia
1175,"Congo, Dem. Rep."


Copy ImpData to a new dataframe for easier accessibility

In [57]:
# Work on an independent deep copy so ImpData itself stays untouched
imp_df = ImpData.copy(deep=True)
imp_df.head(5)

Unnamed: 0,country,series,year,value
0,Argentina,Cost of business start-up procedures (% of GNI...,1994,20.39
1,Argentina,Cost of business start-up procedures (% of GNI...,1995,20.05
2,Argentina,Cost of business start-up procedures (% of GNI...,1996,19.71
3,Argentina,Cost of business start-up procedures (% of GNI...,1997,19.38
4,Argentina,Cost of business start-up procedures (% of GNI...,1998,19.04


### All previously missing values have been imputed

In [None]:
import pandas as pd
import numpy as np

#%matplotlib notebook to make interactive within same window as JN
#%matplotlib
#%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.lines import Line2D

#SciKit Learn
from sklearn import svm
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer

from sklearn import preprocessing
from sklearn.decomposition import PCA

#Plotly
import plotly.express as px
import plotly.graph_objects as go

import seaborn as sns
import scipy as sci
import statsmodels as stats

from impyute.imputation.cs import mice
from collections import Counter
import warnings  

# Read the raw export; the file marks missing observations with '..'.
WorldData = pd.read_csv("WorldMarketData.csv", na_values='..')
df_original = pd.DataFrame(WorldData)
pd.options.display.float_format = '{:,.2f}'.format

# Rename the columns: four identifier columns, then one column per year 1994-2018.
id_columns = ['country', 'country_code', 'series', 'series_code']
year_columns = [str(yr) for yr in range(1994, 2019)]
df_original.columns = id_columns + year_columns

# Discard the code columns and every row from position 3800 onward.
df = df_original.drop(columns=['country_code', 'series_code'])
df = df.drop(index=df.index[3800:])

# Wide -> long: one row per (country, series, year).
df1 = df.melt(id_vars=['country', 'series'], var_name='year', value_name='value')

# Remove the 'World' and income-group rows, then restore a flat index.
aggregate_rows = ['World', 'Low income', 'Middle income', 'High income']
df1 = df1.set_index('country').drop(index=aggregate_rows).reset_index()

# Year labels arrive as strings (former column names); store them as integers.
df1['year'] = df1['year'].astype(int)
df1.head()


# Suppress all warnings for the remainder of the session.
warnings.filterwarnings('ignore')

# Index by (series, country); df_stack_all is an alias of the same frame.
df_stack = df1.set_index(['series', 'country'])
df_stack_all = df_stack

# Long -> wide: one row per (country, year), one column per series.
dfsa = df_stack_all.reset_index()
dfsa = dfsa.pivot_table(index=['country', 'year'], columns='series', values='value', dropna=False)
dfsa = dfsa.reset_index()

# Keep only the columns that have at least len(dfsa) - 1000 non-null entries.
dfsa = dfsa.dropna(thresh=len(dfsa) - 1000, axis=1)

# 2014 cross-section, indexed by country.
# 'year' is stored as an integer (cast when df1 was built), so comparing it
# with the string '2014' never matches and silently produced an empty frame.
# Casting before the comparison works for both int and numeric-string years.
dfsa14 = dfsa[dfsa['year'].astype(int) == 2014]
dfsa14 = dfsa14.drop(columns=['year']).set_index('country')

#dfsa14.head()
dfsa.head()

# Reshape: long form first, then one row per (country, series) with one column per year
df2 = dfsa.melt(id_vars=['country', 'year'], var_name='series', value_name='value')
df2 = df2.pivot_table(index=['country', 'series'], columns='year', values='value', dropna=False)

# Filter: count the non-null years in every (country, series) row, then keep a
# series only when each of its rows has at least one observation
m = (
    df2.notna()
    .sum(axis=1)
    .groupby(level=1)
    .transform(lambda counts: counts.ge(1).all())
)
df2 = df2[m]

# Reset shape: back to long form, one row per (country, series, year)
df2 = df2.reset_index().melt(id_vars=['country', 'series'], var_name='year', value_name='value')
df2.head(5)

# Set 'year' to float so that `mice` receives an all-float matrix
df2['year'] = df2['year'].astype(float)

# Group df2 by country-series pairs
dfgrp = df2.groupby(['country', 'series'])

# Warnings can be silenced with warnings.filterwarnings('ignore') and restored
# with warnings.resetwarnings() if that has not been done earlier.

# Collect one finished frame per group and concatenate once after the loop.
# Growing a DataFrame with DataFrame.append inside the loop re-copies every
# accumulated row on each iteration, and that method was removed in pandas 2.0.
imputed_groups = []

for dfgrp_index, group in dfgrp:
    values = group['value'].values

    # Only groups with at least one missing value are passed to MICE
    if group['value'].isnull().any():
        # Impute on the two-column [year, value] matrix and keep the value column.
        # A plain float ndarray goes in, so the result is indexed positionally.
        year_value = group[['year', 'value']].values.astype(float)
        values = np.asarray(mice(year_value))[:, 1]

    # Build a fresh frame rather than assigning into the groupby slice
    imputed_groups.append(pd.DataFrame({
        'country': group['country'].values,
        'series': group['series'].values,
        'year': group['year'].values,
        'value': values,
    }))

# ignore_index=True yields a clean 0..n-1 index, so no reset_index/drop is needed
if imputed_groups:
    ImpData = pd.concat(imputed_groups, ignore_index=True)
else:
    # No groups at all: keep the expected schema instead of failing in concat
    ImpData = pd.DataFrame(columns=['country', 'series', 'year', 'value'])

# Change years back to int from float for readability
ImpData['year'] = ImpData['year'].astype(int)
# Display the total null values within the dataframe (should be 0)
ImpData.isnull().sum().sum()


# Work on an independent deep copy so ImpData itself stays untouched
imp_df = ImpData.copy(deep=True)
imp_df.head(5)

In [None]:
# One row per (country, year), one column per series
s = imp_df.pivot_table(index=['country', 'year'], columns='series', values='value')

# Min-max scale every series column, then restore the labels on the scaled array
sa = s.values  # returns a numpy array, can also use to_numpy
min_max_scaler = preprocessing.MinMaxScaler()
sa_scaled = min_max_scaler.fit_transform(sa)
s2 = pd.DataFrame(sa_scaled, columns=s.columns, index=s.index)

# A fractional n_components asks PCA to keep enough components to explain
# that share of the variance (here 95%)
pca = PCA(.95)
pca.fit(s2)

# Only the last expression of a cell is rendered, so a bare `pca.n_components_`
# in the middle of the cell was never shown; print it explicitly.
print(pca.n_components_)
print(pca.explained_variance_ratio_)

# Project the scaled data onto the retained components
pca.transform(s2)