# Imputing Missing Data for 2014

In [1]:
import pandas as pd
import numpy as np


#%matplotlib notebook to make interactive within same window as JN
#%matplotlib
#%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.lines import Line2D

#SciKit Learn
from sklearn import svm
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer

from sklearn import preprocessing
from sklearn.decomposition import PCA

#Plotly
import plotly.express as px
import plotly.graph_objects as go

import seaborn as sns
import scipy as sci
import statsmodels as stats

## Data

In [11]:
# Data
# Read the raw CSV; cells containing '..' are parsed as missing values.
WorldData = pd.read_csv("WorldMarketData.csv", na_values = '..')
df_original = pd.DataFrame(WorldData)

# Render floats with thousands separators and two decimal places.
pd.options.display.float_format = '{:,.2f}'.format

# Four identifier columns followed by one column per year, 1994 through 2018.
df_original.columns = (
    ['country', 'country_code', 'series', 'series_code']
    + [str(year) for year in range(1994, 2019)]
)

# Discard the code columns, then every row from position 3800 onward.
# NOTE(review): presumably the rows past 3800 are trailing non-data rows — confirm.
df = df_original.drop(columns=['country_code', 'series_code'])
df = df.drop(df.index[3800:])

# Wide -> long: one row per (country, series, year) with its value.
df1 = pd.melt(df, id_vars=["country", 'series'], var_name="year", value_name="value")

# Remove the rows whose 'country' label is one of these four aggregate
# groupings, then restore 'country' as an ordinary column.
df1 = (
    df1.set_index('country')
    .drop(index=['World', 'Low income', 'Middle income', 'High income'])
    .reset_index()
)
df1.head()

Unnamed: 0,country,series,year,value
0,United States,Agricultural raw materials exports (% of merch...,1994,3.49
1,United States,Agricultural raw materials imports (% of merch...,1994,2.04
2,United States,Average time to clear exports through customs ...,1994,
3,United States,Bribery incidence (% of firms experiencing at ...,1994,
4,United States,Changes in inventories (current US$),1994,63785000000.0


In [13]:
# Long table indexed by (series, country); both names are kept available.
df_stack = df1.set_index(['series', 'country'])
df_stack_all = df_stack

# Long -> wide: one row per (country, year), one column per series.
# pivot_table aggregates duplicate (country, year, series) entries with its
# default aggregation (mean).
dfsa = (
    df_stack_all.reset_index()
    .pivot_table(values='value', index=['country', 'year'], columns='series')
    .reset_index()
)

# Keep only the columns that have at least len(dfsa) - 1000 non-missing values.
dfsa = dfsa.dropna(axis='columns', thresh=len(dfsa) - 1000)

# 2014 snapshot: filter on the (string) year label, drop it, index by country.
dfsa14 = (
    dfsa[dfsa['year'] == '2014']
    .drop(columns=['year'])
    .set_index('country')
)

dfsa14.head()

series,Agricultural raw materials exports (% of merchandise exports),Agricultural raw materials imports (% of merchandise imports),Changes in inventories (current US$),Commercial service exports (current US$),Commercial service imports (current US$),"Computer, communications and other services (% of commercial service exports)","Computer, communications and other services (% of commercial service imports)",Cost of business start-up procedures (% of GNI per capita),"Cost to export, border compliance (US$)","Cost to export, documentary compliance (US$)",...,"Tariff rate, applied, simple mean, all products (%)","Tariff rate, most favored nation, weighted mean, all products (%)",Technicians in R&D (per million people),Time required to build a warehouse (days),Time required to start a business (days),"Time to export, border compliance (hours)","Time to export, documentary compliance (hours)","Time to import, border compliance (hours)","Time to import, documentary compliance (hours)",Trade (% of GDP)
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Argentina,1.07,0.89,6752296832.26,13189837804.76,17628881981.13,43.11,37.89,17.7,150.0,60.0,...,12.58,11.5,318.81,342.5,24.5,21.0,30.0,60.0,336.0,28.41
Australia,2.58,0.68,-2035251996.7,55610991575.62,66749264533.55,22.6,28.32,0.7,749.0,264.0,...,2.84,2.69,,112.0,2.5,36.0,7.0,37.0,3.0,42.51
Bahrain,0.03,1.04,414893617.02,8571276595.74,6764414893.62,25.26,15.41,0.8,47.0,100.0,...,3.75,3.99,17.21,174.0,9.3,79.0,24.0,54.0,84.0,175.56
Belgium,1.39,1.27,1714139993.32,123026103755.97,117578947480.15,59.37,52.46,5.2,0.0,0.0,...,2.58,2.81,,212.0,4.0,0.0,1.0,0.0,1.0,164.7
Brazil,4.01,0.99,16587335316.62,39046701888.6,85915546867.55,62.78,50.47,5.5,862.0,226.4,...,13.74,9.92,,434.0,83.6,61.0,30.0,51.1,146.1,24.69


In [5]:
# Work on a copy so the 2014 snapshot in dfsa14 stays untouched.
DF = dfsa14.copy()

# Replace every NaN with the median of its column.
fill_NaN = SimpleImputer(missing_values=np.nan, strategy='median')

# The imputer's output is re-wrapped in a DataFrame carrying DF's row and
# column labels, attached here in a single step.
# NOTE(review): per the scikit-learn docs, SimpleImputer by default discards
# columns that are entirely NaN; if that happens the column count no longer
# matches DF.columns and this line raises — confirm no series is empty in 2014.
imputed_DF = pd.DataFrame(fill_NaN.fit_transform(DF), index=DF.index, columns=DF.columns)

# Imputes the median of each series across the rows of the 2014 snapshot,
# not a value derived from the same country's other years.
# Because this is just for 2014, we will press on with these values, even
# though they may be wildly inaccurate.

In [15]:
# Total number of missing cells in the 2014 frame before imputation.
DF.isna().sum().sum()

196

In [16]:
# Total number of missing cells after imputation; 0 means every NaN was filled.
imputed_DF.isna().sum().sum()

0