# Simple inputer is used to fill na values before feeeding to ML models

In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_csv('data/income_evaluation.csv', na_values = ' ?')
# here we tell pandas that ? found in data is a missing value
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
df.isna().sum()

age                   0
 workclass         1836
 fnlwgt               0
 education            0
 education-num        0
 marital-status       0
 occupation        1843
 relationship         0
 race                 0
 sex                  0
 capital-gain         0
 capital-loss         0
 hours-per-week       0
 native-country     583
 income               0
dtype: int64

# Let's create some more missing values

In [7]:
# hours per week missing values
np.random.seed(seed=0)
h = np.random.choice(a=df.index, replace=False, size=20) #df.index is basically the row no. in default
df.loc[h, ' hours-per-week'] = np.nan

In [8]:
# age missing values
np.random.seed(seed=10)
a = np.random.choice(a=df.index, replace=False, size=28)
df.loc[a, 'age'] = np.nan

In [9]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(' income', axis=1),
                                                    df[' income'], test_size=0.2,
                                                    random_state=30)

In [10]:
#-------------------------------- Imputing Age's Missing Values---------------------------

In [12]:
#create object
si_age = SimpleImputer(strategy='mean', add_indicator=True)

In [15]:
# fit object on training set
a = pd.DataFrame(si_age.fit_transform(X_train[['age']]))
a

Unnamed: 0,0,1
0,64.0,0.0
1,40.0,0.0
2,36.0,0.0
3,33.0,0.0
4,48.0,0.0
...,...,...
26043,23.0,0.0
26044,61.0,0.0
26045,31.0,0.0
26046,47.0,0.0


In [16]:
# by which value model has imputed the missing values
si_age.statistics_

array([38.54201729])

In [17]:
# a has two col: first - value of age, 2nd- telling a bool if value 
# was missing in that row if yes it is assigned as 1

In [18]:
# let's see that bool
a[a[1] == 1]

Unnamed: 0,0,1
2969,38.542017,1.0
3219,38.542017,1.0
3522,38.542017,1.0
4925,38.542017,1.0
5543,38.542017,1.0
5754,38.542017,1.0
6305,38.542017,1.0
7237,38.542017,1.0
8587,38.542017,1.0
11314,38.542017,1.0


In [20]:
# create the same for occupation
si_occ = SimpleImputer(strategy='constant', add_indicator=True, fill_value='not available')
# since occ is not a number, we will use strategy as 'constant'/'most_frequent'
# here constant means that we will be replacing nan with particular value

In [21]:
si_occ.fit_transform(X_train[[' occupation']])

array([[' Exec-managerial', False],
       [' Transport-moving', False],
       [' Transport-moving', False],
       ...,
       [' Other-service', False],
       [' Sales', False],
       [' Tech-support', False]], dtype=object)

In [24]:

# see data
b = pd.DataFrame(si_occ.fit_transform(X_train[[' occupation']]))
b

Unnamed: 0,0,1
0,Exec-managerial,False
1,Transport-moving,False
2,Transport-moving,False
3,Craft-repair,False
4,Adm-clerical,False
...,...,...
26043,Farming-fishing,False
26044,Adm-clerical,False
26045,Other-service,False
26046,Sales,False


In [28]:
b[b[1] == True]

Unnamed: 0,0,1
26,not available,True
29,not available,True
51,not available,True
61,not available,True
73,not available,True
...,...,...
25968,not available,True
25974,not available,True
25998,not available,True
26004,not available,True


In [29]:
#------------------ filling same mean in age test set as well-------------------------

In [33]:
si_age.statistics_

array([38.54201729])

In [30]:
si_age.transform(X_test[['age']])

array([[48.,  0.],
       [63.,  0.],
       [33.,  0.],
       ...,
       [48.,  0.],
       [54.,  0.],
       [58.,  0.]])

In [31]:
c = pd.DataFrame(si_age.transform(X_test[['age']]))

In [32]:
c[c[1] == 1]

Unnamed: 0,0,1
2526,38.542017,1.0
4068,38.542017,1.0
4111,38.542017,1.0
5324,38.542017,1.0
5930,38.542017,1.0
