# KNN imputer Algo Steps

Step 1. Choose a missing value to fill

Step 2. Select the other values in that row

Step 3. Choose the number of neighbors (the code below uses `n_neighbors=5`)

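Step 4. Calculate the nan_euclidean distance from the selected row to every other row, using only the features that are present in both rows


So if there are 5 rows and we selected 1 row,
there will be 4 distance calculations in total (from the selected row to each of the other rows in the data).

Step 5. Fill the missing value with the mean of that feature over the nearest neighbors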

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Load the income data set. The literal ' ?' (note the leading space) is
# read in as NaN so that unknown entries count as missing values.
df = pd.read_csv(
    'data/income_evaluation.csv',
    na_values=[' ?'],
)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
# KNNImputer needs numeric input, so categorical columns would first have to
# be converted to numeric columns (e.g. with an ordinal encoder).
# Look at the distinct categories of one such column.
df[' education'].unique()

array([' Bachelors', ' HS-grad', ' 11th', ' Masters', ' 9th',
       ' Some-college', ' Assoc-acdm', ' Assoc-voc', ' 7th-8th',
       ' Doctorate', ' Prof-school', ' 5th-6th', ' 10th', ' 1st-4th',
       ' Preschool', ' 12th'], dtype=object)

In [4]:
# Count the missing (NaN) entries in every column.
df.isna().sum()

age                   0
 workclass         1836
 fnlwgt               0
 education            0
 education-num        0
 marital-status       0
 occupation        1843
 relationship         0
 race                 0
 sex                  0
 capital-gain         0
 capital-loss         0
 hours-per-week       0
 native-country     583
 income               0
dtype: int64

In [5]:
# --------------- inject additional missing values ---------------

In [6]:
# hours per week: blank out 20 randomly chosen rows (seeded for reproducibility)
np.random.seed(0)
h = np.random.choice(df.index, size=20, replace=False)
df.loc[h, ' hours-per-week'] = np.nan

In [7]:
# age: blank out 28 randomly chosen rows (seeded for reproducibility)
np.random.seed(10)
a = np.random.choice(df.index, size=28, replace=False)
df.loc[a, 'age'] = np.nan

In [8]:
# Re-count the missing values to confirm the NaNs injected into
# 'age' and ' hours-per-week'.
df.isna().sum()

age                  28
 workclass         1836
 fnlwgt               0
 education            0
 education-num        0
 marital-status       0
 occupation        1843
 relationship         0
 race                 0
 sex                  0
 capital-gain         0
 capital-loss         0
 hours-per-week      20
 native-country     583
 income               0
dtype: int64

In [10]:
# Hold out 20% of the rows as a test set; random_state fixes the shuffle.
# The target column is dropped via `columns=` rather than a positional axis
# argument, which newer pandas versions no longer accept in DataFrame.drop.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=' income'),
    df[' income'],
    test_size=0.2,
    random_state=5,
)


In [11]:
from sklearn.impute import KNNImputer

In [14]:
# Impute from the 5 nearest neighbours; add_indicator=True appends extra
# columns that flag which values were originally missing.
knn = KNNImputer(
    n_neighbors=5,
    add_indicator=True,
)

In [16]:
# Fitting on the full frame is expected to fail because it still contains
# string columns. Catch the error and display it so that the notebook can
# still be run from top to bottom without stopping here.
try:
    knn.fit(X_train)
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

ValueError: could not convert string to float: ' Private'

# could not convert string to float: ' Private'

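The fit fails because `X_train` still contains string columns: for example, ` workclass` holds values such as `' Private'`. KNNImputer needs numeric input, so such columns must first be encoded as numbers (0, 1, 2, ...) or left out.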

In [17]:
# let's check the data type of each column to see which ones are non-numeric
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26048 entries, 21425 to 2915
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              26025 non-null  float64
 1    workclass       24580 non-null  object 
 2    fnlwgt          26048 non-null  int64  
 3    education       26048 non-null  object 
 4    education-num   26048 non-null  int64  
 5    marital-status  26048 non-null  object 
 6    occupation      24573 non-null  object 
 7    relationship    26048 non-null  object 
 8    race            26048 non-null  object 
 9    sex             26048 non-null  object 
 10   capital-gain    26048 non-null  int64  
 11   capital-loss    26048 non-null  int64  
 12   hours-per-week  26029 non-null  float64
 13   native-country  25574 non-null  object 
dtypes: float64(2), int64(4), object(8)
memory usage: 3.0+ MB


In [19]:
# Collect the names of the int/float columns, i.e. every column whose
# dtype is not 'object'.
num = [name for name, kind in X_train.dtypes.items() if kind != 'O']
num

['age',
 ' fnlwgt',
 ' education-num',
 ' capital-gain',
 ' capital-loss',
 ' hours-per-week']

In [22]:
# Show only the selected non-object (int/float) columns of the training set.
X_train[num]

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
21425,55.0,238216,9,0,0,40.0
28707,24.0,306460,9,0,0,40.0
4455,48.0,213140,4,0,0,40.0
2231,36.0,127306,13,0,0,40.0
18864,53.0,103586,13,0,0,55.0
...,...,...,...,...,...,...
3046,54.0,220115,10,0,0,30.0
26301,42.0,211517,7,0,0,40.0
20463,85.0,166027,9,0,0,50.0
18638,36.0,469056,9,0,0,25.0


# Method -- fit -- transform

In [23]:
# Now that we have only the int/float columns, fit can process them.
knn.fit(X_train[num])

KNNImputer(add_indicator=True)

In [24]:
# Fill the missing values of the training set; the result is a NumPy array.
knn.transform(X_train[num])

array([[5.50000e+01, 2.38216e+05, 9.00000e+00, ..., 4.00000e+01,
        0.00000e+00, 0.00000e+00],
       [2.40000e+01, 3.06460e+05, 9.00000e+00, ..., 4.00000e+01,
        0.00000e+00, 0.00000e+00],
       [4.80000e+01, 2.13140e+05, 4.00000e+00, ..., 4.00000e+01,
        0.00000e+00, 0.00000e+00],
       ...,
       [8.50000e+01, 1.66027e+05, 9.00000e+00, ..., 5.00000e+01,
        0.00000e+00, 0.00000e+00],
       [3.60000e+01, 4.69056e+05, 9.00000e+00, ..., 2.50000e+01,
        0.00000e+00, 0.00000e+00],
       [2.60000e+01, 1.98163e+05, 1.40000e+01, ..., 4.00000e+01,
        0.00000e+00, 0.00000e+00]])

In [25]:
# Wrap the imputed array in a DataFrame for easier viewing; the column
# labels and the row index become plain positional integers.
pd.DataFrame(knn.transform(X_train[num]))

Unnamed: 0,0,1,2,3,4,5,6,7
0,55.0,238216.0,9.0,0.0,0.0,40.0,0.0,0.0
1,24.0,306460.0,9.0,0.0,0.0,40.0,0.0,0.0
2,48.0,213140.0,4.0,0.0,0.0,40.0,0.0,0.0
3,36.0,127306.0,13.0,0.0,0.0,40.0,0.0,0.0
4,53.0,103586.0,13.0,0.0,0.0,55.0,0.0,0.0
...,...,...,...,...,...,...,...,...
26043,54.0,220115.0,10.0,0.0,0.0,30.0,0.0,0.0
26044,42.0,211517.0,7.0,0.0,0.0,40.0,0.0,0.0
26045,85.0,166027.0,9.0,0.0,0.0,50.0,0.0,0.0
26046,36.0,469056.0,9.0,0.0,0.0,25.0,0.0,0.0


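Initially we had 6 columns; now we have 8 columns.

The first 6 are the original columns.

The last 2 are missing-value indicator columns (added because `add_indicator=True`).

Column 6 is the indicator for age, and column 7 is the indicator for hours-per-week.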

In [29]:
# Sanity check: total number of NaNs left in the imputed training data.
pd.DataFrame(knn.transform(X_train[num])).isna().sum().sum()

0

In [30]:
# ------------------------- Let's do the same for the test set ---------------------

In [31]:
# Show the same int/float columns for the test set.
X_test[num]

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
32264,32.0,260954,7,0,2042,30.0
28024,31.0,236391,10,0,0,40.0
9347,59.0,175689,10,0,0,14.0
10427,37.0,114765,10,0,0,40.0
4810,40.0,179717,13,0,1564,60.0
...,...,...,...,...,...,...
22533,48.0,125892,5,0,0,40.0
934,37.0,186934,7,3103,0,44.0
6546,26.0,177482,12,0,0,45.0
31812,47.0,258498,10,0,0,52.0


In [32]:
# Count the missing values per numeric column in the test set.
X_test[num].isna().sum()

age                5
 fnlwgt            0
 education-num     0
 capital-gain      0
 capital-loss      0
 hours-per-week    1
dtype: int64

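# Question: if we apply the KNN imputer to the test set, will it reuse what it learned from the
# training set, or will it start from scratch on the test data?

# It reuses the training data: `transform` does not learn anything from the test set; each missing test value is filled from its nearest neighbors among the training rows stored by `fit`.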

In [33]:
# Impute the test set with the imputer that was fitted on the training
# columns; only transform is called here, so there is no refit.
knn.transform(X_test[num])

array([[3.20000e+01, 2.60954e+05, 7.00000e+00, ..., 3.00000e+01,
        0.00000e+00, 0.00000e+00],
       [3.10000e+01, 2.36391e+05, 1.00000e+01, ..., 4.00000e+01,
        0.00000e+00, 0.00000e+00],
       [5.90000e+01, 1.75689e+05, 1.00000e+01, ..., 1.40000e+01,
        0.00000e+00, 0.00000e+00],
       ...,
       [2.60000e+01, 1.77482e+05, 1.20000e+01, ..., 4.50000e+01,
        0.00000e+00, 0.00000e+00],
       [4.70000e+01, 2.58498e+05, 1.00000e+01, ..., 5.20000e+01,
        0.00000e+00, 0.00000e+00],
       [4.50000e+01, 1.60962e+05, 1.00000e+01, ..., 3.50000e+01,
        0.00000e+00, 0.00000e+00]])

In [35]:
# Sanity check: total number of NaNs left in the imputed test data.
pd.DataFrame(knn.transform(X_test[num])).isna().sum().sum()

0

In [36]:
# View the imputed test data as a DataFrame; the column labels and the row
# index become plain positional integers.
pd.DataFrame(knn.transform(X_test[num]))

Unnamed: 0,0,1,2,3,4,5,6,7
0,32.0,260954.0,7.0,0.0,2042.0,30.0,0.0,0.0
1,31.0,236391.0,10.0,0.0,0.0,40.0,0.0,0.0
2,59.0,175689.0,10.0,0.0,0.0,14.0,0.0,0.0
3,37.0,114765.0,10.0,0.0,0.0,40.0,0.0,0.0
4,40.0,179717.0,13.0,0.0,1564.0,60.0,0.0,0.0
...,...,...,...,...,...,...,...,...
6508,48.0,125892.0,5.0,0.0,0.0,40.0,0.0,0.0
6509,37.0,186934.0,7.0,3103.0,0.0,44.0,0.0,0.0
6510,26.0,177482.0,12.0,0.0,0.0,45.0,0.0,0.0
6511,47.0,258498.0,10.0,0.0,0.0,52.0,0.0,0.0
