## Using KNN to impute missing values in a dataset

#### 1. Load the data and libraries

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
data = pd.read_csv("C:/Users/deepe/OneDrive/Desktop/INSAID/datasets/ablone data set/abalone-missing-values.csv")
data.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4177 entries, 0 to 4176
Data columns (total 9 columns):
Sex               4177 non-null object
Length            4177 non-null float64
Diameter          4177 non-null float64
Height            4177 non-null float64
Whole weight      4177 non-null float64
Shucked weight    4177 non-null float64
Viscera weight    4177 non-null float64
Shell weight      4139 non-null float64
Rings             4177 non-null int64
dtypes: float64(7), int64(1), object(1)
memory usage: 293.8+ KB


In [4]:
4177 - data.count()

Sex                0
Length             0
Diameter           0
Height             0
Whole weight       0
Shucked weight     0
Viscera weight     0
Shell weight      38
Rings              0
dtype: int64

In [6]:
data.columns

Index(['Sex', 'Length', 'Diameter', 'Height', 'Whole weight', 'Shucked weight',
       'Viscera weight', 'Shell weight', 'Rings'],
      dtype='object')

In [7]:
data = pd.get_dummies(data, columns=['Sex'], drop_first=True)

In [8]:
data.head()

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings,Sex_I,Sex_M
0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15,0,1
1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7,0,1
2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9,0,0
3,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10,0,1
4,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7,1,0


#### 2. Lets Seprate the Explanatory and Target Variables in X and Y

In [9]:
X = data.copy()

In [10]:
X.head()

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings,Sex_I,Sex_M
0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15,0,1
1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7,0,1
2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9,0,0
3,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10,0,1
4,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7,1,0


In [11]:
X2 = X[X.isnull().any(axis=1)]

In [12]:
X2.head()

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings,Sex_I,Sex_M
38,0.575,0.445,0.135,0.883,0.381,0.2035,,11,0,0
173,0.52,0.405,0.115,0.776,0.32,0.1845,,8,0,0
303,0.36,0.27,0.1,0.217,0.0885,0.0495,,6,0,1
439,0.5,0.415,0.165,0.6885,0.249,0.138,,13,0,1
572,0.59,0.455,0.155,1.066,0.382,0.2275,,20,0,0


In [13]:
X2.count()

Length            38
Diameter          38
Height            38
Whole weight      38
Shucked weight    38
Viscera weight    38
Shell weight       0
Rings             38
Sex_I             38
Sex_M             38
dtype: int64

In [14]:
X1 = X[X['Shell weight'].notnull()]

In [15]:
X1.head()

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings,Sex_I,Sex_M
0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15,0,1
1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7,0,1
2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9,0,0
3,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10,0,1
4,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7,1,0


In [16]:
X1.count()

Length            4139
Diameter          4139
Height            4139
Whole weight      4139
Shucked weight    4139
Viscera weight    4139
Shell weight      4139
Rings             4139
Sex_I             4139
Sex_M             4139
dtype: int64

In [17]:
4139+38

4177

#### 3. Split the X set into X1 and X2. Where X2 is the set with the missing values

In [18]:
Y1 = X1.pop('Rings')
Y2 = X2.pop('Rings')

In [20]:
Y2.head()

38     11
173     8
303     6
439    13
572    20
Name: Rings, dtype: int64

In [21]:
X1_SW=X1.pop('Shell weight')
X2_SW=X2.pop('Shell weight')

In [23]:
X2.head()

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Sex_I,Sex_M
38,0.575,0.445,0.135,0.883,0.381,0.2035,0,0
173,0.52,0.405,0.115,0.776,0.32,0.1845,0,0
303,0.36,0.27,0.1,0.217,0.0885,0.0495,0,1
439,0.5,0.415,0.165,0.6885,0.249,0.138,0,1
572,0.59,0.455,0.155,1.066,0.382,0.2275,0,0


#### 4.  Scale the dataset

In [24]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

In [25]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X1)                                                                                   # fit the data to Standardscaler

X1s = sc.transform(X1)                                                               # transform the data

print('After standardizing our features, the first 5 rows of our data now look like this:\n')   # print the first 5 value after standardization
print(pd.DataFrame(X1s, columns=X1.columns).head()) 

After standardizing our features, the first 5 rows of our data now look like this:

     Length  Diameter    Height  Whole weight  Shucked weight  Viscera weight  \
0 -0.575623 -0.433462 -1.064625     -0.643300       -0.608952       -0.727490   
1 -1.449676 -1.440817 -1.184061     -1.231503       -1.172316       -1.206587   
2  0.048701  0.120583 -0.109135     -0.310971       -0.464731       -0.357901   
3 -0.700488 -0.433462 -0.348007     -0.639223       -0.649514       -0.608857   
4 -1.616163 -1.541552 -1.422934     -1.273299       -1.217385       -1.288718   

      Sex_I     Sex_M  
0 -0.687716  1.315375  
1 -0.687716  1.315375  
2 -0.687716 -0.760240  
3 -0.687716  1.315375  
4  1.454088 -0.760240  


In [27]:
X1s = pd.DataFrame(X1s, columns=X1.columns)

In [28]:
X2s = sc.transform(X2) 
X2s = pd.DataFrame(X2s, columns=X1.columns)
X2s.head()

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Sex_I,Sex_M
0,0.423295,0.372422,-0.109135,0.109029,0.096379,0.207889,-0.687716,-0.76024
1,-0.034542,-0.03052,-0.58688,-0.109126,-0.178543,0.034502,-0.687716,-0.76024
2,-1.366433,-1.390449,-0.945189,-1.248833,-1.221892,-1.197461,-0.687716,1.315375
3,-0.201029,0.070215,0.607483,-0.287524,-0.498533,-0.389841,-0.687716,1.315375
4,0.54816,0.473157,0.368611,0.482135,0.100885,0.426905,-0.687716,-0.76024


#### 5. Use K=5 for imputing the missig values

In [29]:
from sklearn.neighbors import KNeighborsRegressor

In [30]:
# Instantiate learning model (k = 5)
regressor = KNeighborsRegressor(n_neighbors=5)

# Fitting the model
regressor.fit(X1s, X1_SW)

# Predicting the Test set results
X2_SW_pred = regressor.predict(X2s)

In [31]:
X2_SW = X2_SW_pred
X2_SW

array([0.2669, 0.2025, 0.0738, 0.2716, 0.3287, 0.0608, 0.0766, 0.2924,
       0.1732, 0.5429, 0.237 , 0.4213, 0.1693, 0.3521, 0.2791, 0.2935,
       0.158 , 0.093 , 0.1093, 0.0989, 0.0076, 0.3861, 0.1132, 0.2036,
       0.1555, 0.2236, 0.3501, 0.0626, 0.1587, 0.144 , 0.339 , 0.1501,
       0.0578, 0.1414, 0.1889, 0.2709, 0.4959, 0.131 ])

In [33]:
len(X2_SW)

38

#### 6. Fill in the missing values in the Xs table

In [34]:
X1['Shell weight'] = X1_SW
X2['Shell weight'] = X2_SW

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [36]:
X2.head()

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Sex_I,Sex_M,Shell weight
38,0.575,0.445,0.135,0.883,0.381,0.2035,0,0,0.2669
173,0.52,0.405,0.115,0.776,0.32,0.1845,0,0,0.2025
303,0.36,0.27,0.1,0.217,0.0885,0.0495,0,1,0.0738
439,0.5,0.415,0.165,0.6885,0.249,0.138,0,1,0.2716
572,0.59,0.455,0.155,1.066,0.382,0.2275,0,0,0.3287


In [37]:
X1['Rings'] = Y1
X2['Rings'] = Y2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [38]:
X2.head()

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Sex_I,Sex_M,Shell weight,Rings
38,0.575,0.445,0.135,0.883,0.381,0.2035,0,0,0.2669,11
173,0.52,0.405,0.115,0.776,0.32,0.1845,0,0,0.2025,8
303,0.36,0.27,0.1,0.217,0.0885,0.0495,0,1,0.0738,6
439,0.5,0.415,0.165,0.6885,0.249,0.138,0,1,0.2716,13
572,0.59,0.455,0.155,1.066,0.382,0.2275,0,0,0.3287,20


In [43]:
X1.append(X2)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4177 entries, 0 to 4121
Data columns (total 10 columns):
Length            4177 non-null float64
Diameter          4177 non-null float64
Height            4177 non-null float64
Whole weight      4177 non-null float64
Shucked weight    4177 non-null float64
Viscera weight    4177 non-null float64
Sex_I             4177 non-null uint8
Sex_M             4177 non-null uint8
Shell weight      4177 non-null float64
Rings             4177 non-null int64
dtypes: float64(7), int64(1), uint8(2)
memory usage: 301.9 KB


In [45]:
X = X1.append(X2)
X

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Sex_I,Sex_M,Shell weight,Rings
0,0.455,0.365,0.095,0.5140,0.2245,0.1010,0,1,0.1500,15
1,0.350,0.265,0.090,0.2255,0.0995,0.0485,0,1,0.0700,7
2,0.530,0.420,0.135,0.6770,0.2565,0.1415,0,0,0.2100,9
3,0.440,0.365,0.125,0.5160,0.2155,0.1140,0,1,0.1550,10
4,0.330,0.255,0.080,0.2050,0.0895,0.0395,1,0,0.0550,7
...,...,...,...,...,...,...,...,...,...,...
3647,0.485,0.365,0.125,0.4260,0.1630,0.0965,1,0,0.1414,8
3757,0.520,0.410,0.140,0.6990,0.3395,0.1290,1,0,0.1889,10
3865,0.525,0.395,0.165,0.7820,0.2850,0.1405,0,1,0.2709,19
3960,0.690,0.550,0.195,1.7770,0.7690,0.3800,0,1,0.4959,11


In [49]:
X.reindex(range(4177))

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Sex_I,Sex_M,Shell weight,Rings
0,0.455,0.365,0.095,0.5140,0.2245,0.1010,0,1,0.1500,15
1,0.350,0.265,0.090,0.2255,0.0995,0.0485,0,1,0.0700,7
2,0.530,0.420,0.135,0.6770,0.2565,0.1415,0,0,0.2100,9
3,0.440,0.365,0.125,0.5160,0.2155,0.1140,0,1,0.1550,10
4,0.330,0.255,0.080,0.2050,0.0895,0.0395,1,0,0.0550,7
...,...,...,...,...,...,...,...,...,...,...
4172,0.565,0.450,0.165,0.8870,0.3700,0.2390,0,0,0.2490,11
4173,0.590,0.440,0.135,0.9660,0.4390,0.2145,0,1,0.2605,10
4174,0.600,0.475,0.205,1.1760,0.5255,0.2875,0,1,0.3080,9
4175,0.625,0.485,0.150,1.0945,0.5310,0.2610,0,0,0.2960,10


In [50]:
data

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings,Sex_I,Sex_M
0,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15,0,1
1,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7,0,1
2,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9,0,0
3,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10,0,1
4,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7,1,0
...,...,...,...,...,...,...,...,...,...,...
4172,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11,0,0
4173,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10,0,1
4174,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9,0,1
4175,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10,0,0


#### 7. Use Support Vector Regression to Predict Y

In [51]:
X

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Sex_I,Sex_M,Shell weight,Rings
0,0.455,0.365,0.095,0.5140,0.2245,0.1010,0,1,0.1500,15
1,0.350,0.265,0.090,0.2255,0.0995,0.0485,0,1,0.0700,7
2,0.530,0.420,0.135,0.6770,0.2565,0.1415,0,0,0.2100,9
3,0.440,0.365,0.125,0.5160,0.2155,0.1140,0,1,0.1550,10
4,0.330,0.255,0.080,0.2050,0.0895,0.0395,1,0,0.0550,7
...,...,...,...,...,...,...,...,...,...,...
3647,0.485,0.365,0.125,0.4260,0.1630,0.0965,1,0,0.1414,8
3757,0.520,0.410,0.140,0.6990,0.3395,0.1290,1,0,0.1889,10
3865,0.525,0.395,0.165,0.7820,0.2850,0.1405,0,1,0.2709,19
3960,0.690,0.550,0.195,1.7770,0.7690,0.3800,0,1,0.4959,11


In [52]:
Y = X.pop('Rings')

In [53]:
X.head()

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Sex_I,Sex_M,Shell weight
0,0.455,0.365,0.095,0.514,0.2245,0.101,0,1,0.15
1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0,1,0.07
2,0.53,0.42,0.135,0.677,0.2565,0.1415,0,0,0.21
3,0.44,0.365,0.125,0.516,0.2155,0.114,0,1,0.155
4,0.33,0.255,0.08,0.205,0.0895,0.0395,1,0,0.055


In [54]:
sc.fit(X)                                                                                   # fit the data to Standardscaler

Xs = sc.transform(X)                                                               # transform the data

print('After standardizing our features, the first 5 rows of our data now look like this:\n')   # print the first 5 value after standardization
print(pd.DataFrame(Xs, columns=X.columns).head()) 

After standardizing our features, the first 5 rows of our data now look like this:

     Length  Diameter    Height  Whole weight  Shucked weight  Viscera weight  \
0 -0.574558 -0.432149 -1.064424     -0.641898       -0.607685       -0.726212   
1 -1.448986 -1.439929 -1.183978     -1.230277       -1.170910       -1.205221   
2  0.050033  0.122130 -0.107991     -0.309469       -0.463500       -0.356690   
3 -0.699476 -0.432149 -0.347099     -0.637819       -0.648238       -0.607600   
4 -1.615544 -1.540707 -1.423087     -1.272086       -1.215968       -1.287337   

      Sex_I     Sex_M  Shell weight  
0 -0.688018  1.316677     -0.637982  
1 -0.688018  1.316677     -1.212648  
2 -0.688018 -0.759488     -0.206983  
3 -0.688018  1.316677     -0.602066  
4  1.453451 -0.759488     -1.320398  


In [56]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(Xs, Y, test_size=0.20, random_state=1)

In [58]:
X_train

array([[-0.1998034 , -0.28098176,  0.37022577, ..., -0.68801788,
        -0.75948762, -0.24289988],
       [ 0.54970607,  0.47485339,  0.25067161, ..., -0.68801788,
        -0.75948762,  0.27070743],
       [ 0.13331192, -0.18020374, -0.34709919, ...,  1.45345059,
        -0.75948762, -0.35064967],
       ...,
       [-1.53226468, -1.59109603, -1.66219495, ...,  1.45345059,
        -0.75948762, -1.32039774],
       [ 1.13265788,  0.97874349,  1.2071049 , ..., -0.68801788,
         1.31667716, -0.4368495 ],
       [-3.28112011, -3.10276633, -2.61862823, ...,  1.45345059,
        -0.75948762, -1.68674701]])

In [62]:
from sklearn.svm import SVR
clf = SVR(C=0.8, epsilon=0.2, kernel='poly')
#regr = RandomForestRegressor(max_depth=3)

In [63]:
clf.fit(X_train, y_train)
Y_predict_test = clf.predict(X_test)
#Ypredict = regr.predict(Xtest_pca)



In [68]:
from sklearn import metrics
MSE_test = metrics.mean_squared_error(y_test, Y_predict_test)
MSE_test

5.931345801740869

In [70]:
np.sqrt(MSE_test)

2.4354354439690797

In [71]:
data.describe()

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings,Sex_I,Sex_M
count,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4139.0,4177.0,4177.0,4177.0
mean,0.523992,0.407881,0.139516,0.828742,0.359367,0.180594,0.239055,9.933684,0.321283,0.365813
std,0.120093,0.09924,0.041827,0.490389,0.221963,0.109614,0.139333,3.224169,0.467025,0.481715
min,0.075,0.055,0.0,0.002,0.001,0.0005,0.0015,1.0,0.0,0.0
25%,0.45,0.35,0.115,0.4415,0.186,0.0935,0.13,8.0,0.0,0.0
50%,0.545,0.425,0.14,0.7995,0.336,0.171,0.235,9.0,0.0,0.0
75%,0.615,0.48,0.165,1.153,0.502,0.253,0.3295,11.0,1.0,1.0
max,0.815,0.65,1.13,2.8255,1.488,0.76,1.005,29.0,1.0,1.0
