# Methods To Handle Missing Values

1) Drop Na
2) Fill na with mean
3) Linear Regression
4) SimpleInterpolation
5) Multiple Interpolation
6) Multiple Imputation

### Basic Imports

In [1]:
import numpy as np
import matplotlib as plt
import pandas as pd

### Method 1: Drop NA
This method drops all the values that are missing.

In [2]:
#importing a in-built dataset
from sklearn.datasets import fetch_california_housing
#reading the data
data = fetch_california_housing()
#getting the data in DF form
df = pd.DataFrame(data.data, columns = data.feature_names)
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [3]:
#introducing missing values
df.iloc[10:600:3] = np.nan #Says that, from row rage 10-600 with step of 3 fill such row values as null .
df.head(30)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
5,4.0368,52.0,4.761658,1.103627,413.0,2.139896,37.85,-122.25
6,3.6591,52.0,4.931907,0.951362,1094.0,2.128405,37.84,-122.25
7,3.12,52.0,4.797527,1.061824,1157.0,1.788253,37.84,-122.25
8,2.0804,42.0,4.294118,1.117647,1206.0,2.026891,37.84,-122.26
9,3.6912,52.0,4.970588,0.990196,1551.0,2.172269,37.84,-122.25


In [4]:
#this is the method
df.dropna(inplace = True)
df.head(69)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
92,0.7500,52.0,2.823529,0.911765,191.0,5.617647,37.80,-122.28
93,2.6354,27.0,3.493377,1.149007,718.0,2.377483,37.79,-122.27
95,2.0096,36.0,2.294016,1.066294,3469.0,1.493328,37.80,-122.26
96,2.8345,31.0,3.894915,1.127966,2048.0,1.735593,37.82,-122.26


In [5]:
df.isnull().sum() #Missing Values Gone....

MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
dtype: int64

### Method 2 : Fillna using mean
There are two ways here, one by using pandas and other using simpleimputer

In [6]:
#importing a in-built dataset
from sklearn.datasets import fetch_california_housing
#reading the data
data = fetch_california_housing()
#getting the data in DF form
df = pd.DataFrame(data.data, columns = data.feature_names)
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [7]:
df_p = df 

In [8]:
#Uses existing pandas mean method to fill the data up
df_p.fillna(df_p.mean(), inplace = True)
df_p.isnull().sum()

MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
dtype: int64

In [9]:
df_s = df 

In [10]:
#Another method is to use SimpleImputer available under sklearn.impute
from sklearn.impute import SimpleImputer
imputer = SimpleImputer()

In [11]:
arr_imputed = imputer.fit_transform(df_s)
arr_imputed

array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
          37.88      , -122.23      ],
       [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
          37.86      , -122.22      ],
       [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
          37.85      , -122.24      ],
       ...,
       [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
          39.43      , -121.22      ],
       [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
          39.43      , -121.32      ],
       [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
          39.37      , -121.24      ]])

In [12]:
#Now to convert the above array into dataframe
df_imputed = pd.DataFrame(arr_imputed, columns = df_s.columns) #here we get the imputed array as data and the column names of the df_s as column names  
df_imputed

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


In [13]:
df_imputed.isnull().sum()

MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
dtype: int64

### Method 3 Linear Regression
We use Linear Regression

In [14]:
from sklearn.datasets import load_iris
data = load_iris()
data

{'data': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
  

In [15]:
#Converting data into dataframe
df = pd.DataFrame(data.data, columns=data.feature_names)
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [16]:
#checking null values
df.isnull().sum()

sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
dtype: int64

In [17]:
#Introducing Null Values
df.iloc[10:100:4,0] = np.nan
df.isnull().sum()

sepal length (cm)    23
sepal width (cm)      0
petal length (cm)     0
petal width (cm)      0
dtype: int64

In [18]:
#Splitting data such that one set has just the missing values and other doesnt
df_miss = df[df.isna().any(axis=1)]
df_nomis = df.dropna()


In [19]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
x_train = df_nomis.drop(columns = ['sepal length (cm)'])
y_train = df_nomis['sepal length (cm)']
x_test = df_miss.drop(columns = ['sepal length (cm)'])
regressor.fit(x_train,y_train)

In [20]:
df_miss['sepal length (cm)'] = regressor.predict(x_test)
df_miss

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_miss['sepal length (cm)'] = regressor.predict(x_test)


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
10,5.197308,3.7,1.5,0.2
14,5.180866,4.0,1.2,0.2
18,5.346803,3.8,1.7,0.3
22,4.793145,3.6,1.0,0.2
26,4.977509,3.4,1.6,0.4
30,4.888822,3.1,1.6,0.2
34,4.820548,3.1,1.5,0.2
38,4.621207,3.0,1.3,0.2
42,4.746793,3.2,1.3,0.2
46,5.328376,3.8,1.6,0.2


In [21]:
df_final = pd.concat([df_miss, df_nomis], axis = 0)
df_final.isnull().sum()

sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
dtype: int64

### Method 4: Linear Interpolation


In [22]:
from sklearn.datasets import load_iris
iris=  load_iris()
x = pd.DataFrame(iris.data, columns = iris.feature_names)
y = iris.target

In [23]:
#Adding missing values
x.iloc[10:20, 0] = None
x.iloc[20:30, 2] = None
x.isnull().sum()

sepal length (cm)    10
sepal width (cm)      0
petal length (cm)    10
petal width (cm)      0
dtype: int64

In [24]:
x = x.interpolate(method='linear')

In [25]:
x.isnull().sum()

sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
dtype: int64

### Method 5: Multiple Interpolation

In [26]:
from sklearn.datasets import load_iris
iris = load_iris()
x = pd.DataFrame(iris.data, columns = iris.feature_names)
y = iris.target

In [27]:
# Putting some missing values
x.iloc[10:20,0] = None
x.isnull().sum()

sepal length (cm)    10
sepal width (cm)      0
petal length (cm)     0
petal width (cm)      0
dtype: int64

In [28]:
# Interpolate missing values using different interpolation methods
x['sepal length (cm)'] = x['sepal length (cm)'].interpolate(method='linear')
x['sepal width (cm)'] = x['sepal width (cm)'].interpolate(method='quadratic')

In [29]:
x.isnull().sum()

sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
dtype: int64

### Method 6: Multiple Imputation Technique

In [30]:
from sklearn.datasets import load_iris
from sklearn.experimental import enable_iterative_imputer 
from sklearn.impute import IterativeImputer

In [31]:
data = load_iris()
x = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target

In [32]:
# Adding some missing values
x.iloc[10:40:2,0] = None

In [33]:
imp = IterativeImputer(max_iter = 10, random_state=0)
x_imputed = imp.fit_transform(x) #is an array now

In [34]:
pd.DataFrame(x_imputed).isnull().sum()

0    0
1    0
2    0
3    0
dtype: int64