# Missing Value Analysis

In [1]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.impute import SimpleImputer

warnings.filterwarnings('ignore')

In [18]:
# Read the raw dataset from disk; the bare name on the last line renders it as the cell output
data = pd.read_csv(filepath_or_buffer='Data.csv')
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [19]:
# DataFrame.shape is a (number of rows, number of columns) tuple
data.shape

(11, 4)

## To check the total missing values in each column of the data

In [20]:
# Count of NaN entries per column (isna is the same check as isnull)
data.isna().sum()

Country      1
Age          1
Salary       1
Purchased    0
dtype: int64

## To check the percentage of missing values in each column of the data

In [21]:
# The mean of the boolean NaN mask is the fraction missing per column; scale it to a percentage
data.isna().mean() * 100

Country      9.090909
Age          9.090909
Salary       9.090909
Purchased    0.000000
dtype: float64

## Missing value Treatment

- Missing value treatment is the process of handling the missing values present in the data, either by removing them or by replacing (imputing) them

## Method 1 : Drop the missing value from the data

In [22]:
# Remove every row that has at least one NaN; `data` itself is left untouched
data_dropped = data.dropna(axis='index', how='any')
data_dropped

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
5,France,35.0,58000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes
10,France,49.0,98000.0,Yes


In [23]:
# Shape after dropna(); compare the row count with data.shape to see how many rows were removed
data_dropped.shape

(8, 4)

### Dropping the missing values deletes the complete row from the data (here 3 of the 11 rows are lost); hence, dropping the missing values is not a good strategy

## Method 2: Mean Imputation : 

- Replace the missing values with the mean of that column(applicable only for numerical variables)

In [24]:
# Show the frame again before imputing; it still contains the NaN entries
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [25]:
# Series.mean() skips NaN by default, so the missing Age does not affect the result
data['Age'].mean()

39.8

In [26]:
# Series.mean() skips NaN by default, so the missing Salary does not affect the result
data['Salary'].mean()

67200.0

In [27]:
# Fill missing ages with the column mean, computed from the data instead of typed in by hand.
# The ages shown are whole numbers, so the mean is rounded to the nearest whole year before filling.
data['Age'] = data['Age'].fillna(round(data['Age'].mean()))
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,40.0,52000.0,No
7,,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [28]:
# Fill missing salaries with the column mean, computed from the data instead of a hard-coded number
data['Salary'] = data['Salary'].fillna(data['Salary'].mean())
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,67200.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,40.0,52000.0,No
7,,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


## Method 3 : Median Imputation

- Replace the missing values in the data with median value of that column(applicable only for numerical variables)

In [30]:
# Re-read the file so this section starts from the un-imputed values
# (the mean-imputation cells above overwrote the Age and Salary columns of `data`)
data = pd.read_csv(filepath_or_buffer='Data.csv')
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [31]:
# Series.median() skips NaN by default
data['Age'].median()

39.0

In [32]:
# Series.median() skips NaN by default
data['Salary'].median()

64000.0

In [33]:
# Work on an independent copy so `data` keeps its missing values for the later methods
data_median = data.copy(deep=True)

In [34]:
# Fill each numeric column with its own median, computed from the column
# rather than copied by hand from the outputs above
for numeric_col in ('Age', 'Salary'):
    data_median[numeric_col] = data_median[numeric_col].fillna(data_median[numeric_col].median())
data_median

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,64000.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,39.0,52000.0,No
7,,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


## Method 4 : Mode Imputation

- Replace the missing values in the data with the mode of that column (applicable for both object and numerical data)

In [36]:
# Independent copy for the mode-imputation demo; `data` is left unchanged
data_mode = data.copy(deep=True)

In [37]:
# Show the working copy before mode imputation
data_mode

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [41]:
# mode() returns a Series, not a scalar, because several values can tie for most frequent
data['Country'].mode()

0    France
Name: Country, dtype: object

In [42]:
# Take the first entry of the mode Series to get a single value
data['Country'].mode().iloc[0]

'France'

In [43]:
# Fill the missing Country with the most frequent value, taken from the same frame
# that is being imputed so this cell does not depend on the state of `data`
country_mode = data_mode['Country'].mode().iloc[0]
data_mode['Country'] = data_mode['Country'].fillna(country_mode)
data_mode

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [44]:
# Mode imputation on the numeric columns.
# NOTE(review): the Age and Salary values shown all appear to be distinct, in which case mode()
# returns every value in sorted order and the first entry is just the smallest one.
for numeric_col in ('Age', 'Salary'):
    first_mode = data_mode[numeric_col].mode()[0]
    data_mode[numeric_col] = data_mode[numeric_col].fillna(first_mode)
data_mode

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,48000.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,27.0,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


## Method 5 : Forward fill Imputation
- Value present in the previous row is filled in place of the missing values(applicable for both numerical and object variables)

In [45]:
# Independent copy for the forward-fill demo; displayed before any filling
data_forward = data.copy(deep=True)
data_forward

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [47]:
# Forward fill: each NaN takes the value from the nearest earlier row of the same column.
# .ffill() replaces fillna(method='ffill'), which is deprecated in recent pandas releases;
# the notebook's warnings filter would otherwise hide that deprecation warning.
ffill_cols = ['Country', 'Age', 'Salary']
data_forward[ffill_cols] = data_forward[ffill_cols].ffill()
data_forward

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,61000.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,35.0,52000.0,No
7,Spain,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


## Method 6 : Backward fill

- The next row value is replaced in place of missing value (applicable for both numerical and object variables)

In [49]:
# Independent copy for the backward-fill demo; displayed before any filling
data_bw = data.copy(deep=True)
data_bw

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [51]:
# Backward fill: each NaN takes the value from the nearest later row of the same column.
# .bfill() replaces fillna(method='bfill'), which is deprecated in recent pandas releases;
# the notebook's warnings filter would otherwise hide that deprecation warning.
bfill_cols = ['Country', 'Age', 'Salary']
data_bw[bfill_cols] = data_bw[bfill_cols].bfill()
data_bw

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,58000.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,48.0,52000.0,No
7,Germany,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


## Summary of Missing value treatment :

- Numerical variables - Mean and median imputation using fillna() or sklearn impute package
- Object - Mode imputation using fillna() or sklearn impute package
- Numerical and object both - backward fill, forward fill, Mode(but only recommended for object)
- Dropna - deleting the rows that contain missing values (not recommended because the entire row in the data gets deleted)

#### Which is the best method out of all this?

- Numerical variable - Median()
- object variable - Mode()
- (if a date column has missing values, then go for dropna())

## Method 7: Mean, Median and Mode Imputation using sklearn library

In [52]:
# Imputer that replaces missing entries with the column mean
# (SimpleImputer is imported once in the import cell at the top of the notebook)
si = SimpleImputer(strategy='mean')
si

In [53]:
# fit_transform takes a 2-D input (hence the double brackets) and returns an (n_rows, 1) array;
# flatten it to 1-D before storing it back as the column
age_imputed = si.fit_transform(data[['Age']])
data['Age'] = age_imputed.ravel()
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,39.8,52000.0,No
7,,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [55]:
## Median imputation using sklearn impute on the Salary variable
## (SimpleImputer is imported once in the import cell at the top of the notebook)

si = SimpleImputer(strategy='median')
si

In [56]:
# fit_transform returns an (n_rows, 1) array; flatten it to 1-D before storing it back as the column
salary_imputed = si.fit_transform(data[['Salary']])
data['Salary'] = salary_imputed.ravel()
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,64000.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,39.8,52000.0,No
7,,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [59]:
## Mode imputation for the Country variable using sklearn impute
## (SimpleImputer is imported once in the import cell at the top of the notebook)

si = SimpleImputer(strategy='most_frequent')
si

In [60]:
# fit_transform returns an (n_rows, 1) array; flatten it to 1-D before storing it back as the column
country_imputed = si.fit_transform(data[['Country']])
data['Country'] = country_imputed.ravel()
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,64000.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,39.8,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


## Method 8 : Dropping the column

- When the missing value percentage of a column is greater than 30%, then you must drop the column

In [None]:
## Method 8 : Dropping the column
## - When the missing value percentage of a column is greater than 30%, drop the column

# Percentage of missing values per column, then drop every column above the threshold.
# No column of `data` came close to 30% in the percentages computed earlier (about 9% at most),
# so nothing is removed here; the cell only demonstrates the pattern.
MISSING_PCT_THRESHOLD = 30
missing_pct = data.isnull().mean() * 100
cols_to_drop = missing_pct[missing_pct > MISSING_PCT_THRESHOLD].index
data_cols_dropped = data.drop(columns=cols_to_drop)
data_cols_dropped

## Applying some of these methods on the Loan data

In [62]:
# Read the loan dataset and preview its first five rows
loan = pd.read_csv(filepath_or_buffer='loan_data_set.csv')
loan.head(5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


## To check total missing values in each column of the data

In [63]:
# Count of NaN entries per column (isna is the same check as isnull)
loan.isna().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

## To check the percentage of missing values in each column of the data

In [64]:
# The mean of the boolean NaN mask is the fraction missing per column; scale it to a percentage
loan.isna().mean() * 100

Loan_ID              0.000000
Gender               2.117264
Married              0.488599
Dependents           2.442997
Education            0.000000
Self_Employed        5.211726
ApplicantIncome      0.000000
CoapplicantIncome    0.000000
LoanAmount           3.583062
Loan_Amount_Term     2.280130
Credit_History       8.143322
Property_Area        0.000000
Loan_Status          0.000000
dtype: float64

In [66]:
## Median imputation on the LoanAmount variable

# Fill the gaps with the column's own median, then re-count the NaN entries per column
loan['LoanAmount'] = loan['LoanAmount'].fillna(value=loan['LoanAmount'].median())
loan.isna().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            0
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [69]:
## Apply mode imputation on the Gender and Dependents columns

# DataFrame.mode() returns one row per tied mode; .iloc[0] selects the first mode of each
# column as a Series keyed by column name, and fillna() applies each value to its own column.
mode_cols = ['Gender', 'Dependents']
loan[mode_cols] = loan[mode_cols].fillna(loan[mode_cols].mode().iloc[0])
loan.isnull().sum()

Loan_ID               0
Gender                0
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            0
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [71]:
# Same mode imputation done with sklearn: one imputer fitted on both columns at once.
# (SimpleImputer is imported once in the import cell at the top of the notebook.)
# NOTE(review): the fillna() cell above already fills Gender and Dependents, so when the
# notebook is run top to bottom this cell should leave the values unchanged.
mode_cols = ['Gender', 'Dependents']
si = SimpleImputer(strategy='most_frequent')
loan[mode_cols] = si.fit_transform(loan[mode_cols])

## ========================================