In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [38]:
# Load the training and test splits from the local dataset directory
# (training file first, then the test file).
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('dataset/train.csv', 'dataset/test.csv')
)

In [39]:
# Preview the first five rows of the training data.
df_train.head(n=5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [40]:
# Summarise each training column's dtype and non-null count to spot
# columns with missing values.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [41]:
# Distinct values recorded for Gender (NaN is listed as its own entry).
df_train.loc[:, 'Gender'].unique()

array(['Male', 'Female', nan], dtype=object)

In [42]:
# Number of training rows with a missing Gender.
df_train['Gender'].isna().sum()

13

In [43]:
# Missing-value count for every training column.
df_train.isna().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [44]:
# Impute missing Gender values in both splits with the most frequent
# category observed in the training data.
gender_mode = df_train['Gender'].mode()[0]
for split in (df_train, df_test):
    split['Gender'] = split['Gender'].fillna(gender_mode)

In [45]:
# Confirm that no missing Gender values remain in the training data.
df_train['Gender'].isna().sum()

0

In [46]:
# Impute missing Married values in both splits with the most frequent
# category observed in the training data.
married_mode = df_train['Married'].mode().iloc[0]
df_train['Married'] = df_train['Married'].fillna(married_mode)
df_test['Married'] = df_test['Married'].fillna(married_mode)

In [47]:
# Confirm that no missing Married values remain in the training data.
df_train['Married'].isna().sum()

0

In [48]:
# Distinct values recorded for Dependents; note they are stored as strings.
df_train.loc[:, 'Dependents'].unique()

array(['0', '1', '2', '3+', nan], dtype=object)

In [49]:
# Frequency of each Dependents category, excluding missing values.
df_train['Dependents'].value_counts(dropna=True)

Dependents
0     345
1     102
2     101
3+     51
Name: count, dtype: int64

In [50]:
# Impute missing Dependents in both splits with the most frequent category
# observed in the training data. The column stores strings ('0', '1', '2',
# '3+'), so the fill value must be a string as well: filling with the
# integer 0 would leave both 0 and '0' in the column as two distinct
# categories and break later counting or encoding.
# NOTE(review): assumes df_test stores Dependents as strings too - confirm.
dependents_mode = df_train['Dependents'].mode()[0]
for split in (df_train, df_test):
    split['Dependents'] = split['Dependents'].fillna(dependents_mode)

In [51]:
# Confirm that no missing Dependents values remain in the training data.
df_train['Dependents'].isna().sum()

0

In [52]:
# Distinct values recorded for Self_Employed (NaN is listed as its own entry).
df_train.loc[:, 'Self_Employed'].unique()

array(['No', 'Yes', nan], dtype=object)

In [53]:
# Impute missing Self_Employed values in both splits with the most frequent
# category observed in the training data.
self_employed_mode = df_train['Self_Employed'].mode()[0]
for split in (df_train, df_test):
    split['Self_Employed'] = split['Self_Employed'].fillna(self_employed_mode)

In [54]:
# Confirm that no missing Self_Employed values remain in the training data.
df_train['Self_Employed'].isna().sum()

0

In [55]:
# Summary statistics (count, mean, std, quartiles, min/max) of the training
# columns, used below to choose an imputation strategy for each one.
df_train.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [57]:
# LoanAmount's mean is larger than its median, which indicates a
# right-skewed distribution, and the maximum (700) lies far above the 75th
# percentile, which suggests outliers. The mean is sensitive to such
# extremes, so the missing values are replaced with the training median
# in both splits.
loan_amount_median = df_train['LoanAmount'].median()
for split in (df_train, df_test):
    split['LoanAmount'] = split['LoanAmount'].fillna(loan_amount_median)

In [58]:
# Distinct loan terms present in the training data.
df_train.loc[:, 'Loan_Amount_Term'].unique()

array([360., 120., 240.,  nan, 180.,  60., 300., 480.,  36.,  84.,  12.])

In [59]:
# Loan_Amount_Term is the duration the loan is granted for. It takes only a
# handful of discrete values, so it is treated like a categorical variable
# and imputed with the most frequent training value in both splits.
# NOTE(review): the unit was previously described as days; values such as
# 360 look more like months - confirm against the data dictionary.
loan_term_mode = df_train['Loan_Amount_Term'].mode().iloc[0]
df_train['Loan_Amount_Term'] = df_train['Loan_Amount_Term'].fillna(loan_term_mode)
df_test['Loan_Amount_Term'] = df_test['Loan_Amount_Term'].fillna(loan_term_mode)

In [60]:
# Distinct values recorded for Credit_History (NaN is listed as its own entry).
df_train.loc[:, 'Credit_History'].unique()

array([ 1.,  0., nan])

In [61]:
# Credit_History is effectively a yes/no flag (1.0 or 0.0), so the missing
# entries in both splits are replaced with the most frequent training value.
credit_history_mode = df_train['Credit_History'].mode()[0]
for split in (df_train, df_test):
    split['Credit_History'] = split['Credit_History'].fillna(credit_history_mode)

In [62]:
# Re-check the training data: per-column missing-value counts after imputation.
df_train.isna().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [63]:
# Re-check the test data: per-column missing-value counts after imputation.
df_test.isna().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
dtype: int64