In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt

test_data = pd.read_csv(Path("Resources/test_Y3wMUE5_7gLdaTN.csv"))
train_data = pd.read_csv(Path("Resources/train_u6lujuX_CVtuZ9i.csv"))

In [2]:
test_data.head()
#test_data.isnull().sum()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban


In [3]:
train_data.head()


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


# Gather Basic Info on Dataset

In [4]:
train_data.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [5]:
train_data.dtypes

Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

In [6]:
train_data.columns.tolist()

['Loan_ID',
 'Gender',
 'Married',
 'Dependents',
 'Education',
 'Self_Employed',
 'ApplicantIncome',
 'CoapplicantIncome',
 'LoanAmount',
 'Loan_Amount_Term',
 'Credit_History',
 'Property_Area',
 'Loan_Status']

In [7]:
# Check for null values in the dataset:
train_data.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [8]:
# Are there any null values in the Loan_Status column?
train_data['Loan_Status'].isnull().sum()

0

In [9]:
# Check for an imbalanced dataset using value counts of Loan_Status column
# NOTE: consider plotting this data to visualize
train_data['Loan_Status'].value_counts()

Loan_Status
Y    422
N    192
Name: count, dtype: int64

In [10]:
# Check column value counts for possible use of GetDummies for preprocessing
# train_data['Loan_Amount_Term'].value_counts() (MAYBE)
# train_data['LoanAmount'].value_counts() (NO)
# train_data['Credit_History'].value_counts() (alredy numerical)
# train_data['Dependents'].value_counts() (MAYBE)
# train_data['Property_Area'].value_counts() (YES)
train_data['Education'].value_counts()

Education
Graduate        480
Not Graduate    134
Name: count, dtype: int64

The Dataset is Imbalanced. We will need to balance the dataset.

Loan_ID: DROP COLUMN

Gender: encode numerically by defining encoding function
- Drop All Null

Married: encode numerically by defining encoding function
- Drop All Null

Dependents: Get Dummies
- Look into logic of encoding all null values based on Marriage value

Education: encode numerically by defining encoding function
- no null values exist

Self_Employed: encode numerically by defining encoding function
- Change all null to not self employed

ApplicantIncome: already numerical (confirm all numerical values)
- combine with CoapplicantIncome

CoapplicantIncome: already numerical (confirm all numerical values)
- combine with ApplicatIncome

LoanAmount: already numerical (confirm all numerical values)
- K Nearest Neighbor?

Loan_Amount_Term: encode numerically by defining encoding function
- Change all null to 360. Prediction would rsult in the same

Credit_History: already numerical
- use K Nearest Neighbors where loan status = No to predict replacements for null values and apply.

Property_Area: Get Dummies.

Loan_Status: NOTHING TO CHANGE

# Data Transformation Section

In [30]:
train_data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [31]:
preprocessing_df = train_data.copy()
preprocessing_df['Total_Income'] = preprocessing_df['ApplicantIncome'] + preprocessing_df['CoapplicantIncome']
preprocessing_df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Total_Income
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y,5849.0
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N,6091.0
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y,3000.0
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y,4941.0
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y,6000.0


In [35]:
preprocessing_df.columns

Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'LoanAmount', 'Loan_Amount_Term', 'Credit_History', 'Property_Area',
       'Loan_Status', 'Total_Income'],
      dtype='object')

In [34]:
columns_to_drop = ['Loan_ID', 'ApplicantIncome', 'CoapplicantIncome']
preprocessing_df.drop(columns=columns_to_drop, inplace=True)
preprocessing_df = preprocessing_df[['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Total_Income', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status']]
preprocessing_df.head()

KeyError: "['Loan_ID', 'ApplicantIncome', 'CoapplicantIncome'] not found in axis"

In [None]:
preprocessing_df.drop(columns=['Loan_ID', 'ApplicantIncome', 'CoapplicantIncome'], inplace=True)

preprocessing_df.head()

## Gender 

In [16]:
preprocessing_df['Gender'].value_counts()

Gender
Male      489
Female    112
Name: count, dtype: int64

In [17]:
gender_drop = preprocessing_df[preprocessing_df['Gender'].isnull()]
preprocessing_df.drop(index=gender_drop.index, inplace=True)
preprocessing_df.reset_index(drop=True, inplace=True)
preprocessing_df['Gender'].isnull().sum()

0

In [18]:
# Encoding Gender Column numerically by setting Female = 1 and Male = 0
def encode_gender(gender):
    if gender == "Female":
        return 1
    else:
        return 0

# Call the encode_gender function on the gender column
preprocessing_df["Gender"] = preprocessing_df["Gender"].apply(encode_gender)

# Review the DataFrame 
preprocessing_df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,0,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,0,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,0,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,0,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,0,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


## Marriage 

In [19]:
# Remove all null values
marriage_drop = preprocessing_df[preprocessing_df['Married'].isnull()]
preprocessing_df.drop(index=marriage_drop.index, inplace=True)
preprocessing_df.reset_index(drop=True, inplace=True)
preprocessing_df['Married'].isnull().sum()

0

In [20]:
# Encoding Marriage Column numerically by setting yes = 1 and no = 0
def encode_marriage(marriage):
    if marriage == "Yes":
        return 1
    else:
        return 0

# Call the encode_marriage function on the marriage column
preprocessing_df["Married"] = preprocessing_df["Married"].apply(encode_marriage)

# Review the DataFrame 
preprocessing_df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,0,0,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,0,1,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,0,1,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,0,1,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,0,0,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


# Education

In [21]:
# No null values exist for Education
# Encoding Marriage Column by setting yes as 1 and no as 0
def encode_edu(edu):
    if edu == "Graduate":
        return 1
    else:
        return 0

# Call the encode_edu function on the Education column
preprocessing_df["Education"] = preprocessing_df["Education"].apply(encode_edu)

# Review the DataFrame 
preprocessing_df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,0,0,0,1,No,5849,0.0,,360.0,1.0,Urban,Y
1,0,1,1,1,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,0,1,0,1,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,0,1,0,0,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,0,0,0,1,No,6000,0.0,141.0,360.0,1.0,Urban,Y
