

# Dataset Information

Dream Housing Finance company deals in all kinds of home loans. It has a presence across urban, semi-urban and rural areas. A customer first applies for a home loan, after which the company validates the customer's eligibility for the loan. The company wants to automate the loan eligibility process (in real time) based on the customer details provided in the online application form. These details include Gender, Marital Status, Education, Number of Dependents, Income, Loan Amount, Credit History and others. To automate this process, the company has posed the problem of identifying the customer segments that are eligible for a loan, so that it can specifically target these customers.
   
Variable | Description
----------|--------------
Loan_ID | Unique Loan ID
Gender | Male/ Female
Married | Applicant married (Y/N)
Dependents | Number of dependents
Education | Applicant Education (Graduate/ Under Graduate)
Self_Employed | Self employed (Y/N)
ApplicantIncome | Applicant income
CoapplicantIncome | Coapplicant income
LoanAmount | Loan amount in thousands
Loan_Amount_Term | Term of loan in months
Credit_History | credit history meets guidelines
Property_Area | Urban/ Semi Urban/ Rural
Loan_Status | Loan approved (Y/N)

In [None]:
# Import all the libraries used in the project.

import warnings

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
warnings.filterwarnings('ignore')


# Retrieving Data

In [None]:
# Read the training data from the CSV file into a DataFrame,
# then preview its first five rows.

csv_path = 'train_loan_dataset.csv'
df = pd.read_csv(csv_path)
df.head(5)

In [None]:
# Size of the dataset as a (number of rows, number of columns) tuple.

df.shape

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.

df.describe()

In [None]:
# Data type and non-null count of every column.

df.info()

# Data Preparation

In [None]:
# Count the missing values in each column of the dataset.

df.isnull().sum()

In [None]:
# Fill the missing values of the numerical columns first, using each column's mean.
# NOTE(review): Credit_History is described as "credit history meets guidelines",
# which looks like a yes/no flag - confirm that mean imputation is intended for it.

for column in ('LoanAmount', 'Loan_Amount_Term', 'Credit_History'):
    column_mean = df[column].mean()
    df[column] = df[column].fillna(column_mean)

In [None]:
# Re-count the missing values per column after filling the numerical columns.

df.isnull().sum()

In [None]:
# Then fill the missing values of the categorical columns with each column's
# mode (its most frequently occurring value).

for column in ('Gender', 'Married', 'Dependents', 'Self_Employed'):
    most_frequent = df[column].mode()[0]
    df[column] = df[column].fillna(most_frequent)

In [None]:
# Re-count the missing values per column after filling the categorical columns.

df.isnull().sum()

In [None]:
# Label encoding: convert the string/object columns to integer codes.
# A separate encoder is fitted for each column and kept in `label_encoders`,
# so the original labels of any column can still be recovered later with
# `inverse_transform` (a single shared encoder only remembers the last column).

label_encoders = {}
for column in ["Loan_Status", "Gender", "Married", "Education", "Self_Employed", "Property_Area"]:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

# Dependents holds the open-ended bucket '3+'. Map that bucket to 4 (the same
# value as before) and cast the whole column to integers, rather than leaving
# a column that mixes strings with one integer value.
df["Dependents"] = df["Dependents"].replace("3+", "4").astype(int)

In [None]:
# Preview the first rows of the dataset after label encoding.

df.head()


In [None]:
# Data type and non-null count of every column after label encoding.

df.info()

In [None]:
# Derive a new feature: the combined income of the applicant and co-applicant.

applicant_income = df['ApplicantIncome']
coapplicant_income = df['CoapplicantIncome']
df['Total_Income'] = applicant_income.add(coapplicant_income)

In [None]:
# Preview the first rows of the dataset, now including Total_Income.

df.head()

# Data Exploration | Exploratory Data Analysis

In [None]:
# Visualize the features of the dataset, starting with the number of
# loans per education level, split by loan status.

sns.countplot(data=df, x='Education', hue='Loan_Status')

In [None]:
# Number of loans per marital status, split by loan status.
sns.countplot(data=df, x='Married', hue='Loan_Status')

In [None]:
# Number of loans per number of dependents, split by loan status.
sns.countplot(data=df, x='Dependents', hue='Loan_Status')


In [None]:
# Number of loans per self-employment flag, split by loan status.
sns.countplot(data=df, x='Self_Employed', hue='Loan_Status')


In [None]:
# Overall number of rows for each loan status value.
sns.countplot(data=df, x="Loan_Status")

In [None]:
# Distribution of the loan amount: a density-scaled histogram with a KDE curve.
# `sns.distplot` is deprecated and scheduled for removal, so use `histplot`;
# stat="density" keeps the y-axis on the density scale distplot used.
sns.histplot(df["LoanAmount"], kde=True, stat="density")

In [None]:
# Remove the columns that are not used as model inputs: the Loan_ID
# identifier and the two raw income columns that Total_Income already combines.

unused_columns = ["Loan_ID", "ApplicantIncome", "CoapplicantIncome"]
df = df.drop(columns=unused_columns)


In [None]:
# Preview the first rows of the dataset after dropping columns.
df.head()

# Data Modeling

In [None]:
# Select the input features (x) and the output variable (y).
# Loan_ID has normally been dropped already by this point, so listing it
# without errors='ignore' raises a KeyError; with errors='ignore' the cell
# works whether or not the column is still present.

x = df.drop(columns=['Loan_ID', 'Loan_Status'], errors='ignore')
y = df['Loan_Status']

In [None]:
# Data modeling: hold out 25% of the rows for testing and fit a
# logistic-regression baseline on the remaining 75%.

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)
# The features are not scaled, so the solver may need more than its default
# 100 iterations to converge. Warnings are silenced at the top of the
# notebook, so a convergence warning would otherwise go unnoticed.
model = LogisticRegression(max_iter=1000)
model.fit(x_train, y_train)
print("Accuracy is", model.score(x_test, y_test)*100)

In [None]:
# Predict on the held-out rows with the model fitted in the previous cell and
# score those predictions explicitly (no need to train the same model again).

y_pred = model.predict(x_test)
print("Accuracy is", accuracy_score(y_test,y_pred)*100)

In [None]:
# PCA to reduce dimensions.
# PCA follows the directions of largest variance, so standardize the features
# first; otherwise the column with the largest numeric scale dominates the
# components.
# NOTE(review): the scaler and PCA are fitted on all rows, before the
# train/test split - consider fitting them on the training rows only.

x_scaled = StandardScaler().fit_transform(x)
# A float n_components keeps as many components as are needed to explain
# that fraction (95%) of the variance.
pca = PCA(0.95)
x_pca = pca.fit_transform(x_scaled)

In [None]:
# Split the PCA-reduced features 75/25 and fit logistic regression on them.
x_train, x_test, y_train, y_test = train_test_split(
    x_pca, y, test_size=0.25, random_state=42
)
model = LogisticRegression().fit(x_train, y_train)
test_accuracy = model.score(x_test, y_test)
print("Accuracy is", test_accuracy*100)

In [None]:
# Predict on the held-out rows with the model fitted in the previous cell and
# score those predictions explicitly (no need to train the same model again).

y_pred = model.predict(x_test)
print("Accuracy is", accuracy_score(y_test,y_pred)*100)

In [None]:
# Split the original (non-PCA) features 75/25 and fit a decision tree.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)
# Seed the tree: an unseeded tree can break ties between equally good splits
# differently on each fit, so the reported accuracy would not be reproducible.
model = DecisionTreeClassifier(random_state=42)
model.fit(x_train, y_train)
print("Accuracy is", model.score(x_test, y_test)*100)

In [None]:
# Predict on the held-out rows with the tree fitted in the previous cell and
# score those predictions explicitly. The tree is not refitted here, so this
# accuracy refers to the same model whose score was just reported.

y_pred = model.predict(x_test)
print("Accuracy is", accuracy_score(y_test,y_pred)*100)

# Presentation and Automation

In [None]:
# Confusion matrix of the current model's predictions on the held-out rows.
# The model is not refitted here, so the matrix describes the same model
# whose accuracy was reported above.

y_pred = model.predict(x_test)

cm = confusion_matrix(y_test,y_pred)

In [None]:
# Draw the confusion matrix as an annotated heatmap. fmt='d' prints the cell
# counts as plain integers; the default format switches to scientific
# notation once a count reaches three digits.
sns.heatmap(cm, annot=True, fmt='d')