In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn import metrics

In [2]:
dataset = pd.read_csv('dataset.csv')

In [3]:
dataset.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [4]:
num_columns = dataset.select_dtypes(include = ['int64','float64']).columns.tolist()
cat_columns = dataset.select_dtypes(include = ['object']).columns.tolist()
cat_columns.remove('Loan_ID')
cat_columns.remove('Loan_Status')

In [5]:
# Filling categorical columns with mode
for col in cat_columns:
    dataset[col].fillna(dataset[col].mode()[0],inplace = True)

# Filling Numerical columns with median
for col in num_columns:
    dataset[col].fillna(dataset[col].median(),inplace = True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[col].fillna(dataset[col].mode()[0],inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[col].fillna(dataset[col].median(),inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on whic

In [6]:
#Clipping Outliers
dataset[num_columns] = dataset[num_columns].apply(lambda x: x.clip(*x.quantile([0.05,0.95])))

In [7]:
#Log transformation to reduce right skewness
dataset['LoanAmount'] = np.log(dataset['LoanAmount']).copy()
dataset['TotalIncome'] = dataset['ApplicantIncome'] + dataset['CoapplicantIncome']
dataset['TotalIncome'] = np.log(dataset['TotalIncome']).copy()
dataset = dataset.drop(columns=['ApplicantIncome','CoapplicantIncome'])

dataset = dataset.drop(columns = ['Loan_ID'])

In [8]:
#Using Encoders to transform Categorical Columns

lab_enc_cols = ['Gender','Married','Education','Self_Employed','Loan_Status']
one_hot_cols = ['Dependents', 'Property_Area']

le = LabelEncoder()
for col in lab_enc_cols:
    dataset[col] = le.fit_transform(dataset[col])

one_encoder = OneHotEncoder(sparse_output = False)
one_hot_encoded = one_encoder.fit_transform(dataset[one_hot_cols])
one_hot_encoded_df = pd.DataFrame(one_hot_encoded, columns = one_encoder.get_feature_names_out(one_hot_cols))

dataset = dataset.drop(columns = one_hot_cols)
transformed_dataset = pd.concat([dataset,one_hot_encoded_df],axis = 1)

In [9]:
#Splitting the now transformed dataset into Train and Test Set


X = transformed_dataset.drop(columns = ['Loan_Status'])
y = transformed_dataset['Loan_Status']

#Scaling the data for ease of convergence
scaler = StandardScaler()
X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3,random_state = 42)


In [10]:
#Training the models
RANDOM_SEED = 10

#Random Forest
rf_classifier = RandomForestClassifier(RANDOM_SEED)
param_grid_forest = {
    'n_estimators': [200, 400, 600],
    'max_depth': [10,20,30],
    'criterion':['gini','entropy'],
    'max_leaf_nodes': [50,100]
}
grid_search_forest = GridSearchCV(
    estimator = rf_classifier,
    param_grid = param_grid_forest,
    cv = 5,
    n_jobs = -1,
    scoring = 'f1'
)

model_forest = rf_classifier.fit(X_train,y_train)

#Logistic Regression
log_classifier = LogisticRegression(RANDOM_SEED)
param_grid_log = {
    'C': [100, 10, 1, 0.1],
    'penalty': ['l2']
}
grid_search_log = GridSearchCV(
    estimator = log_classifier,
    param_grid = param_grid_log,
    cv = 5,
    n_jobs = -1,
    scoring = 'f1'
)
model_log = grid_search_log.fit(X_train,y_train)

#Support Vector Machine Classifier
svm_classifier = SVC()
param_grid_svm = {
    'kernel': ['rbf'],
    'C':[ 0.01, 1, 100],
    'gamma': [0.01, 1, 10]
}
grid_search_svm = GridSearchCV(
    estimator = svm_classifier,
    param_grid = param_grid_svm,
    cv = 5,
    n_jobs = -1,
    scoring = 'f1'
)
model_svm = grid_search_svm.fit(X_train, y_train)

In [16]:
#Model Evaluation

def eval_metrics(model, X, y, name):
    pred = model.predict(X)

    F1_score = metrics.f1_score(y, pred)
    recall = metrics.recall_score(y, pred)
    precision = metrics.precision_score(y, pred)
    

    print(f' {name}  F1 Score {F1_score:.2f}  Precison {precision:.2f}  Recall {recall:.2f}')
eval_metrics(model_log, X_test, y_test, 'Logistic Regression')
eval_metrics(model_forest, X_test, y_test, 'Random Forest Classifier')
eval_metrics(model_svm, X_test, y_test, 'SVM Classifier Regession')

 Logistic Regression  F1 Score 0.86  Precison 0.76  Recall 0.98
 Random Forest Classifier  F1 Score 0.82  Precison 0.79  Recall 0.86
 SVM Classifier Regession  F1 Score 0.86  Precison 0.76  Recall 0.98
