In [73]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

In [74]:
df = pd.read_csv("loan_data.csv")

In [75]:
df.head()

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status
0,22.0,female,Master,71948.0,0,RENT,35000.0,PERSONAL,16.02,0.49,3.0,561,No,1
1,21.0,female,High School,12282.0,0,OWN,1000.0,EDUCATION,11.14,0.08,2.0,504,Yes,0
2,25.0,female,High School,12438.0,3,MORTGAGE,5500.0,MEDICAL,12.87,0.44,3.0,635,No,1
3,23.0,female,Bachelor,79753.0,0,RENT,35000.0,MEDICAL,15.23,0.44,2.0,675,No,1
4,24.0,male,Master,66135.0,1,RENT,35000.0,MEDICAL,14.27,0.53,4.0,586,No,1


In [76]:
df.describe()

Unnamed: 0,person_age,person_income,person_emp_exp,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,loan_status
count,45000.0,45000.0,45000.0,45000.0,45000.0,45000.0,45000.0,45000.0,45000.0
mean,27.764178,80319.05,5.410333,9583.157556,11.006606,0.139725,5.867489,632.608756,0.222222
std,6.045108,80422.5,6.063532,6314.886691,2.978808,0.087212,3.879702,50.435865,0.415744
min,20.0,8000.0,0.0,500.0,5.42,0.0,2.0,390.0,0.0
25%,24.0,47204.0,1.0,5000.0,8.59,0.07,3.0,601.0,0.0
50%,26.0,67048.0,4.0,8000.0,11.01,0.12,4.0,640.0,0.0
75%,30.0,95789.25,8.0,12237.25,12.99,0.19,8.0,670.0,0.0
max,144.0,7200766.0,125.0,35000.0,20.0,0.66,30.0,850.0,1.0


In [77]:
df.isnull().sum()

Unnamed: 0,0
person_age,0
person_gender,0
person_education,0
person_income,0
person_emp_exp,0
person_home_ownership,0
loan_amnt,0
loan_intent,0
loan_int_rate,0
loan_percent_income,0


In [78]:
df.isnull().values.any()

np.False_

In [79]:
df.head()

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status
0,22.0,female,Master,71948.0,0,RENT,35000.0,PERSONAL,16.02,0.49,3.0,561,No,1
1,21.0,female,High School,12282.0,0,OWN,1000.0,EDUCATION,11.14,0.08,2.0,504,Yes,0
2,25.0,female,High School,12438.0,3,MORTGAGE,5500.0,MEDICAL,12.87,0.44,3.0,635,No,1
3,23.0,female,Bachelor,79753.0,0,RENT,35000.0,MEDICAL,15.23,0.44,2.0,675,No,1
4,24.0,male,Master,66135.0,1,RENT,35000.0,MEDICAL,14.27,0.53,4.0,586,No,1


In [80]:
df.columns

Index(['person_age', 'person_gender', 'person_education', 'person_income',
       'person_emp_exp', 'person_home_ownership', 'loan_amnt', 'loan_intent',
       'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length',
       'credit_score', 'previous_loan_defaults_on_file', 'loan_status'],
      dtype='object')

In [81]:
encoded_df = pd.get_dummies(df, columns=['person_gender', 'person_education','person_home_ownership','loan_intent','previous_loan_defaults_on_file'])

In [82]:
encoded_df

Unnamed: 0,person_age,person_income,person_emp_exp,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,loan_status,person_gender_female,...,person_home_ownership_OWN,person_home_ownership_RENT,loan_intent_DEBTCONSOLIDATION,loan_intent_EDUCATION,loan_intent_HOMEIMPROVEMENT,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE,previous_loan_defaults_on_file_No,previous_loan_defaults_on_file_Yes
0,22.0,71948.0,0,35000.0,16.02,0.49,3.0,561,1,True,...,False,True,False,False,False,False,True,False,True,False
1,21.0,12282.0,0,1000.0,11.14,0.08,2.0,504,0,True,...,True,False,False,True,False,False,False,False,False,True
2,25.0,12438.0,3,5500.0,12.87,0.44,3.0,635,1,True,...,False,False,False,False,False,True,False,False,True,False
3,23.0,79753.0,0,35000.0,15.23,0.44,2.0,675,1,True,...,False,True,False,False,False,True,False,False,True,False
4,24.0,66135.0,1,35000.0,14.27,0.53,4.0,586,1,False,...,False,True,False,False,False,True,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44995,27.0,47971.0,6,15000.0,15.66,0.31,3.0,645,1,False,...,False,True,False,False,False,True,False,False,True,False
44996,37.0,65800.0,17,9000.0,14.07,0.14,11.0,621,1,True,...,False,True,False,False,True,False,False,False,True,False
44997,33.0,56942.0,7,2771.0,10.02,0.05,10.0,668,1,False,...,False,True,True,False,False,False,False,False,True,False
44998,29.0,33164.0,4,12000.0,13.23,0.36,6.0,604,1,False,...,False,True,False,True,False,False,False,False,True,False


In [83]:
X = encoded_df.drop(columns=['loan_status'])

In [84]:
y = encoded_df['loan_status']

In [85]:
X.shape

(45000, 27)

In [86]:
y.shape

(45000,)

In [87]:
x_train , x_test, y_train , y_test = train_test_split(X,y, test_size=0.2,random_state=1)

In [88]:
x_train.shape

(36000, 27)

In [89]:
x_test.shape

(9000, 27)

In [90]:
y_train.shape

(36000,)

In [91]:
y_test.shape

(9000,)

In [92]:
scaler = StandardScaler()

In [93]:
x_train = scaler.fit_transform(x_train)

In [94]:
x_test = scaler.fit_transform(x_test)

In [95]:
model = LogisticRegression()
model.fit(x_train,y_train)

In [96]:
y_pred = model.predict(x_test)

In [97]:
y_pred

array([0, 0, 1, ..., 1, 0, 0])

In [98]:
y

Unnamed: 0,loan_status
0,1
1,0
2,1
3,1
4,1
...,...
44995,1
44996,1
44997,1
44998,1


In [99]:
y_prob = model.predict_proba(x_test)[:, 1]

In [100]:
y_prob

array([5.23115596e-06, 1.78735962e-04, 9.39350838e-01, ...,
       7.75742419e-01, 2.67557516e-01, 2.40470048e-04])

In [101]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_prob))

Accuracy: 0.8937777777777778
F1 Score: 0.7583417593528817
ROC-AUC: 0.9530012550128908


In [104]:
"""
If i don't use Standard Scaler, performance metrics are:
Accuracy: 0.8374444444444444
F1 Score: 0.5735937044593413
ROC-AUC: 0.841767846808894

With Standard Scaler:
Accuracy: 0.8937777777777778
F1 Score: 0.7583417593528817
ROC-AUC: 0.9530012550128908
"""

"\nIf i don't use Standard Scaler metrics are:\nAccuracy: 0.8374444444444444\nF1 Score: 0.5735937044593413\nROC-AUC: 0.841767846808894\n\nWith Standard Scaler:\nAccuracy: 0.8937777777777778\nF1 Score: 0.7583417593528817\nROC-AUC: 0.9530012550128908\n"