In [1]:
# Import libraries and modules
# import datasets
# pre-process dataset (cleaning, missing data, encoding, train/test splitting)
# train models
# test models

In [2]:
# Step 1 import all libraries
import pandas as pd
import numpy as np

In [3]:
# Step 2 import your dataset
# Loading the data from the excel file
def load_requests_data(filename):
    return pd.read_excel(filename)

requests_data = load_requests_data("loan_requests.xlsx")
requests_data.head()
# requests_data = requests_data.set_index("Loan Request ID")

Unnamed: 0,Loan Request ID,Gender,Age,Salary,Approval
0,732,Female,39,137000,Yes
1,258,Female,41,65500,No
2,1059,Male,42,146500,Yes
3,1075,Male,62,83000,Yes
4,853,Male,42,75000,No


In [4]:
# pre-process dataset (cleaning, missing data, encoding, train/test splitting)
requests_data.info() # Check that we have no missing data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Loan Request ID  1000 non-null   int64 
 1   Gender           1000 non-null   object
 2   Age              1000 non-null   int64 
 3   Salary           1000 non-null   int64 
 4   Approval         1000 non-null   object
dtypes: int64(3), object(2)
memory usage: 39.2+ KB


In [5]:
# Creating dummy variables for Gender
gender_dummy = pd.get_dummies(requests_data['Gender'])
# Add dummy variable for Approval
approval_dummy = pd.get_dummies(requests_data['Approval'])
gender_dummy.head()

Unnamed: 0,Female,Male
0,1,0
1,1,0
2,0,1
3,0,1
4,0,1


In [6]:
# Add gender dummy variables columns to the request data
requests_data = pd.concat((requests_data, gender_dummy), axis=1)
# Add approval dummy variables columns to the request data
requests_data = pd.concat((requests_data, approval_dummy), axis=1)
requests_data.head()

Unnamed: 0,Loan Request ID,Gender,Age,Salary,Approval,Female,Male,No,Yes
0,732,Female,39,137000,Yes,1,0,0,1
1,258,Female,41,65500,No,1,0,1,0
2,1059,Male,42,146500,Yes,0,1,0,1
3,1075,Male,62,83000,Yes,0,1,0,1
4,853,Male,42,75000,No,0,1,1,0


In [7]:
requests_data.rename(columns={"Yes":"isApproved", "Female":"isFemale"}, inplace = True)
requests_data = requests_data.drop(['Gender', 'Male', 'No', 'Approval'], axis=1)

In [8]:
requests_data.head()

Unnamed: 0,Loan Request ID,Age,Salary,isFemale,isApproved
0,732,39,137000,1,1
1,258,41,65500,1,0
2,1059,42,146500,0,1
3,1075,62,83000,0,1
4,853,42,75000,0,0


In [9]:
# descriptive statistics over the data
requests_data.describe()

Unnamed: 0,Loan Request ID,Age,Salary,isFemale,isApproved
count,1000.0,1000.0,1000.0,1000.0,1000.0
mean,625.5,42.106,72689.0,0.516,0.402
std,288.819436,10.707073,34488.341867,0.499994,0.490547
min,126.0,20.0,15000.0,0.0,0.0
25%,375.75,34.0,46375.0,0.0,0.0
50%,625.5,42.0,72000.0,1.0,0.0
75%,875.25,50.0,90000.0,1.0,1.0
max,1125.0,65.0,152500.0,1.0,1.0


In [10]:
# How many requests are approved or not
requests_data["isApproved"].value_counts()

0    598
1    402
Name: isApproved, dtype: int64

In [11]:
# Step 3 import models and modules
from sklearn.linear_model import LogisticRegression
# For splitting the train and test data
from sklearn.model_selection import train_test_split

In [12]:
# Step 4 instantiate the models with hyperparamaterers
modelLogisticR = LogisticRegression() # default in this case

In [13]:
# Step 5 train the test split
# split x's and y ( .iloc[rows, columns])
# ignoring the request_id for now
x = requests_data.iloc[:,1:4] # age, salary, isfemale as x
y = requests_data.iloc[:,4] # isApproved as y

In [14]:
# Split train and test data (add random_state = 1 if you want to reproduce the same result again)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

In [15]:
# Step 6 train the models
modelLogisticR.fit(x_train, y_train)

LogisticRegression()

In [16]:
# Get the rSquare for the trained model
# print(modelLogisticR.score(x_test, y_test))


In [25]:
# Confusion matrix
from sklearn.metrics import confusion_matrix
# confusion_matrix(y_train, y_train_pred)

In [18]:
# https://scikit-learn.org/stable/modules/model_evaluation.html
from sklearn.metrics import precision_score, recall_score

In [19]:
# Reciever Operating Characcteristic and Area Under Curve
from sklearn.metrics import roc_curve,  roc_auc_score

In [28]:
# Step 7 sklearn metrics
LogisticRPred = modelLogisticR.predict(x_test)
print(precision_score(LogisticRPred, y_test))

0.0


In [26]:
roc_auc_score(y_test, LogisticRPred, average=None)

0.5