In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split

In [2]:
# Import the csv file
train_df = pd.read_csv(Path('Resources/lending_data.csv'))

In [3]:
# Display train_df
train_df.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


In [4]:
# Display info from train_df
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77536 entries, 0 to 77535
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   loan_size         77536 non-null  float64
 1   interest_rate     77536 non-null  float64
 2   borrower_income   77536 non-null  int64  
 3   debt_to_income    77536 non-null  float64
 4   num_of_accounts   77536 non-null  int64  
 5   derogatory_marks  77536 non-null  int64  
 6   total_debt        77536 non-null  int64  
 7   loan_status       77536 non-null  int64  
dtypes: float64(3), int64(5)
memory usage: 4.7 MB


In [5]:
# Null values
train_nonull_df = train_df.isnull()
print(train_nonull_df)


       loan_size  interest_rate  borrower_income  debt_to_income  \
0          False          False            False           False   
1          False          False            False           False   
2          False          False            False           False   
3          False          False            False           False   
4          False          False            False           False   
...          ...            ...              ...             ...   
77531      False          False            False           False   
77532      False          False            False           False   
77533      False          False            False           False   
77534      False          False            False           False   
77535      False          False            False           False   

       num_of_accounts  derogatory_marks  total_debt  loan_status  
0                False             False       False        False  
1                False             False       

In [6]:
# Duplicate entries dropped, if any
clean_train_df = train_df.drop_duplicates()
clean_train_df.shape
print(clean_train_df)


       loan_size  interest_rate  borrower_income  debt_to_income  \
0        10700.0          7.672            52800        0.431818   
1         8400.0          6.692            43600        0.311927   
2         9000.0          6.963            46100        0.349241   
3        10700.0          7.664            52700        0.430740   
4        10800.0          7.698            53000        0.433962   
...          ...            ...              ...             ...   
77524    16900.0         10.302            77500        0.612903   
77526    18300.0         10.895            83100        0.638989   
77528    15100.0          9.557            70500        0.574468   
77531    19100.0         11.261            86600        0.653580   
77534    16300.0         10.068            75300        0.601594   

       num_of_accounts  derogatory_marks  total_debt  loan_status  
0                    5                 1       22800            0  
1                    3                 0       

In [7]:
clean_train_df['loan_status'].value_counts()

0    3653
1    1576
Name: loan_status, dtype: int64

In [8]:
target = clean_train_df['loan_status']
target_names = ['negative', 'positive']

In [9]:
# Cleaned data displayed, removing column "loan_status"

cleaned_data = clean_train_df.drop('loan_status', axis = 1)
cleaned_data.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
0,10700.0,7.672,52800,0.431818,5,1,22800
1,8400.0,6.692,43600,0.311927,3,0,13600
2,9000.0,6.963,46100,0.349241,3,0,16100
3,10700.0,7.664,52700,0.43074,5,1,22700
4,10800.0,7.698,53000,0.433962,5,1,23000


Logistic Regression: Prediction
The logistic regression will be better predictor than the random forest classifier for this dataset. 

In [10]:
# Split the data into X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = train_test_split(clean_train_df, target, random_state=42)

X_train.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
5036,7900.0,6.488,41600,0.278846,2,0,11600,0
457,13500.0,8.848,63900,0.530516,7,1,33900,0
49687,12200.0,8.314,58800,0.489796,6,1,28800,0
8441,8300.0,6.658,43200,0.305556,2,0,13200,0
76946,20600.0,11.887,92500,0.675676,14,3,62500,1


Linear Regression Model Prediction (Unscaled)

In [11]:
from sklearn.linear_model import LogisticRegression

In [12]:
# Create the Logistic Regression Model
model = LogisticRegression(max_iter=1000)
model

LogisticRegression(max_iter=1000)

In [13]:
# Fit model
model.fit(X_train, y_train)

LogisticRegression(max_iter=1000)

In [14]:
training_data_score = model.score(X_train, y_train)
testing_data_score = model.score(X_test, y_test)
print(f"Training Data Score: {model.score(X_train, y_train)}")
print(f"Testing Data Score: {model.score(X_test, y_test)}")

Training Data Score: 0.8923743942871717
Testing Data Score: 0.918960244648318


In [15]:
y_pred = model.predict(X_test)
y_pred

array([0, 0, 1, ..., 0, 0, 1])

In [16]:
# Dependencies
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [17]:
metrics.confusion_matrix(y_test, y_pred)

array([[833,  73],
       [ 33, 369]])

In [18]:
# Calculating accuracy
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
accuracy = (tp + tn) / (tp + fp + tn + fn) 
print(f"Accuracy: {accuracy}")

Accuracy: 0.918960244648318


In [19]:
# Manually calculated the accuracy of the model
TP = 369
FP = 73
TN = 833
FN = 33
accuracy = (TP + TN) / (TP + FP + TN + FN)
print(accuracy)

0.918960244648318


In [20]:
# Calculate the classification report
print(classification_report(y_test, y_pred, target_names=target_names))


              precision    recall  f1-score   support

    negative       0.96      0.92      0.94       906
    positive       0.83      0.92      0.87       402

    accuracy                           0.92      1308
   macro avg       0.90      0.92      0.91      1308
weighted avg       0.92      0.92      0.92      1308



Random Forest Classifier Prediction (Unscaled)

In [32]:
# Import a Random Forests classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [26]:
# Create a classifier
classifier = RandomForestClassifier(n_estimators=100)

# Train the model using the training sets
classifier.fit(X_train, y_train)

print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 1.0
Testing Data Score: 1.0


In [25]:
# Fit a model, and then print a classification report
# Not sure why the numbers are equating 1.0

clf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred, target_names=target_names))
print(f'Training Score: {clf.score(X_train, y_train)}')
print(f'Testing Score: {clf.score(X_test, y_test)}')

              precision    recall  f1-score   support

    negative       1.00      1.00      1.00       906
    positive       1.00      1.00      1.00       402

    accuracy                           1.00      1308
   macro avg       1.00      1.00      1.00      1308
weighted avg       1.00      1.00      1.00      1308

Training Score: 1.0
Testing Score: 1.0


In [34]:
metrics.confusion_matrix(y_test, y_pred)

array([[906,   0],
       [  0, 402]])

In [29]:
y_pred = clf.predict(X_test)
y_pred

array([0, 0, 1, ..., 0, 0, 0])

In [33]:
# Calculate accuracy

print(f"Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 1.0


In [35]:
# Calculate accuracy
#Not sure why I am getting a 1.0
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
accuracy = (tp + tn) / (tp + fp + tn + fn) 
print(f"Accuracy: {accuracy}")

Accuracy: 1.0


In [36]:
# Calculate the classification report
print(classification_report(y_test, y_pred, target_names=target_names))


              precision    recall  f1-score   support

    negative       1.00      1.00      1.00       906
    positive       1.00      1.00      1.00       402

    accuracy                           1.00      1308
   macro avg       1.00      1.00      1.00      1308
weighted avg       1.00      1.00      1.00      1308



Logistic Regression Prediction (Scaled)

In [38]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [39]:
model2 = LogisticRegression()
model2.fit(X_train_scaled, y_train)
print(f"Training Data Score: {model2.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model2.score(X_test_scaled, y_test)}")

Training Data Score: 1.0
Testing Data Score: 1.0


In [40]:
# Confusion matrix

y_pred_scaled = model2.predict(X_test_scaled)
metrics.confusion_matrix(y_test, y_pred_scaled)

array([[906,   0],
       [  0, 402]])

In [41]:
# Calculate the classification report
print(classification_report(y_test, y_pred_scaled, target_names=target_names))

              precision    recall  f1-score   support

    negative       1.00      1.00      1.00       906
    positive       1.00      1.00      1.00       402

    accuracy                           1.00      1308
   macro avg       1.00      1.00      1.00      1308
weighted avg       1.00      1.00      1.00      1308



Random Forest Classifier (Scaled)

In [42]:
classifier2 = RandomForestClassifier(n_estimators=100)
classifier2.fit(X_train_scaled, y_train)
print(f"Training Data Score: {classifier2.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {classifier2.score(X_test_scaled, y_test)}")

Training Data Score: 1.0
Testing Data Score: 1.0


In [43]:
# Prediction on test data
y_pred_scaled = classifier2.predict(X_test_scaled)
metrics.confusion_matrix(y_test, y_pred_scaled)

array([[906,   0],
       [  0, 402]])

In [44]:
# Calculate the classification report
print(classification_report(y_test, y_pred_scaled, target_names=target_names))

              precision    recall  f1-score   support

    negative       1.00      1.00      1.00       906
    positive       1.00      1.00      1.00       402

    accuracy                           1.00      1308
   macro avg       1.00      1.00      1.00      1308
weighted avg       1.00      1.00      1.00      1308



Predictions and Results (LinearRegression-LR, RandomForestClassifier-RFC)

Unscaled:
Prediction - Linear Regression will prove to be a better model since the loan status outcomes are either 0, 1 and 2.
Results LR - Training Data Score: 0.8923743942871717
             Testing Data Score: 0.918960244648318
Results RFC - Training Data Score: 1.0
              Testing Data Score: 1.0  
Seems that the RFC is a solid predictor with a testing data score of 1.0**
              
Scaled:
Results LR - Training Data Score: 1.0
             Testing Data Score: 1.0
Results RFC - Training Data Score: 1.0
              Testing Data Score: 1.0
Seems both of the models are testing at 1.0**

** Something is not calculating correctly with the modeling...

