In [1]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

---

## Split the Data into Training and Testing Sets

### Step 1: Read the `lending_data.csv` data from the `Resources` folder into a Pandas DataFrame.

In [2]:
# Read the CSV file from the Resources folder into a Pandas DataFrame
file_path = Path("Resources/lending_data.csv")
df_lending = pd.read_csv(file_path)
# Review the DataFrame
df_lending.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


### Step 2: Create the labels set (`y`)  from the “loan_status” column, and then create the features (`X`) DataFrame from the remaining columns.

In [3]:
# Separate the data into labels and features

# Separate the y variable, the labels
y = df_lending["loan_status"]
target_names = ["healthy loan", "high-risk loan"]

# Separate the X variable, the features
X = df_lending.drop("loan_status", axis=1)
feature_names = X.columns
X.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
0,10700.0,7.672,52800,0.431818,5,1,22800
1,8400.0,6.692,43600,0.311927,3,0,13600
2,9000.0,6.963,46100,0.349241,3,0,16100
3,10700.0,7.664,52700,0.43074,5,1,22700
4,10800.0,7.698,53000,0.433962,5,1,23000


In [4]:
# Review the y variable Series
print(y[:-5])
print(y[:5])

0        0
1        0
2        0
3        0
4        0
        ..
77526    1
77527    1
77528    1
77529    1
77530    1
Name: loan_status, Length: 77531, dtype: int64
0    0
1    0
2    0
3    0
4    0
Name: loan_status, dtype: int64


In [5]:
# Review the y variable Series
y.info()

<class 'pandas.core.series.Series'>
RangeIndex: 77536 entries, 0 to 77535
Series name: loan_status
Non-Null Count  Dtype
--------------  -----
77536 non-null  int64
dtypes: int64(1)
memory usage: 605.9 KB


In [6]:
# Review the X variable DataFrame
X.describe()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
count,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0
mean,9805.562577,7.292333,49221.949804,0.377318,3.82661,0.392308,19221.949804
std,2093.223153,0.889495,8371.635077,0.081519,1.904426,0.582086,8371.635077
min,5000.0,5.25,30000.0,0.0,0.0,0.0,0.0
25%,8700.0,6.825,44800.0,0.330357,3.0,0.0,14800.0
50%,9500.0,7.172,48100.0,0.376299,4.0,0.0,18100.0
75%,10400.0,7.528,51400.0,0.416342,4.0,1.0,21400.0
max,23800.0,13.235,105200.0,0.714829,16.0,3.0,75200.0


### Step 3: Split the data into training and testing datasets by using `train_test_split`.

In [7]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

---

## Machine Learning Model 1: Create a Logistic Regression Model with the Original Data

###  Step 1: Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [8]:
# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model
classifier = LogisticRegression(solver='lbfgs', random_state=1)

# Fit the model using training data
classifier.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Step 2: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [9]:
# Make a prediction using the testing data
predictions = classifier.predict(X_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

Unnamed: 0,Prediction,Actual
60914,0,0
36843,0,0
1966,0,0
70137,0,0
27237,0,0
...,...,...
45639,0,0
11301,0,0
51614,0,0
4598,0,0


### Step 3: Evaluate the model’s performance by doing the following:

* Generate a confusion matrix.

* Print the classification report.

In [10]:
# Generate a confusion matrix for the model
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual healthy loan", "Actual high-risk loan"], columns=["Predicted healthy loan", "Predicted high-risk loan"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [11]:
# Print the classification report for the model
print("** Logistic Regression Model")
print("* Confusion Matrix")
display(cm_df)
print(f"* Accuracy Score : {acc_score}")
print(f"""
* Classification Report""")
print(classification_report(y_test, predictions, target_names=target_names))

** Logistic Regression Model
* Confusion Matrix


Unnamed: 0,Predicted healthy loan,Predicted high-risk loan
Actual healthy loan,18655,110
Actual high-risk loan,36,583


* Accuracy Score : 0.9924680148576145

* Classification Report
                precision    recall  f1-score   support

  healthy loan       1.00      0.99      1.00     18765
high-risk loan       0.84      0.94      0.89       619

      accuracy                           0.99     19384
     macro avg       0.92      0.97      0.94     19384
  weighted avg       0.99      0.99      0.99     19384



### Step 4: Answer the following question.

**Question:** How well does the logistic regression model predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

The confusion matrix and classification report provide important insights into the performance of the logistic regression model in predicting healthy loans (0) and high-risk loans (1). 

- True Positives (TP): 583 high-risk loans correctly predicted as high-risk loans (Actual high-risk loan, Predicted high-risk loan).
- True Negatives (TN): 18655 healthy loans correctly predicted as healthy loans (Actual healthy loan, Predicted healthy loan).
- False Positives (FP): 110 healthy loans incorrectly predicted as high-risk loans (Actual healthy loan, Predicted high-risk loan).
- False Negatives (FN): 36 high-risk loans incorrectly predicted as healthy loans (Actual high-risk loan, Predicted healthy loan).

Accuracy:
Overall Accuracy: 0.99 — This means that the model predicts the correct label (healthy loan or high-risk loan) 99% of the time, indicating strong overall performance.

---

## Machine Learning Model 2: Support Vector Machines (SVM) Model

In [12]:
# Fit to the training data and validate with the test data
from sklearn.svm import SVC
model_svm = SVC(kernel='linear')
model_svm.fit(X_train, y_train)

In [13]:
# Make a prediction using the testing data
predictions_svm = model_svm.predict(X_test)

In [14]:
# Generate a confusion matrix for the model
cm_svm = confusion_matrix(y_test, predictions_svm)
cm_svm_df = pd.DataFrame(
    cm_svm, index=["Actual healthy loan", "Actual high-risk loan"], columns=["Predicted healthy loan", "Predicted high-risk loan"]
)

# Calculating the accuracy score
acc_svm_score = accuracy_score(y_test, predictions_svm)

In [15]:
# Print the classification report for the model
print("** Support Vector Machines (SVM) Model")
print("* Confusion Matrix")
display(cm_svm_df)
print(f"* Accuracy Score : {acc_svm_score}")
print(f"""
* Classification Report""")
print(classification_report(y_test, predictions_svm, target_names=target_names))

** Support Vector Machines (SVM) Model
* Confusion Matrix


Unnamed: 0,Predicted healthy loan,Predicted high-risk loan
Actual healthy loan,18654,111
Actual high-risk loan,15,604


* Accuracy Score : 0.9934997936442427

* Classification Report
                precision    recall  f1-score   support

  healthy loan       1.00      0.99      1.00     18765
high-risk loan       0.84      0.98      0.91       619

      accuracy                           0.99     19384
     macro avg       0.92      0.98      0.95     19384
  weighted avg       0.99      0.99      0.99     19384



---

## Machine Learning Model 3: Decision Tree Model

In [16]:
from sklearn.preprocessing import StandardScaler
from sklearn import tree
# Creating StandardScaler instance
scaler = StandardScaler()

In [17]:
# Fitting Standard Scaller
X_scaler = scaler.fit(X_train)

In [18]:
# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [19]:
# Creating the decision tree classifier instance
model_tree = tree.DecisionTreeClassifier()

In [20]:
# Fitting the model
model_tree = model_tree.fit(X_train_scaled, y_train)

In [21]:
# Making predictions using the testing data
predictions_tree = model_tree.predict(X_test_scaled)

In [22]:
# Generate a confusion matrix for the model
cm_tree = confusion_matrix(y_test, predictions_tree)
cm_tree_df = pd.DataFrame(
    cm_tree, index=["Actual healthy loan", "Actual high-risk loan"], columns=["Predicted healthy loan", "Predicted high-risk loan"]
)

# Calculating the accuracy score
acc_tree_score = accuracy_score(y_test, predictions_tree)

In [23]:
# Print the classification report for the model
print("** Decision Tree Model")
print("* Confusion Matrix")
display(cm_tree_df)
print(f"* Accuracy Score : {acc_tree_score}")
print(f"""
* Classification Report""")
print(classification_report(y_test, predictions_tree, target_names=target_names))

** Decision Tree Model
* Confusion Matrix


Unnamed: 0,Predicted healthy loan,Predicted high-risk loan
Actual healthy loan,18668,97
Actual high-risk loan,91,528


* Accuracy Score : 0.9903012794056955

* Classification Report
                precision    recall  f1-score   support

  healthy loan       1.00      0.99      0.99     18765
high-risk loan       0.84      0.85      0.85       619

      accuracy                           0.99     19384
     macro avg       0.92      0.92      0.92     19384
  weighted avg       0.99      0.99      0.99     19384



---

## Machine Learning Model 4: K-Nearest Neighbors (KNN) Model

In [24]:
from sklearn.neighbors import KNeighborsClassifier

# Instantiate the model with k = 3 neighbors
model_knn = KNeighborsClassifier(n_neighbors=3)

In [25]:
# Train the model
model_knn.fit(X_train_scaled, y_train)

In [26]:
# Create predictions
predictions_knn = model_knn.predict(X_test_scaled)

# Review the predictions
predictions_knn

array([0, 0, 0, ..., 0, 0, 0])

In [27]:
# Generate a confusion matrix for the model
cm_knn = confusion_matrix(predictions_knn, y_test)
cm_knn_df = pd.DataFrame(
    cm_knn, index=["Actual healthy loan", "Actual high-risk loan"], columns=["Predicted healthy loan", "Predicted high-risk loan"]
)

# Calculating the accuracy score
acc_knn_score = accuracy_score(y_test, predictions_knn)

In [28]:
# Print the classification report for the model
print("** K-Nearest Neighbors (KNN) Model")
print("* Confusion Matrix")
display(cm_knn_df)
print(f"* Accuracy Score : {acc_knn_score}")
print(f"""
* Classification Report""")
print(classification_report(y_test, predictions_knn, target_names=target_names))

** K-Nearest Neighbors (KNN) Model
* Confusion Matrix


Unnamed: 0,Predicted healthy loan,Predicted high-risk loan
Actual healthy loan,18657,44
Actual high-risk loan,108,575


* Accuracy Score : 0.9921584812216261

* Classification Report
                precision    recall  f1-score   support

  healthy loan       1.00      0.99      1.00     18765
high-risk loan       0.84      0.93      0.88       619

      accuracy                           0.99     19384
     macro avg       0.92      0.96      0.94     19384
  weighted avg       0.99      0.99      0.99     19384



---

## Machine Learning Model 5: Random Forest Model

In [29]:
from sklearn.ensemble import RandomForestClassifier

# Create a random forest classifier
model_rf = RandomForestClassifier(n_estimators=500, random_state=78)

In [30]:
# Fitting the model
model_rf = model_rf.fit(X_train_scaled, y_train)

In [31]:
# Making predictions using the testing data
predictions_rf = model_rf.predict(X_test_scaled)

In [32]:
# Generate a confusion matrix for the model
cm_rf = confusion_matrix(predictions_knn, y_test)
cm_rf_df = pd.DataFrame(
    cm_rf, index=["Actual healthy loan", "Actual high-risk loan"], columns=["Predicted healthy loan", "Predicted high-risk loan"]
)

# Calculating the accuracy score
acc_rf_score = accuracy_score(y_test, predictions_rf)

In [33]:
# Print the classification report for the model
print("** Random Forest Model")
print("* Confusion Matrix")
display(cm_rf_df)
print(f"* Accuracy Score : {acc_rf_score}")
print(f"""
* Classification Report""")
print(classification_report(y_test, predictions_rf, target_names=target_names))

** Random Forest Model
* Confusion Matrix


Unnamed: 0,Predicted healthy loan,Predicted high-risk loan
Actual healthy loan,18657,44
Actual high-risk loan,108,575


* Accuracy Score : 0.9917457697069748

* Classification Report
                precision    recall  f1-score   support

  healthy loan       1.00      0.99      1.00     18765
high-risk loan       0.85      0.90      0.87       619

      accuracy                           0.99     19384
     macro avg       0.92      0.95      0.94     19384
  weighted avg       0.99      0.99      0.99     19384

