In [1]:
# Import the modules
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

In [2]:
# All the Machine Learning models are saved inside the package called sklearn (scikit-learn).
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

---

## Split the Data into Training and Testing Sets

### Step 1: Read the `lending_data.csv` data from the `Resources` folder into a Pandas DataFrame.

In [3]:
# Load the lending data set that lives in the Resources folder
file_path = Path("./Resources", "lending_data.csv")
df_lending_data = pd.read_csv(file_path)

# Preview the first rows of the loaded frame
df_lending_data.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


### Step 2: Create the labels set (`y`) from the `loan_status` column, and then create the features (`X`) DataFrame from the remaining columns.

In [4]:
# Show every column that is available before separating labels from features
print(df_lending_data.columns)

# Labels (y): the loan_status column
y = df_lending_data["loan_status"]

# Features (X): every column except loan_status
X = df_lending_data.drop("loan_status", axis=1)
print(X.columns)

Index(['loan_size', 'interest_rate', 'borrower_income', 'debt_to_income',
       'num_of_accounts', 'derogatory_marks', 'total_debt', 'loan_status'],
      dtype='object')
Index(['loan_size', 'interest_rate', 'borrower_income', 'debt_to_income',
       'num_of_accounts', 'derogatory_marks', 'total_debt'],
      dtype='object')


In [5]:
# Preview the first five entries of the label Series
print(f"Target {y.head()}")

Target 0    0
1    0
2    0
3    0
4    0
Name: loan_status, dtype: int64


In [6]:
# Preview the first five rows of the feature DataFrame
X.head(n=5)

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
0,10700.0,7.672,52800,0.431818,5,1,22800
1,8400.0,6.692,43600,0.311927,3,0,13600
2,9000.0,6.963,46100,0.349241,3,0,16100
3,10700.0,7.664,52700,0.43074,5,1,22700
4,10800.0,7.698,53000,0.433962,5,1,23000


### Step 3: Split the data into training and testing datasets by using `train_test_split`.

In [7]:
# Split the features and labels into training and testing sets.
# random_state=1 fixes the shuffle so the same split is produced on every run.
# The label halves are named with a capital "Y" (Y_train / Y_test); the
# fitting and evaluation cells below refer to them by exactly these names.
# train_test_split is imported in the imports cell at the top of the notebook.
X_train, X_test, Y_train, Y_test = train_test_split(X, y, random_state=1)


---

## Create a Logistic Regression Model with the Original Data

### Step 1: Fit a logistic regression model by using the training data (`X_train` and `Y_train`).

In [8]:
# Instantiate one classifier of each type with scikit-learn defaults.
# The estimators that draw random numbers while fitting are given a fixed
# seed so that Restart & Run All reproduces the same metrics.
RANDOM_STATE = 1  # same seed value the train/test split uses

LR = LogisticRegression()
KNN = KNeighborsClassifier()
LSVM = LinearSVC(random_state=RANDOM_STATE)
NLSVM = SVC(kernel='rbf')
DT = DecisionTreeClassifier(random_state=RANDOM_STATE)
RF = RandomForestClassifier(random_state=RANDOM_STATE)


In [9]:
# Train every classifier on the same training split, in the order
# LR, KNN, LSVM, NLSVM, DT, RF, and bind each fit() result to its *_fit name.
LR_fit, KNN_fit, LSVM_fit, NLSVM_fit, DT_fit, RF_fit = (
    estimator.fit(X_train, Y_train)
    for estimator in (LR, KNN, LSVM, NLSVM, DT, RF)
)



### Step 2: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [10]:
# Predict the test-set labels with each fitted classifier, in the order
# LR, KNN, LSVM, NLSVM, DT, RF, and bind each result to its *_pred name.
LR_pred, KNN_pred, LSVM_pred, NLSVM_pred, DT_pred, RF_pred = (
    fitted.predict(X_test)
    for fitted in (LR_fit, KNN_fit, LSVM_fit, NLSVM_fit, DT_fit, RF_fit)
)


### Step 3: Evaluate the model’s performance by doing the following:

* Generate a confusion matrix.

* Print the classification report.

In [11]:
# Generate a confusion matrix for the logistic regression model.
# confusion_matrix is imported in the first cell; arguments are (y_true, y_pred).
# Per the scikit-learn docs, rows are the actual classes and columns the predicted ones.
confusion_matrix(Y_test, LR_pred)

In [18]:
# Print accuracy, confusion matrix, and classification report for every model.
# accuracy_score, confusion_matrix and classification_report come from the
# imports cell at the top of the notebook.
target_names = ["Healthy Loan", "High-Risk Loan"]


def print_model_report(model_label, y_true, y_pred):
    """Print the evaluation summary for one classifier.

    Prints, in order: the accuracy as a percentage, the confusion matrix,
    and the classification report labelled with ``target_names``.

    Args:
        model_label: Model name shown at the start of the accuracy line.
        y_true: Ground-truth labels.
        y_pred: Labels predicted for the same samples.
    """
    # scikit-learn metric functions are called as (y_true, y_pred)
    accuracy_pct = accuracy_score(y_true, y_pred) * 100
    print("%s is %f percent accurate" % (model_label, accuracy_pct))
    print(confusion_matrix(y_true, y_pred))
    print(classification_report(y_true, y_pred, target_names=target_names))


# Insertion order of the dict fixes the order in which the reports are printed
predictions_by_model = {
    "Logistic Regression": LR_pred,
    "KNN": KNN_pred,
    "Linear SVMs": LSVM_pred,
    "Non Linear SVMs": NLSVM_pred,
    "Decision Trees": DT_pred,
    "Random Forests": RF_pred,
}

for model_label, model_pred in predictions_by_model.items():
    print_model_report(model_label, Y_test, model_pred)

Logistic Regression is 99.246801 percent accurate
[[18655   110]
 [   36   583]]
                precision    recall  f1-score   support

  Healthy Loan       1.00      0.99      1.00     18765
High-Risk Loan       0.84      0.94      0.89       619

      accuracy                           0.99     19384
     macro avg       0.92      0.97      0.94     19384
  weighted avg       0.99      0.99      0.99     19384

KNN is 99.329344 percent accurate
[[18651   114]
 [   16   603]]
                precision    recall  f1-score   support

  Healthy Loan       1.00      0.99      1.00     18765
High-Risk Loan       0.84      0.97      0.90       619

      accuracy                           0.99     19384
     macro avg       0.92      0.98      0.95     19384
  weighted avg       0.99      0.99      0.99     19384

Linear SVMs is 99.380933 percent accurate
[[18649   116]
 [    4   615]]
                precision    recall  f1-score   support

  Healthy Loan       1.00      0.99      1.00 

In [None]:
# Classification report for the logistic regression predictions only.
# classification_report is already imported in the first cell of the notebook.
target_names = ["Healthy Loan", "High-Risk Loan"]
lr_report = classification_report(Y_test, LR_pred, target_names=target_names)
print(lr_report)

### Step 4: Answer the following question.

**Question:** How well does the logistic regression model predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

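**Answer:** The logistic regression model predicts healthy loans (`0`) almost perfectly: precision rounds to 1.00 and recall is 0.99, so very few healthy loans are misclassified. For high-risk loans (`1`), precision is 0.84 — 84% of the loans flagged as high-risk really were high-risk, with 110 healthy loans incorrectly flagged (false positives) — and recall is 0.94, meaning the model caught 583 of the 619 high-risk loans and missed 36.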

---