In [1]:
#importing and pre-processing data
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler,OneHotEncoder

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report 

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import tensorflow as tf


In [2]:
# Load the loan training data from the project's Resources folder
loan_csv_path = 'Resources/loan_training_dataset.csv'
loan_info_df = pd.read_csv(loan_csv_path)

# Preview the first rows of the loaded DataFrame
loan_info_df.head()


Unnamed: 0,INCOME,AGE,EXPERIENCE,MARITAL_STATUS,HOUSE_OWNERSHIP,CAR_OWNERSHIP,PROFESSION,STATE,CURRENT_JOB_YRS,RISK_FLAG
0,1303834,23,3,single,rented,no,Mechanical Engineer,Madhya Pradesh,3,0
1,7574516,40,10,single,rented,no,Software Developer,Maharashtra,9,0
2,3991815,66,4,married,rented,no,Technical Writer,Kerala,4,0
3,6256451,41,2,single,rented,yes,Software Developer,Odisha,2,1
4,5768871,47,11,single,rented,no,Civil Servant,Tamil Nadu,3,1


### Step 2: Create the labels set (`y`) from the `RISK_FLAG` column, and then create the features (`X`) DataFrame from the remaining columns.

In [4]:
# Split the DataFrame into predictors (features) and target (labels)

# Features: every column except the RISK_FLAG target
X = loan_info_df.drop('RISK_FLAG', axis=1)

# Labels: the RISK_FLAG column on its own
y = loan_info_df['RISK_FLAG']

In [5]:
# Review the y variable Series: preview the first labels (RISK_FLAG values) as a sanity check
y.head()

0    0
1    0
2    0
3    1
4    1
Name: RISK_FLAG, dtype: int64

In [6]:
# Review the X variable DataFrame: preview the first feature rows (RISK_FLAG has been dropped)
X.head()

Unnamed: 0,INCOME,AGE,EXPERIENCE,MARITAL_STATUS,HOUSE_OWNERSHIP,CAR_OWNERSHIP,PROFESSION,STATE,CURRENT_JOB_YRS
0,1303834,23,3,single,rented,no,Mechanical Engineer,Madhya Pradesh,3
1,7574516,40,10,single,rented,no,Software Developer,Maharashtra,9
2,3991815,66,4,married,rented,no,Technical Writer,Kerala,4
3,6256451,41,2,single,rented,yes,Software Developer,Odisha,2
4,5768871,47,11,single,rented,no,Civil Servant,Tamil Nadu,3


### Step 3: Encode the categorical variables from the features data using `get_dummies`.

In [7]:
# Encode the categorical variables using get_dummies.
# X is rebound to the encoded frame, so from this cell onward X holds
# indicator columns (e.g. MARITAL_STATUS_married, STATE_Kerala) in place of
# the raw text columns; the numeric columns are carried over unchanged.
X = pd.get_dummies(X)

# Review the encoded features data
X.head()

Unnamed: 0,INCOME,AGE,EXPERIENCE,CURRENT_JOB_YRS,MARITAL_STATUS_married,MARITAL_STATUS_single,HOUSE_OWNERSHIP_norent_noown,HOUSE_OWNERSHIP_owned,HOUSE_OWNERSHIP_rented,CAR_OWNERSHIP_no,...,STATE_Puducherry,STATE_Punjab,STATE_Rajasthan,STATE_Sikkim,STATE_Tamil Nadu,STATE_Telangana,STATE_Tripura,STATE_Uttar Pradesh,STATE_Uttarakhand,STATE_West Bengal
0,1303834,23,3,3,0,1,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,7574516,40,10,9,0,1,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
2,3991815,66,4,4,1,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
3,6256451,41,2,2,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,5768871,47,11,3,0,1,0,0,1,1,...,0,0,0,0,1,0,0,0,0,0


### Step 4: Split the data into training and testing datasets by using `train_test_split`.

In [8]:
# Import train_test_split from scikit-learn's model_selection module
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing (the function's default split size);
# random_state=42 keeps the shuffle, and therefore the split, reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

## Create a Random Forest Model with the Original Data

In [9]:
from sklearn.ensemble import RandomForestClassifier

# Build (but do not yet fit) a 500-tree random forest with a fixed seed
rf_model = RandomForestClassifier(
    random_state=78,
    n_estimators=500,
)

In [11]:
# Fitting the model on the (unscaled) training split.
# The result of fit() is assigned back to rf_model, so later cells use the trained estimator.
rf_model = rf_model.fit(X_train, y_train)

In [12]:
# Making predictions using the testing data.
# `predictions` holds the random forest's predicted RISK_FLAG for each row of X_test
# and is what the confusion matrix / accuracy cells below are computed from.
predictions = rf_model.predict(X_test)

In [13]:
# Build a labelled confusion matrix (rows = actual class, columns = predicted class)
row_labels = ["Actual 0", "Actual 1"]
column_labels = ["Predicted 0", "Predicted 1"]
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(cm, index=row_labels, columns=column_labels)

# Overall share of test rows that were classified correctly
acc_score = accuracy_score(y_test, predictions)

In [14]:
# Show the confusion matrix, the accuracy score and the per-class report
report_text = classification_report(y_test, predictions)

print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(report_text)

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,52378,2830
Actual 1,3541,4251


Accuracy Score : 0.8988730158730158
Classification Report
              precision    recall  f1-score   support

           0       0.94      0.95      0.94     55208
           1       0.60      0.55      0.57      7792

    accuracy                           0.90     63000
   macro avg       0.77      0.75      0.76     63000
weighted avg       0.90      0.90      0.90     63000



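**Question:** How well does the random forest model predict both the `0` (non-risk loan) and `1` (high-risk loan) labels?

**Answer:** The random forest model was about 90% accurate overall (accuracy 0.899). It predicts the `0` (non-risk) label well (precision 0.94, recall 0.95), but it is much weaker on the `1` (high-risk) label (precision 0.60, recall 0.55), so it misses roughly 45% of the high-risk loans in the test set.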

In [16]:
# Create a MinMaxScaler instance
scaler = MinMaxScaler()

# Learn each feature's min/max from the training split only, and scale that split
X_train_scaled = scaler.fit_transform(X_train)

# Keep the fitted scaler available under its original X_scaler name
X_scaler = scaler

# Scale the test split with the ranges learned from the training data
X_test_scaled = X_scaler.transform(X_test)


In [17]:
# Define the model - deep neural net
# Architecture: input -> Dense(8, relu) -> Dense(5, relu) -> Dense(1, sigmoid)
number_input_features = X_train.shape[1]  # one input per encoded feature column
hidden_nodes_layer1 = 8
hidden_nodes_layer2 = 5

nn = tf.keras.models.Sequential()

# Input layer: declare the input shape explicitly with an Input object.
# Passing `input_dim` to the first Dense layer is deprecated in Keras 3 and
# emits a UserWarning; an explicit Input gives the same model without it.
nn.add(tf.keras.Input(shape=(number_input_features,)))

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, activation="relu"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer: a single sigmoid unit for the binary RISK_FLAG target
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [18]:
# Compile the model: binary cross-entropy loss (pairs with the single sigmoid output unit),
# the Adam optimizer, and accuracy as the metric reported during training and evaluation
nn.compile(loss = "binary_crossentropy", optimizer = "adam", metrics=["accuracy"])

In [19]:
# Train the model on the scaled training data for 100 epochs; the value returned by fit() is kept in fit_model.
# NOTE(review): no validation data is passed, so the accuracy/loss logged per epoch are training-set values only.
fit_model = nn.fit(X_train_scaled,y_train,epochs=100)

Epoch 1/100
[1m5907/5907[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 256us/step - accuracy: 0.8738 - loss: 0.3805
Epoch 2/100
[1m5907/5907[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 242us/step - accuracy: 0.8770 - loss: 0.3642
Epoch 3/100
[1m5907/5907[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 247us/step - accuracy: 0.8771 - loss: 0.3610
Epoch 4/100
[1m5907/5907[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 240us/step - accuracy: 0.8771 - loss: 0.3584
Epoch 5/100
[1m5907/5907[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 241us/step - accuracy: 0.8764 - loss: 0.3581
Epoch 6/100
[1m5907/5907[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 254us/step - accuracy: 0.8783 - loss: 0.3521
Epoch 7/100
[1m5907/5907[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 246us/step - accuracy: 0.8768 - loss: 0.3539
Epoch 8/100
[1m5907/5907[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 244us/step - accuracy: 0.8771 - loss: 0.3516


In [20]:
# Score the trained network on the held-out, scaled test data
test_metrics = nn.evaluate(X_test_scaled, y_test, verbose=2)
model_loss, model_accuracy = test_metrics
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

1969/1969 - 0s - 202us/step - accuracy: 0.8769 - loss: 0.3378
Loss: 0.33776915073394775, Accuracy: 0.8768888711929321


# Result Explanation

# Loss: 0.3378
The loss value represents how well or poorly the model's predictions match the true values in the test data. A lower loss indicates better performance.
The loss value of 0.3378 suggests that, on average, the model's predictions are fairly close to the actual risk flags in the test data.

# Accuracy: 0.8769
Accuracy is the proportion of correct predictions out of the total number of predictions made. An accuracy of 0.8769 means that the model correctly predicted the risk flag for approximately 87.69% of the test samples.
In the context of loan risk prediction, an accuracy of 87.69% looks quite good at first glance, indicating that the model labels most applicants correctly. However, it's important to consider the balance of the dataset: about 87.6% of the test samples (55,208 of 63,000) are non-risky (`0`), so a model that always predicted "not risky" would reach almost the same accuracy. Other metrics like precision, recall, and F1-score for the risky (`1`) class are therefore needed to ensure that the model is not biased towards the majority class.