In [2]:
#importing and pre-processing data
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler,OneHotEncoder

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report 

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import tensorflow as tf


In [4]:
# Load the loan training dataset from its CSV file into a DataFrame
loan_csv_path = 'Resources/loan_training_dataset.csv'
loan_info_df = pd.read_csv(loan_csv_path)

# Preview the first rows to confirm the load worked
loan_info_df.head(n=5)


Unnamed: 0,INCOME,AGE,EXPERIENCE,MARITAL_STATUS,HOUSE_OWNERSHIP,CAR_OWNERSHIP,PROFESSION,STATE,CURRENT_JOB_YRS,RISK_FLAG
0,1303834,23,3,single,rented,no,Mechanical Engineer,Madhya Pradesh,3,0
1,7574516,40,10,single,rented,no,Software Developer,Maharashtra,9,0
2,3991815,66,4,married,rented,no,Technical Writer,Kerala,4,0
3,6256451,41,2,single,rented,yes,Software Developer,Odisha,2,1
4,5768871,47,11,single,rented,no,Civil Servant,Tamil Nadu,3,1


In [6]:
# Summary statistics for every column (numeric and categorical), one row per column
loan_info_df.describe(include='all').transpose()

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
INCOME,252000.0,,,,4997116.665325,2878311.013611,10310.0,2503015.0,5000694.5,7477502.0,9999938.0
AGE,252000.0,,,,49.954071,17.063855,21.0,35.0,50.0,65.0,79.0
EXPERIENCE,252000.0,,,,10.084437,6.00259,0.0,5.0,10.0,15.0,20.0
MARITAL_STATUS,252000.0,2.0,single,226272.0,,,,,,,
HOUSE_OWNERSHIP,252000.0,3.0,rented,231898.0,,,,,,,
CAR_OWNERSHIP,252000.0,2.0,no,176000.0,,,,,,,
PROFESSION,252000.0,51.0,Physician,5957.0,,,,,,,
STATE,252000.0,28.0,Uttar Pradesh,29143.0,,,,,,,
CURRENT_JOB_YRS,252000.0,,,,6.333877,3.647053,0.0,3.0,6.0,9.0,14.0
RISK_FLAG,252000.0,,,,0.123,0.328438,0.0,0.0,0.0,0.0,1.0


In [7]:
# Summary statistics with default settings (numeric columns only), one row per column
loan_info_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
INCOME,252000.0,4997117.0,2878311.0,10310.0,2503015.0,5000694.5,7477502.0,9999938.0
AGE,252000.0,49.95407,17.06385,21.0,35.0,50.0,65.0,79.0
EXPERIENCE,252000.0,10.08444,6.00259,0.0,5.0,10.0,15.0,20.0
CURRENT_JOB_YRS,252000.0,6.333877,3.647053,0.0,3.0,6.0,9.0,14.0
RISK_FLAG,252000.0,0.123,0.3284379,0.0,0.0,0.0,0.0,1.0


### Step 2: Create the labels set (`y`) from the `RISK_FLAG` column, and then create the features (`X`) DataFrame from the remaining columns.

In [9]:
# Split the dataset into target labels and predictor features

# Labels (y): the RISK_FLAG column
y = loan_info_df.loc[:, 'RISK_FLAG']

# Features (X): every column except RISK_FLAG
X = loan_info_df.drop('RISK_FLAG', axis='columns')

In [10]:
# Peek at the first five labels
y.head(n=5)

0    0
1    0
2    0
3    1
4    1
Name: RISK_FLAG, dtype: int64

In [11]:
# Peek at the first five rows of the features
X.head(n=5)

Unnamed: 0,INCOME,AGE,EXPERIENCE,MARITAL_STATUS,HOUSE_OWNERSHIP,CAR_OWNERSHIP,PROFESSION,STATE,CURRENT_JOB_YRS
0,1303834,23,3,single,rented,no,Mechanical Engineer,Madhya Pradesh,3
1,7574516,40,10,single,rented,no,Software Developer,Maharashtra,9
2,3991815,66,4,married,rented,no,Technical Writer,Kerala,4
3,6256451,41,2,single,rented,yes,Software Developer,Odisha,2
4,5768871,47,11,single,rented,no,Civil Servant,Tamil Nadu,3


### 3. Encode the categorical variables from the features data using `get_dummies`.

In [13]:
# One-hot encode the categorical feature columns with get_dummies;
# numeric columns are carried over unchanged
X = pd.get_dummies(data=X)

# Inspect the encoded feature matrix
X.head(n=5)

Unnamed: 0,INCOME,AGE,EXPERIENCE,CURRENT_JOB_YRS,MARITAL_STATUS_married,MARITAL_STATUS_single,HOUSE_OWNERSHIP_norent_noown,HOUSE_OWNERSHIP_owned,HOUSE_OWNERSHIP_rented,CAR_OWNERSHIP_no,...,STATE_Puducherry,STATE_Punjab,STATE_Rajasthan,STATE_Sikkim,STATE_Tamil Nadu,STATE_Telangana,STATE_Tripura,STATE_Uttar Pradesh,STATE_Uttarakhand,STATE_West Bengal
0,1303834,23,3,3,0,1,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,7574516,40,10,9,0,1,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
2,3991815,66,4,4,1,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
3,6256451,41,2,2,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,5768871,47,11,3,0,1,0,0,1,1,...,0,0,0,0,1,0,0,0,0,0


### Step 4: Split the data into training and testing datasets by using `train_test_split`.

In [14]:
# Import train_test_split from scikit-learn's model_selection module
from sklearn.model_selection import train_test_split

# Partition the features and labels into training and testing subsets.
# random_state=42 fixes the shuffle so the split is reproducible.
split_parts = train_test_split(X, y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

### 5. Scale the data using `StandardScaler`

In [15]:
# Standardize the features, learning the scaling statistics from the training split only
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to both the training and the testing features
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

## Create a Logistic Regression Model with the Original Data

###  Step 1: Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [16]:
# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model as a scale-then-classify pipeline.
# The raw features mix INCOME (values in the millions) with 0/1 dummy columns,
# which is a poorly conditioned problem for the lbfgs solver when left unscaled.
# Placing StandardScaler inside the pipeline means the scaling is learned from
# the training data during fit() and re-applied automatically by predict(), so
# the unscaled X_train / X_test can still be passed in directly.
# random_state=42 keeps the model reproducible.
logistic_regression_model = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('classifier', LogisticRegression(solver='lbfgs', random_state=42)),
    ]
)

# Fit the model using training data (fit() returns the fitted pipeline itself)
lr_model = logistic_regression_model.fit(X_train, y_train)

### Step 2: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [17]:
# Predict labels for the testing features with the fitted model
prediction_test = logistic_regression_model.predict(X_test)

# Show the predictions side by side with the true labels
comparison_columns = {'Predictions': prediction_test, 'Actual': y_test}
pd.DataFrame(comparison_columns)

Unnamed: 0,Predictions,Actual
75255,0,0
192435,0,0
154839,0,0
59774,0,0
63788,0,0
...,...,...
8107,0,1
79394,0,0
3538,0,1
220663,0,0


### Step 3: Evaluate the model’s performance by doing the following:

* Generate a confusion matrix.

* Print the classification report.

In [18]:
# Confusion matrix of the true test labels against the model's predictions
confusion_matrix(y_true=y_test, y_pred=prediction_test)

array([[55208,     0],
       [ 7792,     0]])

In [19]:
# Per-class precision, recall and F1 on the test set, labelled with readable class names
target_names = ['Non Risky Application', 'Flagged Risky Application']
report_text = classification_report(
    y_test,
    prediction_test,
    target_names=target_names,
)
print(report_text)

                           precision    recall  f1-score   support

    Non Risky Application       0.88      1.00      0.93     55208
Flagged Risky Application       0.00      0.00      0.00      7792

                 accuracy                           0.88     63000
                macro avg       0.44      0.50      0.47     63000
             weighted avg       0.77      0.88      0.82     63000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Step 4: Answer the following question.

**Question:** How well does the logistic regression model predict both the `0` (non-risk loan) and `1` (high-risk loan) labels?

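**Answer:** The logistic regression model reaches 88% overall accuracy, but only because it predicts `0` (non-risk loan) for every application. It identifies all non-risk loans (recall 1.00, precision 0.88) and never flags a high-risk loan (precision, recall and F1-score of 0.00 for label `1`), so the 88% simply mirrors the share of non-risk loans in the test set and the model is not useful for detecting high-risk loans.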

In [20]:
# Define the model - deep neural net

# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialisation and training-time shuffling are repeatable between runs.
tf.keras.utils.set_random_seed(42)

number_input_features = X_train.shape[1]
hidden_nodes_layer1 = 8
hidden_nodes_layer2 = 5

nn = tf.keras.models.Sequential()

# Declare the input shape with an explicit Input object rather than passing
# `input_dim` to the first Dense layer, which recent Keras versions warn about.
nn.add(tf.keras.Input(shape=(number_input_features,)))

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, activation="relu"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer: a single sigmoid unit for the binary target
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [21]:
# Configure training: binary cross-entropy loss, Adam optimizer, accuracy as the tracked metric
nn.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

In [22]:
# Train the network on the standardized training features for 100 epochs
# and keep the object returned by fit()
fit_model = nn.fit(x=X_train_scaled, y=y_train, epochs=100)

Epoch 1/100
[1m5907/5907[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 250us/step - accuracy: 0.8713 - loss: 0.3911
Epoch 2/100
[1m5907/5907[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 253us/step - accuracy: 0.8775 - loss: 0.3574
Epoch 3/100
[1m5907/5907[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 247us/step - accuracy: 0.8764 - loss: 0.3554
Epoch 4/100
[1m5907/5907[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 247us/step - accuracy: 0.8779 - loss: 0.3494
Epoch 5/100
[1m5907/5907[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 261us/step - accuracy: 0.8786 - loss: 0.3443
Epoch 6/100
[1m5907/5907[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 248us/step - accuracy: 0.8796 - loss: 0.3388
Epoch 7/100
[1m5907/5907[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 251us/step - accuracy: 0.8772 - loss: 0.3421
Epoch 8/100
[1m5907/5907[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 249us/step - accuracy: 0.8772 - loss: 0.3403


In [23]:
# Score the trained network on the standardized test split
evaluation_results = nn.evaluate(X_test_scaled, y_test, verbose=2)
model_loss, model_accuracy = evaluation_results
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

1969/1969 - 0s - 200us/step - accuracy: 0.8768 - loss: 0.3409
Loss: 0.3408711850643158, Accuracy: 0.876793622970581


# Result Explanation

# Loss: 0.3409
The loss value represents how well or poorly the model's predictions match the true values in the test data. A lower loss indicates better performance.
The loss value of 0.3409 suggests that, on average, the model's predictions are fairly close to the actual risk flags in the test data, although a lower value would be preferred.

# Accuracy: 0.8768
Accuracy is the proportion of correct predictions out of the total number of predictions made. An accuracy of 0.8768 means that the model correctly predicted the risk flag for approximately 87.68% of the test samples.
In the context of loan risk prediction, an accuracy of 87.68% looks good at first glance, but it must be read against the class balance: 55,208 of the 63,000 test samples (about 87.6%) are non-risky, so a model that always predicts "non-risky" would score almost the same. It is therefore important to consider the balance of the dataset and other metrics like precision, recall, and F1-score to ensure that the model is not biased towards the majority class.