# Student Loan Risk with Deep Learning

In [71]:
# Imports
import pandas as pd
import tensorflow as tf
import sklearn as skl
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from pathlib import Path
from tensorflow.keras.optimizers import Adam, RMSprop, SGD, Adagrad

---

## Prepare the data to be used on a neural network model

### Step 1: Read the `student-loans.csv` file into a Pandas DataFrame. Review the DataFrame, looking for columns that could eventually define your features and target variables.   

In [2]:
# Read the csv into a Pandas DataFrame
file_path = "https://static.bc-edx.com/ai/ail-v-1-0/m18/lms/datasets/student-loans.csv"
loans_df = pd.read_csv(file_path)


In [3]:
# Review the data types associated with the columns
loans_df.dtypes

payment_history           float64
location_parameter        float64
stem_degree_score         float64
gpa_ranking               float64
alumni_success            float64
study_major_code          float64
time_to_completion        float64
finance_workshop_score    float64
cohort_ranking            float64
total_loan_score          float64
financial_aid_score       float64
credit_ranking              int64
dtype: object

In [4]:
# Check the credit_ranking value counts
loans_df["credit_ranking"].value_counts()

credit_ranking
1    855
0    744
Name: count, dtype: int64

In [5]:
loans_df.describe()

Unnamed: 0,payment_history,location_parameter,stem_degree_score,gpa_ranking,alumni_success,study_major_code,time_to_completion,finance_workshop_score,cohort_ranking,total_loan_score,financial_aid_score,credit_ranking
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,0.534709
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.49895
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,0.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,0.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,1.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,1.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,1.0


In [6]:
# Find the number of unique values in each column
unique_counts = loans_df.nunique()

# Print the number of unique values for each column
print("Number of unique values in each column:")
print(unique_counts)

Number of unique values in each column:
payment_history            96
location_parameter        143
stem_degree_score          80
gpa_ranking                91
alumni_success            153
study_major_code           60
time_to_completion        144
finance_workshop_score    436
cohort_ranking             89
total_loan_score           96
financial_aid_score        65
credit_ranking              2
dtype: int64


### Step 2: Using the preprocessed data, create the features (`X`) and target (`y`) datasets. The target dataset should be defined by the preprocessed DataFrame column “credit_ranking”. The remaining columns should define the features dataset.

In [7]:
# Define the target set y using the credit_ranking column
y=loans_df['credit_ranking']

# Display a sample of y
y[0:5]

0    0
1    0
2    0
3    1
4    0
Name: credit_ranking, dtype: int64

In [8]:
# Define features set X by selecting all columns but credit_ranking
X=loans_df.drop('credit_ranking', axis=1)

# Review the features DataFrame
X.head()

Unnamed: 0,payment_history,location_parameter,stem_degree_score,gpa_ranking,alumni_success,study_major_code,time_to_completion,finance_workshop_score,cohort_ranking,total_loan_score,financial_aid_score
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4


### Step 3: Split the features and target sets into training and testing datasets.


In [9]:
# Split the preprocessed data into a training and testing dataset
# Assign the function a random_state equal to 1
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

### Step 4: Use scikit-learn's `StandardScaler` to scale the features data.

In [10]:
# Create a StandardScaler instance
X_scaler = skl.preprocessing.StandardScaler()

# Fit the scaler to the features training dataset
X_scaler.fit(X_train)

# Fit the scaler to the features training dataset
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Compile and Evaluate a Model Using a Neural Network



### Step 1: Using Keras Tuner, created and optimize the hyperparameters of a Seqeuntial neural network.

#### Created a function that defined a range of hyperparameter attributes for both fitting and compiling the model using Keras Tuner



In [69]:
# Create a method that creates a new Sequential model with hyperparameter options
def create_model(hp):
    nn_model = tf.keras.models.Sequential()

    # Allow kerastuner to decide which activation function to use in hidden layers
    # Two activation parameters were used.  For the hidden layers and the last layer.
    activation = hp.Choice('activation',['relu','tanh','sigmoid'])
    activation_last = hp.Choice('activation',['softmax','sigmoid'])

    # Allow kerastuner to decide number of neurons in first layer.  A min, max and step fuction where defined.
    nn_model.add(tf.keras.layers.Dense(units=hp.Int('first_units',
        min_value=3,
        max_value=11,
        step=2), activation=activation, input_dim=len(X.columns)))

    # Allow kerastuner to decide number of hidden layers and neurons in the respective hidden layers
    for i in range(hp.Int('num_of_layers', 1, 6)):
        nn_model.add(tf.keras.layers.Dense(units=hp.Int('num_of_neurons_in_layer_' + str(i),
            min_value=1,
            max_value=16,
            step=2),
            activation=activation))

    nn_model.add(tf.keras.layers.Dense(units=1, activation=activation_last))

    # Compile Optimmization - Define optimizer as a hyperparameter
    optimizer = hp.Choice('optimizer', ['adam', 'rmsprop', 'sgd', 'adagrad'])
    if optimizer == 'adam':
        optimizer = Adam(learning_rate=hp.Float('learning_rate', 1e-4, 1e-2, sampling='log'))
    elif optimizer == 'rmsprop':
        optimizer = RMSprop(learning_rate=hp.Float('learning_rate', 1e-4, 1e-2, sampling='log'))
    elif optimizer == 'sgd':
        optimizer = SGD(learning_rate=hp.Float('learning_rate', 1e-4, 1e-2, sampling='log'))
    elif optimizer == 'adagrad':
        optimizer = Adagrad(learning_rate=hp.Float('learning_rate', 1e-4, 1e-2, sampling='log'))
    
    #  Compile Optimmization - Define loss function as a hyperparameter
    loss = hp.Choice('loss', ['binary_crossentropy', 'mean_squared_error', 'mean_absolute_error'])
    
    # Compile the model
    nn_model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

    return nn_model

In [72]:
# Import kerastuner library
import keras_tuner as kt

#  Initialize the tuner
tuner = kt.Hyperband(
    create_model,
    objective="val_accuracy",
    max_epochs=50,
    hyperband_iterations=3)

In [73]:
# Run the kerastuner search for best hyperparameters
tuner.search(X_train_scaled,y_train,epochs=50,validation_data=(X_test_scaled,y_test))

Trial 270 Complete [00h 00m 05s]
val_accuracy: 0.5299999713897705

Best val_accuracy So Far: 0.7774999737739563
Total elapsed time: 00h 13m 57s


In [74]:
# Get best model hyperparameters
best_hyper = tuner.get_best_hyperparameters(1)[0]
best_hyper.values

{'activation': 'relu',
 'first_units': 11,
 'num_of_layers': 3,
 'num_of_neurons_in_layer_0': 3,
 'optimizer': 'rmsprop',
 'learning_rate': 0.008243668311854,
 'loss': 'mean_squared_error',
 'num_of_neurons_in_layer_1': 11,
 'num_of_neurons_in_layer_2': 11,
 'num_of_neurons_in_layer_3': 11,
 'num_of_neurons_in_layer_4': 1,
 'num_of_neurons_in_layer_5': 1,
 'tuner/epochs': 50,
 'tuner/initial_epoch': 17,
 'tuner/bracket': 2,
 'tuner/round': 2,
 'tuner/trial_id': '0068'}

### Step 2:  Evaluate the best model using the test data to determine the model’s loss and accuracy.

In [75]:
# Evaluate best model against full test data
best_model = tuner.get_best_models(1)[0]
model_loss, model_accuracy = best_model.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

13/13 - 0s - 6ms/step - accuracy: 0.7775 - loss: 0.1732
Loss: 0.17320585250854492, Accuracy: 0.7774999737739563


###  Step 3: Use Cross Validation to evaluate if the Best Model is/isn't an outlier

In [76]:

import numpy as np
from sklearn.model_selection import cross_val_score
from scikeras.wrappers import KerasClassifier

# Wrap the model for use in scikit-learn (for the cross validation)
model = KerasClassifier(model=best_model, epochs=50, batch_size=10, verbose=0)

# Perform k-fold cross-validation
cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5)
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean Cross-Validation Score: {cv_scores.mean()}')

  saveable.load_own_variables(weights_store.get(inner_path))
  saveable.load_own_variables(weights_store.get(inner_path))
  saveable.load_own_variables(weights_store.get(inner_path))
  saveable.load_own_variables(weights_store.get(inner_path))
  saveable.load_own_variables(weights_store.get(inner_path))


Cross-Validation Scores: [0.79583333 0.75       0.8125     0.77916667 0.72803347]
Mean Cross-Validation Score: 0.7731066945606695


### Step 4: Save and export your model to a keras file, and name the file `student_loans.keras`.


In [77]:
# Set the model's file path
file_path = Path("student_loans.keras")

# Export your model to a keras file
best_model.save(file_path)

---
## Predict Loan Repayment Success by Using your Neural Network Model

### Step 1: Reload your saved model.

In [78]:
# Set the model's file path

file_path = Path("student_loans.keras")

# Load the model to a new object
nn_imported = tf.keras.models.load_model(file_path)

  saveable.load_own_variables(weights_store.get(inner_path))


### Step 2: Make predictions on the testing data and save the predictions to a DataFrame.

In [79]:
# Make predictions with the test data
predictions = best_model.predict(X_test_scaled,verbose=2)


# Display a sample of the predictions

predictions[0:5]

13/13 - 0s - 3ms/step


array([[0.24457751],
       [0.17919442],
       [0.7858474 ],
       [0.7525889 ],
       [1.0362395 ]], dtype=float32)

In [80]:
# Save the predictions to a DataFrame and round the predictions to binary results
predictions_df = pd.DataFrame(columns=["predictions"], data=predictions)
predictions_df["predictions"] = round(predictions_df["predictions"],0)
predictions_df

Unnamed: 0,predictions
0,0.0
1,0.0
2,1.0
3,1.0
4,1.0
...,...
395,1.0
396,0.0
397,1.0
398,0.0


### Step 4: Display a classification report with the y test data and predictions

In [81]:
# Print the classification report with the y test data and predictions
print(classification_report(y_test, predictions_df["predictions"].values))

              precision    recall  f1-score   support

           0       0.76      0.77      0.77       188
           1       0.79      0.78      0.79       212

    accuracy                           0.78       400
   macro avg       0.78      0.78      0.78       400
weighted avg       0.78      0.78      0.78       400



### Step 5:  Evaluate what features have the most importance on the model using SHAP (SHapley Additive exPlanations)

In [93]:
import shap
import pandas as pd
import numpy as np


feature_names = X.columns.tolist()

# Convert X_test_scaled to DataFrame for better handling of feature names
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=feature_names)

# Initialize the SHAP KernelExplainer
explainer = shap.KernelExplainer(best_model.predict, X_train_scaled[:100])  # Using a subset of training data for the explainer

# Compute SHAP values for a subset of test data
shap_values = explainer.shap_values(X_test_scaled[:100])

# Check the shape of SHAP values
print("SHAP values shape:", np.array(shap_values).shape)

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 671us/step


  0%|          | 0/100 [00:00<?, ?it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
[1m6394/6394[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 176us/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m6394/6394[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 191us/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
[1m6394/6394[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 190us/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m6394/6394[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 190us/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
[1m6394/6394[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 194us/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
[1m6394/6394[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 193us/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
[1m6394/6394[0m [32m━

In [94]:
# Print SHAP values for the first instance
instance_index = 0
print(f"SHAP values for instance {instance_index}:")
for feature, shap_value in zip(feature_names, shap_values[instance_index]):
    print(f"{feature}: {shap_value}")

SHAP values for instance 0:
payment_history: [-0.02680195]
location_parameter: [0.04598802]
stem_degree_score: [-0.09967395]
gpa_ranking: [-0.01388944]
alumni_success: [-0.11599654]
study_major_code: [-0.08766167]
time_to_completion: [0.01034897]
finance_workshop_score: [-0.0927712]
cohort_ranking: [-0.01596968]
total_loan_score: [0.06163191]
financial_aid_score: [0.04312455]


In [95]:
# If shap_values is a list of arrays, select the first one (for binary classification)
if isinstance(shap_values, list):
    shap_values = shap_values[0]

# Ensure shap_values is 2D
shap_values = np.array(shap_values)
if shap_values.ndim == 3:
    shap_values = shap_values[:, :, 0]

# Check the adjusted shape
print("Adjusted SHAP values shape:", shap_values.shape)

# Compute mean absolute SHAP values for each feature
mean_abs_shap_values = pd.DataFrame(shap_values, columns=feature_names).abs().mean()

# Sort features by importance
feature_importance = mean_abs_shap_values.sort_values(ascending=False)

print("Feature importance based on mean absolute SHAP values:")
print(feature_importance)

Adjusted SHAP values shape: (100, 11)
Feature importance based on mean absolute SHAP values:
financial_aid_score       0.148943
total_loan_score          0.114568
time_to_completion        0.082562
location_parameter        0.078672
study_major_code          0.047691
alumni_success            0.033198
payment_history           0.032153
stem_degree_score         0.031567
finance_workshop_score    0.018223
gpa_ranking               0.016592
cohort_ranking            0.010942
dtype: float64


###  Step 6: Using the Variance Inflation Factor (VIF), evaluate the Best Model for multicolliniarty between features with high infulence.¶

In [97]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
import pandas as pd

# Assuming X_train_scaled is your scaled training data and feature_names is the list of feature names
feature_names = X.columns.tolist()  # Adjust this based on your actual feature names
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=feature_names)

# Compute VIF for each feature
vif_data = pd.DataFrame()
vif_data["feature"] = X_train_scaled_df.columns
vif_data["VIF"] = [variance_inflation_factor(X_train_scaled_df.values, i) for i in range(X_train_scaled_df.shape[1])]

print(vif_data)

                   feature       VIF
0          payment_history  8.137628
1       location_parameter  1.750431
2        stem_degree_score  3.156775
3              gpa_ranking  1.681367
4           alumni_success  1.478022
5         study_major_code  1.993762
6       time_to_completion  2.207052
7   finance_workshop_score  6.499946
8           cohort_ranking  3.378052
9         total_loan_score  1.424388
10     financial_aid_score  3.060195


---
## Discuss creating a recommendation system for student loans

Briefly answer the following questions in the space provided:

1. Describe the data that you would need to collect to build a recommendation system to recommend student loan options for students. Explain why this data would be relevant and appropriate.

   RESPONSE:  Based upon SHAP analysis, the most significant features requried are:  (i) the financial aid score; (ii) total loan score; (iii) time to completion and; (iv) the location parameter.  These features have the largest impact on the deep neural network.   While I saw a risk of multicolinarity between the financial aid and loan score (not knowing how they are computed but assuming they have similar elements), I did a VIF anlaysis.  Based upon the findings, there doesn't appear to be multicolinearity between the material features.

3. Based on the data you chose to use in this recommendation system, would your model be using collaborative filtering, content-based filtering, or context-based filtering? Justify why the data you selected would be suitable for your choice of filtering method.

    RESPONSE:  The data used is content-based filtering, because its based upon atributes unique to the borrowers, which are used to recommend good borrowers.

4. Describe two real-world challenges that you would take into consideration while building a recommendation system for student loans. Explain why these challenges would be of concern for a student loan recommendation system.

    RESPONSE: (i) The model was potentially built upon data as of a point in time, vs. over the period of the loan.  The challenge may be the ability to update the features over time.  (ii)The approach being used may not be a fair representation of a students ability to pay back the loan.  (iii) The model might be biased against a group the lendor may want to focus on (for social purposes.  (iv)  An accuracy score of 78% may not be high enough for this industry vs loan losses.

**1. Describe the data that you would need to collect to build a recommendation system to recommend student loan options for students. Explain why this data would be relevant and appropriate.**


**2. Based on the data you chose to use in this recommendation system, would your model be using collaborative filtering, content-based filtering, or context-based filtering? Justify why the data you selected would be suitable for your choice of filtering method.**


**3. Describe two real-world challenges that you would take into consideration while building a recommendation system for student loans. Explain why these challenges would be of concern for a student loan recommendation system.**

