In [85]:
import pandas as pd 
import random 
random.seed(42) #in case we will use random somewhere
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier,HistGradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

data = pd.read_csv("../data/processed/processed_credit_risk_dataset.csv")

## Random Forest
Best model predictor during Team Project 1

In [2]:
X = data.drop(columns=['loan_status'])
y = data['loan_status']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Predict on the test set
y_pred = rf_model.predict(X_test)

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print the evaluation metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")


Accuracy: 0.9372
Precision: 0.9674
Recall: 0.7351
F1-Score: 0.8354


## Deep Learning

Feedforward neural network (also known as a fully connected or dense neural network) can be a good choice. Given the numerical nature of the dataset, we can design a neural network with multiple layers to capture the relationships between features effectively.

In [3]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [4]:
# Define the features (X) and the target (y)
X = data.drop(columns=['loan_status'])
y = data['loan_status']

# Normalize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Define the neural network model
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')  # Binary classification
])

# Compile the model
model.compile(optimizer='adam', 
              loss='binary_crossentropy', 
              metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2)

# Predict on the test set
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print the evaluation metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
2024-08-13 14:24:40.117864: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2
2024-08-13 14:24:40.117885: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2024-08-13 14:24:40.117889: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2024-08-13 14:24:40.118140: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-08-13 14:24:40.118151: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Epoch 1/20


2024-08-13 14:24:40.646444: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.
2024-08-13 14:24:40.651862: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:961] PluggableGraphOptimizer failed: INVALID_ARGUMENT: Failed to deserialize the `graph_buf`.


[1m651/651[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 8ms/step - accuracy: 0.8171 - loss: 0.4165 - val_accuracy: 0.8811 - val_loss: 0.3142
Epoch 2/20
[1m651/651[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - accuracy: 0.8884 - loss: 0.2936 - val_accuracy: 0.8952 - val_loss: 0.2895
Epoch 3/20
[1m651/651[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - accuracy: 0.8980 - loss: 0.2805 - val_accuracy: 0.9003 - val_loss: 0.2783
Epoch 4/20
[1m651/651[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - accuracy: 0.9007 - loss: 0.2725 - val_accuracy: 0.9051 - val_loss: 0.2749
Epoch 5/20
[1m651/651[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - accuracy: 0.9081 - loss: 0.2579 - val_accuracy: 0.9065 - val_loss: 0.2705
Epoch 6/20
[1m651/651[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - accuracy: 0.9078 - loss: 0.2589 - val_accuracy: 0.9090 - val_loss: 0.2675
Epoch 7/20
[1m651/651[0m [32m━━━━━━━

2024-08-13 14:26:27.369048: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:961] PluggableGraphOptimizer failed: INVALID_ARGUMENT: Failed to deserialize the `graph_buf`.


[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step
Accuracy: 0.9084
Precision: 0.9016
Recall: 0.6487
F1-Score: 0.7545


## Gradient Boosting Classifier (the best model!)

In [84]:
X = data.drop(columns=['loan_status'])
y = data['loan_status']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Initialize the GB
gb_model = GradientBoostingClassifier(n_estimators=500, random_state=42, learning_rate= 0.3)


# Train the model
gb_model.fit(X_train, y_train)

# Predict on the test set
y_pred = gb_model.predict(X_test)

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print the evaluation metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")


Accuracy: 0.9432
Precision: 0.9465
Recall: 0.7840
F1-Score: 0.8576


## Hist Gradient Boosting Classifier

This method is not as good as Gradient Boosting, but it is 20 times faster!

In [86]:

X = data.drop(columns=['loan_status'])
y = data['loan_status']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Initialize the GB
hgb_model = HistGradientBoostingClassifier()


# Train the model
hgb_model.fit(X_train, y_train)

# Predict on the test set
y_pred = hgb_model.predict(X_test)

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print the evaluation metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")


Accuracy: 0.9429
Precision: 0.9687
Recall: 0.7628
F1-Score: 0.8535


## Data oversampling with SMOTE and ADYSN

In [None]:
from imblearn.over_sampling import SMOTE, ADASYN
from collections import Counter
from imblearn.under_sampling import TomekLinks

### SMOTE 

In [89]:
X = data.drop(columns=['loan_status'])
y = data['loan_status']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
counter = Counter(y_train)
print('Before', counter)
X_train, y_train = SMOTE().fit_resample(X_train, y_train)

counter = Counter(y_train)
print('After', counter)

Before Counter({0: 20354, 1: 5682})
After Counter({1: 20354, 0: 20354})


In [90]:
gb_model = GradientBoostingClassifier(n_estimators=500, random_state=42, learning_rate= 0.3)


# Train the model
gb_model.fit(X_train, y_train)

# Predict on the test set
y_pred = gb_model.predict(X_test)

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print the evaluation metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")


Accuracy: 0.9424
Precision: 0.9462
Recall: 0.7804
F1-Score: 0.8554


### ADASYN

In [91]:
X = data.drop(columns=['loan_status'])
y = data['loan_status']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
counter = Counter(y_train)
print('Before', counter)
X_train, y_train = ADASYN().fit_resample(X_train, y_train)

counter = Counter(y_train)
print('After', counter)

Before Counter({0: 20354, 1: 5682})
After Counter({1: 20552, 0: 20354})


In [92]:
gb_model = GradientBoostingClassifier(n_estimators=500, random_state=42, learning_rate= 0.3)


# Train the model
gb_model.fit(X_train, y_train)

# Predict on the test set
y_pred = gb_model.predict(X_test)

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print the evaluation metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")


Accuracy: 0.9409
Precision: 0.9427
Recall: 0.7762
F1-Score: 0.8514


## Data undersampling with TomekLinks

In [87]:
X = data.drop(columns=['loan_status'])
y = data['loan_status']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
counter = Counter(y_train)
print('Before', counter)
X_train, y_train = TomekLinks().fit_resample(X_train, y_train)

counter = Counter(y_train)
print('After', counter)

Before Counter({0: 20354, 1: 5682})
After Counter({0: 18756, 1: 5682})


In [88]:
gb_model = GradientBoostingClassifier(n_estimators=500, random_state=42, learning_rate= 0.3)


# Train the model
gb_model.fit(X_train, y_train)

# Predict on the test set
y_pred = gb_model.predict(X_test)

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print the evaluation metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")


Accuracy: 0.9393
Precision: 0.9233
Recall: 0.7875
F1-Score: 0.8500
