In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_auc_score, f1_score
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.svm import SVC
import lightgbm as lgb
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, Flatten, Dropout
import joblib
import warnings
# Suppress all warnings for a cleaner output
warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

#import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
    #for filename in filenames:
        #print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [4]:
print("Starting model training pipeline with hyperparameter tuning...")
print("-" * 70)

Starting model training pipeline with hyperparameter tuning...
----------------------------------------------------------------------


## Load Dataset

In [5]:
try:
    df = pd.read_csv(r"C:\Users\linae\OneDrive\Desktop\Northwestern\MSDS 422\Project\credit_risk_dataset.csv")
    print("Dataset loaded successfully.")
except FileNotFoundError:
    print("Error: 'credit_risk_dataset.csv' not found. Please ensure the file path is correct.")
    exit()

Dataset loaded successfully.


## Data Preprocessing

In [6]:
# Drop rows with missing values in 'person_emp_length' and 'loan_int_rate'
print("\nHandling missing values...")
initial_rows = df.shape[0]
df_clean = df.dropna().copy()
rows_dropped = initial_rows - df_clean.shape[0]
print(f"Initial number of rows: {initial_rows}")
print(f"Number of rows after dropping NaNs: {df_clean.shape[0]} ({rows_dropped} rows dropped)")
print("-" * 70)


Handling missing values...
Initial number of rows: 32581
Number of rows after dropping NaNs: 28638 (3943 rows dropped)
----------------------------------------------------------------------


## Feature Engineering - One-Hot Encoding

In [7]:
print("\nPerforming one-hot encoding on categorical features...")
categorical_cols = [
    'person_home_ownership',
    'loan_intent',
    'loan_grade',
    'cb_person_default_on_file'
]
df_encoded = pd.get_dummies(df_clean, columns=categorical_cols, drop_first=True)
print("Categorical features encoded successfully.")
print(f"New DataFrame shape: {df_encoded.shape}")
print("-" * 70)


Performing one-hot encoding on categorical features...
Categorical features encoded successfully.
New DataFrame shape: (28638, 23)
----------------------------------------------------------------------


## Data Splitting

In [8]:
# Separate features (X) from the target variable (y)
print("\nSplitting data into training and testing sets...")
X = df_encoded.drop('loan_status', axis=1)
y = df_encoded['loan_status']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(f"Training set size: {X_train.shape[0]}")
print(f"Testing set size: {X_test.shape[0]}")
print("-" * 70)


Splitting data into training and testing sets...
Training set size: 22910
Testing set size: 5728
----------------------------------------------------------------------


## Feature Scaling

In [9]:
print("\nScaling numerical features...")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("Features scaled successfully.")
print("-" * 70)


Scaling numerical features...
Features scaled successfully.
----------------------------------------------------------------------


## Model Training

### Logistic Regression with Tuning

In [10]:
print("\nTraining and tuning Logistic Regression model...")
# Define the hyperparameters to search
log_reg_param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'penalty': ['l1', 'l2']
}


Training and tuning Logistic Regression model...


In [11]:
# Initialize GridSearchCV with the model and parameter grid
log_reg_grid_search = GridSearchCV(
    LogisticRegression(solver='liblinear', random_state=42),
    log_reg_param_grid,
    cv=3,
    scoring='roc_auc',
    verbose=1,
    n_jobs=-1
)
log_reg_grid_search.fit(X_train_scaled, y_train)

# Get the best model from the search
best_log_reg_model = log_reg_grid_search.best_estimator_

Fitting 3 folds for each of 8 candidates, totalling 24 fits


In [12]:
# Evaluate the best model
y_pred_log_reg = best_log_reg_model.predict(X_test_scaled)
y_pred_proba_log_reg = best_log_reg_model.predict_proba(X_test_scaled)[:, 1]

In [18]:

from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve

print("\n--- Best Logistic Regression Model Evaluation ---")
print(f"Best Parameters: {log_reg_grid_search.best_params_}")
print("Classification Report:")
print(classification_report(y_test, y_pred_log_reg))
print(f"ROC-AUC Score: {roc_auc_score(y_test, y_pred_proba_log_reg):.4f}")
f1_log_reg = f1_score(y_test, y_pred_log_reg, pos_label=1)
print("-" * 70)
cm = confusion_matrix(y_test, y_pred_log_reg)
print("Confusion Matrix:")
print(cm)
print("-" * 70)
joblib.dump(best_log_reg_model, 'logistic_regression_model.pkl')


--- Best Logistic Regression Model Evaluation ---
Best Parameters: {'C': 0.1, 'penalty': 'l1'}
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.95      0.91      4487
           1       0.75      0.53      0.62      1241

    accuracy                           0.86      5728
   macro avg       0.82      0.74      0.77      5728
weighted avg       0.85      0.86      0.85      5728

ROC-AUC Score: 0.8617
----------------------------------------------------------------------
Confusion Matrix:
[[4272  215]
 [ 588  653]]
----------------------------------------------------------------------


['logistic_regression_model.pkl']

In [19]:
# Save model metadata
log_reg_metadata = {
    'model_name': 'Logistic Regression',
    'best_parameters': log_reg_grid_search.best_params_,
    'roc_auc_score': roc_auc_score,
    'f1_score_positive_class': f1_log_reg
}
joblib.dump(log_reg_metadata, 'logistic_regression_metadata.pkl')
joblib.dump(scaler, 'scaler.pkl')
print("Saved best Logistic Regression model and metadata to disk.")
print("-" * 70)

Saved best Logistic Regression model and metadata to disk.
----------------------------------------------------------------------


## Logistic Regression Model
### Why This Model Was Used
Logistic Regression is a foundational and highly interpretable machine learning algorithm. It was included in the project for two main reasons:

Interpretability: Its linear nature makes it easy to understand which features are most important and how they influence the final prediction. This is critical for model explainability and regulatory compliance, as mentioned in your abstract.

Baseline Performance: As a "traditional" statistical model, it provides a solid baseline to compare the performance of more complex machine learning models like XGBoost and LightGBM. If a complex model doesn't significantly outperform the simple one, it may not be worth the added complexity.

### How It Was Trained
The model was trained using GridSearchCV on the scaled data. Scaling is crucial for Logistic Regression because it is sensitive to the magnitude of the features. Without scaling, a feature with a large range (like person_income) could dominate the model's training over a feature with a small range (like loan_grade). The GridSearchCV method systematically tested different combinations of hyperparameters to find the best performing model.

### Key Parameters Used
C: This is the regularization parameter. It controls how much the model's complexity is penalized. A smaller C value increases the penalty, which helps prevent overfitting by making the model simpler.

penalty: This specifies the type of regularization used. l1 regularization shrinks the coefficients of less important features to zero, effectively performing feature selection. l2 regularization (which is the default) shrinks the coefficients but doesn't force them to be exactly zero.

solver='liblinear': A specific algorithm used to optimize the model. It is an efficient choice for smaller datasets and for when you want to use l1 regularization.

### XGBoost with Tuning

In [20]:
print("\nTraining and tuning XGBoost model...")
xgb_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2]
}


Training and tuning XGBoost model...


In [21]:
xgb_grid_search = GridSearchCV(
    XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
    xgb_param_grid,
    cv=3,
    scoring='roc_auc',
    verbose=1,
    n_jobs=-1
)
xgb_grid_search.fit(X_train, y_train)

# Get the best model from the search
best_xgb_model = xgb_grid_search.best_estimator_

Fitting 3 folds for each of 27 candidates, totalling 81 fits


In [22]:
# Evaluate the best model
y_pred_xgb = best_xgb_model.predict(X_test)
y_pred_proba_xgb = best_xgb_model.predict_proba(X_test)[:, 1]

In [26]:
print("\n--- Best XGBoost Model Evaluation ---")
print(f"Best Parameters: {xgb_grid_search.best_params_}")
print("Classification Report:")
print(classification_report(y_test, y_pred_xgb))
print(f"ROC-AUC Score: {roc_auc_score(y_test, y_pred_proba_xgb):.4f}")
f1_xgb = f1_score(y_test, y_pred_log_reg, pos_label=1)

cm = confusion_matrix(y_test, y_pred_log_reg)
print("Confusion Matrix:")
print(cm)
print("-" * 70)
print("-" * 70)
joblib.dump(best_xgb_model, 'xgboost_model.pkl')


--- Best XGBoost Model Evaluation ---
Best Parameters: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200}
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.99      0.96      4487
           1       0.97      0.72      0.82      1241

    accuracy                           0.93      5728
   macro avg       0.95      0.85      0.89      5728
weighted avg       0.94      0.93      0.93      5728

ROC-AUC Score: 0.9429
Confusion Matrix:
[[4272  215]
 [ 588  653]]
----------------------------------------------------------------------
----------------------------------------------------------------------


['xgboost_model.pkl']

In [27]:
xgb_metadata = {
    'model_name': 'XGBoost',
    'best_parameters': xgb_grid_search.best_params_,
    'roc_auc_score': roc_auc_score,
    'f1_score_positive_class': f1_xgb
}
joblib.dump(xgb_metadata, 'xgboost_metadata.pkl')
print("Saved best XGBoost model and metadata to disk.")
print("-" * 70)

Saved best XGBoost model and metadata to disk.
----------------------------------------------------------------------


## XGBoost Model
### Why This Model Was Used
XGBoost (eXtreme Gradient Boosting) is one of the most powerful and widely used machine learning algorithms for tabular data. It was a perfect fit for this project because it is known for its high predictive accuracy and computational efficiency, which are essential for building a robust credit risk scoring system. It excels at capturing complex, non-linear relationships in the data.

### How It Was Trained
The model was trained using GridSearchCV on the unscaled data. Gradient boosting models like XGBoost are tree-based, so they are not sensitive to the scale of the features. This makes them less reliant on a separate scaling step. GridSearchCV was used to find the optimal combination of hyperparameters that would result in the highest ROC-AUC score.

### Key Parameters Used
n_estimators: This is the number of boosting rounds or the number of decision trees to be built sequentially. More trees can improve performance but also increase the risk of overfitting and training time.

max_depth: This controls the maximum depth of each tree. A deeper tree can capture more complex interactions but may also lead to overfitting.

learning_rate: This determines the step size at each boosting iteration. A lower learning rate requires more trees but makes the model more robust to overfitting. The ideal learning_rate is usually a trade-off between model performance and training time.

### Support Vector Machine (SVM) with Tuning

In [28]:
print("\nTraining and tuning SVM model")

# Define the hyperparameters to search
svm_param_grid = {
    'C': [0.1, 1],
    'gamma': ['scale', 'auto'],
    'kernel': ['rbf']
}


Training and tuning SVM model


In [29]:
# Initialize GridSearchCV
svm_grid_search = GridSearchCV(
    SVC(probability=True, random_state=42),
    svm_param_grid,
    cv=3,
    scoring='roc_auc',
    verbose=1,
    n_jobs=-1
)
svm_grid_search.fit(X_train_scaled, y_train)

# Get the best model from the search
best_svm_model = svm_grid_search.best_estimator_

Fitting 3 folds for each of 4 candidates, totalling 12 fits


In [30]:
# Evaluate the best model
y_pred_svm = best_svm_model.predict(X_test_scaled)
y_pred_proba_svm = best_svm_model.predict_proba(X_test_scaled)[:, 1]

In [34]:
print("\n--- Best SVM Model Evaluation ---")
print(f"Best Parameters: {svm_grid_search.best_params_}")
print("Classification Report:")
print(classification_report(y_test, y_pred_svm))
print(f"ROC-AUC Score: {roc_auc_score(y_test, y_pred_proba_svm):.4f}")
f1_svm = f1_score(y_test, y_pred_log_reg, pos_label=1)

cm = confusion_matrix(y_test, y_pred_log_reg)
print("Confusion Matrix:")
print(cm)

print("-" * 70)
joblib.dump(best_svm_model, 'svm_model.pkl')


--- Best SVM Model Evaluation ---
Best Parameters: {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.99      0.94      4487
           1       0.93      0.62      0.74      1241

    accuracy                           0.91      5728
   macro avg       0.91      0.80      0.84      5728
weighted avg       0.91      0.91      0.90      5728

ROC-AUC Score: 0.8893
Confusion Matrix:
[[4272  215]
 [ 588  653]]
----------------------------------------------------------------------


['svm_model.pkl']

In [35]:
svm_metadata = {
    'model_name': 'Support Vector Machine',
    'best_parameters': svm_grid_search.best_params_,
    'roc_auc_score': roc_auc_score,
    'f1_score_positive_class': f1_svm
}
joblib.dump(svm_metadata, 'svm_metadata.pkl')
print("Saved best SVM model and metadata to disk.")
print("-" * 70)

Saved best SVM model and metadata to disk.
----------------------------------------------------------------------


## Support Vector Machine (SVM) Model
### Why This Model Was Used
SVM is a powerful model for both linear and non-linear classification. It was included to demonstrate the project's capability to handle a variety of algorithms, including those that are not tree-based. It is known for its effectiveness in high-dimensional spaces and for its ability to model complex decision boundaries.

### How It Was Trained
Like Logistic Regression, the SVM model was trained using GridSearchCV on the scaled data. SVM's performance is highly dependent on feature scaling because the algorithm calculates distances between data points. The GridSearchCV was used to optimize for the best C, gamma, and kernel combination.

### Key Parameters Used
C: The regularization parameter. It controls the trade-off between a smooth decision boundary and correctly classifying training points. A smaller C emphasizes a wider margin (more simple boundary) while a larger C aims for a more precise classification of each training point, which can lead to overfitting.

gamma: The kernel coefficient for the rbf (Radial Basis Function) kernel. It defines how far the influence of a single training example reaches. A small gamma means a large influence, and a large gamma means a small influence.

kernel: The kernel function specifies the type of transformation applied to the data. We used the 'rbf' kernel, which is highly effective for non-linear problems and is what allows the SVM to find complex, curved decision boundaries.

### LightGBM with Tuning

In [36]:
print("\nTraining and tuning LightGBM model...")
lgb_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'learning_rate': [0.01, 0.1, 0.2],
    'num_leaves': [31, 50]
}


Training and tuning LightGBM model...


In [37]:
lgb_grid_search = GridSearchCV(
    lgb.LGBMClassifier(random_state=42),
    lgb_param_grid,
    cv=3,
    scoring='roc_auc',
    verbose=0,
    n_jobs=-1
)
lgb_grid_search.fit(X_train, y_train)

best_lgb_model = lgb_grid_search.best_estimator_

[LightGBM] [Info] Number of positive: 4962, number of negative: 17948
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000521 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 923
[LightGBM] [Info] Number of data points in the train set: 22910, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216587 -> initscore=-1.285670
[LightGBM] [Info] Start training from score -1.285670


In [38]:
y_pred_lgb = best_lgb_model.predict(X_test)
y_pred_proba_lgb = best_lgb_model.predict_proba(X_test)[:, 1]

In [52]:
print("\n--- Best LightGBM Model Evaluation ---")
print(f"Best Parameters: {lgb_grid_search.best_params_}")
print("Classification Report:")
print(classification_report(y_test, y_pred_lgb))
print(f"ROC-AUC Score: {roc_auc_score(y_test, y_pred_proba_lgb):.4f}")
f1_lgb = f1_score(y_test, y_pred_log_reg, pos_label=1)

cm = confusion_matrix(y_test, y_pred_log_reg)
print("Confusion Matrix:")
print(cm)
print("-" * 70)
joblib.dump(best_lgb_model, 'lightgbm_model.pkl')


--- Best LightGBM Model Evaluation ---
Best Parameters: {'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 200, 'num_leaves': 50}
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.99      0.96      4487
           1       0.95      0.72      0.82      1241

    accuracy                           0.93      5728
   macro avg       0.94      0.86      0.89      5728
weighted avg       0.93      0.93      0.93      5728

ROC-AUC Score: 0.9434
Confusion Matrix:
[[4272  215]
 [ 588  653]]
----------------------------------------------------------------------


['lightgbm_model.pkl']

In [53]:

lgb_metadata = {
    'model_name': 'LightGBM',
    'best_parameters': lgb_grid_search.best_params_,
    'roc_auc_score': roc_auc_score,
    'f1_score_positive_class': f1_lgb
}
joblib.dump(lgb_metadata, 'lightgbm_metadata.pkl')
print("Saved best LightGBM model and metadata to disk.")
print("-" * 70)

Saved best LightGBM model and metadata to disk.
----------------------------------------------------------------------


## LightGBM Model
## Why This Model Was Used
LightGBM is a gradient boosting framework that is designed to be highly efficient and fast. It was chosen as a primary model because it is often faster than XGBoost and can handle large datasets with high performance. Its unique tree-growing strategy (leaf-wise vs. level-wise) makes it a strong competitor for real-time applications.

## How It Was Trained
Similar to XGBoost, LightGBM is a tree-based model and was trained using GridSearchCV on the unscaled data. This makes it less sensitive to feature scaling. The grid search was used to tune its key hyperparameters to maximize the ROC-AUC score.

## Key Parameters Used
n_estimators: The number of boosting rounds or trees, similar to XGBoost.

max_depth: The maximum depth of each tree.

learning_rate: The step size shrinkage to prevent overfitting.

num_leaves: This is a unique parameter for LightGBM. It controls the maximum number of leaves in one tree. A larger number of leaves can increase the model's complexity and accuracy, but also its risk of overfitting.

### 1D Convolutional Neural Network (CNN)

In [40]:
print("\nTraining 1D CNN model...")
# Reshape the data for CNN input (samples, timesteps, features)
X_train_cnn = X_train_scaled.reshape(X_train_scaled.shape[0], 1, X_train_scaled.shape[1])
X_test_cnn = X_test_scaled.reshape(X_test_scaled.shape[0], 1, X_test_scaled.shape[1])


Training 1D CNN model...


In [41]:
# Build the 1D CNN model
cnn_model = Sequential([
    Conv1D(filters=64, kernel_size=1, activation='relu', input_shape=(1, X_train_scaled.shape[1])),
    Dropout(0.5),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

In [42]:
# Compile the model
cnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['AUC', 'accuracy'])
# Train the model
cnn_model.fit(X_train_cnn, y_train, epochs=10, batch_size=32, validation_split=0.2, verbose=1)

Epoch 1/10
[1m573/573[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 8ms/step - AUC: 0.8108 - accuracy: 0.8334 - loss: 0.4074 - val_AUC: 0.8909 - val_accuracy: 0.8865 - val_loss: 0.3175
Epoch 2/10
[1m573/573[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 9ms/step - AUC: 0.8654 - accuracy: 0.8679 - loss: 0.3431 - val_AUC: 0.8974 - val_accuracy: 0.8942 - val_loss: 0.3018
Epoch 3/10
[1m573/573[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - AUC: 0.8749 - accuracy: 0.8790 - loss: 0.3261 - val_AUC: 0.9013 - val_accuracy: 0.8990 - val_loss: 0.2893
Epoch 4/10
[1m573/573[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 10ms/step - AUC: 0.8809 - accuracy: 0.8862 - loss: 0.3110 - val_AUC: 0.9042 - val_accuracy: 0.9020 - val_loss: 0.2832
Epoch 5/10
[1m573/573[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - AUC: 0.8859 - accuracy: 0.8898 - loss: 0.3028 - val_AUC: 0.9087 - val_accuracy: 0.9042 - val_loss: 0.2762
Epoch 6/10
[1m573/573[0m [

<keras.src.callbacks.history.History at 0x17589c62540>

In [43]:
# Evaluate the model
cnn_eval = cnn_model.evaluate(X_test_cnn, y_test, verbose=0)
y_pred_proba_cnn = cnn_model.predict(X_test_cnn)
y_pred_cnn = (y_pred_proba_cnn > 0.5).astype(int)

[1m179/179[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step


In [57]:
print("\n--- 1D CNN Model Evaluation ---")
print("Classification Report:")
print(classification_report(y_test, y_pred_cnn))
print(f"ROC-AUC Score: {roc_auc_score(y_test, y_pred_proba_cnn):.4f}")
f1_cnn = f1_score(y_test, y_pred_log_reg, pos_label=1)
cm = confusion_matrix(y_test, y_pred_log_reg)
print("Confusion Matrix:")
print(cm)
print("-" * 70)
cnn_model.save('cnn_model.keras')


--- 1D CNN Model Evaluation ---
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.99      0.94      4487
           1       0.94      0.62      0.74      1241

    accuracy                           0.91      5728
   macro avg       0.92      0.80      0.84      5728
weighted avg       0.91      0.91      0.90      5728

ROC-AUC Score: 0.9113
Confusion Matrix:
[[4272  215]
 [ 588  653]]
----------------------------------------------------------------------


In [58]:
# Save model metadata
cnn_metadata = {
    'model_name': '1D CNN',
    'architecture': cnn_model.summary(),
    'training_epochs': 10,
    'batch_size': 32,
    'optimizer': 'adam',
    'roc_auc_score': roc_auc_score,
    'f1_score_positive_class': f1_cnn
}
joblib.dump(cnn_metadata, 'cnn_metadata.pkl')

print("Saved 1D CNN model and metadata to disk.")

NameError: name 'roc_auc_cnn' is not defined

## 1D CNN Model
### Why This Model Was Used
A 1D CNN was included to explore a deep learning approach to the problem. While CNNs are most famous for image processing (where they use 2D or 3D convolutions), a 1D CNN is effective for tabular data and sequential data. It can automatically learn and extract important features from the input data, potentially discovering patterns that other models might miss.

### How It Was Trained
This model was built using Sequential layers from Keras. Unlike the other models, it was not trained with GridSearchCV. Instead, it was trained using the fit() method over a fixed number of epochs. It was trained on the scaled data, as neural networks perform better when features are on a similar scale.

### Key Parameters Used
Conv1D: This is the core layer. It applies a 1D convolution operation to the input data, which helps to automatically learn important features from the columns.

Dropout: This layer randomly sets a fraction of input units to 0 at each update during training. This is a powerful technique for preventing overfitting by ensuring the network doesn't become too reliant on a small number of features.

Dense: These are standard fully-connected layers. They take all inputs from the previous layer and connect them to every neuron in the current layer.

optimizer='adam': The optimization algorithm used to train the network. Adam is a popular and efficient choice that adapts the learning rate for each parameter.

epochs=10: The number of times the model will go through the entire training dataset.

In [None]:
print("\nModel training pipeline complete. All three models and the scaler have been saved.")

In [None]:
# import os

# files_to_remove = [
#     '/kaggle/working/output_file1.csv',
#     '/kaggle/working/output_file2.csv',
#     '/kaggle/working/output_file3.csv'
# ]

# for file_path in files_to_remove:
#     if os.path.exists(file_path):
#         os.remove(file_path)
#         print(f"Removed: {file_path}")
#     else:
#         print(f"File not found: {file_path}")


### ANN

In [50]:
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras import layers, models, optimizers, regularizers



X_train_np = np.asarray(X_train_scaled, dtype=np.float32)
X_test_np  = np.asarray(X_test_scaled,  dtype=np.float32)
y_train_np = np.asarray(y_train, dtype=np.float32)
y_test_np  = np.asarray(y_test,  dtype=np.float32)

classes = np.unique(y_train_np)
cw_vals = compute_class_weight(class_weight="balanced", classes=classes, y=y_train_np)
class_weight = {int(c): float(w) for c, w in zip(classes, cw_vals)}
print("Class weights:", class_weight)

def simple_ann_sequential(
    input_dim: int,
    hidden_units=(128, 64),
    dropout=(0.2, 0.2),
    lr=1e-3,
):
    model = models.Sequential(name="simple_ann")
    
    # Input layer
    model.add(layers.Input(shape=(input_dim,)))

    # Hidden layers with Batch Normalization and Dropout
    for units, dr in zip(hidden_units, dropout):
        model.add(layers.Dense(units, activation="relu"))
        model.add(layers.BatchNormalization())
        model.add(layers.Dropout(dr))
    
    # Output layer
    model.add(layers.Dense(1, activation="sigmoid"))

    model.compile(
        optimizer=optimizers.Adam(learning_rate=lr),
        loss="binary_crossentropy",
        metrics=[
            "accuracy",
            tf.keras.metrics.AUC(name="auc", curve="ROC"),
        ],
    )
    return model

THRESHOLD = 0.40
input_dim = X_train_np.shape[1]
model = simple_ann_sequential(input_dim=input_dim)
#model.summary()

history = model.fit(
    X_train_np, y_train_np,
    validation_data=(X_test_np, y_test_np),
    epochs=50,
    batch_size=256,
    class_weight=class_weight,
    verbose=1
)

def predict_with_threshold(probs, thr=THRESHOLD):
    return (probs >= thr).astype(int)

# Correctly evaluate and save the metrics to variables
y_prob = model.predict(X_test_np).ravel()
y_pred = predict_with_threshold(y_prob, thr=THRESHOLD)

# Calculate and store the metrics
roc_auc_ann = roc_auc_score(y_test_np, y_prob)
f1_ann = f1_score(y_test_np, y_pred)
cm_ann = confusion_matrix(y_test_np, y_pred)

print(f"ROC-AUC Score: {roc_auc_ann:.4f}")
print("Confusion Matrix:\n", cm_ann)
print("\nClassification Report:")
print(classification_report(y_test_np, y_pred, digits=4))
print("-" * 70)

model_path = "ann_model_simple.keras"
meta_path  = "ann_meta_simple.json"

# Save model
model.save(model_path)

# Save model metadata
meta = {
    "model_name": "ANN (simple)",
    "threshold": THRESHOLD,
    "hidden_units": [128, 64],
    "dropout": [0.2, 0.2],
    "lr": 1e-3,
    "class_weight": class_weight,
    "roc_auc_score": roc_auc_ann,
    "f1_score_positive_class": f1_ann,
    "notes": "BN+ReLU+Dropout; numeric-only scaled inputs; seed=42",
}
with open(meta_path, "w") as f:
    json.dump(meta, f, indent=2)

print(f"Saved: {model_path}, {meta_path}")
print("-" * 70)

Class weights: {0: 0.6382326721640295, 1: 2.3085449415558243}
Epoch 1/50
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 17ms/step - accuracy: 0.7292 - auc: 0.8299 - loss: 0.5313 - pr_auc: 0.6376 - val_accuracy: 0.8385 - val_auc: 0.8831 - val_loss: 0.4801 - val_pr_auc: 0.7612
Epoch 2/50
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.8186 - auc: 0.8797 - loss: 0.4327 - pr_auc: 0.7426 - val_accuracy: 0.8499 - val_auc: 0.8955 - val_loss: 0.4131 - val_pr_auc: 0.7816
Epoch 3/50
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.8337 - auc: 0.8913 - loss: 0.4077 - pr_auc: 0.7786 - val_accuracy: 0.8527 - val_auc: 0.9018 - val_loss: 0.3730 - val_pr_auc: 0.7950
Epoch 4/50
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.8390 - auc: 0.8977 - loss: 0.3953 - pr_auc: 0.7902 - val_accuracy: 0.8652 - val_auc: 0.9055 - val_loss: 0.3427 - val_pr_auc: 0.8063
Epoch 5/50
[1m90

NameError: name 'pr_auc_ann' is not defined