In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import xgboost as xgb

from joblib import dump
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import set_random_seed

from sklearn.metrics import log_loss
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split


In [None]:
# Load the feature and label CSVs for the train and test partitions from
# ../data/splits. Features and labels are stored in separate files.
X_train, y_train = (
    pd.read_csv('../data/splits/X_train.csv'),
    pd.read_csv('../data/splits/y_train.csv'),
)
X_test, y_test = (
    pd.read_csv('../data/splits/X_test.csv'),
    pd.read_csv('../data/splits/y_test.csv'),
)

In [None]:
# Load the cleaned dataset from ../data/cleaned.
# NOTE(review): `df` is not referenced by any later cell visible in this
# notebook -- confirm it is still needed, otherwise this read can be dropped.
df = pd.read_csv('../data/cleaned/clean_2012_2024.csv')

In [None]:
# Flatten the label containers to 1-D NumPy arrays, the shape the estimators
# and metric functions below are called with.
# np.asarray() accepts both the DataFrame read from CSV and an already
# flattened ndarray, so this cell can be re-run safely. The previous
# `.values.ravel()` raised AttributeError on a second run because an ndarray
# has no `.values` attribute.
y_train = np.asarray(y_train).ravel()
y_test = np.asarray(y_test).ravel()

In [None]:
# ANN Classification
# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation, dropout
# masks and batch shuffling are repeatable on Restart & Run All.
set_random_seed(42)

# Early stopping must not look at the test set: monitoring val_loss on
# (X_test, y_test) would pick the stopping epoch using the same data the model
# is scored on afterwards. Hold out a stratified validation fold from the
# training data instead and leave the test set untouched until evaluation.
X_ann_fit, X_ann_val, y_ann_fit, y_ann_val = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=42
)

early_stop = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=15,
    # When stopping early, roll back to the epoch with the lowest val_loss
    # instead of keeping the weights from `patience` epochs later.
    restore_best_weights=True,
    verbose=1,
)

# Four progressively narrower ReLU layers, each followed by 50% dropout,
# then a single sigmoid unit for the binary target.
ann_model = Sequential()
for units in (128, 64, 32, 16):
    ann_model.add(Dense(units, activation='relu'))
    ann_model.add(Dropout(0.5))
ann_model.add(Dense(1, activation='sigmoid'))

ann_model.compile(loss='binary_crossentropy', optimizer='adam')
ann_model.fit(
    X_ann_fit,
    y_ann_fit,
    epochs=100,
    batch_size=16,
    validation_data=(X_ann_val, y_ann_val),
    callbacks=[early_stop],
    verbose=0,  # integer verbosity level; 0 = silent
)

# Threshold the sigmoid output at 0.5 to obtain hard 0/1 labels.
ann_predictions = (ann_model.predict(X_test) > 0.5).astype("int32")

In [None]:
# Random Forest
# 750 trees with a fixed seed for reproducibility. n_jobs=-1 fits (and later
# predicts with) the trees on all available cores instead of a single one;
# the fitted forest is still determined by random_state.
rf_model = RandomForestClassifier(n_estimators=750, random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)

# Hard class predictions on the held-out test features.
rf_predictions = rf_model.predict(X_test)

In [None]:
# XGBoost
# Gradient-boosted trees for the binary target. Hyperparameters are spelled
# out explicitly so the configuration is visible in the notebook.
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    # `random_state` is the documented scikit-learn-API name for the RNG seed.
    # The previous `seed=` only reached the booster through **kwargs, which
    # the wrapper does not guarantee to handle consistently with scikit-learn.
    random_state=42,
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,         # row subsampling per tree
    colsample_bytree=0.8,  # column subsampling per tree
    gamma=0,
    min_child_weight=1,
    reg_alpha=0,
    reg_lambda=1,
)
xgb_model.fit(X_train, y_train)

# Hard class predictions on the held-out test features.
xgb_predictions = xgb_model.predict(X_test)

In [None]:
# Log Loss Ensemble Model
# Blend the three base models' positive-class probabilities, weighting each
# model by the inverse of its log-loss (lower loss -> larger weight).
#
# NOTE(review): the weights are derived from log-loss on (X_test, y_test) and
# the blended predictions are then made on that same X_test, so any score
# computed for the ensemble against y_test is optimistically biased. The
# weights should be fitted on a separate validation split -- confirm and fix
# upstream.

# Positive-class probabilities on the test features.
ann_probs = ann_model.predict(X_test).ravel()
xgb_probs = xgb_model.predict_proba(X_test)[:, 1]
rf_probs = rf_model.predict_proba(X_test)[:, 1]

# Per-model log-loss against the test labels.
log_loss_rf, log_loss_xgb, log_loss_ann = (
    log_loss(y_test, probs) for probs in (rf_probs, xgb_probs, ann_probs)
)

# Raw inverse-loss weights, then rescaled so they sum to 1.
weights = dict(rf=1 / log_loss_rf, xgb=1 / log_loss_xgb, ann=1 / log_loss_ann)
weights_sum = sum(weights.values())
normalized_weights = {name: raw / weights_sum for name, raw in weights.items()}

# Convex combination of the three probability vectors.
rf_part = normalized_weights['rf'] * rf_probs
xgb_part = normalized_weights['xgb'] * xgb_probs
ann_part = normalized_weights['ann'] * ann_probs
weighted_probs = rf_part + xgb_part + ann_part

# Final hard labels: class 1 when the blended probability is at least 0.5.
log_loss_predictions = (weighted_probs >= 0.5).astype(int)

In [None]:
# For every model, print a heading followed by its classification report and
# confusion matrix, both computed against the test labels.
reports = (
    ("ANN MODEL", ann_predictions),
    ("RANDOM FOREST MODEL", rf_predictions),
    ("XGBOOST MODEL", xgb_predictions),
    ("ENSEMBLE MODEL LOG-LOSS ", log_loss_predictions),
)

for heading, predicted in reports:
    print(heading)
    print(classification_report(y_test, predicted))
    print(confusion_matrix(y_test, predicted))

In [None]:
# Export Models
# Create the output directory first: all three writers below fail if the
# target folder does not exist, which would discard the training work above.
Path('../results/models').mkdir(parents=True, exist_ok=True)

# Each model is persisted with its own library's writer.
ann_model.save('../results/models/ann_model.keras')
xgb_model.save_model('../results/models/xgb_model.json')
dump(rf_model, '../results/models/rf_model.joblib')