In [1]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
import lightgbm as lgbm
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, 
    average_precision_score, confusion_matrix, log_loss, cohen_kappa_score,
    classification_report,
    ConfusionMatrixDisplay
)
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier
from bayes_opt import BayesianOptimization
from sklearn.preprocessing import StandardScaler
import lime
import lime.lime_tabular
import shap
import joblib
import seaborn as sns
from skopt import BayesSearchCV
from skopt.space import Integer
import warnings


In [2]:
train_data = pd.read_csv("datasets/train.csv")
X = train_data.drop("EVENT", axis=1)
y = train_data["EVENT"]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [7]:
rf_model = RandomForestClassifier(random_state=42)

param_grid = {
    'n_estimators': Integer(100, 200),
    'max_depth': Integer(3, 20),
    'min_samples_split': Integer(2, 20),
    'min_samples_leaf': Integer(1, 20),
    'max_features': Integer(1, X.shape[1])
}

opt = BayesSearchCV(
    estimator=rf_model,
    search_spaces=param_grid,
    n_iter=1,
    scoring='recall',
    cv=2,
    n_jobs=-1,
    random_state=42
)


In [8]:
start_train = time.time()
opt.fit(X_train, y_train)
end_train = time.time()
training_time = end_train - start_train

best_model = opt.best_estimator_

val_preds = best_model.predict(X_val)
print("========== VALIDATION RESULTS ==========")
print("Confusion Matrix:\n", confusion_matrix(y_val, val_preds))
print("\nClassification Report:\n", classification_report(y_val, val_preds))
print(f"Training time: {training_time:.2f} seconds")


Confusion Matrix:
 [[27425    25]
 [  818 26802]]

Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.98     27450
           1       1.00      0.97      0.98     27620

    accuracy                           0.98     55070
   macro avg       0.99      0.98      0.98     55070
weighted avg       0.99      0.98      0.98     55070

Training time: 706.05 seconds


In [9]:
test_data = pd.read_csv("datasets/test.csv")
if "EVENT" in test_data.columns:
    test_data = test_data.drop("EVENT", axis=1)

start_infer = time.time()
test_preds = best_model.predict(test_data)
end_infer = time.time()
inference_time = end_infer - start_infer

print(f"Inference Time: {inference_time}")



Inference Time: 1.6340532302856445


In [12]:
joblib.dump(best_model, 'rf_model.pkl')

['rf_model.pkl']

In [10]:
cm = confusion_matrix(test_data, test_preds)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()

ValueError: Classification metrics can't handle a mix of continuous-multioutput and binary targets

In [None]:
plt.figure(figsize=(6, 4))
sns.barplot(x=['Training Time', 'Inference Time'], y=[training_time, inference_time], palette='viridis')
plt.ylabel('Time (seconds)')
plt.title('Training vs Inference Time')
plt.show()


In [None]:
explainer = shap.Explainer(best_model, X_train)
shap_values = explainer(X_val)

shap.summary_plot(shap_values, X_val, plot_type="bar")


In [None]:
lime_explainer = lime.lime_tabular.LimeTabularExplainer(
    training_data=X_train.values,
    feature_names=X_train.columns.tolist(),
    class_names=['Class 0', 'Class 1'],
    mode='classification'
)

i = 0  # index of instance to explain
exp = lime_explainer.explain_instance(X_val.iloc[i].values, best_model.predict_proba, num_features=10)
exp.show_in_notebook(show_all=False)
