In [1]:
# Import libraries
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import (classification_report, accuracy_score, 
                             confusion_matrix, ConfusionMatrixDisplay, make_scorer, roc_auc_score)
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import matplotlib.pyplot as plt

In [2]:
# Read the pre-processed train / test / validation splits from disk.
df_train, df_test, df_val = map(
    pd.read_csv,
    (
        '../data/processed/df_train.csv',
        '../data/processed/df_test.csv',
        '../data/processed/df_val.csv',
    ),
)

In [None]:
# Separate the feature columns (X_*) from the 'label' target column (y_*)
# for each split; the source frames are left untouched.
X_train, y_train = df_train.drop(columns='label'), df_train['label']
X_test, y_test = df_test.drop(columns='label'), df_test['label']
X_val, y_val = df_val.drop(columns='label'), df_val['label']

In [21]:
# Learn the label -> integer mapping from the training targets only, then
# reuse that same mapping for the held-out splits.
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc, y_val_enc = (le.transform(targets) for targets in (y_test, y_val))

In [27]:
# Candidate models, each wrapped in a single-step pipeline so that grid keys
# can address the estimator through the '<step>__<param>' convention.
gb_clf = Pipeline([('gb', GradientBoostingClassifier(random_state=2025))])
rf_clf = Pipeline([('rf', RandomForestClassifier(random_state=2025))])

# Hyper-parameter search spaces, keyed by the pipeline step name.
param_grids = dict(
    # Random forest search space
    rf={
        'rf__n_estimators': [200, 400, 600],
        'rf__max_depth': [5, 10, 15],
        'rf__min_samples_split': [2, 5, 10],
        'rf__min_samples_leaf': [1, 3, 5],
        'rf__max_features': ['sqrt', 'log2'],
    },
    # Gradient boosting search space
    gb={
        'gb__n_estimators': [100, 200, 400],
        'gb__learning_rate': [0.01, 0.05, 0.1],
        'gb__max_depth': [2, 3, 5],
        'gb__min_samples_split': [2, 5],
        'gb__min_samples_leaf': [1, 3],
        'gb__subsample': [0.7, 0.9, 1.0],
        'gb__max_features': ['sqrt', 'log2'],
    },
)

In [23]:
def display_metrics(y_hat, y_test, name, labels=(True, False),
                    display_labels=('spam', 'not_spam')):
    """Print accuracy and a classification report, then plot a confusion matrix.

    Parameters
    ----------
    y_hat : array-like
        Predicted labels.
    y_test : array-like
        Ground-truth labels, same length as ``y_hat``.
    name : str
        Model name shown in the plot title.
    labels : sequence, optional
        Label values selecting and ordering the rows/columns of the
        confusion matrix.
    display_labels : sequence of str, optional
        Tick labels drawn for ``labels``, in the same order.

    Returns
    -------
    None
        Output is printed and plotted.
    """
    # NOTE(review): the defaults for `labels` / `display_labels` are the values
    # that were previously hard-coded here; confirm they match the encoded
    # classes of this dataset.

    # scikit-learn metrics expect (y_true, y_pred). Passing them the other way
    # round swaps precision with recall in the report and transposes the
    # confusion matrix relative to the axes ConfusionMatrixDisplay draws.
    print(f'Accuracy score: {accuracy_score(y_test, y_hat)}')
    print(classification_report(y_test, y_hat))

    cm = confusion_matrix(y_test, y_hat, labels=list(labels))
    disp = ConfusionMatrixDisplay(cm, display_labels=list(display_labels))

    disp.plot()
    plt.title(f'Confusion matrix for {name}')
    plt.show()

In [28]:
# ROC-AUC scorer computed from predicted class probabilities.
# `needs_proba=True` is deprecated in recent scikit-learn releases;
# `response_method='predict_proba'` is its replacement.
weighted_auc = make_scorer(
    roc_auc_score,
    response_method='predict_proba',
    greater_is_better=True,
)

# (pipeline, display name, key into `param_grids`)
models = [
    (rf_clf, 'Random Forest', 'rf'),
    # NOTE(review): gb_clf wraps GradientBoostingClassifier, not the histogram
    # variant. The display name is left unchanged because it is also the key
    # stored in `performance`.
    (gb_clf, 'Histogram GBoosting', 'gb'),
]

performance = {}

for est, name, sname in models:
    # GridSearchCV clones `est` for every candidate and refits the best one on
    # the full training set, so no separate est.fit(...) is needed beforehand.
    estimator_cv = GridSearchCV(
        est,
        param_grid=param_grids[sname],
        scoring=weighted_auc,
        cv=5,
        n_jobs=-1,  # run candidate fits in parallel; does not change results
    )

    # Search the grid, then predict on the validation split with the refit
    # best estimator.
    estimator_cv.fit(X_train, y_train_enc)
    y_hat = estimator_cv.predict(X_val)

    # Save metrics in performance
    performance[name] = {
        'Accuracy': accuracy_score(y_val_enc, y_hat),
        'Best Params': estimator_cv.best_params_,
        'Estimator': estimator_cv.best_estimator_,
    }

    # Report on the validation split: predictions are compared with the
    # validation labels (not the training labels), and the model name required
    # by display_metrics(y_hat, y_test, name) is passed.
    display_metrics(y_hat, y_val_enc, name)


Traceback (most recent call last):
  File "/workspaces/particle-collision-classification/.venv/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 942, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/particle-collision-classification/.venv/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 308, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/particle-collision-classification/.venv/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 408, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/particle-collision-classification/.venv/lib/python3.12/site-packages/sk

KeyboardInterrupt: 