In [23]:
!pip install catboost scikit-learn xgboost imblearn optuna -q

[0m

In [51]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from imblearn.combine import SMOTETomek
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

# Preprocessing
df = pd.read_csv('/home/workspace/-24633600_1327689908.csv')
# df = pd.read_csv('/content/drive/MyDrive/Projects/TurboFan_20241208_1/-24633600_1327689908.csv')

X = df.drop(['Label'], axis=1)
y = df['Label']

random_state = 42
random_seed = 42
seed = 8

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

from sklearn.cluster import DBSCAN
dbscan = DBSCAN(eps=0.5, min_samples=10)
dbscan_labels = dbscan.fit_predict(X_scaled)

mask = dbscan_labels != -1
X_filtered = X_scaled[mask]
y_filtered = y[mask]

top_features = [7, 4, 1, 0, 6]
X_filtered_selected = X_filtered[:, top_features]

X_train, X_test, y_train, y_test = train_test_split(
    X_filtered_selected, y_filtered, test_size=0.2, random_state= random_state, stratify=y_filtered
)

smote_tomek = SMOTETomek(random_state=random_state)
X_resampled, y_resampled = smote_tomek.fit_resample(X_train, y_train)

import optuna

def objective(trial):
    param = {
        'iterations': trial.suggest_int('iterations', 100, 2000, step=100),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.2, log=True),
        'depth': trial.suggest_int('depth', 3, 12),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 20),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'random_strength': trial.suggest_float('random_strength', 0, 1),
        'loss_function': 'MultiClass',
        'verbose': 0,
        'random_seed': random_seed
    }
    model = CatBoostClassifier(**param)
    model.fit(X_resampled, y_resampled)
    y_pred = model.predict(X_test)
    return f1_score(y_test, y_pred, average='macro')

study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=seed))
study.optimize(objective, n_trials=70)
best_params = study.best_params
print("Best Parameters from Optuna:", best_params)

cat_model = CatBoostClassifier(**best_params, verbose=0, random_seed=random_seed)
cat_model.fit(X_resampled, y_resampled)
y_pred_cat = cat_model.predict(X_test)
print("F1 Score (macro) for CatBoost:", f1_score(y_test, y_pred_cat, average='macro'))

rf_model = RandomForestClassifier(random_state=random_state, n_estimators=100, max_depth=5)
gb_model = GradientBoostingClassifier(random_state=random_state, n_estimators=100, max_depth=5)

stacking_clf = StackingClassifier(
    estimators=[('cat', cat_model), ('rf', rf_model), ('gb', gb_model)],
    final_estimator=LogisticRegression(),
    cv=5
)
stacking_clf.fit(X_resampled, y_resampled)
y_pred_stack = stacking_clf.predict(X_test)
print("F1 Score (macro) for StackingClassifier:", f1_score(y_test, y_pred_stack, average='macro'))

[I 2024-12-10 15:07:38,134] A new study created in memory with name: no-name-21831cbd-e0a1-4df9-8809-efb6260c801d
[I 2024-12-10 15:08:33,838] Trial 0 finished with value: 0.7682539682539683 and parameters: {'iterations': 1800, 'learning_rate': 0.17808619309984267, 'depth': 11, 'l2_leaf_reg': 11.08625813955638, 'border_count': 84, 'random_strength': 0.011398804277429897}. Best is trial 0 with value: 0.7682539682539683.
[I 2024-12-10 15:08:38,376] Trial 1 finished with value: 0.7682539682539683 and parameters: {'iterations': 900, 'learning_rate': 0.022057740377252798, 'depth': 8, 'l2_leaf_reg': 10.089444120245647, 'border_count': 156, 'random_strength': 0.5433860175425403}. Best is trial 0 with value: 0.7682539682539683.
[I 2024-12-10 15:08:51,480] Trial 2 finished with value: 0.7806637806637807 and parameters: {'iterations': 1600, 'learning_rate': 0.06922083093512263, 'depth': 9, 'l2_leaf_reg': 9.095743639047651, 'border_count': 96, 'random_strength': 0.9738552412004462}. Best is trial 

Best Parameters from Optuna: {'iterations': 1800, 'learning_rate': 0.08901253643210584, 'depth': 6, 'l2_leaf_reg': 8.105501613485234, 'border_count': 137, 'random_strength': 0.23602215621504516}
F1 Score (macro) for CatBoost: 0.8854700854700855
F1 Score (macro) for StackingClassifier: 0.8497435897435898
