In [1]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report, f1_score
from catboost import CatBoostClassifier

# Load the dataset
data = pd.read_csv('xydata_2021winter.csv')

# Define target and features
target_columns_clusters = [f"Cluster{i}" for i in range(6)]
selected_features_updated = [
    "Time_Period", "Stn Press (kPa)", "Dew Point Temp (°C)", "Rel Hum (%)",
    "Visibility (km)", "Temp (°C)", "Wind Dir (10s deg)", "Wind Spd (km/h)",
    "C0D-1HA", "C0D-2HA", "C0D-4HA",  # Cluster 0 historical features
    "C1D-1HA", "C1D-2HA", "C1D-4HA",  # Cluster 1 historical features
    "C2D-1HA", "C2D-2HA", "C2D-3HA",  # Cluster 2 historical features
    "C3D-1HA", "C4D-1HA", "C4D-2HA", "C5D-1HA"  # Other cluster features
]

# Extract target and features
X = data[selected_features_updated]
y = data[target_columns_clusters]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Initialize CatBoostClassifier within a MultiOutputClassifier
catboost_model = MultiOutputClassifier(CatBoostClassifier(verbose=0, random_state=42))

# Define parameter grid for CatBoost
param_grid = {
    'estimator__learning_rate': [0.01, 0.1, 0.2],
    'estimator__depth': [3, 5, 7],
    'estimator__iterations': [100, 200],
    'estimator__scale_pos_weight': [5, 10, 20]  # Adjusting for imbalanced data
}

# Perform GridSearchCV
grid_catboost = GridSearchCV(
    catboost_model, param_grid, cv=5, scoring='f1_weighted', return_train_score=True, verbose=1, n_jobs=-1
)
grid_catboost.fit(X_train, y_train)

# Get the best parameters and scores
best_params = grid_catboost.best_params_
best_train_score = max(grid_catboost.cv_results_['mean_train_score'])
best_val_score = grid_catboost.best_score_

# Predict on the test set
y_test_pred = grid_catboost.best_estimator_.predict(X_test)

# Calculate test F1-score
test_f1_score = f1_score(y_test, y_test_pred, average='weighted')

# Display detailed results
print("Best parameters:")
print(best_params)
print(f"Best cross-validation train score: {best_train_score:.2f}")
print(f"Best cross-validation validation score: {best_val_score:.2f}")
print(f"Test-set F1-score: {test_f1_score:.2f}")
print("\nClassification report on the test set:")
print(classification_report(y_test, y_test_pred, zero_division=0))


Fitting 5 folds for each of 54 candidates, totalling 270 fits
Best parameters:
{'estimator__depth': 3, 'estimator__iterations': 100, 'estimator__learning_rate': 0.1, 'estimator__scale_pos_weight': 5}
Best cross-validation train score: 1.00
Best cross-validation validation score: 0.68
Test-set F1-score: 0.70

Classification report on the test set:
              precision    recall  f1-score   support

           0       0.47      0.85      0.61        55
           1       0.46      0.79      0.58        48
           2       0.57      0.96      0.71        75
           3       0.56      0.93      0.70        76
           4       0.61      0.93      0.74        73
           5       0.66      0.95      0.78        93

   micro avg       0.56      0.91      0.70       420
   macro avg       0.55      0.90      0.69       420
weighted avg       0.57      0.91      0.70       420
 samples avg       0.49      0.75      0.57       420

