In [None]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report, f1_score
from catboost import CatBoostClassifier

# Read the dataset from the working directory.
data = pd.read_csv('xydata_2021winter.csv')

# Multi-output targets: one column per cluster, Cluster0 .. Cluster5.
target_columns_clusters = ["Cluster{}".format(i) for i in range(6)]

# Model inputs, listed one per line in the column order fed to the model.
selected_features_updated = [
    "Time_Period",
    "Stn Press (kPa)",
    "Dew Point Temp (°C)",
    "Rel Hum (%)",
    "Visibility (km)",
    "Temp (°C)",
    "Wind Dir (10s deg)",
    "Wind Spd (km/h)",
    # Cluster 0 historical features
    "C0D-1HA",
    "C0D-2HA",
    "C0D-4HA",
    # Cluster 1 historical features
    "C1D-1HA",
    "C1D-2HA",
    "C1D-4HA",
    # Cluster 2 historical features
    "C2D-1HA",
    "C2D-2HA",
    "C2D-3HA",
    # Other cluster features
    "C3D-1HA",
    "C4D-1HA",
    "C4D-2HA",
    "C5D-1HA",
]

# Feature matrix and target matrix taken from the loaded frame.
X = data.loc[:, selected_features_updated]
y = data.loc[:, target_columns_clusters]

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)

# Hyperparameter grid explored by the grid search (3 * 4 * 4 * 3 * 3 = 432
# combinations). The ``estimator__`` prefix routes each setting to the
# CatBoostClassifier wrapped inside MultiOutputClassifier.
# Parameter set recorded in this notebook (presumably the best result of an
# earlier run):
# {'estimator__depth': 8, 'estimator__iterations': 100, 'estimator__l2_leaf_reg': 3, 'estimator__learning_rate': 0.05, 'estimator__scale_pos_weight': 5}
param_grid_expanded = {
    'estimator__learning_rate': [0.01, 0.05, 0.1],
    'estimator__depth': [3, 5, 7, 8],
    'estimator__iterations': [100, 200, 300, 500],
    'estimator__scale_pos_weight': [5, 10, 20],
    'estimator__l2_leaf_reg': [3, 5, 7]
}

# Base estimator for the search: MultiOutputClassifier wraps a single
# CatBoostClassifier so that the six cluster targets can be fitted together.
# The model is built exactly once; an earlier duplicate construction that was
# immediately overwritten has been removed.
# NOTE(review): early_stopping_rounds relies on CatBoost's overfitting
# detector, which needs a validation set (eval_set). Nothing in this pipeline
# passes one to fit(), so early stopping presumably never triggers and every
# model trains for the full `iterations` -- confirm, or supply an eval_set.
catboost_model = MultiOutputClassifier(
    CatBoostClassifier(
        verbose=50,
        random_state=42,
        early_stopping_rounds=20
    )
)

# Exhaustive 5-fold cross-validated search over the expanded grid, ranked by
# weighted F1. Train scores are kept so over-fitting can be inspected later,
# and all available cores are used. fit() returns the fitted search object,
# so construction and fitting are chained into one expression.
grid_catboost_expanded = GridSearchCV(
    estimator=catboost_model,
    param_grid=param_grid_expanded,
    scoring='f1_weighted',
    cv=5,
    n_jobs=-1,
    verbose=1,
    return_train_score=True,
).fit(X_train, y_train)

# Best parameters and scores.
# The train score is read at best_index_ so that it belongs to the same
# candidate as best_params_ / best_score_. Taking the maximum over all
# candidates would report the train score of whichever parameter set fits the
# training folds best, which need not be the selected one.
best_params_expanded = grid_catboost_expanded.best_params_
best_train_score_expanded = grid_catboost_expanded.cv_results_['mean_train_score'][
    grid_catboost_expanded.best_index_
]
best_val_score_expanded = grid_catboost_expanded.best_score_

# Test set predictions from the estimator refitted on the full training set.
y_test_pred_expanded = grid_catboost_expanded.best_estimator_.predict(X_test)
# zero_division=0 matches the classification report below: labels with no
# predicted/true samples score 0 without emitting a warning.
test_f1_score_expanded = f1_score(
    y_test, y_test_pred_expanded, average='weighted', zero_division=0
)

# Display results
print("Best parameters with expanded tuning:")
print(best_params_expanded)
print(f"Best cross-validation train score: {best_train_score_expanded:.2f}")
print(f"Best cross-validation validation score: {best_val_score_expanded:.2f}")
print(f"Test-set F1-score: {test_f1_score_expanded:.2f}")
print("\nClassification report on the test set:")
print(classification_report(y_test, y_test_pred_expanded, zero_division=0))


Fitting 5 folds for each of 432 candidates, totalling 2160 fits
0:	learn: 0.6896868	total: 51.4ms	remaining: 5.09s
0:	learn: 0.6892082	total: 50.5ms	remaining: 5s
0:	learn: 0.6896734	total: 51.7ms	remaining: 5.12s
50:	learn: 0.6000341	total: 105ms	remaining: 101ms
0:	learn: 0.6892794	total: 51.3ms	remaining: 5.08s
50:	learn: 0.5838185	total: 107ms	remaining: 103ms
0:	learn: 0.6895601	total: 51.1ms	remaining: 5.06s
50:	learn: 0.5969785	total: 106ms	remaining: 102ms
99:	learn: 0.5546159	total: 159ms	remaining: 0us
50:	learn: 0.5844736	total: 109ms	remaining: 105ms
0:	learn: 0.6867982	total: 55.9ms	remaining: 5.54s
0:	learn: 0.6877276	total: 57.1ms	remaining: 5.65s
99:	learn: 0.5375719	total: 163ms	remaining: 0us
0:	learn: 0.6904221	total: 1.34ms	remaining: 133ms
0:	learn: 0.6903228	total: 1.41ms	remaining: 140ms
50:	learn: 0.5999008	total: 112ms	remaining: 108ms
99:	learn: 0.5508331	total: 165ms	remaining: 0us
0:	learn: 0.6872855	total: 59ms	remaining: 5.84s
0:	learn: 0.6898632	total: 1.

  _data = np.array(data, dtype=dtype, copy=copy,


0:	learn: 0.6729557	total: 65.2ms	remaining: 6.46s
50:	learn: 0.3313901	total: 183ms	remaining: 176ms
99:	learn: 0.2230778	total: 318ms	remaining: 0us
0:	learn: 0.6745755	total: 4.83ms	remaining: 478ms
50:	learn: 0.2903036	total: 144ms	remaining: 138ms
99:	learn: 0.1704170	total: 273ms	remaining: 0us
0:	learn: 0.6681515	total: 3.17ms	remaining: 314ms
50:	learn: 0.2929775	total: 138ms	remaining: 133ms
99:	learn: 0.2017704	total: 261ms	remaining: 0us
0:	learn: 0.6686642	total: 3.26ms	remaining: 322ms
50:	learn: 0.2926215	total: 173ms	remaining: 167ms
99:	learn: 0.2060780	total: 298ms	remaining: 0us
0:	learn: 0.6588829	total: 2.45ms	remaining: 243ms
50:	learn: 0.2711598	total: 128ms	remaining: 123ms
99:	learn: 0.1838655	total: 270ms	remaining: 0us
0:	learn: 0.6586929	total: 3.46ms	remaining: 343ms
50:	learn: 0.2512785	total: 133ms	remaining: 128ms
99:	learn: 0.1811299	total: 296ms	remaining: 0us
Best parameters with expanded tuning:
{'estimator__depth': 8, 'estimator__iterations': 100, 'e