In [None]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report, f1_score
from catboost import CatBoostClassifier

# Read the raw observations from the CSV file into a DataFrame
data = pd.read_csv('xydata_2021winter.csv')

# Target columns: "Cluster0" through "Cluster5", one label column per cluster
target_columns_clusters = ["Cluster%d" % cluster_id for cluster_id in range(6)]

# Predictor columns. The order matters: downstream code pairs this list
# positionally with the fitted models' feature importances.
selected_features_updated = [
    "Time_Period",
    "Stn Press (kPa)",
    "Dew Point Temp (°C)",
    "Rel Hum (%)",
    "Visibility (km)",
    "Temp (°C)",
    "Wind Dir (10s deg)",
    "Wind Spd (km/h)",
    # Cluster 0 historical features
    "C0D-1HA",
    "C0D-2HA",
    "C0D-4HA",
    # Cluster 1 historical features
    "C1D-1HA",
    "C1D-2HA",
    "C1D-4HA",
    # Cluster 2 historical features
    "C2D-1HA",
    "C2D-2HA",
    "C2D-3HA",
    # Other cluster features
    "C3D-1HA",
    "C4D-1HA",
    "C4D-2HA",
    "C5D-1HA",
]

# Feature matrix and multi-column target frame
X = data.loc[:, selected_features_updated]
y = data.loc[:, target_columns_clusters]

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible.
# NOTE(review): train_test_split shuffles rows by default. If the rows are
# time-ordered and the "historical" columns are lagged values, confirm that a
# random split is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3
)

# Base model: MultiOutputClassifier fits one independent clone of the
# CatBoostClassifier below per target column.
catboost_model = MultiOutputClassifier(
    CatBoostClassifier(
        verbose=50,
        random_state=42,
        # NOTE(review): no eval_set is passed to the estimator through
        # MultiOutputClassifier / GridSearchCV, so the overfitting detector
        # presumably never triggers here -- confirm before relying on it.
        early_stopping_rounds=20,
        # The grid search below already fans out over all cores (n_jobs=-1).
        # CatBoost would otherwise also use every core inside each worker and
        # oversubscribe the CPU. Per the CatBoost docs, thread_count only
        # affects speed, not results.
        thread_count=1,
        # Parallel workers would all write logs/snapshots into the same
        # default `catboost_info` directory; disable file output to avoid
        # clashes and disk clutter.
        allow_writing_files=False,
    )
)

# Hyperparameter grid. The `estimator__` prefix routes each parameter to the
# wrapped CatBoostClassifier.
# NOTE: this is 3 * 4 * 4 * 3 * 5 * 5 * 3 = 10,800 candidates. With cv=5 that
# is 54,000 fits of the multi-output wrapper, each of which trains one
# CatBoost model per target column. Expect a very long runtime.
param_grid_expanded = {
    'estimator__learning_rate': [0.01, 0.05, 0.1],  # Boosting step size
    'estimator__depth': [3, 5, 7, 8],  # Tree depth
    'estimator__iterations': [100, 200, 300, 500],  # Number of boosting iterations
    'estimator__scale_pos_weight': [5, 10, 20],  # Weight of the positive class
    'estimator__l2_leaf_reg': [3, 5, 7, 10, 12],  # L2 regularization coefficient
    # Intensity of the Bayesian bootstrap on object weights (0 = all weights
    # equal to 1); this is not a feature-sampling parameter.
    'estimator__bagging_temperature': [0, 0.3, 0.5, 0.7, 1.0],
    # Fraction of features considered at each split selection
    'estimator__colsample_bylevel': [0.6, 0.8, 1.0],
}

# Exhaustive 5-fold grid search scored by weighted F1. Train scores are kept
# so the selected candidate's train/validation gap can be inspected later.
grid_catboost_expanded = GridSearchCV(
    catboost_model,
    param_grid_expanded,
    cv=5,
    scoring='f1_weighted',
    return_train_score=True,
    verbose=1,
    n_jobs=-1
)

# Run the grid search on the training split. With GridSearchCV's default
# refit=True, the best candidate is refitted on all of X_train afterwards and
# exposed as best_estimator_.
grid_catboost_expanded.fit(X_train, y_train)

# Best parameters and scores
best_params_expanded = grid_catboost_expanded.best_params_
# Report the mean train score of the *selected* candidate (best_index_), not
# the maximum train score over all candidates. The maximum may belong to a
# different, more overfitted parameter set and would not be comparable with
# best_score_ below.
best_train_score_expanded = grid_catboost_expanded.cv_results_['mean_train_score'][
    grid_catboost_expanded.best_index_
]
best_val_score_expanded = grid_catboost_expanded.best_score_

# Test set predictions
y_test_pred_expanded = grid_catboost_expanded.best_estimator_.predict(X_test)
# zero_division=0 matches the classification report below and suppresses the
# undefined-metric warning; the default already scores those cases as 0.
test_f1_score_expanded = f1_score(
    y_test, y_test_pred_expanded, average='weighted', zero_division=0
)

# Feature importance logging.
# MultiOutputClassifier holds one fitted CatBoost model per target column in
# `estimators_`, in the same order as the columns of y. Average the
# importances over all of them so the summary is not specific to the first
# target only.
fitted_cluster_models = grid_catboost_expanded.best_estimator_.estimators_
final_model = fitted_cluster_models[0]  # first target's model, kept for later cells
per_cluster_importances = pd.DataFrame(
    {
        cluster_name: cluster_model.feature_importances_
        for cluster_name, cluster_model in zip(
            target_columns_clusters, fitted_cluster_models
        )
    },
    index=selected_features_updated,
)
feature_importances = (
    per_cluster_importances.mean(axis=1)
    .rename('Importance')
    .rename_axis('Feature')
    .reset_index()
    .sort_values(by='Importance', ascending=False)
)

# Display results
print("Best parameters with expanded tuning:")
print(best_params_expanded)
print(f"Best cross-validation train score: {best_train_score_expanded:.2f}")
print(f"Best cross-validation validation score: {best_val_score_expanded:.2f}")
print(f"Test-set F1-score: {test_f1_score_expanded:.2f}")
print("\nClassification report on the test set:")
# target_names labels each report row with its cluster column instead of a
# bare positional index.
print(classification_report(
    y_test,
    y_test_pred_expanded,
    target_names=target_columns_clusters,
    zero_division=0,
))
print("\nTop Feature Importances:")
print(feature_importances.head(10))
