In [None]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report, f1_score
from catboost import CatBoostClassifier

# Function to load data and split into features and target
def load_data(file_path, target_columns, selected_features):
    """Read a CSV file and separate it into a feature frame and a target frame.

    Args:
        file_path: Path of the CSV file passed to ``pd.read_csv``.
        target_columns: Column labels that make up the targets (y).
        selected_features: Column labels that make up the model inputs (X).

    Returns:
        A ``(X, y)`` tuple: the selected feature columns and the target columns,
        both taken from the same loaded frame.

    Raises:
        KeyError: propagated from pandas if a requested column is not in the file.
    """
    frame = pd.read_csv(file_path)
    return frame[selected_features], frame[target_columns]

# Function to initialize CatBoost classifier within a MultiOutputClassifier
def initialize_catboost():
    """Build an unfitted CatBoost classifier wrapped in a MultiOutputClassifier.

    Returns:
        A ``MultiOutputClassifier`` whose base estimator is a
        ``CatBoostClassifier`` created with ``verbose=50``, ``random_state=42``
        and ``early_stopping_rounds=20``.
    """
    # NOTE(review): early_stopping_rounds normally relies on an eval_set, and none
    # is supplied anywhere in this notebook -- confirm it has any effect here.
    catboost_params = {
        "verbose": 50,
        "random_state": 42,
        "early_stopping_rounds": 20,
    }
    base_estimator = CatBoostClassifier(**catboost_params)
    return MultiOutputClassifier(base_estimator)

# Main workflow
# Target columns are named Cluster0 .. Cluster5
target_columns_clusters = ["Cluster" + str(idx) for idx in range(6)]

# Model inputs, grouped by origin; the concatenation keeps the original column order
selected_features_updated = (
    [
        "Time_Period", "Stn Press (kPa)", "Dew Point Temp (°C)", "Rel Hum (%)",
        "Visibility (km)", "Temp (°C)", "Wind Dir (10s deg)", "Wind Spd (km/h)",
    ]
    + ["C0D-1HA", "C0D-2HA", "C0D-4HA"]  # Cluster 0 historical features
    + ["C1D-1HA", "C1D-2HA", "C1D-4HA"]  # Cluster 1 historical features
    + ["C2D-1HA", "C2D-2HA", "C2D-3HA"]  # Cluster 2 historical features
    + ["C3D-1HA", "C4D-1HA", "C4D-2HA", "C5D-1HA"]  # Other cluster features
)
file_path = 'xydata_2021winter.csv'

# Read the CSV, then hold out 30% of the rows as a test set (fixed seed for repeatability)
X, y = load_data(file_path, target_columns_clusters, selected_features_updated)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Build the (still unfitted) multi-output CatBoost model; fitting happens in the grid search
catboost_model = initialize_catboost()

# Hyperparameter search history: earlier grids and their results, kept for reference
# param_grid = {
#     'estimator__learning_rate': [0.025, 0.05, 0.75],
#     'estimator__depth': [7, 8, 9],
#     'estimator__iterations': [50, 100, 300],
#     'estimator__scale_pos_weight': [7, 5, 10],
#     'estimator__l2_leaf_reg': [2, 3, 4]
# }
# Best parameters: {'estimator__depth': 7, 'estimator__iterations': 300, 'estimator__l2_leaf_reg': 2, 'estimator__learning_rate': 0.025, 'estimator__scale_pos_weight': 7}
# param_grid = {
#     'estimator__learning_rate': [0.015, 0.025],
#     'estimator__depth': [6, 7],
#     'estimator__iterations': [300,400],
#     'estimator__scale_pos_weight': [7, 8],
#     'estimator__l2_leaf_reg': [1, 2]
# }
# Best parameters: {'estimator__depth': 7, 'estimator__iterations': 300, 'estimator__l2_leaf_reg': 2, 'estimator__learning_rate': 0.025, 'estimator__scale_pos_weight': 7}
# Best validation score: 0.69
# Test F1 score: 0.70

# Classification report on the test set:
#               precision    recall  f1-score   support

#            0       0.45      0.82      0.58        55
#            1       0.47      0.79      0.59        48
#            2       0.59      0.97      0.73        75
#            3       0.57      0.93      0.71        76
#            4       0.61      0.92      0.73        73
#            5       0.64      0.96      0.76        93

#    micro avg       0.57      0.91      0.70       420
#    macro avg       0.56      0.90      0.69       420
# weighted avg       0.57      0.91      0.70       420
#  samples avg       0.50      0.76      0.58       420

# Active search grid: every hyperparameter is pinned to a single value except
# scale_pos_weight, which is the only one still being compared (6 vs 7).
# The 'estimator__' prefix routes each setting to the estimator wrapped by
# MultiOutputClassifier.
param_grid = dict(
    estimator__learning_rate=[0.025],
    estimator__depth=[7],
    estimator__iterations=[300],
    estimator__scale_pos_weight=[6, 7],
    estimator__l2_leaf_reg=[2],
)


# Perform grid search
# 5-fold cross-validated search over param_grid, scored with weighted F1.
# return_train_score=True is needed for cv_results_ to contain 'mean_train_score';
# it defaults to False, which left the train-score report below unreachable.
grid_search = GridSearchCV(
    catboost_model,
    param_grid,
    cv=5,
    scoring='f1_weighted',
    verbose=1,
    n_jobs=-1,
    return_train_score=True
)
grid_search.fit(X_train, y_train)

# Summarize results
best_params = grid_search.best_params_
best_val_score = grid_search.best_score_
# best_estimator_ is the model refit with the best parameters; score it on the held-out test split
y_test_pred = grid_search.best_estimator_.predict(X_test)
test_f1_score = f1_score(y_test, y_test_pred, average='weighted')

# Calculate train scores
# Report the train score of the selected candidate (row best_index_), so it is
# comparable with best_score_; averaging over all candidates would mix in the
# scores of parameter sets that were not chosen.
mean_train_score = (
    grid_search.cv_results_['mean_train_score'][grid_search.best_index_]
    if 'mean_train_score' in grid_search.cv_results_
    else None
)

# Display results
print("Best parameters:", best_params)
if mean_train_score is not None:
    print(f"Best cross-validation train score: {mean_train_score:.2f}")
print(f"Best validation score: {best_val_score:.2f}")
print(f"Test F1 score: {test_f1_score:.2f}")
print("\nClassification report on the test set:")
print(classification_report(y_test, y_test_pred, zero_division=0))

import joblib

# Persist the refit best model from the grid search so later cells can reload it
joblib.dump(
    value=grid_search.best_estimator_,
    filename='catboost_multioutput_model.pkl',
)
print("Model saved successfully.")


Fitting 5 folds for each of 2 candidates, totalling 10 fits
0:	learn: 0.6817977	total: 54.2ms	remaining: 16.2s
0:	learn: 0.6835206	total: 54.5ms	remaining: 16.3s
0:	learn: 0.6832157	total: 53.9ms	remaining: 16.1s
0:	learn: 0.6844532	total: 56.3ms	remaining: 16.8s
0:	learn: 0.6806648	total: 58ms	remaining: 17.4s
0:	learn: 0.6807851	total: 60.6ms	remaining: 18.1s
0:	learn: 0.6816439	total: 67.4ms	remaining: 20.1s
0:	learn: 0.6826758	total: 77.4ms	remaining: 23.2s
0:	learn: 0.6822063	total: 68.7ms	remaining: 20.6s
0:	learn: 0.6838050	total: 167ms	remaining: 49.9s
50:	learn: 0.4216515	total: 573ms	remaining: 2.79s
50:	learn: 0.4032509	total: 567ms	remaining: 2.77s
50:	learn: 0.4157615	total: 630ms	remaining: 3.08s
50:	learn: 0.4265300	total: 669ms	remaining: 3.27s
50:	learn: 0.3997328	total: 664ms	remaining: 3.24s
50:	learn: 0.4237002	total: 678ms	remaining: 3.31s
50:	learn: 0.4090033	total: 595ms	remaining: 2.9s
50:	learn: 0.4137011	total: 607ms	remaining: 2.96s
50:	learn: 0.3993052	total

In [2]:
import joblib

# Write the grid search's best estimator to disk.
# This requires `grid_search` from the training cell to still be in the kernel.
joblib.dump(
    value=grid_search.best_estimator_,
    filename='catboost_multioutput_model.pkl',
)
print("Model saved successfully.")


Model saved successfully.


In [4]:
import pandas as pd
import joblib
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Load the trained model
# NOTE: joblib.load unpickles the file, which can execute arbitrary code.
# Only load model files written by this notebook or another trusted source.
print("Loading the trained model...")
model = joblib.load('catboost_multioutput_model.pkl')

# Step 2: Load the new dataset
print("Loading the dataset...")
data = pd.read_csv('xydata_2023winter.csv')

# Step 3: Define the feature and target columns
features = [
    "Time_Period", "Stn Press (kPa)", "Dew Point Temp (°C)", "Rel Hum (%)",
    "Visibility (km)", "Temp (°C)", "Wind Dir (10s deg)", "Wind Spd (km/h)",
    "C0D-1HA", "C0D-2HA", "C0D-4HA",  # Cluster 0 historical features
    "C1D-1HA", "C1D-2HA", "C1D-4HA",  # Cluster 1 historical features
    "C2D-1HA", "C2D-2HA", "C2D-3HA",  # Cluster 2 historical features
    "C3D-1HA", "C4D-1HA", "C4D-2HA", "C5D-1HA"  # Other cluster features
]

target_columns = [f"Cluster{i}" for i in range(6)]

# Step 4: Ensure all features are present in the dataset
# Missing features are still filled with 0 (best-effort, as before), but the gap
# is now reported: predictions made on fabricated zeros would otherwise look
# like ordinary results and hide a schema mismatch between the datasets.
missing_features = [name for name in features if name not in data.columns]
if missing_features:
    print(
        f"WARNING: {len(missing_features)} feature column(s) missing from the "
        f"dataset; filling with 0: {missing_features}"
    )
    for feature in missing_features:
        data[feature] = 0  # Add missing features with a default value of 0

# Step 5: Extract the features (X) and target (y)
X = data[features]
y_true = data[target_columns]

# Step 6: Generate predictions
print("Making predictions...")
y_pred = model.predict(X)

# Step 7: Evaluate predictions
# Give the predicted and true label frames the same short column names (C0 .. C5)
y_pred_df = pd.DataFrame(y_pred, columns=["C" + str(i) for i in range(6)])
y_true.columns = list(y_pred_df.columns)

# Per-cluster accuracy, then the unweighted mean across the clusters
cluster_accuracies = [
    accuracy_score(y_true[label], y_pred_df[label])
    for label in y_pred_df.columns
]
overall_accuracy = sum(cluster_accuracies) / len(cluster_accuracies)

print(f"Overall Test Set Accuracy Score (Average of All Clusters): {overall_accuracy:.2f}")

# Per-cluster precision / recall / F1 plus the averaged rows
print("\nClassification Report:")
print(
    classification_report(
        y_true,
        y_pred_df,
        target_names=y_pred_df.columns,
        zero_division=0,
    )
)

# Step 8: Show a comparison of predictions and true labels for the first 5 examples
print("\nFirst 5 Predictions vs. True Labels:")

# Place the first five predicted rows next to the first five true rows; the
# two halves are distinguished by the outer column level ("Predicted" / "True")
comparison = pd.concat(
    [y_pred_df.head(5), y_true.head(5).reset_index(drop=True)],
    axis=1,
    keys=["Predicted", "True"]
)

# Render each row as a header, one "Predicted/True" line per cluster, and a
# closing separator, then assemble the whole text in a single join
separator = "\n" + "-" * 50 + "\n"
formatted_output = "".join(
    f"Row {index + 1}{separator}"
    + "".join(
        f"{col}: Predicted={row[('Predicted', col)]}, True={row[('True', col)]}\n"
        for col in y_pred_df.columns
    )
    + separator
    for index, row in comparison.iterrows()
)

print(formatted_output)


Loading the trained model...
Loading the dataset...
Making predictions...
Overall Test Set Accuracy Score (Average of All Clusters): 0.63

Classification Report:
              precision    recall  f1-score   support

          C0       0.50      0.87      0.63       195
          C1       0.41      0.70      0.51       155
          C2       0.61      0.97      0.75       257
          C3       0.60      0.94      0.73       256
          C4       0.59      0.93      0.72       244
          C5       0.63      0.97      0.76       283

   micro avg       0.57      0.92      0.70      1390
   macro avg       0.55      0.90      0.68      1390
weighted avg       0.57      0.92      0.70      1390
 samples avg       0.50      0.77      0.57      1390


First 5 Predictions vs. True Labels:
Row 1
--------------------------------------------------
C0: Predicted=1, True=1
C1: Predicted=0, True=1
C2: Predicted=1, True=1
C3: Predicted=1, True=0
C4: Predicted=1, True=1
C5: Predicted=1, True=0

-

In [5]:
# Reload the persisted model from disk
# (relies on `joblib` and `pd` already being imported by an earlier cell)
loaded_model = joblib.load('catboost_multioutput_model.pkl')
print("Model loaded successfully.")

# Make predictions with new data (replace `new_data` with your new input data)
# A single example row, given as one record keyed by feature name
new_data = pd.DataFrame([{
    # Replace with your new data features
    "Time_Period": 1,
    "Stn Press (kPa)": 100.5,
    "Dew Point Temp (°C)": 5.0,
    "Rel Hum (%)": 60,
    "Visibility (km)": 10,
    "Temp (°C)": 15,
    "Wind Dir (10s deg)": 3,
    "Wind Spd (km/h)": 5,
    "C0D-1HA": 0.1,
    "C0D-2HA": 0.2,
    "C0D-4HA": 0.3,
    "C1D-1HA": 0.1,
    "C1D-2HA": 0.2,
    "C1D-4HA": 0.3,
    "C2D-1HA": 0.1,
    "C2D-2HA": 0.2,
    "C2D-3HA": 0.3,
    "C3D-1HA": 0.1,
    "C4D-1HA": 0.2,
    "C4D-2HA": 0.3,
    "C5D-1HA": 0.1,
}])

predictions = loaded_model.predict(new_data)
print("Predictions on new data:", predictions, sep="\n")


Model loaded successfully.
Predictions on new data:
[[1 0 1 1 1 1]]
