In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report, f1_score
from catboost import CatBoostClassifier
import joblib

# Function to load data and split into features and target
def load_data(file_path, target_columns, selected_features):
    """Read a CSV file and return its feature and target columns separately.

    Args:
        file_path: Location of the CSV file; passed straight to ``pd.read_csv``.
        target_columns: Column label(s) to select as the prediction targets.
        selected_features: Column label(s) to select as the model inputs.

    Returns:
        A ``(X, y)`` tuple: the selected feature columns first, then the
        selected target columns, both taken from the same loaded frame.

    Raises:
        KeyError: Propagated from pandas when a requested column is absent.
    """
    frame = pd.read_csv(file_path)
    return frame[selected_features], frame[target_columns]

# Function to initialize a tuned CatBoost classifier within MultiOutputClassifier
def initialize_tuned_catboost():
    """Build the unfitted multi-output model used by the training workflow.

    The hyper-parameters below are fixed; the function takes no arguments.

    Returns:
        A ``MultiOutputClassifier`` wrapping a single ``CatBoostClassifier``
        configured with the settings listed in ``catboost_params``.
    """
    catboost_params = {
        "verbose": 50,                 # progress line every 50 iterations
        "random_state": 42,            # fixed seed for repeatable training
        # NOTE(review): early stopping presumably needs a validation set
        # passed to fit(); none is supplied by the caller here, so this
        # setting may have no effect -- confirm against the CatBoost docs.
        "early_stopping_rounds": 20,
        "learning_rate": 0.025,
        "depth": 7,
        "iterations": 300,
        "scale_pos_weight": 7,         # up-weights the positive class
        "l2_leaf_reg": 2,
    }
    base_estimator = CatBoostClassifier(**catboost_params)
    return MultiOutputClassifier(base_estimator)

# Main workflow
if __name__ == "__main__":
    # Define file path, features, and target columns
    file_path = 'xydata_2021winter.csv'
    target_columns_clusters = [f"Cluster{i}" for i in range(6)]
    selected_features_updated = [
        "Time_Period", "Stn Press (kPa)", "Dew Point Temp (°C)", "Rel Hum (%)",
        "Visibility (km)", "Temp (°C)", "Wind Dir (10s deg)", "Wind Spd (km/h)",
        "C0D-1HA", "C0D-2HA", "C0D-4HA",  # Cluster 0 historical features
        "C1D-1HA", "C1D-2HA", "C1D-4HA",  # Cluster 1 historical features
        "C2D-1HA", "C2D-2HA", "C2D-3HA",  # Cluster 2 historical features
        "C3D-1HA", "C4D-1HA", "C4D-2HA", "C5D-1HA"  # Other cluster features
    ]

    # Load the data and hold out 30% of the rows for testing.
    # NOTE(review): the split is random; if rows are time-ordered and the
    # "historical" features are lags of the targets, neighbouring rows may
    # leak between train and test -- confirm whether a chronological split
    # is more appropriate.
    X, y = load_data(file_path, target_columns_clusters, selected_features_updated)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Initialize and train the model
    print("Training the tuned CatBoost model...")
    tuned_catboost_model = initialize_tuned_catboost()
    tuned_catboost_model.fit(X_train, y_train)

    # Evaluate the model on the held-out rows. zero_division=0 is passed to
    # both metrics so labels with no predicted positives are scored the same
    # way (as 0) in the summary score and in the detailed report.
    print("Evaluating the model...")
    y_test_pred = tuned_catboost_model.predict(X_test)
    test_f1_score = f1_score(y_test, y_test_pred, average='weighted', zero_division=0)
    print(f"Test F1 Score: {test_f1_score:.2f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_test_pred, zero_division=0))

    # Save the fitted multi-output model. This lives inside the __main__
    # guard because tuned_catboost_model only exists when the workflow above
    # has actually run; at module level it would raise NameError on import.
    joblib.dump(tuned_catboost_model, 'catboost_multioutput_model.pkl')
    print("Model saved successfully.")


Training the tuned CatBoost model...
0:	learn: 0.6827533	total: 54ms	remaining: 16.1s
50:	learn: 0.4220702	total: 120ms	remaining: 587ms
100:	learn: 0.3304941	total: 179ms	remaining: 353ms
150:	learn: 0.2741036	total: 255ms	remaining: 251ms
200:	learn: 0.2276690	total: 332ms	remaining: 163ms
250:	learn: 0.1873314	total: 399ms	remaining: 77.9ms
299:	learn: 0.1571405	total: 462ms	remaining: 0us
0:	learn: 0.6824584	total: 3.62ms	remaining: 1.08s
50:	learn: 0.4019573	total: 82.6ms	remaining: 403ms
100:	learn: 0.2962517	total: 212ms	remaining: 417ms
150:	learn: 0.2362105	total: 334ms	remaining: 330ms
200:	learn: 0.1913557	total: 477ms	remaining: 235ms
250:	learn: 0.1495897	total: 620ms	remaining: 121ms
299:	learn: 0.1172553	total: 751ms	remaining: 0us
0:	learn: 0.6759461	total: 2.97ms	remaining: 887ms
50:	learn: 0.3477279	total: 53.3ms	remaining: 260ms
100:	learn: 0.2794743	total: 103ms	remaining: 203ms
150:	learn: 0.2389112	total: 152ms	remaining: 150ms
200:	learn: 0.2097690	total: 206ms	r

In [2]:
import pandas as pd
import joblib
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Load the trained model
# NOTE: joblib.load unpickles the file, which can execute arbitrary code.
# Only load model files produced by a trusted training run.
print("Loading the trained model...")
model = joblib.load('catboost_multioutput_model.pkl')

# Step 2: Load the new dataset
print("Loading the dataset...")
data = pd.read_csv('xydata_2023winter.csv')

# Step 3: Define the feature and target columns
# The feature list must match, in order, the one the model was trained with.
features = [
    "Time_Period", "Stn Press (kPa)", "Dew Point Temp (°C)", "Rel Hum (%)",
    "Visibility (km)", "Temp (°C)", "Wind Dir (10s deg)", "Wind Spd (km/h)",
    "C0D-1HA", "C0D-2HA", "C0D-4HA",  # Cluster 0 historical features
    "C1D-1HA", "C1D-2HA", "C1D-4HA",  # Cluster 1 historical features
    "C2D-1HA", "C2D-2HA", "C2D-3HA",  # Cluster 2 historical features
    "C3D-1HA", "C4D-1HA", "C4D-2HA", "C5D-1HA"  # Other cluster features
]

target_columns = [f"Cluster{i}" for i in range(6)]

# Step 4: Ensure all features are present in the dataset
# Missing features are still filled with 0 so prediction can proceed, but the
# substitution is reported: a constant-0 column silently degrades predictions
# and usually means the dataset schema differs from the training data.
missing_features = [feature for feature in features if feature not in data.columns]
if missing_features:
    print(
        f"Warning: {len(missing_features)} feature column(s) missing from the dataset; "
        f"filling with 0: {missing_features}"
    )
for feature in missing_features:
    data[feature] = 0  # Add missing features with a default value of 0

# Step 5: Extract the features (X) and target (y)
X = data[features]
# Short display labels (C0, C1, ...) derived from the number of targets.
cluster_labels = [f"C{i}" for i in range(len(target_columns))]
# Rename on an explicit copy so the loaded frame keeps its original headers.
y_true = data[target_columns].copy()
y_true.columns = cluster_labels

# Step 6: Generate predictions
print("Making predictions...")
y_pred = model.predict(X)

# Step 7: Evaluate predictions
# Wrap the predictions in a frame that uses the same short labels as y_true.
y_pred_df = pd.DataFrame(y_pred, columns=cluster_labels)

# Calculate the accuracy score (mean accuracy across all clusters)
cluster_accuracies = [
    accuracy_score(y_true[col], y_pred_df[col]) for col in y_pred_df.columns
]
overall_accuracy = sum(cluster_accuracies) / len(cluster_accuracies)

print(f"Overall Test Set Accuracy Score (Average of All Clusters): {overall_accuracy:.2f}")

# Display a detailed classification report
print("\nClassification Report:")
print(classification_report(y_true, y_pred_df, target_names=y_pred_df.columns, zero_division=0))

# Step 8: Show a comparison of predictions and true labels for the first 5 examples
print("\nFirst 5 Predictions vs. True Labels:")

# Concatenate predictions and true values for comparison; the true labels are
# re-indexed from 0 so both halves align row by row.
comparison = pd.concat(
    [y_pred_df.head(5), y_true.head(5).reset_index(drop=True)],
    axis=1,
    keys=["Predicted", "True"]
)

# Format and display predictions and true labels side by side
separator = "\n" + "-" * 50 + "\n"
output_parts = []

for index, row in comparison.iterrows():
    output_parts.append(f"Row {index + 1}{separator}")
    output_parts.extend(
        f"{col}: Predicted={row[('Predicted', col)]}, True={row[('True', col)]}\n"
        for col in y_pred_df.columns
    )
    output_parts.append(separator)

formatted_output = "".join(output_parts)
print(formatted_output)


Loading the trained model...
Loading the dataset...
Making predictions...
Overall Test Set Accuracy Score (Average of All Clusters): 0.63

Classification Report:
              precision    recall  f1-score   support

          C0       0.50      0.87      0.63       195
          C1       0.41      0.70      0.51       155
          C2       0.61      0.97      0.75       257
          C3       0.60      0.94      0.73       256
          C4       0.59      0.93      0.72       244
          C5       0.63      0.97      0.76       283

   micro avg       0.57      0.92      0.70      1390
   macro avg       0.55      0.90      0.68      1390
weighted avg       0.57      0.92      0.70      1390
 samples avg       0.50      0.77      0.57      1390


First 5 Predictions vs. True Labels:
Row 1
--------------------------------------------------
C0: Predicted=1, True=1
C1: Predicted=0, True=1
C2: Predicted=1, True=1
C3: Predicted=1, True=0
C4: Predicted=1, True=1
C5: Predicted=1, True=0

-