In [None]:
# some library imports
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, Normalizer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier

# Load both CSV extracts into DataFrames (year ranges as given by the file names).
data1718 = pd.read_csv(filepath_or_buffer='xydata_2017to2018.csv')
data19 = pd.read_csv(filepath_or_buffer='xydata_2019.csv')

In [2]:
# # Adjust pandas options to display all columns
# pd.set_option('display.max_columns', None)

# # Display the first two rows of data1718 with all columns
# print(data1718.head(2))
# data1718.info()

In [3]:
# Target columns: one label column per cluster, Cluster0 through Cluster5
target_columns_clusters = ["Cluster{}".format(i) for i in range(6)]
print(target_columns_clusters)

# Features are every column except the targets and the "Date" column
columns_to_drop = [*target_columns_clusters, "Date"]
y = data1718.loc[:, target_columns_clusters]  # Targets (one column per cluster)
X = data1718.drop(columns=columns_to_drop)  # Features

# Show the first two feature rows as the cell output
print("First few rows of X:")
X.head(2)

['Cluster0', 'Cluster1', 'Cluster2', 'Cluster3', 'Cluster4', 'Cluster5']
First few rows of X:


Unnamed: 0,Time_Period,C0D-1HA,C0D-2HA,C0D-3HA,C0D-4HA,C0D-5HA,C0D-6HA,C0D-7HA,C0D-8HA,C0D-9HA,...,Temp (°C),Dew Point Temp (°C),Rel Hum (%),Wind Dir (10s deg),Wind Spd (km/h),Visibility (km),Stn Press (kPa),Day_Of_Week,Is_Weekend,Is_Holiday
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.085667,-14.783333,79.833333,18.666667,25.5,8.116667,88.966667,6,1,1
1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.005667,-16.1,70.833333,24.333333,16.666667,21.166667,89.108333,6,1,1


In [4]:
# Preview the first two rows of the target frame as the cell output
print("First few rows of y:")
y.iloc[:2]

First few rows of y:


Unnamed: 0,Cluster0,Cluster1,Cluster2,Cluster3,Cluster4,Cluster5
0,0,0,0,0,0,0
1,0,0,0,0,0,0


In [5]:
# Split features and targets into train and test partitions.
# NOTE(review): stratify=y is given the full multi-column target; presumably
# scikit-learn then stratifies on each distinct row of labels -- confirm
# against the train_test_split documentation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,    # 30% of the rows are held out for testing
    random_state=42,  # fixed seed so the split is repeatable
    stratify=y,
)

In [None]:
# Pipeline: a scaling step followed by a multi-output wrapper around SVC,
# so that all cluster columns in y are predicted by a single estimator object.
pipe = Pipeline([
    ('scaler', StandardScaler()),  # Step 1: scaling (the grid below swaps this step)
    ('classifier', MultiOutputClassifier(SVC()))  # Step 2: Multi-label SVC classifier
])

# Hyperparameter grid: scaler choice (None disables scaling) x C x gamma, RBF kernel only
param_grid = {
    'scaler': [StandardScaler(), RobustScaler(), None],  # Scaler options
    'classifier__estimator__C': [0.01, 0.1, 1, 10, 100],  # Regularization parameter C
    'classifier__estimator__gamma': [0.001, 0.01, 0.1, 1],  # Kernel coefficient gamma
    'classifier__estimator__kernel': ['rbf']  # Radial Basis Function kernel
}

# 5-fold cross-validated grid search; train scores are kept so they can be
# compared with the validation score of the selected candidate.
grid = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy', return_train_score=True)
grid.fit(X_train, y_train)

# Report the train score of the *selected* candidate (row best_index_), so it is
# directly comparable with best_score_. Taking max() over all candidates could
# report the train score of a different, possibly overfitted, parameter set.
best_train_score = grid.cv_results_['mean_train_score'][grid.best_index_]
print("Best params:\n{}\n".format(grid.best_params_))
print("Best cross-validation train score: {:.2f}".format(best_train_score))
print("Best cross-validation validation score: {:.2f}".format(grid.best_score_))
print("Test-set score: {:.2f}".format(grid.score(X_test, y_test)))

# Predict the labels for X_test (array with one column per cluster)
y_pred = grid.predict(X_test)

# Evaluate each cluster individually (multi-label confusion matrix)
print("Confusion Matrix and Classification Report per Cluster:")
for i in range(y_test.shape[1]):  # Loop through each cluster
    # y_test is a DataFrame, so its columns must be selected by position with
    # .iloc; y_test[:, i] would be treated as a column-label lookup and fail.
    y_true_i = y_test.iloc[:, i]
    y_pred_i = y_pred[:, i]

    print(f"\nCluster {i}:")
    conf_matrix = confusion_matrix(y_true_i, y_pred_i)
    print("Confusion Matrix:")
    print(conf_matrix)
    print("\nClassification Report:")
    print(classification_report(y_true_i, y_pred_i))


In [None]:
# RandomForestClassifier is already imported in the notebook's import cell,
# so it is not re-imported here.

# Pipeline: a scaling step followed by a multi-output wrapper around a
# random forest, so that all cluster columns in y are predicted together.
pipe = Pipeline([
    ('scaler', StandardScaler()),  # Standardize the features
    ('classifier', MultiOutputClassifier(RandomForestClassifier(random_state=42)))  # Multi-label Random Forest
])

# Hyperparameter grid for the forest wrapped inside the multi-output classifier
param_grid = {
    'classifier__estimator__n_estimators': [50, 100, 150],  # Number of trees
    'classifier__estimator__max_depth': [None, 10, 20],  # Tree depth
    'classifier__estimator__min_samples_split': [2, 5, 10]  # Minimum samples to split
}

# 5-fold cross-validated grid search; train scores are kept so they can be
# compared with the validation score of the selected candidate.
grid = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy', return_train_score=True)
grid.fit(X_train, y_train)

# Report the train score of the *selected* candidate (row best_index_), so it is
# directly comparable with best_score_. Taking max() over all candidates could
# report the train score of a different, possibly overfitted, parameter set.
best_train_score = grid.cv_results_['mean_train_score'][grid.best_index_]
print("Best params:\n{}\n".format(grid.best_params_))
print("Best cross-validation train score: {:.2f}".format(best_train_score))
print("Best cross-validation validation score: {:.2f}".format(grid.best_score_))
print("Test-set score: {:.2f}".format(grid.score(X_test, y_test)))

# Predict the labels for X_test (array with one column per cluster)
y_pred = grid.predict(X_test)

# Evaluate each cluster individually (multi-label confusion matrix)
print("Confusion Matrix and Classification Report per Cluster:")
for i in range(y_test.shape[1]):  # Loop through each cluster
    # y_test is a DataFrame, so its columns must be selected by position with
    # .iloc; y_test[:, i] would be treated as a column-label lookup and fail.
    y_true_i = y_test.iloc[:, i]
    y_pred_i = y_pred[:, i]

    print(f"\nCluster {i}:")
    conf_matrix = confusion_matrix(y_true_i, y_pred_i)
    print("Confusion Matrix:")
    print(conf_matrix)
    print("\nClassification Report:")
    print(classification_report(y_true_i, y_pred_i))
