<a href="https://colab.research.google.com/github/cathyai0320/A.I./blob/main/partb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**COMP 247 Supervised Learning**
*   Group 2
*   Part B
*   Predict whether an incident results in a fatality

**Members (alphabetically)**
*   Boonluea, Chinnawut 301276464
*   Chan, Kai Chung     301321990
*   Mak, Chung Ping     301281670
*   Pequino, Catherine  301308416
*   Yurderi, Emre       301270260

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
import datetime as dt
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif, RFE
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import joblib
import matplotlib.ticker as ticker
from scipy.stats import chi2_contingency
from imblearn.over_sampling import SMOTE, SMOTEN, SMOTENC
warnings.filterwarnings('ignore')

In [None]:
def drop_rows_without_target_value(data, target_column):
    """Remove, in place, every row whose target value is missing.

    Args:
        data: DataFrame to clean; it is modified in place.
        target_column: Name of the column that holds the target.

    Returns:
        The same ``data`` object, without the rows that lacked a target.
    """
    required = [target_column]
    data.dropna(axis=0, subset=required, inplace=True)
    return data

In [None]:
def transform_age(data, column):
    """Replace age-band labels with the band midpoint and impute the gaps.

    A label such as ``'20 to 24'`` becomes 22 and ``'Over 95'`` becomes 97.
    Any value that is not a known band (including missing values) maps to
    NaN and is then filled with the mean of the successfully mapped ages.

    Args:
        data: DataFrame to transform; the column is overwritten in place.
        column: Name of the column holding the age-band labels.

    Returns:
        The same ``data`` object with ``column`` converted to numbers.
    """
    # Five-year bands '0 to 4' ... '90 to 94', each mapped to its midpoint.
    age_mapping = {f'{low} to {low + 4}': low + 2 for low in range(0, 95, 5)}
    age_mapping['Over 95'] = 97

    mapped = data[column].map(age_mapping)
    # Assign the filled Series back instead of calling fillna(inplace=True) on
    # data[column]: that chained form acts on a temporary under pandas
    # Copy-on-Write and would leave the NaNs in place.
    data[column] = mapped.fillna(mapped.mean())
    return data

In [None]:
def impute_na_others(data, column):
    """Fill the missing values of ``column`` with the label ``"others"``.

    Args:
        data: DataFrame to transform; the column is overwritten in place.
        column: Name of the column to impute.

    Returns:
        The same ``data`` object with no missing values left in ``column``.
    """
    # Assign back rather than using fillna(inplace=True) on data[column]:
    # the chained form is a no-op under pandas Copy-on-Write.
    data[column] = data[column].fillna("others")
    return data

In [None]:
def calculate_fatality_ratio(df):
    """Add a ``FATALITY_RATE`` column holding each district's share of fatal rows.

    For every ``DISTRICT`` value the rate is the number of rows whose
    ``ACCLASS`` equals ``'Fatal'`` divided by the number of rows with a
    non-missing ``ACCLASS``. ``ACCLASS`` must still hold its text labels
    when this runs.

    NOTE(review): the rate is computed from the target column of the frame it
    is given. If that frame is later split into train and test sets, the
    feature carries target information from the test rows; confirm this is
    acceptable.

    Args:
        df: DataFrame with ``DISTRICT`` and ``ACCLASS`` columns; modified in place.

    Returns:
        The same ``df`` object with the ``FATALITY_RATE`` column added.
    """
    districts = df['DISTRICT']

    # Rows with a known class, and rows classed as fatal, per district.
    totals = df['ACCLASS'].groupby(districts).count()
    fatals = df['ACCLASS'].eq('Fatal').groupby(districts).sum()

    # Broadcast the per-district rate back onto every row.
    df["FATALITY_RATE"] = districts.map(fatals / totals)
    return df

In [None]:
def select_column(df, columns):
    """Return the subset of ``df`` given by ``columns``, in that order.

    Args:
        df: Source DataFrame; it is not modified.
        columns: Column labels to keep.

    Returns:
        The result of indexing ``df`` with ``columns``.
    """
    return df[columns]

In [None]:
def transform_loccord(data):
    """Fill the gaps in ``ACCLOC`` from ``LOCCOORD``, then with ``"Unknown"``.

    A missing ``ACCLOC`` takes the row's ``LOCCOORD`` value; if that is
    missing too, it becomes the literal ``"Unknown"``.

    Args:
        data: DataFrame with ``ACCLOC`` and ``LOCCOORD`` columns; modified in place.

    Returns:
        The same ``data`` object with ``ACCLOC`` fully populated.
    """
    filled = data['ACCLOC'].fillna(data['LOCCOORD']).fillna("Unknown")
    data['ACCLOC'] = filled
    return data

In [None]:
def transform_target_to_binary(data, target_column):
    """Encode the target as 1 for ``'Fatal'`` and 0 for every other value.

    Args:
        data: DataFrame to transform; the column is overwritten in place.
        target_column: Name of the column holding the class labels.

    Returns:
        The same ``data`` object with ``target_column`` as 0/1 integers.
    """
    is_fatal = data[target_column].eq('Fatal')
    data[target_column] = is_fatal.astype('int64')
    return data

In [None]:
def transform_to_binary(data, columns):
    """Turn Yes/No flag columns into 1/0, treating missing values as ``"No"``.

    Args:
        data: DataFrame to transform; the listed columns are overwritten in place.
        columns: Names of the flag columns to convert.

    Returns:
        The same ``data`` object with the listed columns converted.
    """
    yes_no_codes = {'Yes': 1, 'No': 0}
    for name in columns:
        data[name] = data[name].fillna("No").replace(yes_no_codes)
    return data

In [None]:
def label_encode(data, columns):
    """Replace each listed column with integer codes from a ``LabelEncoder``.

    Args:
        data: DataFrame to transform; the listed columns are overwritten in place.
        columns: Names of the columns to encode.

    Returns:
        The same ``data`` object with the listed columns encoded.
    """
    for name in columns:
        # fit_transform refits from scratch on every call, so a fresh encoder
        # per column yields the same codes as one reused encoder.
        data[name] = LabelEncoder().fit_transform(data[name])
    return data

In [None]:
# Preprocessing pipeline for the raw frame. The steps run in the order listed.
# Every FunctionTransformer step except 'select_column' edits the frame it
# receives in place and returns that same frame.
input_pipeline = Pipeline([
    # Rows with no ACCLASS value are removed first.
    ('drop_rows_without_target_value', FunctionTransformer(drop_rows_without_target_value, kw_args={'target_column': 'ACCLASS'})),
    # Missing DISTRICT values become "others".
    ('impute_na_others', FunctionTransformer(impute_na_others, kw_args={'column': 'DISTRICT'})),
    # Must run before ACCLASS is made binary: it compares ACCLASS to 'Fatal'.
    ('calculate_fatality_ratio',FunctionTransformer(calculate_fatality_ratio)),
    # ACCLOC gaps are filled from LOCCOORD, then with "Unknown".
    ('transform_loccord', FunctionTransformer(transform_loccord)),
    # ACCLASS becomes 1 for 'Fatal' and 0 otherwise.
    ('transform_target_to_binary', FunctionTransformer(transform_target_to_binary, kw_args={'target_column': 'ACCLASS'})),
    # Yes/No flag columns become 1/0 (missing counts as "No").
    ('transform_to_binary', FunctionTransformer(transform_to_binary, kw_args={'columns': ['PEDESTRIAN', 'CYCLIST', 'TRUCK', 'TRSN_CITY_VEH', 'SPEEDING']})),
    # Age-band labels become band midpoints; gaps get the mean.
    ('transform_age', FunctionTransformer(transform_age, kw_args={'column': 'INVAGE'})),
    # Keep only the target and the chosen predictors.
    ('select_column', FunctionTransformer(select_column, kw_args={'columns': ["ACCLASS", "INVAGE", "TRSN_CITY_VEH", "DISTRICT", "CYCLIST", "PEDESTRIAN", "SPEEDING", "ACCLOC", "TRAFFCTL", "TRUCK", "INVTYPE", "HOOD_158", "FATALITY_RATE"]})),
    # Impute the remaining categorical gaps. The output puts the imputed
    # columns first (INVTYPE, TRAFFCTL, HOOD_158) and then the passthrough
    # columns in their selected order; code that re-labels the output must
    # use that order.
    ('column_impute', ColumnTransformer([
      ('impute_others', SimpleImputer(strategy='constant',fill_value='others'), ['INVTYPE', 'TRAFFCTL']),
      ('impute_most_frequent', SimpleImputer(strategy='most_frequent'), ['HOOD_158'])
    ], remainder='passthrough')),
    # Disabled step: label encoding inside the pipeline.
    # ('label_encode', FunctionTransformer(label_encode, kw_args={'columns': [
    #     "HOOD_158",
    #     # "INVTYPE",
    #     # "DISTRICT",
    #     # "TRAFFCTL",
    #     # "ACCLOC"
    # ]}))
])

In [None]:
# Read the KSI CSV export into a DataFrame.
path = '/content/KSI.csv'
df = pd.read_csv(filepath_or_buffer=path)

In [None]:
# Run the preprocessing pipeline over the raw frame.
# NOTE: the pipeline's helper steps edit the frame they receive in place, so
# `df` itself is changed by this call (rows dropped, ACCLASS turned into 0/1).
# Reload `df` before running this cell again; otherwise the steps would
# operate on already-transformed values.
preprocessed_data = input_pipeline.fit_transform(df)

In [None]:
# Show the first transformed row; the order of its values is the order the
# column names must follow when the array is turned back into a DataFrame.
print(preprocessed_data[0])

['Driver' 'Traffic Signal' '88' 1 42.40340179717587 0
 'Toronto and East York' 0 1 0 'At Intersection' 0 0.10530612244897959]


In [None]:
# Rebuild a labelled DataFrame from the pipeline's array output.
# The names are listed in the array's column order: the three imputed columns
# first, then the remaining selected columns.
preprocessed_df = pd.DataFrame(
    data=preprocessed_data,
    columns=[
        "INVTYPE",
        "TRAFFCTL",
        "HOOD_158",
        "ACCLASS",
        "INVAGE",
        "TRSN_CITY_VEH",
        "DISTRICT",
        "CYCLIST",
        "PEDESTRIAN",
        "SPEEDING",
        "ACCLOC",
        "TRUCK",
        "FATALITY_RATE",
    ],
)

In [None]:
# Cast every column to its final dtype and label-encode the categorical ones.
for column in ('INVAGE', 'FATALITY_RATE'):
    preprocessed_df[column] = preprocessed_df[column].astype(float)

for column in ('ACCLASS', 'PEDESTRIAN', 'CYCLIST', 'TRUCK', 'TRSN_CITY_VEH', 'SPEEDING'):
    preprocessed_df[column] = preprocessed_df[column].astype(int)

encoded_features = ["HOOD_158","INVTYPE","DISTRICT","TRAFFCTL","ACCLOC"]

# Keep one fitted encoder per column. Refitting a single shared encoder would
# discard every mapping except the last, leaving no way to encode new inputs
# (or decode predictions) consistently with the training data.
label_encoders = {}
for column in encoded_features:
    label_encoder = LabelEncoder()
    preprocessed_df[column] = label_encoder.fit_transform(preprocessed_df[column])
    label_encoders[column] = label_encoder

In [None]:
# Separate the predictors from the target column.
X = preprocessed_df.drop(columns=["ACCLASS"])
y = preprocessed_df["ACCLASS"]

In [None]:
# Print the dtype and non-null count of every predictor column.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18189 entries, 0 to 18188
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   INVTYPE        18189 non-null  int64  
 1   TRAFFCTL       18189 non-null  int64  
 2   HOOD_158       18189 non-null  int64  
 3   INVAGE         18189 non-null  float64
 4   TRSN_CITY_VEH  18189 non-null  int64  
 5   DISTRICT       18189 non-null  int64  
 6   CYCLIST        18189 non-null  int64  
 7   PEDESTRIAN     18189 non-null  int64  
 8   SPEEDING       18189 non-null  int64  
 9   ACCLOC         18189 non-null  int64  
 10  TRUCK          18189 non-null  int64  
 11  FATALITY_RATE  18189 non-null  float64
dtypes: float64(2), int64(10)
memory usage: 1.7 MB


In [None]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class
# ratio the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Oversample the minority class on the training split only.
# Most predictors are integer category codes (label-encoded columns and 0/1
# flags). Plain SMOTE would interpolate between those codes and produce
# categories that do not exist, so SMOTENC is told which columns are
# categorical and interpolates only the continuous ones.
continuous_features = ['INVAGE', 'FATALITY_RATE']
categorical_mask = ~X_train.columns.isin(continuous_features)

# The variable name is kept so that any later cell referring to it still works.
smote_numerical = SMOTENC(
    categorical_features=categorical_mask,
    sampling_strategy='auto',
    random_state=42,
)
X_train_resampled, y_train_resampled = smote_numerical.fit_resample(X_train, y_train)

In [None]:
# Logistic Regression and GridSearchCV
# The grid is a list of sub-grids so that only solver/penalty pairs that
# scikit-learn supports are tried. A single cross-product grid also asks for
# e.g. lbfgs + l1, and the string 'None' is not a valid penalty at all; with
# warnings silenced those fits fail quietly and just waste search time.
c_values = [0.1, 1.0, 10.0]
param_grid_logistic = [
    {'solver': ['liblinear', 'saga'], 'penalty': ['l1', 'l2'], 'C': c_values},
    {'solver': ['lbfgs', 'sag'], 'penalty': ['l2'], 'C': c_values},
    # No regularisation: C is ignored, so it is not varied here.
    # (penalty=None requires scikit-learn >= 1.2.)
    {'solver': ['lbfgs', 'sag', 'saga'], 'penalty': [None]},
]

logistic_regression_model = LogisticRegression()

grid_search_logistic = GridSearchCV(logistic_regression_model, param_grid=param_grid_logistic, cv=5)
grid_search_logistic.fit(X_train_resampled, y_train_resampled)

best_params_logistic = grid_search_logistic.best_params_
print("Best Hyperparameters for Logistic Regression:")
print(best_params_logistic)

# Evaluate on the untouched (not oversampled) test set
y_pred_logistic = grid_search_logistic.predict(X_test)
accuracy_logistic = accuracy_score(y_test, y_pred_logistic)
# Precision and recall are reported as percentages; accuracy is a fraction.
precision_logistic = precision_score(y_test, y_pred_logistic, average="macro")*100
recall_logistic = recall_score(y_test, y_pred_logistic, average="macro")*100
print(f"Logistic Regression Accuracy: {accuracy_logistic:.4f}")
print(f"Logistic Regression Precision: {precision_logistic:.4f}")
print(f"Logistic Regression Recall: {recall_logistic:.4f}")
print('Confusion Matrix: ')
# Left as the final expression so the notebook displays the matrix.
confusion_matrix(y_test, y_pred_logistic)

Best Hyperparameters for Logistic Regression:
{'C': 1.0, 'penalty': 'l1', 'solver': 'liblinear'}
Logistic Regression Accuracy: 0.5313
Logistic Regression Precision: 50.6736
Logistic Regression Recall: 51.3792
Confusion Matrix: 


array([[1681, 1442],
       [ 263,  252]])

In [None]:
# Persist the tuned Logistic Regression estimator and its hyperparameters.
# NOTE(review): joblib writes pickle data, not HDF5; the '.h5' names are kept
# unchanged because other code may load these exact files.
best_logistic_params = grid_search_logistic.best_params_
best_logistic_model = grid_search_logistic.best_estimator_
joblib.dump(value=best_logistic_model, filename='best_logistic_model.h5')
joblib.dump(value=best_logistic_params, filename='best_logistic_params.h5')

['best_logistic_params.h5']

In [None]:
# Decision Tree model
# Seeded so the random splitter and feature sub-sampling are repeatable.
decision_tree_model = DecisionTreeClassifier(random_state=42)

param_grid = {
    'criterion': ['gini', 'entropy'], # The function to measure the quality of a split
    'splitter': ['best', 'random'],   # The strategy used to choose the split at each node
    'max_depth': np.arange(1, 35),    # The maximum depth of the tree
    'min_samples_split': np.arange(2, 25),    # The minimum number of samples required to split an internal node
    'min_samples_leaf': np.arange(1, 30),     # The minimum number of samples required to be at a leaf node
    # 'auto' is intentionally absent: it was an alias of 'sqrt' for
    # classifiers and has been removed from scikit-learn, where it now makes
    # every candidate that draws it fail to fit.
    'max_features': ['sqrt', 'log2', None]    # The number of features to consider when looking for the best split
}

# random_state makes the sampled candidates (and so the result) reproducible.
randomized_search_decision_tree = RandomizedSearchCV(
    decision_tree_model,
    param_distributions=param_grid,
    n_iter=100,
    cv=5,
    random_state=42,
)
randomized_search_decision_tree.fit(X_train_resampled, y_train_resampled)

best_params_randomized_decision_tree = randomized_search_decision_tree.best_params_
print("Best Parameters for Decision Tree (Randomized Search):")
print(best_params_randomized_decision_tree)

# Evaluate on the test set
y_pred_randomized_decision_tree = randomized_search_decision_tree.predict(X_test)
accuracy_randomized_decision_tree = accuracy_score(y_test, y_pred_randomized_decision_tree)
# Precision and recall are reported as percentages; accuracy is a fraction.
precision_randomized_decision_tree = precision_score(y_test, y_pred_randomized_decision_tree, average="macro")*100
recall_randomized_decision_tree = recall_score(y_test, y_pred_randomized_decision_tree, average="macro")*100
print(f"Randomized Decision Tree Accuracy: {accuracy_randomized_decision_tree:.4f}")
print(f"Randomized Decision Tree Precision: {precision_randomized_decision_tree:.4f}")
print(f"Randomized Decision Tree Recall: {recall_randomized_decision_tree:.4f}")

Best Parameters for Decision Tree (Randomized Search):
{'splitter': 'best', 'min_samples_split': 11, 'min_samples_leaf': 1, 'max_features': None, 'max_depth': 24, 'criterion': 'entropy'}
Randomized Decision Tree Accuracy: 0.7883
Randomized Decision Tree Precision: 59.9442
Randomized Decision Tree Recall: 62.0517


In [None]:
# Persist the tuned Decision Tree estimator and its hyperparameters.
# NOTE(review): joblib writes pickle data, not HDF5; the '.h5' names are kept
# unchanged because other code may load these exact files.
best_decision_tree_params = randomized_search_decision_tree.best_params_
best_decision_tree_model = randomized_search_decision_tree.best_estimator_
joblib.dump(value=best_decision_tree_model, filename='best_decision_tree_model.h5')
joblib.dump(value=best_decision_tree_params, filename='best_decision_tree_params.h5')

['best_decision_tree_params.h5']

In [None]:
# Create the Random Forest model
# Seeded so bootstrapping and feature sub-sampling are repeatable.
random_forest_model = RandomForestClassifier(random_state=42)

param_grid = {
    'n_estimators': np.arange(100, 1001, 250),    # Number of trees in the forest
    'criterion': ['gini', 'entropy'],             # The function to measure the quality of a split
    'max_depth': np.arange(1, 35),                # The maximum depth of the trees
    'min_samples_split': np.arange(2, 30),        # The minimum number of samples required to split an internal node
    'min_samples_leaf': np.arange(1, 21),         # The minimum number of samples required to be at a leaf node
    # 'auto' is intentionally absent: it was an alias of 'sqrt' for
    # classifiers and has been removed from scikit-learn, where it now makes
    # every candidate that draws it fail to fit.
    'max_features': ['sqrt', 'log2'],             # The number of features to consider when looking for the best split
    'bootstrap': [True, False]                    # Whether bootstrap samples are used when building trees
}

# Randomized Search (random_state makes the sampled candidates reproducible)
randomized_search_random_forest = RandomizedSearchCV(
    random_forest_model,
    param_distributions=param_grid,
    n_iter=100,
    cv=5,
    random_state=42,
)
randomized_search_random_forest.fit(X_train_resampled, y_train_resampled)

best_params_randomized_random_forest = randomized_search_random_forest.best_params_
print("Best Parameters for Random Forest (Randomized Search):")
print(best_params_randomized_random_forest)

# Evaluate on the test set
y_pred_randomized_random_forest = randomized_search_random_forest.predict(X_test)
accuracy_randomized_random_forest = accuracy_score(y_test, y_pred_randomized_random_forest)
# Precision and recall are reported as percentages; accuracy is a fraction.
precision_randomized_random_forest = precision_score(y_test, y_pred_randomized_random_forest, average="macro")*100
recall_randomized_random_forest = recall_score(y_test, y_pred_randomized_random_forest, average="macro")*100
print(f"Random Forest Accuracy (Randomized Search): {accuracy_randomized_random_forest:.4f}")
print(f"Random Forest Precision (Randomized Search): {precision_randomized_random_forest:.4f}")
print(f"Random Forest Recall (Randomized Search): {recall_randomized_random_forest:.4f}")

In [None]:
# Persist the tuned Random Forest estimator and its hyperparameters.
# NOTE(review): joblib writes pickle data, not HDF5; the '.h5' names are kept
# unchanged because other code may load these exact files.
best_random_forest_params = randomized_search_random_forest.best_params_
best_random_forest_model = randomized_search_random_forest.best_estimator_
joblib.dump(value=best_random_forest_model, filename='best_random_forest_model.h5')
joblib.dump(value=best_random_forest_params, filename='best_random_forest_params.h5')

In [None]:
import json
import os

import requests

def send_slack_notification(webhook_url, message):
    """Post ``message`` to a Slack incoming webhook, best effort.

    Failures are reported with ``print`` and never raised, so a Slack outage
    cannot abort the notebook after a long training run.

    Args:
        webhook_url: Slack incoming-webhook URL. If empty, nothing is sent.
        message: Text of the notification.
    """
    if not webhook_url:
        print("Slack webhook URL is not configured; skipping notification.")
        return

    payload = {
        "text": message
    }

    headers = {
        "Content-Type": "application/json"
    }

    try:
        # The timeout keeps an unresponsive endpoint from blocking the cell forever.
        response = requests.post(
            webhook_url,
            data=json.dumps(payload),
            headers=headers,
            timeout=10,
        )
    except requests.RequestException as exc:
        print(f"Failed to send notification to Slack: {exc}")
        return

    if response.status_code == 200:
        print("Notification sent to Slack successfully.")
    else:
        print(f"Failed to send notification to Slack. Status code: {response.status_code}")

# A webhook URL is a credential: anyone who has it can post to the channel.
# It is read from the SLACK_WEBHOOK_URL environment variable instead of being
# stored in the notebook. The URL that used to be hard-coded here should be
# revoked in Slack, since it remains in the repository history.
slack_webhook_url = os.environ.get("SLACK_WEBHOOK_URL", "")

In [None]:
# Tell the team the Random Forest search is done (duplicated word removed from the message).
send_slack_notification(slack_webhook_url, "Random Forest model training has finished.")

In [None]:
# Create the Neural Network model
# Seeded so weight initialisation and shuffling are repeatable.
neural_network_model = MLPClassifier(random_state=42)
param_grid_neural_network = {
    # One tuple per architecture, one entry per hidden layer. The trailing
    # comma matters: `(2)` is just the integer 2, not a one-element tuple.
    'hidden_layer_sizes': [(2,), (3,), (3, 5)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam', 'sgd'],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate': ['constant', 'invscaling', 'adaptive']
}

grid_search_neural_network = GridSearchCV(neural_network_model, param_grid=param_grid_neural_network, cv=5)
grid_search_neural_network.fit(X_train_resampled, y_train_resampled)

best_params_grid_neural_network = grid_search_neural_network.best_params_
print("Best Parameters for Neural Network (Grid Search):")
print(best_params_grid_neural_network)

# Evaluate on the test set
y_pred_neural_network = grid_search_neural_network.predict(X_test)
accuracy_neural_network = accuracy_score(y_test, y_pred_neural_network)
# Precision and recall are reported as percentages; accuracy is a fraction.
precision_neural_network = precision_score(y_test, y_pred_neural_network, average="macro")*100
recall_neural_network = recall_score(y_test, y_pred_neural_network, average="macro")*100
print(f"Neural Network Accuracy (Grid Search): {accuracy_neural_network:.4f}")
print(f"Neural Network Precision (Grid Search): {precision_neural_network:.4f}")
print(f"Neural Network Recall (Grid Search): {recall_neural_network:.4f}")


In [None]:
# Persist the tuned Neural Network estimator and its hyperparameters.
# NOTE(review): joblib writes pickle data, not HDF5; the '.h5' names are kept
# unchanged because other code may load these exact files.
best_neural_network_params = grid_search_neural_network.best_params_
best_neural_network_model = grid_search_neural_network.best_estimator_
joblib.dump(value=best_neural_network_model, filename='best_neural_network_model.h5')
joblib.dump(value=best_neural_network_params, filename='best_neural_network_params.h5')

In [None]:
# Tell the team the Neural Network search is done (duplicated word removed from the message).
send_slack_notification(slack_webhook_url, "Neural Network model training has finished.")

In [None]:
import scipy.stats as stats

# XGBoost wrapped in a one-step pipeline, tuned with a randomized search.
xgb_pipeline = Pipeline([
    ("xgb_clf", XGBClassifier(random_state=42))
])

# Keys carry the 'xgb_clf__' prefix so they reach the pipeline step.
parameters = {
    'xgb_clf__learning_rate': stats.uniform(0.01, 0.3),  # uniform over [0.01, 0.31]
    'xgb_clf__n_estimators': [50, 100, 150, 200],
    'xgb_clf__max_depth': [3, 5, 7, 9],
    'xgb_clf__min_child_weight': [1, 5, 10],
    'xgb_clf__gamma': [0, 0.1, 0.2, 0.3],
    'xgb_clf__subsample': [0.8, 0.9, 1.0],
    'xgb_clf__colsample_bytree': [0.8, 0.9, 1.0],
    'xgb_clf__colsample_bylevel': [0.8, 0.9, 1.0],
    'xgb_clf__reg_alpha': [0, 0.001, 0.01, 0.1, 1.0],
    'xgb_clf__reg_lambda': [0, 0.001, 0.01, 0.1, 1.0],
    'xgb_clf__scale_pos_weight': [1, 2, 5, 10],
    'xgb_clf__eval_metric': ['logloss']
}

# refit=True refits the best candidate on the whole training set. The earlier
# refit='str' was a placeholder string, not a scorer name; it only worked
# because any non-empty string is truthy in single-metric mode.
xgb_grid_pipeline = RandomizedSearchCV(
    xgb_pipeline,
    parameters,
    n_jobs=-1,
    cv=5,
    n_iter=100,
    refit=True,
    random_state=42,
)

xgb_grid_pipeline.fit(X_train_resampled, y_train_resampled)

best_params_randomized_xgboost = xgb_grid_pipeline.best_params_
print("Best Parameters for XGBoost (Randomized Search):")
print(best_params_randomized_xgboost)

# Evaluate on the test set
y_pred_xgboost = xgb_grid_pipeline.predict(X_test)
accuracy_xgboost = accuracy_score(y_test, y_pred_xgboost)
# Precision and recall are reported as percentages; accuracy is a fraction.
precision_xgboost = precision_score(y_test, y_pred_xgboost, average="macro")*100
recall_xgboost = recall_score(y_test, y_pred_xgboost, average="macro")*100
print(f"XGBoost Accuracy (Randomized Search): {accuracy_xgboost:.4f}")
print(f"XGBoost Precision (Randomized Search): {precision_xgboost:.4f}")
print(f"XGBoost Recall (Randomized Search): {recall_xgboost:.4f}")

In [None]:
# Persist the tuned XGBoost pipeline and its hyperparameters.
# NOTE(review): joblib writes pickle data, not HDF5; the '.h5' names are kept
# unchanged because other code may load these exact files.
best_xgboost_params = xgb_grid_pipeline.best_params_
best_xgboost_model = xgb_grid_pipeline.best_estimator_
joblib.dump(value=best_xgboost_model, filename='best_xgboost_model.h5')
joblib.dump(value=best_xgboost_params, filename='best_xgboost_params.h5')

In [None]:
# Tell the team the XGBoost search is done (duplicated word removed from the message).
send_slack_notification(slack_webhook_url, "XGBoost model training has finished.")

In [None]:
# Create the SVM model
svm_model = SVC()

param_grid_svm = {
    'C': [0.1, 1.0],
    'kernel': ['linear', 'poly', 'sigmoid'],
}

# The search space has only len(C) * len(kernel) combinations. Asking for more
# iterations than that makes scikit-learn warn and fall back to the full grid,
# so n_iter is set to the real size of the space.
n_svm_candidates = len(param_grid_svm['C']) * len(param_grid_svm['kernel'])

randomized_search_svm = RandomizedSearchCV(
    svm_model,
    param_distributions=param_grid_svm,
    n_iter=n_svm_candidates,
    cv=5,
    random_state=42,
)
randomized_search_svm.fit(X_train_resampled, y_train_resampled)

best_params_randomized_svm = randomized_search_svm.best_params_
print("Best Parameters for SVM (Randomized Search):")
print(best_params_randomized_svm)

# Evaluate on the test set
y_pred_svm = randomized_search_svm.predict(X_test)
accuracy_svm = accuracy_score(y_test, y_pred_svm)
# Precision and recall are reported as percentages; accuracy is a fraction.
precision_svm = precision_score(y_test, y_pred_svm, average="macro")*100
recall_svm = recall_score(y_test, y_pred_svm, average="macro")*100
print(f"SVC Accuracy (Randomized Search): {accuracy_svm:.4f}")
print(f"SVC Precision (Randomized Search): {precision_svm:.4f}")
print(f"SVC Recall (Randomized Search): {recall_svm:.4f}")


In [None]:
# Persist the tuned SVM estimator and its hyperparameters.
# NOTE(review): joblib writes pickle data, not HDF5; the '.h5' names are kept
# unchanged because other code may load these exact files.
best_svm_params = randomized_search_svm.best_params_
best_svm_model = randomized_search_svm.best_estimator_
joblib.dump(value=best_svm_model, filename='best_svm_model.h5')
joblib.dump(value=best_svm_params, filename='best_svm_params.h5')

In [None]:
# Tell the team the SVM search is done (duplicated word removed from the message).
send_slack_notification(slack_webhook_url, "SVM model training has finished.")