# Jupyter Notebook Purpose

- Final ML learning

## Group 2 Members

- 1. Melissa Hartwick - [Email](mailto:mhartwic@uwaterloo.ca)
- 2. McKinleigh Needham - [Email](mailto:mjneedha@uwaterloo.ca)
- 3. Daniel Adam Cebula  - [Email](mailto:dacebula@uwaterloo.ca)
- 4. Athithian Selvadurai - [Email](mailto:a6selvad@uwaterloo.ca)
- 5. Aravind Kakarala - [Email](mailto:akakaral@uwaterloo.ca)
- 6. Allan Sales - [Email](mailto:asales@uwaterloo.ca)

In [1]:
import pandas as pd
import numpy as np
import os
import requests  # simple HTTP library for Python
import io        # Tool for working with streams (Input/Ouput data)
import matplotlib.pyplot as plt
import glob
import time
from joblib import dump, load

%matplotlib inline

# some matplotlib defaults
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

In [2]:
# Get FilePaths
cwd = os.getcwd()

Final_Data_Directory = os.path.join(cwd, "FINAL_DATA")

Final_Data = os.path.join(Final_Data_Directory, "2014-2019-TTC-Delay-Data-Final_ML.csv")

In [3]:
# Load the Final Data into DataFrames
df = pd.read_csv(Final_Data, low_memory=False)
df.sample(5)

Unnamed: 0,Identity,Hour,Season,Day,Holiday,Incident,Temp (°C),Precip. Amount (mm),Min Delay Category,Min Gap Category
376094,Bus,Afternoon / Evening,Winter,Weekday,Not A Holiday,Mechanical,0.83,2.6,10 - 30 minutes,30 - 60 minutes
78101,Subway,Afternoon / Evening,Summer,Weekday,Not A Holiday,Mechanical,19.63,0.0,0 - 3 minutes,0 - 3 minutes
492876,Bus,Morning,Fall,Weekday,Not A Holiday,Mechanical,21.03,0.0,3 - 10 minutes,3 - 10 minutes
485275,Bus,Afternoon / Evening,Summer,Weekday,Not A Holiday,Route Problems,24.23,0.0,10 - 30 minutes,10 - 30 minutes
528139,Bus,Morning,Spring,Weekday,Not A Holiday,Route Problems,-11.33,0.0,10 - 30 minutes,30 - 60 minutes


In [4]:
# continuous variables
continuous = ["Temp (°C)", "Precip. Amount (mm)"]

# categorical variables
categorical_ml = ["Identity", "Hour", "Season", "Day", "Holiday", "Incident", "Min Delay Category", "Min Gap Category"]

# turn each attribute into categorical
for x in categorical_ml:
    df[x] = df[x].astype("category")

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 653736 entries, 0 to 653735
Data columns (total 10 columns):
 #   Column               Non-Null Count   Dtype   
---  ------               --------------   -----   
 0   Identity             653736 non-null  category
 1   Hour                 653736 non-null  category
 2   Season               653736 non-null  category
 3   Day                  653736 non-null  category
 4   Holiday              653736 non-null  category
 5   Incident             653736 non-null  category
 6   Temp (°C)            653736 non-null  float64 
 7   Precip. Amount (mm)  653736 non-null  float64 
 8   Min Delay Category   653736 non-null  category
 9   Min Gap Category     653736 non-null  category
dtypes: category(8), float64(2)
memory usage: 15.0 MB


# Preliminary Machine Learning Imports

In [6]:
# Split data into Training and Testing data
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split

# Scale the data
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

# Scikit pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Search for best hyperparameters
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Feature Reduction
from sklearn.decomposition import PCA

# ML Models
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Ensemble
from sklearn.ensemble import VotingClassifier

# Classification Report
from sklearn.metrics import classification_report

# Pipeline for both Min Delay and Min Gap

In [7]:
# Stratified Split for Delay Data Categories
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=123)

for train_index1, test_index1 in split.split(df, df["Min Delay Category"]):
    delay_train_set = df.loc[train_index1]
    delay_test_set = df.loc[test_index1]
    
# Stratified Split for Delay Gap Categories
for train_index2, test_index2 in split.split(df, df["Min Gap Category"]):
    gap_train_set = df.loc[train_index2]
    gap_test_set = df.loc[test_index2]
    
# Create X (features) and y (labels) for both training and testing data
X_train_delay = delay_train_set.drop(['Min Delay Category', 'Min Gap Category'], axis=1)
X_test_delay = delay_test_set.drop(['Min Delay Category', 'Min Gap Category'], axis=1)
y_train_delay = delay_train_set['Min Delay Category']
y_test_delay = delay_test_set['Min Delay Category']

X_train_gap = gap_train_set.drop(['Min Delay Category', 'Min Gap Category'], axis=1)
X_test_gap = gap_test_set.drop(['Min Delay Category', 'Min Gap Category'], axis=1)
y_train_gap = gap_train_set['Min Gap Category']
y_test_gap = gap_test_set['Min Gap Category']

In [8]:
# one hot encoding categories
one_hot_categories = [
    "Identity", "Hour", "Season", "Day", "Holiday", "Incident"
]

# categorical data treatment pipeline
categorical_pipe = Pipeline(steps=[
    ('encoder', OneHotEncoder())
])

In [9]:
# continuous variables
continuous_columns = [
    'Temp (°C)', 'Precip. Amount (mm)'
]

# numerical / continuous data treatment pipeline
continuous_pipe = Pipeline(steps=[
    ("scaler", MinMaxScaler())
])

In [10]:
# perserve 95% of variance
pca = PCA(n_components=0.95)

In [11]:
preprocess = ColumnTransformer(
    transformers=[
    ('continuous', continuous_pipe, continuous_columns),
    ('categorical', categorical_pipe, one_hot_categories)
    ])

In [12]:
# assemble it with RandomForestClassifier
pipeline_rf_delay = Pipeline(steps = [
               ('preprocess', preprocess),
               ("pca", pca),
               ('RandomForestClassifier', RandomForestClassifier(n_estimators=10))
           ])
pipeline_rf_gap = Pipeline(steps = [
               ('preprocess', preprocess),
               ("pca", pca),
               ('RandomForestClassifier', RandomForestClassifier(n_estimators=10))
           ])

# Assemble it with Gradient Boosted Classifier
pipeline_gbc_delay = Pipeline(steps = [
               ('preprocess', preprocess),
               ("pca", pca),
               ('GradientBoostingClassifier', GradientBoostingClassifier(n_estimators=10))
           ])
pipeline_gbc_gap = Pipeline(steps = [
               ('preprocess', preprocess),
               ("pca", pca),
               ('GradientBoostingClassifier', GradientBoostingClassifier(n_estimators=10))
           ])

# Assemble it with Support Vector Classifier
pipeline_svc_delay  = Pipeline(steps = [
               ('preprocess', preprocess),
               ("pca", pca),
               ('LinearSVC', LinearSVC(tol=0.1))
           ])
pipeline_svc_gap  = Pipeline(steps = [
               ('preprocess', preprocess),
               ("pca", pca),
               ('LinearSVC', LinearSVC(tol=0.1))
           ])

In [13]:
# Create an Ensemble
# Delay
vt_delay = VotingClassifier(
    estimators=[("rf", pipeline_rf_delay),
                ("gbc", pipeline_gbc_delay),
                ("svc", pipeline_svc_delay)],
    voting="hard"
)

# Gap
vt_gap = VotingClassifier(
    estimators=[("rf", pipeline_rf_gap),
                ("gbc", pipeline_gbc_gap),
                ("svc", pipeline_svc_gap)],
    voting="hard"
)

In [14]:
# Train the Model for Delay
t0 = time.time()

vt_delay.fit(X_train_delay, y_train_delay)

print(f"Time: {time.time() - t0}")

Time: 239.66053438186646


In [15]:
# Get Preditions and Classification Report
y_test_delay_pred = vt_delay.predict(X_test_delay)

class_delay = classification_report(y_test_delay, y_test_delay_pred)

print(f"Minutes Delay Classification Report\n\n{class_delay}")

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Minutes Delay Classification Report

                  precision    recall  f1-score   support

   0 - 3 minutes       0.73      0.65      0.69     25074
 10 - 30 minutes       0.52      0.84      0.64     49820
  3 - 10 minutes       0.56      0.32      0.41     47361
 30 - 60 minutes       0.00      0.00      0.00      5611
60 - 180 minutes       0.00      0.00      0.00      1612
   >=180 minutes       0.50      0.00      0.00      1270

        accuracy                           0.56    130748
       macro avg       0.38      0.30      0.29    130748
    weighted avg       0.54      0.56      0.52    130748



  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
# save the model for later
from joblib import dump, load

dump(vt_delay, 'Minutes_Delay_Model.joblib') 

['Minutes_Delay_Model.joblib']

In [17]:
# Train the Model for Gap
t0 = time.time()

vt_gap.fit(X_train_gap, y_train_gap)

print(f"Time: {time.time() - t0}")

Time: 237.79996299743652


In [18]:
# Get Preditions and Classification Report
y_test_gap_pred = vt_gap.predict(X_test_gap)

class_gap = classification_report(y_test_gap, y_test_gap_pred)

print(f"Minutes Gap Classification Report\n\n{class_gap}")

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Minutes Gap Classification Report

                  precision    recall  f1-score   support

   0 - 3 minutes       0.71      0.66      0.69     26378
 10 - 30 minutes       0.58      0.97      0.72     63490
  3 - 10 minutes       0.00      0.00      0.00     14789
 30 - 60 minutes       0.00      0.00      0.00     19939
60 - 180 minutes       0.00      0.00      0.00      4773
   >=180 minutes       0.00      0.00      0.00      1379

        accuracy                           0.60    130748
       macro avg       0.22      0.27      0.23    130748
    weighted avg       0.42      0.60      0.49    130748



  _warn_prf(average, modifier, msg_start, len(result))


In [19]:
# save the model for later
from joblib import dump, load

dump(vt_gap, 'Minutes_Gap_Model.joblib')

['Minutes_Gap_Model.joblib']