# Jupyter Notebook Purpose

- Preliminary machine learning (ML) needs to be done
    - A couple of models are attempted, and an ensemble is created from the best-performing models

## Group 2 Members

- 1. Melissa Hartwick - [Email](mailto:mhartwic@uwaterloo.ca)
- 2. McKinleigh Needham - [Email](mailto:mjneedha@uwaterloo.ca)
- 3. Daniel Adam Cebula  - [Email](mailto:dacebula@uwaterloo.ca)
- 4. Athithian Selvadurai - [Email](mailto:a6selvad@uwaterloo.ca)
- 5. Aravind Kakarala - [Email](mailto:akakaral@uwaterloo.ca)
- 6. Allan Sales - [Email](mailto:asales@uwaterloo.ca)

In [1]:
import pandas as pd
import numpy as np
import os
import requests  # simple HTTP library for Python
import io        # Tool for working with streams (Input/Ouput data)
import matplotlib.pyplot as plt
import glob
import time

%matplotlib inline

# Matplotlib defaults: font sizes for axis labels and for x/y tick labels
plt.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
})

In [2]:
# Build the dataset location relative to the current working directory:
#   <cwd>/FINAL_DATA/2014-2019-TTC-Delay-Data-Incident.csv
cwd = os.getcwd()
data_path_parts = ("FINAL_DATA", "2014-2019-TTC-Delay-Data-Incident.csv")

# Directory that holds the final data set
Final_Data_Directory = os.path.join(cwd, data_path_parts[0])

# Full path of the CSV file read by the loading cell
Final_Data = os.path.join(cwd, *data_path_parts)

In [3]:
# Read the final CSV into a DataFrame, parsing the two timestamp columns as dates.
# low_memory=False makes pandas read the file in one pass instead of chunk-wise.
date_columns = ["DateTime", "Date"]
df = pd.read_csv(
    Final_Data,
    parse_dates=date_columns,
    low_memory=False,
)

# Show five randomly chosen rows as a quick sanity check
df.sample(5)

Unnamed: 0,Identity,DateTime,Date,Hour,BusinessQuarter,MonthName,WeekOfYear,DayName,HolidayName,Incident,Temp (°C),Precip. Amount (mm),Wind Dir (10s deg),Wind Spd (km/h),Stn Press (kPa),Min Delay,Min Gap
540787,Bus,2019-05-21 14:01:00,2019-05-21,14,2,May,21,Tuesday,Not A Holiday,Route Problems,18.07,0.0,3.0,18.0,100.56,1,1
564151,Bus,2019-10-22 16:36:00,2019-10-22,17,4,October,43,Tuesday,Not A Holiday,Mechanical,14.47,0.0,18.5,18.0,98.96,21,42
132901,Bus,2014-02-05 08:08:00,2014-02-05,8,1,February,6,Wednesday,Not A Holiday,General,-6.33,0.5,5.0,28.0,100.06,12,12
313669,Bus,2016-04-21 05:28:00,2016-04-21,5,2,April,16,Thursday,Not A Holiday,Mechanical,7.6,0.0,3.5,11.5,100.19,6,12
402155,Bus,2017-07-02 12:20:00,2017-07-02,12,3,July,26,Sunday,Not A Holiday,Route Problems,22.67,0.0,15.5,12.0,99.99,11,24


In [None]:
# Only a subset of the columns will be useful for machine learning:
# the categorical descriptors, the continuous measurements and the two
# delay columns ("Min Delay", "Min Gap").
ml_columns = [
    "Identity", "Hour", "MonthName", "DayName", "HolidayName", "Incident",
    "Temp (°C)", "Precip. Amount (mm)", "Wind Dir (10s deg)",
    "Wind Spd (km/h)", "Stn Press (kPa)",
    "Min Delay", "Min Gap",
]
df_prelim = df.loc[:, ml_columns]

In [None]:
# Continuous variables (scaled numerically by the preprocessing pipeline)
continuous = ["Temp (°C)", "Precip. Amount (mm)", "Wind Dir (10s deg)",
              "Wind Spd (km/h)", "Stn Press (kPa)"]

# Categorical variables (one-hot encoded by the preprocessing pipeline)
categorical_ml = ["Identity", "Hour", "MonthName", "DayName", "HolidayName", "Incident"]

# Cast every categorical attribute to the pandas "category" dtype in one call.
# astype() returns a new frame that is rebound to `df_prelim`, so no column is
# assigned in place on a frame that was sliced out of `df`, and re-running the
# cell gives the same result.
df_prelim = df_prelim.astype({column: "category" for column in categorical_ml})

In [None]:
# Print a summary of the modelling frame (columns, dtypes, non-null counts)
# to confirm the categorical conversion took effect
df_prelim.info()

# Preliminary Machine Learning

In [None]:
# Split data into Training and Testing data
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split

# Scale the data
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

# Imputation of missing values
from sklearn.impute import SimpleImputer

# Scikit pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Search for best hyperparameters
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Feature Reduction
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.manifold import LocallyLinearEmbedding
from sklearn.manifold import MDS

# ML Models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Train and Test Split

In [None]:
# Column whose class proportions are preserved in the train/test split.
# NOTE(review): this cell previously referenced `df_ML["DEF_PAY"]`, but neither
# that frame nor that column exists in this notebook. "Identity" is assumed
# here because it is one of the categorical columns of `df_prelim` -- confirm
# the intended stratification column.
STRATIFY_COLUMN = "Identity"

# Single stratified 80/20 split with a fixed seed for reproducibility
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=123)
for train_index, test_index in split.split(df_prelim, df_prelim[STRATIFY_COLUMN]):
    # split() yields positional row indices, so select with .iloc
    # (label-based .loc only works while the index is a default RangeIndex)
    strat_train_set = df_prelim.iloc[train_index]
    strat_test_set = df_prelim.iloc[test_index]


# categorical data treatment pipeline
categorical_pipe = Pipeline(steps=[
    ('encoder', OneHotEncoder())
])

# numerical / continuous data treatment pipeline
continuous_pipe = Pipeline(steps=[
    ("scaler", MinMaxScaler())
])

# route each column to the matching pipeline depending on whether it is
# continuous or categorical (lists defined in the feature-definition cell)
preprocess = ColumnTransformer(
    transformers=[
        ('continuous', continuous_pipe, continuous),
        ('categorical', categorical_pipe, categorical_ml),
    ]
)


# assemble it with RandomForestClassifier (seeded so repeated fits agree)
# NOTE(review): the later cells fit a RandomForestRegressor on "Min Gap" /
# "Min Delay"; confirm a classifier is really what is wanted in this pipeline.
pipeline = Pipeline(steps=[
    ('preprocess', preprocess),
    ('RandomForestClassifier', RandomForestClassifier(random_state=123)),
])


In [None]:
def x_features(Y, df):
    """Separate a frame into model features and a target.

    Parameters
    ----------
    Y : str
        Name of the target column; the notebook uses 'Min Gap' or 'Min Delay'.
    df : pandas.DataFrame
        Frame that contains ``Y`` as well as the other delay column.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is ``df`` without the ``Y`` column and without
        the other delay column ('Min Delay' when ``Y`` is 'Min Gap', otherwise
        'Min Gap'), and ``y`` is ``df[Y].values``.
    """
    # The delay column that is NOT being predicted must not remain a feature
    other_target = 'Min Delay' if Y == 'Min Gap' else 'Min Gap'

    target_values = df[Y].values
    not_target = df.columns != Y
    features = df.loc[:, not_target].drop(columns=[other_target])
    return features, target_values

In [None]:
# Build the feature frame and target array for predicting 'Min Gap'
# NOTE(review): `df_cleaned` is not defined in any cell visible in this
# notebook -- TODO confirm where it is created before this cell runs.
X, y = x_features(Y='Min Gap', df=df_cleaned)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation. The fixed seed (same value as the
# stratified split above) keeps the partition identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=123)

In [None]:
# Testing Random Forest
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Fit a random forest regressor (trees limited to depth 50) on the training
# split. Seeding the forest makes the fitted model, and therefore its score,
# reproducible on a fresh kernel.
rf = RandomForestRegressor(max_depth=50, random_state=123)
rf.fit(X_train, y_train)

In [None]:
# Evaluate the fitted forest on the held-out split; for a regressor,
# score() reports the coefficient of determination (R^2)
rf.score(X_test, y_test)

In [None]:
# Build the feature frame and target array for predicting 'Min Delay'
# NOTE(review): `df_cleaned` is not defined in any cell visible in this
# notebook -- TODO confirm where it is created before this cell runs.
X, y = x_features(Y='Min Delay', df=df_cleaned)

In [None]:
# Same 80/20 hold-out as for 'Min Gap', seeded so the partition is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=123)

# Shallower forest (depth 15) for the 'Min Delay' target; seeded so repeated
# runs produce the same model
rf = RandomForestRegressor(max_depth=15, random_state=123)
rf.fit(X_train, y_train)

# R^2 of the fitted regressor on the held-out split
rf.score(X_test, y_test)