# Jupyter Notebook Purpose

- Preliminary machine learning (ML) needs to be done
    - A couple of models are attempted, and an ensemble is created from the best-performing models

## Group 2 Members

- 1. Melissa Hartwick - [Email](mailto:mhartwic@uwaterloo.ca)
- 2. McKinleigh Needham - [Email](mailto:mjneedha@uwaterloo.ca)
- 3. Daniel Adam Cebula  - [Email](mailto:dacebula@uwaterloo.ca)
- 4. Athithian Selvadurai - [Email](mailto:a6selvad@uwaterloo.ca)
- 5. Aravind Kakarala - [Email](mailto:akakaral@uwaterloo.ca)
- 6. Allan Sales - [Email](mailto:asales@uwaterloo.ca)

In [1]:
import pandas as pd
import numpy as np
import os
import requests  # simple HTTP library for Python
import io        # Tool for working with streams (Input/Ouput data)
import matplotlib.pyplot as plt
import glob

%matplotlib inline

# Matplotlib defaults: font sizes for the axis labels and the tick labels
plt.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
})

In [2]:
# Resolve the input CSV location relative to the current working directory
cwd = os.getcwd()
Final_Data_Directory = os.path.join(cwd, "FINAL_DATA")
Final_Data = os.path.join(
    Final_Data_Directory,
    "2014-2019-TTC-Delay-Data-Incident.csv",
)

In [3]:
# Read the final data CSV into a DataFrame, parsing both timestamp columns as datetimes
df = pd.read_csv(
    Final_Data,
    low_memory=False,
    parse_dates=["DateTime", "Date"],
)
# Display five randomly chosen rows as a quick look at the data
df.sample(5)

Unnamed: 0,Identity,DateTime,Date,Hour,BusinessQuarter,MonthName,WeekOfYear,DayName,HolidayName,Incident,Temp (°C),Precip. Amount (mm),Wind Dir (10s deg),Wind Spd (km/h),Stn Press (kPa),Min Delay,Min Gap
7701,Subway,2016-10-06 17:21:00,2016-10-06,17,4,October,40,Thursday,Not A Holiday,Other,20.2,0.0,11.0,13.5,100.9,7,12
523434,Bus,2019-02-14 05:57:00,2019-02-14,6,1,February,7,Thursday,Not A Holiday,Mechanical,-2.8,0.0,25.0,16.5,99.94,12,23
156511,Bus,2014-05-02 19:30:00,2014-05-02,20,2,May,18,Friday,Not A Holiday,Mechanical,9.3,0.0,22.0,16.5,99.4,10,20
139963,Bus,2014-03-01 15:48:00,2014-03-01,16,1,March,9,Saturday,Not A Holiday,Mechanical,-2.6,0.0,27.0,22.0,100.36,7,14
635525,Streetcar,2018-07-17 17:23:00,2018-07-17,17,3,July,29,Tuesday,Not A Holiday,Mechanical,25.6,0.0,35.0,22.5,99.73,4,9


In [4]:
# List the column labels of the loaded frame
df.columns

Index(['Identity', 'DateTime', 'Date', 'Hour', 'BusinessQuarter', 'MonthName',
       'WeekOfYear', 'DayName', 'HolidayName', 'Incident', 'Temp (°C)',
       'Precip. Amount (mm)', 'Wind Dir (10s deg)', 'Wind Spd (km/h)',
       'Stn Press (kPa)', 'Min Delay', 'Min Gap'],
      dtype='object')

In [16]:
# Only a subset of the columns will be useful for machine learning
ml_columns = ["Identity", "Hour", "MonthName", "DayName", "HolidayName",
              "Incident", "Temp (°C)", "Precip. Amount (mm)", "Wind Dir (10s deg)",
              "Wind Spd (km/h)", "Stn Press (kPa)", "Min Delay", "Min Gap"]

# Subset useful for time series analysis
ts_columns = ["DateTime", "Identity", "Incident",
              "Temp (°C)", "Precip. Amount (mm)", "Wind Dir (10s deg)",
              "Wind Spd (km/h)", "Stn Press (kPa)", "Min Delay", "Min Gap"]

# Take explicit copies: columns of both subsets are reassigned in a later cell,
# and an independent copy guarantees those writes never touch `df` and cannot
# trigger pandas' SettingWithCopyWarning.
df_prelim = df.loc[:, ml_columns].copy()
df_ts = df.loc[:, ts_columns].copy()

In [17]:
# Columns treated as continuous (numeric weather measurements)
continuous = [
    "Temp (°C)",
    "Precip. Amount (mm)",
    "Wind Dir (10s deg)",
    "Wind Spd (km/h)",
    "Stn Press (kPa)",
]

# Columns treated as categorical, per target frame
categorical_ml = ["Identity", "Hour", "MonthName", "DayName", "HolidayName", "Incident"]
categorical_ts = ["Identity", "Incident"]

# Convert every listed column to the pandas "category" dtype, first for the
# machine-learning frame and then for the time-series frame
for frame, columns in ((df_prelim, categorical_ml), (df_ts, categorical_ts)):
    for column in columns:
        frame[column] = frame[column].astype("category")

In [18]:
# Summarise dtypes and non-null counts of the machine-learning subset
df_prelim.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 653766 entries, 0 to 653765
Data columns (total 13 columns):
 #   Column               Non-Null Count   Dtype   
---  ------               --------------   -----   
 0   Identity             653766 non-null  category
 1   Hour                 653766 non-null  category
 2   MonthName            653766 non-null  category
 3   DayName              653766 non-null  category
 4   HolidayName          653766 non-null  category
 5   Incident             650159 non-null  category
 6   Temp (°C)            653741 non-null  float64 
 7   Precip. Amount (mm)  653564 non-null  float64 
 8   Wind Dir (10s deg)   653554 non-null  float64 
 9   Wind Spd (km/h)      653659 non-null  float64 
 10  Stn Press (kPa)      653741 non-null  float64 
 11  Min Delay            653766 non-null  int64   
 12  Min Gap              653766 non-null  int64   
dtypes: category(6), float64(5), int64(2)
memory usage: 38.7 MB


In [19]:
# Summarise dtypes and non-null counts of the time-series subset
df_ts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 653766 entries, 0 to 653765
Data columns (total 10 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   DateTime             653766 non-null  datetime64[ns]
 1   Identity             653766 non-null  category      
 2   Incident             650159 non-null  category      
 3   Temp (°C)            653741 non-null  float64       
 4   Precip. Amount (mm)  653564 non-null  float64       
 5   Wind Dir (10s deg)   653554 non-null  float64       
 6   Wind Spd (km/h)      653659 non-null  float64       
 7   Stn Press (kPa)      653741 non-null  float64       
 8   Min Delay            653766 non-null  int64         
 9   Min Gap              653766 non-null  int64         
dtypes: category(2), datetime64[ns](1), float64(5), int64(2)
memory usage: 41.2 MB


# Preliminary Machine Learning

In [None]:
# Split data into Training and Testing data
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split

# Scale the data
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

# Imputation of missing values
from sklearn.impute import SimpleImputer

# Scikit pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Search for best hyperparameters
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Feature Reduction
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.manifold import LocallyLinearEmbedding
from sklearn.manifold import MDS

# ML Models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Train and Test Split

In [None]:
# Hold out 20% of the rows with a single stratified shuffle split.
# NOTE(review): the split is stratified on "Identity" so each set keeps the same
# mix of that column's categories — confirm this is the intended stratification column.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=123)
for train_index, test_index in split.split(df_prelim, df_prelim["Identity"]):
    # split() yields positional indices, so rows are selected with .iloc
    strat_train_set = df_prelim.iloc[train_index]
    strat_test_set = df_prelim.iloc[test_index]


# categorical data treatment pipeline:
# fill missing labels with the most frequent one, then one-hot encode.
# handle_unknown="ignore" keeps transform() from failing on a category that
# was not present in the data the encoder was fitted on.
categorical_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="most_frequent")),
    ('encoder', OneHotEncoder(handle_unknown="ignore"))
])

# numerical / continuous data treatment pipeline:
# fill missing measurements with the column median, then scale to [0, 1]
continuous_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="median")),
    ("scaler", MinMaxScaler())
])

# process the data columns depending on if it is a Numeric or Categorical
preprocess = ColumnTransformer(
   transformers=[
    ('continuous', continuous_pipe, continuous)
   ,('categorical', categorical_pipe, categorical_ml)
])


# assemble it with RandomForestClassifier (seeded so repeated runs agree)
# NOTE(review): "Min Delay" / "Min Gap" are integer minute counts and the cells
# further down model them with a regressor — confirm a classifier is intended here.
pipeline = Pipeline(steps = [
               ('preprocess', preprocess)
              ,('RandomForestClassifier', RandomForestClassifier(random_state=123))
           ])


In [None]:
def x_features(Y, df):
    """Split ``df`` into a feature frame and a target array.

    Parameters
    ----------
    Y : str
        Name of the target column (intended to be 'Min Gap' or 'Min Delay').
    df : pandas.DataFrame
        Frame containing the target column and the feature columns.

    Returns
    -------
    X : pandas.DataFrame
        ``df`` without the ``Y`` column and without the companion column:
        'Min Delay' is removed when ``Y`` is 'Min Gap', otherwise 'Min Gap'
        is removed.
    y : numpy.ndarray
        Values of the ``Y`` column.
    """
    # The column that must not leak into the features alongside the target
    companion = 'Min Delay' if Y == 'Min Gap' else 'Min Gap'
    y = df[Y].values
    keep_mask = df.columns != Y
    X = df.loc[:, keep_mask].drop([companion], axis=1)
    return X, y

In [None]:
# Running to predict the Min Gap
# NOTE(review): `df_cleaned` is not defined in any of the cells shown before this
# one — confirm where it is created so the notebook survives Restart & Run All.
X, y = x_features('Min Gap', df_cleaned)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows; the fixed seed yields the same split on every run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=123)

In [None]:
# Testing Random Forest
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Fit a random forest regressor (trees limited to depth 50); seeded so that
# repeated runs build the same forest
rf = RandomForestRegressor(max_depth = 50, random_state=123)
rf.fit(X_train, y_train)

In [None]:
# Coefficient of determination (R^2) of the fitted regressor on the held-out rows
rf.score(X_test, y_test)

In [None]:
# Running to predict the Min Delay
# NOTE(review): `df_cleaned` is not defined in any of the cells shown before this
# one — confirm where it is created so the notebook survives Restart & Run All.
X, y = x_features('Min Delay', df_cleaned)

In [None]:
# Hold out 20% of the rows, fit a depth-15 random forest regressor and report
# its R^2 on the held-out rows; both steps are seeded so the score is repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=123)
rf = RandomForestRegressor(max_depth = 15, random_state=123)
rf.fit(X_train, y_train)
rf.score(X_test, y_test)