# Jupyter Notebook Purpose

- With the finalized dataset we have some attributes that need to be "managed"
    - without Latitude and Longitude Information I am unable to link Kaggle data to TTC data - Location data will need to be dropped
    - "Incident" information will need to be re-categorized

## Group 2 Members

- 1. Melissa Hartwick - [Email](mailto:mhartwic@uwaterloo.ca)
- 2. McKinleigh Needham - [Email](mailto:mjneedha@uwaterloo.ca)
- 3. Daniel Adam Cebula  - [Email](mailto:dacebula@uwaterloo.ca)
- 4. Athithian Selvadurai - [Email](mailto:a6selvad@uwaterloo.ca)
- 5. Aravind Kakarala - [Email](mailto:akakaral@uwaterloo.ca)
- 6. Allan Sales - [Email](mailto:asales@uwaterloo.ca)

In [1]:
import pandas as pd
import numpy as np
import os
import requests  # simple HTTP library for Python
import io        # Tool for working with streams (Input/Ouput data)
import matplotlib.pyplot as plt
import glob
import time

%matplotlib inline

In [2]:
# Get FilePaths
cwd = os.getcwd()

Final_Data_Directory = os.path.join(cwd, "FINAL_DATA")

Final_Data = os.path.join(Final_Data_Directory, "2014-2019-TTC-Delay-Data.csv")

In [3]:
# Load the Final Data into DataFrames
df = pd.read_csv(Final_Data, parse_dates=["DateTime", "Date"], low_memory=False)
df.sample(5)

Unnamed: 0,Identity,DateTime,Date,BusinessQuarter,MonthName,WeekOfYear,DayName,HolidayName,Time,Route,Location,Incident,Temp (°C),Precip. Amount (mm),Wind Dir (10s deg),Wind Spd (km/h),Stn Press (kPa),Min Delay,Min Gap
272298,Bus,2015-09-22 06:00:00,2015-09-22,3,September,39,Tuesday,Not A Holiday,06:00:00,169,Scarbourgh Town Center,Utilized Off Route,11.83,0.0,1.0,2.0,100.92,20,50
521244,Bus,2019-02-05 15:24:00,2019-02-05,1,February,6,Tuesday,Not A Holiday,15:24:00,95,Silverdale crescent on York Mills,Mechanical,-1.33,0.0,32.0,16.0,100.66,8,16
297494,Bus,2016-01-28 18:30:00,2016-01-28,1,January,4,Thursday,Not A Holiday,18:30:00,91,Woodbine Route,General Delay,0.6,0.55,22.0,19.0,98.57,24,48
464758,Bus,2018-05-02 06:35:00,2018-05-02,2,May,18,Wednesday,Not A Holiday,06:35:00,79,Runnymede Stn,Mechanical,16.27,0.0,24.0,17.5,99.96,10,20
76765,Subway,2017-05-24 15:47:00,2017-05-24,2,May,21,Wednesday,Not A Holiday,15:47,YU-N,EGLINTON STATION,Injured or ill Customer (On Train) - Medical A...,19.2,0.0,7.5,27.5,98.93,0,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 653766 entries, 0 to 653765
Data columns (total 19 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   Identity             653766 non-null  object        
 1   DateTime             653766 non-null  datetime64[ns]
 2   Date                 653766 non-null  datetime64[ns]
 3   BusinessQuarter      653766 non-null  int64         
 4   MonthName            653766 non-null  object        
 5   WeekOfYear           653766 non-null  int64         
 6   DayName              653766 non-null  object        
 7   HolidayName          653766 non-null  object        
 8   Time                 653766 non-null  object        
 9   Route                653766 non-null  object        
 10  Location             652758 non-null  object        
 11  Incident             650159 non-null  object        
 12  Temp (°C)            653741 non-null  float64       
 13  Precip. Amount

# Grouping the "Incident" column

In [5]:
# Remove Location as no Latitude or Longitude data is contained within
df = df.drop(columns=['Location'])

# Remove Route as it is too unique to be used for Machine Learning
df = df.drop(columns=['Route'])

In [6]:
# lets look at incidents
df_incident = df.groupby('Incident').count().sort_values('Identity', ascending=False)
incidents = df['Incident'].unique()
len(incidents)

156

In [7]:
## Grouping as "Other" the categories with less occurrence
others = []
for i in df_incident['Identity'].index:
    if df_incident['Identity'].loc[i] < 200:
        others.append(i)
len(others)

84

In [8]:
## Grouping the similar Incidents
t0 = time.time()
df['Incident'] = df.apply(lambda x: 'Other' if str(x['Incident']) in others else x['Incident'], axis=1)
print(f"Done 1 - {time.time() - t0}")
df['Incident'] = df.apply(lambda x: 'Other' if str(x['Incident']) == 'Miscellaneous Other' else x['Incident'], axis=1)
print(f"Done 2 - {time.time() - t0}")
df['Incident'] = df.apply(lambda x: 'General Delay' if str(x['Incident']) == 'Miscellaneous General Delays' else x['Incident'], axis=1)
print(f"Done 3 - {time.time() - t0}")
df['Incident'] = df.apply(lambda x: 'Speed Control' if 'Speed' in str(x['Incident']) else x['Incident'], axis=1)
print(f"Done 4 - {time.time() - t0}")
df['Incident'] = df.apply(lambda x: 'Late Leaving Garage' if 'Late Leaving Garage' in str(x['Incident']) else x['Incident'], axis=1)
print(f"Done 5 - {time.time() - t0}")
df['Incident'] = df.apply(lambda x: 'Door Problems' if 'Door Problems' in str(x['Incident']) else x['Incident'], axis=1)
print(f"Done 6 - {time.time() - t0}")
df['Incident'] = df.apply(lambda x: 'Door Problems' if 'TR Cab Doors' in str(x['Incident']) else x['Incident'], axis=1)
print(f"Done 7 - {time.time() - t0}")
df['Incident'] = df.apply(lambda x: 'Operator' if 'Operator' in str(x['Incident']) else x['Incident'], axis=1)
print(f"Done 8 - {time.time() - t0}")
df['Incident'] = df.apply(lambda x: 'Injured or ill Customer' if 'Injured or ill Customer' in str(x['Incident']) else x['Incident'], axis=1)
print(f"Done 9 - {time.time() - t0}")
df['Incident'] = df.apply(lambda x: 'Assault' if 'Assault' in str(x['Incident']) else x['Incident'], axis=1)
print(f"Done 10 - {time.time() - t0}")
df['Incident'] = df.apply(lambda x: 'Passenger' if 'Passenger' in str(x['Incident']) else x['Incident'], axis=1)
print(f"Done 11 - {time.time() - t0}")
df['Incident'] = df.apply(lambda x: 'Assault' if 'Robbery' in str(x['Incident']) else x['Incident'], axis=1)
print(f"Done 12 - {time.time() - t0}")
df['Incident'] = df.apply(lambda x: 'Mechanical' if 'Brakes' in str(x['Incident']) else x['Incident'], axis=1)
print(f"Done 13 - {time.time() - t0}")
df['Incident'] = df.apply(lambda x: 'Mechanical' if 'Propulsion System' in str(x['Incident']) else x['Incident'], axis=1)
print(f"Done 14 - {time.time() - t0}")
df['Incident'] = df.apply(lambda x: 'Transportation' if 'Transportation' in str(x['Incident']) else x['Incident'], axis=1)
print(f"Done 15 - {time.time() - t0}")
df['Incident'] = df.apply(lambda x: 'Track' if 'Track' in str(x['Incident']) else x['Incident'], axis=1)
print(f"Done 16 - {time.time() - t0}")

Done 1 - 25.062106132507324
Done 2 - 47.205235719680786
Done 3 - 67.83324337005615
Done 4 - 89.05551028251648
Done 5 - 110.16583466529846
Done 6 - 131.3070158958435
Done 7 - 153.34630250930786
Done 8 - 174.56921195983887
Done 9 - 195.15211629867554
Done 10 - 216.59174919128418
Done 11 - 237.85706281661987
Done 12 - 258.5308952331543
Done 13 - 280.22164249420166
Done 14 - 301.5684823989868
Done 15 - 322.3609390258789
Done 16 - 343.9757921695709


In [9]:
## Reduction from 156 to 46 categories in 'Incident' column.
df_incident = df.groupby('Incident').count().sort_values('Identity', ascending=False)
len(df_incident)

45

In [10]:
incidents = df['Incident'].unique()
incidents

array(['Other', 'Door Problems', 'Operator', 'Speed Control',
       'Mechanical', 'Transportation', 'Escalator/Elevator Incident',
       'Ice / Snow Related Problems', 'Passenger',
       'Injured or ill Customer', 'Train Control - VOBC',
       'Signals or Related Components Failure ', nan, 'General Delay',
       'Track', 'Equipment - No Trouble Found', 'Timeout',
       'Weather Reports / Related Delays',
       'Collector Booth Alarm Activated', 'Signals - Train Stops',
       'S/E/C Department Other', 'Disorderly Patron', 'Assault',
       'Training Department Related Delays',
       'Signal Problem - No Trouble', 'Transit Control Related Problems',
       'Air Conditioning', 'Rail Cars & Shops Opr. Error',
       'Unsanitary Vehicle', 'Station Other',
       'Fire/Smoke Plan B - Source TTC', 'Station Stairway Incident ',
       'Emergency Alarm Station Activation', 'Body',
       'Insulated Joint Related Problem', 'T&S Other', 'Storm Trains',
       'Injured Employee', 'ATC Pro

In [11]:
df_incident.iloc[:, :1]

Unnamed: 0_level_0,Identity
Incident,Unnamed: 1_level_1
Mechanical,209130
General Delay,76131
Utilized Off Route,75690
Late Leaving Garage,73980
Investigation,53800
Diversion,27685
Speed Control,20585
Operator,19917
Injured or ill Customer,16799
Emergency Services,11644


# Incidents need to be further dropped down from 45 to 4

- need to further drill it down
    - 1. Mechanical
    - 2. Route Problems
    - 3. Investigation / Emergency
    - 4. General

In [12]:
df.groupby(["Incident"]).count()["Identity"].sort_values(ascending=False).to_frame()

Unnamed: 0_level_0,Identity
Incident,Unnamed: 1_level_1
Mechanical,209130
General Delay,76131
Utilized Off Route,75690
Late Leaving Garage,73980
Investigation,53800
Diversion,27685
Speed Control,20585
Operator,19917
Injured or ill Customer,16799
Emergency Services,11644


In [13]:
mechanical_list = [
    "Mechanical", "Disorderly Patron", "Door Problems"
    , "Equipment - No Trouble Found", "Unsanitary Vehicle", "Timeout"
    , "Insulated Joint Related Problem", "Body", "Train Control - VOBC"
    , "Air Conditioning", "Rail Cars & Shops Opr. Error"
    , "Escalator/Elevator Incident", "Transportation", "Track"
    , "Station Stairway Incident ", "Signals - Train Stops", "ATC Project"
    , "Signals or Related Components Failure ", "Signal Problem - No Trouble"
    , "Collector Booth Alarm Activated"
]
route_list = [
    "Utilized Off Route", "Late Leaving Garage", "Diversion"
    , "Held By", "Vision", "Transit Control Related Problems"
    , "Storm Trains"
]
invest_emerg_list = [
    "Investigation", "Emergency Services", "Fire/Smoke Plan B - Source TTC"
    , "Emergency Alarm Station Activation"
]
general_list = [
    "General Delay", "Other", "Station Other"
    , "S/E/C Department Other", "Weather Reports / Related Delays", "Training Department Related Delays"
    , "T&S Other", "Ice / Snow Related Problems", "Speed Control", "Operator", "Injured Employee"
    , "Injured or ill Customer", "Passenger", "Disorderly Patron", "Assault"
]

In [14]:
## Grouping the Incidents together
t0 = time.time()
df['Incident'] = df.apply(lambda x: 'Mechanical' if str(x['Incident']) in mechanical_list else x['Incident'], axis=1)
print(f"Done 1 - {time.time() - t0}")

df['Incident'] = df.apply(lambda x: 'Route Problems' if str(x['Incident']) in route_list else x['Incident'], axis=1)
print(f"Done 2 - {time.time() - t0}")

df['Incident'] = df.apply(lambda x: 'Investigation / Emergency' if str(x['Incident']) in invest_emerg_list else x['Incident'], axis=1)
print(f"Done 3 - {time.time() - t0}")

df['Incident'] = df.apply(lambda x: 'General' if str(x['Incident']) in general_list else x['Incident'], axis=1)
print(f"Done 4 - {time.time() - t0}")

df["Incident"] = df["Incident"].fillna("General")
print(f"Done 5 - {time.time() - t0}")

Done 1 - 20.116268634796143
Done 2 - 40.269365310668945
Done 3 - 62.01537585258484
Done 4 - 82.88525724411011
Done 5 - 82.9261462688446


In [15]:
df.groupby(["Incident"]).count()["Identity"].sort_values(ascending=False).to_frame()

Unnamed: 0_level_0,Identity
Incident,Unnamed: 1_level_1
Mechanical,236312
Route Problems,187636
General,163140
Investigation / Emergency,66678


In [16]:
# check the distribution across subway, bus and streetcar
df.groupby(["Identity", "Incident"]).count()["DateTime"].to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,DateTime
Identity,Incident,Unnamed: 2_level_1
Bus,General,66475
Bus,Investigation / Emergency,51823
Bus,Mechanical,169222
Bus,Route Problems,165951
Streetcar,General,5944
Streetcar,Investigation / Emergency,13621
Streetcar,Mechanical,37828
Streetcar,Route Problems,20999
Subway,General,90721
Subway,Investigation / Emergency,1234


# Add the hour in 24 hour format

In [17]:
# Get the nearest hour and add it to the dataset
df["Hour"] = df["DateTime"].dt.round('h').dt.hour

In [18]:
# Reorder the columns and take a subset
df = df.loc[:, ["Identity", "DateTime", "Date", "Hour",
                "BusinessQuarter", "MonthName", "WeekOfYear", "DayName",
                "HolidayName", "Incident", "Temp (°C)",
                "Precip. Amount (mm)", "Wind Dir (10s deg)", "Wind Spd (km/h)", "Stn Press (kPa)",
                "Min Delay", "Min Gap"]]

df.sample(5)

Unnamed: 0,Identity,DateTime,Date,Hour,BusinessQuarter,MonthName,WeekOfYear,DayName,HolidayName,Incident,Temp (°C),Precip. Amount (mm),Wind Dir (10s deg),Wind Spd (km/h),Stn Press (kPa),Min Delay,Min Gap
339209,Bus,2016-08-19 09:08:00,2016-08-19,9,3,August,33,Friday,Not A Holiday,Route Problems,24.97,0.0,20.5,10.0,100.06,42,56
474458,Bus,2018-06-17 14:48:00,2018-06-17,15,2,June,24,Sunday,Not A Holiday,Investigation / Emergency,29.4,0.0,14.0,13.5,100.1,16,32
345296,Bus,2016-09-16 08:25:00,2016-09-16,8,3,September,37,Friday,Not A Holiday,Mechanical,16.93,0.0,6.0,18.5,100.83,7,14
595066,Streetcar,2015-09-02 01:13:00,2015-09-02,1,3,September,36,Wednesday,Not A Holiday,Mechanical,20.97,0.0,25.5,6.5,100.28,9,18
230888,Bus,2015-03-01 08:37:00,2015-03-01,9,1,March,9,Sunday,Not A Holiday,Mechanical,-7.63,0.0,34.0,3.0,101.54,5,10


In [19]:
# Write the updated finalized data to a .csv
df.to_csv(os.path.join(Final_Data_Directory, "2014-2019-TTC-Delay-Data-Incident.csv"), index=False)