# Jupyter Notebook Purpose

- With the finalized dataset we have some attributes that need to be "managed"
    - without Latitude and Longitude Information I am unable to link Kaggle data to TTC data - Location data will need to be dropped
    - "Incident" information will need to be re-categorized

## Group 2 Members

- 1. Melissa Hartwick - [Email](mailto:mhartwic@uwaterloo.ca)
- 2. McKinleigh Needham - [Email](mailto:mjneedha@uwaterloo.ca)
- 3. Daniel Adam Cebula  - [Email](mailto:dacebula@uwaterloo.ca)
- 4. Athithian Selvadurai - [Email](mailto:a6selvad@uwaterloo.ca)
- 5. Aravind Kakarala - [Email](mailto:akakaral@uwaterloo.ca)
- 6. Allan Sales - [Email](mailto:asales@uwaterloo.ca)

In [1]:
import pandas as pd
import numpy as np
import os
import requests  # simple HTTP library for Python
import io        # Tool for working with streams (Input/Ouput data)
import matplotlib.pyplot as plt
import glob
import time

%matplotlib inline

In [2]:
# Get FilePaths
cwd = os.getcwd()

Final_Data_Directory = os.path.join(cwd, "FINAL_DATA")

Final_Data = os.path.join(Final_Data_Directory, "2014-2019-TTC-Delay-Data.csv")

In [3]:
# Load the Final Data into DataFrames
df = pd.read_csv(Final_Data, parse_dates=["DateTime", "Date"], low_memory=False)
df.sample(5)

Unnamed: 0,Identity,DateTime,Date,BusinessQuarter,MonthName,WeekOfYear,DayName,HolidayName,Time,Route,Location,Incident,Temp (°C),Precip. Amount (mm),Wind Dir (10s deg),Wind Spd (km/h),Stn Press (kPa),Min Delay,Min Gap
486095,Bus,2018-08-11 14:00:00,2018-08-11,3,August,32,Saturday,Not A Holiday,14:00:00,81,Thorneclife Park route,General Delay,25.77,0.0,15.0,11.5,100.1,10,1
556181,Bus,2019-08-22 19:16:00,2019-08-22,3,August,34,Thursday,Not A Holiday,19:16:00,80,Sherway Gardens,Investigation,20.63,0.0,17.5,14.5,100.01,30,60
305358,Bus,2016-03-09 14:04:00,2016-03-09,1,March,10,Wednesday,Not A Holiday,14:04:00,29,Dufferin at Auburn,Mechanical,15.93,0.0,22.0,27.5,99.72,5,10
306055,Bus,2016-03-13 11:23:00,2016-03-13,1,March,10,Sunday,Not A Holiday,11:23:00,29,Wilson Station,Mechanical,6.43,0.0,8.5,28.0,100.6,6,12
195722,Bus,2014-10-08 06:01:00,2014-10-08,4,October,41,Wednesday,Not A Holiday,06:01:00,25,Don Mills Duncan Mills,Mechanical,10.77,0.0,24.0,18.0,99.08,5,13


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 653766 entries, 0 to 653765
Data columns (total 19 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   Identity             653766 non-null  object        
 1   DateTime             653766 non-null  datetime64[ns]
 2   Date                 653766 non-null  datetime64[ns]
 3   BusinessQuarter      653766 non-null  int64         
 4   MonthName            653766 non-null  object        
 5   WeekOfYear           653766 non-null  int64         
 6   DayName              653766 non-null  object        
 7   HolidayName          653766 non-null  object        
 8   Time                 653766 non-null  object        
 9   Route                653766 non-null  object        
 10  Location             652758 non-null  object        
 11  Incident             650159 non-null  object        
 12  Temp (°C)            653741 non-null  float64       
 13  Precip. Amount

In [5]:
df.columns

Index(['Identity', 'DateTime', 'Date', 'BusinessQuarter', 'MonthName',
       'WeekOfYear', 'DayName', 'HolidayName', 'Time', 'Route', 'Location',
       'Incident', 'Temp (°C)', 'Precip. Amount (mm)', 'Wind Dir (10s deg)',
       'Wind Spd (km/h)', 'Stn Press (kPa)', 'Min Delay', 'Min Gap'],
      dtype='object')

# Grouping the "Incident" column

In [6]:
# Remove Location as no Latitude or Longitude data is contained within
df = df.drop(columns=['Location'])

# Remove Route as it is too unique to be used for Machine Learning
df = df.drop(columns=['Route'])

In [7]:
# lets look at incidents
df_incident = df.groupby('Incident').count().sort_values('Identity', ascending=False)
incidents = df['Incident'].unique()
len(incidents)

156

In [8]:
## Grouping as "Other" the categories with less occurrence
others = []
for i in df_incident['Identity'].index:
    if df_incident['Identity'].loc[i] < 200:
        others.append(i)
len(others)

84

In [9]:
## Grouping the similar Incidents
t0 = time.time()
df['Incident'] = df.apply(lambda x: 'Other' if str(x['Incident']) in others else x['Incident'], axis=1)
print(f"Done 1 - {time.time() - t0}")
df['Incident'] = df.apply(lambda x: 'Other' if str(x['Incident']) == 'Miscellaneous Other' else x['Incident'], axis=1)
print(f"Done 2 - {time.time() - t0}")
df['Incident'] = df.apply(lambda x: 'General Delay' if str(x['Incident']) == 'Miscellaneous General Delays' else x['Incident'], axis=1)
print(f"Done 3 - {time.time() - t0}")
df['Incident'] = df.apply(lambda x: 'Speed Control' if 'Speed' in str(x['Incident']) else x['Incident'], axis=1)
print(f"Done 4 - {time.time() - t0}")
df['Incident'] = df.apply(lambda x: 'Late Leaving Garage' if 'Late Leaving Garage' in str(x['Incident']) else x['Incident'], axis=1)
print(f"Done 5 - {time.time() - t0}")
df['Incident'] = df.apply(lambda x: 'Door Problems' if 'Door Problems' in str(x['Incident']) else x['Incident'], axis=1)
print(f"Done 6 - {time.time() - t0}")
df['Incident'] = df.apply(lambda x: 'Door Problems' if 'TR Cab Doors' in str(x['Incident']) else x['Incident'], axis=1)
print(f"Done 7 - {time.time() - t0}")
df['Incident'] = df.apply(lambda x: 'Operator' if 'Operator' in str(x['Incident']) else x['Incident'], axis=1)
print(f"Done 8 - {time.time() - t0}")
df['Incident'] = df.apply(lambda x: 'Injured or ill Customer' if 'Injured or ill Customer' in str(x['Incident']) else x['Incident'], axis=1)
print(f"Done 9 - {time.time() - t0}")
df['Incident'] = df.apply(lambda x: 'Assault' if 'Assault' in str(x['Incident']) else x['Incident'], axis=1)
print(f"Done 10 - {time.time() - t0}")
df['Incident'] = df.apply(lambda x: 'Passenger' if 'Passenger' in str(x['Incident']) else x['Incident'], axis=1)
print(f"Done 11 - {time.time() - t0}")
df['Incident'] = df.apply(lambda x: 'Assault' if 'Robbery' in str(x['Incident']) else x['Incident'], axis=1)
print(f"Done 12 - {time.time() - t0}")
df['Incident'] = df.apply(lambda x: 'Mechanical' if 'Brakes' in str(x['Incident']) else x['Incident'], axis=1)
print(f"Done 13 - {time.time() - t0}")
df['Incident'] = df.apply(lambda x: 'Mechanical' if 'Propulsion System' in str(x['Incident']) else x['Incident'], axis=1)
print(f"Done 14 - {time.time() - t0}")
df['Incident'] = df.apply(lambda x: 'Transportation' if 'Transportation' in str(x['Incident']) else x['Incident'], axis=1)
print(f"Done 15 - {time.time() - t0}")
df['Incident'] = df.apply(lambda x: 'Track' if 'Track' in str(x['Incident']) else x['Incident'], axis=1)
print(f"Done 16 - {time.time() - t0}")

Done 1 - 26.28025722503662
Done 2 - 47.69956636428833
Done 3 - 72.49035406112671
Done 4 - 94.38361120223999
Done 5 - 115.29226183891296
Done 6 - 142.53995633125305
Done 7 - 162.95998167991638
Done 8 - 188.9448802471161
Done 9 - 211.11766624450684
Done 10 - 238.16004252433777
Done 11 - 260.3653643131256
Done 12 - 281.61242294311523
Done 13 - 302.7820506095886
Done 14 - 323.49203085899353
Done 15 - 345.3429825305939
Done 16 - 368.86204171180725


In [10]:
## Reduction from 156 to 46 categories in 'Incident' column.
df_incident = df.groupby('Incident').count().sort_values('Identity', ascending=False)
len(df_incident)

45

In [11]:
incidents = df['Incident'].unique()
incidents

array(['Other', 'Door Problems', 'Operator', 'Speed Control',
       'Mechanical', 'Transportation', 'Escalator/Elevator Incident',
       'Ice / Snow Related Problems', 'Passenger',
       'Injured or ill Customer', 'Train Control - VOBC',
       'Signals or Related Components Failure ', nan, 'General Delay',
       'Track', 'Equipment - No Trouble Found', 'Timeout',
       'Weather Reports / Related Delays',
       'Collector Booth Alarm Activated', 'Signals - Train Stops',
       'S/E/C Department Other', 'Disorderly Patron', 'Assault',
       'Training Department Related Delays',
       'Signal Problem - No Trouble', 'Transit Control Related Problems',
       'Air Conditioning', 'Rail Cars & Shops Opr. Error',
       'Unsanitary Vehicle', 'Station Other',
       'Fire/Smoke Plan B - Source TTC', 'Station Stairway Incident ',
       'Emergency Alarm Station Activation', 'Body',
       'Insulated Joint Related Problem', 'T&S Other', 'Storm Trains',
       'Injured Employee', 'ATC Pro

In [12]:
df_incident.iloc[:, :1]

Unnamed: 0_level_0,Identity
Incident,Unnamed: 1_level_1
Mechanical,209130
General Delay,76131
Utilized Off Route,75690
Late Leaving Garage,73980
Investigation,53800
Diversion,27685
Speed Control,20585
Operator,19917
Injured or ill Customer,16799
Emergency Services,11644


In [13]:
# Get the nearest hour and add it to the dataset
df["Hour"] = df["DateTime"].dt.round('h').dt.hour

In [14]:
# Reorder the columns and take a subset
df = df.loc[:, ["Identity", "DateTime", "Date", "Hour",
                "BusinessQuarter", "MonthName", "WeekOfYear", "DayName",
                "HolidayName", "Incident", "Temp (°C)",
                "Precip. Amount (mm)", "Wind Dir (10s deg)", "Wind Spd (km/h)", "Stn Press (kPa)",
                "Min Delay", "Min Gap"]]

df.sample(5)

Unnamed: 0,Identity,DateTime,Date,Hour,BusinessQuarter,MonthName,WeekOfYear,DayName,HolidayName,Incident,Temp (°C),Precip. Amount (mm),Wind Dir (10s deg),Wind Spd (km/h),Stn Press (kPa),Min Delay,Min Gap
91266,Subway,2018-04-03 17:07:00,2018-04-03,17,2,April,14,Tuesday,Not A Holiday,Speed Control,2.63,0.4,7.0,31.0,99.27,0,0
361883,Bus,2016-11-26 12:21:00,2016-11-26,12,4,November,47,Saturday,Not A Holiday,Utilized Off Route,6.03,0.0,31.5,13.5,100.64,7,14
88426,Subway,2018-01-30 08:52:00,2018-01-30,9,1,January,5,Tuesday,Not A Holiday,Other,-10.73,0.0,32.0,21.5,101.07,7,9
250578,Bus,2015-06-02 07:45:00,2015-06-02,8,2,June,23,Tuesday,Not A Holiday,General Delay,13.37,0.0,6.5,11.5,100.91,15,10
30811,Subway,2014-11-10 22:21:00,2014-11-10,22,4,November,46,Monday,Not A Holiday,Escalator/Elevator Incident,7.87,0.0,7.0,17.0,99.62,0,0


In [15]:
# Write the updated finalized data to a .csv
df.to_csv(os.path.join(Final_Data_Directory, "2014-2019-TTC-Delay-Data-Incident.csv"), index=False)