## The purpose of this notebook is to combine the delay reports from (https://www.toronto.ca/city-government/data-research-maps/open-data/) for subway, bus and streetcar delays.

The delay data will be all merged on the year-month-day (in that format) and combined with the climate data I have cleaned previously.

In [1]:
# import dependencies
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from datetime import datetime
import time
import os

import warnings
warnings.filterwarnings("ignore")

folders = ["TTC_Bus_Delay", "TTC_Subway_Delay", "TTC_Streetcar_Delay"]

pathway1 = os.path.dirname(os.path.abspath('Clean_Merge_Delay_Data.ipynb'))
xlsx_subway = os.path.join(pathway1, folders[1])

In [2]:
def sheetsTTC_Subway(xlsx_file):
    
    """ 
    Reads in an Excel file and concatenates each sheet into 
    a returned pandas DataFrame.    
    
    Note:
        - Requires: pandas
        - Requires: Each worksheet must have exact matching
                    n columns, column order, column labels
                    and column dtypes.
    """
    
    xls = pd.ExcelFile(xlsx_file)
    
    out_df = pd.DataFrame()
    
    for sheet in xls.sheet_names:
        
        df = pd.read_excel(xls, sheet_name = sheet)
        
        df = df[[
            #'Report Date', 'Route', 'Time', 'Day', 'Location', 'Incident', 'Min Delay', 'Min Gap', 'Direction', 'Vehicle'
            'Date', 'Time', 'Day', 'Station', 'Code', 'Min Delay', 'Min Gap', 'Bound', 'Line', 'Vehicle'
        ]]
        
        out_df = pd.concat([out_df, df])
        
    return out_df

In [3]:
def excelDate(excel_time):
    
    """
    Converts excel datetime float format to pandas datetime
    
    """
    
    return pd.to_datetime('1900-01-01') + pd.to_timedelta(excel_time, 'D')

In [4]:
year1417 = os.path.join(xlsx_subway, 'Subway_2014_042017.xlsx')
sub1417 = sheetsTTC_Subway(year1417)

year17 = os.path.join(xlsx_subway, 'Subway_2017.xlsx')
sub17 = sheetsTTC_Subway(year17)

year18 = os.path.join(xlsx_subway, 'Subway_2018.xlsx')
sub18 = sheetsTTC_Subway(year18)

year19 = os.path.join(xlsx_subway, 'Subway_2019.xlsx')
sub19 = sheetsTTC_Subway(year19)

sub19['Date'] = excelDate(sub19['Date'])
sub18['Date'] = excelDate(sub18['Date'])
sub17['Date'] = excelDate(sub17['Date'])
sub1417['Date'] = excelDate(sub1417['Date'])

sub19['Year'] = 2019
sub18['Year'] = 2018
sub17['Year'] = 2017
sub1417['Year']  = sub1417['Date'].dt.year

subway = pd.concat([sub1417, sub17])
subway = pd.concat([subway, sub18])
subway = pd.concat([subway, sub19])

codes_link = os.path.join(xlsx_subway, 'Log_Codes.csv')
codes = pd.read_csv(codes_link)
subway = pd.merge(subway, codes, on='Code', how='left')

subway_srt = os.path.join(xlsx_subway, 'ttc_subway_srt.csv')
subway.to_csv(subway_srt)

In [5]:
len(subway)

109326

In [6]:
subway.sample(5)

Unnamed: 0,Date,Time,Day,Station,Code,Min Delay,Min Gap,Bound,Line,Vehicle,Year,Description
64666,2017-02-04,18:27,Thursday,WARDEN STATION,MUSC,0,0,E,BD,5029,2017,Miscellaneous Speed Control
90490,2018-06-04,01:28,Saturday,SHEPPARD WEST STATION,MUIS,0,0,,YU,0,2018,Injured or ill Customer (In Station) - Transpo...
34178,2015-08-23,20:00,Friday,FINCH STATION,MUIS,0,0,,YU,0,2015,Injured or ill Customer (In Station) - Transpo...
57780,2016-10-07,05:53,Wednesday,GREENWOOD STATION,TUDOE,11,0,W,BD,5254,2016,Doors Open in Error
91150,2018-06-16,00:39,Thursday,OSSINGTON STATION,TUSC,0,0,E,BD,5281,2018,Operator Overspeeding


In [7]:
temp_link = os.path.join(xlsx_subway, 'Completely_Averaged.csv')
temp = pd.read_csv(temp_link)




print(subway.dtypes)
print(temp.dtypes)

Date           datetime64[ns]
Time                   object
Day                    object
Station                object
Code                   object
Min Delay               int64
Min Gap                 int64
Bound                  object
Line                   object
Vehicle                 int64
Year                    int64
Description            object
dtype: object
Date/Time                  object
Year                        int64
Month                       int64
Day                         int64
Max Temp (°C)             float64
Min Temp (°C)             float64
Mean Temp (°C)            float64
Total Rain (mm)           float64
Total Snow (cm)           float64
Total Precip (mm)         float64
Snow on Grnd (cm)         float64
Spd of Max Gust (km/h)    float64
dtype: object


In [8]:
temp.head()

Unnamed: 0,Date/Time,Year,Month,Day,Max Temp (°C),Min Temp (°C),Mean Temp (°C),Total Rain (mm),Total Snow (cm),Total Precip (mm),Snow on Grnd (cm),Spd of Max Gust (km/h)
0,2014-01-01,2014,1,1,-9.4,-14.9,-12.2,0.0,0.0,0.0,7.4,32.0
1,2014-01-02,2014,1,2,-15.3,-19.7,-17.5,0.0,0.4,0.8,7.2,38.0
2,2014-01-03,2014,1,3,-7.6,-23.6,-15.6,0.0,0.0,0.0,7.6,34.0
3,2014-01-04,2014,1,4,-0.1,-9.6,-4.9,0.0,0.9,1.0,7.6,43.0
4,2014-01-05,2014,1,5,0.8,-2.6,-1.0,1.4,6.9,12.9,8.8,31.0


In [9]:
temp["Date/Time"] = pd.to_datetime(temp["Date/Time"], format = "%Y-%m-%d")
temp = temp.rename(index=str, columns={"Date/Time": "Date"})

temp.dtypes

Date                      datetime64[ns]
Year                               int64
Month                              int64
Day                                int64
Max Temp (°C)                    float64
Min Temp (°C)                    float64
Mean Temp (°C)                   float64
Total Rain (mm)                  float64
Total Snow (cm)                  float64
Total Precip (mm)                float64
Snow on Grnd (cm)                float64
Spd of Max Gust (km/h)           float64
dtype: object

In [10]:
subway.head()

Unnamed: 0,Date,Time,Day,Station,Code,Min Delay,Min Gap,Bound,Line,Vehicle,Year,Description
0,2014-01-03,00:21,Wednesday,VICTORIA PARK STATION,MUPR1,55,60,W,BD,5111,2014,Priority One - Train in Contact With Person
1,2014-01-03,02:06,Wednesday,HIGH PARK STATION,SUDP,3,7,W,BD,5001,2014,Disorderly Patron
2,2014-01-03,02:40,Wednesday,SHEPPARD STATION,MUNCA,0,0,,YU,0,2014,
3,2014-01-03,03:10,Wednesday,LANSDOWNE STATION,SUDP,3,8,W,BD,5116,2014,Disorderly Patron
4,2014-01-03,03:20,Wednesday,BLOOR STATION,MUSAN,5,10,S,YU,5386,2014,Unsanitary Vehicle


In [11]:
temp.head()

Unnamed: 0,Date,Year,Month,Day,Max Temp (°C),Min Temp (°C),Mean Temp (°C),Total Rain (mm),Total Snow (cm),Total Precip (mm),Snow on Grnd (cm),Spd of Max Gust (km/h)
0,2014-01-01,2014,1,1,-9.4,-14.9,-12.2,0.0,0.0,0.0,7.4,32.0
1,2014-01-02,2014,1,2,-15.3,-19.7,-17.5,0.0,0.4,0.8,7.2,38.0
2,2014-01-03,2014,1,3,-7.6,-23.6,-15.6,0.0,0.0,0.0,7.6,34.0
3,2014-01-04,2014,1,4,-0.1,-9.6,-4.9,0.0,0.9,1.0,7.6,43.0
4,2014-01-05,2014,1,5,0.8,-2.6,-1.0,1.4,6.9,12.9,8.8,31.0


In [12]:
subway_temp = pd.merge(subway, temp, on='Date', how='left')

subway_srt_temp = os.path.join(xlsx_subway, 'ttc_subway_srt_temp.csv')
subway_temp.to_csv(subway_srt_temp)

In [13]:
hol_link = os.path.join(xlsx_subway, 'holidays.csv')
holiday = pd.read_csv(hol_link)
holiday.dtypes

date       object
holiday    object
dtype: object

In [14]:
holiday["date"] = pd.to_datetime(holiday["date"], format = "%Y-%m-%d")
holiday = holiday.rename(index=str, columns={"date": "Date"})

holiday.dtypes

Date       datetime64[ns]
holiday            object
dtype: object

In [15]:
holiday.head()

Unnamed: 0,Date,holiday
0,2012-01-02,New Year's Day
1,2012-02-20,Family Day
2,2012-04-06,Good Friday
3,2012-05-21,Victoria Day
4,2012-07-02,Canada Day


In [17]:
subway_temp_hol = pd.merge(subway_temp, holiday, on='Date', how='left')

subway_srt_temp_hol = os.path.join(xlsx_subway, 'ttc_subway_master.csv')
subway_temp_hol.to_csv(subway_srt_temp_hol)

In [None]:
temp_link = os.path.join(xlsx_subway, 'Completely_Averaged.csv')
codes = pd.read_csv(temp_link)
subway_temp = pd.merge(subway, codes, on='Date', how='left')

subway_srt_temp = os.path.join(xlsx_subway, 'ttc_subway_srt_temp.csv')
subway_temp.to_csv(subway_srt_temp)

In [None]:
# I cannot iterate through it normally as the files are excel and follow an abnormal naming convention

# folder structure
folders = ["TTC_Bus_Delay", "TTC_Subway_Delay", "TTC_Streetcar_Delay"]

years = ["2014", "2015", "2016", "2017", "2018", "2019"]

months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "July", 
         "Aug", "Sept", "Oct", "Nov", "Dec"]

sheets = []

# daily weather station readouts from Jan 1 till Dec 31 from 2014 - 2015
bus_xlsx_list = ["Bus 2014.xlsx", 
                  "Bus 2015.xlsx", 
                  "Bus 2016.xlsx", 
                  "Bus 2017.xlsx", 
                  "Bus_2018.xlsx", 
                  "Bus_2019.xlsx"]

bus_xlsx_list = ["Bus 2014.xlsx", 
                  "Bus 2015.xlsx", 
                  "Bus 2016.xlsx", 
                  "Bus 2017.xlsx", 
                  "Bus_2018.xlsx", 
                  "Bus_2019.xlsx"]

streetcar_xlsx_list = ["Streetcar 2014.xlsx", 
                  "Streetcar 2015.xlsx", 
                  "Streetcar 2016.xlsx", 
                  "Streetcar 2017.xlsx", 
                  "Streetcar_2018.xlsx", 
                  "Streetcar_2019.xlsx"]

# remember 2 sheets ("1" and "2")
# , need to concacanate them, as the logs have them combined as well
subway_delay_code_key = "Subway & SRT Log Codes-Modified.xlsx"

subway_xlsx_list = ["Subway & SRT Logs (Jan01_14 to April30_17).xlsx", 
                  "Subway & SRT Logs (May 2017).xlsx", 
                  "SubwayDelay201706.xlsx", 
                  "SubwaySRTLogs201707.xlsx", 
                  "SubwaySRTLogs201708.xlsx", 
                  "SubwaySRTLogs201709.xlsx", 
                   "SubwaySRTLogs201710.xlsx", 
                  "Subway_&_SRT_Logs_(November_2017).xlsx", 
                   "Subway_&_SRT_Logs_(December_2017).xlsx", 
                  "Subway_SRT_Logs(January 2018).xlsx", 
                   "Subway&SRT_Logs_February_2018.xlsx", 
                   "Subway&SRT_Logs_March_2018.xlsx", 
                  "Subway&SRT_Logs_April_2018.xlsx", 
                   "Subway&SRT_Logs_May_2018.xlsx", 
                  "Subway_SRT_Logs(June2018).xlsx", 
                   "Subway_SRT_Logs(July_2018).xlsx", 
                  "Subway_&_SRT_Logs_(August_2018).xlsx", 
                   "Subway_&_SRT_Logs_(September_2018).xlsx", 
                  "Subway_SRT_Logs(October 2018).xlsx", 
                   "Subway_&_SRT_Logs_November_2018.xlsx", 
                  "Subway_&_SRT_Logs_December_2018.xlsx", 
                   "Subway_&_SRT_Logs_January_2019.xlsx", 
                  "Subway_&_SRT_Logs_February2019.xlsx", 
                    "Subway_&_SRT_Logs_March2019.xlsx", 
                   "Subway_&_SRT_Logs_April2019.xlsx"
                   ]

# output the cleaned delay readouts
clean_list = ["TTC_Bus_Clean.csv", "TTC_Streetcar_Clean.csv",
            "TTC_Subway_Clean.csv"]

output_folder = "Cleaned_Delay"

pathway1 = os.path.dirname(os.path.abspath('Clean_Merge_Delay_Data.ipynb'))
xlsx_to_load = os.path.join(pathway1, folders[0], bus_xlsx_list[0])