In [181]:
import pandas as pd
import numpy as np
import requests
import math
import seaborn as sns
import matplotlib.pyplot as plt
from numpy import nan
from pandas.io.json import json_normalize

In [42]:
# Creating a list of bus stops from already extracted data. This list will be used to make api calls

df_stops = pd.read_csv('Bus_Stops.csv')
df_stops.number= df_stops.number.astype('Int64').astype('str')
stop_numbers_list = list(set(df_stops['number']))
number_of_stops = len(stop_numbers_list)
number_of_stops

5085

# We will be using the same methodlogy of batch calling for extracting stop schedule data as well

In [110]:
# For 5085 bus stops we would need 5087/100 number of chunks
chunks_required = round(number_of_stops/100)

# Creating a function which will take stops list and chunks required as input
# The function with create sub-lists of 100 elements in the whole stops list of 5085 stops

def divide_chunks(stop_numbers_list,chunks_required):
    
    # Loops starts with 0 till number of stops and hops on to every another 100th element of the list
    
    for i in range(0, number_of_stops, 100):
        
        # Indexes from 100 till another 100th element
        # For example indexing at first loop i=0 is [0:100]
        # For second loop the i=100 and indexing is [100:200]
        yield stop_numbers_list[i:i + 100]


In [111]:
# Calling the function and giving it object name bus_stop_chunks
# Bus_stop_chunks in coverted list of chunks of list with 100 elements each

bus_stops_chunks = list(divide_chunks(stop_numbers_list, chunks_required))
print(f'Number of bus_stops chunks from original bus stops list: {len(bus_stops_chunks)}')
print(f'Number of elements in each chunk: {len(bus_stops_chunks[0])}')

Number of bus_stops chunks from original bus stops list: 51
Number of elements in each chunk: 100


In [112]:
# Creating a function which will go inside in each bus_stop chunk list elements
# And make an api call for each bus stop

# This dataframe will capture schedule for each bus variant
df_each_stop_schedule_per_bus = pd.DataFrame()

# This data frame will populate with all bus variants schedule for each stop
df_each_stop_schedule = pd.DataFrame()

# Initializing with an empty pandas dataframe which will populate with with a chunk of 100 stops
df_stop_schedule_per_chunk = pd.DataFrame()



# This function takes in chunk_index(what chunk of the whole list to work on)
# Then for each chunk makes api calls of bus stops in that chunk

def bus_stop_schedule_chunkwise(chunk_index):
    
    # Declaring the initialized dataframe variable df_stop_schedule as a global variable
    # Without it the function will consider it to be local variable and not work
    global df_stop_schedule_per_chunk
    global df_each_stop_schedule
    global df_each_stop_schedule_per_bus
    
    # Running a for loop to make api call for each stop in the bus_stops_chunks list
    for stop in bus_stops_chunks[chunk_index]:
        if stop != None:
            try:
                response = requests.get(f"https://api.winnipegtransit.com/v3/stops/{stop}/schedule.json?api-key=wPOIZFdePaV6BdttKD7C")
                json_format = response.json()['stop-schedule']

             # Recording the information got into a temporary frame df_each_stop_schedule
                #df_each_stop_schedule = pd.DataFrame()
                #df_each_stop_schedule['name'] = 'Nothing'
                #df_each_stop_schedule['count'] = None
                #df_each_stop_schedule['Stop_Number'] = stop
                #df_each_stop_schedule.loc[len(df_each_stop_schedule.index)] = [None,None, f'{stop}']
                
                # If the search returns no errors or any empty value
                if json_format['route-schedules'] == []:
                    df_each_stop_schedule['Stop_Number'] = stop
                   
                    #print(f'Empty row added for variant {stop}')
                    
                # If the api search returns a blank search, the stop will be recorded in the dataframe with empty values
                else:
                    for i in range(len(json_format['route-schedules'])):
                        df_each_stop_schedule_per_bus = pd.json_normalize(json_format['route-schedules'][i]['scheduled-stops'])
                        df_each_stop_schedule_per_bus['Stop_Number'] = stop
                        df_each_stop_schedule = pd.concat([df_each_stop_schedule,df_each_stop_schedule_per_bus], ignore_index=True)
                    

            except:
                print(f'{stop} returned error in search. No values found for this variant')

            # Simulatenous concatanetion of each stop's info in df_stop_feature_chunk
            # This dataframe will contain records for each chunk of the big 5085 stops list
            df_stop_schedule_per_chunk = pd.concat([df_stop_schedule_per_chunk,df_each_stop_schedule], ignore_index=True)

In [113]:
# Initializng with index 0

chunk_index=0

# Initializing empty dataframe which will contain bus stops schedule of all bus stops
# This is our final outcome 

df_stop_schedule =pd.DataFrame()

# Creating a function to repeadtly execute above created function bus_stop_schedule_chunkwise
# On every run chunk_index increases by 1 
# We will repeadtly make calls of this function in 60 secods intervals

# This function merely runs the previous function and takes in index number of chunk list
def bus_stops_schedule(index):
    
    # Declaring chunk_index and chunks_required variable as global as its referenced before the function
    global chunk_index
    global chunks_required
    global df_stop_schedule
    global df_stop_schedule_per_chunk
    
    # The loop will run until the index_count reaches equal to chunks_required
    # which is number of bus stops in our list divided by 100 elements' chunks)
    if chunk_index <= chunks_required:
        
        # Using a for loop to call run bus_stop_schedule_chunkwise for each index position
        
        for i in [chunk_index]:
            bus_stop_schedule_chunkwise(chunk_index=chunk_index)
            
            # For each loop, index value be incremented by 1 which becomes input when function is run again
            chunk_index+=1
            
            # Concatenating with main dataframe
            df_stop_schedule = pd.concat([df_stop_schedule, df_stop_schedule_per_chunk], ignore_index=True)
        
        print(f'Chunk index {i} of Bus Stops List Added to Dataframe at {time.ctime()}')
        print(f'Number of Stops added {df_stop_schedule_per_chunk.Stop_Number.nunique()}')
        print(f'Number of rows of dataframe after addition: {len(df_stop_schedule)}' )
        
        # Emptying the last chunk's dataframe on everyloop to avoid data duplication
        df_stop_schedule_per_chunk = pd.DataFrame()
        
    else:
        print('Every stop feature put into the dataframe. Process complete')
        

In [114]:
# Importing schedule library for scheduling tasks

import schedule
import time

# Creating a schedule object which will run the bus_stops_features function at every 60 seconds intreval
# We also have defined the index to start with, i.e 0 which will initialized above
schedule.clear()
schedule.every(60).seconds.do(bus_stops_schedule, index=chunk_index)

Every 60 seconds do bus_stops_schedule(index=0) (last run: [never], next run: 2022-12-26 11:19:03)

In [115]:
# Running a while loop which will keep the scheduled job running at 1 minute intervals
# The scheduled job will run until the all chunks in the bus stops list are looped and entered into dataframe


# The loop will run until chunk_index(which is incrementing by 1 on every loop) reaches the equal to (number of chunks + 1)
# We have 41 chunks_required (sub-lists of 100 bus stops) in our whole bus stops list(of 5085 stops)
# We have incremented chunks_required+1 to enable the while loop to enter else condition when the index reaches the limit
while chunk_index <= (chunks_required-1):
    
    # If statement runs until all chunk indexes are looped, which in our case is 41
    if chunk_index <= chunks_required:
        schedule.run_pending()
    
    # Once the index reaches 49 and increments to 50, it means we have looped all the chunks 
    # And our data from 51 chunks of bus stops in the bus stop list has been entered to DataFrame df_stop_features
    else:
        # Clearing pending schedules after job is done
        schedule.clear()
        print('Scheduler Stopped. Process Complete ')
        break
        

Chunk index 0 of Bus Stops List Added to Dataframe at Mon Dec 26 11:19:16 2022
Number of Stops added 99
Number of rows of dataframe after addition: 15619
Chunk index 1 of Bus Stops List Added to Dataframe at Mon Dec 26 11:20:27 2022
Number of Stops added 102
Number of rows of dataframe after addition: 62137
Chunk index 2 of Bus Stops List Added to Dataframe at Mon Dec 26 11:21:39 2022
Number of Stops added 102
Number of rows of dataframe after addition: 147915
Chunk index 3 of Bus Stops List Added to Dataframe at Mon Dec 26 11:22:51 2022
Number of Stops added 100
Number of rows of dataframe after addition: 272480
Chunk index 4 of Bus Stops List Added to Dataframe at Mon Dec 26 11:24:04 2022
Number of Stops added 102
Number of rows of dataframe after addition: 433503
Chunk index 5 of Bus Stops List Added to Dataframe at Mon Dec 26 11:25:16 2022
Number of Stops added 110
Number of rows of dataframe after addition: 642910
Chunk index 6 of Bus Stops List Added to Dataframe at Mon Dec 26 11

In [153]:
# Dropping rows where no information on estimated arrival time is available

df_stop_schedule.dropna(subset=['times.arrival.scheduled', 'times.arrival.estimated'], inplace=True)

In [179]:
# Creating Delay_Time (in seconds column) which shows how much delay a bus will be 

df_stop_schedule['Delay_Time'] = (pd.to_datetime(df_stop_schedule['times.arrival.estimated']) - pd.to_datetime(df_stop_schedule['times.arrival.scheduled'])).dt.total_seconds()

In [182]:
df_stop_schedule['IsDelayed'] = np.where(df_stop_schedule['Delay_Time']!= 0, True, False)

In [183]:
df_stop_schedule

Unnamed: 0,Stop_Number,key,cancelled,times.departure.scheduled,times.departure.estimated,variant.key,variant.name,bus.key,bus.bike-rack,bus.wifi,times.arrival.scheduled,times.arrival.estimated,Delay_Time,IsDelayed
6,30009,22722298-10,false,2022-12-26T11:59:12,2022-12-26T11:59:12,18-1-A,North Main-Corydon to Assiniboine Park,101.0,false,false,2022-12-26T11:59:12,2022-12-26T11:59:12,0.0,False
7,30009,22722299-10,false,2022-12-26T12:44:12,2022-12-26T12:44:12,18-1-A,North Main-Corydon to Assiniboine Park,625.0,false,false,2022-12-26T12:44:12,2022-12-26T12:44:12,0.0,False
10,30009,22722298-10,false,2022-12-26T11:59:12,2022-12-26T11:59:12,18-1-A,North Main-Corydon to Assiniboine Park,101.0,false,false,2022-12-26T11:59:12,2022-12-26T11:59:12,0.0,False
11,30009,22722299-10,false,2022-12-26T12:44:12,2022-12-26T12:44:12,18-1-A,North Main-Corydon to Assiniboine Park,625.0,false,false,2022-12-26T12:44:12,2022-12-26T12:44:12,0.0,False
12,40492,22722129-78,false,2022-12-26T11:20:52,2022-12-26T11:20:52,20-0-W,Watt-Academy to Watt & Leighton,712.0,false,false,2022-12-26T11:20:52,2022-12-26T11:20:52,0.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45316242,10061,22722436-20,false,2022-12-26T12:53:19,2022-12-26T12:53:19,16-0-M,Selkirk-Osborne to Tyndall Park via Manitoba,875.0,false,false,2022-12-26T12:53:19,2022-12-26T12:53:19,0.0,False
45316243,10061,22722438-18,false,2022-12-26T13:14:19,2022-12-26T13:14:19,16-0-B,Selkirk-Osborne to Tyndall Park via Burrows,735.0,false,false,2022-12-26T13:14:19,2022-12-26T13:14:19,0.0,False
45316244,10061,22722434-20,false,2022-12-26T13:35:19,2022-12-26T13:35:19,16-0-M,Selkirk-Osborne to Tyndall Park via Manitoba,184.0,false,false,2022-12-26T13:35:19,2022-12-26T13:35:19,0.0,False
45316245,10061,22722430-18,false,2022-12-26T13:56:19,2022-12-26T13:56:19,16-0-B,Selkirk-Osborne to Tyndall Park via Burrows,611.0,false,false,2022-12-26T13:56:19,2022-12-26T13:56:19,0.0,False


In [189]:
# Since our main dataframe turns out to be 45M rows, we are randomly picing 100000 records of stop schedule data for our machine learning problem
# The intention is to save compute time due to time and hardware constraints

df_stop_schedule_sample = df_stop_schedule.sample(n=100000,  random_state=1)
df_stop_schedule_sample

Unnamed: 0,Stop_Number,key,cancelled,times.departure.scheduled,times.departure.estimated,variant.key,variant.name,bus.key,bus.bike-rack,bus.wifi,times.arrival.scheduled,times.arrival.estimated,Delay_Time,IsDelayed
36874149,10830,22722773-2,false,2022-12-26T13:03:37,2022-12-26T13:03:37,11-1-D,Portage-Kildonan to North Kildonan via Donwood,191.0,false,false,2022-12-26T13:03:37,2022-12-26T13:03:37,0.0,False
33432680,60776,22721899-46,false,2022-12-26T13:22:56,2022-12-26T13:22:56,38-1-F,Salter to The Forks,419.0,true,false,2022-12-26T13:22:56,2022-12-26T13:22:56,0.0,False
5770192,50794,22722776-49,false,2022-12-26T13:09:00,2022-12-26T13:09:00,11-0-C,Portage-Kildonan to Crestview via,192.0,false,false,2022-12-26T13:09:00,2022-12-26T13:09:00,0.0,False
33425055,60776,22721004-31,false,2022-12-26T11:41:12,2022-12-26T11:41:12,650-1-S,McGillivray to Seel Station,180.0,false,false,2022-12-26T11:41:12,2022-12-26T11:41:12,0.0,False
43848935,30809,22721224-29,false,2022-12-26T12:03:08,2022-12-26T12:03:08,89-1-K,to Kildonan Place,808.0,false,false,2022-12-26T12:03:08,2022-12-26T12:03:08,0.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11542654,30563,22722767-95,false,2022-12-26T12:19:16,2022-12-26T12:19:16,11-1-D,Portage-Kildonan to North Kildonan via Donwood,192.0,false,false,2022-12-26T12:19:16,2022-12-26T12:19:16,0.0,False
81959,50430,22722131-78,false,2022-12-26T12:43:52,2022-12-26T12:43:52,20-0-W,Watt-Academy to Watt & Leighton,129.0,false,false,2022-12-26T12:43:52,2022-12-26T12:43:52,0.0,False
21374590,10022,22722774-15,false,2022-12-26T12:11:34,2022-12-26T12:11:34,11-0-S,Portage-Kildonan to St. Charles via,300.0,false,false,2022-12-26T12:11:34,2022-12-26T12:11:34,0.0,False
17955781,10118,22722297-73,false,2022-12-26T12:03:00,2022-12-26T12:03:00,18-1-A,North Main-Corydon to Assiniboine Park,345.0,false,false,2022-12-26T12:03:00,2022-12-26T12:03:00,0.0,False


In [190]:
df_stop_schedule_sample.IsDelayed.value_counts()

False    93558
True      6442
Name: IsDelayed, dtype: int64

In [192]:
df_stop_schedule_sample.to_csv('Stop_Schedule.csv', index=False)