In [1]:
import os
os.environ["CALITP_BQ_MAX_BYTES"] = str(800_000_000_000)

import branca
import folium
from shared_utils import gtfs_utils_v2

from siuba import *
import pandas as pd
import geopandas as gpd 

import datetime as dt
import time

In [2]:
# Quick data-check helper for a DataFrame
def analyze_dataset(df):
    """Print a short diagnostic summary of ``df``.

    Reports, in order: the shape, the column names, the Python type of the
    object, the per-column dtypes, any rows flagged by ``df.duplicated()``
    (default arguments), and finally displays the first three rows.

    Parameters
    ----------
    df : pandas.DataFrame (or subclass)
        Table to summarise.

    Returns
    -------
    None
        Everything is written to stdout / the notebook display.
    """
    # Shape
    n_rows, n_cols = df.shape
    print(f"Number of rows: {n_rows}, Number of columns: {n_cols}")
    print()

    # Column names
    print(f"Column names: \n{df.columns.to_list()}\n")

    # Type of the container itself, followed by a blank line
    print("Data type:", type(df), "", sep="\n")

    # Per-column dtypes, followed by a blank line
    print("Data types:", df.dtypes, "", sep="\n")

    # Rows that repeat an earlier row
    dupes = df.loc[df.duplicated()]
    if dupes.empty:
        print("No duplicate rows found \n")
    else:
        print("Duplicate rows:", dupes, "", sep="\n")

    # First 3 rows (rich display in the notebook)
    print("First 3 rows:")
    display(df.head(3))
    print()

In [3]:
# Analysis dates: a Wednesday, then the following Saturday and Sunday
analysis_dt = dt.date(year=2022, month=6, day=1)
analysis_sat = dt.date(year=2022, month=6, day=4)
analysis_sun = dt.date(year=2022, month=6, day=5)

In [49]:
#Function to fetch feeds, trips, stops and stoptimes data from gtfs warehouse v2
def get_feeds_trips_stops_data(selected_agency, selected_date):
    """Fetch feeds, trips, stop times and stops for one agency on one date.

    Parameters
    ----------
    selected_agency : str
        Pattern matched against the feeds' ``name`` column using
        ``Series.str.contains`` (pandas treats it as a regular expression
        by default).
    selected_date : datetime.date
        Service date, forwarded unchanged to the ``gtfs_utils_v2`` helpers.

    Returns
    -------
    tuple
        ``(feed_data, trips_data, stoptimes_data, stop_locations_gdf)``.
        ``feed_data`` holds the matching feed rows; the other three tables
        are the per-feed query results for *every* matching feed,
        concatenated with a fresh index.

    Raises
    ------
    ValueError
        If no feed name matches ``selected_agency`` on ``selected_date``.
    """
    trip_cols = ["name", "gtfs_dataset_key", "feed_key", "trip_id", "route_id", "route_type"]
    stoptimes_cols = ["key", "_gtfs_key", "feed_key", "trip_id", "stop_id"]
    stop_cols = ["key", "service_date", "feed_key", "stop_id", "geometry", "stop_name", "stop_code", "location_type"]

    feeds = gtfs_utils_v2.schedule_daily_feed_to_gtfs_dataset_name(selected_date=selected_date)

    # na=False: a feed with a missing name counts as "no match" instead of
    # putting NaN into the boolean mask (which pandas refuses to index with).
    agency_mask = feeds["name"].str.contains(selected_agency, na=False)
    feed_data = feeds[agency_mask].copy()

    if feed_data.empty:
        raise ValueError(f"No feeds data found for agency '{selected_agency}' on {selected_date}.")

    feed_key_list = feed_data["feed_key"].tolist()

    # Trips: one query per feed, stacked into a single table.
    trips_per_feed = [
        gtfs_utils_v2.get_trips(selected_date=selected_date, operator_feeds=[feed_key])[trip_cols]
        for feed_key in feed_key_list
    ]
    trips_data = pd.concat(trips_per_feed, ignore_index=True)

    # Stop times: the append must happen inside the loop, otherwise only the
    # last feed's stop times survive.
    stoptimes_per_feed = []
    for feed_key in feed_key_list:
        # Hand over only this feed's trips so the result cannot pick up other
        # feeds' stop times.
        # NOTE(review): confirm how get_stop_times combines operator_feeds
        # and trip_df.
        feed_trips = trips_data[trips_data["feed_key"] == feed_key]
        stoptimes = gtfs_utils_v2.get_stop_times(
            selected_date=selected_date,
            operator_feeds=[feed_key],
            trip_df=feed_trips,
            get_df=True,
        )[stoptimes_cols]
        stoptimes_per_feed.append(stoptimes)
    stoptimes_data = pd.concat(stoptimes_per_feed, ignore_index=True)

    # Stops: collect every feed's stops and concatenate once, rather than
    # concatenating only the final feed after the loop.
    stops_per_feed = [
        gtfs_utils_v2.get_stops(selected_date=selected_date, operator_feeds=[feed_key])[stop_cols]
        for feed_key in feed_key_list
    ]
    stop_locations_gdf = pd.concat(stops_per_feed, ignore_index=True)

    return feed_data, trips_data, stoptimes_data, stop_locations_gdf
    

In [50]:
# Pull LA Metro feeds, trips, stop times and stops for each analysis date
(
    metro_weekday_feeds_data,
    metro_weekday_trips_data,
    metro_weekday_stoptimes_data,
    metro_weekday_stopsdata,
) = get_feeds_trips_stops_data('LA Metro', analysis_dt)
(
    metro_sun_feeds_data,
    metro_sun_trips_data,
    metro_sun_stoptimes_data,
    metro_sun_stopsdata,
) = get_feeds_trips_stops_data('LA Metro', analysis_sun)
(
    metro_sat_feeds_data,
    metro_sat_trips_data,
    metro_sat_stoptimes_data,
    metro_sat_stopsdata,
) = get_feeds_trips_stops_data('LA Metro', analysis_sat)

  sqlalchemy.util.warn(
  sqlalchemy.util.warn(
  sqlalchemy.util.warn(
  sqlalchemy.util.warn(
  sqlalchemy.util.warn(
  sqlalchemy.util.warn(
  sqlalchemy.util.warn(
  sqlalchemy.util.warn(
  sqlalchemy.util.warn(
  sqlalchemy.util.warn(
  sqlalchemy.util.warn(
  sqlalchemy.util.warn(
  sqlalchemy.util.warn(
  sqlalchemy.util.warn(
  sqlalchemy.util.warn(
  sqlalchemy.util.warn(
  sqlalchemy.util.warn(
  sqlalchemy.util.warn(
  sqlalchemy.util.warn(
  sqlalchemy.util.warn(
  sqlalchemy.util.warn(
  sqlalchemy.util.warn(
  sqlalchemy.util.warn(
  sqlalchemy.util.warn(
  sqlalchemy.util.warn(
  sqlalchemy.util.warn(
  sqlalchemy.util.warn(
  sqlalchemy.util.warn(
  sqlalchemy.util.warn(
  sqlalchemy.util.warn(


In [30]:
#Function to merge stops and trips and aggregate by route type and stop id
def merge_and_aggregate_stops_and_trips(stoptimes_data, trips_data, agg_prefix=''):
    """Left-join stop times to trips, then count trips and routes per stop.

    Parameters
    ----------
    stoptimes_data : pandas.DataFrame
        Left table of the join; must carry ``trip_id``, ``feed_key`` and
        ``stop_id``.
    trips_data : pandas.DataFrame
        Right table of the join; must carry ``trip_id``, ``feed_key``,
        ``route_id`` and ``route_type``.
    agg_prefix : str, default ''
        Text appended verbatim to ``n_trips_`` / ``n_routes_`` to name the
        two output count columns.

    Returns
    -------
    pandas.DataFrame
        One row per ``(route_type, stop_id)`` with the number of distinct
        ``trip_id`` and ``route_id`` values.
    """
    stop_trip_pairs = stoptimes_data.merge(
        trips_data,
        how="left",
        on=["trip_id", "feed_key"],
        suffixes=("_stops", "_trips"),
    )

    # Output column name -> (source column, aggregation)
    count_specs = {
        f"n_trips_{agg_prefix}": ("trip_id", "nunique"),
        f"n_routes_{agg_prefix}": ("route_id", "nunique"),
    }
    per_stop = stop_trip_pairs.groupby(["route_type", "stop_id"])
    return per_stop.agg(**count_specs).reset_index()

In [33]:
# Aggregate trips/routes per stop for each day type.
# Arguments are passed by keyword so stop times and trips land in the
# parameters the function expects (stop times on the left of the join).
metro_stop_times_weekday = merge_and_aggregate_stops_and_trips(
    stoptimes_data=metro_weekday_stoptimes_data,
    trips_data=metro_weekday_trips_data,
    agg_prefix='weekday_',
)
metro_stop_times_sunday = merge_and_aggregate_stops_and_trips(
    stoptimes_data=metro_sun_stoptimes_data,
    trips_data=metro_sun_trips_data,
    agg_prefix='sunday_',
)
metro_stop_times_saturday = merge_and_aggregate_stops_and_trips(
    stoptimes_data=metro_sat_stoptimes_data,
    trips_data=metro_sat_trips_data,
    agg_prefix='saturday_',
)

In [36]:
#Function to merge stoptimes for weekday, saturday and sunday
def merge_stoptimes(stoptimes_weekday, stoptimes_sat, stoptimes_sun):
    """Outer-join the three per-day aggregates on route type and stop id.

    Parameters
    ----------
    stoptimes_weekday, stoptimes_sat, stoptimes_sun : pandas.DataFrame
        Tables that each carry ``route_type`` and ``stop_id`` columns.

    Returns
    -------
    pandas.DataFrame
        Weekday table outer-joined with the Saturday table and then the
        Sunday table; key combinations missing from a table get NaN in that
        table's columns.
    """
    join_keys = ["route_type", "stop_id"]
    combined = stoptimes_weekday
    for day_table in (stoptimes_sat, stoptimes_sun):
        combined = combined.merge(day_table, on=join_keys, how="outer")
    return combined


In [38]:
# Combine the weekday, Saturday and Sunday aggregates into one table
metro_stoptimes_all = merge_stoptimes(
    stoptimes_weekday=metro_stop_times_weekday,
    stoptimes_sat=metro_stop_times_saturday,
    stoptimes_sun=metro_stop_times_sunday,
)