# Transit Ridership Dashboard GTFS Refactor

- Migrating the transit ridership dashboard, originally created in Fall 2022, to warehouse v2.

In [1]:
pip install shared_utils

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
# NOTE(review): judging by its name, this caps the bytes a BigQuery query may
# scan (800_000_000_000 = 800 GB). It is set before the shared_utils import,
# presumably because the value is read at import time — TODO confirm.
os.environ["CALITP_BQ_MAX_BYTES"] = str(800_000_000_000)

import branca
import folium
from shared_utils import gtfs_utils_v2

from siuba import *
import pandas as pd
import geopandas as gpd 

import datetime as dt
import time

In [3]:
# Creating function for datacheck
def analyze_dataset(df):
    """Print a quick data-check summary of ``df``.

    Emits, in order: the shape, the column names, the Python type of
    ``df``, the per-column dtypes, any fully duplicated rows, and a rich
    preview of the first three rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to inspect. It is only read, never modified.

    Returns
    -------
    None
        All results are written with ``print`` / ``display``.
    """
    # Shape of the table.
    n_rows, n_cols = df.shape
    print(f"Number of rows: {n_rows}, Number of columns: {n_cols}", end="\n\n")

    # Column labels as a plain list.
    col_labels = df.columns.to_list()
    print(f"Column names: \n{col_labels}\n")

    # Type of the container itself (e.g. DataFrame vs. a subclass).
    print("Data type:")
    print(type(df), end="\n\n")

    # Per-column dtypes.
    print("Data types:")
    print(df.dtypes, end="\n\n")

    # Rows that repeat an earlier row across every column.
    dup_mask = df.duplicated()
    dupes = df[dup_mask]
    if dupes.empty:
        print("No duplicate rows found \n")
    else:
        print("Duplicate rows:")
        print(dupes, end="\n\n")

    # Preview of the first 3 rows. `display` is not imported in this
    # notebook; the function relies on the notebook kernel providing it.
    print("First 3 rows:")
    display(df.head(3))
    print()

    

## Calculating trips per stop for weekdays, Saturdays, and Sundays

In [4]:
# Service dates carried over from the 2022 data used in the previous
# dashboard: one weekday, one Saturday and one Sunday from the same week.
analysis_dt = dt.date(year=2022, month=6, day=1)   # weekday (a Wednesday)
analysis_sat = dt.date(year=2022, month=6, day=4)  # Saturday
analysis_sun = dt.date(year=2022, month=6, day=5)  # Sunday

# NOTE(review): presumably operator identifiers; not referenced by the
# cells visible in this part of the notebook — TODO confirm it is still needed.
analysis_operator_list = [182, 293, 208]

### Extracting Feed Data for Weekdays 

In [5]:
# Schedule feeds for the weekday analysis date; per the helper's name, each
# feed is paired with its GTFS dataset name.
feeds = gtfs_utils_v2.schedule_daily_feed_to_gtfs_dataset_name(selected_date=analysis_dt)

In [6]:
# Data check on the feeds table: shape, columns, dtypes, duplicates, preview.
analyze_dataset(feeds)

Number of rows: 200, Number of columns: 9

Column names: 
['key', 'date', 'feed_key', 'feed_timezone', 'base64_url', 'gtfs_dataset_key', 'name', 'regional_feed_type', 'type']

Data type:
<class 'pandas.core.frame.DataFrame'>

Data types:
key                           object
date                  datetime64[ns]
feed_key                      object
feed_timezone                 object
base64_url                    object
gtfs_dataset_key              object
name                          object
regional_feed_type            object
type                          object
dtype: object

No duplicate rows found 

First 3 rows:


Unnamed: 0,key,date,feed_key,feed_timezone,base64_url,gtfs_dataset_key,name,regional_feed_type,type
0,9cafaafdcdd4905e7dc0c778df79d3d3,2022-06-01,5efaa2460085a481db5dfbf57ae78187,America/Los_Angeles,aHR0cDovL2RhdGEudHJpbGxpdW10cmFuc2l0LmNvbS9ndG...,033def72a7efd143935ee6fdb0c675af,Kern Schedule,,schedule
1,6fda4cb881fff0b6ab9b88154468a9df,2022-06-01,c50220b8622624dfa0c5c22859b14694,America/Los_Angeles,aHR0cDovL2RhdGEudHJpbGxpdW10cmFuc2l0LmNvbS9ndG...,039d199710a017218a60054a4660b3b9,Humboldt Schedule,,schedule
2,3295ba500a119aaf8eb6ca1088297977,2022-06-01,1b77ef49f5bc70038cbf15e4f5f98477,America/Los_Angeles,aHR0cHM6Ly9naXRodWIuY29tL0xBQ01UQS9sb3MtYW5nZW...,04580bce11b70bda8a43f58fe6468e36,Compton Schedule,,schedule





### Selecting a specific agency: LA Metro

In [7]:
def select_by_agency(df, column, value, regex=True):
    """Return the rows of ``df`` whose ``column`` text contains ``value``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to filter. It is not modified.
    column : str
        Name of the string column to search.
    value : str
        Text to look for. It is treated as a regular expression unless
        ``regex`` is False, so it matches anywhere in the cell
        (e.g. ``'LA Metro'`` matches every name containing that substring).
    regex : bool, default True
        Pass False to match ``value`` literally, which is safer when it may
        contain regex metacharacters such as ``(`` or ``.``.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the matching rows, original index preserved.
    """
    # na=False counts missing values in `column` as non-matches. Without it
    # the mask contains NaN and boolean indexing raises
    # "Cannot mask with non-boolean array containing NA / NaN values".
    mask = df[column].str.contains(value, regex=regex, na=False)
    return df[mask].copy()

In [8]:
# Substring match on the feed name, so every feed whose name contains
# 'LA Metro' is kept.
metrofeeds = select_by_agency(feeds, 'name', 'LA Metro')

In [9]:
# Feed keys of the selected LA Metro feeds, as a plain Python list.
metrofeed_list = metrofeeds["feed_key"].tolist()

### Getting Trip Data for LA Metro for Weekdays

In [10]:
# Trips on the weekday analysis date, restricted to the LA Metro feed keys.
metro_trips = gtfs_utils_v2.get_trips(selected_date=analysis_dt, operator_feeds=metrofeed_list)

In [11]:
# Columns of the trips table that the rest of the analysis keeps.
trip_cols = [
    "name",
    "gtfs_dataset_key",
    "feed_key",
    "trip_id",
    "route_id",
    "route_type",
]

In [12]:
# Keep only the columns in trip_cols. This re-binds metro_trips, so the full
# trips table is no longer reachable under this name afterwards.
metro_trips = metro_trips[trip_cols]

In [13]:
# Data check on the trimmed trips table.
analyze_dataset(metro_trips)

Number of rows: 13834, Number of columns: 6

Column names: 
['name', 'gtfs_dataset_key', 'feed_key', 'trip_id', 'route_id', 'route_type']

Data type:
<class 'pandas.core.frame.DataFrame'>

Data types:
name                object
gtfs_dataset_key    object
feed_key            object
trip_id             object
route_id            object
route_type          object
dtype: object

No duplicate rows found 

First 3 rows:


Unnamed: 0,name,gtfs_dataset_key,feed_key,trip_id,route_id,route_type
0,LA Metro Bus Schedule,a09d454d421c1ef01e77b9e94aad0f5e,06d1f3ac2b0ae5e74424edbbfefa19ed,DSE-HG-1650-DS-004,DSE-HG,3
1,LA Metro Bus Schedule,a09d454d421c1ef01e77b9e94aad0f5e,06d1f3ac2b0ae5e74424edbbfefa19ed,DSE-HG-1650-DS-008,DSE-HG,3
2,LA Metro Bus Schedule,a09d454d421c1ef01e77b9e94aad0f5e,06d1f3ac2b0ae5e74424edbbfefa19ed,DSE-HG-1650-DS-002,DSE-HG,3





### Getting Stop Times Data for Weekdays

In [14]:
# Stop times for the LA Metro feeds on the weekday analysis date, limited to
# the trips in metro_trips. Despite its name, metro_stops holds the result of
# get_stop_times, not a stops table.
# NOTE(review): get_df=True presumably asks for a materialized DataFrame
# rather than a lazy query — TODO confirm against gtfs_utils_v2.
metro_stops = gtfs_utils_v2.get_stop_times(selected_date=analysis_dt, operator_feeds=metrofeed_list, 
                                           trip_df = metro_trips, get_df= True)

  sqlalchemy.util.warn(
  sqlalchemy.util.warn(
  sqlalchemy.util.warn(
  sqlalchemy.util.warn(


In [15]:
# Columns of the stop_times table that the rest of the analysis keeps.
stop_cols = [
    "key",
    "_gtfs_key",
    "feed_key",
    "trip_id",
    "stop_id",
]

In [16]:
# Keep only the columns in stop_cols. This re-binds metro_stops to the subset.
metro_stops= metro_stops[stop_cols]

In [17]:
# Data check on the trimmed stop_times table.
analyze_dataset(metro_stops)

Number of rows: 793697, Number of columns: 5

Column names: 
['key', '_gtfs_key', 'feed_key', 'trip_id', 'stop_id']

Data type:
<class 'pandas.core.frame.DataFrame'>

Data types:
key          object
_gtfs_key    object
feed_key     object
trip_id      object
stop_id      object
dtype: object

No duplicate rows found 

First 3 rows:


Unnamed: 0,key,_gtfs_key,feed_key,trip_id,stop_id
0,bd8866407ade3e81eddbbe7ebe6e6e86,8bda692f5f1c6b0fd99b05990845b189,bc633d97886566eba81d46f81b0573b6,56064321,80109
1,1449dbb7bbac7b5e9ae8356de963b096,6a13fb05e30ae71fa772fdfb445c11d6,bc633d97886566eba81d46f81b0573b6,55217353,80427
2,7add6f0ae1867b4c6a23e0c32fd3e4ce,881d52b8a1aa430179c64b3d52155d74,bc633d97886566eba81d46f81b0573b6,55217353,80426





### Joining Stop Times and Trip Data 

In [18]:
# Attach trip attributes (name, dataset key, route) to every stop_times row.
# The left join keeps all stop_times rows; trip_id is paired with feed_key in
# the join key so that trips are matched within their own feed.
metro_joined = metro_stops.merge(
    metro_trips,
    how="left",
    on=["trip_id", "feed_key"],
)

In [20]:
# Preview the joined stop_times + trips table.
metro_joined.head(3)

Unnamed: 0,key,_gtfs_key,feed_key,trip_id,stop_id,name,gtfs_dataset_key,route_id,route_type
0,bd8866407ade3e81eddbbe7ebe6e6e86,8bda692f5f1c6b0fd99b05990845b189,bc633d97886566eba81d46f81b0573b6,56064321,80109,LA Metro Rail Schedule,683682f3c501f1edd5954f0a1f2a4d12,801,0
1,1449dbb7bbac7b5e9ae8356de963b096,6a13fb05e30ae71fa772fdfb445c11d6,bc633d97886566eba81d46f81b0573b6,55217353,80427,LA Metro Rail Schedule,683682f3c501f1edd5954f0a1f2a4d12,804,0
2,7add6f0ae1867b4c6a23e0c32fd3e4ce,881d52b8a1aa430179c64b3d52155d74,bc633d97886566eba81d46f81b0573b6,55217353,80426,LA Metro Rail Schedule,683682f3c501f1edd5954f0a1f2a4d12,804,0


### Finding the Number of Weekday Trips per Stop

In [27]:
# Per (route_type, stop_id) pair:
#   ntrips_weekday - number of distinct trip_id values
#   ntrips_route   - number of distinct route_id values (a count of routes,
#                    despite the "ntrips" prefix)
# NOTE(review): feed_key is not part of the grouping; confirm that stop_id and
# trip_id values cannot collide across the LA Metro feeds.
metrotrips_weekday = (
    metro_joined
    .groupby(["route_type", "stop_id"])
    .agg(
        ntrips_weekday=("trip_id", "nunique"),
        ntrips_route=("route_id", "nunique"),
    )
    .reset_index()
)

In [28]:
# Display the weekday trip counts per route_type and stop_id.
metrotrips_weekday

Unnamed: 0,route_type,stop_id,ntrips_weekday,ntrips_route
0,0,80101,197,1
1,0,80102,94,1
2,0,80105,197,1
3,0,80106,197,1
4,0,80107,197,1
...,...,...,...,...
12246,3,9992,99,1
12247,3,9993,99,1
12248,3,9994,99,1
12249,3,9996,99,1
