## Data Snippets

In [55]:
import geopandas as gpd
import numpy as np
import pandas as pd
import shared_utils
from calitp_data_analysis.sql import to_snakecase

In [56]:
from calitp_data_analysis.tables import tbls
from calitp_data_analysis.sql import query_sql
from siuba import *

In [57]:
# Notebook-wide pandas display settings.
pd.set_option("display.max_columns", 100)  # show up to 100 columns
pd.set_option("display.float_format", "{:.2f}".format)  # floats with 2 decimals
pd.options.display.max_rows = None  # no row truncation
pd.options.display.max_colwidth = None  # no truncation of long cell text

### Exercise 2

In [58]:
# GCS folder that holds the CSVs read in this section.
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/bus_service_increase/"
FILE_NAME = "ntd_metrics_2019.csv"

# Build the full URI first, then read the CSV into a DataFrame.
metrics_uri = GCS_FILE_PATH + FILE_NAME
metrics = pd.read_csv(metrics_uri)

In [59]:
# metrics.to_parquet("./data/exercise_2_3_ntd_metrics_2019.parquet")

In [60]:
# FILE_NAME is re-pointed at the vehicles CSV; GCS_FILE_PATH is reused from
# the metrics cell, so that cell must have run first.
FILE_NAME = "ntd_vehicles_2019.csv"
vehicles_uri = GCS_FILE_PATH + FILE_NAME
vehicles = pd.read_csv(vehicles_uri)


In [61]:
# vehicles.to_parquet("./data/exercise_2_ntd_vehicles_2019.parquet")

### Exercise 4

In [62]:
# Operator label -> feed_key for the feeds used in this exercise.
FEED_KEY_BY_OPERATOR = {
    "Caltrain": "25c6505166c01099b2f6f2de173e20b9",
    "Merced": "52639f09eb535f75b33d2c6a654cb89e",
    "Lodi": "e1d7185ffb6f73f7d373787910f0bf30",
}

# Plain list of feed_keys (dict insertion order), used to filter the stops table.
FEEDS = list(FEED_KEY_BY_OPERATOR.values())

In [63]:
# Stops for the feeds listed in FEEDS, keeping only id, coordinates and name.
# `filter`, `select`, `arrange`, `collect` and `_` are the siuba names brought
# in by `from siuba import *`, so `filter` here is not the Python builtin.
dim_stops = tbls.mart_gtfs.dim_stops()

stops = (
    dim_stops
    >> filter(_.feed_key.isin(FEEDS))
    >> select(_.feed_key, _.stop_id, _.stop_lat, _.stop_lon, _.stop_name)
    # Sort so the collected rows come back in a stable order.
    >> arrange(_.feed_key, _.stop_id, _.stop_lat, _.stop_lon)
    >> collect()
)



In [64]:
# 40% of the number of stops, shown for reference when choosing a sample size.
len(stops) * 0.4

309.6

In [65]:
# Draw a fixed-size random sample of 250 stops.
# random_state pins the draw so a fresh Restart & Run All selects the same
# rows every time; without it each run produces a different sample.
stops2 = stops.sample(n=250, random_state=42)

In [66]:
# Number of sampled stops per feed.
stops2["feed_key"].value_counts()

52639f09eb535f75b33d2c6a654cb89e    161
e1d7185ffb6f73f7d373787910f0bf30     65
25c6505166c01099b2f6f2de173e20b9     24
Name: feed_key, dtype: int64

In [67]:
# stops2.to_parquet("./data/exercise_4_stops_sample.parquet")

### Exercise 5

In [86]:
# Ask shared_utils for the feed-to-organization table on the selected date
# (get_df=True), then keep one row per distinct (feed_key, name) pair.
feeds_with_orgs = shared_utils.gtfs_utils_v2.schedule_daily_feed_to_organization(
    selected_date="2023-03-15",
    get_df=True,
)
feeds_to_names = feeds_with_orgs[["feed_key", "name"]].drop_duplicates()

In [87]:
# (rows, columns) of the de-duplicated feed_key/name lookup.
feeds_to_names.shape

(202, 2)

In [88]:
# Bare expression renders the whole lookup; display.max_rows is set to None
# earlier in the notebook, so no rows are truncated.
# NOTE(review): presumably shown in full to browse names for OPERATORS — confirm.
feeds_to_names

Unnamed: 0,feed_key,name
0,5e1be3854cae4470eccf4a1323526e3a,Playa Vista Schedule
2,8086f0d4ae362ae545a4b96a41587e09,Tahoe Transportation District Schedule
3,5ccf86b4334e6c6db3eee03b9f65372c,Bay Area 511 Sonoma-Marin Area Rail Transit Schedule
4,a9f08db1bc889c72499a14d329cd7260,LA DOT Schedule
5,5ef522e5d899e686fd1ba63de6103b25,WeHo Schedule
6,062563b11ac99ddec6d3bec6f613b78d,Morro Bay Schedule
7,163d634eb0bc5456f2c7576ad9096ff2,Taft Schedule
8,37a065f0ed0a167af84633eadd0513e4,Lake Schedule
9,272603c5381418b4a0b2b0872a3067ef,Oregon POINT
10,a7271743dd85c8f153460147b66d63fb,Sage Stage Schedule


In [89]:
# Values of the `name` column to keep for this exercise.
OPERATORS = [
    "Alhambra Schedule",
    "San Diego Schedule",
    "Big Blue Bus Schedule",
]

# feed_keys whose `name` is one of OPERATORS, as a plain list.
is_selected_operator = feeds_to_names["name"].isin(OPERATORS)
SUBSET_FEEDS = feeds_to_names.loc[is_selected_operator, "feed_key"].tolist()

In [90]:
# Show the feed_keys matched for the OPERATORS names.
SUBSET_FEEDS

['71d91d70ad6c07b1f9b0a618ffceef93',
 'a7ba6f075198e9bf9152fab6c7faf0f6',
 '4f77ef02b983eccc0869c7540f98a7d0']

In [91]:
# Stops parquet for 2023-03-15 in GCS, read as a GeoDataFrame.
STOPS_GCS_URI = "gs://calitp-analytics-data/data-analyses/rt_delay/compiled_cached_views/stops_2023-03-15.parquet"
stops_gcs = gpd.read_parquet(STOPS_GCS_URI)

In [92]:
# (rows, columns) of the full stops table before filtering to SUBSET_FEEDS.
stops_gcs.shape

(83529, 16)

In [93]:
# Peek at one randomly chosen row to see the columns and geometry format.
stops_gcs.sample(n=1)

Unnamed: 0,feed_key,stop_id,stop_key,stop_name,route_type_0,route_type_1,route_type_2,route_type_3,route_type_4,route_type_5,route_type_6,route_type_7,route_type_11,route_type_12,missing_route_type,geometry
9648,ae93a53469371fb3f9059d2097f66842,6999,0a916a8dbb9a91a576c3ca2dd8d6f3ae,Citrus @ Central Wb Mid,,,,24.0,,,,,,,,POINT (261150.984 -436107.362)


In [94]:
# Keep only stops whose feed_key is in SUBSET_FEEDS; renumber rows from 0.
in_subset = stops_gcs["feed_key"].isin(SUBSET_FEEDS)
stops_gcs_filtered = stops_gcs.loc[in_subset].reset_index(drop=True)

In [95]:
# Number of stops per feed after filtering.
stops_gcs_filtered["feed_key"].value_counts()

a7ba6f075198e9bf9152fab6c7faf0f6    4241
4f77ef02b983eccc0869c7540f98a7d0     905
71d91d70ad6c07b1f9b0a618ffceef93      80
Name: feed_key, dtype: int64

In [97]:
# 40% of the filtered stop count, shown for reference when sizing the sample.
len(stops_gcs_filtered) * 0.4

2090.4

In [98]:
# Down-sample the filtered stops to 2,100 rows and renumber from 0.
# random_state pins the draw so the exported exercise file contains the same
# rows on every Restart & Run All; without it each run writes a different file.
# Note: this overwrites its own input, so re-running only this cell re-samples
# the already-sampled 2,100 rows (same rows, possibly in a different order).
stops_gcs_filtered = (
    stops_gcs_filtered
    .sample(n=2100, random_state=42)
    .reset_index(drop=True)
)

In [99]:
# Write the sampled stops to a local parquet file under ./data.
# Unlike the earlier exercise exports in this notebook, which are commented
# out, this write is live and runs on every execution of the notebook.
stops_gcs_filtered.to_parquet("./data/exercise_5_stops_sample.parquet")