## Data Snippets

In [1]:
import geopandas as gpd
import numpy as np
import pandas as pd
import shared_utils
from calitp_data_analysis.sql import to_snakecase


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


In [2]:
from calitp_data_analysis.tables import tbls
from calitp_data_analysis.sql import query_sql
from siuba import *

In [3]:
# Display settings: show up to 100 columns, every row, full cell contents,
# and floats rounded to two decimals.
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

# Seed NumPy's global RNG. The sampling helper below calls DataFrame.sample()
# without a random_state, which falls back to this global state, so seeding
# here makes the saved sample files reproducible on Restart & Run All.
SEED = 42
np.random.seed(SEED)

### Exercises 2 & 3

In [4]:
# Bucket folder and file name of the 2019 NTD metrics table.
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/bus_service_increase/"
FILE_NAME = "ntd_metrics_2019.csv"

# Read the CSV straight from the bucket.
metrics_url = GCS_FILE_PATH + FILE_NAME
metrics = pd.read_csv(metrics_url)

In [5]:
# Disabled export: would write the NTD metrics table to a local parquet file
# under ./data/. Uncomment to regenerate that file.
# metrics.to_parquet("./data/exercise_2_3_ntd_metrics_2019.parquet")

In [6]:
# Re-point FILE_NAME at the 2019 NTD vehicles table (same bucket folder as
# the metrics file) and load it.
FILE_NAME = "ntd_vehicles_2019.csv"
vehicles_url = GCS_FILE_PATH + FILE_NAME
vehicles = pd.read_csv(vehicles_url)


In [7]:
# Disabled export: would write the NTD vehicles table to a local parquet file
# under ./data/. Uncomment to regenerate that file.
# vehicles.to_parquet("./data/exercise_2_ntd_vehicles_2019.parquet")

### Exercise 4

In [8]:
def sample_df(df, file_name: str, frac: float = 0.4, random_state=None):
    """
    Draw a random subset of rows and save it as a parquet file under ./data/.

    Parameters
    ----------
    df : DataFrame or GeoDataFrame
        Table to sample from.
    file_name : str
        Stem of the output file; the sample is written to
        ./data/{file_name}.parquet.
    frac : float, default 0.4
        Share of rows to keep. The row count is int(len(df) * frac),
        i.e. truncated toward zero.
    random_state : optional
        Forwarded to DataFrame.sample(). Pass an int for a reproducible
        draw; None (default) leaves the choice of RNG to pandas.

    Returns
    -------
    The sampled rows with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If frac is outside [0, 1].
    """
    if not 0 <= frac <= 1:
        raise ValueError(f"frac must be between 0 and 1, got {frac}")

    rows_to_keep = int(len(df) * frac)

    # Sample an exact row count and discard the original row labels so the
    # saved file does not carry a meaningless shuffled index.
    sampled = df.sample(
        n=rows_to_keep, random_state=random_state
    ).reset_index(drop=True)

    sampled.to_parquet(f"./data/{file_name}.parquet")
    return sampled

In [9]:
# Feed keys whose stops are pulled for Exercise 4.
CALTRAIN_FEED = "25c6505166c01099b2f6f2de173e20b9"
MERCED_FEED = "52639f09eb535f75b33d2c6a654cb89e"
# LODI_FEED = "e1d7185ffb6f73f7d373787910f0bf30"  # currently excluded

FEEDS = [CALTRAIN_FEED, MERCED_FEED]

In [10]:
# Restrict the dim_stops table to the feeds selected above.
feed_stops_tbl = tbls.mart_gtfs.dim_stops() >> filter(_.feed_key.isin(FEEDS))

# Keep the identifying and location columns, sort, then collect the result.
stops = (
    feed_stops_tbl
    >> select(_.feed_key, _.stop_id, _.stop_lat, _.stop_lon, _.stop_name)
    >> arrange(_.feed_key, _.stop_id, _.stop_lat, _.stop_lon)
    >> collect()
)



In [11]:
# Sample the queried stops and save them for Exercise 4.
stops_ex4 = sample_df(
    df=stops,
    file_name="exercise_4_stops_sample",
)

In [12]:
# Preview the first rows of the sampled stops.
stops_ex4.head()

Unnamed: 0,feed_key,stop_id,stop_lat,stop_lon,stop_name
0,52639f09eb535f75b33d2c6a654cb89e,768319,37.07,-120.86,G St @ Davita Dialysis (southbound)
1,52639f09eb535f75b33d2c6a654cb89e,768398,37.35,-120.61,Atwater Transpo
2,25c6505166c01099b2f6f2de173e20b9,san_carlos,37.51,-122.26,San Carlos
3,52639f09eb535f75b33d2c6a654cb89e,768199,37.38,-120.72,Main St @ F St (northbound)
4,52639f09eb535f75b33d2c6a654cb89e,768413,37.35,-120.61,Winton @ Grove (southbound)


In [13]:
# Number of sampled stops per feed.
stops_ex4.feed_key.value_counts()

52639f09eb535f75b33d2c6a654cb89e    197
25c6505166c01099b2f6f2de173e20b9     35
Name: feed_key, dtype: int64

### Exercise 5

In [14]:
# Feed-to-organization table for the selected service date.
feed_org_table = shared_utils.gtfs_utils_v2.schedule_daily_feed_to_organization(
    selected_date="2023-03-15",
    get_df=True,
)

# Only the feed key and name are needed; drop repeated pairs.
feeds_to_names = feed_org_table[["feed_key", "name"]].drop_duplicates()

In [15]:
# Names to look up in feeds_to_names.
OPERATORS = [
    "Alhambra Schedule",
    "San Diego Schedule",
    "Big Blue Bus Schedule",
]

# Feed keys of the rows whose name matches one of the operators above.
is_selected_operator = feeds_to_names["name"].isin(OPERATORS)
SUBSET_FEEDS = feeds_to_names.loc[is_selected_operator, "feed_key"].tolist()

In [16]:
# Feed keys matched to the selected operator names.
SUBSET_FEEDS

['71d91d70ad6c07b1f9b0a618ffceef93',
 'a7ba6f075198e9bf9152fab6c7faf0f6',
 '4f77ef02b983eccc0869c7540f98a7d0']

In [17]:
# Cached stops file for 2023-03-15, read as a GeoDataFrame.
stops_gcs_url = (
    "gs://calitp-analytics-data/data-analyses/"
    "rt_delay/compiled_cached_views/stops_2023-03-15.parquet"
)
stops_gcs = gpd.read_parquet(stops_gcs_url)

In [18]:
# Keep only stops belonging to the selected feeds and renumber the rows.
in_subset_feeds = stops_gcs.feed_key.isin(SUBSET_FEEDS)
stops_gcs_filtered = stops_gcs.loc[in_subset_feeds].reset_index(drop=True)

In [19]:
# Stop counts per feed before sampling.
stops_gcs_filtered.feed_key.value_counts()

a7ba6f075198e9bf9152fab6c7faf0f6    4241
4f77ef02b983eccc0869c7540f98a7d0     905
71d91d70ad6c07b1f9b0a618ffceef93      80
Name: feed_key, dtype: int64

In [20]:
# Sample the filtered stops and save them for Exercise 5.
stops_ex5 = sample_df(
    df=stops_gcs_filtered,
    file_name="exercise_5_stops_sample",
)

### Exercise 9

In [21]:
import intake

In [22]:
# Shared intake catalog, addressed relative to this notebook's folder.
catalog_yml = "../_shared_utils/shared_utils/shared_data_catalog.yml"
catalog = intake.open_catalog(catalog_yml)

In [23]:
# Folder holding the cached GTFS views; this rebinds GCS_FILE_PATH.
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/rt_delay/compiled_cached_views/"

# Date-stamped file locations inside that folder.
analysis_date = "2023-01-18"
STOP_TIMES_FILE = GCS_FILE_PATH + f"st_{analysis_date}.parquet"
STOPS_FILE = GCS_FILE_PATH + f"stops_{analysis_date}.parquet"

# State highway network from the shared catalog.
highways = catalog.state_highway_network.read()

In [28]:
# Size of the full highway table before sampling.
highways.shape

(1052, 6)

In [24]:
# Sample the highway table and save it for Exercise 9.
highways_ex9 = sample_df(
    df=highways,
    file_name="exercise_9_highway_sample",
)

In [27]:
# Size of the sampled highway table.
highways_ex9.shape

(420, 6)

In [29]:
# NOTE(review): this rebinds `stops`, which earlier held the dim_stops query
# result; from here on it refers to the catalog's transit stops.
stop_columns = ["agency", "stop_id", "stop_name", "geometry"]
stops = catalog.ca_transit_stops.read()[stop_columns]

In [31]:
# Agencies whose stops go into the Exercise 9 sample.
agencies = [
    "Tahoe Transportation District",
    "Muni",
    "Burbank",
]

In [32]:
# Keep only the selected agencies. drop=True discards the old row labels
# instead of turning them into an extra "index" column, which would otherwise
# be written into the saved sample file.
stops2 = stops[stops.agency.isin(agencies)].reset_index(drop=True)

In [33]:
# Stop counts per agency before sampling.
stops2.agency.value_counts()

Muni                             5161
Tahoe Transportation District     156
Burbank                            47
Name: agency, dtype: int64

In [34]:
# Sample the agency-filtered stops and save them for Exercise 9.
stops_ex9 = sample_df(
    df=stops2,
    file_name="exercise_9_stops_sample",
)

### Dask Delayed 02

In [35]:
# Folder holding the per-operator trip files; this rebinds GCS_FILE_PATH.
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/rt_delay/v2_rt_trips/"

analysis_date = "2023-03-15"

# Numeric identifiers used as the leading part of each operator's file name.
la_metro, big_blue_bus, muni = 182, 300, 282

operators = [la_metro, big_blue_bus, muni]

In [36]:
# NOTE(review): `big_blue_bus` holds the numeric ID when this cell starts and
# the loaded table when it ends, so the cell cannot be re-run on its own
# without first re-running the constants cell.
big_blue_bus_file = f"{GCS_FILE_PATH}{big_blue_bus}_{analysis_date}.parquet"
big_blue_bus = pd.read_parquet(big_blue_bus_file)

In [37]:
# Sample the Big Blue Bus table and save it for the Dask exercise.
big_blue_bus_dask = sample_df(
    df=big_blue_bus,
    file_name="dask_02_delayed_big_blue_bus_sample",
)

In [38]:
# NOTE(review): `la_metro` holds the numeric ID when this cell starts and the
# loaded table when it ends, so the cell cannot be re-run on its own without
# first re-running the constants cell.
la_metro_file = f"{GCS_FILE_PATH}{la_metro}_{analysis_date}.parquet"
la_metro = pd.read_parquet(la_metro_file)

In [39]:
# Sample the LA Metro table and save it for the Dask exercise.
la_metro_dask = sample_df(
    df=la_metro,
    file_name="dask_02_delayed_la_metro_sample",
)

In [40]:
# NOTE(review): `muni` holds the numeric ID when this cell starts and the
# loaded table when it ends, so the cell cannot be re-run on its own without
# first re-running the constants cell.
muni_file = f"{GCS_FILE_PATH}{muni}_{analysis_date}.parquet"
muni = pd.read_parquet(muni_file)

In [41]:
# Sample the Muni table and save it for the Dask exercise.
muni_dask = sample_df(
    df=muni,
    file_name="dask_02_delayed_muni_sample",
)