## Data Snippets

In [1]:
import geopandas as gpd
import numpy as np
import pandas as pd
import shared_utils
from calitp_data_analysis.sql import to_snakecase


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In the next release, GeoPandas will switch to using Shapely by default, even if PyGEOS is installed. If you only have PyGEOS installed to get speed-ups, this switch should be smooth. However, if you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


In [2]:
from calitp_data_analysis.tables import tbls
from calitp_data_analysis.sql import query_sql
from siuba import *

In [3]:
# Notebook-wide pandas display settings: up to 100 columns, two-decimal
# floats, and no truncation of rows or of cell text.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

### Exercises 2 & 3

In [4]:
# Folder and file name are kept separate because the next read reuses the
# folder with a different file name.
GCS_FILE_PATH = (
    "gs://calitp-analytics-data/data-analyses/"
    "bus_service_increase/"
)
FILE_NAME = "ntd_metrics_2019.csv"

metrics = pd.read_csv(filepath_or_buffer=f"{GCS_FILE_PATH}{FILE_NAME}")

In [5]:
# metrics.to_parquet("./data/exercise_2_3_ntd_metrics_2019.parquet")

In [6]:
# Reuses GCS_FILE_PATH from the metrics cell; only the file name changes.
FILE_NAME = "ntd_vehicles_2019.csv"
vehicles = pd.read_csv(filepath_or_buffer=f"{GCS_FILE_PATH}{FILE_NAME}")


In [7]:
# vehicles.to_parquet("./data/exercise_2_ntd_vehicles_2019.parquet")

### Exercise 4

In [8]:
def sample_df(df: pd.DataFrame, file_name: str, frac: float = 0.4,
              random_state=42):
    """
    Draw a random subset of rows from `df`, save it as parquet, and return it.

    Parameters
    ----------
    df : pd.DataFrame
        Table to sample from. Anything exposing `sample`, `reset_index` and
        `to_parquet` the way a DataFrame does will work.
    file_name : str
        File stem; the sample is written to `./data/{file_name}.parquet`.
    frac : float, default 0.4
        Share of rows to keep, between 0 and 1. The row count is
        `int(len(df) * frac)`, so it rounds down.
    random_state : int or None, default 42
        Seed handed to `df.sample`. The fixed default means re-running the
        notebook regenerates the same sample files; pass None to get a
        different sample on every call.

    Returns
    -------
    pd.DataFrame
        The sampled rows with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If `frac` is outside the range [0, 1].
    """
    if not 0 <= frac <= 1:
        raise ValueError(f"frac must be between 0 and 1, got {frac}")

    rows_to_keep = int(len(df) * frac)

    # drop=True discards the original row labels so the saved file does not
    # carry a meaningless index from the source table.
    sampled = (
        df.sample(n=rows_to_keep, random_state=random_state)
        .reset_index(drop=True)
    )

    sampled.to_parquet(f"./data/{file_name}.parquet")
    return sampled

In [9]:
# feed_key values used to filter dim_stops in the next cell. The trailing
# comment on each entry is the label this notebook gives that feed. The Lodi
# key is commented out, so it is not part of the filter.
FEEDS = [
    "25c6505166c01099b2f6f2de173e20b9", # Caltrain
    "52639f09eb535f75b33d2c6a654cb89e", # Merced
    #"e1d7185ffb6f73f7d373787910f0bf30" # Lodi
]

In [10]:
# Pull stops for the feeds listed in FEEDS from the mart_gtfs.dim_stops table,
# keeping only the feed key, stop ID, coordinates and stop name, ordered by
# feed_key, stop_id, stop_lat and stop_lon.
# NOTE(review): `filter`, `select`, `arrange`, `collect` and `_` are presumably
# the siuba verbs brought in by `from siuba import *` (so `filter` is not the
# Python builtin here) — confirm.
stops = (
    tbls.mart_gtfs.dim_stops()
    >> filter(_.feed_key.isin(FEEDS))
    >> select(_.feed_key, _.stop_id, 
             _.stop_lat, _.stop_lon, _.stop_name)
    >> arrange(_.feed_key, _.stop_id, 
               _.stop_lat, _.stop_lon)
    >> collect() 
)

  sqlalchemy.util.warn(


In [11]:
# Save a random sample of the warehouse stops to
# ./data/exercise_4_stops_sample.parquet and keep it for inspection.
stops_ex4 = sample_df(df=stops, file_name="exercise_4_stops_sample")

In [12]:
# Preview the first five sampled stops.
stops_ex4.head(5)

Unnamed: 0,feed_key,stop_id,stop_lat,stop_lon,stop_name
0,52639f09eb535f75b33d2c6a654cb89e,768641,37.23,-120.25,Le Grand @ Washington
1,25c6505166c01099b2f6f2de173e20b9,70042,37.66,-122.4,South San Francisco Caltrain Station
2,52639f09eb535f75b33d2c6a654cb89e,768584,37.3,-120.49,Transpo (Outside)
3,52639f09eb535f75b33d2c6a654cb89e,781926,37.33,-120.56,Gurr Rd @ Valley Dr
4,52639f09eb535f75b33d2c6a654cb89e,768416,37.35,-120.59,Juniper Ave @ Valley St (eastbound)


In [13]:
# Number of sampled stops per feed.
stops_ex4["feed_key"].value_counts()

52639f09eb535f75b33d2c6a654cb89e    189
25c6505166c01099b2f6f2de173e20b9     43
Name: feed_key, dtype: int64

### Exercise 5

In [14]:
# Feed-to-dataset-name table for 2023-03-15, reduced to the unique
# (feed_key, name) pairs.
daily_feeds = shared_utils.gtfs_utils_v2.schedule_daily_feed_to_gtfs_dataset_name(
    selected_date="2023-03-15", get_df=True
)
feeds_to_names = daily_feeds[["feed_key", "name"]].drop_duplicates()

In [15]:
# Dataset names to include in the sample.
OPERATORS = [
    "Alhambra Schedule",
    "San Diego Schedule",
    "Big Blue Bus Schedule",
    "Culver City Schedule",
    "OmniTrans Schedule",
    "OCTA Schedule",
]

# feed_key values of the rows whose name is one of OPERATORS.
SUBSET_FEEDS = feeds_to_names.loc[
    feeds_to_names["name"].isin(OPERATORS), "feed_key"
].tolist()

In [16]:
# Select 5 other feeds to throw in for the sample
OTHER_FEEDS = [i for i in feeds_to_names.feed_key 
               if i not in SUBSET_FEEDS][:5]

In [17]:
# Show the feed_key -> name mapping for the selected operators.
feeds_to_names[feeds_to_names["name"].isin(OPERATORS)].set_index(
    "feed_key"
)["name"].to_dict()

{'71d91d70ad6c07b1f9b0a618ffceef93': 'Alhambra Schedule',
 'a7ba6f075198e9bf9152fab6c7faf0f6': 'San Diego Schedule',
 'ae93a53469371fb3f9059d2097f66842': 'OmniTrans Schedule',
 '180d48eb03829594478082dca5782ccd': 'Culver City Schedule',
 '4f77ef02b983eccc0869c7540f98a7d0': 'Big Blue Bus Schedule',
 '8a47f5aa51f481e9ddc7c497bd72d264': 'OCTA Schedule'}

In [18]:
# Cached stops for 2023-03-15, the same date used for the feed-name lookup.
stops_gcs = gpd.read_parquet(
    "gs://calitp-analytics-data/data-analyses/"
    "rt_delay/compiled_cached_views/stops_2023-03-15.parquet"
)

In [19]:
# Keep stops belonging to either the named operators or the filler feeds.
stops_gcs_filtered = stops_gcs[
    stops_gcs.feed_key.isin(SUBSET_FEEDS + OTHER_FEEDS)
].reset_index(drop=True)

In [20]:
# Number of stops per feed after filtering.
stops_gcs_filtered["feed_key"].value_counts()

8a47f5aa51f481e9ddc7c497bd72d264    5188
a7ba6f075198e9bf9152fab6c7faf0f6    4241
a9f08db1bc889c72499a14d329cd7260    2711
ae93a53469371fb3f9059d2097f66842    2280
4f77ef02b983eccc0869c7540f98a7d0     905
180d48eb03829594478082dca5782ccd     431
8086f0d4ae362ae545a4b96a41587e09     118
5ef522e5d899e686fd1ba63de6103b25      85
71d91d70ad6c07b1f9b0a618ffceef93      80
5ccf86b4334e6c6db3eee03b9f65372c      24
Name: feed_key, dtype: int64

In [21]:
# Save a random sample of the filtered stops to
# ./data/exercise_5_stops_sample.parquet.
stops_ex5 = sample_df(df=stops_gcs_filtered, file_name="exercise_5_stops_sample")

### Exercise 9

In [22]:
import intake

In [23]:
# Shared data catalog; the path is relative to this notebook's folder.
catalog = intake.open_catalog("../_shared_utils/shared_utils/shared_data_catalog.yml")

In [24]:
# Rebinds GCS_FILE_PATH: from here on it points at the rt_delay cached views,
# not the bus_service_increase folder used for Exercises 2 & 3.
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/rt_delay/compiled_cached_views/"

analysis_date = "2023-01-18"

# Paths to the stop_times and stops files for the analysis date.
STOP_TIMES_FILE = f"{GCS_FILE_PATH}st_{analysis_date}.parquet"
STOPS_FILE = f"{GCS_FILE_PATH}stops_{analysis_date}.parquet"

# Highway layer comes from the shared catalog rather than from GCS_FILE_PATH.
highways = catalog.state_highway_network.read()

In [25]:
# Shape of the full highway layer before sampling.
highways.shape

(1052, 6)

In [26]:
# Save a random sample of the highways to
# ./data/exercise_9_highway_sample.parquet.
highways_ex9 = sample_df(df=highways, file_name="exercise_9_highway_sample")

In [27]:
# Shape of the sampled highway layer, for comparison with the full layer.
highways_ex9.shape

(420, 6)

In [28]:
# Rebinds `stops`: it now holds the catalog's transit stops (agency, ID, name
# and geometry only), replacing the warehouse table loaded for Exercise 4.
stops = catalog.ca_transit_stops.read()[
    ["agency", "stop_id", "stop_name", "geometry"]
]

In [29]:
# Agency names used to subset the catalog stops in the next cell.
# NOTE(review): the value_counts output recorded further down lists only
# 'Tahoe Transportation District', so 'Muni' and 'Burbank' matched no rows in
# that run — confirm these spellings against the `agency` column.
agencies = ['Tahoe Transportation District', 'Muni', 'Burbank']

In [30]:
# Subset to the chosen agencies. drop=True discards the old row labels instead
# of turning them into an extra `index` column, which would otherwise be
# written into the Exercise 9 sample file; this matches how the Exercise 5
# stops are filtered.
stops2 = stops[stops.agency.isin(agencies)].reset_index(drop=True)

In [31]:
# Number of stops per agency after filtering.
stops2["agency"].value_counts()

Tahoe Transportation District    149
Name: agency, dtype: int64

In [32]:
# Save a random sample of the agency stops to
# ./data/exercise_9_stops_sample.parquet.
stops_ex9 = sample_df(df=stops2, file_name="exercise_9_stops_sample")

### Dask Delayed 02

In [33]:
# Rebinds GCS_FILE_PATH and analysis_date for the Dask exercise inputs.
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/rt_delay/v2_rt_trips/"
analysis_date = "2023-03-15"

# Numeric IDs that appear in the parquet file names ({id}_{date}.parquet).
# The cells below rebind these three names to DataFrames.
la_metro, big_blue_bus, muni = 182, 300, 282

operators = [la_metro, big_blue_bus, muni]

In [34]:
# `operators` keeps the integer IDs in the order [la_metro, big_blue_bus, muni].
# Taking the ID from there, not from `big_blue_bus` itself, keeps this cell
# re-runnable: after the first run `big_blue_bus` is a DataFrame, and
# formatting it into the path would no longer produce a file name.
big_blue_bus_id = operators[1]
big_blue_bus = pd.read_parquet(
    f"{GCS_FILE_PATH}{big_blue_bus_id}_{analysis_date}.parquet")

In [35]:
# Save a random sample of the Big Blue Bus table to
# ./data/dask_02_delayed_big_blue_bus_sample.parquet.
big_blue_bus_dask = sample_df(
    df=big_blue_bus, file_name="dask_02_delayed_big_blue_bus_sample"
)

In [36]:
# `operators` keeps the integer IDs in the order [la_metro, big_blue_bus, muni].
# Taking the ID from there, not from `la_metro` itself, keeps this cell
# re-runnable: after the first run `la_metro` is a DataFrame, and formatting
# it into the path would no longer produce a file name.
la_metro_id = operators[0]
la_metro = pd.read_parquet(
    f"{GCS_FILE_PATH}{la_metro_id}_{analysis_date}.parquet")

In [37]:
# Save a random sample of the LA Metro table to
# ./data/dask_02_delayed_la_metro_sample.parquet.
la_metro_dask = sample_df(
    df=la_metro, file_name="dask_02_delayed_la_metro_sample"
)

In [38]:
# `operators` keeps the integer IDs in the order [la_metro, big_blue_bus, muni].
# Taking the ID from there, not from `muni` itself, keeps this cell
# re-runnable: after the first run `muni` is a DataFrame, and formatting it
# into the path would no longer produce a file name.
muni_id = operators[2]
muni = pd.read_parquet(
    f"{GCS_FILE_PATH}{muni_id}_{analysis_date}.parquet")

In [39]:
# Save a random sample of the Muni table to
# ./data/dask_02_delayed_muni_sample.parquet.
muni_dask = sample_df(df=muni, file_name="dask_02_delayed_muni_sample")