# Investigate missing RT speed info

* Use the `rt_vs_schedule` table for May 4, 2022 (2022-05-04) to see what percentage of scheduled trips had corresponding real-time (RT) info

In [1]:
#import os
#os.environ["CALITP_BQ_MAX_BYTES"] = str(200_000_000_000)

import intake
import pandas as pd

from IPython.display import Markdown

from calitp.magics import query_sql
from E0_bus_oppor_vars import GCS_FILE_PATH, ANALYSIS_DATE

# Open the intake catalog file(s) matching the "*.yml" glob (a relative
# pattern, so it depends on where the notebook is run from); later cells
# read datasets from this catalog by name.
catalog = intake.open_catalog("*.yml")



In [2]:
# One-time extract, kept for provenance and disabled with `#` comments so the
# cell produces no output. Uncomment to re-run the query and overwrite the
# parquet file that a later cell reads back with `pd.read_parquet`.
#
# query = """
# SELECT *
# FROM `cal-itp-data-infra-staging.natalie_views.gtfs_rt_vs_schedule_trips_may4_sample`
#
# """
#
# rt_vs_sched = query_sql(query)
#
# rt_vs_sched.to_parquet(
#     f"{GCS_FILE_PATH}rt_vs_schedule_{ANALYSIS_DATE}.parquet")

In [3]:
# Load the `bus_routes_aggregated_stats` catalog entry into memory.
# NOTE(review): presumably a GeoDataFrame — later cells select a `geometry`
# column from it and call `.explore()` on the result; confirm.
bus_routes = catalog.bus_routes_aggregated_stats.read()

In [4]:
# Columns needed to check which routes are missing speed information.
keep_cols = [
    "calitp_itp_id",
    "route_id",
    "trips_all_day",
    "num_competitive",
    "mean_speed_mph",
]

# Reduce to the distinct combinations of the selected columns.
route_table = (
    bus_routes[keep_cols]
    .drop_duplicates()
)

In [5]:
# Count the distinct (calitp_itp_id, route_id) pairs in the route table.
(
    route_table[["calitp_itp_id", "route_id"]]
    .drop_duplicates()
    .shape
)

(465, 2)

In [6]:
# Show any rows with a null `num_competitive`.
route_table.loc[route_table["num_competitive"].isna()]

Unnamed: 0,calitp_itp_id,route_id,trips_all_day,num_competitive,mean_speed_mph


In [7]:
# Shape of the subset of rows with a null `mean_speed_mph`.
route_table.loc[route_table["mean_speed_mph"].isna()].shape

(355, 5)

In [8]:
# Keep the rows with no mean speed for the RT-coverage comparison.
missing_speeds = route_table.loc[route_table["mean_speed_mph"].isna()]

In [9]:
# Read the cached RT-vs-schedule extract for the analysis date.
rt_vs_sched = pd.read_parquet(
    path=f"{GCS_FILE_PATH}rt_vs_schedule_{ANALYSIS_DATE}.parquet"
)

In [10]:
# Join keys shared by the route table and the RT-vs-schedule table.
route_cols = ["calitp_itp_id", "route_id"]

# Attach RT-vs-schedule columns to the routes missing speeds. The right side
# is reduced to one row per route key (first occurrence kept) so the join can
# be validated as one-to-one; routes without a match are dropped (inner join).
df = pd.merge(
    left=missing_speeds,
    right=rt_vs_sched.drop_duplicates(subset=route_cols),
    how="inner",
    on=route_cols,
    validate="1:1",
)

In [11]:
# Shape of the subset of rows where `pct_w_vp` is at least 0.5.
df.loc[df["pct_w_vp"] >= 0.5].shape

(65, 14)

In [12]:
# Spot-check the rows for calitp_itp_id 182.
df.loc[df["calitp_itp_id"] == 182]

Unnamed: 0,calitp_itp_id,route_id,trips_all_day,num_competitive,mean_speed_mph,agency_name,calitp_url_number,route_short_name,service_date,calitp_extracted_at,calitp_deleted_at,num_sched,num_vp,pct_w_vp
87,182,DSE-HG,47,0,,,0,South Bay Dodger Stadium Express,2022-05-04,2022-03-30,2099-01-01,47,0,0.0


In [13]:
# Only roughly the top 15-20% of these routes have any RT data.
df["pct_w_vp"].describe(
    percentiles=[
        0.1, 0.2, 0.3, 0.4, 0.5, 0.6,
        0.7, 0.75, 0.8, 0.85, 0.9, 0.95,
    ]
)

count    355.000000
mean       0.173010
std        0.364903
min        0.000000
10%        0.000000
20%        0.000000
30%        0.000000
40%        0.000000
50%        0.000000
60%        0.000000
70%        0.000000
75%        0.000000
80%        0.000000
85%        0.827273
90%        1.000000
95%        1.000000
max        1.000000
Name: pct_w_vp, dtype: float64

In [14]:
# Routes that lack a mean speed even though `pct_w_vp` is at least 0.8
# (presumably the share of scheduled trips with RT vehicle positions — confirm).
has_info = df.loc[df["pct_w_vp"] >= 0.8]

# Pull the full bus_routes rows (without `service_date`) for those routes.
has_info_with_geom = pd.merge(
    left=bus_routes.drop(columns="service_date"),
    right=has_info[route_cols],
    how="inner",
    on=route_cols,
)

In [15]:
# For each Caltrans district, show the routes that have RT coverage but no
# mean speed: first as a table, then as an interactive map colored by route.
#
# NaN districts are dropped before sorting: `unique()` keeps NaN, and sorting
# NaN together with the string district labels raises TypeError. A NaN label
# would also never match in the `==` filter below.
districts = sorted(has_info_with_geom.caltrans_district.dropna().unique())

# Loop-invariant column selection for the per-district table and map.
district_cols = keep_cols + ["route_type", "geometry"]

for d in districts:
    display(Markdown(f"### District {d}"))

    district_df = (has_info_with_geom
                   [has_info_with_geom.caltrans_district == d]
                   [district_cols])

    display(district_df)

    display(district_df.explore("route_id",
                                categorical=True,
                                legend=False,
                                tiles = "CartoDBPositron"))

### District 03 - Marysville

Unnamed: 0,calitp_itp_id,route_id,trips_all_day,num_competitive,mean_speed_mph,route_type,geometry
45,372,07320499-ab75-48c5-99c7-5967c4b22f1b,1,0,,,"LINESTRING (5561747.060 3508794.224, 5562943.1..."
46,372,1b9912ea-a0f8-46d1-a45b-768dbbc96187,1,0,,,"LINESTRING (5561747.060 3508794.224, 5562943.1..."
47,372,3bc003ec-9243-4b49-a873-e0884b5e6ff7,1,0,,,"LINESTRING (5560354.332 3506522.024, 5560375.7..."
48,372,46c8ab4f-27df-422a-b0b5-9c50275c2588,1,0,,,"LINESTRING (5561747.060 3508794.224, 5562943.1..."
49,372,4bc2bfa4-be32-47c1-9a57-11a815f7429b,12,12,,,"LINESTRING (5377616.978 3572075.548, 5377358.5..."
50,372,8bebc5d1-8c71-4646-9b43-6513d4589bae,1,0,,,"LINESTRING (5487091.962 3497575.412, 5487136.7..."
51,372,ad7444fd-3e73-4a75-b2dc-68413b6dc649,1,0,,,"LINESTRING (5482767.518 3546973.353, 5481671.8..."
52,372,b6e92239-4632-47de-8ef8-2515e1ef4b1e,1,0,,,"LINESTRING (5487091.962 3497575.412, 5487141.2..."
53,372,f7e6fe88-8770-4143-886b-333f0aa0a498,1,0,,,"LINESTRING (5488170.621 3508456.689, 5488441.5..."


### District 04 - Oakland

Unnamed: 0,calitp_itp_id,route_id,trips_all_day,num_competitive,mean_speed_mph,route_type,geometry
0,4,702,1,0,,,"LINESTRING (5390537.055 3276716.998, 5390490.0..."
1,4,706,1,0,,,"LINESTRING (5366694.518 3193021.659, 5366751.1..."
2,61,97X,12,0,,,"LINESTRING (5432224.760 3192692.754, 5432193.9..."
4,194,1237,6,0,,,"LINESTRING (5266667.915 3255833.390, 5266684.3..."
5,194,16,31,0,,,"LINESTRING (5257871.135 3271366.413, 5257897.5..."
6,194,1784,13,0,,,"LINESTRING (5255510.023 3315216.778, 5255513.8..."
7,194,1788,38,0,,,"LINESTRING (5246250.063 3352932.695, 5246240.3..."
8,194,1789,26,0,,,"LINESTRING (5257931.316 3261751.018, 5257938.7..."
9,194,17X,7,0,,,"LINESTRING (5266667.915 3255833.390, 5266684.3..."
10,194,2297,27,0,,,"LINESTRING (5266667.915 3255833.390, 5266684.3..."


### District 07 - Los Angeles

Unnamed: 0,calitp_itp_id,route_id,trips_all_day,num_competitive,mean_speed_mph,route_type,geometry
3,170,405,4,0,,,"LINESTRING (6426853.925 1847918.134, 6426855.1..."


### District 08 - San Bernardino

Unnamed: 0,calitp_itp_id,route_id,trips_all_day,num_competitive,mean_speed_mph,route_type,geometry
14,269,205,3,0,,,"LINESTRING (6820176.251 1650139.980, 6820189.4..."
38,360,3205,16,0,,,"LINESTRING (6915359.972 1983500.819, 6909004.4..."
39,360,3206,21,5,,,"LINESTRING (6774071.390 2019018.006, 6774207.5..."
40,360,3210,20,11,,,"LINESTRING (6808325.704 2005176.708, 6808266.5..."
41,360,3220,16,0,,,"LINESTRING (6774615.572 1859706.628, 6774702.9..."
42,360,3240,28,0,,,"LINESTRING (6854609.131 2140936.334, 6854978.8..."
43,360,5699,5,2,,,"LINESTRING (6775862.870 1977354.309, 6775615.8..."
44,360,5701,1,1,,,"LINESTRING (6955770.747 2282352.374, 6955680.9..."


### District 11 - San Diego

Unnamed: 0,calitp_itp_id,route_id,trips_all_day,num_competitive,mean_speed_mph,route_type,geometry
15,278,110,8,0,,,"LINESTRING (6818382.303 1357142.098, 6818442.3..."
16,278,140,76,0,,,"LINESTRING (6803354.343 1388845.434, 6803351.2..."
17,278,20,87,0,,,"LINESTRING (6843662.978 1465931.750, 6843729.2..."
18,278,225,88,0,,,"LINESTRING (6818440.582 1357387.954, 6818439.7..."
19,278,235,132,5,,,"LINESTRING (6817833.021 1357135.724, 6817892.1..."
20,278,25,26,2,,,"LINESTRING (6827904.176 1398202.398, 6828274.4..."
21,278,280,14,0,,,"LINESTRING (6840450.444 1502996.079, 6840413.2..."
22,278,290,14,0,,,"LINESTRING (6843456.585 1465965.535, 6843449.7..."
23,278,60,13,0,,,"LINESTRING (6803168.995 1411874.886, 6803168.3..."
24,278,816,53,0,,,"LINESTRING (6876559.208 1384214.768, 6876562.7..."
