# Exploring Cell Coverage of Routes
* To do later: move data sources to a catalog
* An agency here is defined: as a service and an operator...

In [1]:
# Read in zip files
import fsspec

# Other
import geopandas as gpd
import numpy as np
import pandas as pd
# import shared_utils
from calitp import *

# Display
from IPython.display import HTML, Image, Markdown, display, display_html

# Geometry
from shared_utils import geography_utils



In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/cellular_coverage/"

## California County Borders

In [4]:
ca_gdf = gpd.read_file(
    "https://opendata.arcgis.com/datasets/8713ced9b78a4abb97dc130a691a8695_0.geojson"
)

## Trip Routes 

In [5]:
routes_df = gpd.read_parquet(
    "gs://calitp-analytics-data/data-analyses/traffic_ops/ca_transit_routes.parquet"
)

In [6]:
routes_df["route_type"].value_counts()

3    107417
0      3892
1      1322
2       710
5       514
4       241
Name: route_type, dtype: int64

In [7]:
routes_subset = routes_df[["route_type", "route_name", "agency"]]

In [8]:
cols_without_geometry = [
    "agency",
    "route_name",
    "itp_id",
    "route_id",
    "shape_id",
    "route_type",
]

In [9]:
# Drop Amtrak
routes_df2 = routes_df.loc[routes_df["agency"] != "Amtrak"]

In [10]:
# Filter for only bus routes
routes_df2 = routes_df.loc[routes_df["route_type"] == "3"]

In [11]:
# routes_df2[['agency','route_name']].sample(10)

In [12]:
f"{len(routes_df2)} rows left after dropping - compared to {len(routes_df)} rows before."

'107417 rows left after dropping - compared to 114096 rows before.'

In [13]:
routes_df2['route_id'].nunique()

1730

In [14]:
# Fill in NA for route names and agency names
routes_df2[["agency", "route_name"]] = routes_df2[["agency", "route_name"]].fillna("NA")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [15]:
# Keep only one row per route_id
# is there another way I should pick each route?
routes_df3 = routes_df2.drop_duplicates(subset=["route_id" ])

In [16]:
# Keep only one instance of a unique ITP ID - route_name combo
routes_df3 = routes_df3.drop_duplicates(subset=["itp_id", "route_name"])

In [17]:
# Reset index after dropping
routes_df3 = routes_df3.reset_index()

In [18]:
f"{routes_df3.route_id.nunique()} unique route ids, {routes_df3.route_name.nunique()} different route names, and {routes_df3.shape_id.nunique()} different shape ids."

'1467 unique route ids, 1415 different route names, and 1447 different shape ids.'

In [19]:
# routes_df3[['agency','route_name', 'route_id']].head(20).sort_values('route_name')

In [20]:
routes_df3.shape

(1467, 8)

In [21]:
# Test a single route that goes through 2 counties
# This one goes through San Francisco & Marin.
single_route = routes_df3.loc[routes_df3['route_name'] == 'via Civic Center']

In [22]:
# Reproject
ca_gdf2 = ca_gdf.to_crs("epsg:3395")
single_route = single_route.to_crs("epsg:3395")

## FCC AT&T Data Map 
* Testing with AT&T first.

### Clip AT&T map to only include California Counties

In [23]:
def create_california_coverage(file_zip_name:str, new_file_name:str):
    
    # Open zip file first
    PATH = f"{GCS_FILE_PATH}{file_zip_name}"
    with fsspec.open(PATH) as file:
        fcc_gdf = gpd.read_file(file)
    
    # Open file with California Counties.
    ca_gdf = gpd.read_file(
    "https://opendata.arcgis.com/datasets/8713ced9b78a4abb97dc130a691a8695_0.geojson")

    # Clip 
    fcc_ca_gdf = gpd.clip(fcc_gdf, ca_gdf)

    # Save this into a parquet so don't have to clip all the time
    utils.geoparquet_gcs_export(fcc_ca_gdf, GCS_FILE_PATH, new_file_name)


In [24]:
# Open
fcc_ca_gdf = gpd.read_parquet(
    "gs://calitp-analytics-data/data-analyses/cellular_coverage/AT_T_Data_California_Only.parquet"
)

In [25]:
# fcc_ca_gdf.plot(figsize = (30,12))

## Overlay 
* I want to obtain routes that are NOT contained in the AT & T coverage map.
* https://geopandas.org/en/stable/docs/user_guide/set_operations.html
* To obtain the geometries that are part of df1 but are not contained in df2, you can use how='difference'.

In [26]:
# Check that they are the same CRS
fcc_ca_gdf.crs == routes_df3.crs

True

In [165]:
fcc_ca_gdf.crs

<Derived Projected CRS: EPSG:3395>
Name: WGS 84 / World Mercator
Axis Info [cartesian]:
- E[east]: Easting (metre)
- N[north]: Northing (metre)
Area of Use:
- name: World between 80°S and 84°N.
- bounds: (-180.0, -80.0, 180.0, 84.0)
Coordinate Operation:
- name: World Mercator
- method: Mercator (variant A)
Datum: World Geodetic System 1984 ensemble
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich

In [130]:
# Testing with intersection per Tiffany's suggestion
overlay_df  = routes_df3.overlay(fcc_ca_gdf, how="intersection", keep_geom_type= "false")

In [163]:
# But difference still shows...how can these 2 things be possible at the same time?
overlay_df_diff= routes_df3.overlay(fcc_ca_gdf, how="difference", keep_geom_type= "false")

In [164]:
display(
    Markdown(
        f"""There are {overlay_df['route_id'].nunique()} routes (run by {overlay_df['agency'].nunique()} different agencies) 
       that intersects w/ the AT&T coverage map. However there are {overlay_df_diff['route_id'].nunique()} routes when 
       setting how to difference. In comparison, there are {routes_df3.route_id.nunique()} total routes in the original dataframe."""
    )
)

There are 1467 routes (run by 141 different agencies) 
       that intersects w/ the AT&T coverage map. However there are 232 routes when 
       setting how to difference. In comparison, there are 1467 total routes in the original dataframe.

In [131]:
overlay_df.shape

(1646, 12)

In [134]:
# Previewing the routes left
# overlay_df[cols_without_geometry].sort_values('route_name')

In [135]:
# Agencies with the most routes
(
    overlay_df.groupby(["agency"])
    .agg({"route_id": "nunique"})
    .rename(columns={"route_id": "total_routes"})
    .sort_values("total_routes", ascending=False)
    .head(10)
)

Unnamed: 0_level_0,total_routes
agency,Unnamed: 1_level_1
AC Transit,130
Metro,112
Ojai Trolley,57
MUNI,56
San Diego Metropolitan Transit System,52
SamTrans,48
Los Angeles Department of Transportation,45
Sacramento Regional Transit District,40
Foothill Transit,35
Irvine Shuttle,32


In [136]:
# List for unique itp_ids
agencies_no_coverage = overlay_df["itp_id"].unique().tolist()

In [137]:
# List for unique route ids
routes_no_coverage = overlay_df["route_id"].unique().tolist()

In [138]:
# List for unique route ids
routes_no_coverage_name = overlay_df["route_name"].unique().tolist()

In [139]:
# A table with all the itp id & agencies
agency_df = overlay_df[["agency", "itp_id"]].drop_duplicates().reset_index()

In [140]:
# Subset of overlap without geometry for when I wnat to preview the df
overlay_df_no_geo = overlay_df[
    ["itp_id", "route_id", "shape_id", "route_type", "route_name", "agency"]
]

In [141]:
# Surprised by AC Transit
# overlay_df_no_geo.loc[overlay_df["agency"] == "AC Transit"]

## Trips
* How many trips are running for a route?

In [142]:
trips_df = pd.read_parquet(
    "gs://calitp-analytics-data/data-analyses/rt_delay/compiled_cached_views/trips_2022-05-04_all.parquet"
)

In [143]:
trips_df.shape

(104934, 14)

In [144]:
trips_df.head(1)

Unnamed: 0,calitp_itp_id,calitp_url_number,service_date,trip_key,trip_id,route_id,direction_id,shape_id,calitp_extracted_at,calitp_deleted_at,route_short_name,route_long_name,route_desc,route_type
0,4,0,2022-05-04,1582047897989018365,272020,21,1,shp-21-23,2022-03-26,2099-01-01,21,Dimond - Fruitvale - Bay Farm,,3


In [145]:
# Add agency name
trips_df = pd.merge(
    trips_df, agency_df, how="inner", left_on="calitp_itp_id", right_on="itp_id"
)

In [146]:
# Try to standardize route id for the 2 dfs
overlay_df["route_id"] = overlay_df["route_id"].str.lower().str.strip()
trips_df["route_id"] = trips_df["route_id"].str.lower().str.strip()

In [147]:
# Filter out for route IDS
trips_df2 = trips_df[trips_df["route_id"].isin(routes_no_coverage)]

In [148]:
# So many route ids disappeared
f'{trips_df2["route_id"].nunique()} routes left after filtering trips_df compared to {len(routes_no_coverage)} routes in overlay.'

'901 routes left after filtering trips_df compared to 1467 routes in overlay.'

In [149]:
# Checking which ones are missing.
id_routes = set(routes_no_coverage)
id_trips = set(trips_df2.route_id.unique().tolist())
missing_routes = list(id_routes - id_trips)

In [150]:
f"{len(missing_routes)/len(routes_no_coverage)}% of routes missing"

'0.3858214042263122% of routes missing'

In [151]:
# Sum up number of trips taken by route using trip id?
trips_ran = (
    trips_df2.groupby(["calitp_itp_id", "route_long_name", "agency"])
    .agg({"trip_id": "count"})
    .reset_index()
    .rename(columns={"trip_id": "total_trips"})
)

In [152]:
trips_ran.shape

(1405, 4)

In [153]:
# trips_ran

In [154]:
trips_ran["agency"].nunique()

100

## NTD 

In [155]:
ntd_df = pd.read_excel(
    f"gs://calitp-analytics-data/data-analyses/5311 /2020-Vehicles_1.xlsm",
    sheet_name="Vehicle Type Count by Agency",
)

In [156]:
# ntd_df.columns

In [157]:
# Only get bus related columns
columns_wanted = [
    "Agency",
    "City",
    "State",
    "Bus",
    "Over-The-Road Bus",
    "Articulated Bus",
    "Double Decker Bus",
    "School Bus",
    "Van",
    "Cutaway",
    "Minivan",
]

In [158]:
# Have to add snakecase after b/c some columns have integers
ntd_df2 = to_snakecase(ntd_df[columns_wanted])

In [159]:
# Only grab California
ntd_df2 = ntd_df2.loc[ntd_df2["state"] == "CA"]

In [160]:
# Add up buses
ntd_df2["total_buses"] = ntd_df2.sum(numeric_only=True, axis=1)

In [161]:
# Drop agencies with 0 buses
ntd_df2 = ntd_df2.loc[ntd_df2['total_buses'] !=0]

In [162]:
ntd_df2.shape

(218, 12)