# Analysis
* Create yml for all my files. 
* Save overlay? 

In [1]:
import altair as alt
import fsspec
import geopandas as gpd
import intake
import numpy as np
import pandas as pd
import utilities
from calitp import *
from IPython.display import HTML, Image, Markdown, display, display_html
from shared_utils import geography_utils, utils



In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/cellular_coverage/"

In [4]:
routes_df = utilities.load_unique_routes_df()

In [5]:
routes_df.shape

(2829, 7)

## T-Mobile

In [10]:
tmobile_df = gpd.read_parquet(f"{GCS_FILE_PATH}tmobile_california.parquet")

In [11]:
tmobile_m1 = utilities.route_cell_coverage(tmobile_df, routes_df, "_tmobile")

In [13]:
tmobile_m1.shape

(2828, 9)

In [15]:
(tmobile_m1["binned_tmobile"].value_counts() / len(tmobile_m1)) * 100

(90, 100]   71.78
(80, 90]     6.12
(70, 80]     3.08
(60, 70]     1.31
(50, 60]     0.74
(40, 50]     0.39
(30, 40]     0.28
(20, 30]     0.14
(10, 20]     0.04
(0, 10]      0.00
Name: binned_tmobile, dtype: float64

## AT&T

In [16]:
att_df = utilities.load_att()

In [17]:
att_m1 = utilities.route_cell_coverage(att_df, routes_df, "_att")

In [19]:
att_m1.shape

(2822, 9)

In [20]:
(att_m1["binned_att"].value_counts() / len(att_m1)) * 100

(90, 100]   73.32
(80, 90]     4.93
(70, 80]     2.66
(60, 70]     1.56
(50, 60]     1.35
(10, 20]     0.14
(20, 30]     0.07
(30, 40]     0.07
(40, 50]     0.07
(0, 10]      0.00
Name: binned_att, dtype: float64

## Verizon

In [21]:
verizon_df = utilities.load_verizon()

In [22]:
verizon_m1 = utilities.route_cell_coverage(verizon_df, routes_df, "_verizon")

In [24]:
verizon_m1.shape

(2822, 9)

In [25]:
(verizon_m1["binned_verizon"].value_counts() / len(verizon_m1)) * 100

(90, 100]   85.54
(80, 90]     5.95
(70, 80]     2.62
(60, 70]     1.35
(50, 60]     1.03
(40, 50]     0.11
(10, 20]     0.07
(30, 40]     0.07
(0, 10]      0.00
(20, 30]     0.00
Name: binned_verizon, dtype: float64

## Merge all three providers

In [38]:
common_cols = ["route_id", "route_name", "agency", "itp_id", "route_length_original_df"]

In [39]:
merge1 = att_m1.merge(verizon_m1, how="outer", on=common_cols).merge(
    tmobile_m1, how="outer", on=common_cols
)

In [40]:
# Del extra cols that are the same across dfs.
merge1 = merge1.drop(
    columns=[
        "binned_verizon",
        "binned_att",
        "binned_tmobile",
    ]
)

In [41]:
# Ensure this remains a gdf.
merge1 = gpd.GeoDataFrame(merge1, geometry="geometry_overlay_verizon", crs="EPSG:4326")

In [42]:
type(merge1)

geopandas.geodataframe.GeoDataFrame

In [43]:
merge1.shape

(2828, 14)

### Some checks
* Original df against merged.

In [44]:
# 1 extra row missing??
routes_df.shape, merge1.shape

((2829, 7), (2828, 14))

In [45]:
# Find the missing row
route_names_og = set(routes_df.route_name.unique().tolist())
route_names_merge = set(merge1.route_name.unique().tolist())
route_names_og - route_names_merge

{'Arizona Western College/Northern Arizona University/University of Arizona to Wellton via Fortuna Foothills'}

In [46]:
routes_df.route_id.nunique(), merge1.route_id.nunique()

(1730, 1730)

In [47]:
routes_df.itp_id.nunique(), merge1.itp_id.nunique()

(169, 169)

### Filter out for low threshold 
* Chose 70% or less cell coverage as the threshold.
* Most routes across the 3 providers have 90% or more coverage.
* Verizon has the best coverage. 85% of the routes have 90% or more coverage with Verizon. 

In [51]:
threshold = 71

In [52]:
# Only keep routes that have less than threshold across all 3 providers.
low_coverage = merge1[
    (merge1["percentage_verizon"] < threshold)
    & (merge1["percentage_tmobile"] < threshold)
    & (merge1["percentage_att"] < threshold)
]

In [53]:
# Create a column with median of all three providers
low_coverage["median_cell_coverage_percentage"] = low_coverage[
    ["percentage_verizon", "percentage_tmobile", "percentage_att"]
].mean(axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [143]:
len(low_coverage), type(low_coverage), low_coverage.route_id.nunique()

(68, geopandas.geodataframe.GeoDataFrame, 37)

In [144]:
# (low_coverage.loc[:,~low_coverage.columns.str.contains('^geometry', case=False)])

## Add trips
* Add one column for total trips that an agency completes across all routes.
* Add another column for total trips ONLY for routes with low coverage. 

In [145]:
trips = utilities.trip_df()

In [146]:
# Merge the 2 filtered at&t and verizon dataframes so we can see which routes overlap
trips_routes = pd.merge(
    low_coverage,
    trips,
    how="left",
    left_on=["itp_id", "route_id"],
    right_on=["calitp_itp_id", "route_id"],
    indicator=True,
)

In [147]:
# Should all route ids match?
trips_routes["_merge"].value_counts()

both          58
left_only     10
right_only     0
Name: _merge, dtype: int64

In [148]:
# Fill NA for total trips by agency/by route with median of each col
trips_routes = trips_routes.assign(
     total_trips_by_route = trips_routes.total_trips_by_route.fillna(trips_routes.total_trips_by_route.median()),
     total_trips_by_agency = trips_routes.total_trips_by_agency.fillna(trips_routes.total_trips_by_agency.median())
) 

In [174]:
trips_routes.total_trips_by_route.median(), trips_routes.total_trips_by_agency.median()

(20.5, 501.0)

In [149]:
# Add column for percentage of trips for the route in question compared with
# All the trips an agency has done
trips_routes["percentage_of_trips_w_low_cell_service"] = (
    trips_routes["total_trips_by_route"] / trips_routes["total_trips_by_agency"]
)

In [150]:
trips_routes = trips_routes.drop(columns=["_merge", "calitp_itp_id"])

## Add NTD

In [151]:
# Load NTD vehicles
ntd_df = utilities.ntd_vehicles()[["agency", "total_buses"]]

In [190]:
ntd_df.agency.unique()

array(['Los Angeles County Metropolitan Transportation Authority',
       'Orange County Transportation Authority', 'Access Services',
       'City and County of San Francisco',
       'San Diego Metropolitan Transit System',
       'California Vanpool Authority',
       'Alameda-Contra Costa Transit District',
       'San Diego Association of Governments',
       'Santa Clara Valley Transportation Authority', 'SamTrans',
       'City of Los Angeles', 'Sacramento Regional Transit District',
       'Victor Valley Transit Authority', 'Foothill Transit',
       'Riverside Transit Agency', 'Omnitrans',
       'North County Transit District', 'Long Beach Transit',
       'City of Santa Monica', 'Metropolitan Transportation Commission',
       'City of Fresno',
       'Golden Gate Bridge Highway and Transportation District',
       'Central Contra Costa Transit Authority',
       'Peninsula Corridor Joint Powers Board', 'San Joaquin Council',
       'Paratransit', 'San Joaquin Regional Trans

In [191]:
# Replace agency names in NTD to match routes_df above
ntd_df["agency"] = ntd_df["agency"].replace(
    {
        "Trinity County": "Trinity Transit",
        "City of Calabasas": "Calabasas Transit System",
        "County of Sonoma": "Sonoma County Transit",
        "Tehama County": "Tehama Rural Area eXpress",
        "Los Angeles County Department of Public Works - East L.A.": "East Los Angeles Shuttle",
        "Sacramento Regional Transit District": "Sacramento Regional Transit District",
        "Eastern Sierra Transit Authority": "Mammoth Lakes Transit System",
        "City of Lompoc": "City of Lompoc Transit",
        "San Luis Obispo Regional Transit Authority": "South County Transit Link",
        "City of Roseville": "Roseville Transit",
        "Los Angeles County Dept. of Public Works - Athens Shuttle Service": "the Link-Athens",
        "Los Angeles County Department of Public Works - Avocado Heights": "Avocado Heights/Bassett/West Valinda Shuttle",
        "Susanville Indian Rancheria": "Susanville Indian Rancheria Public Transportation Program",
        "Transit Joint Powers Authority for Merced County": "Merced The Bus",
        "City of Eureka": "Eureka Transit Service",
        "Nevada County Transit Services": "Gold Country Stage",
        "San Mateo County Transit District": "SamTrans",
        "Redwood Coast Transit Authority": "Redwood Coast Transit",
        "City of Avalon": "Avalon Transit",
        "City of Lodi": "Grapeline",
        "Golden Gate Bridge": "Golden Gate Bridge Highway and Transportation District",
        "City of Santa Maria": "Santa Maria Area Transit",
        'City and County of San Francisco': 'MUNI',
    }
)

In [192]:
# Replace agency names in agency4 to match ntd
trips_routes["agency"] = trips_routes["agency"].replace(
    {
        "Cloverdale Transit": "Sonoma County Transit",
    }
)

In [193]:
# Merge
trips_routes_ntd = pd.merge(
    trips_routes,
    ntd_df,
    how="left",
    on="agency",
    indicator=True,
)

In [194]:
# Create col to estimate number of buses that run in low cell coverage routes.

trips_routes_ntd["estimate_of_buses_in_low_cell_zones"] = (
    (trips_routes_ntd["total_buses"] * trips_routes_ntd["percentage_of_trips_w_low_cell_service"]
    )
    .fillna(1)
    .astype('int64')
   
)

In [195]:
# Fill 0 with 1?
trips_routes_ntd["estimate_of_buses_in_low_cell_zones"] = trips_routes_ntd[
    "estimate_of_buses_in_low_cell_zones"
].replace({0: 1})


## Final

### There are a lot of same route-id and same route-name combos but run by different agencies.
* Scanned [agencies](https://github.com/cal-itp/data-infra/blob/main/airflow/data/agencies.yml) to make sure all duplicates lead to the same gtsf schedule url.
* If so, del duplicates.

In [196]:
trips_routes_ntd.route_id.value_counts().head(10)

13050      9
13054      9
13061      9
14         4
1292       2
144        2
566        2
6          2
1042       1
705-240    1
Name: route_id, dtype: int64

In [197]:
# Del duplicates based on whichever agency with the same route has the most buses.
final = trips_routes_ntd.sort_values(
    ["route_id", "route_name", "total_buses"], ascending=[True, True, False]
).drop_duplicates(subset=["route_id", "route_name"])

In [198]:
final.shape, type(final), final.route_id.nunique()

((38, 21), geopandas.geodataframe.GeoDataFrame, 37)

In [199]:
final = final.set_geometry("geometry_overlay_att")

### Some of the results are surprising...wouldn't expect urban areas to have routes on this list

In [200]:
# https://stackoverflow.com/questions/38383886/drop-column-based-on-a-string-condition
final.loc[:, ~final.columns.str.contains("route_length|geometry|_merge|percentage_")].sort_values(
    "route_id"
)

Unnamed: 0,route_id,route_name,agency,itp_id,median_cell_coverage_percentage,total_trips_by_route,total_trips_by_agency,total_buses,estimate_of_buses_in_low_cell_zones
0,1042,"Guerneville, Monte Rio",Sonoma County Transit,70,36.55,8.0,251.0,77.0,2
1,1094,Sonora HWY 120,Yosemite Area Regional Transportation System,374,56.3,20.5,501.0,10.0,1
2,1210,Planada Commuter,Merced The Bus,343,66.81,6.0,565.0,67.0,1
4,1292,,Tehama Rural Area eXpress,334,46.68,2.0,57.0,10.0,1
6,13050,Avocado Heights/Bassett/West Valinda Shuttle,East Los Angeles Shuttle,172,67.17,23.0,501.0,17.0,1
15,13054,Edmund D. Edelman Children’s Court Shuttle,East Los Angeles Shuttle,172,53.88,49.0,501.0,17.0,1
24,13061,Wellness Center Shuttle,East Los Angeles Shuttle,172,70.96,48.0,501.0,17.0,1
35,14,"The Gold Route is operated by Arcata and Mad River Transit System and serves downtown Arcata, Humboldt State University, Valley West Shopping Center, and Alliance Rd.",Humboldt Transit Authority,135,67.41,6.0,147.0,29.0,1
37,144,South Main & Walnut,Tehama Rural Area eXpress,334,68.5,11.0,57.0,10.0,1
38,16672,Grass Valley to North San Juan,Gold Country Stage,221,66.3,10.0,113.0,25.0,2


#### How many total buses?

In [201]:
final.estimate_of_buses_in_low_cell_zones.sum()

100

In [208]:
final.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 38 entries, 0 to 67
Data columns (total 21 columns):
 #   Column                                  Non-Null Count  Dtype   
---  ------                                  --------------  -----   
 0   route_id                                38 non-null     object  
 1   route_name                              38 non-null     object  
 2   agency                                  38 non-null     object  
 3   itp_id                                  38 non-null     int64   
 4   geometry_overlay_att                    38 non-null     geometry
 5   route_length_overlay_att                38 non-null     float64 
 6   route_length_original_df                38 non-null     float64 
 7   percentage_att                          38 non-null     float64 
 8   geometry_overlay_verizon                38 non-null     geometry
 9   route_length_overlay_verizon            38 non-null     float64 
 10  percentage_verizon                      38 n

In [202]:
final.explore(
    "route_name",
    width=800,
    height=400,
    tooltip=["route_name"],
    style_kwds={"weight": 6},
    legend=False,
    color="tab20c",
)

### Checking with original df

In [203]:
# Grab route ID & agency
final_routes = final[["route_id", "agency"]]

In [210]:
final_routes.shape

(38, 2)

In [204]:
# Only keep final routes
routes_df2 = pd.merge(routes_df, final_routes, how="inner", on=["route_id", "agency"])

In [211]:
routes_df2.shape

(37, 7)

In [212]:
routes_df2.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 37 entries, 0 to 36
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   itp_id        37 non-null     int64   
 1   route_id      37 non-null     object  
 2   geometry      37 non-null     geometry
 3   route_type    37 non-null     object  
 4   route_name    37 non-null     object  
 5   agency        37 non-null     object  
 6   route_length  37 non-null     float64 
dtypes: float64(1), geometry(1), int64(1), object(4)
memory usage: 2.3+ KB


In [209]:
routes_df2.explore(
    "route_length",
    width=800,
    height=400,
    tooltip=["route_name", "agency"],
    style_kwds={"weight": 6},
    legend=False,
    color="tab20c",
)