# Analysis
* Create yml for all my files. 
* Save overlay? 

In [None]:
# Read in zip files
# Graphs
import altair as alt
import fsspec
import geopandas as gpd
import intake
import numpy as np
import pandas as pd

# My utilities
import utilities
from calitp import *

# Display
from IPython.display import HTML, Image, Markdown, display, display_html

# Geometry
from shared_utils import geography_utils, utils

In [None]:
# Notebook display settings: show up to 100 columns, render floats with two
# decimals, and never truncate rows or the contents of a cell.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [None]:
# GCS folder for this analysis; the T-Mobile parquet below is read relative to it.
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/cellular_coverage/"

In [None]:
# Load the routes table through the project helper. Every provider overlay
# and the final map further down are built from this frame.
routes_df = utilities.load_unique_routes_df()

In [None]:
# Row / column counts, used later to compare against the merged result.
routes_df.shape

In [None]:
# Column names available on the routes table.
routes_df.columns

In [None]:
suffix = "test"

In [None]:
routes_df2 = routes_df.rename(
    columns={
        c: c + suffix
        for c in routes_df.columns
        if c in ["geometry", "route_type", "route_name"]
    }
)

In [None]:
routes_df2.columns

## T-Mobile

In [None]:
tmobile_df = gpd.read_parquet(f"{GCS_FILE_PATH}tmobile_california.parquet")

In [None]:
tmobile_m1 = utilities.route_cell_coverage(tmobile_df, routes_df, "_tmobile")

In [None]:
type(tmobile_m1)

In [None]:
tmobile_m1.shape

In [None]:
tmobile_m1.columns

In [None]:
(tmobile_m1["binned_tmobile"].value_counts() / len(tmobile_m1)) * 100

## AT&T

In [None]:
att_df = utilities.load_att()

In [None]:
att_m1 = utilities.route_cell_coverage(att_df, routes_df, "_att")

In [None]:
type(att_m1)

In [None]:
att_m1.shape

In [None]:
(att_m1["binned_att"].value_counts() / len(att_m1)) * 100

## Verizon

In [None]:
verizon_df = utilities.load_verizon()

In [None]:
verizon_m1 = utilities.route_cell_coverage(verizon_df, routes_df, "_verizon")

In [None]:
type(verizon_m1)

In [None]:
verizon_m1.shape

In [None]:
(verizon_m1["binned_verizon"].value_counts() / len(verizon_m1)) * 100

In [None]:
verizon_m1.columns

## Merge all three providers

In [None]:
common_cols = ["route_id", "route_name", "agency", "itp_id", "route_length_original_df"]

In [None]:
merge1 = att_m1.merge(verizon_m1, how="inner", on=common_cols).merge(
    tmobile_m1, how="outer", on=common_cols
)

In [None]:
# Del extra cols that are the same across dfs.
merge1 = merge1.drop(
    columns=[
        "binned_verizon",
        "binned_att",
        "binned_tmobile",
    ]
)

In [None]:
# Ensure this is a geometry col
merge1 = gpd.GeoDataFrame(merge1, geometry= 'geometry_overlay_verizon', crs="EPSG:4326")

In [None]:
type(merge1)

In [None]:
merge1.shape

### Some checks
* Original df against merged.

In [None]:
# 1 extra row missing??
routes_df.shape, merge1.shape

In [None]:
# Find the missing row
route_names_og = set(routes_df.route_name.unique().tolist())
route_names_merge = set(merge1.route_name.unique().tolist())
route_names_og - route_names_merge

In [None]:
routes_df.route_id.nunique(), merge1.route_id.nunique()

In [None]:
routes_df.itp_id.nunique(), merge1.itp_id.nunique()

### Filter out for low threshold 
* Chose 70% or less cell coverage as the threshold.
* Most routes across the 3 providers have 90% or more coverage.
* Verizon has the best coverage. 85% of the routes have 90% or more coverage with Verizon. 

In [None]:
# Merge low att & verizon coverage
# NOTE(review): `low_att_coverage` and `low_verizon_coverage` are not assigned
# anywhere in the visible notebook, and the suffixed key names used here
# (route_id_att, ...) differ from the unsuffixed keys in `common_cols` —
# confirm this cell still runs against the current helper output.
# The outer join with indicator=True adds a `_merge` column recording whether
# each row came from the left frame, the right frame, or both.
m1 = low_att_coverage.merge(
    low_verizon_coverage,
    how="outer",
    left_on=["route_id_att", "route_name_att", "agency_att", "itp_id_att"],
    right_on=[
        "route_id_verizon",
        "route_name_verizon",
        "agency_verizon",
        "itp_id_verizon",
    ],
    indicator=True,
)

In [None]:
# How many rows matched on the left only, the right only, or both sides.
m1["_merge"].value_counts()

In [None]:
# Turn to gpd
# The AT&T overlay geometry becomes the active geometry column.
m1 = gpd.GeoDataFrame(m1, geometry="geometry_overlay_att", crs="EPSG:4326")

In [None]:
type(m1)

In [None]:
# Disabled preview: the expression is wrapped in a bare string literal, so it
# is never evaluated as code.
""" m1.loc[m1["_merge"] == "both"][
    ["route_name_att", "route_name_verizon", "percentage_verizon", "percentage_att"]
].drop_duplicates(subset=["route_name_att"]) """

#### Check a left only value for Verizon
* Left only routes have more than 70% coverage through Verizon.

In [None]:
verizon_m1.loc[verizon_m1["route_name_verizon"] == "Ridgecrest Shuttle"][
    [
        "route_name_verizon",
        "percentage_verizon",
    ]
]

In [None]:
verizon_m1.loc[verizon_m1["route_name_verizon"] == "Mainline AM/PM"][
    [
        "route_name_verizon",
        "percentage_verizon",
    ]
]

In [None]:
# Filter out for only both, these are routes that really dont have any more than 60% cellular coverage among AT&T and Verizon
m1 = m1.loc[m1["_merge"] == "both"]

In [None]:
# Drop unwanted cols
m1 = m1.drop(
    columns=[
        "_merge",
        "route_type_att",
        "binned_att",
        "route_id_verizon",
        "route_name_verizon",
        "agency_verizon",
        "itp_id_verizon",
        "geometry_overlay_verizon",
        "route_length_overlay_verizon",
        "geometry_original_df_verizon",
        "route_type_verizon",
        "route_length_original_df_verizon",
        "binned_verizon",
    ]
)

In [None]:
m1.shape

In [None]:
# Merge att & verizon coverage
m2 = pd.merge(
    m1,
    low_tmobile_coverage,
    how="outer",
    left_on=["route_id_att", "route_name_att", "agency_att", "itp_id_att"],
    right_on=[
        "route_id_tmobile",
        "route_name_tmobile",
        "agency_tmobile",
        "itp_id_tmobile",
    ],
    indicator=True,
)

In [None]:
# Make sure it remains a GDF
m1 = gpd.GeoDataFrame(m1, geometry="geometry_overlay_att", crs="EPSG:4326")

In [None]:
m2["_merge"].value_counts()

In [None]:
# Filter out for only both, these are routes that really dont have any more than treshold % cellular coverage among all 3
m3 = m2.loc[m2["_merge"] == "both"]

In [None]:
m3 = m3.drop(
    columns=[
        "route_id_tmobile",
        "route_name_tmobile",
        "agency_tmobile",
        "itp_id_tmobile",
        "geometry_overlay_tmobile",
        "route_length_overlay_tmobile",
        "geometry_original_df_tmobile",
        "route_type_tmobile",
        "route_length_original_df_tmobile",
        "binned_tmobile",
        "_merge",
    ]
)

In [None]:
# preview df
m3.drop(columns=["geometry_original_df_att", "geometry_overlay_att"]).sample(3)

In [None]:
threshold = 70

In [None]:
# Only keep routes that have less than threshold across all 3 providers.
low_coverage = merge1[
    (merge1["percentage_verizon"] < threshold)
    & (merge1["percentage_tmobile"] < threshold)
    & (merge1["percentage_att"] < threshold)
]

In [None]:
# Create a column with median of all three providers
low_coverage["median_cell_coverage_percentage"] = low_coverage[
    ["percentage_verizon", "percentage_tmobile", "percentage_att"]
].mean(axis=1)

In [None]:
len(low_coverage), type(low_coverage), low_coverage.route_id.nunique()

In [None]:
low_coverage.columns

In [None]:
# (low_coverage.loc[:,~low_coverage.columns.str.contains('^geometry', case=False)]).head(3)

## Add trips
* Add one column for total trips that an agency completes across all routes.
* Add another column for total trips ONLY for routes with low coverage. 

In [None]:
trips  = utilities.trip_df()

In [None]:
trips.columns

In [None]:
# Merge the 2 filtered at&t and verizon dataframes so we can see which routes overlap
trips_routes = pd.merge(
    low_coverage,
    trips,
    how="left",
    left_on=["itp_id", "route_id"],
    right_on=["calitp_itp_id", 'route_id'],
    indicator=True,
)

In [None]:
trips_routes["_merge"].value_counts()

In [None]:
# Add column for percentage of trips for the route in question compared with
# All the trips an agency has done
trips_routes["percentage_of_trips_w_low_cell_service"] = (
    trips_routes["total_trips_by_route"] / trips_routes["total_trips_by_agency"]
)

In [None]:
trips_routes = trips_routes.drop(columns=["_merge", "calitp_itp_id"])

In [None]:
trips_routes.shape

## Add NTD

In [None]:
# Load NTD vehicles
ntd_df = utilities.ntd_vehicles()[["agency", "total_buses"]]

In [None]:
# Replace agency names in NTD to match routes_df above
ntd_df["agency"] = ntd_df["agency"].replace(
    {
        "Trinity County": "Trinity Transit",
        "City of Calabasas": "Calabasas Transit System",
        "County of Sonoma": "Sonoma County Transit",
        "Tehama County": "Tehama Rural Area eXpress",
        "Los Angeles County Department of Public Works - East L.A.": "East Los Angeles Shuttle",
        "Sacramento Regional Transit District": "Sacramento Regional Transit District",
        "Eastern Sierra Transit Authority": "Mammoth Lakes Transit System",
        "City of Lompoc": "City of Lompoc Transit",
        "San Luis Obispo Regional Transit Authority": "South County Transit Link",
        "City of Roseville": "Roseville Transit",
        "Los Angeles County Dept. of Public Works - Athens Shuttle Service": "the Link-Athens",
        "Los Angeles County Department of Public Works - Avocado Heights": "Avocado Heights/Bassett/West Valinda Shuttle",
        "Susanville Indian Rancheria": "Susanville Indian Rancheria Public Transportation Program",
        "Transit Joint Powers Authority for Merced County": "Merced The Bus",
        "City of Eureka": "Eureka Transit Service",
        "Nevada County Transit Services": "Gold Country Stage",
        "San Mateo County Transit District": "SamTrans",
        "Redwood Coast Transit Authority": "Redwood Coast Transit",
        "City of Avalon": "Avalon Transit",
        "City of Lodi": "Grapeline",
        "Golden Gate Bridge": "Golden Gate Bridge Highway and Transportation District",
        "City of Santa Maria": "Santa Maria Area Transit",
    }
)

In [None]:
# Replace agency names in agency4 to match ntd
trips_routes["agency"] = trips_routes["agency"].replace(
    {
        "Cloverdale Transit": "Sonoma County Transit",
    }
)

In [None]:
trips_routes_ntd = pd.merge(
    trips_routes,
    ntd_df,
    how="left",
    on="agency",
    indicator=True,
)

In [None]:
# Create col to find number of buses that run in low cell coverage routes.
trips_routes_ntd["estimate_of_buses_in_low_cell_zones"] = (
    (trips_routes_ntd["total_buses"] * trips_routes_ntd["percentage_of_trips_w_low_cell_service"])
    .fillna(0)
    .astype("int64")
)

## Final

### There are a lot of same route-id and same route-name combos but run by different agencies.
* Scanned [agencies](https://github.com/cal-itp/data-infra/blob/main/airflow/data/agencies.yml) to make sure all duplicates point to the same GTFS schedule URL.
* If so, delete the duplicates.

In [None]:
trips_routes_ntd.route_id.value_counts()

In [None]:
# Del duplicates based on whichever agency with the same route has the most buses.
final = trips_routes_ntd.sort_values(
    ["route_id", "route_name", "total_buses"], ascending=[True, True, False]
).drop_duplicates(subset=["route_id", "route_name"])

In [None]:
final.shape, type(final)

In [None]:
final= final.set_geometry('geometry_overlay_att')

### Some of the results are surprising...wouldn't expect urban areas to have routes on this list

In [None]:
# https://stackoverflow.com/questions/38383886/drop-column-based-on-a-string-condition
final.loc[:, ~final.columns.str.contains('route_length|geometry|_merge')]

In [None]:
final.explore(
    "route_name",
    width=800,
    height=400,
    tooltip=["route_name"],
    style_kwds={"weight": 6},
    legend=False,
    color="tab20c",
)

In [None]:
# Grab route ID&agency
final_routes = final[["route_id", "agency"]]

In [None]:
routes_df2 = pd.merge(routes_df, final_routes, how="inner", on=["route_id", "agency"])

In [None]:
routes_df2.columns

In [None]:
routes_df2.explore(
    "route_length",
    width=800,
    height=400,
    tooltip=["route_name", "agency"],
    style_kwds={"weight": 6},
    legend=False,
    color="tab20c",
)