# Analysis
* Create yml for all my files. 
* Save overlay? 

In [1]:
import altair as alt
import fsspec
import geopandas as gpd
import intake
import numpy as np
import pandas as pd
import A1_provider_prep
import A2_analysis
import A3_other
from calitp import *
from IPython.display import HTML, Image, Markdown, display, display_html
from shared_utils import geography_utils, utils



In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
routes_df = A3_other.load_unique_routes_df()

In [4]:
routes_df.shape

(2829, 7)

## T-Mobile

In [5]:
tmobile_df = gpd.read_parquet(f"{A1_provider_prep.GCS_FILE_PATH}tmobile_california.parquet")

In [None]:
tmobile_m1 = A2_analysis.route_cell_coverage(tmobile_df, routes_df, "_tmobile")

In [None]:
tmobile_m1.shape

In [None]:
(tmobile_m1["binned_tmobile"].value_counts() / len(tmobile_m1)) * 100

## AT&T

In [None]:
att_df = A1_provider_prep.load_att()

In [None]:
att_m1 = A2_analysis.route_cell_coverage(att_df, routes_df, "_att")

In [None]:
att_m1.shape

In [None]:
(att_m1["binned_att"].value_counts() / len(att_m1)) * 100

In [None]:
#att_m1.loc[att_m1['agency'] == "Roseville Transit"].drop(columns = ['geometry_overlay_att'])

## Verizon
* Verizon has the best coverage. 85% of the routes have 90% or more coverage with Verizon. 

In [None]:
verizon_df = A1_provider_prep.load_verizon()

In [None]:
verizon_m1 = A2_analysis.route_cell_coverage(verizon_df, routes_df, "_verizon")

In [None]:
verizon_m1.shape

In [None]:
(verizon_m1["binned_verizon"].value_counts() / len(verizon_m1)) * 100

## Merge all three providers

In [None]:
common_cols = ["route_id", "route_name", "agency", "itp_id", "route_length_original_df"]

In [None]:
merge1 = att_m1.merge(verizon_m1, how="outer", on=common_cols).merge(
    tmobile_m1, how="outer", on=common_cols
)

In [None]:
# Drop the per-provider binned coverage columns from the merged frame.
binned_cols = ["binned_verizon", "binned_att", "binned_tmobile"]
merge1 = merge1.drop(columns=binned_cols)

In [None]:
# Ensure this remains a gdf.
merge1 = gpd.GeoDataFrame(merge1, geometry="geometry_overlay_verizon", crs="EPSG:4326")

In [None]:
type(merge1)

In [None]:
merge1.shape

### Some checks
* Original df against merged.

In [None]:
# 1 extra row missing??
routes_df.shape, merge1.shape

In [None]:
# Find the missing row
original = set(routes_df.route_name.unique().tolist())
merge = set(merge1.route_name.unique().tolist())
original - merge

In [None]:
routes_df.route_id.nunique(), merge1.route_id.nunique()

In [None]:
routes_df.itp_id.nunique(), merge1.itp_id.nunique()

### Route Coverage - using median % of coverage across 3 providers.
* 78% of routes have 90-100% of coverage. 

In [None]:
# Row-wise median of the three providers' coverage percentages.
provider_pct_cols = ["percentage_verizon", "percentage_tmobile", "percentage_att"]
merge1["median_cell_coverage_percentage"] = merge1[provider_pct_cols].median(axis=1)

In [None]:
# Bin the median?
bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

In [None]:
# Share of routes (in %) falling in each 10-point bin of median coverage.
# include_lowest=True closes the first bin on the left; without it a median of
# exactly 0 falls outside (0, 10], becomes NaN and is dropped by value_counts().
(
    pd.cut(merge1["median_cell_coverage_percentage"], bins, include_lowest=True).value_counts()
    / len(merge1)
) * 100

### Filter out for low threshold 
* Using the median cell coverage percentage, flag routes with less than 70% cell coverage (the filter below is strictly `< 70`, so a route at exactly 70% is not included).

In [None]:
threshold = 70

In [None]:
# Keep only the routes whose median coverage is strictly below the threshold.
is_low_coverage = merge1["median_cell_coverage_percentage"] < threshold
low_coverage = merge1[is_low_coverage]

In [None]:
len(low_coverage), type(low_coverage), low_coverage.route_id.nunique()

In [None]:
# (low_coverage.loc[:,~low_coverage.columns.str.contains('^geometry', case=False)])

## Add trips
* Add one column for total trips that an agency completes across all routes.
* Add another column for total trips ONLY for routes with low coverage. 
* Divide these 2 columns for percentage of trips an agency runs in a low coverage zone.
* Fill NA for total trips by agency/by route with median of each col

In [None]:
trips = A3_other.trip_df()

In [None]:
# Attach the trips data to the low-coverage routes, matching on agency ITP id
# and route id. The left join keeps every low-coverage route; indicator=True
# adds a `_merge` column showing which routes found a match in `trips`.
trips_routes = pd.merge(
    low_coverage,
    trips,
    how="left",
    left_on=["itp_id", "route_id"],
    right_on=["calitp_itp_id", "route_id"],
    indicator=True,
)

In [None]:
# Should all route ids match?
trips_routes["_merge"].value_counts()

In [None]:
trips_routes.total_trips_by_route.median(), trips_routes.total_trips_by_agency.median()

In [None]:
# Replace missing trip totals (per route and per agency) with that column's median.
trips_routes = trips_routes.assign(
    **{
        trip_col: trips_routes[trip_col].fillna(trips_routes[trip_col].median())
        for trip_col in ("total_trips_by_route", "total_trips_by_agency")
    }
)

In [None]:
# Share of an agency's trips that run on this low-coverage route:
# trips on the route / all trips by the agency.
# NOTE: despite the "percentage" in the name, this is a plain ratio (it is not
# multiplied by 100); it is later multiplied by total_buses as a fraction.
trips_routes["percentage_of_trips_w_low_cell_service"] = (
    trips_routes["total_trips_by_route"] / trips_routes["total_trips_by_agency"]
)

In [None]:
trips_routes = trips_routes.drop(columns=["_merge", "calitp_itp_id"])

## Add NTD

In [None]:
# Load NTD vehicles, keeping only the agency name and its total bus count.
# NOTE(review): `utilities` is not among this notebook's explicit imports (only
# A1_provider_prep, A2_analysis and A3_other are). Unless the
# `from calitp import *` wildcard supplies it, this line raises NameError on a
# fresh kernel -- confirm which module provides ntd_vehicles().
ntd_df = utilities.ntd_vehicles()[["agency", "total_buses"]]

In [None]:
# ntd_df.agency.unique()

In [None]:
# Rename NTD agencies so they match the agency names used in routes_df above.
# Keys are the names as they appear in NTD; values are the names used in the
# routes data.
ntd_df["agency"] = ntd_df["agency"].replace(
    {
        "Trinity County": "Trinity Transit",
        "City of Calabasas": "Calabasas Transit System",
        "County of Sonoma": "Sonoma County Transit",
        "Tehama County": "Tehama Rural Area eXpress",
        "Los Angeles County Department of Public Works - East L.A.": "East Los Angeles Shuttle",
        # Identity mapping (no-op), kept from the original list.
        "Sacramento Regional Transit District": "Sacramento Regional Transit District",
        "Eastern Sierra Transit Authority": "Mammoth Lakes Transit System",
        "City of Lompoc": "City of Lompoc Transit",
        "San Luis Obispo Regional Transit Authority": "South County Transit Link",
        "City of Roseville": "Roseville Transit",
        "Los Angeles County Dept. of Public Works - Athens Shuttle Service": "the Link-Athens",
        "Los Angeles County Department of Public Works - Avocado Heights": "Avocado Heights/Bassett/West Valinda Shuttle",
        "Susanville Indian Rancheria": "Susanville Indian Rancheria Public Transportation Program",
        "Transit Joint Powers Authority for Merced County": "Merced The Bus",
        "City of Eureka": "Eureka Transit Service",
        "Nevada County Transit Services": "Gold Country Stage",
        "San Mateo County Transit District": "SamTrans",
        "Redwood Coast Transit Authority": "Redwood Coast Transit",
        "City of Avalon": "Avalon Transit",
        "City of Lodi": "Grapeline",
        "Golden Gate Bridge": "Golden Gate Bridge Highway and Transportation District",
        "City of Santa Maria": "Santa Maria Area Transit",
        "City and County of San Francisco": "MUNI",
        "Alameda-Contra Costa Transit District": "AC Transit",
        "Kern Regional Transit": "Kern Transit",
        # "County of Placer" used to be listed twice (the first entry mapped it
        # to "Tahoe Transportation"). A dict literal keeps only the last value
        # for a repeated key, so the mapping below is the one that was actually
        # applied; the dead duplicate has been removed. A single NTD name cannot
        # be renamed to two different agencies with replace().
        # TODO(review): if "Tahoe Transportation" routes should also receive the
        # County of Placer bus count, rename that agency on the routes side
        # before merging instead.
        "County of Placer": "Tahoe Truckee Area Regional Transportation",
    }
)

In [None]:
# Rename agencies in trips_routes so they match the (renamed) NTD agency names
# used as the merge key below.
trips_routes["agency"] = trips_routes["agency"].replace(
    {
        "Cloverdale Transit": "Sonoma County Transit",
    }
)

In [None]:
# Merge
trips_routes_ntd = pd.merge(
    trips_routes,
    ntd_df,
    how="left",
    on="agency",
    indicator=True,
)

#### Estimate # of buses that run in low cellular coverage areas
* Create a column estimating the number of buses that run on low cell coverage routes, based on the share of an agency's total trips that are run on those routes.
* Multiply the aforementioned share by total buses.
* Fill missing values for total buses & total buses run in low coverage zones columns with the median of each. 
* Fill values of 0 with 1, since at least one bus ran through that route. 

In [None]:
# Fill na with median total buses
median_total_buses = trips_routes_ntd["total_buses"].median()
median_total_buses

In [None]:
trips_routes_ntd["total_buses"] = trips_routes_ntd["total_buses"].fillna(median_total_buses)

In [None]:

# Estimated buses on each low-coverage route: the agency's total buses scaled
# by the share of its trips that run on that route.
bus_estimate = (
    trips_routes_ntd["total_buses"] * trips_routes_ntd["percentage_of_trips_w_low_cell_service"]
)
# Missing estimates take the median estimate; the int64 cast truncates the
# fractional part.
trips_routes_ntd["estimate_of_buses_in_low_cell_zones"] = bus_estimate.fillna(
    bus_estimate.median()
).astype("int64")

In [None]:
# An estimate of 0 is bumped to 1: at least one bus has to run the route.
bus_estimate_col = "estimate_of_buses_in_low_cell_zones"
trips_routes_ntd[bus_estimate_col] = trips_routes_ntd[bus_estimate_col].replace(0, 1)


## Final

### There are a lot of same route-id and same route-name combos but run by different agencies.
* Scanned [agencies](https://github.com/cal-itp/data-infra/blob/main/airflow/data/agencies.yml) to make sure all duplicates lead to the same GTFS schedule URL.
* If so, delete the duplicates.

In [None]:
trips_routes_ntd.route_id.value_counts().head(10)

In [None]:
# The same route (route_id + route_name) can appear under several agencies.
# Order rows so the agency with the most buses comes first within each route,
# then keep only that first row per route.
dedupe_keys = ["route_id", "route_name"]
ranked_by_buses = trips_routes_ntd.sort_values(
    by=dedupe_keys + ["total_buses"], ascending=[True, True, False]
)
final = ranked_by_buses.drop_duplicates(subset=dedupe_keys, keep="first")

In [None]:
final.shape, final.route_id.nunique(), type(final)

In [None]:
# Set geometry.
final = final.set_geometry("geometry_overlay_verizon")

### Results
* Some of the results are surprising...wouldn't expect urban areas to have routes on this list
* Also there is one Arizona route here? 

In [None]:
# https://stackoverflow.com/questions/38383886/drop-column-based-on-a-string-condition
final.loc[:, ~final.columns.str.contains("route_length|geometry|_merge|percentage_")].sort_values(
    "route_id"
)

#### How many total buses? Routes? 

In [None]:
f"{final.estimate_of_buses_in_low_cell_zones.sum()} total buses running in areas with low cell coverage & {final.route_id.nunique()} different routes."

#### How often do agencies appear?

In [None]:
final.agency.value_counts()

#### Checking with original dataframes
* Explore why there are some urban areas that appear in the final dataframe?

In [None]:
# Grab route ID & agency
final_routes = final[["route_id", "agency"]]

In [None]:
final_routes_list = final.route_id.unique().tolist()

In [None]:
# Only keep final routes in the original routes_df
routes_original_df = pd.merge(routes_df, final_routes, how="inner", on=["route_id", "agency"])

In [None]:
"""
final_routes_merge.loc[:, ~final_routes_merge.columns.str.contains("itp_id|geometry|_merge")].sort_values(
    "route_id"
)
"""

In [None]:
routes_original_df.explore(
    "route_id",
    width=800,
    height=400,
    tooltip=["route_name", "agency", "route_id"],
    style_kwds={"weight": 6},
    legend=False,
    color="tab20c",
)

In [None]:
def comparison(gdf_left, gdf_right):
    """Intersect two GeoDataFrames and measure the resulting geometries.

    Runs ``gpd.overlay`` with ``how="intersection"`` and
    ``keep_geom_type=True`` on the two inputs, then adds a ``route_length``
    column holding each intersected geometry's length after reprojecting to
    ``geography_utils.CA_StatePlane``.

    Args:
        gdf_left: Left GeoDataFrame passed to the overlay.
        gdf_right: Right GeoDataFrame passed to the overlay.

    Returns:
        The overlay result with the extra ``route_length`` column.
    """
    intersected = gpd.overlay(gdf_left, gdf_right, how="intersection", keep_geom_type=True)

    # Lengths are taken in CA_StatePlane rather than in the frame's own CRS.
    projected_lengths = intersected.geometry.to_crs(geography_utils.CA_StatePlane).length

    return intersected.assign(route_length=projected_lengths)

In [None]:
# Subset original df
routes_test= routes_df[routes_df["agency"].isin(["Sacramento Regional Transit District",
                                                 "Golden Gate Bridge Highway and Transportation District"])]

In [None]:
routes_test.shape

In [None]:
# Break out comparison function in utilities
# Intersect the subset of routes with the Verizon coverage directly via
# gpd.overlay instead of calling comparison(). Note the difference: here
# keep_geom_type=False, whereas comparison() passes keep_geom_type=True.
verizon_test = gpd.overlay(
        routes_test, verizon_df, how="intersection", keep_geom_type=False
    )

In [None]:
verizon_test=verizon_test.to_crs(geography_utils.CA_StatePlane)

In [None]:
# Length of each intersected piece, measured in CA_StatePlane.
# NOTE: verizon_test is already reprojected to CA_StatePlane in the previous
# cell, so the to_crs call here is redundant when the cells run in order.
verizon_test["route_length"] = verizon_test.geometry.to_crs(geography_utils.CA_StatePlane).length

#### Sac Route 30

In [None]:
# Original dataframe
sac_og = routes_test[(routes_test['route_id'] == "30")]
sac_og.drop(columns = ['geometry'])

In [None]:
sac_og.plot(figsize=(10,10), lw=6 )

In [None]:
# Only the comparison length: intersect Verizon df with Routes df & grab the length 
sac_verizon = verizon_test[(verizon_test['route_id'] == "30")]
sac_verizon.drop(columns = ['geometry'])

In [None]:
sac_verizon.plot(figsize=(10,10), lw=6)

In [None]:
# The original routes_cell_coverage function
verizon_m1[(verizon_m1['agency'] == "Sacramento Regional Transit District") & (verizon_m1['route_id'] == "30")].drop(columns = ['geometry_overlay_verizon'])

#### Golden Gate Bridge Highway and Transportation District 705-240

In [None]:
# Original dataframe
sf_og = routes_test[(routes_test['agency'] == "Golden Gate Bridge Highway and Transportation District") & (routes_test['route_id'] == "705-240")]

In [None]:
sf_og.plot(figsize=(10,10), lw=6 )

In [None]:
# Intersection between original routes & verizon.
# Split? 
sf_test = verizon_test[(verizon_test['agency'] == "Golden Gate Bridge Highway and Transportation District") & (verizon_test['route_id'] == "705-240")] 
sf_test.drop(columns = ['geometry'])

In [None]:
sf_test.route_length.sum()

In [None]:
sf_test.plot(figsize=(10,10), lw=6 )

In [None]:
# NOTE(review): hard-coded numbers, presumably copied by hand from earlier cell
# outputs (two covered segment lengths from sf_test over the original route
# length) -- confirm, or compute from the dataframes so it stays in sync.
(28044.65+9725.39)/56918.54

In [None]:
# The entire function route_cell_coverage
verizon_m1[(verizon_m1['agency'] == "Golden Gate Bridge Highway and Transportation District") & (verizon_m1['route_id'] == "705-240")].drop(columns = ['geometry_overlay_verizon'])