## Analysis

In [1]:
import A1_provider_prep
import A2_other
import A3_analysis

import pandas as pd
import geopandas as gpd
from calitp.sql import to_snakecase
from shared_utils import geography_utils, utils



In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
# Read in no coverage maps
verizon = gpd.read_parquet(f"{A1_provider_prep.GCS_FILE_PATH}verizon_no_coverage_cal.parquet")

In [4]:
att = gpd.read_parquet(f"{A1_provider_prep.GCS_FILE_PATH}att_no_coverage_cal.parquet")

In [5]:
tmobile = gpd.read_parquet(f"{A1_provider_prep.GCS_FILE_PATH}tmobile_no_coverage_cal.parquet")

### Which routes touch areas without data coverage among all 3 providers?

In [26]:
# Find routes that run in areas without coverage 
# Across all 3 providers
routes1 = A3_analysis.merge_all_providers()

In [7]:
# Load in original geometries of the routes.
one_dist_routes, multi_dist_routes, all_routes = A2_other.find_multi_district_routes()

In [27]:
# Create bins to easily see the median percentage of the route WITH coverage.
bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
routes1["binned"] = pd.cut(routes1["median_percent_with_coverage"], bins)

In [28]:
f"There are {len(routes1)} routes that cross a zone w/o data coverage among ATT, T-Mobile, and Verizon out of {len(all_routes)} routes."

'There are 865 routes that cross a zone w/o data coverage among ATT, T-Mobile, and Verizon out of 2914 routes.'

### Filter out for low coverage routes
* Most routes (776 out of 865) have data coverage for 90-100% of the routes.
* Cut off based on bins.

In [34]:
routes1.binned.value_counts()

(90, 100]    776
(80, 90]      40
(70, 80]      30
(60, 70]       9
(50, 60]       6
(30, 40]       2
(40, 50]       2
(0, 10]        0
(10, 20]       0
(20, 30]       0
Name: binned, dtype: int64

In [35]:
# Filter out for routes where 75% or less of its length runs in a no data-coverage zone.
threshold = 75
low_coverage = (routes1.loc[routes1.median_percent_with_coverage < threshold]).reset_index(drop = True)

In [36]:
# Subset
subset_cols = ["agency", 'itp_id', 'route_id', "long_route_name", "District", "median_percent_with_coverage", "median_percent_no_coverage"]

In [37]:
low_coverage = low_coverage[subset_cols]

In [48]:
f"{len(low_coverage)} routes are considered low-data coverage"

'31 routes are considered low-data coverage'

### How many buses run through a route that has "low data coverage?"
* Find number of trips ran per these "low coverage" routes
* Find number of buses each agency from this "low coverage" dataframe owns.

In [159]:
def merge_trips(gdf):
    
    trips = A2_other.trip_df()
    
    m1 = pd.merge(
    gdf,
    trips,
    how="left",
    left_on=["itp_id", "route_id"],
    right_on=["calitp_itp_id", "route_id"])
    
    # Divide the total of trips for this particular low coverage route
    # by the total trips the agency run among all its routes on that particular day
    m1["percentage_of_trips_w_low_cell_service"] = (
    m1["total_trips_by_route"] / m1["total_trips_by_agency"]).astype('int64')
    
    return m1

In [None]:
def merge_ntd(gdf):
    
    # Load NTD
    ntd = A2_other.ntd_vehicles()[["agency", "total_buses"]]
    
    # Replace so it will merge properly with NTD
    gdf["agency"] = gdf["agency"].replace(
    {"Mammoth Lakes Transit System": "Eastern Sierra Transit Authority"})
    
    # Merge
    m1 = pd.merge(
    gdf,
    ntd,
    how="left",
    on="agency",
    indicator=True,)
    
    # Fill agencies with NA buses with median total buses
    median_total_buses = m1["total_buses"].median()
    m1["total_buses"] = m1["total_buses"].fillna(median_total_buses)
    
    # To get an estimate of buses that run in a low data zone.
    # Multiply the agency's total buses by the % of its total trips that 
    # run in a low data zone.Fill NA with the median of this.
    m1["estimate_of_buses_in_low_cell_zones"] = (
    (m1.total_buses * m1.percentage_of_trips_w_low_cell_service
    )
    .fillna((m1.total_buses * m1.percentage_of_trips_w_low_cell_service
    ).median())
    .astype('int64'))
    
    # Replace estimate of buses from 0 to 1. 
    m1.estimate_of_buses_in_low_cell_zones = m1.estimate_of_buses_in_low_cell_zones.replace({0:1})
    
    return m1

In [131]:
# m2.loc[m2._merge == "right_only"][['agency']].sort_values(by = 'agency')

In [132]:
# m2.loc[m2._merge == "left_only"][['agency']]

In [136]:
# Fill agencies with NA buses with median total buses
median_total_buses = m2["total_buses"].median()

In [137]:
m2["total_buses"] = m2["total_buses"].fillna(median_total_buses)

In [138]:
# To get an estimate of buses that run in a low data zone.
# Multiply the agency's total buses by the % of its total trips that 
# run in a low data zone.Fill NA with the median of this.
m2["estimate_of_buses_in_low_cell_zones"] = (
    (m2.total_buses * m2.percentage_of_trips_w_low_cell_service
    )
    .fillna((m2.total_buses * m2.percentage_of_trips_w_low_cell_service
    ).median())
    .astype('int64')
   
)

In [144]:
# Replace 0 buses in low cell zone to 1
m2.estimate_of_buses_in_low_cell_zones = m2.estimate_of_buses_in_low_cell_zones.replace({0:1})

In [145]:
cols_to_drop = ['itp_id', 'route_id', '_merge','total_buses']

### Conclusions

In [146]:
m2 = m2.drop(columns = cols_to_drop)

KeyError: "['itp_id' 'route_id' '_merge' 'total_buses'] not found in axis"

In [150]:
m2.estimate_of_buses_in_low_cell_zones.sum()

90

In [152]:
m2.long_route_name.nunique()

31

In [153]:
m2.District.value_counts()

D-4            8
D-1            7
D-6            3
D-2            3
D-9            2
D-5            2
D-10           2
D-1,D-4,D-4    1
D-1,D-1,D-2    1
D-3            1
D-7            1
Name: District, dtype: int64

In [151]:
m2.agency.value_counts()

Trinity Transit                                           4
Golden Gate Bridge Highway and Transportation District    3
Eastern Sierra Transit Authority                          2
Santa Cruz Metropolitan Transit District                  2
Redwood Coast Transit                                     2
MUNI                                                      2
Mendocino Transit Authority                               2
AC Transit                                                2
Yosemite Area Regional Transportation System              2
Eureka Transit Service                                    1
Gold Country Stage                                        1
Capitol Corridor                                          1
Humboldt Transit Authority                                1
Kern Transit                                              1
Los Angeles Department of Transportation                  1
Blue Lake Rancheria                                       1
Arcata and Mad River Transit System     

#### Map
* How to toggle map layers to switch between AT&T/T Mobile/Verizon?

In [127]:
# Grab the names of routes are "low coverage."
low_coverage_routes = low_coverage.long_route_name.unique().tolist()

In [128]:
# Get original geometry
low_coverage_og_geometry = (
    all_routes[all_routes["long_route_name"].isin(low_coverage_routes)]
).reset_index(drop=True)

In [129]:
m = tmobile.explore(
    tiles="CartoDB positron",
    width=800,
    height=500,
)

In [156]:
m = low_coverage_og_geometry.explore("long_route_name",
    m=m,legend = False,
)

# m