# Analysis
* Create yml for all my files. 
* Save overlay? 

In [1]:
# Read in zip files
# Graphs
import altair as alt
import fsspec
import geopandas as gpd
import intake
import numpy as np
import pandas as pd

# My utilities
import utilities
from calitp import *

# Display
from IPython.display import HTML, Image, Markdown, display, display_html

# Geometry
from shared_utils import geography_utils, utils



In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/cellular_coverage/"

In [4]:
routes_df = utilities.load_unique_routes_df()

In [5]:
routes_df.shape

(2829, 7)

## T-Mobile

In [6]:
tmobile_df = gpd.read_parquet(f"{GCS_FILE_PATH}tmobile_california.parquet")

In [7]:
tmobile_m1 = utilities.route_cell_coverage(tmobile_df, routes_df, "_tmobile")

In [8]:
type(tmobile_m1)

geopandas.geodataframe.GeoDataFrame

In [9]:
(tmobile_m1["binned_tmobile"].value_counts() / len(tmobile_m1)) * 100

(90, 100]   71.78
(80, 90]     6.12
(70, 80]     3.08
(60, 70]     1.31
(50, 60]     0.74
(40, 50]     0.39
(30, 40]     0.28
(20, 30]     0.14
(10, 20]     0.04
(0, 10]      0.00
Name: binned_tmobile, dtype: float64

## AT&T

In [10]:
att_df = utilities.load_att()

In [11]:
att_m1 = utilities.route_cell_coverage(att_df, routes_df, "_att")

In [12]:
type(att_m1)

geopandas.geodataframe.GeoDataFrame

In [13]:
(att_m1["binned_att"].value_counts() / len(att_m1)) * 100

(90, 100]   73.32
(80, 90]     4.93
(70, 80]     2.66
(60, 70]     1.56
(50, 60]     1.35
(10, 20]     0.14
(20, 30]     0.07
(30, 40]     0.07
(40, 50]     0.07
(0, 10]      0.00
Name: binned_att, dtype: float64

## Verizon

In [14]:
verizon_df = utilities.load_verizon()

In [15]:
verizon_m1 = utilities.route_cell_coverage(verizon_df, routes_df, "_verizon")

In [16]:
type(verizon_m1)

geopandas.geodataframe.GeoDataFrame

In [17]:
(verizon_m1["binned_verizon"].value_counts() / len(verizon_m1)) * 100

(90, 100]   85.54
(80, 90]     5.95
(70, 80]     2.62
(60, 70]     1.35
(50, 60]     1.03
(40, 50]     0.11
(10, 20]     0.07
(30, 40]     0.07
(0, 10]      0.00
(20, 30]     0.00
Name: binned_verizon, dtype: float64

## Compare routes across providers.
* https://geopandas.org/en/stable/docs/user_guide/mergingdata.html
* Make sure CRS are the same.

In [18]:
# Threshold
theshold = 70

In [19]:
# Filter dfs to meet threshold
low_att_coverage = att_m1.loc[att_m1["percentage_att"] < theshold].reset_index(
    drop=True
)
low_verizon_coverage = verizon_m1.loc[
    verizon_m1["percentage_verizon"] < theshold
].reset_index(drop=True)
low_tmobile_coverage = tmobile_m1.loc[
    tmobile_m1["percentage_tmobile"] < theshold
].reset_index(drop=True)

In [20]:
low_att_coverage.shape, low_verizon_coverage.shape, low_tmobile_coverage.shape,

((92, 11), (74, 11), (82, 11))

In [21]:
type(low_att_coverage), type(low_verizon_coverage), type(low_tmobile_coverage)

(geopandas.geodataframe.GeoDataFrame,
 geopandas.geodataframe.GeoDataFrame,
 geopandas.geodataframe.GeoDataFrame)

### Merge Verizon & ATT

In [62]:
# Merge low att & verizon coverage
m1 = low_att_coverage.merge(
    low_verizon_coverage,
    how="outer",
    left_on=["route_id_att", "route_name_att", "agency_att", "itp_id_att"],
    right_on=[
        "route_id_verizon",
        "route_name_verizon",
        "agency_verizon",
        "itp_id_verizon",
    ],
    indicator=True,
)

In [63]:
m1["_merge"].value_counts()

both          64
left_only     28
right_only    10
Name: _merge, dtype: int64

In [64]:
# Turn to gpd
m1 = gpd.GeoDataFrame(m1, geometry="geometry_overlay_att", crs="EPSG:4326")

In [65]:
type(m1)

geopandas.geodataframe.GeoDataFrame

In [66]:
""" m1.loc[m1["_merge"] == "both"][
    ["route_name_att", "route_name_verizon", "percentage_verizon", "percentage_att"]
].drop_duplicates(subset=["route_name_att"]) """

' m1.loc[m1["_merge"] == "both"][\n    ["route_name_att", "route_name_verizon", "percentage_verizon", "percentage_att"]\n].drop_duplicates(subset=["route_name_att"]) '

#### Check a left only value for Verizon
* Left only routes have more than 70% coverage through Verizon.

In [67]:
verizon_m1.loc[verizon_m1["route_name_verizon"] == "Ridgecrest Shuttle"][
    [
        "route_name_verizon",
        "percentage_verizon",
    ]
]

Unnamed: 0,route_name_verizon,percentage_verizon
246,Ridgecrest Shuttle,99.99


In [68]:
verizon_m1.loc[verizon_m1["route_name_verizon"] == "Mainline AM/PM"][
    [
        "route_name_verizon",
        "percentage_verizon",
    ]
]

Unnamed: 0,route_name_verizon,percentage_verizon
2719,Mainline AM/PM,72.45
2720,Mainline AM/PM,72.45


In [69]:
# Filter out for only both, these are routes that really dont have any more than 60% cellular coverage among AT&T and Verizon
m1 = m1.loc[m1["_merge"] == "both"]

In [70]:
# Drop unwanted cols
m1 = m1.drop(
    columns=[
        "_merge",
        "route_type_att",
        "binned_att",
        "route_id_verizon",
        "route_name_verizon",
        "agency_verizon",
        "itp_id_verizon",
        "geometry_overlay_verizon",
        "route_length_overlay_verizon",
        "geometry_original_df_verizon",
        "route_type_verizon",
        "route_length_original_df_verizon",
        "binned_verizon",
    ]
)

In [71]:
m1.shape

(64, 10)

### Merge m1 with T-Mobile

In [72]:
# Merge att & verizon coverage
m2 = pd.merge(
    m1,
    low_tmobile_coverage,
    how="outer",
    left_on=["route_id_att", "route_name_att", "agency_att", "itp_id_att"],
    right_on=[
        "route_id_tmobile",
        "route_name_tmobile",
        "agency_tmobile",
        "itp_id_tmobile",
    ],
    indicator=True,
)

In [73]:
# Make sure it remains a GDF
m1 = gpd.GeoDataFrame(m1, geometry="geometry_overlay_att", crs="EPSG:4326")

In [74]:
m2["_merge"].value_counts()

both          55
right_only    27
left_only      9
Name: _merge, dtype: int64

In [75]:
# Filter out for only both, these are routes that really dont have any more than treshold % cellular coverage among all 3
m3 = m2.loc[m2["_merge"] == "both"]

In [76]:
m3 = m3.drop(columns=['route_id_tmobile', 'route_name_tmobile',
       'agency_tmobile', 'itp_id_tmobile', 'geometry_overlay_tmobile',
       'route_length_overlay_tmobile', 'geometry_original_df_tmobile',
       'route_type_tmobile', 'route_length_original_df_tmobile','binned_tmobile', '_merge'])

In [77]:
# preview df 
m3.drop(columns = ['geometry_original_df_att', 'geometry_overlay_att']).sample(3)

Unnamed: 0,route_id_att,route_name_att,agency_att,itp_id_att,route_length_overlay_att,route_length_original_df_att,percentage_att,percentage_verizon,percentage_tmobile
33,225,South Coast / Ukiah,Mendocino Transit Authority,198.0,260518.01,508378.59,51.24,51.59,49.86
28,144,South Main & Walnut,Tehama Rural Area eXpress,334.0,46711.56,67178.13,69.53,66.44,69.53
15,13054,Edmund D. Edelman Children’s Court Shuttle,East Los Angeles Shuttle,172.0,4041.64,7500.92,53.88,53.87,53.88


# Add trips

### Add trips for only the routes w/ low coverage.

In [207]:
trips_route, trips_agency = utilities.trip_df()

In [208]:
trips_route.calitp_itp_id.nunique()

165

In [209]:
len(trips_route)

2949

In [210]:
# Merge the 2 filtered at&t and verizon dataframes so we can see which routes overlap
m4 = pd.merge(
    m3,
    trips_route,
    how="left",
    left_on=["route_id_att", "itp_id_att"],
    right_on=["route_id", "calitp_itp_id"],
    indicator=True,
)

In [211]:
m4["_merge"].value_counts()

both          46
left_only      9
right_only     0
Name: _merge, dtype: int64

In [212]:
m4 = m4.drop(columns=["_merge"])

### Add total trips across the agency.

In [213]:
# Merge the 2 filtered at&t and verizon dataframes so we can see which routes overlap
m5 = pd.merge(
    m4,
    trips_agency,
    how="left",
    left_on=["itp_id_att"],
    right_on=["calitp_itp_id"],
    indicator=True,
)

In [214]:
# Why are the merge count so low?
m5["_merge"].value_counts()

both          50
left_only      5
right_only     0
Name: _merge, dtype: int64

In [215]:
m5.shape

(55, 17)

In [216]:
# Add column for percentage of trips for the route in question compared with
# All the trips an agency has done
m5['percentage_of_trips_w_low_cell_service'] = m5['total_trips_by_route']/m5['total_trips_by_agency'] 

In [217]:
# Preview
m5[['agency_att','itp_id_att','total_trips_by_agency','total_trips_by_route','percentage_of_trips_w_low_cell_service']]

Unnamed: 0,agency_att,itp_id_att,total_trips_by_agency,total_trips_by_route,percentage_of_trips_w_low_cell_service
0,Cloverdale Transit,70.0,251.0,8.0,0.03
1,Yosemite Area Regional Transportation System,374.0,12.0,,
2,Merced The Bus,343.0,565.0,6.0,0.01
3,Susanville Indian Rancheria Public Transportation Program,329.0,57.0,2.0,0.04
4,Tehama Rural Area eXpress,334.0,57.0,2.0,0.04
5,Avocado Heights/Bassett/West Valinda Shuttle,171.0,501.0,23.0,0.05
6,East Los Angeles Shuttle,172.0,501.0,23.0,0.05
7,East Valinda Shuttle,173.0,501.0,23.0,0.05
8,Sunshine Bus(South Whittier),174.0,501.0,23.0,0.05
9,the Link Florence-Firestone/Walnut Park,177.0,501.0,23.0,0.05


In [218]:
# Del unwanted cols
m5 = m5.drop(columns = ['calitp_itp_id_x', 'route_id', 'total_trips_by_route',
       'calitp_itp_id_y', 'total_trips_by_agency', '_merge',])

# Add NTD
* How to incorporate total trips w/ total buses?

In [219]:
# Load NTD vehicles
ntd_df = utilities.ntd_vehicles()

In [220]:
# ntd_df["agency"].sort_values().unique().tolist()

In [221]:
# Replace agency names in NTD to match m4 above
ntd_df["agency"] = ntd_df["agency"].replace(
    {
        "Trinity County": "Trinity Transit",
        "City of Calabasas": "Calabasas Transit System",
        "County of Sonoma": "Sonoma County Transit",
        "Tehama County": "Tehama Rural Area eXpress",
        "Los Angeles County Department of Public Works - East L.A.": "East Los Angeles Shuttle",
        "Sacramento Regional Transit District": "Sacramento Regional Transit District",
        "Eastern Sierra Transit Authority": "Mammoth Lakes Transit System",
        "City of Lompoc": "City of Lompoc Transit",
        "San Luis Obispo Regional Transit Authority": "South County Transit Link",
        "City of Roseville": "Roseville Transit",
        "Los Angeles County Dept. of Public Works - Athens Shuttle Service": "the Link-Athens",
        "Los Angeles County Department of Public Works - Avocado Heights": "Avocado Heights/Bassett/West Valinda Shuttle",
        "Susanville Indian Rancheria": "Susanville Indian Rancheria Public Transportation Program",
        'Transit Joint Powers Authority for Merced County':'Merced The Bus',
        'City of Eureka':'Eureka Transit Service',
        'Nevada County Transit Services':'Gold Country Stage',
        'San Mateo County Transit District':'SamTrans',
        'Redwood Coast Transit Authority':'Redwood Coast Transit',
        'City of Avalon':'Avalon Transit',
        'City of Lodi':'Grapeline',
        'Golden Gate Bridge':'Golden Gate Bridge Highway and Transportation District',
        'City of Santa Maria':'Santa Maria Area Transit',
        
        
    }
)

In [222]:
# Replace agency names in agency4 to match ntd
m5["agency_att"] = m5["agency_att"].replace(
    {
        "Cloverdale Transit": "Sonoma County Transit",
    }
)

In [223]:
m6 = pd.merge(
    m5,
    ntd_df[['agency','total_buses']],
    how="left",
    left_on="agency_att",
    right_on="agency",
    indicator=True,
)

In [224]:
# Create col to find % of bus fleet that doesn't have coverage
m6['percentage_of_buses_in_low_cell_zones'] = (m6['total_buses']*m6['percentage_of_trips_w_low_cell_service']).fillna(0).astype('int64')

In [225]:
# m6.loc[m6['_merge']=='left_only'][['agency_att']]

# Final

### There are a lot of same route-id and same route-name combos but run by different agencies?

In [226]:
# Cols to shorten dataframe
subset_cols = [
    "route_id_att",
    "route_name_att",
    "agency_att",
    "itp_id_att",
    "percentage_att",
    "percentage_verizon",
    "percentage_tmobile",
    "total_buses",
    'percentage_of_trips_w_low_cell_service',
   'percentage_of_buses',
]

In [227]:
m6[subset_cols].sort_values(['route_id_att','route_name_att','total_buses'], ascending=[True, True, False])

KeyError: "['percentage_of_buses'] not in index"

In [None]:
# Drop duplicates of route_id, keeping the agencey with the most buses?
m7 = (m6
      .sort_values(['route_id_att','route_name_att','total_buses'], ascending=[True, True, False])
      .drop_duplicates(subset=["route_id_att"])
      .reset_index(drop = True)
     )

In [None]:
m6.shape

### Some of the results are surprising...wouldn't expect urban areas to have routes on this list

In [None]:
m7[subset_cols]

In [None]:
tooltip_list = ['route_id_att', 'route_name_att', 'agency_att','percentage_att', 'percentage_verizon', 'percentage_tmobile',
                'total_buses']

In [None]:
# Grab route ID&agency
m7_test  = m6[['route_id_att','agency_att']]


In [None]:
len(m7_test)

In [None]:
m7.explore(
    "route_name_att",
    width=800,
    height=400,
    tooltip=tooltip_list,
    style_kwds={"weight": 6},
    legend = False,
    color =  'tab20c'
)

#### Double check against original routes_df

In [None]:
routes_df2= pd.merge(
  
    routes_df,
      m7_test,
    how="inner",
    right_on=["agency_att", "route_id_att"],
    left_on=["agency", "route_id"]
    
)

In [None]:
routes_df2.explore(
    "route_name",
    width=800,
    height=400,
    tooltip=['route_name', 'agency'],
    style_kwds={"weight": 6},
    legend = False,
    color =  'tab20c'
)