# Exploring Cell Coverage of Routes
* To do later: move data sources to a catalog
* An agency here is defined: as a service and an operator...

In [1]:
# Read in zip files
import fsspec
import geopandas as gpd
import intake
import numpy as np
import pandas as pd

# My utilities
import utilities
from calitp import *

# Display
from IPython.display import HTML, Image, Markdown, display, display_html

# Geometry
from shared_utils import geography_utils, utils



In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/cellular_coverage/"

## Trip Routes 
* Bring in trip routes.

In [4]:
# Just grab some columns w/o geo, in case I want to preview the df
cols_without_geometry = [
    "agency",
    "route_name",
    "itp_id",
    "route_id",
    "route_type",
]

In [5]:
routes_df = gpd.read_parquet(
    "gs://calitp-analytics-data/data-analyses/traffic_ops/ca_transit_routes.parquet"
)

In [6]:
# Find unique routes for bus only
routes_df = utilities.unique_routes(routes_df)

In [7]:
routes_df.dtypes

itp_id             int64
route_id          object
geometry        geometry
route_type        object
route_name        object
agency            object
route_length     float64
dtype: object

In [8]:
# Fill in NA for route names and agency names
routes_df["route_name"] = routes_df["route_name"].replace({"": "None"})

In [9]:
f"""{routes_df.route_id.nunique()} unique route ids, {routes_df.route_name.nunique()} different route names, and {routes_df.route_length.nunique()} different shape ids. The dataframe has {len(routes_df)} rows."""

'1719 unique route ids, 1951 different route names, and 2370 different shape ids. The dataframe has 2934 rows.'

In [10]:
# Route ID is only unique to each agency.
# routes_df[["agency",  "route_name",  "route_id",]].sort_values(['agency',"route_name", 'route_id', ]).drop_duplicates().reset_index(drop = True)

In [11]:
# Create a df for agency-itp id, drop duplicates to get only one record per agency
agency_itp = routes_df[["agency", "itp_id"]].drop_duplicates()

In [12]:
agency_itp.shape

(167, 2)

## FCC Maps
* T-Mobile file is split for each state and I can't find documentation on fcc's website on which zip file corresponds to CA. 
* Emailed FCC for clarification.
* FCC said "there is no way to tell which file corresponds with which state."

In [13]:
# verizon_df = gpd.read_parquet(f"{GCS_FILE_PATH}verizon_ca_only.parquet")

In [14]:
att_df = gpd.read_parquet(f"{GCS_FILE_PATH}att_ca_only.parquet")

In [15]:
# att_df.plot()

In [16]:
# att_df.plot(), verizon_df.plot(),

## Overlay AT&T with Trip Routes

In [17]:
# overlay_verizon = utilities.comparison(routes_df, verizon_df)

In [18]:
overlay_att = utilities.comparison(routes_df, att_df)

In [19]:
# f"Verizon: The length of the overlay dataframe is {len(overlay_verizon)}"

In [20]:
# f"AT&T: The length of the overlay dataframe is {len(overlay_att)}"

In [21]:
# overlay_att.plot("route_length", legend=True), overlay_verizon.plot("route_length")

### Test with bus routes x California Counties first
* Using counties just because it seems like a lot more routes cross counties instead of districts.
* Tested with these agencies that are in a remote area:
* "Tahoe Truckee Area Regional Transportation"
    * https://tahoetruckeetransit.com/interactive-map/
* 'Trinity Transit' 
* 'Lassen Transit Service Agency'

In [22]:
ca_gdf = (
    "https://opendata.arcgis.com/datasets/8713ced9b78a4abb97dc130a691a8695_0.geojson"
)

In [23]:
geojson = gpd.read_file(f"{ca_gdf}").to_crs(epsg=4326)

In [24]:
agency_wanted = "Tahoe Transportation"

In [25]:
# routes_df.agency.unique()

In [26]:
# Filter original df
single_agency = routes_df.loc[routes_df["agency"] == agency_wanted].reset_index(
    drop=True
)

In [27]:
single_agency.shape, single_agency.route_id.nunique()

((4, 7), 4)

In [28]:
single_agency["route_name"] = single_agency["route_name"].replace({"": "None"})

In [29]:
single_agency.drop(columns=["geometry", "route_type"])

Unnamed: 0,itp_id,route_id,route_name,agency,route_length
0,331,12133,South Shore Service & Lake Express Daily,Tahoe Transportation,136537.22
1,331,12134,South Shore Daily,Tahoe Transportation,41787.65
2,331,12135,Valley Express Daily,Tahoe Transportation,105348.23
3,331,12137,Neighborhood Connection,Tahoe Transportation,59893.58


In [30]:
# Explore county

single_agency.explore(
    "route_name",
    width=800,
    height=400,
    tooltip=[
        "route_name",
    ],
    style_kwds={"weight": 5},
)

In [31]:
# Merge with counties for testing
county_test = utilities.comparison(single_agency, geojson)

In [32]:
# More rows
county_test.shape

(4, 17)

In [33]:
# Test - merge. Goal: find % of each route in each county
county_merge = pd.merge(
    county_test,
    single_agency,
    how="left",
    on=["agency", "route_name", "route_type", "route_id"],
    suffixes=["_countytest", "_original"],
)

In [34]:
county_merge["percentage"] = (
    county_merge["route_length_countytest"] / county_merge["route_length_original"]
)

In [35]:
county_merge[
    [
        "route_id",
        "route_name",
        "COUNTY_NAME",
        "percentage",
        "route_length_countytest",
        "route_length_original",
    ]
].sort_values("route_id")

Unnamed: 0,route_id,route_name,COUNTY_NAME,percentage,route_length_countytest,route_length_original
0,12133,South Shore Service & Lake Express Daily,El Dorado,0.01,845.63,136537.22
1,12134,South Shore Daily,El Dorado,0.97,40430.68,41787.65
2,12135,Valley Express Daily,El Dorado,0.01,845.97,105348.23
3,12137,Neighborhood Connection,El Dorado,0.89,53439.62,59893.58


In [36]:
# Sum up percentages by route name -> but shouldn't they all add up to 1.00?
county_merge.groupby(["route_id", "route_name"]).agg({"percentage": "sum"}).sort_values(
    "percentage"
)

Unnamed: 0_level_0,Unnamed: 1_level_0,percentage
route_id,route_name,Unnamed: 2_level_1
12133,South Shore Service & Lake Express Daily,0.01
12135,Valley Express Daily,0.01
12137,Neighborhood Connection,0.89
12134,South Shore Daily,0.97


### Test single route with AT&T

In [37]:
# Filter overlay AT&T
single_agency_overlay = overlay_att.loc[
    overlay_att["agency"] == agency_wanted
].reset_index(drop=True)

In [38]:
f"""route id after overlay w/ AT&T: {single_agency_overlay.route_id.nunique()}, length after overlay w/ AT&T: {len(single_agency_overlay)}, 
original df length: {len(single_agency)}"""

'route id after overlay w/ AT&T: 4, length after overlay w/ AT&T: 4, \noriginal df length: 4'

In [39]:
# Route ids become split up?
single_agency_overlay.drop(columns=["geometry"]).sort_values("route_id")

Unnamed: 0,itp_id,route_id,route_type,route_name,agency,route_length
0,331,12133,3,South Shore Service & Lake Express Daily,Tahoe Transportation,845.63
1,331,12134,3,South Shore Daily,Tahoe Transportation,40430.68
2,331,12135,3,Valley Express Daily,Tahoe Transportation,845.97
3,331,12137,3,Neighborhood Connection,Tahoe Transportation,53439.62


## Compare One Route: AT&T Overlay with original df

In [40]:
single_route_overlay = single_agency_overlay.loc[single_agency_overlay['route_name'] =='Mainline AM/PM']

In [41]:
# Mainline AM/PM goes from 1 line to 2.
single_route_overlay.drop(columns = ['geometry'])

Unnamed: 0,itp_id,route_id,route_type,route_name,agency,route_length


In [42]:
# Route length after overlaying
single_route_overlay.groupby(['route_name']).agg({'route_length':'sum'})

Unnamed: 0_level_0,route_length
route_name,Unnamed: 1_level_1


In [43]:
# There are 2 rows: coverage isn't complete, there are gaps of coverage.
single_route_overlay.plot('route_length', figsize = (4,4), lw = 5)



<AxesSubplot:>

ValueError: cannot convert float NaN to integer

<Figure size 400x400 with 1 Axes>

In [44]:
# Original length for Mainline AM/PM
single_agency.loc[single_agency['route_name'] =='Mainline AM/PM'][['route_length']]

Unnamed: 0,route_length


In [45]:
91143.89/151470.13

0.6017284728018653

## Single Agency

In [46]:
# Sum up routes...
single_agency_overlay2 = (
    single_agency_overlay.groupby(["route_id", "route_name", "agency", "itp_id"])
    .agg({"route_length": "sum"})
    .reset_index()
)

In [47]:
# Play with ascending

single_agency_overlay2 = (
    single_agency_overlay.sort_values(["route_id", "route_length"], ascending=False)
    .drop_duplicates("route_id")
    .reset_index(drop=True)
)

In [48]:
single_agency_overlay2.shape, single_agency_overlay2.route_name.nunique()

((4, 7), 4)

In [49]:
# Merge to get original route length and old route length
att_merge = pd.merge(
    single_agency_overlay2,
    single_agency,
    how="inner",
    on=["agency", "route_id", "route_name", "itp_id"],
    suffixes=["_overlay", "_original"],
)

In [50]:
att_merge.route_id.nunique()

4

In [51]:
att_merge["percentage"] = (
    att_merge["route_length_overlay"] / att_merge["route_length_original"]
) * 100

In [52]:
# Bin percentages
bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

In [53]:
att_merge["binned"] = pd.cut(att_merge["percentage"], bins)

In [54]:
att_merge.binned.value_counts()

(0, 10]      2
(80, 90]     1
(90, 100]    1
(10, 20]     0
(20, 30]     0
(30, 40]     0
(40, 50]     0
(50, 60]     0
(60, 70]     0
(70, 80]     0
Name: binned, dtype: int64

In [55]:
att_merge.drop(columns=["geometry"])

KeyError: "['geometry'] not found in axis"