## Analysis

In [1]:
# Turn off warnings: the "ignore" filter below has no category or module
# restriction, so every warning raised after this cell is suppressed.
# NOTE(review): this also hides deprecation notices -- consider narrowing it.
import warnings
warnings.filterwarnings("ignore")

In [2]:
import A1_provider_prep
import A2_other
import A3_analysis
import altair as alt
import geopandas as gpd
import pandas as pd
from shared_utils import geography_utils, utils

In [3]:
# Notebook-wide pandas display settings: show up to 100 columns, render floats
# with two decimals, and do not truncate rows or cell contents.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [4]:
# Read in no coverage maps (one load call per provider; this one is Verizon).
# NOTE(review): `verizon` is not referenced again in the visible cells --
# confirm whether this load is still needed here.
verizon = A1_provider_prep.load_verizon()

In [5]:
# AT&T map; later used as the base layer of the interactive map (`att.explore`).
att = A1_provider_prep.load_att()

In [6]:
# T-Mobile map.
# NOTE(review): `tmobile` is not referenced again in the visible cells -- confirm it is needed.
tmobile = A1_provider_prep.load_tmobile()

In [7]:
# Find routes that run in areas without coverage across all 3 providers:
# first merge the per-provider results, then pass them to the final merge step.
all_providers_merged = A3_analysis.merge_all_providers()
final = A3_analysis.final_merge(all_providers_merged)

In [8]:
# Load the routes with their original geometries. `all_routes` is used below
# for the statewide route count and for mapping the low-coverage routes.
(
    one_dist_routes,
    multi_dist_routes,
    all_routes,
) = A2_other.find_multi_district_routes()

In [9]:
# Bin edges in steps of ten percentage points, from 0 through 100 inclusive.
bins = list(range(0, 101, 10))

In [10]:
# Assign each route to a coverage bin. With pd.cut's defaults the intervals are
# right-closed, e.g. (90, 100], so a value of exactly 0 would fall outside the
# first bin and come back as NaN.
final["Binned"] = pd.cut(
    x=final["Median Percent With Coverage"],
    bins=bins,
)

In [11]:
# Replace the interval values with their string labels, e.g. "(90, 100]".
# NOTE(review): presumably so the bins can be used as plain chart categories -- confirm.
final["Binned"] = final["Binned"].astype(str)

In [12]:
# District-level summary of `final`; feeds the two district charts below.
district_summary = A3_analysis.summarize_districts(final)

In [13]:
# Operator-level summary of `final`; feeds the agency chart below.
operator_summary = A3_analysis.summarize_operators(final)

In [14]:
# Summary of `final` that is charted below by coverage bin and "Gtfs Status".
route_summary = A3_analysis.summarize_routes_gtfs(final)

### Main Findings

In [15]:
# Headline: number of rows in `final` versus the number of rows in `all_routes`.
# The sentence previously read "out the"; it now reads "out of the".
f"There are {len(final)} routes that touch a zone without cellular coverage out of the {len(all_routes)} unique routes in California."

'There are 2820 routes that touch a zone without cellular coverage out of the 2914 unique routes in California.'

In [16]:
# Count the routes in the (90, 100] coverage bin directly from `final` instead of
# using a hard-coded number, so the sentence cannot drift out of sync with the data.
# The previous constant (2565) disagreed with the bin counts shown further down
# (2725 of 2820 routes), which understated the share as 90 percent.
routes_above_90 = int((final["Median Percent With Coverage"] > 90).sum())
f"Although many routes touch a non-cellular coverage area, in actuality, {int(((routes_above_90/len(final))*100))} percent of them still have 90-100% data coverage across their entire length."

'Although many routes touch a non-cellular coverage area, in actuality, 96 percent of them still have 90-100% data coverage across their entire length.'

In [38]:
# Chart of total routes per coverage bin, split by "Gtfs Status", built from
# `route_summary`. The dropdown lists the sorted, distinct bin labels.
# The title used to be a copy of the operator chart's title ("Operator Level"),
# which mislabeled this route/GTFS-status view.
# NOTE(review): the roles of the positional arguments are inferred from the other
# chart_with_dropdown calls in this notebook -- confirm against A3_analysis.
A3_analysis.chart_with_dropdown(
    route_summary,
    route_summary["Binned"].sort_values().unique().tolist(),
    "Binned",
    "Total Routes",
    "Binned",
    "Gtfs Status",
    ["Binned", "Gtfs Status", "Total Routes"],
    "Route Level: Total Routes by Median % Cellular Coverage and GTFS Status (use dropdown menu below to filter)",
)

In [18]:
# Operator-level chart built from `operator_summary`; the dropdown lists the
# sorted, distinct agency names.
agency_choices = operator_summary["Agency"].sort_values().unique().tolist()
operator_tooltip_cols = [
    "Agency",
    "Median Percent of Route with Cell Coverage",
    "Total Routes",
]
A3_analysis.chart_with_dropdown(
    operator_summary,
    agency_choices,
    "Agency",
    "Median Percent of Route with Cell Coverage",
    "Total Routes",
    "Median Percent of Route with Cell Coverage",
    operator_tooltip_cols,
    "Operator Level: Total Routes by Median % Cellular Coverage (use dropdown menu below to filter)",
)

In [19]:
# District chart built from `district_summary`; the dropdown lists the sorted,
# distinct median-coverage values.
coverage_choices = (
    district_summary["Median Percent of Route with Cell Coverage"]
    .sort_values()
    .unique()
    .tolist()
)
district_tooltip_cols = [
    "District",
    "Median Percent of Route with Cell Coverage",
    "Total Routes",
]
A3_analysis.chart_with_dropdown(
    district_summary,
    coverage_choices,
    "Median Percent of Route with Cell Coverage",
    "Total Routes",
    "District Simplified",
    "District Simplified",
    district_tooltip_cols,
    "Total Routes by Median % Cellular Coverage",
)

In [20]:
# District chart built from `district_summary`; the dropdown lists the sorted,
# distinct district labels.
district_choices = district_summary["District"].sort_values().unique().tolist()
district_level_tooltip_cols = [
    "District",
    "Median Percent of Route with Cell Coverage",
    "Total Routes",
]
A3_analysis.chart_with_dropdown(
    district_summary,
    district_choices,
    "District",
    "Median Percent of Route with Cell Coverage",
    "Total Routes",
    "Median Percent of Route with Cell Coverage",
    district_level_tooltip_cols,
    "District Level: Median % Cellular Coverage by Total Routes",
)

### Narrow down low coverage routes
* Because most routes in this dataset have substantial cellular coverage, let's narrow the list down to routes with comparatively low data coverage.
* The <i>Median Percent With Coverage</i> is the median percentage of the route that has cellular coverage across all 3 providers.
    * Likewise, <i>Median Percent No Coverage</i> is the median percentage of the route that has <b>no</b> cellular coverage across all 3 providers.
* Use the bins to see how routes are distributed by the median percentage of the route <b>with</b> coverage.

In [21]:
# Number of routes per coverage bin label, most common bin first.
final["Binned"].value_counts()

(90, 100]    2725
(80, 90]       42
(70, 80]       32
(60, 70]        9
(50, 60]        7
(40, 50]        3
(30, 40]        2
Name: Binned, dtype: int64

In [22]:
# Summary statistics for the share of each route WITHOUT coverage.
final["Median Percent No Coverage"].describe()

count   2820.00
mean       1.01
std        4.93
min        0.00
25%        0.00
50%        0.00
75%        0.00
max       64.00
Name: Median Percent No Coverage, dtype: float64

In [23]:
# Summary statistics for the share of each route WITH coverage.
final["Median Percent With Coverage"].describe()

count   2820.00
mean      98.99
std        4.93
min       36.00
25%      100.00
50%      100.00
75%      100.00
max      100.00
Name: Median Percent With Coverage, dtype: float64

#### Choosing a threshold 
* The threshold is set at 75 after eyeballing the `value_counts` of the <i>Binned</i> column: routes whose <i>Median Percent With Coverage</i> is below 75 are treated as low coverage.

In [24]:
# Cut-off (in percent) for "Median Percent With Coverage"; routes strictly
# below this value are kept as low coverage in the next cell.
threshold = 75

In [25]:
# Keep only the routes whose median coverage is strictly below the threshold,
# and renumber the rows from zero.
is_low_coverage = final["Median Percent With Coverage"] < threshold
low_coverage = final.loc[is_low_coverage].reset_index(drop=True)

In [26]:
# Reports the number of rows in `low_coverage`.
# NOTE(review): the results table further down shows the same "Long Route Name"
# on two rows that differ only in "Gtfs Status", so the number of distinct
# routes may be lower than this row count -- confirm.
f"{len(low_coverage)} routes are considered truly low-data coverage."

'33 routes are considered truly low-data coverage.'

In [27]:
# Sums the per-row bus estimate over all low-coverage rows.
# NOTE(review): if a route appears on more than one row (see the repeated
# Porterville to Springville Loop rows in the results table), its buses are
# counted once per row -- confirm this is intended.
f"Around {low_coverage['Estimate Of Buses In Low Cell Zones'].sum()} buses run low data coverage routes using data from September 14, 2022."

'Around 86 buses run low data coverage routes using data from September 14, 2022.'

In [28]:
# Number of distinct agencies among the low-coverage rows.
f"{low_coverage.Agency.nunique()} agencies appear in this data set."

'19 agencies appear in this data set.'

In [29]:
# Low-coverage rows per district label; multi-district labels such as
# "D-1, D-4" are counted as their own category.
low_coverage.District.value_counts()

D-4          8
D-1          7
D-6          4
D-2          3
D-9          2
D-5          2
D-10         2
D-1, D-4     1
D-1, D-2     1
D-3          1
D-7          1
D-9, D-10    1
Name: District, dtype: int64

In [30]:
# Low-coverage rows per agency, most frequent first.
low_coverage.Agency.value_counts()

Trinity Transit                                           4
Golden Gate Bridge Highway and Transportation District    3
Yosemite Area Regional Transportation System              3
Eastern Sierra Transit Authority                          2
AC Transit                                                2
MUNI                                                      2
Redwood Coast Transit                                     2
Santa Cruz Metropolitan Transit District                  2
Tulare County Regional Transit Agency                     2
Mendocino Transit Authority                               2
Arcata and Mad River Transit System                       1
Blue Lake Rancheria                                       1
Capitol Corridor                                          1
Eureka Transit Service                                    1
Gold Country Stage                                        1
Humboldt Transit Authority                                1
Kern Transit                            

#### Closer look at 'low coverage' routes

In [31]:
# Show the result columns for the low-coverage routes, ordered from the
# largest share without coverage to the smallest.
low_coverage_results = low_coverage[A3_analysis.subset_for_results]
low_coverage_results.sort_values("Median Percent No Coverage", ascending=False)

Unnamed: 0,Agency,Long Route Name,District,Median Percent With Coverage,Median Percent No Coverage,Total Trips By Route,Total Buses,Estimate Of Buses In Low Cell Zones,Gtfs Status
1,Trinity Transit,Route travels through all down-river communities between Weaverville and Willow Creek. Key route to the coast; connects with RTS service to Arcata/Eureka. 181 Trinity Transit,"D-1, D-2",36.0,64.0,4,7.0,1,Static OK/RT Incomplete
24,Trinity Transit,Route between Weaverville and Hayfork. Connects to Redding Line in Douglas City. 178 Trinity Transit,D-2,39.0,61.0,4,7.0,1,Static OK/RT Incomplete
19,Mendocino Transit Authority,The Coaster 230 Mendocino Transit Authority,D-1,45.0,55.0,4,35.0,1,Static OK/RT Incomplete
26,Trinity Transit,Redding Line 288 Trinity Transit,D-2,46.0,54.0,4,7.0,1,Static OK/RT Incomplete
32,Yosemite Area Regional Transportation System,Mammoth Lakes HWY 120E/395 582 Yosemite Area Regional Transportation System,"D-9, D-10",48.0,52.0,2,10.0,1,Static OK/RT Incomplete
29,Tulare County Regional Transit Agency,Porterville to Springville Loop c70 Tulare County Regional Transit Agency,D-6,53.0,47.0,4,18.0,1,Static OK/RT Incomplete
28,Tulare County Regional Transit Agency,Porterville to Springville Loop c70 Tulare County Regional Transit Agency,D-6,53.0,47.0,4,18.0,1,Static Incomplete/RT Incomplete
27,Tulare County Area Transit,Porterville to Springville Loop 12181 Tulare County Area Transit,D-6,53.0,47.0,4,24.0,1,Static OK/RT Incomplete
30,Yosemite Area Regional Transportation System,Sonora HWY 120 1094 Yosemite Area Regional Transportation System,D-10,55.0,45.0,2,10.0,1,Static OK/RT Incomplete
21,Redwood Coast Transit,Route 199 Hiouchi / Gasquet 257 Redwood Coast Transit,D-1,55.0,45.0,6,14.0,1,Static OK/RT OK


In [32]:
# Collect the distinct names of the routes classified as "low coverage".
low_coverage_routes = (
    low_coverage["Long Route Name"]
    .unique()
    .tolist()
)

In [33]:
# Get the original geometry: keep the rows of `all_routes` whose
# "long_route_name" is one of the low-coverage route names.
is_low_coverage_route = all_routes["long_route_name"].isin(low_coverage_routes)
low_coverage_og_geometry = all_routes.loc[is_low_coverage_route].reset_index(drop=True)

In [34]:
# Base interactive map drawn from the AT&T layer.
base_map_options = {
    "tiles": "CartoDB positron",
    "width": 800,
    "height": 500,
}
att_map = att.explore(**base_map_options)

In [35]:
# Draw the low-coverage routes on top of the existing AT&T map, colored by
# route name and without a legend.
att_map = low_coverage_og_geometry.explore(
    column="long_route_name",
    m=att_map,
    legend=False,
)

In [36]:
# att_map