### Analysis V2


In [1]:
import dask.dataframe as dd
import dask_geopandas as dg
import geopandas as gpd
import numpy as np
import pandas as pd
import shapely.wkt
import A1_provider_prep
import A2_analysis
import A3_other
from calitp.sql import to_snakecase
from shared_utils import geography_utils, utils



In [2]:
# Notebook-wide pandas display settings: show up to 100 columns, render floats
# with two decimals, and lift the row-count and column-width truncation limits.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [41]:
geography_utils.WGS84

'EPSG:4326'

In [42]:
geography_utils.CA_StatePlane

'EPSG:2229'

In [50]:
# Simplify provider maps
def simplify_geometry(
    provider: "gpd.GeoDataFrame",
    tolerance: float = 15,
    projected_crs=2229,
    output_crs=4326,
) -> "gpd.GeoDataFrame":
    """Return a copy of a provider map with simplified geometries.

    The frame is reprojected to ``projected_crs``, its geometries are
    simplified with ``tolerance``, and the result is reprojected to
    ``output_crs``. ``to_crs`` returns a new frame, so the caller's
    ``provider`` is left untouched.

    Parameters
    ----------
    provider : gpd.GeoDataFrame
        Provider coverage map to simplify.
    tolerance : float, default 15
        Simplification tolerance, expressed in the units of ``projected_crs``.
        EPSG:2229 is defined in US survey feet, so the default is 15 ft.
    projected_crs : default 2229
        CRS used while simplifying. 2229 is the EPSG code that
        ``geography_utils.CA_StatePlane`` evaluates to in this notebook.
    output_crs : default 4326
        CRS of the returned frame. 4326 is the EPSG code that
        ``geography_utils.WGS84`` evaluates to in this notebook.

    Returns
    -------
    gpd.GeoDataFrame
        Simplified provider map in ``output_crs``.

    Notes
    -----
    NOTE(review): in this notebook's saved output,
    ``att_simple.geometry.is_valid`` is False for every row after this
    function runs. Whether simplification caused that or the input was
    already invalid is not established here; check validity before using the
    result in an overlay.
    """
    # Simplify in a projected CRS so the tolerance is a linear distance
    # rather than a fraction of a degree.
    projected = provider.to_crs(projected_crs)
    projected.geometry = projected.geometry.simplify(tolerance=tolerance)

    return projected.to_crs(output_crs)

### Load Provider Maps

In [5]:
# Load the AT&T and T-Mobile provider maps through the A1_provider_prep loaders.
# The `_og` suffix marks these as the originals; att_og is passed to
# simplify_geometry in a later cell to produce att_simple.
att_og = A1_provider_prep.load_att()
tmobile_og = A1_provider_prep.load_tmobile()

In [51]:
# Simple Version
att_simple = simplify_geometry(att_og)

In [54]:
# Row-by-row validity of the simplified AT&T geometries. In the saved output
# every row is False, and all three rows carry the same index label (0).
att_simple.geometry.is_valid

0    False
0    False
0    False
dtype: bool

In [56]:
att_simple.geometry.is_valid.all()

False

In [55]:
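# Validity check per simplified gdf. Not all are valid: the saved output below shows False for the middle entry (presumably att_simple; confirm).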
# verizon_simple.geometry.is_valid.all(), , tmobile_simple.geometry.is_valid.all(),

(True, False, True)

In [7]:
# Load the unique transit routes that the provider maps are compared against
# (passed to A2_analysis.comparison in a later cell).
unique_routes = A3_other.load_unique_routes_df()

In [8]:
def split_routes(unique_routes: pd.DataFrame = None, n_splits: int = 4):
    """Split the unique-routes frame into consecutive, near-equal chunks.

    Chunk sizes follow the same rule as ``np.array_split``: with ``n`` rows,
    the first ``n % n_splits`` chunks receive one extra row. Rows are sliced
    positionally with ``.iloc``, so the frame type and index are preserved.
    ``np.array_split`` itself is avoided because, on a DataFrame, it goes
    through ``DataFrame.swapaxes``, which recent pandas releases deprecate.

    Parameters
    ----------
    unique_routes : pd.DataFrame, optional
        Frame to split. When omitted, it is loaded with
        ``A3_other.load_unique_routes_df()``, matching the original behavior.
    n_splits : int, default 4
        Number of chunks to return.

    Returns
    -------
    tuple
        ``n_splits`` frames in row order. With the defaults this is the
        4-tuple ``(df1, df2, df3, df4)``.

    Raises
    ------
    ValueError
        If ``n_splits`` is smaller than 1.
    """
    if n_splits < 1:
        raise ValueError("n_splits must be at least 1")

    if unique_routes is None:
        unique_routes = A3_other.load_unique_routes_df()

    base_size, remainder = divmod(len(unique_routes), n_splits)

    chunks = []
    start = 0
    for position in range(n_splits):
        # The first `remainder` chunks absorb the rows left over after an
        # even division.
        stop = start + base_size + (1 if position < remainder else 0)
        chunks.append(unique_routes.iloc[start:stop])
        start = stop

    return tuple(chunks)

In [9]:
# Unpack the four route subsets. NOTE(review): r1-r4 are not referenced again in
# the visible part of this notebook.
r1,r2,r3,r4 = split_routes()

### Notes
* Verizon_simple does not work with `comparison`. Around 500 routes are passed when using `overlay_single_routes`.
* Verizon_og does not work either. It returns the error: `TopologyException: side location conflict at -122.30299999995003 37.938999999750251. This can occur if the input geometry is invalid.`
* Both gdfs return `True` when `.geometry.is_valid.all()` is applied to them.

In [10]:
# TopologyException: side location conflict at -122.30299999995003 37.938999999750251. This can occur if the input geometry is invalid.
# verizon_o = utilities.comparison(verizon_og, unique_routes, "verizon_simplified")

In [11]:
# verizon_o = utilities.comparison(verizon_og, unique_routes, "verizon")

### Function Ideas
* Other Function should:
    * Drop Duplicates
    * Sum up new route length by long route name
    * Find max of original route length
    * Divide new route length by original to get percentage covered. 
    * Delete irrelevant columns
    * Does it have to be a gdf at this point or can it just be a normal dataframe?
* If a route has a low percentage of intersection with a provider map, is that a good sign?
* Maps only contain areas *without* coverage. So if a route only intersects a little, does that mean the route mostly has coverage?

In [12]:
# Compare the simplified AT&T map against the unique routes (presumably an
# overlay, going by the variable name).
# NOTE(review): the saved display of att_overlay a few cells below shows column
# headers but no rows, and att_simple reported invalid geometries in an earlier
# cell, so this result should not be trusted until geometry validity is resolved.
att_overlay = A2_analysis.comparison(att_simple, unique_routes)

In [13]:
# NOTE(review): in the saved run this cell ends with
# `ValueError: cannot convert float NaN to integer` while rendering the figure,
# so the plot cannot be relied on in its current state.
att_simple.plot()



<AxesSubplot:>

ValueError: cannot convert float NaN to integer

<Figure size 640x480 with 1 Axes>

In [14]:
att_overlay

Unnamed: 0,itp_id,route_id,route_name,agency,original_route_length,long_route_name,geometry,route_length


In [15]:
# Read the T-Mobile overlay result from a parquet file under
# A1_provider_prep.GCS_FILE_PATH rather than computing it in this notebook
# (the file name suggests it was built from the simplified T-Mobile map; confirm).
tmobile_overlay = gpd.read_parquet(
    f"{A1_provider_prep.GCS_FILE_PATH}tmobile_simplified_overlaid_with_unique_routes.parquet"
)

In [16]:
# Aggregate the T-Mobile overlay with A2_analysis.dissolve_summarize. The check
# in a following cell shows one row per long_route_name (1678 rows, 1678 unique).
tmobile_agg = A2_analysis.dissolve_summarize(tmobile_overlay)

In [17]:
# att_agg[att_agg.agency == 'AC Transit'][['agency','long_route_name','percentage_route_covered']]

In [18]:
len(tmobile_agg), tmobile_agg.long_route_name.nunique()

(1678, 1678)

In [19]:
# Keep only the aggregated rows whose agency is 'AC Transit'.
ac_transit_only = tmobile_agg[tmobile_agg.agency == 'AC Transit']

In [20]:
"""
ac_transit_only.explore('long_route_name',
               width=800,
    height=400,
    style_kwds={"weight": 6},
    legend=False,
    color="tab10c",)
"""

'\nac_transit_only.explore(\'long_route_name\',\n               width=800,\n    height=400,\n    style_kwds={"weight": 6},\n    legend=False,\n    color="tab10c",)\n'