## Long Range Transportation Plan

In [1]:
import os

# geopandas reads USE_PYGEOS when it is first imported, so this has to be set
# before any import that pulls geopandas in.
os.environ['USE_PYGEOS'] = '0'

import _harmonization_utils as harmonization_utils
import geopandas
import geopandas as gpd
import numpy as np
import pandas as pd
from calitp_data_analysis.sql import to_snakecase
from shared_utils import geography_utils, utils

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


In [2]:
# Notebook-wide display settings: up to 100 columns, two-decimal floats,
# and no truncation of rows or of long cell text.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

### SANDAG
* https://www.arcgis.com/home/item.html?id=ba8b2a6247ef463cb7411f039959f3ee
* https://www.arcgis.com/apps/mapviewer/index.html?webmap=ba8b2a6247ef463cb7411f039959f3ee

In [3]:
# Module-level list of harmonized column names.
# NOTE(review): harmonizing_lrtp() defines its own local `columns_to_keep`
# (with "phase" and "geometry" instead of "location"), so this list is not the
# one applied there -- confirm whether it is still needed.
columns_to_keep = [
    "project_title",
    "lead_agency",
    "project_year",
    "project_category",
    "grant_program",
    "project_description",
    "total_project_cost",
    "location",
    "city",
    "county",
    "data_source",
    "notes",
]

In [4]:
def create_notes(df, note_cols: list):
    """
    Collapse several columns into a single free-text "notes" column.

    Each row's note looks like "label: value, label: value", where the label
    is the column name with underscores turned into spaces and the value is
    the cell cast to a string.

    Only the labels are humanized. Values are left exactly as they are, so
    underscores inside URLs or layer names survive, and no temporary helper
    columns are left behind on the dataframe.

    Args:
        df: dataframe holding every column named in note_cols.
        note_cols: columns to fold into the note, in output order.

    Returns:
        The same dataframe with a "notes" column added (or overwritten).
    """
    if not note_cols:
        df["notes"] = ""
        return df

    labels = [column.replace("_", " ") for column in note_cols]
    values_as_text = [df[column].astype(str).tolist() for column in note_cols]

    # Assigned positionally, so a non-unique index on df is not a problem.
    df["notes"] = [
        ", ".join(f"{label}: {value}" for label, value in zip(labels, row))
        for row in zip(*values_as_text)
    ]

    return df

In [5]:
def harmonizing_lrtp(
    df,
    project_name_col: str,
    project_description_col: str,
    project_category_col: str,
    project_cost_col: str,
    location_col: str,
    county_col: str,
    city_col: str,
    project_year_col: str,
    phase_col: str,
    program: str,
    data_source: str,
    note_cols: list,
    cost_in_millions: bool = True,
):
    """
    Rename and reshape one agency's project list into the shared schema.

    Args:
        df: source dataframe.
        project_name_col: column renamed to "project_title".
        project_description_col: column renamed to "project_description".
        project_category_col: column renamed to "project_category".
        project_cost_col: column renamed to "total_project_cost".
        location_col: column renamed to "geometry".
        county_col: column renamed to "county".
        city_col: column renamed to "city".
        project_year_col: column renamed to "project_year".
        phase_col: column renamed to "phase".
        program: value written to "lead_agency" and used as the first half
            of "grant_program".
        data_source: value written to "data_source" and used as the second
            half of "grant_program".
        note_cols: columns (named as they are AFTER renaming) that
            create_notes folds into the "notes" column.
        cost_in_millions: when True, every cost/funds column is divided by
            1,000,000. Pass False when the source is already in millions.

    Pass "" for any *_col argument the source does not have; the optional
    columns (county, city, project_year, project_category, phase) are then
    created and filled with the string "None".

    Returns:
        A dataframe restricted to the harmonized columns.
    """
    rename_columns = {
        project_name_col: "project_title",
        phase_col: "phase",
        project_description_col: "project_description",
        project_category_col: "project_category",
        project_cost_col: "total_project_cost",
        location_col: "geometry",
        county_col: "county",
        city_col: "city",
        project_year_col: "project_year",
    }
    # "" is the placeholder for "no such column". Drop it so several
    # placeholders do not silently collide on the same dictionary key.
    rename_columns = {
        source: target for source, target in rename_columns.items() if source
    }
    df = df.rename(columns=rename_columns)

    # Coerce every cost/funds column to numeric.
    cost_columns = df.columns[
        df.columns.str.contains("cost|funds", na=False)
    ].tolist()
    for column in cost_columns:
        values = df[column]
        if not pd.api.types.is_numeric_dtype(values):
            # Cast to str first so numeric entries sitting in an object column
            # are kept instead of being turned into NaN by the .str accessor.
            values = (
                values.astype(str)
                .str.replace("$", "", regex=False)
                .str.replace(",", "", regex=False)
            )
        df[column] = pd.to_numeric(values, errors="coerce").fillna(0)

    # Add MPO & grant program
    df["lead_agency"] = program
    df["grant_program"] = f"{program} {data_source}"

    # Add data source
    df["data_source"] = data_source

    # Convert raw amounts to millions when requested.
    if cost_in_millions:
        for column in cost_columns:
            df[column] = df[column].divide(1_000_000)

    # Create columns even if they don't exist, just to harmonize
    # before concatting.
    create_columns = [
        "county",
        "city",
        "notes",
        "project_year",
        "project_category",
        "phase",
    ]
    for column in create_columns:
        if column not in df:
            df[column] = "None"

    # Create notes (overwrites the placeholder "notes" column above).
    df = create_notes(df, note_cols)

    columns_to_keep = [
        "project_title",
        "lead_agency",
        "project_year",
        "project_category",
        "grant_program",
        "project_description",
        "phase",
        "total_project_cost",
        "geometry",
        "city",
        "county",
        "data_source",
        "notes",
    ]

    return df[columns_to_keep]

In [6]:
def open_rest_server(url_pt_1: str, url_pt_2: str, layer_name: list):
    """
    Read several layers from a REST feature service and stack them.

    For every entry in layer_name the URL f"{url_pt_1}{layer}{url_pt_2}" is
    read with geopandas, its columns are passed through to_snakecase, and a
    "layer_name" column records which layer each row came from.

    Args:
        url_pt_1: part of the URL that precedes the layer name.
        url_pt_2: part of the URL that follows the layer name.
        layer_name: layer names to download.

    Returns:
        All layers concatenated row-wise (original per-layer indexes are
        kept), or an empty DataFrame when layer_name is empty.
    """
    layers = []
    for layer in layer_name:
        gdf = to_snakecase(gpd.read_file(f"{url_pt_1}{layer}{url_pt_2}"))
        gdf["layer_name"] = layer
        layers.append(gdf)

    if not layers:
        return pd.DataFrame()

    # Concatenate once instead of re-copying the accumulated frame per layer.
    return pd.concat(layers, axis=0)

In [7]:
def sandag_lrtp():
    """
    Download the SANDAG feature layers listed below and return them stacked
    in one dataframe (see open_rest_server for the per-layer handling).
    """
    service_root = (
        "https://services1.arcgis.com/HG80xaIVT1z1OdO5/ArcGIS/rest/services/"
    )
    query_suffix = "/FeatureServer/0/query?where=1%3D1&objectIds=&time=&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&resultType=none&distance=0.0&units=esriSRUnit_Meter&relationParam=&returnGeodetic=false&outFields=*+&returnGeometry=true&featureEncoding=esriDefault&multipatchOption=xyFootprint&maxAllowableOffset=&geometryPrecision=&outSR=&defaultSR=&datumTransformation=&applyVCSProjection=false&returnIdsOnly=false&returnUniqueIdsOnly=false&returnCountOnly=false&returnExtentOnly=false&returnQueryGeometry=false&returnDistinctValues=false&cacheHint=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&having=&resultOffset=&resultRecordCount=&returnZ=false&returnM=false&returnExceededLimitFeatures=true&quantizationParameters=&sqlFormat=none&f=pgeojson&token="

    layers_to_fetch = [
        "Complete_Corridors_Connectors_Intersections_Ramps_RP2021",
        "Complete_Corridors_Highway_RP2021",
        "Transit_Leap_RP2021",
        "Adopted_Regional_Bike_Network_RP2021",
        "Mobility_Hubs_and_Flexible_Fleets",
        "Complete_Corridors_Regional_Arterials",
        "Goods_Movement",
    ]

    return open_rest_server(service_root, query_suffix, layers_to_fetch)

In [8]:
sandag = sandag_lrtp()

In [9]:
sandag.columns

Index(['objectid', 'type', 'freeway', 'intersecti', 'caption', 'project_id',
       'phase', 'direction', 'project_id_1', 'year_built', 'category',
       'project_name', 'description', 'cost2020m', 'aqc_2016_1', 'aqc_2020_1',
       'pricmcp', 'conncmcp', 'status', 'categoría', 'corredor',
       'nombre_del_proyecto', 'descripción', 'corredores_de_conexión',
       'geometry', 'layer_name', 'corridor_i', 'type_1', 'existing', 'limits',
       'description_1', 'shape__length', 'route', 'routetype', 'route_desc',
       'rp_2021_id', 'rp_2021_id_1', 'primary_corridor',
       'connecting_corridor_s_', 'capital_cost___2020__millions', 'name',
       'mohubtype', 'mohubsize', 'transitservices', 'globalid', 'icon_url',
       'shape__area', 'mohubtype_sp', 'mohubsize_sp', 'area', 'project',
       'connecting_corridor', 'cost', 'improvementtype'],
      dtype='object')

In [10]:
sandag.project_name.value_counts().head(15)

N/A                                                            22
I-15 (I-8)                                                      4
I-15 (SR 52)                                                    4
I-805 (I-8)                                                     4
Commuter Rail 398                                               3
Central Coast Corridor                                          3
Camp Pendleton Trail                                            2
Central Avenue Bikeway                                          2
LRT 399                                                         2
San Diego River Bikeway Connections                             2
Commuter Rail 582                                               2
Border to Bayshore Bikeway                                      2
San Diego River Trail - Mast Park to Lakeside baseball park     2
Bay to Ranch Bikeway                                            2
I-5 (SR 78)                                                     2
Name: proj

In [11]:
len(sandag)

416

In [12]:
sandag.status.value_counts()

Proposed    289
Existing     55
NB           26
Name: status, dtype: int64

In [13]:
# sandag[sandag.project_name == "Commuter Rail 398"].drop(columns = ['geometry','caption'])

In [14]:
# sandag[sandag.project_name == "Bay to Ranch Bikeway"].drop(columns = ['geometry','caption'])

In [15]:
# sandag[sandag.project_name == "Central Avenue Bikeway"].drop(columns = ['geometry','caption'])

In [16]:
# sandag[sandag.project_name == "Central Avenue Bikeway"].explore('description')

In [17]:
# sandag[sandag.project_name == "Bay to Ranch Bikeway"].explore('description')

In [18]:
# sandag[sandag.project_name == "I-15 (I-8)"].drop(columns = ['geometry','caption'])

In [19]:
def harmonize_sandag():
    """
    Fetch the SANDAG layers and map them onto the shared project schema.

    The source "phase" column feeds project_year and "status" feeds phase.
    cost2020m is passed through without rescaling (cost_in_millions=False).
    """
    # Source columns that are folded into the free-text notes field.
    sandag_note_cols = [
        "aqc_2016_1",
        "aqc_2020_1",
        "pricmcp",
        "conncmcp",
        "layer_name",
        "corridor_i",
        "type_1",
        "existing",
        "limits",
        "description_1",
        "route",
        "routetype",
        "route_desc",
        "rp_2021_id",
        "rp_2021_id_1",
        "capital_cost___2020__millions",
    ]

    return harmonizing_lrtp(
        sandag_lrtp(),
        project_name_col="project_name",
        project_description_col="description",
        project_category_col="category",
        project_cost_col="cost2020m",
        location_col="geometry",
        county_col="",
        city_col="",
        project_year_col="phase",
        phase_col="status",
        program="SANDAG",
        data_source="LRTP",
        note_cols=sandag_note_cols,
        cost_in_millions=False,
    )

In [20]:
test = harmonize_sandag()



In [21]:
# test.drop(columns = ['geometry']).head()

### MTC
* The MTC project list has 133 projects, but the point and line map layers together return roughly 240 rows.
* Projects in different locations can share the same name and differ only by opening year.
* The geospatial layers appear to split each project into several phases, while the tabular project list holds one row per project.
* https://www.planbayarea.org/2050-plan/final-plan-bay-area-2050/final-supplemental-reports/interactive-transportation-project-list
* https://opendata.mtc.ca.gov/datasets/MTC::2023-transportation-improvement-program-projects-point/about
* https://opendata.mtc.ca.gov/datasets/MTC::plan-bay-area-2050-transportation-projects-line/about
* https://data.bayareametro.gov/Projects/Plan-Bay-Area-2050-Transportation-Project-List/y7ka-jksz

In [22]:
def mtc_lrtp():
    """
    Build the MTC project table from the map layers plus the project list.

    Steps:
      1. Read the line and point layers from the MTC feature service.
      2. Read the tabular project list and inner-join it to the layers on
         project title.
      3. Append the projects from the list that have no match in the map
         layers, so they are kept (without geometry).

    Returns:
        A GeoDataFrame with "geometry" as the active geometry column and the
        CRS set to geography_utils.WGS84.
    """
    # Open rest server data
    layer_list = ["planbayarea2050_rtp_line", "planbayarea2050_rtp_point"]
    url_pt_1 = "https://services3.arcgis.com/i2dkYWmb4wHvYPda/ArcGIS/rest/services/"
    url_pt_2 = "/FeatureServer/0/query?where=1%3D1&objectIds=&time=&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&resultType=none&distance=0.0&units=esriSRUnit_Meter&relationParam=&returnGeodetic=false&outFields=*+&returnGeometry=true&featureEncoding=esriDefault&multipatchOption=xyFootprint&maxAllowableOffset=&geometryPrecision=&outSR=&defaultSR=&datumTransformation=&applyVCSProjection=false&returnIdsOnly=false&returnUniqueIdsOnly=false&returnCountOnly=false&returnExtentOnly=false&returnQueryGeometry=false&returnDistinctValues=false&cacheHint=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&having=&resultOffset=&resultRecordCount=&returnZ=false&returnM=false&returnExceededLimitFeatures=true&quantizationParameters=&sqlFormat=none&f=pgeojson&token="
    gdf = open_rest_server(url_pt_1, url_pt_2, layer_list)

    # Open all MTC projects. Not all projects are mappable.
    all_projects_url = "https://data.bayareametro.gov/resource/y7ka-jksz.json?$query=SELECT%0A%20%20%60plan_strategy%60%2C%0A%20%20%60rtpid%60%2C%0A%20%20%60title%60%2C%0A%20%20%60scope%60%2C%0A%20%20%60open_period%60%2C%0A%20%20%60funding_millions_yoe%60%2C%0A%20%20%60county%60"
    all_projects = pd.read_json(all_projects_url)

    # Attach the project-list attributes to every mapped row.
    gdf = pd.merge(
        gdf, all_projects, how="inner", left_on=["proj_title"], right_on=["title"]
    ).drop(columns=["title"])

    # Projects from the list that matched nothing in the map layers.
    unmapped_projects = (
        all_projects[~all_projects.title.isin(gdf.proj_title)]
        .reset_index(drop=True)
        .rename(columns={"title": "proj_title"})
    )
    final = pd.concat([unmapped_projects, gdf])

    # The truncated source column "mode_detai" is deliberately left as-is:
    # harmonize_mtc() reads it under that name. (The previous rename here
    # targeted the index with the key "mode detai" and never changed anything.)

    # Set geometry again
    final = final.set_geometry("geometry").set_crs(geography_utils.WGS84)
    return final

In [23]:
mtc  = mtc_lrtp()



In [24]:
mtc.columns

Index(['plan_strategy', 'rtpid', 'proj_title', 'scope', 'open_period',
       'funding_millions_yoe', 'county', 'objectid', 'map_id', 'subcompone',
       'open_year', 'strategy', 'rtp_id', 'mode_', 'mode_detai', 'investment',
       'shape__length', 'geometry', 'layer_name'],
      dtype='object')

In [25]:
mtc.proj_title.value_counts().head(15)

Minor Roadway Improvements | Regional                                                      36
Express Lanes | Regional                                                                   32
Other Investments to Expand & Modernize the Regional Rail Network | Regional               15
Bay Area Forward Program | Regional                                                         8
Rapid Bus | AC Transit | Modernization                                                      8
Other Investments to Enhance Local Transit Frequency, Capacity & Reliability | Regional     7
Corridor & Interchange Improvements | US-101 | Santa Clara County                           7
Corridor & Interchange Improvements | I-280 | Santa Clara County                            6
Corridor & Interchange Improvements | SR-237 | Santa Clara County                           5
Corridor & Interchange Improvements | US-101 | Sonoma County                                5
Corridor & Interchange Improvements | SR-29 | Napa County   

In [26]:
# mtc[mtc.proj_title == "Congestion Pricing | Treasure Island"].drop(columns = ['geometry','scope'])

In [27]:
# mtc[mtc.proj_title == "Minor Roadway Improvements | Regional"].drop(columns = ['geometry']).head()

In [28]:
def harmonize_mtc():
    """
    Fetch the MTC projects and map them onto the shared project schema.

    funding_millions_yoe is used as the project cost without rescaling
    (cost_in_millions=False); no city or phase column is supplied.
    """
    # Source columns that are folded into the free-text notes field.
    mtc_note_cols = ["strategy", "layer_name", "mode_detai"]

    return harmonizing_lrtp(
        mtc_lrtp(),
        project_name_col="proj_title",
        project_description_col="subcompone",
        project_category_col="plan_strategy",
        project_cost_col="funding_millions_yoe",
        location_col="geometry",
        county_col="county",
        city_col="",
        project_year_col="open_year",
        phase_col="",
        program="MTC",
        data_source="LRTP",
        note_cols=mtc_note_cols,
        cost_in_millions=False,
    )

In [29]:
mtc_test = harmonize_mtc()



In [30]:
# mtc_test.project_description.value_counts()

In [31]:
# mtc_test.drop(columns = ['geometry']).head()

### Tahoe
* Open data portal: https://www.tahoeopendata.org/search?tags=transportation
* https://www.tahoeopendata.org/datasets/TRPA::2020-regional-transportation-plan-projects/about

In [32]:
def tmpo_lrtp(completed_after: int = 2022):
    """
    Download the Tahoe projects and keep the ones still to be completed.

    Args:
        completed_after: only projects whose complete_year is strictly
            greater than this year are kept (default 2022).

    Returns:
        The filtered GeoDataFrame with a "project_year" column formatted as
        "<start_year>-<complete_year>".
    """
    tahoe_url = "https://maps.trpa.org/server/rest/services/Datadownloader_Transportation/MapServer/19/query?where=1%3D1&text=&objectIds=&time=&timeRelation=esriTimeRelationOverlaps&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&distance=&units=esriSRUnit_Foot&relationParam=&outFields=*+&returnGeometry=true&returnTrueCurves=false&maxAllowableOffset=&geometryPrecision=&outSR=&havingClause=&returnIdsOnly=false&returnCountOnly=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&returnZ=false&returnM=false&gdbVersion=&historicMoment=&returnDistinctValues=false&resultOffset=&resultRecordCount=&returnExtentOnly=false&sqlFormat=none&datumTransformation=&parameterValues=&rangeValues=&quantizationParameters=&featureEncoding=esriDefault&f=geojson"
    gdf = to_snakecase(gpd.read_file(tahoe_url))
    gdf["complete_year"] = gdf["complete_year"].astype(int)

    # Drop projects that were already completed by the cutoff year.
    upcoming = gdf[gdf.complete_year > completed_after].reset_index(drop=True)

    # Both halves must come from the filtered frame: after reset_index its row
    # labels no longer line up with the unfiltered gdf.
    upcoming["project_year"] = (
        upcoming.start_year.astype(str) + "-" + upcoming.complete_year.astype(str)
    )
    return upcoming

In [33]:
tahoe_original = tmpo_lrtp()

In [34]:
len(tahoe_original)

75

In [35]:
tahoe_original.project_name.value_counts().head()

South Lake Tahoe Airport Improvement Project                               2
Tallac Historic Site, Valhalla, and the Visitor Center Improvements        1
Upper Truckee River Class 1 Trail Widening - Tahoe City to Squaw Valley    1
TTD Transit Operations - Phase 2035                                        1
Private Transit Operations - Phase 2025                                    1
Name: project_name, dtype: int64

In [36]:
# tahoe_original[tahoe_original.project_name == "South Lake Tahoe Airport Improvement Project"].explore("category")

In [37]:
# tahoe_original[tahoe_original.project_name == "South Lake Tahoe Airport Improvement Project"].drop(columns = ['geometry','description'])

In [38]:
def harmonize_tahoe():
    """
    Fetch the Tahoe (TMPO) projects and map them onto the shared schema.

    estimated_cost is divided by 1,000,000 (cost_in_millions=True); no county
    or city column is supplied.
    """
    # Source columns that are folded into the free-text notes field.
    tahoe_note_cols = [
        "implementer",
        "funding_type",
        "plan_name",
        "label",
        "financial_status",
        "url",
    ]

    return harmonizing_lrtp(
        tmpo_lrtp(),
        project_name_col="project_name",
        project_description_col="description",
        project_category_col="category",
        project_cost_col="estimated_cost",
        location_col="geometry",
        county_col="",
        city_col="",
        project_year_col="project_year",
        phase_col="phase",
        program="TMPO",
        data_source="LRTP",
        note_cols=tahoe_note_cols,
        cost_in_millions=True,
    )

In [39]:
tahoe_test = harmonize_tahoe()



In [40]:
# tahoe_test.columns

### Concat all 3

In [41]:
def all_mpo():
    """
    Harmonize the Tahoe, MTC and SANDAG project lists and stack them.

    Returns:
        One dataframe in the shared schema, ordered Tahoe, MTC, SANDAG, with
        a fresh 0..n-1 index so that row labels are unique across agencies.
    """
    tahoe = harmonize_tahoe()
    sandag = harmonize_sandag()
    mtc = harmonize_mtc()

    # ignore_index renumbers the combined rows; resetting each frame first
    # and concatenating without it would still leave duplicate labels.
    df = pd.concat([tahoe, mtc, sandag], ignore_index=True)

    return df

In [42]:
all_mpo_df = all_mpo()



In [43]:
all_mpo_df.groupby(["grant_program"]).size()

grant_program
MTC LRTP       282
SANDAG LRTP    416
TMPO LRTP       75
dtype: int64

In [44]:
all_mpo_df.groupby(['grant_program']).agg({'project_title':'nunique'})

Unnamed: 0_level_0,project_title
grant_program,Unnamed: 1_level_1
MTC LRTP,133
SANDAG LRTP,309
TMPO LRTP,74


In [45]:
# Lower-case fragments used to flag congestion-pricing projects. They are
# joined with "|" into one regular expression by find_keywords(), so order
# decides which fragment is reported when several could match at one position.
congest_keywords = [
    "congest",
    "variabl",
    "variable tolls",
    "express lane",
    "value",
    "rush",
    "pric",
    "off peak",
    "cordon",
    "dynamic pricing",
    "dynamically priced",  # was misspelled "dynmically priced" and could never match
    "hot",
    "mobility pricing",
    "occupancy",
    "toll lane",
    "performance pricing",
    "managed",
    "charg",
    "tollway",
    "transponder",
    "charge",
    "express toll",
    "fixed"
]

In [46]:
def lower_case(df, columns_to_search: list):
    """
    Add a cleaned, lower-cased copy of each requested text column.

    For every column in columns_to_search a new column "lower_case_<column>"
    is written onto df (the input is modified in place). The text is
    lower-cased, missing values become "none", and the characters "-", "."
    and ":" are removed.

    NOTE(review): hyphens are removed rather than replaced by a space, so
    "off-peak" becomes "offpeak" -- confirm that is what keyword matching
    expects.

    Returns:
        (df, new_columns) where new_columns lists the added column names.
    """
    new_columns = []
    for column in columns_to_search:
        cleaned = df[column].str.lower().fillna("none")
        # regex=False: these are literal characters ("." must not be a wildcard),
        # independent of the pandas version's default.
        for character in ("-", ".", ":"):
            cleaned = cleaned.str.replace(character, "", regex=False)

        new_column = f"lower_case_{column}"
        df[new_column] = cleaned
        new_columns.append(new_column)

    return df, new_columns

In [47]:
def find_keywords(df, columns_to_search: list, keywords_search: list):
    """
    Tag each row with the first keyword found in the given text columns.

    The columns are first normalized by lower_case(). For each normalized
    column "lower_case_<column>" a companion column
    "lower_case_<column>_keyword_search" is added, holding the matched
    keyword or "not congestion" when nothing matched.

    Args:
        df: dataframe to search (columns are added to it).
        columns_to_search: text columns to scan.
        keywords_search: fragments joined with "|" into one regular
            expression; they are not escaped.

    Returns:
        The dataframe with the normalized and keyword columns added.
    """
    df2, lower_case_cols_list = lower_case(df, columns_to_search)

    # With no keywords the pattern would be "()", which matches the empty
    # string in every row and would flag everything as a hit.
    if not keywords_search:
        for column in lower_case_cols_list:
            df2[f"{column}_keyword_search"] = "not congestion"
        return df2

    pattern = f"({'|'.join(keywords_search)})"

    for column in lower_case_cols_list:
        # expand=False returns a Series rather than a one-column DataFrame.
        df2[f"{column}_keyword_search"] = (
            df2[column].str.extract(pattern, expand=False).fillna("not congestion")
        )

    return df2

In [48]:
# test = find_keywords(all_mpo_df,  ["project_title", "project_description", "notes"], congest_keywords)

In [49]:
def _null_fill_values(frame):
    """Map each float64 column to 0.0 and each object column to "None"."""
    fill_values = {}
    for column, dtype in frame.dtypes.items():
        if dtype == "float64":
            fill_values[column] = 0.0
        elif dtype == "object":
            fill_values[column] = "None"
    return fill_values


def filter_cordon(df, columns_to_search: list, keywords_search: list):
    """
    Keep the projects whose searched text contains one of the keywords.

    A row is kept when at least one column in columns_to_search produced a
    keyword match in find_keywords(). Two project titles are then removed by
    hand.

    Args:
        df: harmonized project dataframe with "project_title" and "geometry".
        columns_to_search: text columns to scan for keywords.
        keywords_search: keyword fragments passed to find_keywords().

    Returns:
        (gdf, df2):
          gdf - geo version; rows with empty or invalid geometry are dropped.
          df2 - tabular version without "geometry" and without the
                intermediate "lower_case_<column>" columns.
        In both, nulls are filled with 0.0 (float64) or "None" (object).
    """
    flagged = find_keywords(df, columns_to_search, keywords_search)

    # Use every searched column, not only title/description.
    keyword_cols = [f"lower_case_{col}_keyword_search" for col in columns_to_search]
    is_match = flagged[keyword_cols].ne("not congestion").any(axis=1)
    matches = flagged[is_match].reset_index(drop=True)

    # Titles excluded by hand (labelled as HOV projects in the original notebook).
    projects_to_delete = ["SR 17 Corridor Congestion Relief in Los Gatos",
                         "Interstate 380 Congestion Improvements"]
    matches = matches[~matches.project_title.isin(projects_to_delete)].reset_index(
        drop=True
    )

    # One version that's a gdf
    gdf = matches.set_geometry("geometry")

    # Drop empty and invalid geometries
    gdf = gdf[~gdf.geometry.is_empty].reset_index(drop=True)
    gdf = gdf[gdf.geometry.is_valid].reset_index(drop=True)
    gdf = gdf.fillna(_null_fill_values(gdf))

    # One version that's a df
    columns_to_drop = [f"lower_case_{col}" for col in columns_to_search]
    df2 = matches.drop(columns=columns_to_drop + ["geometry"])
    df2 = df2.fillna(_null_fill_values(df2))

    return gdf, df2

In [50]:
cordon_gdf, cordon_df = filter_cordon(all_mpo_df, ["project_title", "project_description",], congest_keywords)



In [51]:
len(cordon_gdf)

87

In [52]:
len(cordon_df.drop_duplicates(subset = ['project_title']))

54

In [53]:
cordon_df.groupby(["grant_program"]).agg({"project_title":"nunique"})

Unnamed: 0_level_0,project_title
grant_program,Unnamed: 1_level_1
MTC LRTP,7
SANDAG LRTP,46
TMPO LRTP,1


In [55]:
# cordon_gdf.to_file("./congestion_pricing_sample.geojson", driver="GeoJSON")

In [None]:
cordon_df.to_excel("gs://calitp-analytics-data/data-analyses/project_list/LRTP/congestion_pricing_sample.xlsx", sheet_name="Sheet_name_1", index=False)

In [None]:
cordon_df.grant_program.value_counts()

In [None]:
cordon_df.drop_duplicates(subset = ['total_project_cost','project_title',]).groupby(["grant_program"]).agg({'total_project_cost':'sum'})

In [None]:
cols_to_drop = ['city','county', 'lower_case_project_description_keyword_search', 'lower_case_project_title_keyword_search']

In [None]:
# cordon_df.drop(columns =cols_to_drop).drop_duplicates(subset = ['project_title', 'project_description'])

In [None]:
cordon_gdf.explore('project_title', cmap= 'tab20c', style_kwds={'weight':6}, legend = False)