## Long Range Transportation Plan

In [1]:
import os

# USE_PYGEOS has to be set before geopandas is first imported (directly or
# through the project modules below) for the setting to take effect.
os.environ['USE_PYGEOS'] = '0'

import geopandas
import geopandas as gpd
import numpy as np
import pandas as pd
from calitp_data_analysis.sql import to_snakecase
from shared_utils import geography_utils, utils

import _harmonization_utils as harmonization_utils

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


In [2]:
# Notebook-wide pandas display settings: up to 100 columns, floats rendered
# with two decimals, and no truncation of rows or of cell text.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

### SANDAG
* https://www.arcgis.com/home/item.html?id=ba8b2a6247ef463cb7411f039959f3ee
* https://www.arcgis.com/apps/mapviewer/index.html?webmap=ba8b2a6247ef463cb7411f039959f3ee

In [3]:
# Module-level list of harmonized column names.
# NOTE(review): harmonizing_lrtp() defines its own local `columns_to_keep`
# (with "geometry" and "phase" instead of "location"), which shadows this list
# inside that function. No other cell visible in this notebook reads this
# module-level list - confirm whether it is still needed.
columns_to_keep = [
    "project_title",
    "lead_agency",
    "project_year",
    "project_category",
    "grant_program",
    "project_description",
    "total_project_cost",
    "location",
    "city",
    "county",
    "data_source",
    "notes",
]

In [4]:
def create_notes(df, note_cols: list):
    """
    Build a free-text `notes` column out of several existing columns.

    For every column in `note_cols` a string copy named `_<column>` is added
    to `df`. Each row's note is then the comma-separated list of
    "<helper column>: <value>" pairs, after which every underscore in the
    resulting text (labels and values alike) is replaced by a space.

    Args:
        df: DataFrame that holds all of `note_cols`; it is modified in place.
        note_cols: names of the columns to fold into the note.

    Returns:
        The same DataFrame, now with the `_<column>` helpers and `notes`.
    """
    marker = "_"
    helper_cols = []
    for name in note_cols:
        helper = marker + name
        df[helper] = df[name].astype(str)
        helper_cols.append(helper)

    # https://stackoverflow.com/questions/65532480/how-to-combine-column-names-and-values
    def _row_to_note(row):
        pieces = [f"{helper}: {row[helper]}" for helper in helper_cols]
        return ", ".join(pieces)

    df["notes"] = df.apply(_row_to_note, axis=1)
    df["notes"] = df["notes"].str.replace("_", " ")

    return df

In [5]:
def harmonizing_lrtp(
    df,
    project_name_col: str,
    project_description_col: str,
    project_category_col: str,
    project_cost_col: str,
    location_col: str,
    county_col: str,
    city_col: str,
    project_year_col: str,
    phase_col: str,
    program: str,
    data_source: str,
    note_cols: list,
    cost_in_millions: bool = True,
):
    """
    Rename a source dataset's columns to the shared LRTP schema, coerce its
    cost/fund columns to numbers, and return only the harmonized columns.

    Args:
        df: source (Geo)DataFrame.
        project_name_col, project_description_col, project_category_col,
        project_cost_col, location_col, county_col, city_col,
        project_year_col, phase_col: names of the source columns that become
            project_title, project_description, project_category,
            total_project_cost, geometry, county, city, project_year and
            phase. Pass "" when the source has no such column. county, city,
            project_year, project_category and phase are then created and
            filled with the string "None"; the remaining ones must exist.
        program: written to `lead_agency` and used as the prefix of
            `grant_program`.
        data_source: written to `data_source` and used as the suffix of
            `grant_program`.
        note_cols: columns (named as they are AFTER renaming) that
            create_notes() combines into the `notes` column.
        cost_in_millions: when True, every cost/fund column is divided by
            1,000,000; when False the values are left as they are.
            NOTE(review): despite the name, True appears to mean "the source
            is in dollars, convert it to millions" - confirm with callers.

    Returns:
        The frame restricted to the harmonized columns, in a fixed order.

    Raises:
        KeyError: if a column in `note_cols` or a required harmonized column
            is missing after renaming.
    """
    rename_columns = {
        project_name_col: "project_title",
        phase_col: "phase",
        project_description_col: "project_description",
        project_category_col: "project_category",
        project_cost_col: "total_project_cost",
        location_col: "geometry",
        county_col: "county",
        city_col: "city",
        project_year_col: "project_year",
    }
    # "" is the callers' placeholder for "no such column"; keep it out of the
    # mapping instead of relying on no column being named "".
    rename_columns = {old: new for old, new in rename_columns.items() if old}
    df = df.rename(columns=rename_columns)

    # Coerce every cost/fund column to a number. The group is non-capturing
    # so pandas does not warn about unused match groups.
    cost_columns = df.columns[df.columns.str.contains("(?:cost|funds)")].tolist()
    for col in cost_columns:
        values = df[col]
        if not pd.api.types.is_numeric_dtype(values):
            # Cast to str first so that real numbers sitting in an object
            # column survive the "$" / "," clean-up instead of becoming NaN.
            values = values.astype(str).str.replace(r"[$,]", "", regex=True)
        # Anything unparseable (including missing values) becomes 0.
        df[col] = pd.to_numeric(values, errors="coerce").fillna(0)

    # Add MPO & grant program
    df["lead_agency"] = program
    df["grant_program"] = f"{program} {data_source}"

    # Add data source
    df["data_source"] = data_source

    # Divide cost columns by one million when requested.
    if cost_in_millions:
        for col in cost_columns:
            df[col] = df[col].divide(1_000_000)

    # Create columns even if they don't exist, just to harmonize
    # before concatting.
    create_columns = [
        "county",
        "city",
        "notes",
        "project_year",
        "project_category",
        "phase",
    ]
    for column in create_columns:
        if column not in df:
            df[column] = "None"

    # Fail with a readable message rather than a bare column lookup error.
    missing_note_cols = [col for col in note_cols if col not in df.columns]
    if missing_note_cols:
        raise KeyError(f"note_cols not found after renaming: {missing_note_cols}")

    # Create notes (overwrites the placeholder `notes` column created above).
    df = create_notes(df, note_cols)

    columns_to_keep = [
        "project_title",
        "lead_agency",
        "project_year",
        "project_category",
        "grant_program",
        "project_description",
        "phase",
        "total_project_cost",
        "geometry",
        "city",
        "county",
        "data_source",
        "notes",
    ]

    return df[columns_to_keep]

In [6]:
def open_rest_server(url_pt_1: str, url_pt_2: str, layer_name: list):
    """
    Read several layers from a REST feature service and stack them.

    Each layer is read from the URL `url_pt_1 + <layer> + url_pt_2`, its
    column names are passed through to_snakecase(), and a `layer_name` column
    records which layer every row came from.

    Args:
        url_pt_1: URL text placed before the layer name.
        url_pt_2: URL text placed after the layer name (the query string).
        layer_name: layer names to download.

    Returns:
        All layers concatenated row-wise (original row indexes are kept), or
        an empty DataFrame when `layer_name` is empty.
    """
    layers = []
    for layer in layer_name:
        gdf = to_snakecase(gpd.read_file(f"{url_pt_1}{layer}{url_pt_2}"))
        gdf["layer_name"] = layer
        layers.append(gdf)

    if not layers:
        # pd.concat raises on an empty list; keep the old "empty frame" result.
        return pd.DataFrame()

    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(layers, axis=0)

In [7]:
# Columns that sandag_lrtp() selects from the combined SANDAG layers.
# The order here is the column order of the frame it returns, and every name
# must exist in the combined layers or the selection raises a KeyError.
sandag_columns_keep = [
    "type",
    "caption",
    "phase",
    "year_built",
    "category",
    "project_name",
    "description",
    "cost2020m",
    "aqc_2016_1",
    "aqc_2020_1",
    "pricmcp",
    "conncmcp",
    "status",
    "geometry",
    "layer_name",
    "corridor_i",
    "type_1",
    "existing",
    "limits",
    "description_1",
    "route",
    "routetype",
    "route_desc",
    "rp_2021_id",
    "rp_2021_id_1",
    "capital_cost___2020__millions",
]

In [8]:
def sandag_lrtp():
    """
    Download the SANDAG feature layers listed below through
    open_rest_server() and keep only the columns in `sandag_columns_keep`.
    """
    service_root = "https://services1.arcgis.com/HG80xaIVT1z1OdO5/ArcGIS/rest/services/"
    query_suffix = "/FeatureServer/0/query?where=1%3D1&objectIds=&time=&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&resultType=none&distance=0.0&units=esriSRUnit_Meter&relationParam=&returnGeodetic=false&outFields=*+&returnGeometry=true&featureEncoding=esriDefault&multipatchOption=xyFootprint&maxAllowableOffset=&geometryPrecision=&outSR=&defaultSR=&datumTransformation=&applyVCSProjection=false&returnIdsOnly=false&returnUniqueIdsOnly=false&returnCountOnly=false&returnExtentOnly=false&returnQueryGeometry=false&returnDistinctValues=false&cacheHint=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&having=&resultOffset=&resultRecordCount=&returnZ=false&returnM=false&returnExceededLimitFeatures=true&quantizationParameters=&sqlFormat=none&f=pgeojson&token="

    # One feature service per layer; the name is spliced into the URL.
    layer_names = [
        "Complete_Corridors_Connectors_Intersections_Ramps_RP2021",
        "Complete_Corridors_Highway_RP2021",
        "Transit_Leap_RP2021",
        "Adopted_Regional_Bike_Network_RP2021",
        "Mobility_Hubs_and_Flexible_Fleets",
        "Complete_Corridors_Regional_Arterials",
        "Goods_Movement",
    ]

    all_layers = open_rest_server(service_root, query_suffix, layer_names)
    return all_layers[sandag_columns_keep]

In [9]:
sandag_rp_21 = sandag_lrtp()

In [10]:
sandag_rp_21.layer_name.nunique()

7

In [11]:
sandag_rp_21.layer_name.value_counts()

Adopted_Regional_Bike_Network_RP2021                        185
Transit_Leap_RP2021                                          70
Complete_Corridors_Connectors_Intersections_Ramps_RP2021     64
Complete_Corridors_Highway_RP2021                            56
Mobility_Hubs_and_Flexible_Fleets                            31
Goods_Movement                                                9
Complete_Corridors_Regional_Arterials                         1
Name: layer_name, dtype: int64

In [12]:
len(sandag_rp_21)

416

In [13]:
def harmonize_sandag():
    """
    Fetch the SANDAG layers with sandag_lrtp() and map them onto the shared
    schema with harmonizing_lrtp(). County, city, project year and phase are
    passed as "" (no source column); costs are not divided by one million.
    """
    # Source columns folded into the free-text `notes` column.
    sandag_note_cols = [
        "year_built",
        "aqc_2016_1",
        "aqc_2020_1",
        "pricmcp",
        "conncmcp",
        "status",
        "layer_name",
        "corridor_i",
        "type_1",
        "existing",
        "limits",
        "description_1",
        "route",
        "routetype",
        "route_desc",
        "rp_2021_id",
        "rp_2021_id_1",
        "capital_cost___2020__millions",
    ]

    return harmonizing_lrtp(
        sandag_lrtp(),
        project_name_col="project_name",
        project_description_col="description",
        project_category_col="category",
        project_cost_col="cost2020m",
        location_col="geometry",
        county_col="",
        city_col="",
        project_year_col="",
        phase_col="",
        program="SANDAG",
        data_source="LRTP",
        note_cols=sandag_note_cols,
        cost_in_millions=False,
    )

In [14]:
sandag_rp_21.cost2020m.nunique()

110

### MTC
* The project list has 133 projects, but the point and line map layers return roughly 240 rows.
* Projects in different locations can share the same name and are split by open year.
* The geospatial layers appear to split each project into phases, while the tabular project list has one row per project in its entirety.
* https://www.planbayarea.org/2050-plan/final-plan-bay-area-2050/final-supplemental-reports/interactive-transportation-project-list
* https://opendata.mtc.ca.gov/datasets/MTC::2023-transportation-improvement-program-projects-point/about
* https://opendata.mtc.ca.gov/datasets/MTC::plan-bay-area-2050-transportation-projects-line/about
* https://data.bayareametro.gov/Projects/Plan-Bay-Area-2050-Transportation-Project-List/y7ka-jksz

In [15]:
def mtc_lrtp():
    """
    Build the MTC project table: mapped projects from the two map layers,
    enriched with the attributes of the full project list, plus the projects
    from the full list that have no match in the map layers.

    Returns:
        A GeoDataFrame with `geometry` as the active geometry column and the
        CRS set to geography_utils.WGS84. Rows taken only from the full
        project list have no geometry.
    """
    # Open rest server data
    layer_list = ["planbayarea2050_rtp_line", "planbayarea2050_rtp_point"]
    url_pt_1 = "https://services3.arcgis.com/i2dkYWmb4wHvYPda/ArcGIS/rest/services/"
    url_pt_2 = "/FeatureServer/0/query?where=1%3D1&objectIds=&time=&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&resultType=none&distance=0.0&units=esriSRUnit_Meter&relationParam=&returnGeodetic=false&outFields=*+&returnGeometry=true&featureEncoding=esriDefault&multipatchOption=xyFootprint&maxAllowableOffset=&geometryPrecision=&outSR=&defaultSR=&datumTransformation=&applyVCSProjection=false&returnIdsOnly=false&returnUniqueIdsOnly=false&returnCountOnly=false&returnExtentOnly=false&returnQueryGeometry=false&returnDistinctValues=false&cacheHint=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&having=&resultOffset=&resultRecordCount=&returnZ=false&returnM=false&returnExceededLimitFeatures=true&quantizationParameters=&sqlFormat=none&f=pgeojson&token="
    gdf = open_rest_server(url_pt_1, url_pt_2, layer_list)

    # Open all MTC projects. Not all projects are mappable.
    all_projects_url = "https://data.bayareametro.gov/resource/y7ka-jksz.json?$query=SELECT%0A%20%20%60plan_strategy%60%2C%0A%20%20%60rtpid%60%2C%0A%20%20%60title%60%2C%0A%20%20%60scope%60%2C%0A%20%20%60open_period%60%2C%0A%20%20%60funding_millions_yoe%60%2C%0A%20%20%60county%60"
    all_projects = pd.read_json(all_projects_url)

    # Attach the project-list attributes to every mapped row whose title
    # matches; map rows without a match are dropped by the inner join.
    mapped = pd.merge(
        gdf, all_projects, how="inner", left_on="proj_title", right_on="title"
    ).drop(columns=["title"])

    # Projects from the full list that never matched a map row (unmappable).
    unmapped = (
        all_projects[~all_projects.title.isin(mapped.proj_title)]
        .reset_index(drop=True)
        .rename(columns={"title": "proj_title"})
    )
    final = pd.concat([unmapped, mapped])

    # The previous `final.rename({"mode detai": "mode_detail"})` had no
    # `columns=` argument, so it targeted the row index and changed nothing.
    # It is removed rather than "fixed": harmonize_mtc() reads the column
    # under its current name, `mode_detai`.

    # Set geometry again
    return final.set_geometry("geometry").set_crs(geography_utils.WGS84)

In [16]:
def harmonize_mtc():
    """
    Fetch the MTC projects with mtc_lrtp() and map them onto the shared
    schema with harmonizing_lrtp(). City and phase are passed as "" (no
    source column); costs are not divided by one million.
    """
    # Source columns folded into the free-text `notes` column.
    mtc_note_cols = ["strategy", "layer_name", "mode_detai"]

    return harmonizing_lrtp(
        mtc_lrtp(),
        project_name_col="proj_title",
        project_description_col="subcompone",
        project_category_col="plan_strategy",
        project_cost_col="funding_millions_yoe",
        location_col="geometry",
        county_col="county",
        city_col="",
        project_year_col="open_year",
        phase_col="",
        program="MTC",
        data_source="LRTP",
        note_cols=mtc_note_cols,
        cost_in_millions=False,
    )

In [17]:
mtc = harmonize_mtc()



In [18]:
type(mtc)

geopandas.geodataframe.GeoDataFrame

### Tahoe
* Open data portal: https://www.tahoeopendata.org/search?tags=transportation
* https://www.tahoeopendata.org/datasets/TRPA::2020-regional-transportation-plan-projects/about

In [19]:
def tmpo_lrtp(last_completed_year: int = 2022):
    """
    Download the Tahoe (TRPA) transportation plan projects and keep those
    whose completion year is later than `last_completed_year`.

    Args:
        last_completed_year: projects with `complete_year` less than or equal
            to this year are dropped. Defaults to 2022.

    Returns:
        The remaining projects with a fresh index and a `project_year` column
        formatted as "<start_year>-<complete_year>".
    """
    tahoe_url = "https://maps.trpa.org/server/rest/services/Datadownloader_Transportation/MapServer/19/query?where=1%3D1&text=&objectIds=&time=&timeRelation=esriTimeRelationOverlaps&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&distance=&units=esriSRUnit_Foot&relationParam=&outFields=*+&returnGeometry=true&returnTrueCurves=false&maxAllowableOffset=&geometryPrecision=&outSR=&havingClause=&returnIdsOnly=false&returnCountOnly=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&returnZ=false&returnM=false&gdbVersion=&historicMoment=&returnDistinctValues=false&resultOffset=&resultRecordCount=&returnExtentOnly=false&sqlFormat=none&datumTransformation=&parameterValues=&rangeValues=&quantizationParameters=&featureEncoding=esriDefault&f=geojson"
    gdf = to_snakecase(gpd.read_file(tahoe_url))
    gdf["complete_year"] = gdf.complete_year.astype(int)

    # Drop projects that finished in or before `last_completed_year`.
    upcoming = gdf[gdf.complete_year > last_completed_year].reset_index(drop=True)

    # Both halves must come from the filtered frame: after reset_index its
    # row labels no longer line up with the unfiltered `gdf`.
    upcoming["project_year"] = (
        upcoming.start_year.astype(str) + "-" + upcoming.complete_year.astype(str)
    )
    return upcoming

In [20]:
tahoe_original = tmpo_lrtp()

In [21]:
def harmonize_tahoe():
    """
    Fetch the Tahoe projects with tmpo_lrtp() and map them onto the shared
    schema with harmonizing_lrtp(). County and city are passed as "" (no
    source column); cost columns are divided by one million.
    """
    # Source columns folded into the free-text `notes` column.
    tahoe_note_cols = [
        "implementer",
        "funding_type",
        "plan_name",
        "label",
        "financial_status",
        "url",
    ]

    return harmonizing_lrtp(
        tmpo_lrtp(),
        project_name_col="project_name",
        project_description_col="description",
        project_category_col="category",
        project_cost_col="estimated_cost",
        location_col="geometry",
        county_col="",
        city_col="",
        project_year_col="project_year",
        phase_col="phase",
        program="TMPO",
        data_source="LRTP",
        note_cols=tahoe_note_cols,
        cost_in_millions=True,
    )

### Concat all 3

In [22]:
def all_mpo():
    """
    Harmonize the Tahoe, SANDAG and MTC datasets and stack them into one
    frame, ordered Tahoe, then MTC, then SANDAG.
    """
    # Downloads run in the order Tahoe, SANDAG, MTC.
    tahoe, sandag, mtc = harmonize_tahoe(), harmonize_sandag(), harmonize_mtc()
    return pd.concat([tahoe, mtc, sandag])

In [23]:
all_mpo_df = all_mpo()



In [24]:
all_mpo_df.groupby(['grant_program']).size()

grant_program
MTC LRTP       282
SANDAG LRTP    416
TMPO LRTP       75
dtype: int64

In [203]:
# Keywords that flag a project as congestion-pricing related. They are joined
# into a single regex alternation and matched as plain substrings against the
# lower-cased title, description and notes, so the first listed keyword that
# matches at a position is the one reported.
# NOTE(review): the saved value_counts output further down lists "fee", which
# is not in this list - that output looks stale; re-run to confirm.
congest_keywords = [
    "congest",
    "vary",
    "variable tolls",
    "express lane",
    "tolling",
    "value",
    "rush",
    "pricing",
    "price",
    "off peak",
    "variabl",
    "cordon",
    "dynamically priced",
    "dynamic pricing",
    "hot",
    "mobility pricing",
    "hov",
    "occupancy",
    "toll lane",
    "performance pricing",
]

In [204]:
all_mpo_df.columns

Index(['project_title', 'lead_agency', 'project_year', 'project_category',
       'grant_program', 'project_description', 'phase', 'total_project_cost',
       'geometry', 'city', 'county', 'data_source', 'notes',
       'lower_case_project_title', 'lower_case_project_description',
       'lower_case_notes', 'project_keyword_found', 'title_keyword_found',
       'notes_keyword_found'],
      dtype='object')

In [205]:
def lower_case_columns(df, columns_to_search: list):
    """
    Add a lower-cased copy of each requested text column.

    For every column `c` in `columns_to_search`, a column `lower_case_c` is
    written to `df` (in place); missing values become the string "none".

    Returns:
        A tuple of the same DataFrame and the list of new column names, in
        the order they were requested.
    """
    lowered_names = []
    for source in columns_to_search:
        target = f"lower_case_{source}"
        lowered_text = df[source].str.lower()
        df[target] = lowered_text.fillna("none")
        lowered_names.append(target)

    return df, lowered_names

In [206]:
all_mpo_df, lowercase_list = lower_case_columns(all_mpo_df,  ['project_title','project_description', 'notes'])

In [207]:
congestion_search = f"({'|'.join(congest_keywords)})"

In [208]:
all_mpo_df.columns

Index(['project_title', 'lead_agency', 'project_year', 'project_category',
       'grant_program', 'project_description', 'phase', 'total_project_cost',
       'geometry', 'city', 'county', 'data_source', 'notes',
       'lower_case_project_title', 'lower_case_project_description',
       'lower_case_notes', 'project_keyword_found', 'title_keyword_found',
       'notes_keyword_found'],
      dtype='object')

In [209]:
all_mpo_df["project_keyword_found"] = all_mpo_df.lower_case_project_description.str.extract(congestion_search).fillna("not congestion")

In [210]:
all_mpo_df["project_keyword_found"].value_counts()

not congestion    724
express lane       33
congest             7
hov                 4
tolling             2
fee                 1
occupancy           1
toll lane           1
Name: project_keyword_found, dtype: int64

In [211]:
all_mpo_df["title_keyword_found"] = all_mpo_df.lower_case_project_title.str.extract(congestion_search).fillna("not congestion")

In [212]:
all_mpo_df["title_keyword_found"].value_counts()

not congestion    734
express lane       32
congest             4
tolling             2
hov                 1
Name: title_keyword_found, dtype: int64

In [213]:
all_mpo_df["notes_keyword_found"] = all_mpo_df.lower_case_notes.str.extract(congestion_search).fillna("not congestion")

In [214]:
all_mpo_df["notes_keyword_found"].value_counts()

not congestion    725
express lane       42
hov                 3
toll lane           2
tolling             1
Name: notes_keyword_found, dtype: int64

In [216]:
len(all_mpo_df[(all_mpo_df.title_keyword_found != "not congestion") | (all_mpo_df.project_keyword_found != "not congestion") | (all_mpo_df.notes_keyword_found != "not congestion") ])

63

In [217]:
cordon_pricing = (all_mpo_df[(all_mpo_df.title_keyword_found != "not congestion") 
                             | (all_mpo_df.project_keyword_found != "not congestion") 
                             | (all_mpo_df.notes_keyword_found != "not congestion")]).reset_index(drop = True)

In [218]:
cordon_pricing[['project_title','title_keyword_found','project_description','project_keyword_found', 'notes', 'notes_keyword_found']]

Unnamed: 0,project_title,title_keyword_found,project_description,project_keyword_found,notes,notes_keyword_found
0,State Route 89 Recreation Corridor Improvements,not congestion,"The project will produce a SR 89 Recreation Corridor Management Plan and will implement the plan’s recommendations. Expected improvements include enhancing parking, creating a seamless fee system, improve wayfinding, enhance visitor experience, redesign or repurpose existing facilities to enhance day-use activities, utilize adaptive traffic management on SR 89 that could prioritize transit, incentivize multi-modal transportation and deliver targeted advertising to inform visitors of their tran",fee,"implementer: Tahoe Regional Planning Agency/US Forest Service, funding type: Plan, plan name: , label: SR 89 Corridor Plan, financial status: Constrained, url: https://eip.laketahoeinfo.org/Project/FactSheet/03.01.02.0044",not congestion
1,Adaptive Traffic Management on US 50,not congestion,"This is a coordinated project between El Dorado County, TRPA, Caltrans, and the City of South Lake Tahoe to adaptively manage the Hwy 50 basin entry. Examples of Adaptive Traffic Management strategies include bus only lanes or modified lane configurations during peak travel times. Through improved operations of basin entry roads, user satisfaction improves, congestion decreases, transit use increases, and impacts to the environment (including GHG emissions) are expected to decrease.",congest,"implementer: California Department of Transportation, funding type: Capital, plan name: , label: US 50 Adaptive Traffic Management, financial status: Unconstrained, url: https://eip.laketahoeinfo.org/Project/FactSheet/03.02.01.0047",not congestion
2,All Electronic Tolling Program | Regional,tolling,,not congestion,"strategy: nan, layer name: nan, mode detai: nan",not congestion
3,Express Lanes | Regional,express lane,Express Lanes: I-880 NB and SB,express lane,"strategy: T12. Build an Integrated Regional Express Lane and Express Bus Network, layer name: planbayarea2050 rtp line, mode detai: Express Lane",express lane
4,Express Lanes | Regional,express lane,Express Lanes: I-680 NB,express lane,"strategy: T12. Build an Integrated Regional Express Lane and Express Bus Network, layer name: planbayarea2050 rtp line, mode detai: Express Lane",express lane
5,Express Lanes | Regional,express lane,Express Lanes: I-680 SB,express lane,"strategy: T12. Build an Integrated Regional Express Lane and Express Bus Network, layer name: planbayarea2050 rtp line, mode detai: Express Lane",express lane
6,Express Lanes | Regional,express lane,Express Lanes: I-680 NB,express lane,"strategy: T12. Build an Integrated Regional Express Lane and Express Bus Network, layer name: planbayarea2050 rtp line, mode detai: Express Lane",express lane
7,Express Lanes | Regional,express lane,Express Lanes: I-80 NB and SB,express lane,"strategy: T12. Build an Integrated Regional Express Lane and Express Bus Network, layer name: planbayarea2050 rtp line, mode detai: Express Lane",express lane
8,Express Lanes | Regional,express lane,Express Lanes: I-680 NB,express lane,"strategy: T12. Build an Integrated Regional Express Lane and Express Bus Network, layer name: planbayarea2050 rtp line, mode detai: Express Lane",express lane
9,Express Lanes | Regional,express lane,Express Lanes: I-580 EB and WB,express lane,"strategy: T12. Build an Integrated Regional Express Lane and Express Bus Network, layer name: planbayarea2050 rtp line, mode detai: Express Lane",express lane


In [219]:
cordon_pricing = cordon_pricing.set_geometry("geometry")

In [220]:
len(cordon_pricing)

63

In [221]:
cordon_pricing2 = cordon_pricing[~cordon_pricing.geometry.is_empty].reset_index(
    drop=True
)

In [222]:
cordon_pricing2 = cordon_pricing2[cordon_pricing2.geometry.is_valid].reset_index(
    drop=True
)

In [223]:
len(cordon_pricing2)

62

In [225]:
cordon_pricing2.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 62 entries, 0 to 61
Data columns (total 19 columns):
 #   Column                          Non-Null Count  Dtype   
---  ------                          --------------  -----   
 0   project_title                   61 non-null     object  
 1   lead_agency                     62 non-null     object  
 2   project_year                    62 non-null     object  
 3   project_category                62 non-null     object  
 4   grant_program                   62 non-null     object  
 5   project_description             62 non-null     object  
 6   phase                           62 non-null     object  
 7   total_project_cost              62 non-null     float64 
 8   geometry                        62 non-null     geometry
 9   city                            62 non-null     object  
 10  county                          62 non-null     object  
 11  data_source                     62 non-null     object  
 12  notes           

In [None]:
# Fill remaining nulls by column dtype: 0.0 for float64 columns and the
# string "None" for object columns. Columns of any other dtype (for example
# geometry) are left untouched. The earlier version read dtypes from an
# undefined `df`, which raises NameError at notebook level.
null_fill_values = {
    column: (0.0 if str(dtype) == "float64" else "None")
    for column, dtype in cordon_pricing2.dtypes.items()
    if str(dtype) in ("float64", "object")
}
cordon_pricing2 = cordon_pricing2.fillna(null_fill_values)

In [None]:
cordon_pricing2.drop(columns=["geometry"])