## Long Range Transportation Plan

In [1]:
import _harmonization_utils as harmonization_utils
import geopandas as gpd
import numpy as np
import pandas as pd
from calitp_data_analysis.sql import to_snakecase
from shared_utils import geography_utils, utils


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

### SANDAG
* https://www.arcgis.com/home/item.html?id=ba8b2a6247ef463cb7411f039959f3ee
* https://www.arcgis.com/apps/mapviewer/index.html?webmap=ba8b2a6247ef463cb7411f039959f3ee

In [3]:
columns_to_keep = [
    "project_title",
    "lead_agency",
    "project_year",
    "project_category",
    "grant_program",
    "project_description",
    "total_project_cost",
    "location",
    "city",
    "county",
    "data_source",
    "notes",
]

In [4]:
def create_notes(df, note_cols: list):
    prefix = "_"
    for column in note_cols:
        df[f"{prefix}{column}"] = df[column].astype(str)
    note_cols = [prefix + sub for sub in note_cols]

    # https://stackoverflow.com/questions/65532480/how-to-combine-column-names-and-values
    def combine_notes(x):
        return ", ".join([col + ": " + x[col] for col in note_cols])

    df["notes"] = df.apply(combine_notes, axis=1)
    df.notes = df.notes.str.replace("_", " ")

    return df

In [5]:
def harmonizing_lrtp(
    df,
    project_name_col: str,
    project_description_col: str,
    project_category_col: str,
    project_cost_col: str,
    location_col: str,
    county_col: str,
    city_col: str,
    project_year_col: str,
    phase_col: str,
    program: str,
    data_source: str,
    note_cols: list,
    cost_in_millions: bool = True,
):
    """
    Take a dataset and change the column names/types to
    the same names and formats.
    """
    rename_columns = {
        project_name_col: "project_title",
        phase_col: "phase",
        project_description_col: "project_description",
        project_category_col: "project_category",
        project_cost_col: "total_project_cost",
        location_col: "geometry",
        county_col: "county",
        city_col: "city",
        project_year_col: "project_year",
    }
    # Rename columns
    df = df.rename(columns=rename_columns)

    # Coerce cost/fund columns to right type
    cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  
    for i in cost_columns:
        df[i] = df[i].apply(pd.to_numeric, errors="coerce")
 
    # Add MPO & grant program
    df["lead_agency"] = program
    df["grant_program"] = f"{program} {data_source}"

    # Add data source
    df["data_source"] = data_source

    # Divide cost columns by millions
    # If bool is set to True
    if cost_in_millions:
        for i in cost_columns:
            df[i] = df[i].divide(1_000_000)

    # Fill in any nulls
    # df = df.fillna(df.dtypes.replace({"float64": 0.0, "object": "None"}))

    # Create columns even if they don't exist, just to harmonize
    # before concatting.
    create_columns = [
        "county",
        "city",
        "notes",
        "project_year",
        "project_category",
        "phase",
    ]
    for column in create_columns:
        if column not in df:
            df[column] = "None"
    if "geometry" not in df:
        df["geometry"] = None
    # Create notes
    df = create_notes(df, note_cols)

    columns_to_keep = [
        "project_title",
        "lead_agency",
        "project_year",
        "project_category",
        "grant_program",
        "project_description",
        "phase",
        "total_project_cost",
        "geometry",
        "city",
        "county",
        "data_source",
        "notes",
    ]

    df = df[columns_to_keep]

    return df

In [6]:
def open_rest_server(url_pt_1: str, url_pt_2: str, layer_name: list):
    full_gdf = pd.DataFrame()
    for i in layer_name:
        gdf = to_snakecase(gpd.read_file(f"{url_pt_1}{i}{url_pt_2}"))
        gdf["layer_name"] = i
        full_gdf = pd.concat([full_gdf, gdf], axis=0)

    return full_gdf

In [7]:
def sandag_lrtp():
    sandag_layers_list = [
        "Complete_Corridors_Connectors_Intersections_Ramps_RP2021",
        "Complete_Corridors_Highway_RP2021",
        "Transit_Leap_RP2021",
        "Adopted_Regional_Bike_Network_RP2021",
        "Mobility_Hubs_and_Flexible_Fleets",
        "Complete_Corridors_Regional_Arterials",
        "Goods_Movement",
    ]

    sandag_url_pt1 = (
        "https://services1.arcgis.com/HG80xaIVT1z1OdO5/ArcGIS/rest/services/"
    )
    sandag_url_pt2 = "/FeatureServer/0/query?where=1%3D1&objectIds=&time=&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&resultType=none&distance=0.0&units=esriSRUnit_Meter&relationParam=&returnGeodetic=false&outFields=*+&returnGeometry=true&featureEncoding=esriDefault&multipatchOption=xyFootprint&maxAllowableOffset=&geometryPrecision=&outSR=&defaultSR=&datumTransformation=&applyVCSProjection=false&returnIdsOnly=false&returnUniqueIdsOnly=false&returnCountOnly=false&returnExtentOnly=false&returnQueryGeometry=false&returnDistinctValues=false&cacheHint=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&having=&resultOffset=&resultRecordCount=&returnZ=false&returnM=false&returnExceededLimitFeatures=true&quantizationParameters=&sqlFormat=none&f=pgeojson&token="
    sandag = open_rest_server(sandag_url_pt1, sandag_url_pt2, sandag_layers_list)
   
    return sandag

In [8]:
def harmonize_sandag():
    df = sandag_lrtp()
    df = harmonizing_lrtp(
        df,
        project_name_col="project_name",
        project_description_col="description",
        project_category_col="category",
        project_cost_col="cost2020m",
        location_col="geometry",
        county_col="",
        city_col="",
        project_year_col="phase",
        phase_col="status",
        program="SANDAG",
        data_source="LRTP",
        note_cols=[
            "aqc_2016_1",
            "aqc_2020_1",
            "pricmcp",
            "conncmcp",
            "layer_name",
            "corridor_i",
            "type_1",
            "existing",
            "limits",
            "description_1",
            "route",
            "routetype",
            "route_desc",
            "rp_2021_id",
            "rp_2021_id_1",
            "capital_cost___2020__millions",
        ],
        cost_in_millions=True,
    )

    return df

### Project Cost Division

In [9]:
def correct_project_cost(df, project_title_col:str, project_total_cost:str):
    """
    For some datasets, the same project 
    (as determined by the same project name, cost,
    and source) is split across multiple rows. 

    Ex: A project costs $500 million and is 
    split on 5 rows by phase/location. Each row still lists 
    the total  cost as $500 million, which is not accurate.
    This function will recalculate each of the row to list
    $100 mil as the total project cost
    """
    # Create a unique identifier
    df['unique_identifier'] =  df[project_title_col] + df[project_total_cost].astype(str)
    
    # Create count for each project
    df['how_many_times_same_proj_appears'] = df.groupby('unique_identifier').cumcount()+1
    
    # Find the total number of times a project title-cost appears. 
    # Sort by descending and keep only the row with the highest level
    df2 = (df[[project_title_col,'how_many_times_same_proj_appears',project_total_cost, 'unique_identifier']]
            .sort_values([project_title_col, 'how_many_times_same_proj_appears'], ascending = False)
            .drop_duplicates(subset = ['unique_identifier'])
           ) 
    # Create new funding estimate
    df2['new_proj_cost'] = df2[project_total_cost] / df2["how_many_times_same_proj_appears"]
    
    # Drop some columns
    df2 = df2.drop(columns = [project_title_col,project_total_cost,'how_many_times_same_proj_appears'])
    
    # Merge
    m1 = pd.merge(df, df2, how = "inner", on = "unique_identifier")
    
    # Clean up
    m1 = m1.drop(columns = ['unique_identifier', 'how_many_times_same_proj_appears', project_total_cost])
    m1['total_project_cost_note'] = "This is an estimate of how much the project cost, estimated by dividing the total project cost by how many times the project appears in the dataset."
    
    # Replace project cost
    m1 = m1.rename(columns = {'new_proj_cost':'total_project_cost'})
    return m1

### MTC
* 133 projects but point and line map returns 240 ish rows.
* Projects that are in different locations can have the same name split by open year...
* I guess the geospatial projects are split into various phases while the regular pandas df is just the project in its entirety.
* https://www.planbayarea.org/2050-plan/final-plan-bay-area-2050/final-supplemental-reports/interactive-transportation-project-list
* https://opendata.mtc.ca.gov/datasets/MTC::2023-transportation-improvement-program-projects-point/about
* https://opendata.mtc.ca.gov/datasets/MTC::plan-bay-area-2050-transportation-projects-line/about
* https://data.bayareametro.gov/Projects/Plan-Bay-Area-2050-Transportation-Project-List/y7ka-jksz

In [15]:
def mtc_lrtp():
    # Open rest server data
    layer_list = ["planbayarea2050_rtp_line", "planbayarea2050_rtp_point"]
    url_pt_1 = "https://services3.arcgis.com/i2dkYWmb4wHvYPda/ArcGIS/rest/services/"
    url_pt_2 = "/FeatureServer/0/query?where=1%3D1&objectIds=&time=&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&resultType=none&distance=0.0&units=esriSRUnit_Meter&relationParam=&returnGeodetic=false&outFields=*+&returnGeometry=true&featureEncoding=esriDefault&multipatchOption=xyFootprint&maxAllowableOffset=&geometryPrecision=&outSR=&defaultSR=&datumTransformation=&applyVCSProjection=false&returnIdsOnly=false&returnUniqueIdsOnly=false&returnCountOnly=false&returnExtentOnly=false&returnQueryGeometry=false&returnDistinctValues=false&cacheHint=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&having=&resultOffset=&resultRecordCount=&returnZ=false&returnM=false&returnExceededLimitFeatures=true&quantizationParameters=&sqlFormat=none&f=pgeojson&token="
    gdf = open_rest_server(url_pt_1, url_pt_2, layer_list)

    # Open all MTC projects. Not all projects are mappable.
    all_projects_url = "https://data.bayareametro.gov/resource/y7ka-jksz.json?$query=SELECT%0A%20%20%60plan_strategy%60%2C%0A%20%20%60rtpid%60%2C%0A%20%20%60title%60%2C%0A%20%20%60scope%60%2C%0A%20%20%60open_period%60%2C%0A%20%20%60funding_millions_yoe%60%2C%0A%20%20%60county%60"
    all_projects = pd.read_json(all_projects_url)

    # Merge info from all projects to gdf
    gdf = pd.merge(
        gdf, all_projects, how="inner", left_on=["proj_title"], right_on=["title"]
    )

    gdf = gdf.drop(columns=["title"])

    # Find projects that are unmappable
    mtc_geometry_titles = set(gdf.proj_title.unique().tolist())
    mtc_all_projects_titles = set(all_projects.title.unique().tolist())
    missing_projects = list(mtc_all_projects_titles - mtc_geometry_titles)

    # Add back in the  unmappable projects
    all_projects = all_projects[all_projects.title.isin(missing_projects)].reset_index(
        drop=True
    )
    all_projects = all_projects.rename(columns={"title": "proj_title"})
    final = pd.concat([all_projects, gdf])

    # Correct typo
    final = final.rename({"mode detai": "mode_detail"})

    # Set geometry again
    final = final.set_geometry("geometry").set_crs(geography_utils.WGS84)
    
    # Same project is split across multiple rows. Divide out project cost
    final = correct_project_cost(final,'proj_title', 'funding_millions_yoe')
    return final

In [17]:
mtc_og = mtc_lrtp()



In [18]:
mtc_og.columns

Index(['plan_strategy', 'rtpid', 'proj_title', 'scope', 'open_period',
       'county', 'objectid', 'map_id', 'subcompone', 'open_year', 'strategy',
       'rtp_id', 'mode_', 'mode_detai', 'investment', 'shape__length',
       'geometry', 'layer_name', 'total_project_cost',
       'total_project_cost_note'],
      dtype='object')

In [19]:
def harmonize_mtc():
    df = mtc_lrtp()
    df = harmonizing_lrtp(
        df,
        project_name_col="proj_title",
        project_description_col="subcompone",
        project_category_col="plan_strategy",
        project_cost_col="funding_millions_yoe",
        location_col="geometry",
        county_col="county",
        city_col="",
        project_year_col="open_year",
        phase_col="",
        program="MTC",
        data_source="LRTP",
        note_cols=[
            "strategy",
            "layer_name",
            "mode_detai",
        ],
        cost_in_millions=False,
    )

    return df

In [20]:
harmonized_mtc = harmonize_mtc()



In [21]:
harmonized_mtc.head()

Unnamed: 0,project_title,lead_agency,project_year,project_category,grant_program,project_description,phase,total_project_cost,geometry,city,county,data_source,notes
0,Operate & Maintain the Existing System | Baseline Public Transit Service Levels | Regional,MTC,,Fix It First,MTC LRTP,,,211000.0,,,Regional,LRTP,"strategy: nan, layer name: nan, mode detai: nan"
1,Operate & Maintain the Existing System | Public Transit Capital Assets | Regional,MTC,,Fix It First,MTC LRTP,,,59500.0,,,Regional,LRTP,"strategy: nan, layer name: nan, mode detai: nan"
2,Operate & Maintain the Existing System | Local Streets & Roads | Regional,MTC,,Fix It First,MTC LRTP,,,61900.0,,,Regional,LRTP,"strategy: nan, layer name: nan, mode detai: nan"
3,Operate & Maintain the Existing System | Local Bridges | Regional,MTC,,Fix It First,MTC LRTP,,,2550.0,,,Regional,LRTP,"strategy: nan, layer name: nan, mode detai: nan"
4,Operate & Maintain the Existing System | Toll Bridges | Regional,MTC,,Fix It First,MTC LRTP,,,21900.0,,,Regional,LRTP,"strategy: nan, layer name: nan, mode detai: nan"


### Tahoe
* Open data portal: https://www.tahoeopendata.org/search?tags=transportation
* https://www.tahoeopendata.org/datasets/TRPA::2020-regional-transportation-plan-projects/about

In [22]:
def tmpo_lrtp():
    tahoe_url = "https://maps.trpa.org/server/rest/services/Datadownloader_Transportation/MapServer/19/query?where=1%3D1&text=&objectIds=&time=&timeRelation=esriTimeRelationOverlaps&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&distance=&units=esriSRUnit_Foot&relationParam=&outFields=*+&returnGeometry=true&returnTrueCurves=false&maxAllowableOffset=&geometryPrecision=&outSR=&havingClause=&returnIdsOnly=false&returnCountOnly=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&returnZ=false&returnM=false&gdbVersion=&historicMoment=&returnDistinctValues=false&resultOffset=&resultRecordCount=&returnExtentOnly=false&sqlFormat=none&datumTransformation=&parameterValues=&rangeValues=&quantizationParameters=&featureEncoding=esriDefault&f=geojson"
    gdf = to_snakecase(gpd.read_file(tahoe_url))
    gdf.complete_year = gdf.complete_year.astype(int)

    # Filter out projects that are not yet completed
    gdf2 = gdf[gdf.complete_year > 2022].reset_index(drop=True)

    gdf2["project_year"] = (
        gdf2.start_year.astype(str) + "-" + gdf.complete_year.astype(str)
    )
    return gdf2

In [23]:
def harmonize_tahoe():
    df = tmpo_lrtp()
    df = harmonizing_lrtp(
        df,
        project_name_col="project_name",
        project_description_col="description",
        project_category_col="category",
        project_cost_col="estimated_cost",
        location_col="geometry",
        county_col="",
        city_col="",
        project_year_col="project_year",
        program="TMPO",
        phase_col="phase",
        data_source="LRTP",
        note_cols=[
            "implementer",
            "funding_type",
            "plan_name",
            "label",
            "financial_status",
            "url",
        ],
        cost_in_millions=True,
    )

    return df

### Southern California Association of Governments (SCAG) -> Check that project cost is correct.

In [24]:
def delete_embedded_headers(df, column:str, string_search:str) -> pd.DataFrame:
    """
    Some PDFS include the column names embedded mulitple times
    within the df. Delete them out. 
    """
    headers = (df[df[column].str.contains(string_search)==True])
    headers_index_list = headers.index.values.tolist()
    
    print(f"{len(headers_index_list)} rows are headers") 
    
    df2 = df.drop(headers_index_list).reset_index(drop  = True)
    return df2

In [25]:
def row_threshold(df):
    """
    Some PDFS are merged/formatted where there are many
    rows full of NA. Delete them out.
    """
    # Delete rows containing either 75% or more than 75% NaN Values
    # https://thispointer.com/pandas-drop-dataframe-rows-based-on-nan-percentage/
    perc = 90.0 # 
    min_count =  int(((100-perc)/100)*df.shape[1] + 1)
    
    df2 = df.dropna(axis=0, thresh=min_count).reset_index(drop = True)
    
    print(f"dropped {len(df)-len(df2)} rows that were filled with NA")
    
    return df2

In [26]:
def scag_lrtp():
    sheets_list = ["Table 1", "Table 2"]
    dict_df = pd.read_excel(f"{harmonization_utils.GCS_FILE_PATH}LRTP/SCAG.xlsx", sheet_name=sheets_list)
    
    df1 = to_snakecase(dict_df.get("Table 1"))
    # Drop columns where everything is NA
    df1 = df1.dropna(axis=1, how='all')
    
    df2 = to_snakecase(dict_df.get("Table 2"))
    df2 = df2.dropna(axis=1, how='all')
    
    df3 = pd.concat([df1,df2])
    
    df4 = delete_embedded_headers(df3, "rtp_id", "RTP ID")
    
    df4 = row_threshold(df4) 
    
    # No project title
    df4['project_title'] = df4.description
    
    # Project cost is in thousands.
    # Multiple out
    df4["project_cost__$1,000s_"] = df4["project_cost__$1,000s_"]* 1_000
    return df4

In [27]:
scag = scag_lrtp()

360 rows are headers
dropped 307 rows that were filled with NA


In [28]:
scag.shape

(3579, 12)

In [62]:
scag.head()

Unnamed: 0,system,lead_agency,rtp_id,route_#,route_name,from,to,description,completion_year,"project_cost__$1,000s_",county,project_title
0,LOCAL HIGHWAY,BRAWLEY,7120001-IMP160407,0,,,,PAVING OF DIRT ROAD WILDCAT DRIVE FROM S. WESTERN AVENUE TO S. FIRST STREET,2019,1008000,,PAVING OF DIRT ROAD WILDCAT DRIVE FROM S. WESTERN AVENUE TO S. FIRST STREET
1,LOCAL HIGHWAY,BRAWLEY,7120001-IMP161004,0,,,,"ASPHALT REHABILITATION ALONG RIO VISTA AVENUE BETWEEN ALLEN STREET AND CATTLE CALL DRIVE, IN THE CITY OF BRAWLEY, IMPERIAL COUNTY.",2018,109000,,"ASPHALT REHABILITATION ALONG RIO VISTA AVENUE BETWEEN ALLEN STREET AND CATTLE CALL DRIVE, IN THE CITY OF BRAWLEY, IMPERIAL COUNTY."
2,LOCAL HIGHWAY,CALEXICO,6OM0701- IMP091001,0,,,,"WIDEN AND IMPROVE CESAR CHAVEZ BLVD. TO 5 LANES (3+2) FROM 2ND STREET TO SR 98. OTHER IMPROVEMENTS INCLUDE: SURFACE REHAB, TURN LANES, TRAFFIC SIGNAL, LIGHTING, AND SIDEWALKS",2019,9896000,,"WIDEN AND IMPROVE CESAR CHAVEZ BLVD. TO 5 LANES (3+2) FROM 2ND STREET TO SR 98. OTHER IMPROVEMENTS INCLUDE: SURFACE REHAB, TURN LANES, TRAFFIC SIGNAL, LIGHTING, AND SIDEWALKS"
3,LOCAL HIGHWAY,CALEXICO,7120001-IMP160408,0,,,,"PAVING OF DIRT ROAD DE LAS FLORES STREET BETWEEN EADY AVENUE AND KLOKE AVENUE TO INCLUDE THE INSTALLATION OF CURB, GUTTER AND SIDEWALKS",2019,466000,,"PAVING OF DIRT ROAD DE LAS FLORES STREET BETWEEN EADY AVENUE AND KLOKE AVENUE TO INCLUDE THE INSTALLATION OF CURB, GUTTER AND SIDEWALKS"
4,LOCAL HIGHWAY,CALEXICO,7120004-IMP090701,0,,,,"DEVELOP BICYCLE PATHS AND PUBLIC PARK SPACE ADJACENT TO THE NEW\nRIVER, CALEXICO (HPP TEA-LU PROJECT #3092)",2019,4000000,,"DEVELOP BICYCLE PATHS AND PUBLIC PARK SPACE ADJACENT TO THE NEW\nRIVER, CALEXICO (HPP TEA-LU PROJECT #3092)"


In [29]:
def harmonize_scag():
    df = scag_lrtp()
    df = harmonizing_lrtp(
        df,
        project_name_col="project_title",
        project_description_col="description",
        project_category_col="system",
        project_cost_col="project_cost__$1,000s_",
        location_col="",
        county_col="county",
        city_col="",
        project_year_col="completion_year",
        program="SCAG",
        phase_col="",
        data_source="LRTP",
        note_cols=[
            "lead_agency",
            "route_#",
            "route_name",
            "from",
            "to"
        ],
        cost_in_millions=True,
    )

    return df

In [30]:
scag_harmonized = harmonize_scag()

360 rows are headers
dropped 307 rows that were filled with NA




In [31]:
scag.head()

Unnamed: 0,system,lead_agency,rtp_id,route_#,route_name,from,to,description,completion_year,"project_cost__$1,000s_",county,project_title
0,LOCAL HIGHWAY,BRAWLEY,7120001-IMP160407,0,,,,PAVING OF DIRT ROAD WILDCAT DRIVE FROM S. WESTERN AVENUE TO S. FIRST STREET,2019,1008000,,PAVING OF DIRT ROAD WILDCAT DRIVE FROM S. WESTERN AVENUE TO S. FIRST STREET
1,LOCAL HIGHWAY,BRAWLEY,7120001-IMP161004,0,,,,"ASPHALT REHABILITATION ALONG RIO VISTA AVENUE BETWEEN ALLEN STREET AND CATTLE CALL DRIVE, IN THE CITY OF BRAWLEY, IMPERIAL COUNTY.",2018,109000,,"ASPHALT REHABILITATION ALONG RIO VISTA AVENUE BETWEEN ALLEN STREET AND CATTLE CALL DRIVE, IN THE CITY OF BRAWLEY, IMPERIAL COUNTY."
2,LOCAL HIGHWAY,CALEXICO,6OM0701- IMP091001,0,,,,"WIDEN AND IMPROVE CESAR CHAVEZ BLVD. TO 5 LANES (3+2) FROM 2ND STREET TO SR 98. OTHER IMPROVEMENTS INCLUDE: SURFACE REHAB, TURN LANES, TRAFFIC SIGNAL, LIGHTING, AND SIDEWALKS",2019,9896000,,"WIDEN AND IMPROVE CESAR CHAVEZ BLVD. TO 5 LANES (3+2) FROM 2ND STREET TO SR 98. OTHER IMPROVEMENTS INCLUDE: SURFACE REHAB, TURN LANES, TRAFFIC SIGNAL, LIGHTING, AND SIDEWALKS"
3,LOCAL HIGHWAY,CALEXICO,7120001-IMP160408,0,,,,"PAVING OF DIRT ROAD DE LAS FLORES STREET BETWEEN EADY AVENUE AND KLOKE AVENUE TO INCLUDE THE INSTALLATION OF CURB, GUTTER AND SIDEWALKS",2019,466000,,"PAVING OF DIRT ROAD DE LAS FLORES STREET BETWEEN EADY AVENUE AND KLOKE AVENUE TO INCLUDE THE INSTALLATION OF CURB, GUTTER AND SIDEWALKS"
4,LOCAL HIGHWAY,CALEXICO,7120004-IMP090701,0,,,,"DEVELOP BICYCLE PATHS AND PUBLIC PARK SPACE ADJACENT TO THE NEW\nRIVER, CALEXICO (HPP TEA-LU PROJECT #3092)",2019,4000000,,"DEVELOP BICYCLE PATHS AND PUBLIC PARK SPACE ADJACENT TO THE NEW\nRIVER, CALEXICO (HPP TEA-LU PROJECT #3092)"


In [32]:
scag_harmonized.head()

Unnamed: 0,project_title,lead_agency,project_year,project_category,grant_program,project_description,phase,total_project_cost,geometry,city,county,data_source,notes
0,PAVING OF DIRT ROAD WILDCAT DRIVE FROM S. WESTERN AVENUE TO S. FIRST STREET,SCAG,2019,LOCAL HIGHWAY,SCAG LRTP,PAVING OF DIRT ROAD WILDCAT DRIVE FROM S. WESTERN AVENUE TO S. FIRST STREET,,1.01,,,,LRTP,"lead agency: SCAG, route #: 0, route name: nan, from: nan, to: nan"
1,"ASPHALT REHABILITATION ALONG RIO VISTA AVENUE BETWEEN ALLEN STREET AND CATTLE CALL DRIVE, IN THE CITY OF BRAWLEY, IMPERIAL COUNTY.",SCAG,2018,LOCAL HIGHWAY,SCAG LRTP,"ASPHALT REHABILITATION ALONG RIO VISTA AVENUE BETWEEN ALLEN STREET AND CATTLE CALL DRIVE, IN THE CITY OF BRAWLEY, IMPERIAL COUNTY.",,0.11,,,,LRTP,"lead agency: SCAG, route #: 0, route name: nan, from: nan, to: nan"
2,"WIDEN AND IMPROVE CESAR CHAVEZ BLVD. TO 5 LANES (3+2) FROM 2ND STREET TO SR 98. OTHER IMPROVEMENTS INCLUDE: SURFACE REHAB, TURN LANES, TRAFFIC SIGNAL, LIGHTING, AND SIDEWALKS",SCAG,2019,LOCAL HIGHWAY,SCAG LRTP,"WIDEN AND IMPROVE CESAR CHAVEZ BLVD. TO 5 LANES (3+2) FROM 2ND STREET TO SR 98. OTHER IMPROVEMENTS INCLUDE: SURFACE REHAB, TURN LANES, TRAFFIC SIGNAL, LIGHTING, AND SIDEWALKS",,9.9,,,,LRTP,"lead agency: SCAG, route #: 0, route name: nan, from: nan, to: nan"
3,"PAVING OF DIRT ROAD DE LAS FLORES STREET BETWEEN EADY AVENUE AND KLOKE AVENUE TO INCLUDE THE INSTALLATION OF CURB, GUTTER AND SIDEWALKS",SCAG,2019,LOCAL HIGHWAY,SCAG LRTP,"PAVING OF DIRT ROAD DE LAS FLORES STREET BETWEEN EADY AVENUE AND KLOKE AVENUE TO INCLUDE THE INSTALLATION OF CURB, GUTTER AND SIDEWALKS",,0.47,,,,LRTP,"lead agency: SCAG, route #: 0, route name: nan, from: nan, to: nan"
4,"DEVELOP BICYCLE PATHS AND PUBLIC PARK SPACE ADJACENT TO THE NEW\nRIVER, CALEXICO (HPP TEA-LU PROJECT #3092)",SCAG,2019,LOCAL HIGHWAY,SCAG LRTP,"DEVELOP BICYCLE PATHS AND PUBLIC PARK SPACE ADJACENT TO THE NEW\nRIVER, CALEXICO (HPP TEA-LU PROJECT #3092)",,4.0,,,,LRTP,"lead agency: SCAG, route #: 0, route name: nan, from: nan, to: nan"


### Sacramento Area Council of Governments (SACOG)
* https://www.sacog.org/sites/main/files/file-attachments/2020_mtp-scs.pdf?1580330993

In [33]:
# sacog = to_snakecase(gpd.read_file("https://services.sacog.org/hosting/rest/services/Hosted/2040_Major_Stops/FeatureServer/0/query?outFields=*&where=1%3D1&f=geojson"))

In [34]:
def sacog_lrtp():
    df = to_snakecase(pd.read_excel(f"{harmonization_utils.GCS_FILE_PATH}LRTP/SACOG.xlsx"))
    
    # Delete embedded headers
    df = delete_embedded_headers(df, "description", "Description")
    
    # Delete columns with all nulls
    df = df.dropna(axis=1, how='all')
    
    # df.total_project_cost__2018_dollars_ = df.total_project_cost__2018_dollars_.apply(pd.to_numeric, errors="coerce")
    return df

In [35]:
sacog = sacog_lrtp()

96 rows are headers


In [36]:
sacog.shape

(1604, 10)

In [37]:
def harmonize_sacog():
    df = sacog_lrtp()
    df = harmonizing_lrtp(
        df,
        project_name_col="title",
        project_description_col="description",
        project_category_col="budget_category",
        project_cost_col="total_project_cost__2018_dollars_",
        location_col="",
        county_col="county",
        city_col="",
        project_year_col="completion_timing",
        program="SACOG",
        phase_col="",
        data_source="LRTP",
        note_cols=[
            "year_of_expenditure_cost_for_planned_projects",
            "status__planned,_programmed_or_project_development_only_",
        ],
        cost_in_millions=True,
    )

    return df

In [38]:
sacog_harmonized = harmonize_sacog()

96 rows are headers




In [39]:
sacog_harmonized.head()

Unnamed: 0,project_title,lead_agency,project_year,project_category,grant_program,project_description,phase,total_project_cost,geometry,city,county,data_source,notes
0,Aux Lane Project: EB Latrobe Road,SACOG,Post-2040,"G- System Management,\nOperations, and ITS",SACOG LRTP,US-50 EB Latrobe Rd to Silva Valley (T); US 50,,1.5,,,ELD,LRTP,"year of expenditure cost for planned projects: nan, status planned, programmed or project development only : Project Development Only"
1,Cameron Park Drive to Ponderosa Road,SACOG,Post-2040,B- Road & Highway Capacity,SACOG LRTP,"Managed Lane facility - Phase 2B (project description may change based on results from the Managed Lanes Study. Project is being evaluated for Expressed Toll Lanes, High Occupancy Toll Lanes, HOV lanes)",,22.64,,,ELD,LRTP,"year of expenditure cost for planned projects: nan, status planned, programmed or project development only : Project Development Only"
2,EB Bass Lake Rd. Diagonal Ramp Meter,SACOG,2031-2035,"G- System Management,\nOperations, and ITS",SACOG LRTP,EB Bass Lake Rd. Diagonal Ramp Meter,,0.38,,,ELD,LRTP,"year of expenditure cost for planned projects: 0.53693, status planned, programmed or project development only : Planned"
3,EB Cambridge Rd. Loop Ramp Meter,SACOG,2031-2035,"G- System Management,\nOperations, and ITS",SACOG LRTP,EB Cambridge Rd. Loop Ramp Meter,,0.38,,,ELD,LRTP,"year of expenditure cost for planned projects: 0.53693, status planned, programmed or project development only : Planned"
4,EB Cameron Park Dr. Diagonal Ramp Meter,SACOG,2031-2035,"G- System Management,\nOperations, and ITS",SACOG LRTP,EB Cameron Park Dr. Diagonal Ramp Meter,,0.38,,,ELD,LRTP,"year of expenditure cost for planned projects: 0.53693, status planned, programmed or project development only : Planned"


### Lost

In [40]:
lost = harmonization_utils.load_lost()

In [41]:
lost.head()

Unnamed: 0,measure,local_returns,project_category,location,project_title,cost__in_millions_,estimated_lost_funds,estimated_federal_funds,estimated_state_funds,estimated_local_funds,estimated_other_funds,project_description,agency,county,city,notes
0,alameda_b_2000,No,,,"AC Transit Quality Bus - San Pablo, Telegraph Corridors",20.0,20.0,0.0,0.0,0,0.0,,AC Transit,alameda,,
1,alameda_b_2000,No,,,Altamont Commuter Express Rail,10.0,10.0,0.0,0.0,0,0.0,,Alameda County CMA,alameda,,
2,alameda_b_2000,No,,,BART Extension to South Fremont,563.3,165.5,0.0,0.0,0,0.0,,BART,alameda,,
3,alameda_b_2000,No,,,BART Oakland Airport Connector,130.0,65.8,0.0,0.0,0,0.0,,BART,alameda,,
4,alameda_b_2000,No,,,Downtown Oakland Streetscape Improvement Project,5.0,5.0,0.0,0.0,0,0.0,,City of Oakland,alameda,,


In [42]:
def harmonize_lost():
    df = harmonization_utils.load_lost()
    df = harmonizing_lrtp(
        df,
        project_name_col="project_title",
        project_description_col="project_description",
        project_category_col="project_category",
        project_cost_col="cost__in_millions_",
        location_col="",
        county_col="county",
        city_col="city",
        project_year_col="",
        program="",
        phase_col="phase",
        data_source="LOST",
        note_cols=["measure", 'estimated_lost_funds',
       'estimated_federal_funds', 'estimated_state_funds',
       'estimated_local_funds', 'estimated_other_funds',"notes"],
        cost_in_millions=False,
    )
    
    
    return df

In [43]:
lost_harmonized = harmonize_lost()



In [44]:
lost_harmonized.head()

Unnamed: 0,project_title,lead_agency,project_year,project_category,grant_program,project_description,phase,total_project_cost,geometry,city,county,data_source,notes
0,"AC Transit Quality Bus - San Pablo, Telegraph Corridors",,,,LOST,,,20.0,,,alameda,LOST,"measure: alameda b 2000, estimated lost funds: 20.0, estimated federal funds: 0.0, estimated state funds: 0.0, estimated local funds: 0, estimated other funds: 0.0, notes: nan"
1,Altamont Commuter Express Rail,,,,LOST,,,10.0,,,alameda,LOST,"measure: alameda b 2000, estimated lost funds: 10.0, estimated federal funds: 0.0, estimated state funds: 0.0, estimated local funds: 0, estimated other funds: 0.0, notes: nan"
2,BART Extension to South Fremont,,,,LOST,,,563.3,,,alameda,LOST,"measure: alameda b 2000, estimated lost funds: 165.5, estimated federal funds: 0.0, estimated state funds: 0.0, estimated local funds: 0, estimated other funds: 0.0, notes: nan"
3,BART Oakland Airport Connector,,,,LOST,,,130.0,,,alameda,LOST,"measure: alameda b 2000, estimated lost funds: 65.8, estimated federal funds: 0.0, estimated state funds: 0.0, estimated local funds: 0, estimated other funds: 0.0, notes: nan"
4,Downtown Oakland Streetscape Improvement Project,,,,LOST,,,5.0,,,alameda,LOST,"measure: alameda b 2000, estimated lost funds: 5.0, estimated federal funds: 0.0, estimated state funds: 0.0, estimated local funds: 0, estimated other funds: 0.0, notes: nan"


### Concat

In [45]:
def all_mpo():
    tahoe = harmonize_tahoe()
    sandag = harmonize_sandag()
    mtc = harmonize_mtc()
    lost = harmonize_lost()
    sacog = harmonize_sacog()
    scag = harmonize_scag()
    
    df = pd.concat([tahoe, mtc, sandag, lost, sacog, scag])

    return df

In [46]:
all_mpo_df = all_mpo()



96 rows are headers




360 rows are headers
dropped 307 rows that were filled with NA




In [47]:
all_mpo_df.groupby(["grant_program"]).size()

grant_program
 LOST          1849
MTC LRTP        282
SACOG LRTP     1604
SANDAG LRTP     416
SCAG LRTP      3579
TMPO LRTP        75
dtype: int64

In [63]:
congest_keywords = [
    "congestion pricing",
    "variablly priced",
    "variable tolls",
    "express lane",
    "value",
    "rush hour",
    "cordon",
    "dynamic pricing",
    "dynmically priced",
    "high occupancy toll",
    "high-occupancy"
    "mobility pricing",
    "occupancy",
    "toll lane",
    "performance pricing",
    "peak travel",
    "managed lane",
    "tollway",
    "transponder",
    "express toll",
    "fixed pricing"
]

In [64]:
def lower_case(df, columns_to_search: list):
    """
    Lowercase certain columns
    """
    new_columns = []
    for i in columns_to_search:
        df[f"lower_case_{i}"] = (df[i].str.lower().fillna("none")
                                 .str.replace("-","")
                                 .str.replace(".","")
                                 .str.replace(":","")
                                )
        new_columns.append(f"lower_case_{i}")

    return df,new_columns

In [65]:
def find_keywords(df, columns_to_search:list, keywords_search:list):
    df2, lower_case_cols_list = lower_case(df, columns_to_search)
    
    keywords_search = f"({'|'.join(keywords_search)})"
    
    for i in lower_case_cols_list:
        df2[f"{i}_keyword_search"] = df2[i].str.extract(keywords_search).fillna("not congestion")
    
    return df2

In [66]:
# test = find_keywords(all_mpo_df,  ["project_title", "project_description", "notes"], congest_keywords)

In [67]:
def filter_cordon(df, columns_to_search:list, keywords_search:list, save_to_gcs: bool = False):

    # Filter out for Cordon 
    df = find_keywords(df, columns_to_search, keywords_search)
    df2 = (df[(df.lower_case_project_title_keyword_search != "not congestion")
        | (df.lower_case_project_description_keyword_search != "not congestion")]).reset_index(drop=True)
    
    # Delete out HOV
    projects_to_delete = ["SR 17 Corridor Congestion Relief in Los Gatos",
                         "Interstate 380 Congestion Improvements"]
    df2 = df2[~df2.project_title.isin(projects_to_delete)].reset_index(drop = True)

    # Drop invalid geometries
    gdf = df2[~df2.geometry.is_empty].reset_index(drop = True)
    gdf = gdf[gdf.geometry.is_valid].reset_index(drop = True)
    
    gdf = gdf.set_geometry("geometry")
    gdf = gdf.fillna(gdf.dtypes.replace({"float64": 0.0, "object": "None"}))

    # One version that's a df
    columns_to_drop = ['lower_case_project_title', 'lower_case_project_description']
    df2 = df2.drop(columns = columns_to_drop + ['geometry'])
    # df2 = df2.fillna(df.dtypes.replace({"float64": 0.0, "object": "None"}))
    
    if save_to_gcs:
        df2.to_excel(f"{harmonization_utils.GCS_FILE_PATH}LRTP/congestion_pricing_sample.xlsx", sheet_name="Sheet_name_1", index=False)
        gdf.to_file("./congestion_pricing_sample.geojson", driver="GeoJSON")
        
    return gdf, df2

In [68]:
cordon_gdf, cordon_df = filter_cordon(all_mpo_df, ["project_title", "project_description",], congest_keywords)



In [69]:
len(cordon_df.drop_duplicates(subset = ['project_title']))

117

In [70]:
cordon_df.groupby(["grant_program"]).agg({"project_title":"nunique"})

Unnamed: 0_level_0,project_title
grant_program,Unnamed: 1_level_1
LOST,5
MTC LRTP,4
SACOG LRTP,25
SANDAG LRTP,46
SCAG LRTP,36
TMPO LRTP,1


In [71]:
cordon_df.grant_program.value_counts()

SANDAG LRTP    46
MTC LRTP       37
SCAG LRTP      36
SACOG LRTP     25
 LOST           5
TMPO LRTP       1
Name: grant_program, dtype: int64

In [72]:
cordon_df.drop_duplicates(subset = ['total_project_cost','project_title',]).groupby(["grant_program"]).agg({'total_project_cost':'sum'})

Unnamed: 0_level_0,total_project_cost
grant_program,Unnamed: 1_level_1
LOST,114.31
MTC LRTP,1904.9
SACOG LRTP,2570.77
SANDAG LRTP,0.0
SCAG LRTP,6216.07
TMPO LRTP,0.0


In [76]:
cols_to_drop = ['city','county', 'notes', 'phase','project_year',]

In [77]:
cordon_df.drop(columns =cols_to_drop).drop_duplicates(subset = ['project_title', 'project_description'])

Unnamed: 0,project_title,lead_agency,project_category,grant_program,project_description,total_project_cost,data_source,lower_case_project_title_keyword_search,lower_case_project_description_keyword_search
0,Adaptive Traffic Management on US 50,TMPO,Technology & Transportation System Management,TMPO LRTP,"This is a coordinated project between El Dorado County, TRPA, Caltrans, and the City of South Lake Tahoe to adaptively manage the Hwy 50 basin entry. Examples of Adaptive Traffic Management strategies include bus only lanes or modified lane configurations during peak travel times. Through improved operations of basin entry roads, user satisfaction improves, congestion decreases, transit use increases, and impacts to the environment (including GHG emissions) are expected to decrease.",,LRTP,not congestion,peak travel
1,Express Lanes | Regional,MTC,Express Lanes and Bus,MTC LRTP,Express Lanes: I-880 NB and SB,141.56,LRTP,express lane,express lane
2,Express Lanes | Regional,MTC,Express Lanes and Bus,MTC LRTP,Express Lanes: I-680 NB,141.56,LRTP,express lane,express lane
3,Express Lanes | Regional,MTC,Express Lanes and Bus,MTC LRTP,Express Lanes: I-680 SB,141.56,LRTP,express lane,express lane
5,Express Lanes | Regional,MTC,Express Lanes and Bus,MTC LRTP,Express Lanes: I-80 NB and SB,141.56,LRTP,express lane,express lane
7,Express Lanes | Regional,MTC,Express Lanes and Bus,MTC LRTP,Express Lanes: I-580 EB and WB,141.56,LRTP,express lane,express lane
8,Express Lanes | Regional,MTC,Express Lanes and Bus,MTC LRTP,Express Lanes: I-880 NB,141.56,LRTP,express lane,express lane
11,Express Lanes | Regional,MTC,Express Lanes and Bus,MTC LRTP,Express Lanes: SR-92 WB,141.56,LRTP,express lane,express lane
12,Express Lanes | Regional,MTC,Express Lanes and Bus,MTC LRTP,Express Lanes: SR-84 WB,141.56,LRTP,express lane,express lane
13,Express Lanes | Regional,MTC,Express Lanes and Bus,MTC LRTP,Express Lanes: I-80 WB,141.56,LRTP,express lane,express lane


In [78]:
# cordon_gdf.explore('project_title', cmap= 'tab20c', style_kwds={'weight':6}, legend = False)