## Long Range Transportation Plan

In [1]:
import _harmonization_utils as harmonization_utils
import geopandas as gpd
import numpy as np
import pandas as pd
from calitp_data_analysis.sql import to_snakecase
from shared_utils import geography_utils, utils


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [70]:
LRTP_GCS = f"{harmonization_utils.GCS_FILE_PATH}LRTP/"

### Project Cost Division

In [9]:
def correct_project_cost(df, project_title_col:str, project_total_cost:str):
    """
    Split a project's total cost evenly across the rows it occupies.

    For some datasets, the same project (as determined by the same
    project name and cost) is split across multiple rows.

    Ex: A project costs $500 million and is split on 5 rows by
    phase/location. Each row still lists the total cost as $500 million,
    which is not accurate. This function recalculates each row to list
    $100 million as the total project cost.

    Args:
        df: DataFrame/GeoDataFrame with one row per project piece.
            It is NOT modified; a new frame is returned.
        project_title_col: name of the column holding the project title.
        project_total_cost: name of the (numeric) column holding the
            cost that is repeated on every row of the project.

    Returns:
        A copy of `df` with a fresh 0..n-1 index and rows in their
        original order, where `project_total_cost` is removed and two
        columns are appended: `total_project_cost` (the divided cost)
        and `total_project_cost_note` (explains the estimate).
    """
    # Work on a re-indexed copy so the caller's frame is left untouched and
    # duplicate index labels (e.g. after pd.concat) cannot cause misalignment.
    out = df.reset_index(drop=True)

    # Rows belong to the same project when both title and listed cost match.
    project_key = out[project_title_col] + out[project_total_cost].astype(str)

    # How many rows each project is spread across.
    rows_per_project = project_key.map(project_key.value_counts())

    divided_cost = out[project_total_cost] / rows_per_project

    # Replace the repeated total with the per-row share.
    out = out.drop(columns=[project_total_cost])
    out["total_project_cost"] = divided_cost
    out["total_project_cost_note"] = "This is an estimate of how much the project cost, estimated by dividing the total project cost by how many times the project appears in the dataset."

    return out

### Ambag

### BCAG 

In [72]:
def bcag_lrtp(file_name):
    """
    Load the BCAG project list (BCAG.xls) from the LRTP folder and
    attach a point geometry built from its coordinate columns.

    The year columns, 'EXEMPT CODE' and the 'PM' columns are dropped
    before the headers are snake-cased. `cost_estimate` is multiplied
    by 1,000 and `fund_estimate` is coerced to numeric and multiplied
    by 1,000,000.
    """
    unused_columns = [
        2018, 2020, 2030, 2035, 2040,
        'EXEMPT CODE', 'PM 1', 'PM 2', 'PM 3',
    ]

    raw = pd.read_excel(f"{LRTP_GCS}{file_name}")

    # Drop first: some of these headers are integers, not strings.
    df = to_snakecase(raw.drop(columns=unused_columns))

    df["cost_estimate"] = df["cost_estimate"] * 1_000
    df["fund_estimate"] = (
        df["fund_estimate"].apply(pd.to_numeric, errors="coerce") * 1_000_000
    )

    # Points from the x/y coordinate columns
    df["geometry"] = gpd.GeoSeries.from_xy(
        df.x_coord, df.y_coord, crs=geography_utils.WGS84
    )

    return df.set_geometry("geometry")

In [73]:
bcag = bcag_lrtp("BCAG.xls")

In [74]:
type(bcag)

geopandas.geodataframe.GeoDataFrame

In [225]:
bcag.shape

(250, 23)

In [226]:
bcag.sample()

Unnamed: 0,rtp_id,agency,ftip,rtp,project_type,title,project_descr,project_id,fund_source,fund_estimate,status,y_coord,x_coord,target_fiscal_year,cost_estimate,fund_source_1,fund_source_2,source,title_vi,non_title_vi,ej,non_ej,geometry
101,102,Chico,-,RTP,Bicycle & Pedestrian,Humboldt Rd Class 1,From Morning Rose Way to Bruce Rd. Class 1 bike facility (0.51 miles),CH-BIKE-LOCAL-2020-5,LOCAL,,Planned,39.74,-121.8,2025,305000.0,LOCAL,-,2019 City of Chico Bike Plan,Y,Y,Y,Y,POINT (-121.79612 39.73992)


In [228]:
bcag.title.value_counts().head()

Local Highway Bridge Program (HBP Grouped)            68
Highway Safety Improvement Program (HSIP Grouped)     19
SR 70 Passing Lanes (Segment 3)                        2
Eaton Rd Widening                                      2
Lincoln Blvd Class 2                                   2
Name: title, dtype: int64

In [232]:
# bcag[bcag.title == "Local Highway Bridge Program (HBP Grouped)"]

In [227]:
bcag.title.nunique()

153

### Fresno COG

In [221]:
def fresnocog_lrtp():
    """
    Load the Fresno COG unconstrained and constrained project lists and
    stack them into one DataFrame.

    Each row is tagged in `financial_constraint` with the list it came
    from. The stacked frame gets a fresh 0..n-1 index so that row labels
    are unique across the two workbooks.
    """
    columns_to_drop = ['unnamed:_7', 'unnamed:_8',
       'unnamed:_9']

    # workbook -> label stored in `financial_constraint`
    workbooks = {
        "FRESNO_COG_UNconstrained.xlsx": 'unconstrained',
        "FRESNO_COG_Constrained.xlsx": 'constrained',
    }

    frames = []
    for workbook, constraint in workbooks.items():
        frame = to_snakecase(pd.read_excel(f"{LRTP_GCS}{workbook}"))
        frame['financial_constraint'] = constraint
        frames.append(frame)

    # ignore_index avoids duplicate row labels, which would make any later
    # label-based drop/loc hit rows from both workbooks.
    concat1 = pd.concat(frames, axis=0, ignore_index=True)
    concat1 = concat1.drop(columns = columns_to_drop)

    return concat1

In [222]:
fresno_cog = fresnocog_lrtp()

In [223]:
fresno_cog.shape

(3147, 10)

In [224]:
fresno_cog.head()

Unnamed: 0,agency,project_id,project_title,project_description,project_type,estimated_open_to_traffic,"estimated\ntotal_cost\n_$1,000_",est__o2t_year,est_total_project_cost,financial_constraint
0,Caltrans,FRE504146,SR 180 W-I-5 to Junction SR 33/SR180: 2 Lane on E-W Alignment,2 Lane on New E-W Alignment,"Bike & Ped, Streets & Roads-Maintenance, Streets & Roads-Operations",2032-2036,119000.0,2035-01-01,119000000.0,unconstrained
1,Clovis,FRE500471,Ashlan-Highland to Thompson: 2 LU to 4 LD,"2LU to 4LD, Sidewalks, Bike Lanes, Street Lights, Curb and Gutter, Utility Relocation, Fiber Optics, Traffic Signal at Ashlan and Highland",Streets & Roads - Capacity Increasing,2022-2026,5009.0,2024-01-01,5009000.0,unconstrained
2,Clovis,FRE501601,Alluvial - Fowler to Armstrong : Maintenance,Light Maintenance,Streets & Roads-Maintenance,2032-2036,171.0,2036-01-01,171000.0,unconstrained
3,Clovis,FRE501604,Armstrong - Bullard to Barstow : Maintenance,Light Maintenance,Streets & Roads-Maintenance,2042-2046,481.0,2043-01-01,481000.0,unconstrained
4,Clovis,FRE501605,Armstrong - Herndon to Tollhouse : Maintenance,Light Maintenance,Streets & Roads-Maintenance,2042-2046,85.0,2044-01-01,85000.0,unconstrained


### KCAG

In [75]:
def kcag_lrtp(file_name):
    """
    Read the KCAG workbook (KCAG.xlsx) from the LRTP folder and
    return it with snake-cased column names.
    """
    workbook_path = f"{LRTP_GCS}{file_name}"
    raw = pd.read_excel(workbook_path)
    return to_snakecase(raw)

In [76]:
kcag = kcag_lrtp("KCAG.xlsx")

### MCAGOV

In [143]:
def mcagov_lrtp(file_name):
    """
    Read the MCAGOV workbook (MCAGOV.xlsx) from the LRTP folder,
    snake-case its column names and keep only rows that have a title.
    """
    raw = pd.read_excel(f"{LRTP_GCS}{file_name}")
    projects = to_snakecase(raw)

    # Rows without a title are dropped; the index is renumbered.
    has_title = projects.dropna(subset = ['title'])
    return has_title.reset_index(drop = True)

In [145]:
mcagov = mcagov_lrtp("MCAGOV.xlsx")

### MTC
* The project list has 133 projects, but the point and line map layers together return roughly 240 rows.
* Projects in different locations can share the same name and differ only by opening year.
* The geospatial layers appear to split each project into phases, while the tabular project list has one row for the project in its entirety.
* https://www.planbayarea.org/2050-plan/final-plan-bay-area-2050/final-supplemental-reports/interactive-transportation-project-list
* https://opendata.mtc.ca.gov/datasets/MTC::2023-transportation-improvement-program-projects-point/about
* https://opendata.mtc.ca.gov/datasets/MTC::plan-bay-area-2050-transportation-projects-line/about
* https://data.bayareametro.gov/Projects/Plan-Bay-Area-2050-Transportation-Project-List/y7ka-jksz

In [10]:
def mtc_lrtp():
    """
    Build the MTC (Plan Bay Area 2050) project table.

    Steps:
      1. Read the line and point map layers from the ArcGIS REST server.
      2. Read the full tabular project list from data.bayareametro.gov.
      3. Attach the tabular attributes to the mapped rows by title.
      4. Append projects that have no map feature (no geometry).
      5. Divide each project's cost across the rows it occupies
         (see `correct_project_cost`).

    Returns:
        GeoDataFrame in WGS84. The cost column `funding_millions_yoe`
        is replaced by `total_project_cost` / `total_project_cost_note`.
    """
    # Open rest server data
    layer_list = ["planbayarea2050_rtp_line", "planbayarea2050_rtp_point"]
    url_pt_1 = "https://services3.arcgis.com/i2dkYWmb4wHvYPda/ArcGIS/rest/services/"
    url_pt_2 = "/FeatureServer/0/query?where=1%3D1&objectIds=&time=&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&resultType=none&distance=0.0&units=esriSRUnit_Meter&relationParam=&returnGeodetic=false&outFields=*+&returnGeometry=true&featureEncoding=esriDefault&multipatchOption=xyFootprint&maxAllowableOffset=&geometryPrecision=&outSR=&defaultSR=&datumTransformation=&applyVCSProjection=false&returnIdsOnly=false&returnUniqueIdsOnly=false&returnCountOnly=false&returnExtentOnly=false&returnQueryGeometry=false&returnDistinctValues=false&cacheHint=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&having=&resultOffset=&resultRecordCount=&returnZ=false&returnM=false&returnExceededLimitFeatures=true&quantizationParameters=&sqlFormat=none&f=pgeojson&token="
    gdf = open_rest_server(url_pt_1, url_pt_2, layer_list)

    # Open all MTC projects. Not all projects are mappable.
    all_projects_url = "https://data.bayareametro.gov/resource/y7ka-jksz.json?$query=SELECT%0A%20%20%60plan_strategy%60%2C%0A%20%20%60rtpid%60%2C%0A%20%20%60title%60%2C%0A%20%20%60scope%60%2C%0A%20%20%60open_period%60%2C%0A%20%20%60funding_millions_yoe%60%2C%0A%20%20%60county%60"
    all_projects = pd.read_json(all_projects_url)

    # Merge info from all projects to gdf
    gdf = pd.merge(
        gdf, all_projects, how="inner", left_on=["proj_title"], right_on=["title"]
    )

    gdf = gdf.drop(columns=["title"])

    # Find projects that are unmappable
    mtc_geometry_titles = set(gdf.proj_title.unique().tolist())
    mtc_all_projects_titles = set(all_projects.title.unique().tolist())
    missing_projects = list(mtc_all_projects_titles - mtc_geometry_titles)

    # Add back in the unmappable projects
    all_projects = all_projects[all_projects.title.isin(missing_projects)].reset_index(
        drop=True
    )
    all_projects = all_projects.rename(columns={"title": "proj_title"})
    final = pd.concat([all_projects, gdf])

    # NOTE: the truncated column name `mode_detai` is kept on purpose.
    # A previous `final.rename({"mode detai": "mode_detail"})` here had no
    # effect (it targeted index labels and a key that does not exist), and
    # harmonize_mtc reads the column under its truncated name.

    # Set geometry again
    final = final.set_geometry("geometry").set_crs(geography_utils.WGS84)

    # Same project is split across multiple rows. Divide out project cost
    final = correct_project_cost(final,'proj_title', 'funding_millions_yoe')
    return final

In [11]:
mtc_og = mtc_lrtp()



In [12]:
mtc_og.columns

Index(['plan_strategy', 'rtpid', 'proj_title', 'scope', 'open_period',
       'county', 'objectid', 'map_id', 'subcompone', 'open_year', 'strategy',
       'rtp_id', 'mode_', 'mode_detai', 'investment', 'shape__length',
       'geometry', 'layer_name', 'total_project_cost',
       'total_project_cost_note'],
      dtype='object')

In [13]:
def harmonize_mtc():
    """
    Load the MTC projects and map them onto the shared LRTP schema
    via `harmonizing_lrtp`. Costs are passed through without dividing
    by one million (`cost_in_millions=False`).
    """
    # Which MTC column feeds each harmonized field ("" = not available)
    column_map = {
        "project_name_col": "proj_title",
        "project_description_col": "subcompone",
        "project_category_col": "plan_strategy",
        "project_cost_col": "funding_millions_yoe",
        "location_col": "geometry",
        "county_col": "county",
        "city_col": "",
        "project_year_col": "open_year",
        "phase_col": "",
    }
    extra_note_columns = ["strategy", "layer_name", "mode_detai"]

    return harmonizing_lrtp(
        mtc_lrtp(),
        program="MTC",
        data_source="LRTP",
        note_cols=extra_note_columns,
        cost_in_millions=False,
        **column_map,
    )

In [14]:
harmonized_mtc = harmonize_mtc()



In [15]:
harmonized_mtc.head(2)

Unnamed: 0,project_title,lead_agency,project_year,project_category,grant_program,project_description,phase,total_project_cost,geometry,city,county,data_source,notes
0,Operate & Maintain the Existing System | Baseline Public Transit Service Levels | Regional,MTC,,Fix It First,MTC LRTP,,,211000.0,,,Regional,LRTP,"strategy: nan, layer name: nan, mode detai: nan"
1,Operate & Maintain the Existing System | Public Transit Capital Assets | Regional,MTC,,Fix It First,MTC LRTP,,,59500.0,,,Regional,LRTP,"strategy: nan, layer name: nan, mode detai: nan"


### Southern California Association of Governments (SCAG) 

In [18]:
def delete_embedded_headers(df, column:str, string_search:str) -> pd.DataFrame:
    """
    Remove rows that are repeated column headers.

    Some PDFs include the column names embedded multiple times within
    the data. Any row whose `column` value contains `string_search`
    is treated as such a header and removed.

    Args:
        df: frame to clean (not modified).
        column: string column to inspect.
        string_search: pattern identifying a header row; it is passed to
            `Series.str.contains`, so it is interpreted as a regular
            expression.

    Returns:
        A new DataFrame without the header rows and with a fresh
        0..n-1 index. The number of removed rows is printed.
    """
    # Missing / non-string cells count as "not a header".
    is_header = df[column].str.contains(string_search, na=False)

    print(f"{int(is_header.sum())} rows are headers")

    # Filter by position with a boolean mask instead of dropping index labels:
    # after pd.concat the labels may repeat, and a label-based drop would also
    # delete unrelated data rows that share a label with a header row.
    df2 = df[~is_header].reset_index(drop  = True)
    return df2

In [101]:
def row_threshold(df, perc: float = 95.0):
    """
    Drop rows that are (almost) entirely NA.

    Some PDFs are merged/formatted in a way that leaves many rows full
    of NA. A row is kept only if it has at least
    `int((100 - perc) / 100 * n_columns + 1)` non-NA values.

    Args:
        df: frame to clean (not modified).
        perc: NA percentage used to derive the minimum number of
            non-NA values a row needs. Defaults to 95.0.

    Returns:
        A new DataFrame without the sparse rows and with a fresh
        0..n-1 index. The number of removed rows is printed.
    """
    # https://thispointer.com/pandas-drop-dataframe-rows-based-on-nan-percentage/
    min_count =  int(((100-perc)/100)*df.shape[1] + 1)

    df2 = df.dropna(axis=0, thresh=min_count).reset_index(drop = True)

    print(f"dropped {len(df)-len(df2)} rows that were filled with NA")

    return df2

In [20]:
def scag_lrtp(file_name="SCAG.xlsx"):
    """
    Load the SCAG project list from the "Table 1" and "Table 2" sheets.

    Args:
        file_name: workbook inside the LRTP folder. The default exists
            because this notebook calls `scag_lrtp()` without arguments.
            NOTE(review): "SCAG.xlsx" follows the naming of the other
            LRTP workbooks - confirm the actual file name in GCS.

    Returns:
        DataFrame with embedded header rows and near-empty rows removed,
        plus two added columns:
          * project_title - copy of `description` (the source has no title)
          * project_cost_millions - `project_cost__$1,000s_` times 1,000.
            Despite its name this is the multiplied-out value; the name
            is kept because harmonize_scag refers to it.
    """
    sheets_list = ["Table 1", "Table 2"]
    dict_df = pd.read_excel(f"{LRTP_GCS}{file_name}", sheet_name=sheets_list)

    # Snake-case each sheet and drop columns where everything is NA
    tables = [
        to_snakecase(dict_df.get(sheet)).dropna(axis=1, how='all')
        for sheet in sheets_list
    ]

    # Concat. ignore_index gives every row a unique label; otherwise both
    # sheets restart at 0 and label-based operations hit rows in both.
    df3 = pd.concat(tables, ignore_index=True)

    df4 = delete_embedded_headers(df3, "rtp_id", "RTP ID")

    df4 = row_threshold(df4)

    # No project title
    df4['project_title'] = df4.description

    # Project cost is in thousands. Multiply out.
    # Coerce first: if a value was read as text, `* 1_000` would repeat the
    # string 1,000 times instead of scaling the number.
    cost_thousands = pd.to_numeric(df4["project_cost__$1,000s_"], errors="coerce")
    df4["project_cost_millions"] = cost_thousands * 1_000
    return df4

In [233]:
scag = scag_lrtp()

360 rows are headers
dropped 0 rows that were filled with NA


In [234]:
scag.head()

Unnamed: 0,system,lead_agency,rtp_id,route_#,route_name,from,to,description,completion_year,"project_cost__$1,000s_",county,project_title
0,County: Imperial,,,,,,,,,,,
1,LOCAL HIGHWAY,BRAWLEY,7120001-IMP160407,0.0,,,,PAVING OF DIRT ROAD WILDCAT DRIVE FROM S. WESTERN AVENUE TO S. FIRST STREET,2019.0,1008000.0,,PAVING OF DIRT ROAD WILDCAT DRIVE FROM S. WESTERN AVENUE TO S. FIRST STREET
2,LOCAL HIGHWAY,BRAWLEY,7120001-IMP161004,0.0,,,,"ASPHALT REHABILITATION ALONG RIO VISTA AVENUE BETWEEN ALLEN STREET AND CATTLE CALL DRIVE, IN THE CITY OF BRAWLEY, IMPERIAL COUNTY.",2018.0,109000.0,,"ASPHALT REHABILITATION ALONG RIO VISTA AVENUE BETWEEN ALLEN STREET AND CATTLE CALL DRIVE, IN THE CITY OF BRAWLEY, IMPERIAL COUNTY."
3,LOCAL HIGHWAY,CALEXICO,6OM0701- IMP091001,0.0,,,,"WIDEN AND IMPROVE CESAR CHAVEZ BLVD. TO 5 LANES (3+2) FROM 2ND STREET TO SR 98. OTHER IMPROVEMENTS INCLUDE: SURFACE REHAB, TURN LANES, TRAFFIC SIGNAL, LIGHTING, AND SIDEWALKS",2019.0,9896000.0,,"WIDEN AND IMPROVE CESAR CHAVEZ BLVD. TO 5 LANES (3+2) FROM 2ND STREET TO SR 98. OTHER IMPROVEMENTS INCLUDE: SURFACE REHAB, TURN LANES, TRAFFIC SIGNAL, LIGHTING, AND SIDEWALKS"
4,LOCAL HIGHWAY,CALEXICO,7120001-IMP160408,0.0,,,,"PAVING OF DIRT ROAD DE LAS FLORES STREET BETWEEN EADY AVENUE AND KLOKE AVENUE TO INCLUDE THE INSTALLATION OF CURB, GUTTER AND SIDEWALKS",2019.0,466000.0,,"PAVING OF DIRT ROAD DE LAS FLORES STREET BETWEEN EADY AVENUE AND KLOKE AVENUE TO INCLUDE THE INSTALLATION OF CURB, GUTTER AND SIDEWALKS"


In [24]:
def harmonize_scag():
    """
    Load the SCAG projects and map them onto the shared LRTP schema
    via `harmonizing_lrtp`. SCAG has no geometry, city or phase column.
    """
    # Which SCAG column feeds each harmonized field ("" = not available)
    column_map = {
        "project_name_col": "project_title",
        "project_description_col": "description",
        "project_category_col": "system",
        "project_cost_col": "project_cost_millions",
        "location_col": "",
        "county_col": "county",
        "city_col": "",
        "project_year_col": "completion_year",
        "phase_col": "",
    }
    extra_note_columns = ["lead_agency", "route_#", "route_name", "from", "to"]

    return harmonizing_lrtp(
        scag_lrtp(),
        program="SCAG",
        data_source="LRTP",
        note_cols=extra_note_columns,
        cost_in_millions=True,
        **column_map,
    )

In [25]:
scag_harmonized = harmonize_scag()

360 rows are headers
dropped 307 rows that were filled with NA




In [26]:
# scag.head()

In [27]:
# scag_harmonized.head()

### Sacramento Area Council of Governments (SACOG)
* https://www.sacog.org/sites/main/files/file-attachments/2020_mtp-scs.pdf?1580330993

In [28]:
# sacog = to_snakecase(gpd.read_file("https://services.sacog.org/hosting/rest/services/Hosted/2040_Major_Stops/FeatureServer/0/query?outFields=*&where=1%3D1&f=geojson"))

In [29]:
def sacog_lrtp(file_name="SACOG.xlsx"):
    """
    Load the SACOG project list.

    Args:
        file_name: workbook inside the LRTP folder. Defaults to
            "SACOG.xlsx" because this notebook calls `sacog_lrtp()`
            without arguments.

    Returns:
        DataFrame with snake-cased column names, embedded header rows
        removed (rows whose `description` contains "Description") and
        all-NA columns dropped.
    """
    df = to_snakecase(pd.read_excel(f"{LRTP_GCS}{file_name}"))

    # Delete embedded headers
    df = delete_embedded_headers(df, "description", "Description")

    # Delete columns with all nulls
    df = df.dropna(axis=1, how='all')

    return df

In [30]:
sacog = sacog_lrtp()

96 rows are headers


In [31]:
sacog.shape

(1604, 10)

In [32]:
def harmonize_sacog():
    """
    Load the SACOG projects and map them onto the shared LRTP schema
    via `harmonizing_lrtp`. SACOG has no geometry, city or phase column.
    """
    # Which SACOG column feeds each harmonized field ("" = not available)
    column_map = {
        "project_name_col": "title",
        "project_description_col": "description",
        "project_category_col": "budget_category",
        "project_cost_col": "total_project_cost__2018_dollars_",
        "location_col": "",
        "county_col": "county",
        "city_col": "",
        "project_year_col": "completion_timing",
        "phase_col": "",
    }
    extra_note_columns = [
        "year_of_expenditure_cost_for_planned_projects",
        "status__planned,_programmed_or_project_development_only_",
    ]

    return harmonizing_lrtp(
        sacog_lrtp(),
        program="SACOG",
        data_source="LRTP",
        note_cols=extra_note_columns,
        cost_in_millions=True,
        **column_map,
    )

In [33]:
sacog_harmonized = harmonize_sacog()

96 rows are headers




### SANDAG
* https://www.arcgis.com/home/item.html?id=ba8b2a6247ef463cb7411f039959f3ee
* https://www.arcgis.com/apps/mapviewer/index.html?webmap=ba8b2a6247ef463cb7411f039959f3ee

In [3]:
# Module-level list of harmonized column names.
# NOTE(review): harmonizing_lrtp builds its own local `columns_to_keep`
# (which also contains "phase" and "geometry" instead of "location"), so
# this list looks unused in the visible notebook - confirm before relying on it.
columns_to_keep = [
    "project_title",
    "lead_agency",
    "project_year",
    "project_category",
    "grant_program",
    "project_description",
    "total_project_cost",
    "location",
    "city",
    "county",
    "data_source",
    "notes",
]

In [4]:
def create_notes(df, note_cols: list):
    """
    Collapse several columns into one free-text `notes` column.

    Every row gets a string of the form
    "<label>: <value>, <label>: <value>, ...", where <label> is the
    column name with underscores shown as spaces and <value> is the
    cell converted with `astype(str)` (so missing values appear as
    "nan"/"None"). Values are left untouched, so underscores inside
    them (e.g. in URLs) are preserved.

    Args:
        df: frame to annotate. The `notes` column is added to (or
            overwritten on) this frame; no other column is added.
        note_cols: columns to fold into the note, in display order.
            An empty list produces empty-string notes.

    Returns:
        The same `df`, with the `notes` column set.
    """
    # Start from an empty note per row and append one "label: value" at a time.
    notes = pd.Series("", index=df.index, dtype=object)

    for position, column in enumerate(note_cols):
        label = column.replace("_", " ")
        piece = label + ": " + df[column].astype(str)
        notes = piece if position == 0 else notes + ", " + piece

    df["notes"] = notes

    return df

In [5]:
def harmonizing_lrtp(
    df,
    project_name_col: str,
    project_description_col: str,
    project_category_col: str,
    project_cost_col: str,
    location_col: str,
    county_col: str,
    city_col: str,
    project_year_col: str,
    phase_col: str,
    program: str,
    data_source: str,
    note_cols: list,
    cost_in_millions: bool = True,
):
    """
    Take a dataset and change the column names/types to
    the same names and formats.

    Args:
        df: source dataset (not modified; renaming produces a copy).
        project_name_col ... phase_col: name of the source column that
            feeds each harmonized field. Pass "" when the dataset has
            no such column.
        program: agency name; stored in `lead_agency` and used as the
            prefix of `grant_program`.
        data_source: stored in `data_source` and used as the suffix of
            `grant_program`.
        note_cols: source columns folded into the `notes` text. These
            are looked up AFTER renaming.
        cost_in_millions: when True, every column whose name contains
            "cost" or "funds" is divided by 1,000,000.

    Returns:
        DataFrame restricted to the harmonized columns: project_title,
        lead_agency, project_year, project_category, grant_program,
        project_description, phase, total_project_cost, geometry, city,
        county, data_source, notes.

    Raises:
        KeyError: if `project_title`, `project_description` or
            `total_project_cost` is still missing after renaming.
    """
    rename_columns = {
        project_name_col: "project_title",
        phase_col: "phase",
        project_description_col: "project_description",
        project_category_col: "project_category",
        project_cost_col: "total_project_cost",
        location_col: "geometry",
        county_col: "county",
        city_col: "city",
        project_year_col: "project_year",
    }
    # "" is the placeholder for "this dataset has no such column";
    # keep it out of the rename map.
    rename_columns = {old: new for old, new in rename_columns.items() if old}

    # Rename columns
    df = df.rename(columns=rename_columns)

    # Coerce cost/fund columns to right type.
    # No capture group in the pattern: pandas warns about match groups
    # when they are used with str.contains.
    cost_columns = df.columns[df.columns.str.contains("cost|funds", na=False)].tolist()

    for i in cost_columns:
        df[i] = pd.to_numeric(df[i], errors="coerce")

    # Add MPO & grant program.
    # strip() avoids a leading space when `program` is empty.
    df["lead_agency"] = program
    df["grant_program"] = f"{program} {data_source}".strip()

    # Add data source
    df["data_source"] = data_source

    # Divide cost columns by millions
    # If bool is set to True
    if cost_in_millions:
        for i in cost_columns:
            df[i] = df[i].divide(1_000_000)

    # Create columns even if they don't exist, just to harmonize
    # before concatting.
    create_columns = [
        "county",
        "city",
        "notes",
        "project_year",
        "project_category",
        "phase",
    ]
    for column in create_columns:
        if column not in df:
            df[column] = "None"
    if "geometry" not in df:
        df["geometry"] = None

    # Create notes
    df = create_notes(df, note_cols)

    columns_to_keep = [
        "project_title",
        "lead_agency",
        "project_year",
        "project_category",
        "grant_program",
        "project_description",
        "phase",
        "total_project_cost",
        "geometry",
        "city",
        "county",
        "data_source",
        "notes",
    ]

    df = df[columns_to_keep]

    return df

In [6]:
def open_rest_server(url_pt_1: str, url_pt_2: str, layer_name: list):
    """
    Read several layers from a REST feature server and stack them.

    Each layer is fetched from f"{url_pt_1}{layer}{url_pt_2}", its
    column names are snake-cased, and a `layer_name` column records
    which layer the rows came from.

    Args:
        url_pt_1: URL prefix placed before the layer name.
        url_pt_2: URL suffix (query string) placed after the layer name.
        layer_name: list of layer names to fetch.

    Returns:
        All layers concatenated row-wise (row labels are not renumbered).
        An empty DataFrame when `layer_name` is empty.
    """
    layers = []
    for i in layer_name:
        gdf = to_snakecase(gpd.read_file(f"{url_pt_1}{i}{url_pt_2}"))
        gdf["layer_name"] = i
        layers.append(gdf)

    # pd.concat raises on an empty list
    if not layers:
        return pd.DataFrame()

    # Concatenate once instead of re-copying the growing frame per layer.
    full_gdf = pd.concat(layers, axis=0)

    return full_gdf

In [7]:
def sandag_lrtp():
    """
    Fetch the SANDAG regional-plan layers from the ArcGIS REST server
    and return them stacked into one frame (see `open_rest_server`).
    """
    server_root = "https://services1.arcgis.com/HG80xaIVT1z1OdO5/ArcGIS/rest/services/"
    layer_query = "/FeatureServer/0/query?where=1%3D1&objectIds=&time=&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&resultType=none&distance=0.0&units=esriSRUnit_Meter&relationParam=&returnGeodetic=false&outFields=*+&returnGeometry=true&featureEncoding=esriDefault&multipatchOption=xyFootprint&maxAllowableOffset=&geometryPrecision=&outSR=&defaultSR=&datumTransformation=&applyVCSProjection=false&returnIdsOnly=false&returnUniqueIdsOnly=false&returnCountOnly=false&returnExtentOnly=false&returnQueryGeometry=false&returnDistinctValues=false&cacheHint=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&having=&resultOffset=&resultRecordCount=&returnZ=false&returnM=false&returnExceededLimitFeatures=true&quantizationParameters=&sqlFormat=none&f=pgeojson&token="

    # One feature service per layer
    layers = [
        "Complete_Corridors_Connectors_Intersections_Ramps_RP2021",
        "Complete_Corridors_Highway_RP2021",
        "Transit_Leap_RP2021",
        "Adopted_Regional_Bike_Network_RP2021",
        "Mobility_Hubs_and_Flexible_Fleets",
        "Complete_Corridors_Regional_Arterials",
        "Goods_Movement",
    ]

    return open_rest_server(server_root, layer_query, layers)

In [8]:
def harmonize_sandag():
    """
    Load the SANDAG layers and map them onto the shared LRTP schema
    via `harmonizing_lrtp`. The source `phase` column feeds
    `project_year` and the source `status` column feeds `phase`.
    """
    # Which SANDAG column feeds each harmonized field ("" = not available)
    column_map = {
        "project_name_col": "project_name",
        "project_description_col": "description",
        "project_category_col": "category",
        "project_cost_col": "cost2020m",
        "location_col": "geometry",
        "county_col": "",
        "city_col": "",
        "project_year_col": "phase",
        "phase_col": "status",
    }
    extra_note_columns = [
        "aqc_2016_1",
        "aqc_2020_1",
        "pricmcp",
        "conncmcp",
        "layer_name",
        "corridor_i",
        "type_1",
        "existing",
        "limits",
        "description_1",
        "route",
        "routetype",
        "route_desc",
        "rp_2021_id",
        "rp_2021_id_1",
        "capital_cost___2020__millions",
    ]

    return harmonizing_lrtp(
        sandag_lrtp(),
        program="SANDAG",
        data_source="LRTP",
        note_cols=extra_note_columns,
        cost_in_millions=True,
        **column_map,
    )

### SBCAG

In [124]:
def sbcag_lrtp(file_name):
    """
    Read the SBCAG workbook (SBCAG.xlsx) from the LRTP folder and tidy it:
    snake-case the headers, drop the unused columns, remove embedded
    header rows and keep only rows that have a description.
    """
    unused_columns = [
        'unnamed:_2',
        'project_type',
        'unnamed:_4',
        'unnamed:_6',
        'unnamed:_8',
        'unnamed:_9',
        'unnamed:_10',
        'unnamed:_13',
    ]

    raw = pd.read_excel(f"{LRTP_GCS}{file_name}")
    trimmed = to_snakecase(raw).drop(columns = unused_columns)

    # Header rows repeat the word "Description" in the description column
    without_headers = delete_embedded_headers(trimmed, "description", "Description")

    described = without_headers.dropna(subset = ['description'])
    return described.reset_index(drop = True)

In [125]:
sbcag = sbcag_lrtp("SBCAG.xlsx")

65 rows are headers


In [122]:
sbcag.shape

(480, 8)

###  SJCOG

In [146]:
df = to_snakecase(pd.read_excel(f"{LRTP_GCS}SJCOG.xlsx"))

In [152]:
sheets_list = ["1 Mainline", "2 Interchanges", "3 Reg Roadways",
              "4 RR Xings", "5 Bus Transit", "6 Rail Corridor",
              "7 Airports", "8 Active Trans Facilities", "9 TCMs",
              "10 Ops & Maint",]

In [153]:
dict_df = pd.read_excel(f"{LRTP_GCS}SJCOG.xlsx", sheet_name=sheets_list)

In [154]:
df1 = to_snakecase(dict_df.get("1 Mainline"))

In [159]:
df2 = to_snakecase(dict_df.get("2 Interchanges"))

In [160]:
df3 = to_snakecase(dict_df.get( "3 Reg Roadways"))

In [161]:
df4 = to_snakecase(dict_df.get("4 RR Xings"))

In [162]:
df5 = to_snakecase(dict_df.get("5 Bus Transit"))

In [163]:
df6 = to_snakecase(dict_df.get("6 Rail Corridor"))

In [164]:
df7 = to_snakecase(dict_df.get("7 Airports"))

In [165]:
df8 = to_snakecase(dict_df.get("8 Active Trans Facilities"))

In [166]:
df9 = to_snakecase(dict_df.get("9 TCMs"))

In [167]:
df10 = to_snakecase(dict_df.get("10 Ops & Maint"))

In [170]:
concat1 = pd.concat([df1,df2,df3,df4,df5,df6,df7,df8,df9,df10], axis=1)

In [171]:
concat1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67 entries, 0 to 66
Columns: 177 entries, unnamed:_0 to milestone_years_1
dtypes: float64(92), object(85)
memory usage: 92.8+ KB


### Stancog

In [203]:
def embedded_column_names(df, data_start:int) -> pd.DataFrame:
    """
    Promote a row inside the sheet to be the column header.

    Some excel sheets carry title rows above the real column names.
    Everything before position `data_start` is discarded, the row at
    `data_start` becomes the column names, and the rows after it are
    returned with a fresh 0..n-1 index.
    """
    # Throw away the leading title rows
    from_header = df.iloc[data_start:].reset_index(drop=True)

    # First remaining row holds the real column names
    header_row = from_header.iloc[0]

    # Everything below it is data
    body = from_header.iloc[1:].reset_index(drop=True)
    body.columns = header_row

    return body

In [204]:
def stancog_lrtp(file_name):
    """
    Read the StanCOG workbook (STANCOG.xlsx) from the LRTP folder.

    The real column names sit inside the sheet, so they are promoted
    with `embedded_column_names`; only the listed columns are kept,
    headers are snake-cased, and rows without a description are dropped.
    """
    # Header text exactly as it appears in the sheet
    wanted_columns = [
        'Jurisdiction',
        'Location',
        'Project Limits',
        'Description',
        'Total Cost',
        'Open to Traffic',
        'Funding Source',
        'System Preserv.',
        'Capacity Enhance.',
        'Safety',
        'Oper.',
        'Complete Streets',
        'Active\nTransporta tion',
        'Transit',
        'Other',
    ]

    raw = to_snakecase(pd.read_excel(f"{LRTP_GCS}{file_name}"))
    with_headers = embedded_column_names(raw, 1)

    selected = to_snakecase(with_headers[wanted_columns])

    described = selected.dropna(subset = ['description'])
    return described.reset_index(drop = True)

In [205]:
stancog = stancog_lrtp("STANCOG.xlsx")

### Tahoe
* Open data portal: https://www.tahoeopendata.org/search?tags=transportation
* https://www.tahoeopendata.org/datasets/TRPA::2020-regional-transportation-plan-projects/about

In [16]:
def tmpo_lrtp():
    """
    Load the Tahoe (TRPA) 2020 Regional Transportation Plan projects.

    Returns:
        GeoDataFrame of projects whose `complete_year` is after 2022,
        with snake-cased column names and an added `project_year`
        column of the form "<start_year>-<complete_year>".
    """
    tahoe_url = "https://maps.trpa.org/server/rest/services/Datadownloader_Transportation/MapServer/19/query?where=1%3D1&text=&objectIds=&time=&timeRelation=esriTimeRelationOverlaps&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&distance=&units=esriSRUnit_Foot&relationParam=&outFields=*+&returnGeometry=true&returnTrueCurves=false&maxAllowableOffset=&geometryPrecision=&outSR=&havingClause=&returnIdsOnly=false&returnCountOnly=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&returnZ=false&returnM=false&gdbVersion=&historicMoment=&returnDistinctValues=false&resultOffset=&resultRecordCount=&returnExtentOnly=false&sqlFormat=none&datumTransformation=&parameterValues=&rangeValues=&quantizationParameters=&featureEncoding=esriDefault&f=geojson"
    gdf = to_snakecase(gpd.read_file(tahoe_url))
    gdf["complete_year"] = gdf["complete_year"].astype(int)

    # Keep only projects that complete after 2022
    gdf2 = gdf[gdf.complete_year > 2022].reset_index(drop=True)

    # Both halves must come from the filtered frame: the unfiltered one has
    # a different index, so mixing them pairs start years with the wrong
    # completion years.
    gdf2["project_year"] = (
        gdf2.start_year.astype(str) + "-" + gdf2.complete_year.astype(str)
    )
    return gdf2

In [17]:
def harmonize_tahoe():
    """
    Load the Tahoe (TMPO) projects and map them onto the shared LRTP
    schema via `harmonizing_lrtp`. There is no county or city column.
    """
    # Which Tahoe column feeds each harmonized field ("" = not available)
    column_map = {
        "project_name_col": "project_name",
        "project_description_col": "description",
        "project_category_col": "category",
        "project_cost_col": "estimated_cost",
        "location_col": "geometry",
        "county_col": "",
        "city_col": "",
        "project_year_col": "project_year",
        "phase_col": "phase",
    }
    extra_note_columns = [
        "implementer",
        "funding_type",
        "plan_name",
        "label",
        "financial_status",
        "url",
    ]

    return harmonizing_lrtp(
        tmpo_lrtp(),
        program="TMPO",
        data_source="LRTP",
        note_cols=extra_note_columns,
        cost_in_millions=True,
        **column_map,
    )

### Congestion

#### Lost

In [None]:
lost = harmonization_utils.load_lost()

In [None]:
lost.head()

In [None]:
def harmonize_lost():
    """
    Load the LOST dataset through `harmonization_utils.load_lost` and
    map it onto the shared schema via `harmonizing_lrtp`. Costs are
    passed through without dividing by one million, and `program` is
    left empty.
    """
    # Which LOST column feeds each harmonized field ("" = not available)
    column_map = {
        "project_name_col": "project_title",
        "project_description_col": "project_description",
        "project_category_col": "project_category",
        "project_cost_col": "cost__in_millions_",
        "location_col": "",
        "county_col": "county",
        "city_col": "city",
        "project_year_col": "",
        "phase_col": "phase",
    }
    extra_note_columns = [
        "measure",
        'estimated_lost_funds',
        'estimated_federal_funds',
        'estimated_state_funds',
        'estimated_local_funds',
        'estimated_other_funds',
        "notes",
    ]

    return harmonizing_lrtp(
        harmonization_utils.load_lost(),
        program="",
        data_source="LOST",
        note_cols=extra_note_columns,
        cost_in_millions=False,
        **column_map,
    )

In [None]:
lost_harmonized = harmonize_lost()

In [None]:
lost_harmonized.head()

In [None]:
def all_mpo():
    """
    Run every harmonizer and stack the results into one frame.

    Row labels are not renumbered by the concatenation.
    """
    # Loaders run in this order
    loaders = [
        ("tahoe", harmonize_tahoe),
        ("sandag", harmonize_sandag),
        ("mtc", harmonize_mtc),
        ("lost", harmonize_lost),
        ("sacog", harmonize_sacog),
        ("scag", harmonize_scag),
    ]
    harmonized = {name: load() for name, load in loaders}

    # Order of the datasets in the stacked output
    stack_order = ["tahoe", "mtc", "sandag", "lost", "sacog", "scag"]

    return pd.concat([harmonized[name] for name in stack_order])

In [None]:
all_mpo_df = all_mpo()

In [None]:
all_mpo_df.groupby(["grant_program"]).size()

In [None]:
# Lower-case phrases that flag a project as congestion-pricing related.
# Every entry needs its trailing comma: two adjacent string literals are
# silently concatenated into one keyword.
congest_keywords = [
    "congestion pricing",
    "variablly priced",  # misspelling kept in case the source data has it
    "variably priced",
    "variable tolls",
    "express lane",
    "value",
    "rush hour",
    "cordon",
    "dynamic pricing",
    "dynmically priced",  # misspelling kept in case the source data has it
    "dynamically priced",
    "high occupancy toll",
    "high-occupancy",
    "mobility pricing",
    "occupancy",
    "toll lane",
    "performance pricing",
    "peak travel",
    "managed lane",
    "tollway",
    "transponder",
    "express toll",
    "fixed pricing",
]

In [None]:
def lower_case(df, columns_to_search: list):
    """
    Add a normalized, lower-cased copy of each requested column.

    For every column `c` in `columns_to_search` a new column
    `lower_case_c` is added to `df`: the text is lower-cased, missing
    values become "none", and the characters "-", "." and ":" are removed.

    Args:
        df: frame to extend (the new columns are added to this frame).
        columns_to_search: names of the string columns to normalize.

    Returns:
        Tuple of (`df`, list of the new column names, in input order).
    """
    new_columns = []
    for i in columns_to_search:
        cleaned = df[i].str.lower().fillna("none")

        # regex=False: treat each character literally. As a regular
        # expression "." would match every character, and the default
        # for `regex` differs between pandas versions.
        for punctuation in ("-", ".", ":"):
            cleaned = cleaned.str.replace(punctuation, "", regex=False)

        df[f"lower_case_{i}"] = cleaned
        new_columns.append(f"lower_case_{i}")

    return df,new_columns

In [None]:
def find_keywords(df, columns_to_search:list, keywords_search:list):
    """
    Tag each row with the first keyword found in the searched columns.

    The columns are first normalized with `lower_case`. For every
    normalized column `lower_case_c`, a column
    `lower_case_c_keyword_search` is added holding the matched keyword,
    or "not congestion" when nothing matches.

    Args:
        df: frame to search (columns are added to this frame).
        columns_to_search: names of the string columns to search.
        keywords_search: keywords, joined with "|" into one regular
            expression, so regex metacharacters in a keyword are
            interpreted as such. With an empty list every row is
            tagged "not congestion".

    Returns:
        `df` with the normalized and keyword columns added.
    """
    df2, lower_case_cols_list = lower_case(df, columns_to_search)

    # An empty alternation "()" would match every row with an empty string.
    if not keywords_search:
        for i in lower_case_cols_list:
            df2[f"{i}_keyword_search"] = "not congestion"
        return df2

    pattern = f"({'|'.join(keywords_search)})"

    for i in lower_case_cols_list:
        # expand=False returns a Series (one capture group) rather than a
        # one-column DataFrame.
        df2[f"{i}_keyword_search"] = (
            df2[i].str.extract(pattern, expand=False).fillna("not congestion")
        )

    return df2

In [None]:
# test = find_keywords(all_mpo_df,  ["project_title", "project_description", "notes"], congest_keywords)

In [None]:
def filter_cordon(df, columns_to_search:list, keywords_search:list, save_to_gcs: bool = False):
    """
    Keep the projects whose title or description matches a keyword and
    return them in two forms: with geometry and as a plain table.

    Args:
        df: harmonized projects; must have `project_title`,
            `project_description` and `geometry` columns.
        columns_to_search: columns handed to `find_keywords`. The filter
            below reads the keyword columns for `project_title` and
            `project_description` by name, so both must be included.
        keywords_search: keywords handed to `find_keywords`.
        save_to_gcs: when True, write the table to an Excel file under
            `harmonization_utils.GCS_FILE_PATH` and the geometries to
            ./congestion_pricing_sample.geojson in the working directory.

    Returns:
        Tuple (gdf, df2):
          * gdf - matched rows whose geometry is non-empty and valid,
            with missing values filled.
          * df2 - all matched rows without the geometry column and
            without the two lower-cased helper columns.
    """

    # Filter out for Cordon 
    df = find_keywords(df, columns_to_search, keywords_search)
    # Keep a row when either keyword column holds a real match
    # ("not congestion" is the no-match marker set by find_keywords).
    df2 = (df[(df.lower_case_project_title_keyword_search != "not congestion")
        | (df.lower_case_project_description_keyword_search != "not congestion")]).reset_index(drop=True)
    
    # Delete out HOV
    projects_to_delete = ["SR 17 Corridor Congestion Relief in Los Gatos",
                         "Interstate 380 Congestion Improvements"]
    df2 = df2[~df2.project_title.isin(projects_to_delete)].reset_index(drop = True)

    # Drop invalid geometries
    gdf = df2[~df2.geometry.is_empty].reset_index(drop = True)
    gdf = gdf[gdf.geometry.is_valid].reset_index(drop = True)
    
    gdf = gdf.set_geometry("geometry")
    # NOTE(review): appears intended to fill float64 columns with 0.0 and
    # object columns with "None" by mapping each column's dtype to a fill
    # value - confirm the result for columns of other dtypes.
    gdf = gdf.fillna(gdf.dtypes.replace({"float64": 0.0, "object": "None"}))

    # One version that's a df
    columns_to_drop = ['lower_case_project_title', 'lower_case_project_description']
    df2 = df2.drop(columns = columns_to_drop + ['geometry'])
    # df2 = df2.fillna(df.dtypes.replace({"float64": 0.0, "object": "None"}))
    
    if save_to_gcs:
        # The Excel file goes to the GCS path; the GeoJSON is written locally.
        df2.to_excel(f"{harmonization_utils.GCS_FILE_PATH}LRTP/congestion_pricing_sample.xlsx", sheet_name="Sheet_name_1", index=False)
        gdf.to_file("./congestion_pricing_sample.geojson", driver="GeoJSON")
        
    return gdf, df2

In [None]:
cordon_gdf, cordon_df = filter_cordon(all_mpo_df, ["project_title", "project_description",], congest_keywords)

In [None]:
len(cordon_df.drop_duplicates(subset = ['project_title']))

In [None]:
cordon_df.groupby(["grant_program"]).agg({"project_title":"nunique"})

In [None]:
cordon_df.grant_program.value_counts()

In [None]:
cordon_df.drop_duplicates(subset = ['total_project_cost','project_title',]).groupby(["grant_program"]).agg({'total_project_cost':'sum'})

In [None]:
cols_to_drop = ['city','county', 'notes', 'phase','project_year',]

In [None]:
cordon_df.drop(columns =cols_to_drop).drop_duplicates(subset = ['project_title', 'project_description'])

In [None]:
# cordon_gdf.explore('project_title', cmap= 'tab20c', style_kwds={'weight':6}, legend = False)