## Long Range Transportation Plan

In [1]:
import _harmonization_utils as harmonization_utils
import geopandas as gpd
import numpy as np
import pandas as pd
from calitp_data_analysis.sql import to_snakecase
from shared_utils import geography_utils, utils


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
LRTP_GCS = f"{harmonization_utils.GCS_FILE_PATH}LRTP/"

### General Functions

In [4]:
def give_info(df, project_title:str, other_descriptive_col:str):
    print(df[project_title].value_counts().head())
    print(f"# of unique project titles: {df[project_title].nunique()}")
    print(f"After dropping duplicates {len(df.drop_duplicates(subset = [project_title, other_descriptive_col]))}")
    print(f"Df shape: {df.shape}")
    print(df.columns)

In [5]:
def harmonizing_lrtp(
    df,
    project_name_col: str,
    project_description_col: str,
    project_category_col: str,
    project_cost_col: str,
    location_col: str,
    county_col: str,
    city_col: str,
    project_year_col: str,
    phase_col: str,
    program: str,
    data_source: str,
    note_cols: list,
    cost_in_millions: bool = True,
):
    """
    Take a dataset and change the column names/types to
    the same names and formats.
    """
    rename_columns = {
        project_name_col: "project_title",
        phase_col: "phase",
        project_description_col: "project_description",
        project_category_col: "project_category",
        project_cost_col: "total_project_cost",
        location_col: "geometry",
        county_col: "county",
        city_col: "city",
        project_year_col: "project_year",
    }
    # Rename columns
    df = df.rename(columns=rename_columns)

    # Coerce cost/fund columns to right type
    cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  
    for i in cost_columns:
        df[i] = df[i].apply(pd.to_numeric, errors="coerce")
 
    # Add MPO & grant program
    df["lead_agency"] = program
    df["grant_program"] = f"{program} {data_source}"

    # Add data source
    df["data_source"] = data_source

    # Divide cost columns by millions
    # If bool is set to True
    if cost_in_millions:
        for i in cost_columns:
            df[i] = df[i].divide(1_000_000)

    # Fill in any nulls
    # df = df.fillna(df.dtypes.replace({"float64": 0.0, "object": "None"}))

    # Create columns even if they don't exist, just to harmonize
    # before concatting.
    create_columns = [
        "county",
        "city",
        "notes",
        "project_year",
        "project_category",
        "phase",
    ]
    for column in create_columns:
        if column not in df:
            df[column] = "None"
    if "geometry" not in df:
        df["geometry"] = None
    # Create notes
    df = create_notes(df, note_cols)

    columns_to_keep = [
        "project_title",
        "lead_agency",
        "project_year",
        "project_category",
        "grant_program",
        "project_description",
        "phase",
        "total_project_cost",
        "geometry",
        "city",
        "county",
        "data_source",
        "notes",
    ]

    df = df[columns_to_keep]

    return df

In [6]:
def embedded_column_names(df, data_start:int) -> pd.DataFrame:
    """
    Some excel sheets have headers and  column names
    embedded in the dataframe. Take them out.
    """
    # Delete header 
    df = df.iloc[data_start:].reset_index(drop=True)
    # The first row contains column names - update it to the column
    df.columns = df.iloc[0]

    # Drop the first row as they are now column names
    df = df.drop(df.index[0]).reset_index(drop=True)
    
    return df 

In [7]:
def create_notes(df, note_cols: list):
    prefix = "_"
    for column in note_cols:
        df[f"{prefix}{column}"] = df[column].astype(str)
    note_cols = [prefix + sub for sub in note_cols]

    # https://stackoverflow.com/questions/65532480/how-to-combine-column-names-and-values
    def combine_notes(x):
        return ", ".join([col + ": " + x[col] for col in note_cols])

    df["notes"] = df.apply(combine_notes, axis=1)
    df.notes = df.notes.str.replace("_", " ")

    return df

In [8]:
def open_rest_server(url_pt_1: str, url_pt_2: str, layer_name: list):
    full_gdf = pd.DataFrame()
    for i in layer_name:
        gdf = to_snakecase(gpd.read_file(f"{url_pt_1}{i}{url_pt_2}"))
        gdf["layer_name"] = i
        full_gdf = pd.concat([full_gdf, gdf], axis=0)

    return full_gdf

In [9]:
def row_threshold(df):
    """
    Some PDFS are merged/formatted where there are many
    rows full of NA. Delete them out.
    """
    # Delete rows containing either 75% or more than 75% NaN Values
    # https://thispointer.com/pandas-drop-dataframe-rows-based-on-nan-percentage/
    perc = 95.0 # 
    min_count =  int(((100-perc)/100)*df.shape[1] + 1)
    
    df2 = df.dropna(axis=0, thresh=min_count).reset_index(drop = True)
    
    print(f"dropped {len(df)-len(df2)} rows that were filled with NA")
    
    return df2

In [10]:
def delete_embedded_headers(df, column:str, string_search:str) -> pd.DataFrame:
    """
    Some PDFS include the column names embedded mulitple times
    within the df. Delete them out. 
    """
    headers = (df[df[column].str.contains(string_search)==True])
    headers_index_list = headers.index.values.tolist()
    
    print(f"{len(headers_index_list)} rows are headers") 
    
    df2 = df.drop(headers_index_list).reset_index(drop  = True)
    return df2

In [11]:
def correct_project_cost(df, project_title_col:str, project_total_cost:str):
    """
    For some datasets, the same project 
    (as determined by the same project name, cost,
    and source) is split across multiple rows. 

    Ex: A project costs $500 million and is 
    split on 5 rows by phase/location. Each row still lists 
    the total  cost as $500 million, which is not accurate.
    This function will recalculate each of the row to list
    $100 mil as the total project cost
    """
    # Create a unique identifier
    df['unique_identifier'] =  df[project_title_col] + df[project_total_cost].astype(str)
    
    # Create count for each project
    df['how_many_times_same_proj_appears'] = df.groupby('unique_identifier').cumcount()+1
    
    # Find the total number of times a project title-cost appears. 
    # Sort by descending and keep only the row with the highest level
    df2 = (df[[project_title_col,'how_many_times_same_proj_appears',project_total_cost, 'unique_identifier']]
            .sort_values([project_title_col, 'how_many_times_same_proj_appears'], ascending = False)
            .drop_duplicates(subset = ['unique_identifier'])
           ) 
    # Create new funding estimate
    df2['new_proj_cost'] = df2[project_total_cost] / df2["how_many_times_same_proj_appears"]
    
    # Drop some columns
    df2 = df2.drop(columns = [project_title_col,project_total_cost,'how_many_times_same_proj_appears'])
    
    # Merge
    m1 = pd.merge(df, df2, how = "inner", on = "unique_identifier")
    
    # Clean up
    m1 = m1.drop(columns = ['unique_identifier', 'how_many_times_same_proj_appears', project_total_cost])
    m1['total_project_cost_note'] = "This is an estimate of how much the project cost, estimated by dividing the total project cost by how many times the project appears in the dataset."
    
    # Replace project cost
    m1 = m1.rename(columns = {'new_proj_cost':'total_project_cost'})
    return m1

### Ambag

In [12]:
def ambag_lrtp():
    ambag1 = pd.read_excel(f"{LRTP_GCS}AMBAG_2045 Revenue Constrained Project List_Monterey County_by Project Type__022823.xlsx")
    ambag2 = pd.read_excel(f"{LRTP_GCS}AMBAG_2045 Revenue Constrained Project List_San Benito County_by Project Type__022823.xlsx")
    ambag3 = pd.read_excel(f"{LRTP_GCS}AMBAG_2045 Revenue Constrained Project List_Santa Cruz County_by Project Type__041923.xlsx")
    
    # Move column names up 
    ambag1 = embedded_column_names(ambag1, 1)
    ambag2 = embedded_column_names(ambag2, 1)
    ambag3 = embedded_column_names(ambag3, 1)
    
    # Add county
    ambag1['county'] = 'Santa Cruz'
    ambag2['county'] = 'Monterey'
    ambag3['county'] = 'San Benito'
    
    # Conat
    concat1 = pd.concat([ambag1, ambag2, ambag3], axis = 0)
    concat1 = concat1.drop(columns = [2035.0,2045.0,2020.0,2035.0])
    
    concat1 = to_snakecase(concat1)
    return concat1

In [13]:
ambag = ambag_lrtp()

In [14]:
ambag.columns

Index(['ambag_id', 'project', 'project_description', 'total_cost_\n_$_000s_',
       'county'],
      dtype='object', name=0)

In [15]:
give_info(ambag, 'project', 'project_description')

Citywide Bike Lanes                         2
The Alameda - Salinas Road Bike Route       2
Citywide Sidewalk Program                   2
Carmel to Pebble Beach Bike/Ped Facility    1
Second Street Bike Lane                     1
Name: project, dtype: int64
# of unique project titles: 277
After dropping duplicates 280
Df shape: (280, 5)
Index(['ambag_id', 'project', 'project_description', 'total_cost_\n_$_000s_',
       'county'],
      dtype='object', name=0)


In [16]:
ambag.loc[ambag.project == 'Citywide Bike Lanes']

Unnamed: 0,ambag_id,project,project_description,total_cost_\n_$_000s_,county
133,MON-SOL075-SO,Citywide Bike Lanes,"Bike Lanes (2007 TIF M2, 2013 TIF M2); construct bike lanes citywide",1440,Santa Cruz
52,SC-SV-P41-SCV,Citywide Bike Lanes,Construction of additional bike lanes and paths citywide (including Green Hills).,2060,San Benito


In [17]:
ambag.loc[ambag.project == 'The Alameda - Salinas Road Bike Route']

Unnamed: 0,ambag_id,project,project_description,total_cost_\n_$_000s_,county
27,SB-SBC-A62,The Alameda - Salinas Road Bike Route,"Class III, 0.65 miles, 4th Street to Old Stagecoach Road.",9,Monterey
44,SB-SJB-A26,The Alameda - Salinas Road Bike Route,"Class III - Stripping a bike lane from Franklin to Old SJ Hollister Rd., S-10 of the Bike Plan.",50,Monterey


### BCAG 

In [18]:
def bcag_lrtp():
    df = pd.read_excel(f"{LRTP_GCS}BCAG.xls")
    drop_columns = [2018,
     2020,
     2030,
     2035,
     2040,
     'EXEMPT CODE',
     'PM 1',
     'PM 2',
     'PM 3',]
    
    df = df.drop(columns = drop_columns) 
    df = to_snakecase(df)
    df.cost_estimate = df.cost_estimate * 1_000
    df.fund_estimate = df.fund_estimate.apply(pd.to_numeric, errors="coerce") * 1_000_000
    
    df["geometry"] = gpd.GeoSeries.from_xy(df.x_coord, df.y_coord, crs=geography_utils.WGS84)
    # Same project is split across multiple rows. Divide out project cost
    df = correct_project_cost(df,'title', 'cost_estimate')
    df = df.set_geometry("geometry")
    
    
    return df 

In [19]:
bcag = bcag_lrtp()

In [20]:
bcag.head(1)

Unnamed: 0,rtp_id,agency,ftip,rtp,project_type,title,project_descr,project_id,fund_source,fund_estimate,status,y_coord,x_coord,target_fiscal_year,fund_source_1,fund_source_2,source,title_vi,non_title_vi,ej,non_ej,geometry,total_project_cost,total_project_cost_note
0,1,BCAG,FTIP,RTP,Transit,Butte Regional Transt - Capital and Operating Assistance,Federal Transit Admininstration Program Sections 5307 & 5311 programs to support transit services provided by Butte Regional Transit. (Fixed Route and Paratransit),20200000200,Federal Transit Administration Funds & Transportation Development Act Funds,,Programmed,39.71,-121.82,Ongoing,FTA,-,B Line Budget,Y,Y,Y,Y,POINT (-121.82174 39.70508),27300000.0,"This is an estimate of how much the project cost, estimated by dividing the total project cost by how many times the project appears in the dataset."


In [21]:
give_info(bcag, 'title', 'total_project_cost')

Local Highway Bridge Program (HBP Grouped)            68
Highway Safety Improvement Program (HSIP Grouped)     19
SR 70 Passing Lanes (Segment 3)                        2
Eaton Rd Widening                                      2
Lincoln Blvd Class 2                                   2
Name: title, dtype: int64
# of unique project titles: 153
After dropping duplicates 228
Df shape: (250, 24)
Index(['rtp_id', 'agency', 'ftip', 'rtp', 'project_type', 'title',
       'project_descr', 'project_id', 'fund_source', 'fund_estimate', 'status',
       'y_coord', 'x_coord', 'target_fiscal_year', 'fund_source_1',
       'fund_source_2', 'source', 'title_vi', 'non_title_vi', 'ej', 'non_ej',
       'geometry', 'total_project_cost', 'total_project_cost_note'],
      dtype='object')


In [22]:
bcag.shape

(250, 24)

### Fresno COG

In [23]:
def fresnocog_lrtp():
    columns_to_drop = ['unnamed:_7', 'unnamed:_8',
       'unnamed:_9']
    df1 = to_snakecase(pd.read_excel(f"{LRTP_GCS}FRESNO_COG_UNconstrained.xlsx"))
    df1['financial_constraint'] = 'unconstrained'
    
    df2 = to_snakecase(pd.read_excel(f"{LRTP_GCS}FRESNO_COG_Constrained.xlsx"))
    df2['financial_constraint'] = 'constrained'
    
    concat1 = pd.concat([df1,df2], axis=0)
    concat1 = concat1.drop(columns = columns_to_drop)
    
    return concat1

In [24]:
fresno_cog = fresnocog_lrtp()

In [25]:
give_info(fresno_cog, 'project_title', 'estimated\ntotal_cost\n_$1,000_')

Herndon @ SR 99-Widen Undercrossing                            2
Willow-International to Copper Southbound: Widen to 3 Lanes    2
SHIELDS: GARFIELD to CHATEAU FRESNO                            2
Anchor Ave. Park Blvd to Adams                                 2
Adams Ave Reconstruction                                       2
Name: project_title, dtype: int64
# of unique project titles: 3128
After dropping duplicates 3143
Df shape: (3147, 10)
Index(['agency', 'project_id', 'project_title', 'project_description',
       'project_type', 'estimated_open_to_traffic',
       'estimated\ntotal_cost\n_$1,000_', 'est__o2t_year',
       'est_total_project_cost', 'financial_constraint'],
      dtype='object')


In [26]:
fresno_cog.loc[fresno_cog.project_title == "Herndon @ SR 99-Widen Undercrossing"]

Unnamed: 0,agency,project_id,project_title,project_description,project_type,estimated_open_to_traffic,"estimated\ntotal_cost\n_$1,000_",est__o2t_year,est_total_project_cost,financial_constraint
978,Fresno,FRE111353,Herndon @ SR 99-Widen Undercrossing,Widen Undercrossing to 5 LN,Streets & Roads - Capacity Increasing,2027-2031,21262.0,2028-01-01,21262000.0,constrained
2076,Fresno,FRE111353,Herndon @ SR 99-Widen Undercrossing,Widen Undercrossing to 5 LN(Measure C Project K8 in the Urban Regional Program),Streets & Roads-Capacity Increasing,2028,26365.0,NaT,,constrained


### KCAG

In [27]:
def kcag_lrtp():
   
    df = to_snakecase(pd.read_excel(f"{LRTP_GCS}KCAG.xlsx"))
    
    # No title column
    df['title'] = (df.category + '-' + df.description + '-' + df.location).fillna("No Title")
    
    # Some duplicates
    df = df.drop_duplicates(['location','category','description']).reset_index(drop = True)
    return df

In [28]:
kcag = kcag_lrtp()

In [29]:
kcag.columns

Index(['category', 'jurisdiction', 'state_route', 'post_mile', 'location',
       'project_limits', 'description', 'title'],
      dtype='object')

In [30]:
kcag[kcag.title == 'Proposed Improvements-Pavement Maintenance-Various Locations']

Unnamed: 0,category,jurisdiction,state_route,post_mile,location,project_limits,description,title
4,Proposed Improvements,CORCORAN,,,Various Locations,,Pavement Maintenance,Proposed Improvements-Pavement Maintenance-Various Locations


In [31]:
give_info(kcag, 'title', 'description')

No Title                                          26
Proposed Improvements-Overlay-Claudia Drive        1
Proposed Improvements-Overlay-Grand Street         1
Proposed Improvements-Overlay-Linda Lee Circle     1
Proposed Improvements-Overlay-Newbridge Drive      1
Name: title, dtype: int64
# of unique project titles: 59
After dropping duplicates 59
Df shape: (84, 8)
Index(['category', 'jurisdiction', 'state_route', 'post_mile', 'location',
       'project_limits', 'description', 'title'],
      dtype='object')


### Madera CTC

In [140]:
madera = to_snakecase(pd.read_excel(f"{LRTP_GCS}Madera_CTC.xlsx"))

In [142]:
madera.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 784 entries, 0 to 783
Data columns (total 33 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   agency        768 non-null    object 
 1   unnamed:_1    16 non-null     object 
 2   unnamed:_2    671 non-null    object 
 3   unnamed:_3    13 non-null     object 
 4   unnamed:_4    1 non-null      object 
 5   project_name  63 non-null     object 
 6   unnamed:_6    16 non-null     object 
 7   unnamed:_7    147 non-null    object 
 8   unnamed:_8    183 non-null    object 
 9   unnamed:_9    354 non-null    object 
 10  unnamed:_10   1 non-null      object 
 11  location      62 non-null     object 
 12  unnamed:_12   34 non-null     object 
 13  unnamed:_13   128 non-null    object 
 14  unnamed:_14   13 non-null     object 
 15  unnamed:_15   18 non-null     object 
 16  project_id    143 non-null    object 
 17  unnamed:_17   238 non-null    object 
 18  unnamed:_18   129 non-null    

In [146]:
madera[['unnamed:_29']].dropna()

Unnamed: 0,unnamed:_29
218,1647000.0
219,336000.0
220,379000.0
221,146000.0
222,150000.0
223,368000.0
224,2500000.0
225,1158000.0
226,1487000.0
227,1250000.0


### MCAGOV

In [32]:
def mcagov_lrtp():

    df = to_snakecase(pd.read_excel(f"{LRTP_GCS}MCAGOV.xlsx"))
 
    df = df.dropna(subset = ['title']).reset_index(drop = True)
    
    return df 

In [33]:
mcagov = mcagov_lrtp()

In [34]:
mcagov.sample()

Unnamed: 0,agency,title,limits_description,type,completion\nyear,"total_cost\n_$1,000s_",funding_sources
80,TJPA,TJPA Outreach and Marketing,For transit service in Merced Region,Transit,Ongoing,2500,CMAQ


In [35]:
give_info(mcagov, 'title', 'total_cost\n_$1,000s_')

Bellevue Rd Realignment/Reconstruction    1
Minturn Road Rehabilitation               1
MCAG Vanpool Program (dibs)               1
Measure V Transit Free Ride Program       1
Planada Route Improvements                1
Name: title, dtype: int64
# of unique project titles: 108
After dropping duplicates 108
Df shape: (108, 7)
Index(['agency', 'title', 'limits_description', 'type', 'completion\nyear',
       'total_cost\n_$1,000s_', 'funding_sources'],
      dtype='object')


### MTC
* 133 projects but point and line map returns 240 ish rows.
* Projects that are in different locations can have the same name split by open year...
* I guess the geospatial projects are split into various phases while the regular pandas df is just the project in its entirety.
* https://www.planbayarea.org/2050-plan/final-plan-bay-area-2050/final-supplemental-reports/interactive-transportation-project-list
* https://opendata.mtc.ca.gov/datasets/MTC::2023-transportation-improvement-program-projects-point/about
* https://opendata.mtc.ca.gov/datasets/MTC::plan-bay-area-2050-transportation-projects-line/about
* https://data.bayareametro.gov/Projects/Plan-Bay-Area-2050-Transportation-Project-List/y7ka-jksz

In [36]:
def mtc_lrtp():
    # Open rest server data
    layer_list = ["planbayarea2050_rtp_line", "planbayarea2050_rtp_point"]
    url_pt_1 = "https://services3.arcgis.com/i2dkYWmb4wHvYPda/ArcGIS/rest/services/"
    url_pt_2 = "/FeatureServer/0/query?where=1%3D1&objectIds=&time=&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&resultType=none&distance=0.0&units=esriSRUnit_Meter&relationParam=&returnGeodetic=false&outFields=*+&returnGeometry=true&featureEncoding=esriDefault&multipatchOption=xyFootprint&maxAllowableOffset=&geometryPrecision=&outSR=&defaultSR=&datumTransformation=&applyVCSProjection=false&returnIdsOnly=false&returnUniqueIdsOnly=false&returnCountOnly=false&returnExtentOnly=false&returnQueryGeometry=false&returnDistinctValues=false&cacheHint=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&having=&resultOffset=&resultRecordCount=&returnZ=false&returnM=false&returnExceededLimitFeatures=true&quantizationParameters=&sqlFormat=none&f=pgeojson&token="
    gdf = open_rest_server(url_pt_1, url_pt_2, layer_list)

    # Open all MTC projects. Not all projects are mappable.
    all_projects_url = "https://data.bayareametro.gov/resource/y7ka-jksz.json?$query=SELECT%0A%20%20%60plan_strategy%60%2C%0A%20%20%60rtpid%60%2C%0A%20%20%60title%60%2C%0A%20%20%60scope%60%2C%0A%20%20%60open_period%60%2C%0A%20%20%60funding_millions_yoe%60%2C%0A%20%20%60county%60"
    all_projects = pd.read_json(all_projects_url)

    # Merge info from all projects to gdf
    gdf = pd.merge(
        gdf, all_projects, how="inner", left_on=["proj_title"], right_on=["title"]
    )

    gdf = gdf.drop(columns=["title"])

    # Find projects that are unmappable
    mtc_geometry_titles = set(gdf.proj_title.unique().tolist())
    mtc_all_projects_titles = set(all_projects.title.unique().tolist())
    missing_projects = list(mtc_all_projects_titles - mtc_geometry_titles)

    # Add back in the  unmappable projects
    all_projects = all_projects[all_projects.title.isin(missing_projects)].reset_index(
        drop=True
    )
    all_projects = all_projects.rename(columns={"title": "proj_title"})
    final = pd.concat([all_projects, gdf])

    # Correct typo
    final = final.rename({"mode detai": "mode_detail"})

    # Set geometry again
    final = final.set_geometry("geometry").set_crs(geography_utils.WGS84)
    
    # Same project is split across multiple rows. Divide out project cost
    final = correct_project_cost(final,'proj_title', 'funding_millions_yoe')
    return final

In [37]:
mtc_og = mtc_lrtp()



In [38]:
give_info(mtc_og, 'proj_title', 'total_project_cost')

Minor Roadway Improvements | Regional                                           36
Express Lanes | Regional                                                        32
Other Investments to Expand & Modernize the Regional Rail Network | Regional    15
Bay Area Forward Program | Regional                                              8
Rapid Bus | AC Transit | Modernization                                           8
Name: proj_title, dtype: int64
# of unique project titles: 133
After dropping duplicates 133
Df shape: (282, 20)
Index(['plan_strategy', 'rtpid', 'proj_title', 'scope', 'open_period',
       'county', 'objectid', 'map_id', 'subcompone', 'open_year', 'strategy',
       'rtp_id', 'mode_', 'mode_detai', 'investment', 'shape__length',
       'geometry', 'layer_name', 'total_project_cost',
       'total_project_cost_note'],
      dtype='object')


In [39]:
def harmonize_mtc():
    df = mtc_lrtp()
    df = harmonizing_lrtp(
        df,
        project_name_col="proj_title",
        project_description_col="subcompone",
        project_category_col="plan_strategy",
        project_cost_col="funding_millions_yoe",
        location_col="geometry",
        county_col="county",
        city_col="",
        project_year_col="open_year",
        phase_col="",
        program="MTC",
        data_source="LRTP",
        note_cols=[
            "strategy",
            "layer_name",
            "mode_detai",
        ],
        cost_in_millions=False,
    )

    return df

In [40]:
harmonized_mtc = harmonize_mtc()



### Sacramento Area Council of Governments (SACOG)
* https://www.sacog.org/sites/main/files/file-attachments/2020_mtp-scs.pdf?1580330993

In [41]:
# sacog = to_snakecase(gpd.read_file("https://services.sacog.org/hosting/rest/services/Hosted/2040_Major_Stops/FeatureServer/0/query?outFields=*&where=1%3D1&f=geojson"))

In [42]:
def sacog_lrtp():

    df = to_snakecase(pd.read_excel(f"{LRTP_GCS}SACOG.xlsx"))
    
    # Delete embedded headers
    df = delete_embedded_headers(df, "description", "Description")
    
    # Delete columns with all nulls
    df = df.dropna(axis=1, how='all')
    
    # Drop duplicates
    df = df.drop_duplicates().reset_index(drop = True)
    return df

In [43]:
sacog = sacog_lrtp()

96 rows are headers


In [44]:
sacog.columns

Index(['id', 'status__planned,_programmed_or_project_development_only_',
       'county', 'lead_agency', 'budget_category', 'title', 'description',
       'total_project_cost__2018_dollars_',
       'year_of_expenditure_cost_for_planned_projects', 'completion_timing'],
      dtype='object')

In [45]:
give_info(sacog, 'title', 'description')

Safety Improvements                            2
Connect Card Implementation                    2
Aux Lane Project: EB Latrobe Road              1
Bus Maintenance Facility #1 Rehabilitation     1
Name: title, dtype: int64
# of unique project titles: 1599
After dropping duplicates 1601
Df shape: (1601, 10)
Index(['id', 'status__planned,_programmed_or_project_development_only_',
       'county', 'lead_agency', 'budget_category', 'title', 'description',
       'total_project_cost__2018_dollars_',
       'year_of_expenditure_cost_for_planned_projects', 'completion_timing'],
      dtype='object')


In [46]:
def harmonize_sacog():
    df = sacog_lrtp()
    df = harmonizing_lrtp(
        df,
        project_name_col="title",
        project_description_col="description",
        project_category_col="budget_category",
        project_cost_col="total_project_cost__2018_dollars_",
        location_col="",
        county_col="county",
        city_col="",
        project_year_col="completion_timing",
        program="SACOG",
        phase_col="",
        data_source="LRTP",
        note_cols=[
            "year_of_expenditure_cost_for_planned_projects",
            "status__planned,_programmed_or_project_development_only_",
        ],
        cost_in_millions=True,
    )

    return df

In [47]:
sacog_harmonized = harmonize_sacog()

96 rows are headers




### Southern California Association of Governments (SCAG) 

In [48]:
def scag_lrtp():
    sheets_list = ["Table 1", "Table 2"]
    dict_df = pd.read_excel(f"{LRTP_GCS}SCAG.xlsx", sheet_name=sheets_list)
    
    df1 = to_snakecase(dict_df.get("Table 1"))
    # Drop columns where everything is NA
    df1 = df1.dropna(axis=1, how='all')
    
    df2 = to_snakecase(dict_df.get("Table 2"))
    df2 = df2.dropna(axis=1, how='all')
    
    # Concat
    df3 = pd.concat([df1,df2])
    
    df4 = delete_embedded_headers(df3, "rtp_id", "RTP ID")
    
    df4 = df4.dropna(subset = ['description','lead_agency','project_cost__$1,000s_']).reset_index(drop = True)
    
    # No project title
    df4['project_title'] = (df4.description + '-' + df4.lead_agency).fillna("No Title")
    
    # Project cost is in thousands.
    # Multiple out
    df4["project_cost_millions"] = df4["project_cost__$1,000s_"]* 1_000
    return df4

In [49]:
scag = scag_lrtp()

360 rows are headers


In [50]:
give_info(scag, 'project_title', 'route_name')

WIDEN FROM 2 TO 4 LANES-RIVERSIDE COUNTY      34
WIDEN FROM 2 TO 4 LANES-PERRIS                24
WIDEN FROM 2 TO 6 LANES-RIVERSIDE COUNTY      20
WIDEN FROM 2 TO 4 LANES-DESERT HOT SPRINGS    18
WIDEN FROM 4 TO 6 LANES-PALM DESERT           17
Name: project_title, dtype: int64
# of unique project titles: 2668
After dropping duplicates 2850
Df shape: (2952, 13)
Index(['system', 'lead_agency', 'rtp_id', 'route_#', 'route_name', 'from',
       'to', 'description', 'completion_year', 'project_cost__$1,000s_',
       'county', 'project_title', 'project_cost_millions'],
      dtype='object')


In [51]:
def harmonize_scag():
    df = scag_lrtp()
    df = harmonizing_lrtp(
        df,
        project_name_col="project_title",
        project_description_col="description",
        project_category_col="system",
        project_cost_col="project_cost_millions",
        location_col="",
        county_col="county",
        city_col="",
        project_year_col="completion_year",
        program="SCAG",
        phase_col="",
        data_source="LRTP",
        note_cols=[
            "lead_agency",
            "route_#",
            "route_name",
            "from",
            "to"
        ],
        cost_in_millions=True,
    )

    return df

In [52]:
scag_harmonized = harmonize_scag()

360 rows are headers




In [53]:
# scag.head()

In [54]:
# scag_harmonized.head()

### SANDAG
* https://www.arcgis.com/home/item.html?id=ba8b2a6247ef463cb7411f039959f3ee
* https://www.arcgis.com/apps/mapviewer/index.html?webmap=ba8b2a6247ef463cb7411f039959f3ee

In [55]:
columns_to_keep = [
    "project_title",
    "lead_agency",
    "project_year",
    "project_category",
    "grant_program",
    "project_description",
    "total_project_cost",
    "location",
    "city",
    "county",
    "data_source",
    "notes",
]

In [56]:
def sandag_lrtp():
    sandag_layers_list = [
        "Complete_Corridors_Connectors_Intersections_Ramps_RP2021",
        "Complete_Corridors_Highway_RP2021",
        "Transit_Leap_RP2021",
        "Adopted_Regional_Bike_Network_RP2021",
        "Mobility_Hubs_and_Flexible_Fleets",
        "Complete_Corridors_Regional_Arterials",
        "Goods_Movement",
    ]

    sandag_url_pt1 = (
        "https://services1.arcgis.com/HG80xaIVT1z1OdO5/ArcGIS/rest/services/"
    )
    sandag_url_pt2 = "/FeatureServer/0/query?where=1%3D1&objectIds=&time=&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&resultType=none&distance=0.0&units=esriSRUnit_Meter&relationParam=&returnGeodetic=false&outFields=*+&returnGeometry=true&featureEncoding=esriDefault&multipatchOption=xyFootprint&maxAllowableOffset=&geometryPrecision=&outSR=&defaultSR=&datumTransformation=&applyVCSProjection=false&returnIdsOnly=false&returnUniqueIdsOnly=false&returnCountOnly=false&returnExtentOnly=false&returnQueryGeometry=false&returnDistinctValues=false&cacheHint=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&having=&resultOffset=&resultRecordCount=&returnZ=false&returnM=false&returnExceededLimitFeatures=true&quantizationParameters=&sqlFormat=none&f=pgeojson&token="
    sandag = open_rest_server(sandag_url_pt1, sandag_url_pt2, sandag_layers_list)
   
    # Same project is split across multiple rows. Divide out project cost
    sandag.cost2020m = sandag.cost2020m.str.replace("$","").str.replace("N/A","").apply(pd.to_numeric, errors="coerce")
    sandag = correct_project_cost(sandag,'project_name', 'cost2020m')
    
    return sandag

In [57]:
sandag = sandag_lrtp()



In [58]:
sandag.loc[sandag.project_name == "I-15 (SR 52)"][['project_name','total_project_cost','description']]

Unnamed: 0,project_name,total_project_cost,description
5,I-15 (SR 52),181.0,West to North and South to East
41,I-15 (SR 52),65.33,North to West and East to South
42,I-15 (SR 52),65.33,North to East and West to South
43,I-15 (SR 52),65.33,South to West and East to North


In [59]:
give_info(sandag, 'project_name', 'total_project_cost')

N/A                  22
I-15 (I-8)            4
I-15 (SR 52)          4
I-805 (I-8)           4
Commuter Rail 398     3
Name: project_name, dtype: int64
# of unique project titles: 309
After dropping duplicates 315
Df shape: (416, 55)
Index(['objectid', 'type', 'freeway', 'intersecti', 'caption', 'project_id',
       'phase', 'direction', 'project_id_1', 'year_built', 'category',
       'project_name', 'description', 'aqc_2016_1', 'aqc_2020_1', 'pricmcp',
       'conncmcp', 'status', 'categoría', 'corredor', 'nombre_del_proyecto',
       'descripción', 'corredores_de_conexión', 'geometry', 'layer_name',
       'corridor_i', 'type_1', 'existing', 'limits', 'description_1',
       'shape__length', 'route', 'routetype', 'route_desc', 'rp_2021_id',
       'rp_2021_id_1', 'primary_corridor', 'connecting_corridor_s_',
       'capital_cost___2020__millions', 'name', 'mohubtype', 'mohubsize',
       'transitservices', 'globalid', 'icon_url', 'shape__area',
       'mohubtype_sp', 'mohubsize_sp'

In [60]:
def harmonize_sandag():
    df = sandag_lrtp()
    df = harmonizing_lrtp(
        df,
        project_name_col="project_name",
        project_description_col="description",
        project_category_col="category",
        project_cost_col="cost2020m",
        location_col="geometry",
        county_col="",
        city_col="",
        project_year_col="phase",
        phase_col="status",
        program="SANDAG",
        data_source="LRTP",
        note_cols=[
            "aqc_2016_1",
            "aqc_2020_1",
            "pricmcp",
            "conncmcp",
            "layer_name",
            "corridor_i",
            "type_1",
            "existing",
            "limits",
            "description_1",
            "route",
            "routetype",
            "route_desc",
            "rp_2021_id",
            "rp_2021_id_1",
            "capital_cost___2020__millions",
        ],
        cost_in_millions=True,
    )

    return df

### SBCAG

In [61]:
def sbcag_lrtp():
  
    df = to_snakecase(pd.read_excel(f"{LRTP_GCS}SBCAG.xlsx"))
    drop_columns = ['unnamed:_2', 'project_type', 'unnamed:_4','unnamed:_6', 'unnamed:_8', 'unnamed:_9',
           'unnamed:_10', 'unnamed:_13',]
    df = df.drop(columns = drop_columns)
    df = delete_embedded_headers(df, "description", "Description")
    df = df.dropna(subset = ['description']).reset_index(drop = True)
    
    df = df.drop_duplicates(subset = ['phase','project_title','phase', 'total_cost__$000s_']).reset_index(drop = True)
    return df 

In [62]:
sbcag = sbcag_lrtp()

65 rows are headers


In [63]:
give_info(sbcag, 'project_title', 'total_cost__$000s_')

SB-PL-15: Upper De la Vina St Gap Closure and Safe Crossings              2
CT-1: SR 246 Passing Lanes – Planting Mitigation (FTIP CT93)(EA 0C641)    1
SM-PL-10: A Street Widening                                               1
Sol-PL-6: Fredensborg Canyon Rd/Adobe Creek Culvert Replacement           1
Sol-PL-5: Solvang School Sidewalk Project                                 1
Name: project_title, dtype: int64
# of unique project titles: 418
After dropping duplicates 419
Df shape: (419, 7)
Index(['type', 'project_title', 'phase', 'description',
       'primary_funding_source_s_', 'year', 'total_cost__$000s_'],
      dtype='object')


###  SJCOG

In [133]:
def sjcog_lrtp():
    sheets_list = ["1 Mainline", "2 Interchanges", "3 Reg Roadways",
              "4 RR Xings", "5 Bus Transit", "6 Rail Corridor",
              "7 Airports", "8 Active Trans Facilities", "9 TCMs",
              "10 Ops & Maint",]
    dict_df = pd.read_excel(f"{LRTP_GCS}SJCOG.xlsx", sheet_name=sheets_list)
    df1 = to_snakecase(dict_df.get("1 Mainline"))
    df2 = to_snakecase(dict_df.get("2 Interchanges"))
    df3 = to_snakecase(dict_df.get( "3 Reg Roadways"))
    df4 = to_snakecase(dict_df.get("4 RR Xings"))
    df5 = to_snakecase(dict_df.get("5 Bus Transit"))
    df6 = to_snakecase(dict_df.get("6 Rail Corridor"))
    df7 = to_snakecase(dict_df.get("7 Airports"))
    df8 = to_snakecase(dict_df.get("8 Active Trans Facilities"))
    df9 = to_snakecase(dict_df.get("9 TCMs"))
    df10 = to_snakecase(dict_df.get("10 Ops & Maint"))
    
    concat1 = pd.concat([df1,df2,df3,df4,df5,df6,df7,df8,df9,df10], axis= 0)
    
    sjcog_cols = [ '_2018_rtp_mpo_id', 'ctips_id_#', 
       'jurisdiction', 'facility_name_route',
       'project_description', 'project_limits', 'total',
      'ftip_programming', 'nepa_approval',
       'open_to_traffic', 
       'completion', 'completion_date', 'facility_name_route_1',
       'project_description_1', 'milestone_years_1', 'project_name']
    
    
    concat1 = concat1.dropna(subset = ['project_description','_2018_rtp_mpo_id',]).reset_index(drop = True)
    
    # Some projects have no titles, create them manually
    concat1['title_manual'] = concat1.project_limits + '-' + concat1.jurisdiction + '-' + concat1.facility_name_route # + ' Cost-' + concat1.total.astype(int).astype(str)
    concat1.title_manual = concat1.title_manual.fillna('No Title')
    concat1.project_name = concat1.project_name.fillna(sjcog.title_manual)
    
    concat1 = concat1[sjcog_cols]
    
    return concat1

In [134]:
sjcog = sjcog_lrtp()

In [135]:
give_info(sjcog, 'project_name', 'total')

City of Lodi-Lodi-Grapeline Capital                                                                         8
Various-Caltrans-Various locations                                                                          8
No Title                                                                                                    8
City streets, various locations-Stockton-Regional Surface Transportation Program (STP) Lump Sum Projects    2
Along Northwest side of I-5 from Brookhurst Blvd to Stewart Road-Lathrop-Golden Valley Parkway              2
Name: project_name, dtype: int64
# of unique project titles: 236
After dropping duplicates 255
Df shape: (262, 16)
Index(['_2018_rtp_mpo_id', 'ctips_id_#', 'jurisdiction', 'facility_name_route',
       'project_description', 'project_limits', 'total', 'ftip_programming',
       'nepa_approval', 'open_to_traffic', 'completion', 'completion_date',
       'facility_name_route_1', 'project_description_1', 'milestone_years_1',
       'project_name'],


### Stancog

In [None]:
def stancog_lrtp():

    df = to_snakecase(pd.read_excel(f"{LRTP_GCS}STANCOG.xlsx"))
    df = embedded_column_names(df, 1)
    keep_columns = [
     'Jurisdiction',
     'Location',
     'Project Limits',
     'Description',
     'Total Cost',
     'Open to Traffic',
     'Funding Source',
     'System Preserv.',
     'Capacity Enhance.',
     'Safety',
     'Oper.',
     'Complete Streets',
     'Active\nTransporta tion',
     'Transit',
     'Other']
    
    df = df[keep_columns]
    
    df = to_snakecase(df)
    
    df = df.dropna(subset = ['description']).reset_index(drop = True)
    
    df['title'] = (df.location + "-" + df.description).fillna("No Title")
    
    df = delete_embedded_headers(df, 'location', 'Location')
    
    df = df.drop_duplicates(subset = ['title','location','jurisdiction','total_cost','open_to_traffic']).reset_index(drop = True)
    return df

In [None]:
stancog = stancog_lrtp()

In [None]:
give_info(stancog, 'title', 'total_cost')

### Tahoe
* Open data portal: https://www.tahoeopendata.org/search?tags=transportation
* https://www.tahoeopendata.org/datasets/TRPA::2020-regional-transportation-plan-projects/about

In [None]:
def tmpo_lrtp():
    tahoe_url = "https://maps.trpa.org/server/rest/services/Datadownloader_Transportation/MapServer/19/query?where=1%3D1&text=&objectIds=&time=&timeRelation=esriTimeRelationOverlaps&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&distance=&units=esriSRUnit_Foot&relationParam=&outFields=*+&returnGeometry=true&returnTrueCurves=false&maxAllowableOffset=&geometryPrecision=&outSR=&havingClause=&returnIdsOnly=false&returnCountOnly=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&returnZ=false&returnM=false&gdbVersion=&historicMoment=&returnDistinctValues=false&resultOffset=&resultRecordCount=&returnExtentOnly=false&sqlFormat=none&datumTransformation=&parameterValues=&rangeValues=&quantizationParameters=&featureEncoding=esriDefault&f=geojson"
    gdf = to_snakecase(gpd.read_file(tahoe_url))
    gdf.complete_year = gdf.complete_year.astype(int)

    # Filter out projects that are not yet completed
    gdf2 = gdf[gdf.complete_year > 2022].reset_index(drop=True)

    gdf2["project_year"] = (
        gdf2.start_year.astype(str) + "-" + gdf.complete_year.astype(str)
    )
    return gdf2

In [None]:
def harmonize_tahoe():
    df = tmpo_lrtp()
    df = harmonizing_lrtp(
        df,
        project_name_col="project_name",
        project_description_col="description",
        project_category_col="category",
        project_cost_col="estimated_cost",
        location_col="geometry",
        county_col="",
        city_col="",
        project_year_col="project_year",
        program="TMPO",
        phase_col="phase",
        data_source="LRTP",
        note_cols=[
            "implementer",
            "funding_type",
            "plan_name",
            "label",
            "financial_status",
            "url",
        ],
        cost_in_millions=True,
    )

    return df

### TCAG

In [167]:
def tcag_lrtp():
    df = to_snakecase(pd.read_excel(f"{LRTP_GCS}TCAG.xlsx"))
    def completion_2035(row):
        if row.completed_by_2035_y_n == "Y":
            return "2035"
    df["completion_year"] = df.apply(lambda x: completion_2035(x), axis=1)
    
    return df

In [168]:
tcag = tcag_lrtp()

In [169]:
tcag.columns

Index(['project_id__if_applicable_', 'jurisdiction', 'project_title',
       'project_description', 'project_category', 'completed_by_2035_y_n',
       'local', 'regional', 'state', 'federal', 'cost_x_1,000',
       'completion_year'],
      dtype='object')

In [170]:
give_info(tcag, 'project_title', 'cost_x_1,000')

Bike/Ped Improvements next 9 yrs second half of measure R    9
Bike/Ped Improvements next 9 years                           9
Purchase Buses                                               4
SR 99 Widen existing roadway                                 4
Riggin Avenue Widen existing roadway                         4
Name: project_title, dtype: int64
# of unique project titles: 272
After dropping duplicates 321
Df shape: (340, 12)
Index(['project_id__if_applicable_', 'jurisdiction', 'project_title',
       'project_description', 'project_category', 'completed_by_2035_y_n',
       'local', 'regional', 'state', 'federal', 'cost_x_1,000',
       'completion_year'],
      dtype='object')


In [171]:
tcag[tcag.project_title == "Bike/Ped Improvements next 9 yrs second half of measure R"]

Unnamed: 0,project_id__if_applicable_,jurisdiction,project_title,project_description,project_category,completed_by_2035_y_n,local,regional,state,federal,"cost_x_1,000",completion_year
61,,Dinuba,Bike/Ped Improvements next 9 yrs second half of measure R,"used $49,000 for alta heritage; $50K used for El Monte Way,",Active Transportation,Y,,X,,,226.0,2035
62,,Exeter,Bike/Ped Improvements next 9 yrs second half of measure R,"Used $485,980. on Belmont Project",Active Transportation,Y,,X,,,14.02,2035
63,,Lindsay,Bike/Ped Improvements next 9 yrs second half of measure R,,Active Transportation,Y,,X,,,500.0,2035
64,,Porterville,Bike/Ped Improvements next 9 yrs second half of measure R,"used $500,000 on Bike/Ped Various",Active Transportation,Y,,X,,,0.0,2035
65,,Tulare,Bike/Ped Improvements next 9 yrs second half of measure R,,Active Transportation,Y,,X,,,500.0,2035
66,,County,Bike/Ped Improvements next 9 yrs second half of measure R,,Active Transportation,Y,,X,,,500.0,2035
67,,Visalia,Bike/Ped Improvements next 9 yrs second half of measure R,"Used on Existing Bike Projects above $500,000",Active Transportation,Y,,X,,,0.0,2035
68,,Woodlake,Bike/Ped Improvements next 9 yrs second half of measure R,"Used $130,000 City plaza; $173,200 No Valencia; 115,000 NS Valencia",Active Transportation,Y,,X,,,81.8,2035
103,,Dinuba,Bike/Ped Improvements next 9 yrs second half of measure R,"used $49,000 for alta heritage; $50K used for El Monte Way,",Active Transportation,Y,,X,,,226.0,2035


## Congestion

#### Lost

In [None]:
stop

In [None]:
lost = harmonization_utils.load_lost()

In [None]:
lost.head()

In [None]:
def harmonize_lost():
    df = harmonization_utils.load_lost()
    df = harmonizing_lrtp(
        df,
        project_name_col="project_title",
        project_description_col="project_description",
        project_category_col="project_category",
        project_cost_col="cost__in_millions_",
        location_col="",
        county_col="county",
        city_col="city",
        project_year_col="",
        program="",
        phase_col="phase",
        data_source="LOST",
        note_cols=["measure", 'estimated_lost_funds',
       'estimated_federal_funds', 'estimated_state_funds',
       'estimated_local_funds', 'estimated_other_funds',"notes"],
        cost_in_millions=False,
    )
    
    
    return df

In [None]:
lost_harmonized = harmonize_lost()

In [None]:
lost_harmonized.head()

In [None]:
def all_mpo():
    tahoe = harmonize_tahoe()
    sandag = harmonize_sandag()
    mtc = harmonize_mtc()
    lost = harmonize_lost()
    sacog = harmonize_sacog()
    scag = harmonize_scag()
    
    df = pd.concat([tahoe, mtc, sandag, lost, sacog, scag])

    return df

In [None]:
all_mpo_df = all_mpo()

In [None]:
all_mpo_df.groupby(["grant_program"]).size()

In [None]:
congest_keywords = [
    "congestion pricing",
    "variablly priced",
    "variable tolls",
    "express lane",
    "value",
    "rush hour",
    "cordon",
    "dynamic pricing",
    "dynmically priced",
    "high occupancy toll",
    "high-occupancy"
    "mobility pricing",
    "occupancy",
    "toll lane",
    "performance pricing",
    "peak travel",
    "managed lane",
    "tollway",
    "transponder",
    "express toll",
    "fixed pricing"
]

In [None]:
def lower_case(df, columns_to_search: list):
    """
    Lowercase certain columns
    """
    new_columns = []
    for i in columns_to_search:
        df[f"lower_case_{i}"] = (df[i].str.lower().fillna("none")
                                 .str.replace("-","")
                                 .str.replace(".","")
                                 .str.replace(":","")
                                )
        new_columns.append(f"lower_case_{i}")

    return df,new_columns

In [None]:
def find_keywords(df, columns_to_search:list, keywords_search:list):
    df2, lower_case_cols_list = lower_case(df, columns_to_search)
    
    keywords_search = f"({'|'.join(keywords_search)})"
    
    for i in lower_case_cols_list:
        df2[f"{i}_keyword_search"] = df2[i].str.extract(keywords_search).fillna("not congestion")
    
    return df2

In [None]:
# test = find_keywords(all_mpo_df,  ["project_title", "project_description", "notes"], congest_keywords)

In [None]:
def filter_cordon(df, columns_to_search:list, keywords_search:list, save_to_gcs: bool = False):

    # Filter out for Cordon 
    df = find_keywords(df, columns_to_search, keywords_search)
    df2 = (df[(df.lower_case_project_title_keyword_search != "not congestion")
        | (df.lower_case_project_description_keyword_search != "not congestion")]).reset_index(drop=True)
    
    # Delete out HOV
    projects_to_delete = ["SR 17 Corridor Congestion Relief in Los Gatos",
                         "Interstate 380 Congestion Improvements"]
    df2 = df2[~df2.project_title.isin(projects_to_delete)].reset_index(drop = True)

    # Drop invalid geometries
    gdf = df2[~df2.geometry.is_empty].reset_index(drop = True)
    gdf = gdf[gdf.geometry.is_valid].reset_index(drop = True)
    
    gdf = gdf.set_geometry("geometry")
    gdf = gdf.fillna(gdf.dtypes.replace({"float64": 0.0, "object": "None"}))

    # One version that's a df
    columns_to_drop = ['lower_case_project_title', 'lower_case_project_description']
    df2 = df2.drop(columns = columns_to_drop + ['geometry'])
    # df2 = df2.fillna(df.dtypes.replace({"float64": 0.0, "object": "None"}))
    
    if save_to_gcs:
        df2.to_excel(f"{harmonization_utils.GCS_FILE_PATH}LRTP/congestion_pricing_sample.xlsx", sheet_name="Sheet_name_1", index=False)
        gdf.to_file("./congestion_pricing_sample.geojson", driver="GeoJSON")
        
    return gdf, df2

In [None]:
cordon_gdf, cordon_df = filter_cordon(all_mpo_df, ["project_title", "project_description",], congest_keywords)

In [None]:
len(cordon_df.drop_duplicates(subset = ['project_title']))

In [None]:
cordon_df.groupby(["grant_program"]).agg({"project_title":"nunique"})

In [None]:
cordon_df.grant_program.value_counts()

In [None]:
cordon_df.drop_duplicates(subset = ['total_project_cost','project_title',]).groupby(["grant_program"]).agg({'total_project_cost':'sum'})

In [None]:
cols_to_drop = ['city','county', 'notes', 'phase','project_year',]

In [None]:
cordon_df.drop(columns =cols_to_drop).drop_duplicates(subset = ['project_title', 'project_description'])

In [None]:
# cordon_gdf.explore('project_title', cmap= 'tab20c', style_kwds={'weight':6}, legend = False)