## Long Range Transportation Plan

In [59]:
import _harmonization_utils as harmonization_utils
import _specific_list_utils as list_utils
import geopandas as gpd
import numpy as np
import pandas as pd
from calitp_data_analysis.sql import to_snakecase

In [60]:
import fsspec

In [61]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [62]:
LRTP_GCS = f"{harmonization_utils.GCS_FILE_PATH}LRTP/"

### General Functions

In [63]:
def give_info(df, project_title: str, other_descriptive_col: str):
    print(df[project_title].value_counts().head())
    print(f"# of unique project titles: {df[project_title].nunique()}")
    print(
        f"After dropping duplicates using {project_title} and {other_descriptive_col}: {len(df.drop_duplicates(subset = [project_title, other_descriptive_col]))}"
    )
    print(f"Df shape: {df.shape}")
    print(df.columns)

In [64]:
def harmonizing_lrtp(
    df,
    project_name_col: str,
    project_description_col: str,
    project_category_col: str,
    project_cost_col: str,
    location_col: str,
    county_col: str,
    city_col: str,
    project_year_col: str,
    phase_col: str,
    program: str,
    data_source: str,
    note_cols: list,
    cost_in_millions: bool = True,
):
    """
    Take a dataset and change the column names/types to
    the same names and formats.
    """
    rename_columns = {
        project_name_col: "project_title",
        phase_col: "phase",
        project_description_col: "project_description",
        project_category_col: "project_category",
        project_cost_col: "total_project_cost",
        location_col: "geometry",
        county_col: "county",
        city_col: "city",
        project_year_col: "project_year",
    }
    # Rename columns
    df = df.rename(columns=rename_columns)

    # Coerce cost/fund columns to right type
    cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()

    for i in cost_columns:
        df[i] = df[i].apply(pd.to_numeric, errors="coerce")

    # Add MPO & grant program
    df["lead_agency"] = program
    df["grant_program"] = f"{program} {data_source}"

    # Add data source
    df["data_source"] = program

    # Divide cost columns by millions
    # If bool is set to True
    if cost_in_millions:
        for i in cost_columns:
            df[i] = df[i].divide(1_000_000)

    # Create columns even if they don't exist, just to harmonize
    # before concatting.
    create_columns = [
        "county",
        "city",
        "notes",
        "project_year",
        "project_category",
        "phase",
        "project_description",
    ]

    for column in create_columns:
        if column not in df:
            df[column] = "None"
    if "geometry" not in df:
        df["geometry"] = None
    if "total_project_cost" not in df:
        df["total_project_cost"]: 0

    # Create notes
    df = create_notes(df, note_cols)

    columns_to_keep = [
        "project_title",
        "lead_agency",
        "project_year",
        "project_category",
        "grant_program",
        "project_description",
        "phase",
        "total_project_cost",
        "geometry",
        "city",
        "county",
        "data_source",
        "notes",
    ]

    df = df[columns_to_keep]

    return df

In [65]:
def embedded_column_names(df, data_start: int) -> pd.DataFrame:
    """
    Some excel sheets have headers and column names
    embedded in the dataframe. Take them out.

    Args:
        data_start: the row number the column names begin.
    """
    # Delete header
    df = df.iloc[data_start:].reset_index(drop=True)
    # The first row contains column names - update it to the column
    df.columns = df.iloc[0]

    # Drop the first row as they are now column names
    df = df.drop(df.index[0]).reset_index(drop=True)

    return df

In [66]:
def create_notes(df, note_cols: list):
    """
    Combine mulitple columns together
    into a single column to minimize space.
    """
    prefix = "_"
    for column in note_cols:
        df[f"{prefix}{column}"] = df[column].astype(str)
    note_cols = [prefix + sub for sub in note_cols]

    # https://stackoverflow.com/questions/65532480/how-to-combine-column-names-and-values
    def combine_notes(x):
        return ", ".join([col + ": " + x[col] for col in note_cols])

    df["notes"] = df.apply(combine_notes, axis=1)
    df.notes = df.notes.str.replace("_", " ")

    return df

In [67]:
def open_rest_server(url_pt_1: str, url_pt_2: str, layer_name: list):
    """
    Open up data that is availably publicly via ArcGis
    """
    full_gdf = pd.DataFrame()
    for i in layer_name:
        gdf = to_snakecase(gpd.read_file(f"{url_pt_1}{i}{url_pt_2}"))
        gdf["layer_name"] = i
        full_gdf = pd.concat([full_gdf, gdf], axis=0)

    return full_gdf

In [68]:
def delete_embedded_headers(df, column: str, string_search: str) -> pd.DataFrame:
    """
    Some PDFS include the column names embedded mulitple times
    within the df. Delete them out.

    Example: Under the column 'description', delete the rows
    in which the value is 'description.' This signals that the row
    is just repeating the column name again.
    """
    headers = df[df[column].str.contains(string_search) == True]
    headers_index_list = headers.index.values.tolist()

    print(f"{len(headers_index_list)} rows are headers")

    df2 = df.drop(headers_index_list).reset_index(drop=True)
    return df2

In [69]:
def correct_project_cost(df, project_title_col: str, project_total_cost: str):
    """
    For some datasets, the same project
    (as determined by the same project name, cost,
    and source) is split across multiple rows.

    Ex: A project costs $500 million and is
    split on 5 rows by phase/location. Each row still lists
    the total  cost as $500 million, which is not accurate.
    This function will recalculate each of the row to list
    $100 mil as the total project cost
    """
    # Create a unique identifier
    df["unique_identifier"] = df[project_title_col] + df[project_total_cost].astype(str)

    # Create count for each project
    df["how_many_times_same_proj_appears"] = (
        df.groupby("unique_identifier").cumcount() + 1
    )

    # Find the total number of times a project title-cost appears.
    # Sort by descending and keep only the row with the highest level
    keep_cols = [
        project_title_col,
        "how_many_times_same_proj_appears",
        project_total_cost,
        "unique_identifier",
    ]
    df2 = (
        df[keep_cols]
        .sort_values(
            [project_title_col, "how_many_times_same_proj_appears"], ascending=False
        )
        .drop_duplicates(subset=["unique_identifier"])
    )

    # Create new funding estimate
    df2["new_proj_cost"] = (
        df2[project_total_cost] / df2["how_many_times_same_proj_appears"]
    )

    # Drop some columns
    df2 = df2.drop(
        columns=[
            project_title_col,
            project_total_cost,
            "how_many_times_same_proj_appears",
        ]
    )

    # Merge
    m1 = pd.merge(df, df2, how="inner", on="unique_identifier")

    # Clean up
    m1 = m1.drop(
        columns=[
            "unique_identifier",
            "how_many_times_same_proj_appears",
            project_total_cost,
        ]
    )
    m1[
        "total_project_cost_note"
    ] = "This is an estimate of how much the project cost, estimated by dividing the total project cost by how many times the project appears in the dataset."

    # Replace project cost
    m1 = m1.rename(columns={"new_proj_cost": "total_project_cost"})
    return m1

### Ambag

In [70]:
def ambag_lrtp():
    ambag1 = pd.read_excel(
        f"{LRTP_GCS}AMBAG_2045 Revenue Constrained Project List_Monterey County_by Project Type__022823.xlsx"
    )
    ambag2 = pd.read_excel(
        f"{LRTP_GCS}AMBAG_2045 Revenue Constrained Project List_San Benito County_by Project Type__022823.xlsx"
    )
    ambag3 = pd.read_excel(
        f"{LRTP_GCS}AMBAG_2045 Revenue Constrained Project List_Santa Cruz County_by Project Type__041923.xlsx"
    )

    # Move column names up
    ambag1 = embedded_column_names(ambag1, 1)
    ambag2 = embedded_column_names(ambag2, 1)
    ambag3 = embedded_column_names(ambag3, 1)

    # Add county
    ambag1["county"] = "Santa Cruz"
    ambag2["county"] = "Monterey"
    ambag3["county"] = "San Benito"

    # Conat & clean
    concat1 = pd.concat([ambag1, ambag2, ambag3], axis=0)
    concat1 = concat1.drop(columns=[2035.0, 2045.0, 2020.0, 2035.0])

    concat1 = to_snakecase(concat1)

    # Millions
    concat1["total_cost_millions"] = concat1["total_cost_\n_$_000s_"] * 1_000
    return concat1

In [71]:
ambag = ambag_lrtp()

In [72]:
give_info(ambag, "project", "project_description")

Citywide Bike Lanes                         2
The Alameda - Salinas Road Bike Route       2
Citywide Sidewalk Program                   2
Carmel to Pebble Beach Bike/Ped Facility    1
Second Street Bike Lane                     1
Name: project, dtype: int64
# of unique project titles: 277
After dropping duplicates using project and project_description: 280
Df shape: (280, 6)
Index(['ambag_id', 'project', 'project_description', 'total_cost_\n_$_000s_',
       'county', 'total_cost_millions'],
      dtype='object', name=0)


In [73]:
ambag.loc[ambag.project == "Citywide Bike Lanes"]

Unnamed: 0,ambag_id,project,project_description,total_cost_\n_$_000s_,county,total_cost_millions
133,MON-SOL075-SO,Citywide Bike Lanes,"Bike Lanes (2007 TIF M2, 2013 TIF M2); construct bike lanes citywide",1440,Santa Cruz,1440000
52,SC-SV-P41-SCV,Citywide Bike Lanes,Construction of additional bike lanes and paths citywide (including Green Hills).,2060,San Benito,2060000


In [74]:
def harmonize_ambag():
    df = ambag_lrtp()
    df = harmonizing_lrtp(
        df,
        project_name_col="project",
        project_description_col="project_description",
        project_category_col="",
        project_cost_col="total_cost_millions",
        location_col="",
        county_col="",
        city_col="",
        project_year_col="",
        program="AMBAG",
        phase_col="",
        data_source="LRTP",
        note_cols=["county"],
        cost_in_millions=True,
    )

    return df

### BCAG 

In [75]:
def bcag_lrtp():
    """
    Project cost is in thousands
    """
    df = pd.read_excel(f"{LRTP_GCS}BCAG.xls")
    drop_columns = [
        2018,
        2020,
        2030,
        2035,
        2040,
        "EXEMPT CODE",
        "PM 1",
        "PM 2",
        "PM 3",
    ]

    df = df.drop(columns=drop_columns)
    df = to_snakecase(df)

    # Correct cost
    # df.cost_estimate = df.cost_estimate * 1_000
    df.fund_estimate = (
        df.fund_estimate.str.replace("$", "")
        .str.replace(".", "")
        .str.replace("million", "")
        .apply(pd.to_numeric, errors="coerce")
        * 1_000_000
    )

    # create geometry
    df["geometry"] = gpd.GeoSeries.from_xy(df.x_coord, df.y_coord, crs="EPSG:4326")
    # Same project is split across multiple rows. Divide out project cost
    df = correct_project_cost(df, "title", "cost_estimate")
    df = df.set_geometry("geometry")

    return df

In [76]:
bcag = bcag_lrtp()

  df.fund_estimate.str.replace("$", "")


In [77]:
give_info(bcag, "title", "total_project_cost")

Local Highway Bridge Program (HBP Grouped)            68
Highway Safety Improvement Program (HSIP Grouped)     19
SR 70 Passing Lanes (Segment 3)                        2
Eaton Rd Widening                                      2
Lincoln Blvd Class 2                                   2
Name: title, dtype: int64
# of unique project titles: 153
After dropping duplicates using title and total_project_cost: 228
Df shape: (250, 24)
Index(['rtp_id', 'agency', 'ftip', 'rtp', 'project_type', 'title',
       'project_descr', 'project_id', 'fund_source', 'fund_estimate', 'status',
       'y_coord', 'x_coord', 'target_fiscal_year', 'fund_source_1',
       'fund_source_2', 'source', 'title_vi', 'non_title_vi', 'ej', 'non_ej',
       'geometry', 'total_project_cost', 'total_project_cost_note'],
      dtype='object')


In [78]:
def harmonize_bcag():
    df = bcag_lrtp()
    df = harmonizing_lrtp(
        df,
        project_name_col="title",
        project_description_col="project_descr",
        project_category_col="project_type",
        project_cost_col="total_project_cost",
        location_col="geometry",
        county_col="",
        city_col="",
        project_year_col="target_fiscal_year",
        program="BCAG",
        phase_col="status",
        data_source="LRTP",
        note_cols=[
            "fund_estimate",
            "fund_source",
            "agency",
            "total_project_cost_note",
        ],
        cost_in_millions=True,
    )

    return df

### Fresno COG

In [79]:
def fresnocog_lrtp():
    columns_to_drop = ["unnamed:_7", "unnamed:_8", "unnamed:_9"]
    df1 = to_snakecase(pd.read_excel(f"{LRTP_GCS}FRESNO_COG_UNconstrained.xlsx"))
    df1["financial_constraint"] = "unconstrained"

    df2 = to_snakecase(pd.read_excel(f"{LRTP_GCS}FRESNO_COG_Constrained.xlsx"))
    df2["financial_constraint"] = "constrained"

    concat1 = pd.concat([df1, df2], axis=0)
    concat1 = concat1.drop(columns=columns_to_drop)

    return concat1

In [80]:
fresno_cog = fresnocog_lrtp()

In [81]:
give_info(fresno_cog, "project_title", "estimated\ntotal_cost\n_$1,000_")

Herndon @ SR 99-Widen Undercrossing                            2
Willow-International to Copper Southbound: Widen to 3 Lanes    2
SHIELDS: GARFIELD to CHATEAU FRESNO                            2
Anchor Ave. Park Blvd to Adams                                 2
Adams Ave Reconstruction                                       2
Name: project_title, dtype: int64
# of unique project titles: 3128
After dropping duplicates using project_title and estimated
total_cost
_$1,000_: 3143
Df shape: (3147, 10)
Index(['agency', 'project_id', 'project_title', 'project_description',
       'project_type', 'estimated_open_to_traffic',
       'estimated\ntotal_cost\n_$1,000_', 'est__o2t_year',
       'est_total_project_cost', 'financial_constraint'],
      dtype='object')


In [82]:
fresno_cog.project_title = fresno_cog.project_title.fillna("None")

In [83]:
fresno_cog[fresno_cog.project_title.str.contains("Ventura, SR 41")]

Unnamed: 0,agency,project_id,project_title,project_description,project_type,estimated_open_to_traffic,"estimated\ntotal_cost\n_$1,000_",est__o2t_year,est_total_project_cost,financial_constraint
989,Fresno,FRE111312,"Ventura, SR 41 to SR 99 Widen from 4LU to 4 LD",Widen to 4 LN Divided Arterial,Streets & Roads - Capacity Increasing,2027-2031,3427.0,2028-01-01,3427000.0,constrained
2071,Fresno,FRE111312,"Ventura, SR 41 to SR 99 Widen from 4LU to 4 LD",Widen to 4 LN Divided Arterial(Measure C Project F in the Urban Regional Program),Streets & Roads-Capacity Increasing,2028,3427.0,NaT,,constrained


In [84]:
def harmonize_fresnocog():
    df = fresnocog_lrtp()
    df = harmonizing_lrtp(
        df,
        project_name_col="project_title",
        project_description_col="project_description",
        project_category_col="project_type",
        project_cost_col="est_total_project_cost",
        location_col="",
        county_col="",
        city_col="",
        project_year_col="estimated_open_to_traffic",
        program="Fresno COG",
        phase_col="status",
        data_source="LRTP",
        note_cols=[
            "financial_constraint",
        ],
        cost_in_millions=True,
    )

    return df

### KCAG

In [85]:
def kcag_lrtp():

    df = to_snakecase(pd.read_excel(f"{LRTP_GCS}KCAG.xlsx"))

    # No title column
    df["title"] = (df.category + "-" + df.description + "-" + df.location).fillna(
        "No Title"
    )

    # Some duplicates
    df = df.drop_duplicates(["location", "category", "description"]).reset_index(
        drop=True
    )

    # Create cost
    df["total_cost"] = 0

    return df

In [86]:
kcag = kcag_lrtp()

In [87]:
give_info(kcag, "title", "description")

No Title                                          26
Proposed Improvements-Overlay-Claudia Drive        1
Proposed Improvements-Overlay-Grand Street         1
Proposed Improvements-Overlay-Linda Lee Circle     1
Proposed Improvements-Overlay-Newbridge Drive      1
Name: title, dtype: int64
# of unique project titles: 59
After dropping duplicates using title and description: 59
Df shape: (84, 9)
Index(['category', 'jurisdiction', 'state_route', 'post_mile', 'location',
       'project_limits', 'description', 'title', 'total_cost'],
      dtype='object')


In [88]:
kcag[kcag.title == "Proposed Improvements-Pavement Maintenance-Various Locations"]

Unnamed: 0,category,jurisdiction,state_route,post_mile,location,project_limits,description,title,total_cost
4,Proposed Improvements,CORCORAN,,,Various Locations,,Pavement Maintenance,Proposed Improvements-Pavement Maintenance-Various Locations,0


In [89]:
def harmonize_kcag():
    df = kcag_lrtp()
    df = harmonizing_lrtp(
        df,
        project_name_col="title",
        project_description_col="description",
        project_category_col="category",
        project_cost_col="total_cost",
        location_col="",
        county_col="",
        city_col="",
        project_year_col="",
        program="KCAG",
        phase_col="status",
        data_source="LRTP",
        note_cols=[
            "location",
            "jurisdiction",
            "state_route",
            "post_mile",
            "project_limits",
        ],
        cost_in_millions=True,
    )

    return df

### KernCog

In [90]:
def kern_lrtp():
    to_keep = [
        "project_title",
        "scope",
        "yoe_w__new_revenue",
        "yoe_w_o_new_reven",
        "maint__inflation_savings",
    ]

    monetary_cols = [
        "yoe_w__new_revenue",
        "yoe_w_o_new_reven",
        "maint__inflation_savings",
    ]

    df = to_snakecase(pd.read_excel(f"{LRTP_GCS}KERNCOG.xlsx", sheet_name="Cleaned"))

    df = df[to_keep]

    df[monetary_cols] = df[monetary_cols] * 1_000

    df["cost"] = df.yoe_w__new_revenue

    df.cost = df.cost.fillna(df.yoe_w_o_new_reven)

    return df

In [91]:
kern = kern_lrtp()

In [92]:
# kern[kern.project_title == "Kern County ‐ Class II Bike Ln"].sort_values(['cost'])

In [93]:
# kern.sample(5)

In [94]:
give_info(kern, "project_title", "cost")

Kern County ‐ Class II Bike Ln        85
Bakersfield ‐ Class III Bike Blvd     85
Bakersfield ‐ Class II Bike Ln        78
Kern County ‐ Sidewalk Improvement    71
Kern County ‐ Class III Bike Blvd     35
Name: project_title, dtype: int64
# of unique project titles: 371
After dropping duplicates using project_title and cost: 1180
Df shape: (1428, 6)
Index(['project_title', 'scope', 'yoe_w__new_revenue', 'yoe_w_o_new_reven',
       'maint__inflation_savings', 'cost'],
      dtype='object')


In [95]:
def harmonize_kerncog():
    df = kern_lrtp()
    df = harmonizing_lrtp(
        df,
        project_name_col="project_title",
        project_description_col="scope",
        project_category_col="",
        project_cost_col="cost",
        location_col="",
        county_col="",
        city_col="",
        project_year_col="",
        program="Kern Cog",
        phase_col="",
        data_source="LRTP",
        note_cols=[
            "maint__inflation_savings",
        ],
        cost_in_millions=True,
    )

    return df

### Madera CTC

In [96]:
def madera_lrtp():
    sheets_list = [
        "Streets and Roads",
        "Maintenance",
        "Safety",
        "ATP",
        "Public Transit",
        "Aviation",
        "ITS",
        "Unconstrained",
    ]

    dict_df = pd.read_excel(
        f"{LRTP_GCS}Madera_CTC_cleaned.xlsx", sheet_name=sheets_list
    )
    df1 = dict_df.get(sheets_list[0])
    df2 = dict_df.get(sheets_list[1])
    df3 = dict_df.get(sheets_list[2])
    df4 = dict_df.get(sheets_list[3])
    df5 = dict_df.get(sheets_list[4])
    df6 = dict_df.get(sheets_list[5])
    df7 = dict_df.get(sheets_list[6])
    df8 = dict_df.get(sheets_list[7])

    concat1 = to_snakecase(pd.concat([df1, df2, df3, df4, df5, df6, df7, df8], axis=0))

    concat1 = concat1.drop(columns=["proje\nct_id"])
    return concat1

In [97]:
madera = madera_lrtp()

In [98]:
madera.groupby(["project_name", "total_cost"]).agg({"category": "count"}).sort_values(
    "category", ascending=False
).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,category
project_name,total_cost,Unnamed: 2_level_1
Various,1000000,3
Avenue 12,10000000,3
Road 36,563000,3
Purchase Transit Vehicle,300000,3
Road 16,197000,2


In [99]:
give_info(madera, "project_name", "total_cost")

Various                            20
Avenue 12                          10
Avenue 9                            8
State Route 41                      6
Purchase Transit Van (Electric)     5
Name: project_name, dtype: int64
# of unique project titles: 515
After dropping duplicates using project_name and total_cost: 735
Df shape: (765, 8)
Index(['category', 'agency', 'project_name', 'location', 'project_id',
       'description', 'opening_year', 'total_cost'],
      dtype='object')


In [100]:
# madera[madera.project_name == "Various"]

In [101]:
madera[madera.project_name == "Avenue 12"].head()

Unnamed: 0,category,agency,project_name,location,project_id,description,opening_year,total_cost
46,Streets and Roads,Madera County,Avenue 12,Road 38 to Road 40,,2 Lanes to 4 Lanes,2027.0,10000000
47,Streets and Roads,Madera County,Avenue 12,Road 40 to Riverwalk\nBlvd,,2 Lanes to 6 Lanes,2027.0,10000000
48,Streets and Roads,Madera County,Avenue 12,Riverwalk Blvd to SR\n41,,4 Lanes to 8 Lanes,2027.0,10000000
49,Streets and Roads,Madera County,Avenue 12,SR 41 To Flagbarn Rd,,2 Lanes to 4 Lanes,2030.0,4250000
51,Streets and Roads,Madera County,Avenue 12,Road 30 1/2 to Road 36,,2 Lanes to 4 Lanes,2030.0,21000000


In [102]:
def harmonize_madera():
    df = madera_lrtp()
    df = harmonizing_lrtp(
        df,
        project_name_col="project_name",
        project_description_col="description",
        project_category_col="category",
        project_cost_col="total_cost",
        location_col="",
        county_col="",
        city_col="",
        project_year_col="opening_year",
        program="Madera CTC",
        phase_col="",
        data_source="LRTP",
        note_cols=[
            "location",
        ],
        cost_in_millions=True,
    )

    return df

### MCAGOV

In [103]:
def mcagov_lrtp():

    df = to_snakecase(pd.read_excel(f"{LRTP_GCS}MCAGOV.xlsx"))

    df = df.dropna(subset=["title"]).reset_index(drop=True)

    # Millions
    df["total_cost_millions"] = df["total_cost\n_$1,000s_"] * 1_000
    return df

In [104]:
mcagov = mcagov_lrtp()

In [105]:
give_info(mcagov, "title", "total_cost\n_$1,000s_")

Bellevue Rd Realignment/Reconstruction    1
Minturn Road Rehabilitation               1
MCAG Vanpool Program (dibs)               1
Measure V Transit Free Ride Program       1
Planada Route Improvements                1
Name: title, dtype: int64
# of unique project titles: 108
After dropping duplicates using title and total_cost
_$1,000s_: 108
Df shape: (108, 8)
Index(['agency', 'title', 'limits_description', 'type', 'completion\nyear',
       'total_cost\n_$1,000s_', 'funding_sources', 'total_cost_millions'],
      dtype='object')


In [106]:
mcagov.sample()

Unnamed: 0,agency,title,limits_description,type,completion\nyear,"total_cost\n_$1,000s_",funding_sources,total_cost_millions
100,Caltrans,MER 165 Intersection Control,"Near Hilmar, at the intersection with Geer Ave. Intersection Improvements",Road Ops/Safety,2023/24,14623,SHOPP,14623000


In [107]:
def harmonize_mcagov():
    df = mcagov_lrtp()
    df = harmonizing_lrtp(
        df,
        project_name_col="title",
        project_description_col="limits_description",
        project_category_col="type",
        project_cost_col="total_cost_millions",
        location_col="",
        county_col="",
        city_col="",
        project_year_col="completion\nyear",
        program="MCAGOV",
        phase_col="",
        data_source="LRTP",
        note_cols=[
            "agency",
            "funding_sources",
        ],
        cost_in_millions=True,
    )

    return df

### MTC
* 133 projects but point and line map returns 240 ish rows.
* Projects that are in different locations can have the same name split by open year...
* I guess the geospatial projects are split into various phases while the regular pandas df is just the project in its entirety.
* https://www.planbayarea.org/2050-plan/final-plan-bay-area-2050/final-supplemental-reports/interactive-transportation-project-list
* https://opendata.mtc.ca.gov/datasets/MTC::2023-transportation-improvement-program-projects-point/about
* https://opendata.mtc.ca.gov/datasets/MTC::plan-bay-area-2050-transportation-projects-line/about
* https://data.bayareametro.gov/Projects/Plan-Bay-Area-2050-Transportation-Project-List/y7ka-jksz

In [108]:
def mtc_lrtp():
    # Open rest server data
    layer_list = ["planbayarea2050_rtp_line", "planbayarea2050_rtp_point"]
    url_pt_1 = "https://services3.arcgis.com/i2dkYWmb4wHvYPda/ArcGIS/rest/services/"
    url_pt_2 = "/FeatureServer/0/query?where=1%3D1&objectIds=&time=&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&resultType=none&distance=0.0&units=esriSRUnit_Meter&relationParam=&returnGeodetic=false&outFields=*+&returnGeometry=true&featureEncoding=esriDefault&multipatchOption=xyFootprint&maxAllowableOffset=&geometryPrecision=&outSR=&defaultSR=&datumTransformation=&applyVCSProjection=false&returnIdsOnly=false&returnUniqueIdsOnly=false&returnCountOnly=false&returnExtentOnly=false&returnQueryGeometry=false&returnDistinctValues=false&cacheHint=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&having=&resultOffset=&resultRecordCount=&returnZ=false&returnM=false&returnExceededLimitFeatures=true&quantizationParameters=&sqlFormat=none&f=pgeojson&token="
    gdf = open_rest_server(url_pt_1, url_pt_2, layer_list)

    # Open all MTC projects. Not all projects are mappable.
    all_projects_url = "https://data.bayareametro.gov/resource/y7ka-jksz.json?$query=SELECT%0A%20%20%60plan_strategy%60%2C%0A%20%20%60rtpid%60%2C%0A%20%20%60title%60%2C%0A%20%20%60scope%60%2C%0A%20%20%60open_period%60%2C%0A%20%20%60funding_millions_yoe%60%2C%0A%20%20%60county%60"
    all_projects = pd.read_json(all_projects_url)

    # Merge info from all projects to gdf
    gdf = pd.merge(
        gdf, all_projects, how="inner", left_on=["proj_title"], right_on=["title"]
    )

    gdf = gdf.drop(columns=["title"])

    # Find projects that are unmappable
    mtc_geometry_titles = set(gdf.proj_title.unique().tolist())
    mtc_all_projects_titles = set(all_projects.title.unique().tolist())
    missing_projects = list(mtc_all_projects_titles - mtc_geometry_titles)

    # Add back in the  unmappable projects
    all_projects = all_projects[all_projects.title.isin(missing_projects)].reset_index(
        drop=True
    )
    all_projects = all_projects.rename(columns={"title": "proj_title"})
    final = pd.concat([all_projects, gdf])

    # Correct typo
    final = final.rename({"mode detai": "mode_detail"})

    # Set geometry again
    final = final.set_geometry("geometry").set_crs("EPSG:4326")

    # Same project is split across multiple rows. Divide out project cost
    final = correct_project_cost(final, "proj_title", "funding_millions_yoe")
    
    # Divide project cost over 30 years
    final.total_project_cost = final.total_project_cost/30
    return final

In [109]:
mtc_og = mtc_lrtp()

In [110]:
# all_projects_url = "https://data.bayareametro.gov/resource/y7ka-jksz.json?$query=SELECT%0A%20%20%60plan_strategy%60%2C%0A%20%20%60rtpid%60%2C%0A%20%20%60title%60%2C%0A%20%20%60scope%60%2C%0A%20%20%60open_period%60%2C%0A%20%20%60funding_millions_yoe%60%2C%0A%20%20%60county%60"
# all_projects = pd.read_json(all_projects_url)

In [111]:
# all_projects.funding_millions_yoe.describe()

In [112]:
# all_projects[all_projects.funding_millions_yoe >= 211000]

In [113]:
# all_projects[all_projects.rtpid == "21-T01-003"]

In [114]:
# give_info(mtc_og, "proj_title", "total_project_cost")

In [115]:
def harmonize_mtc():
    df = mtc_lrtp()
    df = harmonizing_lrtp(
        df,
        project_name_col="proj_title",
        project_description_col="subcompone",
        project_category_col="plan_strategy",
        project_cost_col="total_project_cost",
        location_col="geometry",
        county_col="county",
        city_col="",
        project_year_col="open_year",
        phase_col="",
        program="MTC",
        data_source="LRTP",
        note_cols=[
            "strategy",
            "layer_name",
            "mode_detai",
        ],
        cost_in_millions=False,
    )

    return df

### Sacramento Area Council of Governments (SACOG)
* https://www.sacog.org/sites/main/files/file-attachments/2020_mtp-scs.pdf?1580330993

In [116]:
def sacog_lrtp():

    df = to_snakecase(pd.read_excel(f"{LRTP_GCS}SACOG.xlsx"))

    # Delete embedded headers
    df = delete_embedded_headers(df, "description", "Description")

    # Delete columns with all nulls
    df = df.dropna(axis=1, how="all")

    # Drop duplicates
    df = df.drop_duplicates().reset_index(drop=True)

    return df

In [117]:
sacog = sacog_lrtp()

96 rows are headers


In [118]:
give_info(sacog, "title", "description")

Safety Improvements                            2
Connect Card Implementation                    2
Aux Lane Project: EB Latrobe Road              1
Bus Maintenance Facility #1 Rehabilitation     1
Name: title, dtype: int64
# of unique project titles: 1599
After dropping duplicates using title and description: 1601
Df shape: (1601, 10)
Index(['id', 'status__planned,_programmed_or_project_development_only_',
       'county', 'lead_agency', 'budget_category', 'title', 'description',
       'total_project_cost__2018_dollars_',
       'year_of_expenditure_cost_for_planned_projects', 'completion_timing'],
      dtype='object')


In [119]:
def harmonize_sacog():
    df = sacog_lrtp()
    df = harmonizing_lrtp(
        df,
        project_name_col="title",
        project_description_col="description",
        project_category_col="budget_category",
        project_cost_col="total_project_cost__2018_dollars_",
        location_col="",
        county_col="county",
        city_col="",
        project_year_col="completion_timing",
        program="SACOG",
        phase_col="",
        data_source="LRTP",
        note_cols=[
            "year_of_expenditure_cost_for_planned_projects",
            "status__planned,_programmed_or_project_development_only_",
        ],
        cost_in_millions=True,
    )

    return df

### SANDAG
* https://www.arcgis.com/home/item.html?id=ba8b2a6247ef463cb7411f039959f3ee
* https://www.arcgis.com/apps/mapviewer/index.html?webmap=ba8b2a6247ef463cb7411f039959f3ee

In [120]:
columns_to_keep = [
    "project_title",
    "lead_agency",
    "project_year",
    "project_category",
    "grant_program",
    "project_description",
    "total_project_cost",
    "location",
    "city",
    "county",
    "data_source",
    "notes",
]

In [121]:
def sandag_lrtp():
    sandag_layers_list = [
        "Complete_Corridors_Connectors_Intersections_Ramps_RP2021",
        "Complete_Corridors_Highway_RP2021",
        "Transit_Leap_RP2021",
        "Adopted_Regional_Bike_Network_RP2021",
        "Mobility_Hubs_and_Flexible_Fleets",
        "Complete_Corridors_Regional_Arterials",
        "Goods_Movement",
    ]

    sandag_url_pt1 = (
        "https://services1.arcgis.com/HG80xaIVT1z1OdO5/ArcGIS/rest/services/"
    )
    sandag_url_pt2 = "/FeatureServer/0/query?where=1%3D1&objectIds=&time=&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&resultType=none&distance=0.0&units=esriSRUnit_Meter&relationParam=&returnGeodetic=false&outFields=*+&returnGeometry=true&featureEncoding=esriDefault&multipatchOption=xyFootprint&maxAllowableOffset=&geometryPrecision=&outSR=&defaultSR=&datumTransformation=&applyVCSProjection=false&returnIdsOnly=false&returnUniqueIdsOnly=false&returnCountOnly=false&returnExtentOnly=false&returnQueryGeometry=false&returnDistinctValues=false&cacheHint=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&having=&resultOffset=&resultRecordCount=&returnZ=false&returnM=false&returnExceededLimitFeatures=true&quantizationParameters=&sqlFormat=none&f=pgeojson&token="
    sandag = open_rest_server(sandag_url_pt1, sandag_url_pt2, sandag_layers_list)

    # Same project is split across multiple rows. Divide out project cost
    sandag.cost2020m = (
        sandag.cost2020m.str.replace("$", "")
        .str.replace("N/A", "")
        .apply(pd.to_numeric, errors="coerce")
    )
    sandag = correct_project_cost(sandag, "project_name", "cost2020m")

    return sandag

In [122]:
sandag = sandag_lrtp()

  sandag.cost2020m.str.replace("$", "")


In [123]:
sandag.loc[sandag.project_name == "I-15 (SR 52)"][
    ["project_name", "total_project_cost", "description"]
]

Unnamed: 0,project_name,total_project_cost,description
5,I-15 (SR 52),181.0,West to North and South to East
41,I-15 (SR 52),65.33,North to West and East to South
42,I-15 (SR 52),65.33,North to East and West to South
43,I-15 (SR 52),65.33,South to West and East to North


In [124]:
give_info(sandag, "project_name", "total_project_cost")

N/A                  22
I-15 (I-8)            4
I-15 (SR 52)          4
I-805 (I-8)           4
Commuter Rail 398     3
Name: project_name, dtype: int64
# of unique project titles: 309
After dropping duplicates using project_name and total_project_cost: 315
Df shape: (416, 55)
Index(['objectid', 'type', 'freeway', 'intersecti', 'caption', 'project_id',
       'phase', 'direction', 'project_id_1', 'year_built', 'category',
       'project_name', 'description', 'aqc_2016_1', 'aqc_2020_1', 'pricmcp',
       'conncmcp', 'status', 'categoría', 'corredor', 'nombre_del_proyecto',
       'descripción', 'corredores_de_conexión', 'geometry', 'layer_name',
       'corridor_i', 'type_1', 'existing', 'limits', 'description_1',
       'shape__length', 'route', 'routetype', 'route_desc', 'rp_2021_id',
       'rp_2021_id_1', 'primary_corridor', 'connecting_corridor_s_',
       'capital_cost___2020__millions', 'name', 'mohubtype', 'mohubsize',
       'transitservices', 'globalid', 'icon_url', 'shape__a

In [125]:
def harmonize_sandag():
    df = sandag_lrtp()
    df = harmonizing_lrtp(
        df,
        project_name_col="project_name",
        project_description_col="description",
        project_category_col="category",
        project_cost_col="cost2020m",
        location_col="geometry",
        county_col="",
        city_col="",
        project_year_col="phase",
        phase_col="status",
        program="SANDAG",
        data_source="LRTP",
        note_cols=[
            "aqc_2016_1",
            "aqc_2020_1",
            "pricmcp",
            "conncmcp",
            "layer_name",
            "corridor_i",
            "type_1",
            "existing",
            "limits",
            "description_1",
            "route",
            "routetype",
            "route_desc",
            "rp_2021_id",
            "rp_2021_id_1",
            "capital_cost___2020__millions",
        ],
        cost_in_millions=False,
    )

    return df

### SBCAG

In [126]:
def sbcag_lrtp():

    df = to_snakecase(pd.read_excel(f"{LRTP_GCS}SBCAG.xlsx"))
    drop_columns = [
        "unnamed:_2",
        "project_type",
        "unnamed:_4",
        "unnamed:_6",
        "unnamed:_8",
        "unnamed:_9",
        "unnamed:_10",
        "unnamed:_13",
    ]
    df = df.drop(columns=drop_columns)
    df = delete_embedded_headers(df, "description", "Description")
    df = df.dropna(subset=["description"]).reset_index(drop=True)

    df = df.drop_duplicates(
        subset=["phase", "project_title", "phase", "total_cost__$000s_"]
    ).reset_index(drop=True)

    # Millions
    df["total_cost_millions"] = df["total_cost__$000s_"] * 1_000

    return df

In [127]:
sbcag = sbcag_lrtp()

65 rows are headers


In [128]:
give_info(sbcag, "project_title", "total_cost__$000s_")

SB-PL-15: Upper De la Vina St Gap Closure and Safe Crossings              2
CT-1: SR 246 Passing Lanes – Planting Mitigation (FTIP CT93)(EA 0C641)    1
SM-PL-10: A Street Widening                                               1
Sol-PL-6: Fredensborg Canyon Rd/Adobe Creek Culvert Replacement           1
Sol-PL-5: Solvang School Sidewalk Project                                 1
Name: project_title, dtype: int64
# of unique project titles: 418
After dropping duplicates using project_title and total_cost__$000s_: 419
Df shape: (419, 8)
Index(['type', 'project_title', 'phase', 'description',
       'primary_funding_source_s_', 'year', 'total_cost__$000s_',
       'total_cost_millions'],
      dtype='object')


In [129]:
sbcag.sample()

Unnamed: 0,type,project_title,phase,description,primary_funding_source_s_,year,total_cost__$000s_,total_cost_millions
388,VMT Reducing,SB-IL-1: Pedestrian\nEnhancements: Sycamore Creek Pedestrian Crossing,"PA&ED, PS&E,\nConstruction","Construct a pedestrian overcrossing from the\nEastside Neighborhood from Canada Street, crossing Highway 101 and landing near the Sycamore Creek in the Dwight Murphy Field area. The project would include enhanced crosswalks at several lower Eastside intersection to provide safe access to the crossing.",,,TBD,TBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBDTBD


In [130]:
def harmonize_sbcag():
    df = sbcag_lrtp()
    df = harmonizing_lrtp(
        df,
        project_name_col="project_title",
        project_description_col="description",
        project_category_col="phase",
        project_cost_col="total_cost_millions",
        location_col="",
        county_col="",
        city_col="",
        project_year_col="completion\nyear",
        program="SBCAG",
        phase_col="",
        data_source="LRTP",
        note_cols=[
            "type",
            "primary_funding_source_s_",
            "year",
        ],
        cost_in_millions=True,
    )

    return df

### Shasta County Regional Transportation Planning Agency (SCRTPA)

In [131]:
def scrtpa_lrtp():
    df = to_snakecase(pd.read_excel(f"{LRTP_GCS}SRTA.xlsx", sheet_name="Cleaned"))

    to_keep = [
        "regional_transportation_projects",
        "short_term_total_est_cost_of_project",
        "long_term_total_est_cost_of_project",
        "project_band",
        "project_type\n_project_intent_",
        "expected_funding_sources",
    ]

    df = df[to_keep]
    df["cost"] = df.short_term_total_est_cost_of_project
    df.cost = df.cost.fillna(df.long_term_total_est_cost_of_project)

    df = df.drop_duplicates()
    return df

In [132]:
scrtpa = scrtpa_lrtp()

In [133]:
# scrtpa.sample(5)

In [134]:
# scrtpa[scrtpa.regional_transportation_projects == "Little Street - Realignment"]

In [135]:
scrtpa[
    scrtpa.regional_transportation_projects == "E Cypress Ave-Alfreda Way-Victor Ave"
]

Unnamed: 0,regional_transportation_projects,short_term_total_est_cost_of_project,long_term_total_est_cost_of_project,project_band,project_type\n_project_intent_,expected_funding_sources,cost
831,E Cypress Ave-Alfreda Way-Victor Ave,,29913,(2026-2040),Buffered Bike Lane,unknown,29913
833,E Cypress Ave-Alfreda Way-Victor Ave,,529884,(2026-2040),Sidewalk,unknown,529884


In [136]:
give_info(scrtpa, "regional_transportation_projects", "cost")

E Cypress Ave-Alfreda Way-Victor Ave                      2
GROVE ST-B ST-WALNUT ST                                   2
Churn Creek Rd-Rancho Rd-Churn Creek Rd/S Bonnyview Rd    2
Hartnell Ave-Victor Ave-Shasta View Dr                    2
Hawley Rd-Hawley Rd (e/o Norwich Ct)-Collyer Dr           2
Name: regional_transportation_projects, dtype: int64
# of unique project titles: 1051
After dropping duplicates using regional_transportation_projects and cost: 1066
Df shape: (1066, 7)
Index(['regional_transportation_projects',
       'short_term_total_est_cost_of_project',
       'long_term_total_est_cost_of_project', 'project_band',
       'project_type\n_project_intent_', 'expected_funding_sources', 'cost'],
      dtype='object')


In [137]:
def harmonize_scrtpa():
    df = scrtpa_lrtp()
    df = harmonizing_lrtp(
        df,
        project_name_col="regional_transportation_projects",
        project_description_col="",
        project_category_col="project_type\n_project_intent_",
        project_cost_col="cost",
        location_col="",
        county_col="",
        city_col="",
        project_year_col="project_band",
        program="SCRTPA",
        phase_col="",
        data_source="LRTP",
        note_cols=[
            "expected_funding_sources",
        ],
        cost_in_millions=True,
    )

    return df

In [138]:
# scrtpa.head()

### San Luis Obispo Council of Governments (SLOCOG)
* Have to paste the data from the word doc and merge with shapefiles, which I'll receive on Monday.

In [139]:
def slocog_lrtp():
    df = to_snakecase(pd.read_excel(f"{LRTP_GCS}SLOCOG.xlsx"))

    df["total_cost"] = df._2021_cost_estimate

    # Fill nan with the other columns
    df.loc[df.total_cost.isnull(), "total_cost"] = df["_2045\ntotal_capital_cost"]
    df.loc[df.total_cost.isnull(), "total_cost"] = df.escalated_cost_estimate

    drop_cols = [
        "_2045\ntotal_capital_cost",
        "_2021_cost_estimate",
        "escalated_cost_estimate",
        "project_category",
    ]

    df = df.drop(columns=drop_cols)
    df = df.reset_index()

    # Open up gdf
    gdf_url = f"{LRTP_GCS}SLOCOG Current Projects.zip"
    with fsspec.open(gdf_url) as file:
        gdf = to_snakecase(gpd.read_file(file))
    gdf = gdf[["project_id", "geometry"]]

    # Merge -> left b/c some RTP projects have no location info
    m1 = pd.merge(
        df, gdf, how="left", left_on=["_2023_rtp_project_id"], right_on=["project_id"]
    )

    m1 = m1.set_geometry("geometry")
    m1 = m1.to_crs("EPSG:4326")
    return m1

In [140]:
slocog = slocog_lrtp()

In [141]:
give_info(slocog, "project_name", "total_cost")

Hazel Ln. SRTS Improvements                          2
Main St. Sidewalks                                   2
Los Berros Rd. Widening                              2
Octagon Barn Park-and-Ride Lot and shared-use lot    1
Augusta Neighborhood SRTS Improvements               1
Name: project_name, dtype: int64
# of unique project titles: 416
After dropping duplicates using project_name and total_cost: 420
Df shape: (420, 12)
Index(['index', 'project_name', 'project_description', 'sponsor', 'community',
       'time_period', 'project_type', '_2023_rtp_project_id', 'time_horizon',
       'total_cost', 'project_id', 'geometry'],
      dtype='object')


In [142]:
slocog[slocog.project_name == "Hazel Ln. SRTS Improvements"]

Unnamed: 0,index,project_name,project_description,sponsor,community,time_period,project_type,_2023_rtp_project_id,time_horizon,total_cost,project_id,geometry
341,341,Hazel Ln. SRTS Improvements,Construct Class II/Class IV bike lanes on Hazel Ln. between West Tefft St. & Division St.,,NIPOMO,,Unconstrained Active Transportation Investments,STH-AT2-1013,Unconstr.,153000.0,STH-AT2-1013,"LINESTRING (-120.49885 35.02392, -120.49615 35.02301, -120.49418 35.02232, -120.49377 35.02219, -120.49313 35.02198, -120.49244 35.02176, -120.49191 35.02158, -120.49068 35.02117, -120.48944 35.02074)"
345,345,Hazel Ln. SRTS Improvements,Pedestrian improvements between W. Tefft St. and Siammons Ln.,,NIPOMO,,Unconstrained Active Transportation Investments,STH-AT4-1402,Unconstr.,459000.0,STH-AT4-1402,"LINESTRING (-120.48990 35.02475, -120.48976 35.02438, -120.48974 35.02433, -120.48973 35.02429, -120.48953 35.02372, -120.48936 35.02324, -120.48929 35.02303, -120.48915 35.02268, -120.48906 35.02247, -120.48889 35.02209, -120.48887 35.02205, -120.48887 35.02205, -120.48879 35.02196, -120.48881 35.02194, -120.48886 35.02189, -120.48890 35.02183, -120.48893 35.02178, -120.48895 35.02172, -120.48945 35.02074, -120.48945 35.02074)"


In [143]:
def harmonize_slocog():
    df = slocog_lrtp()
    df = harmonizing_lrtp(
        df,
        project_name_col="project_name",
        project_description_col="project_description",
        project_category_col="project_type",
        project_cost_col="total_cost",
        location_col="",
        county_col="",
        city_col="",
        project_year_col="time_period",
        program="SLOCOG",
        phase_col="",
        data_source="LRTP",
        note_cols=["sponsor", "community"],
        cost_in_millions=True,
    )

    return df

In [144]:
harmonized_slocog = harmonize_slocog()

  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()


###  SJCOG

In [145]:
def sjcog_lrtp():
    sheets_list = [
        "1 Mainline",
        "2 Interchanges",
        "3 Reg Roadways",
        "4 RR Xings",
        "5 Bus Transit",
        "6 Rail Corridor",
        "7 Airports",
        "8 Active Trans Facilities",
        "9 TCMs",
        "10 Ops & Maint",
    ]
    dict_df = pd.read_excel(f"{LRTP_GCS}SJCOG.xlsx", sheet_name=sheets_list)
    df1 = to_snakecase(dict_df.get("1 Mainline"))
    df2 = to_snakecase(dict_df.get("2 Interchanges"))
    df3 = to_snakecase(dict_df.get("3 Reg Roadways"))
    df4 = to_snakecase(dict_df.get("4 RR Xings"))
    df5 = to_snakecase(dict_df.get("5 Bus Transit"))
    df6 = to_snakecase(dict_df.get("6 Rail Corridor"))
    df7 = to_snakecase(dict_df.get("7 Airports"))
    df8 = to_snakecase(dict_df.get("8 Active Trans Facilities"))
    df9 = to_snakecase(dict_df.get("9 TCMs"))
    df10 = to_snakecase(dict_df.get("10 Ops & Maint"))

    concat1 = pd.concat([df1, df2, df3, df4, df5, df6, df7, df8, df9, df10], axis=0)

    sjcog_cols = [
        "_2018_rtp_mpo_id",
        "ctips_id_#",
        "jurisdiction",
        "facility_name_route",
        "project_description",
        "project_limits",
        "total",
        "ftip_programming",
        "nepa_approval",
        "open_to_traffic",
        "completion",
        "completion_date",
        "facility_name_route_1",
        "project_description_1",
        "milestone_years_1",
        "project_name",
    ]

    concat1 = concat1.dropna(
        subset=[
            "project_description",
            "_2018_rtp_mpo_id",
        ]
    ).reset_index(drop=True)

    # Some projects have no titles, create them manually
    """
    concat1["title_manual"] = (
        concat1.project_limits
        + "-"
        + concat1.jurisdiction
        + "-"
        + concat1.facility_name_route
    ) 
    concat1.title_manual = concat1.title_manual.fillna("No Title")
    concat1.project_name = concat1.project_name.fillna(concat1.title_manual)
    """
    concat1.project_name = concat1.project_name.fillna("None")
    concat1 = concat1[sjcog_cols]

    return concat1

In [146]:
sjcog = sjcog_lrtp()

In [147]:
give_info(sjcog, "project_name", "total")

None                                                        236
Bear Creek and Pixley Slough Bicycle and Pedestrian Path      1
Park and Ride Lots                                            1
Ridesharing and Vanpool Programs                              1
Corral Hollow Road Adaptive Traffic Control                   1
Name: project_name, dtype: int64
# of unique project titles: 27
After dropping duplicates using project_name and total: 223
Df shape: (262, 16)
Index(['_2018_rtp_mpo_id', 'ctips_id_#', 'jurisdiction', 'facility_name_route',
       'project_description', 'project_limits', 'total', 'ftip_programming',
       'nepa_approval', 'open_to_traffic', 'completion', 'completion_date',
       'facility_name_route_1', 'project_description_1', 'milestone_years_1',
       'project_name'],
      dtype='object')


In [148]:
sjcog.sample()

Unnamed: 0,_2018_rtp_mpo_id,ctips_id_#,jurisdiction,facility_name_route,project_description,project_limits,total,ftip_programming,nepa_approval,open_to_traffic,completion,completion_date,facility_name_route_1,project_description_1,milestone_years_1,project_name
116,SJ18-5054,,Lodi,Grapeline Capital,Bus Wash upgrades,City of Lodi,567748.0,,,,,,,,,


In [149]:
def harmonize_sjcog():
    df = sjcog_lrtp()
    df = harmonizing_lrtp(
        df,
        project_name_col="project_name",
        project_description_col="project_description",
        project_category_col="",
        project_cost_col="total",
        location_col="",
        county_col="",
        city_col="",
        project_year_col="completion_date",
        program="SJCOG",
        phase_col="",
        data_source="LRTP",
        note_cols=[
            "jurisdiction",
            "facility_name_route",
            "project_limits",
            "ftip_programming",
            "nepa_approval",
            "open_to_traffic",
            "completion",
            "facility_name_route_1",
            "project_description_1",
            "milestone_years_1",
        ],
        cost_in_millions=True,
    )

    return df

### Southern California Association of Governments (SCAG) 

In [150]:
def scag_lrtp():
    sheets_list = ["Table 1", "Table 2"]
    dict_df = pd.read_excel(f"{LRTP_GCS}SCAG.xlsx", sheet_name=sheets_list)

    df1 = to_snakecase(dict_df.get("Table 1"))
    # Drop columns where everything is NA
    df1 = df1.dropna(axis=1, how="all")

    df2 = to_snakecase(dict_df.get("Table 2"))
    df2 = df2.dropna(axis=1, how="all")

    # Concat
    df3 = pd.concat([df1, df2])

    df4 = delete_embedded_headers(df3, "rtp_id", "RTP ID")

    df4 = df4.dropna(
        subset=["description", "lead_agency", "project_cost__$1,000s_"]
    ).reset_index(drop=True)

    # No project titles - fill it in
    df4["project_title"] = "No Title"

    # Project cost is in thousands.
    # Multiple out
    df4["project_cost_millions"] = df4["project_cost__$1,000s_"] * 1_000
    return df4

In [151]:
scag = scag_lrtp()

360 rows are headers


In [152]:
give_info(scag, "description", "route_name")

WIDEN FROM 2 TO 4 LANES      170
WIDEN FROM 4 TO 6 LANES       64
WIDEN FROM 2 TO 6 LANES       43
CONSTRUCT 4 LANE ARTERIAL     21
WIDEN FROM 3 TO 4 LANES       10
Name: description, dtype: int64
# of unique project titles: 2607
After dropping duplicates using description and route_name: 2824
Df shape: (2952, 13)
Index(['system', 'lead_agency', 'rtp_id', 'route_#', 'route_name', 'from',
       'to', 'description', 'completion_year', 'project_cost__$1,000s_',
       'county', 'project_title', 'project_cost_millions'],
      dtype='object')


In [153]:
def harmonize_scag():
    df = scag_lrtp()
    df = harmonizing_lrtp(
        df,
        project_name_col="project_title",
        project_description_col="description",
        project_category_col="system",
        project_cost_col="project_cost_millions",
        location_col="",
        county_col="county",
        city_col="",
        project_year_col="completion_year",
        program="SCAG",
        phase_col="",
        data_source="LRTP",
        note_cols=["lead_agency", "route_#", "route_name", "from", "to"],
        cost_in_millions=True,
    )

    return df

In [154]:
# scag.head()

In [155]:
# scag_harmonized.head()

### Stancog

In [156]:
def stancog_lrtp():

    df = to_snakecase(pd.read_excel(f"{LRTP_GCS}STANCOG.xlsx"))
    df = embedded_column_names(df, 1)
    keep_columns = [
        "Jurisdiction",
        "Location",
        "Project Limits",
        "Description",
        "Total Cost",
        "Open to Traffic",
        "Funding Source",
        "System Preserv.",
        "Capacity Enhance.",
        "Safety",
        "Oper.",
        "Complete Streets",
        "Active\nTransporta tion",
        "Transit",
        "Other",
    ]

    df = df[keep_columns]

    df = to_snakecase(df)

    df = df.dropna(subset=["description"]).reset_index(drop=True)

    df["title"] = (df.location + "-" + df.description).fillna("No Title")

    df = delete_embedded_headers(df, "location", "Location")

    df = df.drop_duplicates(
        subset=["title", "location", "jurisdiction", "total_cost", "open_to_traffic"]
    ).reset_index(drop=True)
    return df

In [157]:
stancog = stancog_lrtp()

68 rows are headers


In [158]:
give_info(stancog, "title", "total_cost")

Whitmore Ave-Install Traffic Signal                   5
No Title                                              5
W. Main St-Widen to 3 lanes                           4
SR-99-Construct NB & SB auxiliary lanes               4
SR-33-Install 4 Lane Arterial Roadway Improvements    3
Name: title, dtype: int64
# of unique project titles: 515
After dropping duplicates using title and total_cost: 552
Df shape: (556, 16)
Index(['jurisdiction', 'location', 'project_limits', 'description',
       'total_cost', 'open_to_traffic', 'funding_source', 'system_preserv_',
       'capacity_enhance_', 'safety', 'oper_', 'complete_streets',
       'active\ntransporta_tion', 'transit', 'other', 'title'],
      dtype='object', name=0)


In [159]:
stancog.sample()

Unnamed: 0,jurisdiction,location,project_limits,description,total_cost,open_to_traffic,funding_source,system_preserv_,capacity_enhance_,safety,oper_,complete_streets,active\ntransporta_tion,transit,other,title
340,Modesto,Torrid Ave,Mercy Ave to N Carpenter Rd,"Class 3 Bicycle Boulevard. Traffic calming, wayfinding. StanCOG Non-Motorized\nTransportation Master Plan Project ID: MOD-33",90300,2022-2046,"ATP, SB 1, BIL/IIJA, CMAQ, STBGP",,,,,,X,,,"Torrid Ave-Class 3 Bicycle Boulevard. Traffic calming, wayfinding. StanCOG Non-Motorized\nTransportation Master Plan Project ID: MOD-33"


In [160]:
def harmonize_stancog():
    df = stancog_lrtp()
    df = harmonizing_lrtp(
        df,
        project_name_col="title",
        project_description_col="description",
        project_category_col="",
        project_cost_col="total_cost",
        location_col="",
        county_col="",
        city_col="",
        project_year_col="open_to_traffic",
        program="STANCOG",
        phase_col="",
        data_source="LRTP",
        note_cols=[
            "jurisdiction",
            "location",
            "project_limits",
            "funding_source",
            "system_preserv_",
            "capacity_enhance_",
            "safety",
            "oper_",
            "complete_streets",
            "active\ntransporta_tion",
            "transit",
            "other",
        ],
        cost_in_millions=True,
    )

    return df

In [161]:
type(harmonize_stancog())

68 rows are headers


  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()


pandas.core.frame.DataFrame

### TMPO (Tahoe)
* Open data portal: https://www.tahoeopendata.org/search?tags=transportation
* https://www.tahoeopendata.org/datasets/TRPA::2020-regional-transportation-plan-projects/about

In [162]:
def tmpo_lrtp():
    tahoe_url = "https://maps.trpa.org/server/rest/services/Datadownloader_Transportation/MapServer/19/query?where=1%3D1&text=&objectIds=&time=&timeRelation=esriTimeRelationOverlaps&geometry=&geometryType=esriGeometryPoint&inSR=&spatialRel=esriSpatialRelIntersects&distance=&units=esriSRUnit_Foot&relationParam=&outFields=*+&returnGeometry=true&returnTrueCurves=false&maxAllowableOffset=&geometryPrecision=&outSR=&havingClause=&returnIdsOnly=false&returnCountOnly=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&returnZ=false&returnM=false&gdbVersion=&historicMoment=&returnDistinctValues=false&resultOffset=&resultRecordCount=&returnExtentOnly=false&sqlFormat=none&datumTransformation=&parameterValues=&rangeValues=&quantizationParameters=&featureEncoding=esriDefault&f=geojson"
    gdf = to_snakecase(gpd.read_file(tahoe_url))
    gdf.complete_year = gdf.complete_year.astype(int)

    # Filter out projects that are not yet completed
    gdf2 = gdf[gdf.complete_year > 2022].reset_index(drop=True)

    gdf2["project_year"] = (
        gdf2.start_year.astype(str) + "-" + gdf.complete_year.astype(str)
    )
    return gdf2

In [163]:
# tahoe = tmpo_lrtp()

In [164]:
def harmonize_tahoe():
    df = tmpo_lrtp()
    df = harmonizing_lrtp(
        df,
        project_name_col="project_name",
        project_description_col="description",
        project_category_col="category",
        project_cost_col="estimated_cost",
        location_col="geometry",
        county_col="",
        city_col="",
        project_year_col="project_year",
        program="TMPO",
        phase_col="phase",
        data_source="LRTP",
        note_cols=[
            "implementer",
            "funding_type",
            "plan_name",
            "label",
            "financial_status",
            "url",
        ],
        cost_in_millions=True,
    )

    return df

### TCAG Tulare County Association of Governments (TCAG)

In [165]:
def tcag_lrtp():
    df = to_snakecase(pd.read_excel(f"{LRTP_GCS}TCAG.xlsx"))

    def completion_2035(row):
        if row.completed_by_2035_y_n == "Y":
            return "2035"

    df["completion_year"] = df.apply(lambda x: completion_2035(x), axis=1)

    df["cost_x_1,000"] = df["cost_x_1,000"].abs()
    df["cost_in_millions"] = df["cost_x_1,000"] * 1_000
    return df

In [166]:
tcag = tcag_lrtp()

In [167]:
tcag.groupby(["project_title", "cost_x_1,000"]).agg(
    {"project_category": "count"}
).sort_values("project_category", ascending=False).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,project_category
project_title,"cost_x_1,000",Unnamed: 2_level_1
Bike/Ped Improvements next 9 years,0.0,6
Bike/Ped Improvements next 9 yrs second half of measure R,500.0,3
Avenue 316 Construct new roadway,6500.0,3
Bike/Ped Improvements next 9 yrs second half of measure R,226.0,2
Transit Center Minor,300.0,2


In [168]:
give_info(tcag, "project_title", "cost_x_1,000")

Bike/Ped Improvements next 9 yrs second half of measure R    9
Bike/Ped Improvements next 9 years                           9
Purchase Buses                                               4
SR 99 Widen existing roadway                                 4
Riggin Avenue Widen existing roadway                         4
Name: project_title, dtype: int64
# of unique project titles: 272
After dropping duplicates using project_title and cost_x_1,000: 321
Df shape: (340, 13)
Index(['project_id__if_applicable_', 'jurisdiction', 'project_title',
       'project_description', 'project_category', 'completed_by_2035_y_n',
       'local', 'regional', 'state', 'federal', 'cost_x_1,000',
       'completion_year', 'cost_in_millions'],
      dtype='object')


In [169]:
def harmonize_tcag():
    df = tcag_lrtp()
    df = harmonizing_lrtp(
        df,
        project_name_col="project_title",
        project_description_col="project_description",
        project_category_col="project_category",
        project_cost_col="cost_in_millions",
        location_col="",
        county_col="",
        city_col="",
        project_year_col="completion_year",
        program="TCAG",
        phase_col="",
        data_source="LRTP",
        note_cols=["jurisdiction", "local", "regional", "federal", "state"],
        cost_in_millions=True,
    )

    return df

In [170]:
len(harmonize_tcag())

  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()


340

## Stack

### Lost

In [171]:
# lost = harmonization_utils.load_lost()

In [172]:
# lost.sample()

In [173]:
def harmonize_lost():
    df = harmonization_utils.load_lost()
    df = harmonizing_lrtp(
        df,
        project_name_col="project_title",
        project_description_col="project_description",
        project_category_col="project_category",
        project_cost_col="cost__in_millions_",
        location_col="",
        county_col="county",
        city_col="city",
        project_year_col="",
        program="LOST",
        phase_col="phase",
        data_source="measure",
        note_cols=[
            "measure",
            "estimated_lost_funds",
            "estimated_federal_funds",
            "estimated_state_funds",
            "estimated_local_funds",
            "estimated_other_funds",
            "notes",
        ],
        cost_in_millions=False,
    )

    return df

In [174]:
lost_harmonized = harmonize_lost()

  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()


In [175]:
def all_mpo(save_to_gcs: bool = True):

    # Load harmonized df
    ambag = harmonize_ambag()
    bcag = harmonize_bcag()
    fresno = harmonize_fresnocog()
    kcag = harmonize_kcag()
    kern = harmonize_kerncog()
    madera = harmonize_madera()
    mcagov = harmonize_mcagov()
    mtc = harmonize_mtc()
    lost = harmonize_lost()
    sacog = harmonize_sacog()
    sandag = harmonize_sandag()
    sbcag = harmonize_sbcag()
    scrtpa = harmonize_scrtpa()
    sjcog = harmonize_sjcog()
    scag = harmonize_scag()
    slocog = harmonize_slocog()
    stancog = harmonize_stancog()
    tahoe = harmonize_tahoe()
    tcag = harmonize_tcag()

    df_list = [
        ambag,
        bcag,
        fresno,
        kern,
        kcag,
        madera,
        mcagov,
        mtc,
        lost,
        sacog,
        sandag,
        sbcag,
        scrtpa,
        sjcog,
        scag,
        slocog,
        stancog,
        tahoe,
        tcag,
    ]
    df = pd.concat(df_list)

    # Clean string columns
    str_cols = [
        "project_title",
        "lead_agency",
        "project_category",
        "grant_program",
        "project_description",
        "phase",
        "city",
        "county",
        "data_source",
        "notes",
    ]
    for i in str_cols:
        df[i] = df[i].str.replace("_", " ").str.strip().str.title()

    # Create gdf
    gdf = df[df.geometry != None].reset_index(drop=True)
    gdf = gdf.set_geometry("geometry")
    gdf.geometry = gdf.geometry.set_crs("EPSG:4326")
    gdf = gdf[gdf.geometry.geometry.is_valid].reset_index(drop=True)

    if save_to_gcs:
        df.drop(columns=["geometry"]).to_excel(
            f"{harmonization_utils.GCS_FILE_PATH}LRTP/all_LRTP_LOST.xlsx", index=False
        )
        gdf.to_file("./all_LRTP_LOST.geojson", driver="GeoJSON")

    return df, gdf

In [176]:
all_mpo_df, all_mpo_gdf = all_mpo(False)

  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  df.fund_estimate.str.replace("$", "")
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()


96 rows are headers


  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  sandag.cost2020m.str.replace("$", "")
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()


65 rows are headers


  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()


360 rows are headers


  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()


68 rows are headers


  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()


## Presentation Datapalozza

In [177]:
# Filter out for lost temporarily for presentation
all_mpo_df = all_mpo_df[all_mpo_df.data_source != "Lost"]

In [178]:
all_mpo_gdf = all_mpo_gdf[all_mpo_gdf.data_source != "Lost"]

In [179]:
all_mpo_df.shape, all_mpo_gdf.shape

((14451, 13), (1355, 13))

In [180]:
all_mpo_df["data_source"].value_counts().median()

417.5

In [181]:
#all_mpo_df.drop(columns=["geometry"]).to_excel(
#    f"{harmonization_utils.GCS_FILE_PATH}LRTP/all_LRTP.xlsx", index=False
#)

In [182]:
# all_mpo_gdf.explore('project_title', legend = False)

In [183]:
preview = [
    "Project Title",
    "Lead Agency",
    "Project Year",
    "Grant Program",
    "Project Description",
    "Total Project Cost",
    "City",
    "County",
    "Data Source",
    "Notes",
]

In [184]:
all_mpo_df2 = all_mpo_df.copy()

In [185]:
all_mpo_df2.columns = all_mpo_df2.columns.str.replace("_", " ").str.strip().str.title()

In [186]:
all_mpo_df2[all_mpo_df2["Total Project Cost"] == 0].shape

(95, 13)

In [187]:
all_mpo_df2[preview].sample(3)

Unnamed: 0,Project Title,Lead Agency,Project Year,Grant Program,Project Description,Total Project Cost,City,County,Data Source,Notes
1297,Adams: Del Rey To Bethel,Fresno Cog,2032-2036,Fresno Cog Lrtp,Ac Overlay,0.53,,,Fresno Cog,Financial Constraint: Constrained
78,Enterprise Canal - Dewolf To Bullard: Trail,Fresno Cog,2027-2031,Fresno Cog Lrtp,Trail,2.0,,,Fresno Cog,Financial Constraint: Constrained
274,Corridor & Interchange Improvements | I-880 | Alameda County,Mtc,2021 to 2035,Mtc Lrtp,I-880 Winton Avenue A Street Interchange Reconstruction,7.08,,Alameda,Mtc,"Strategy: T06. Improve Interchanges & Address Highway Bottlenecks, Layer Name: Planbayarea2050 Rtp Point, Mode Detai: Interchange"


### Cordon Pricing

In [188]:
def filter_projects(
    df,
    columns_to_search: list,
    keywords_search: list,
    file_name: str,
    save_to_gcs: bool = True,
):

    # Filter out for Cordon
    df = list_utils.find_keywords(df, columns_to_search, keywords_search)
    df2 = (
        df[
            (df.lower_case_project_title_keyword_search != "keyword not found")
            | (df.lower_case_project_description_keyword_search != "keyword not found")
        ]
    ).reset_index(drop=True)

    # Delete out non HOV projects that were accidentally picked up
    projects_to_delete = [
        "SR 17 Corridor Congestion Relief in Los Gatos",
        "Interstate 380 Congestion Improvements",
    ]
    df2 = df2[~df2.project_title.isin(projects_to_delete)].reset_index(drop=True)

    # Change cases
    for i in ["project_title", "project_description"]:
        df2[i] = df2[i].str.title()

    # Drop invalid geometries
    gdf = df2[df2.geometry != None].reset_index(drop=True)
    gdf = gdf.set_geometry("geometry")
    gdf.geometry = gdf.geometry.set_crs("EPSG:4326")
    gdf = gdf[gdf.geometry.is_valid].reset_index(drop=True)
    gdf = gdf.fillna(gdf.dtypes.replace({"float64": 0.0, "object": "None"}))

    # One version that's a df
    columns_to_drop = ["lower_case_project_title", "lower_case_project_description"]
    df2 = df2.drop(columns=columns_to_drop + ["geometry"])
    df2 = df2.fillna(df.dtypes.replace({"float64": 0.0, "object": "None"}))

    if save_to_gcs:
        df2.to_excel(
            f"{harmonization_utils.GCS_FILE_PATH}LRTP/{file_name}.xlsx",
            index=False,
        )
        gdf.to_file(f"./{file_name}.geojson", driver="GeoJSON")

    return gdf, df2

In [189]:
congest_keywords = [
    "congestion pricing",
    "variable tolls",
    "express lane",
    "value pricing",
    "rush hour",
    "cordon",
    "dynamic pricing",
    "dynamically  priced",
    "high occupancy",
    "mobility pricing",
    "occupancy",
    "toll lane",
    "performance pricing",
    "peak travel",
    "managed lane",
    "tollway",
    "express toll",
    "fixed pricing",
    "hot lane",
    "hov lane",
    "expressed toll lane",
]

In [190]:
cordon_gdf, cordon_df = filter_projects(
    all_mpo_df,
    [
        "project_title",
        "project_description",
    ],
    congest_keywords,
    "congestion_pricing_lrtp_only",
    True,
)

  df[i]
  df[i]


In [191]:
len(cordon_df), len(cordon_gdf)

(168, 85)

In [192]:
len(cordon_df) / len(all_mpo_df)

0.011625493045463982

In [193]:
cordon_df.project_title.nunique(), cordon_gdf.project_title.nunique()

(92, 52)

In [194]:
# cordon_df[["data_source",'project_title','project_description','lower_case_project_title_keyword_search',
#     'lower_case_project_description_keyword_search']]

In [195]:
cordon_df.groupby(["grant_program"]).agg({"project_title": "nunique"})

Unnamed: 0_level_0,project_title
grant_program,Unnamed: 1_level_1
Fresno Cog Lrtp,2
Mtc Lrtp,4
Sacog Lrtp,30
Sandag Lrtp,46
Sbcag Lrtp,4
Scag Lrtp,1
Sjcog Lrtp,1
Slocog Lrtp,1
Stancog Lrtp,2
Tmpo Lrtp,1


In [196]:
cordon_df.groupby(["data_source"]).agg({"total_project_cost": "sum"})

Unnamed: 0_level_0,total_project_cost
data_source,Unnamed: 1_level_1
Fresno Cog,605.0
Mtc,238.67
Sacog,3231.09
Sandag,7682.0
Sbcag,569.32
Scag,22324.06
Sjcog,200.0
Slocog,108.0
Stancog,0.0
Tmpo,0.0


In [197]:
cordon_df.total_project_cost.describe()

count    168.00
mean     208.08
std      475.51
min        0.00
25%        4.72
50%       75.18
75%      201.89
max     4700.00
Name: total_project_cost, dtype: float64

In [198]:
gdf_map_subset = [
    "project_title",
    "lead_agency",
    "project_year",
    "total_project_cost",
    "geometry",
]

In [199]:
# cordon_gdf[gdf_map_subset].explore('project_title', cmap= 'tab20c', height = 400, width = 600,  style_kwds={'weight':4}, legend = False)

### ATP for Presentation

In [200]:
atp_keywords = [
    "bicycle",
    "bike",
    "pedestrian",
    "walk",
    "active trans",
    "curb",
    "ramp",
    "cycle",
    "atp",
]

In [201]:
all_mpo_df = all_mpo_df.drop(
    columns=[
        "lower_case_project_title",
        "lower_case_project_description",
        "lower_case_project_title_keyword_search",
        "lower_case_project_description_keyword_search",
    ]
)

In [202]:
atp_gdf, atp_df = filter_projects(
    all_mpo_df,
    [
        "project_title",
        "project_description",
    ],
    atp_keywords,
    "atp_projects",
    True,
)

  df[i]
  df[i]


In [204]:
atp_df["categories"] = atp_df.lower_case_project_description_keyword_search

In [205]:
# Delete "keyword not found" result and replace with the keyword from project_title
# Save this to GCS
# https://stackoverflow.com/questions/62989923/pandas-dataframe-replace-part-of-string-with-value-from-another-column
atp_df["categories"] = atp_df.apply(
    lambda x: x["categories"].replace(
        "keyword not found", str(x["lower_case_project_title_keyword_search"])
    ),
    axis=1,
)

In [206]:
atp_df.categories.value_counts()

bike            2032
walk             775
ramp             540
pedestrian       374
bicycle          276
curb             270
active trans      34
cycle             24
atp               24
Name: categories, dtype: int64

In [207]:
atp = (
    atp_df.groupby(["data_source"])
    .agg({"project_title": "count"})
    .sort_values(["project_title"])
    .reset_index()
    .rename(columns={"project_title": "n_atp_projects"})
)

In [208]:
all_projects = (
    all_mpo_df.groupby(["data_source"])
    .agg({"project_title": "count"})
    .sort_values(["project_title"])
    .reset_index()
    .rename(columns={"project_title": "n_projects"})
)

In [209]:
# Save this to GCS
atp_m = pd.merge(atp, all_projects, on=["data_source"], how="inner")

In [210]:
atp_m["percent_of_atp_projects"] = atp_m.n_atp_projects / atp_m.n_projects * 100

In [211]:
atp_m.columns = atp_m.columns.str.replace("_", " ").str.strip().str.title()

In [212]:
# with pd.ExcelWriter(f"{harmonization_utils.GCS_FILE_PATH}LRTP/atp_datapalooza.xlsx") as writer:
#    atp_df.to_excel(writer, sheet_name="full_atp", index=False)
#   atp_m.to_excel(writer, sheet_name="summarized_atp", index=False)

### Categorizing

In [213]:
all_mpo_df = all_mpo_df.fillna("None")

In [216]:
def add_categories(df):
    """
    Create general categories for each projects.
    https://github.com/cal-itp/data-analyses/blob/29ed3ad1d107c6be09fecbc1a5f3d8ef5f2b2da6/dla/dla_utils/clean_data.py#L305
    """
    # There are many projects that are
    ACTIVE_TRANSPORTATION = [
        "bike",
        "bicycle",
        "cyclist",
        "pedestrian",
        ## including the spelling errors of `pedestrian`
        "pedestrain",
        "crosswalk",
        "bulb out",
        "bulb-out",
        "active transp",
        "traffic reduction",
        "speed reduction",
        "ped",
        "srts",
        "safe routes to school",
        "sidewalk",
        "side walk",
        "Cl ",
        "trail",
        "atp",
    ]
    TRANSIT = [
        "bus",
        "metro",
        "station",  # Station comes up a few times as a charging station and also as a train station
        "transit",
        "fare",
        "brt",
        "yarts",
        "railroad",
        "highway-rail",
        "streetcar",
        "mass transit",
        # , 'station' in description and 'charging station' not in description
    ]
    BRIDGE = ["bridge", "viaduct"]
    STREET = [
        "traffic signal",
        "resurface",
        "resurfacing",
        "slurry",
        "seal" "sign",
        "stripe",
        "striping",
        "median",
        "guard rail",
        "guardrail",
        "road",
        "street",
        "sinkhole",
        "intersection",
        "signal",
        "curb",
        "light",
        "tree",
        "pavement",
        "roundabout",
    ]

    NOT_INC = []

    FREEWAY = ["hov ", "hot ", "freeway", "highway", "express lanes", "hwy"]

    INFRA_RESILIENCY_ER = [
        "repair",
        "emergency",
        "replace",
        "retrofit",
        "rehab",
        "improvements",
        "seismic",
        "reconstruct",
        "restoration",
    ]

    CONGESTION_RELIEF = [
        "congestion",
        "rideshare",
        "ridesharing",
        "vanpool",
        "car share",
    ]

    PASSENGER_MODE = ["non sov", "high quality transit areas", "hqta", "hov"]

    SAFETY = [
        "fatalities",
        "safe",
        "speed management",
        "signal coordination",
        "slow speeds",
        "roundabouts",
        "victims",
        "collisions",
        "collisoins",
        "protect",
        "crash",
        "modification factors",
        "safety system",
    ]

    MANAGED_LANES = [
        "congestion pricing",
        "variable tolls",
        "express lane",
        "value pricing",
        "rush hour",
        "cordon",
        "dynamic pricing",
        "dynamically  priced",
        "high occupancy",
        "mobility pricing",
        "occupancy",
        "toll lane",
        "performance pricing",
        "peak travel",
        "managed lane",
        "tollway",
        "express toll",
        "fixed pricing",
        "hot lane",
        "hov lane",
        "expressed toll lane",
    ]

    def categorize_project_descriptions(row):
        """
        This function takes a individual type of work description (row of a dataframe)
        and returns a dummy flag of 1 if it finds keyword present in
        project categories (active transportation, transit, bridge, etc).
        A description can contain multiple keywords across categories.
        """
        # Clean up project description 2
        project_description = (
            row.project_description.lower()
            .replace("-", "")
            .replace(".", "")
            .replace(":", "")
        )

        # Store a bunch of columns that will be flagged
        # A project can involve multiple things...also, not sure what's in the descriptions
        active_transp = ""
        transit = ""
        bridge = ""
        street = ""
        freeway = ""
        infra_resiliency_er = ""
        congestion_relief = ""
        passenger_mode_shift = ""
        safety = ""
        managed_lanes = ""

        if any(word in project_description for word in ACTIVE_TRANSPORTATION):
            active_transp = 1
        # if any(word in description if instanceof(word, str) else word(description) for word in TRANSIT)
        if any(word in project_description for word in TRANSIT) and not any(
            exclude_word in project_description for exclude_word in NOT_INC
        ):
            transit = 1
        if any(word in project_description for word in BRIDGE):
            bridge = 1
        if any(word in project_description for word in STREET):
            street = 1
        if any(word in project_description for word in FREEWAY):
            freeway = 1
        if any(word in project_description for word in INFRA_RESILIENCY_ER):
            infra_resiliency_er = 1
        if any(word in project_description for word in CONGESTION_RELIEF):
            congestion_relief = 1
        if any(word in project_description for word in PASSENGER_MODE):
            passenger_mode_shift = 1
        if any(word in project_description for word in SAFETY):
            safety = 1
        if any(word in project_description for word in MANAGED_LANES):
            managed_lanes = 1
        new_cols = [
            "active_transp",
            "transit",
            "bridge",
            "street",
            "freeway",
            "infra_resiliency_er",
            "congestion_relief",
            "passenger_mode_shift",
            "safety",
            "managed_lanes",
        ]

        category_series = pd.Series(
            [
                active_transp,
                transit,
                bridge,
                street,
                freeway,
                infra_resiliency_er,
                congestion_relief,
                passenger_mode_shift,
                safety,
                managed_lanes,
            ],
            index=new_cols,
        )

        return category_series

    work_categories_df = df.apply(categorize_project_descriptions, axis=1)
    new_cols = list(work_categories_df.columns)
    df2 = pd.concat([df, work_categories_df], axis=1)
    df2[new_cols] = df2[new_cols].apply(pd.to_numeric, errors="coerce").fillna(0)
    df2["n_categories"] = df2[new_cols].sum(axis=1)
    return df2

In [217]:
categorized_df = add_categories(all_mpo_df)

In [218]:
drop_cols = [
    "lower_case_project_title",
    "lower_case_project_description",
    "lower_case_project_title_keyword_search",
    "lower_case_project_description_keyword_search",
    "project_category",
    "phase",
    "geometry",
    "city",
    "county",
]

In [219]:
categorized_df = categorized_df.drop(columns=drop_cols)

In [220]:
categorized_df.n_categories.value_counts()

0.00    6217
1.00    4430
2.00    2572
3.00     865
4.00     287
5.00      62
6.00      18
Name: n_categories, dtype: int64

In [221]:
categorized_df_subset = categorized_df[
    [
        "project_title",
        "data_source",
        "active_transp",
        "transit",
        "bridge",
        "street",
        "freeway",
        "infra_resiliency_er",
        "congestion_relief",
        "passenger_mode_shift",
        "safety",
        "managed_lanes",
    ]
]

In [222]:
all_projects = (all_mpo_df
                .groupby(['data_source'])
                .agg({'project_title':'count'})
                .rename(columns = {'project_title':'total_projects'})
                .reset_index()
               )

In [223]:
categorized_df_melt = (pd
        .melt(categorized_df_subset, 
              id_vars=['data_source'], 
              value_vars=["active_transp",
        "transit",
        "bridge",
        "street",
        "freeway",
        "infra_resiliency_er",
        "congestion_relief",
        "passenger_mode_shift",
        "safety",
        "managed_lanes"])
        .rename(columns ={'variable':'category'}
                      ))

In [224]:
categorized_df_melt = (categorized_df_melt
                       .groupby(['data_source','category'])
                       .agg({'value':'sum'})
                       .reset_index()
                      )

In [225]:
categorized_df_melt.category = categorized_df_melt.category.str.replace(
    "_", " "
).str.title()

In [226]:
new_cols = [
    "active_transp",
    "transit",
    "bridge",
    "street",
    "freeway",
    "infra_resiliency_er",
    "congestion_relief",
    "passenger_mode_shift",
    "safety",
    "managed_lanes",
]

In [227]:
category_projects_df = categorized_df[new_cols].sum(axis=0)

In [228]:
category_projects_df = (
    category_projects_df.to_frame()
    .reset_index()
    .rename(columns={"index": "Category", 0: "Total Projects"})
)

In [229]:
category_projects_df

Unnamed: 0,Category,Total Projects
0,active_transp,3068.0
1,transit,1521.0
2,bridge,737.0
3,street,4404.0
4,freeway,712.0
5,infra_resiliency_er,2488.0
6,congestion_relief,81.0
7,passenger_mode_shift,100.0
8,safety,458.0
9,managed_lanes,166.0


In [230]:
category_projects_df.Category = category_projects_df.Category.str.replace(
    "_", " "
).str.title()

In [231]:
with pd.ExcelWriter(f"{harmonization_utils.GCS_FILE_PATH}LRTP/categorized_datapalooza.xlsx") as writer:
    categorized_df.to_excel(writer, sheet_name="categorized", index=False)
    category_projects_df.to_excel(writer, sheet_name="total projects", index=False)
    categorized_df_melt.to_excel(writer, sheet_name="mpo_categorized", index=False)