## https://github.com/cal-itp/data-analyses/issues/890

In [1]:
import _harmonization_utils as har_utils
import _lrtp_utils as lrtp_utils
import _sb1_utils as sb1_utils
import _specific_list_utils
import _state_rail_plan_utils as srp_utils
import geopandas as gpd
import pandas as pd
from calitp_data_analysis import utils
from calitp_data_analysis.sql import to_snakecase
from fuzzywuzzy import fuzz



In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/sb125/local_transit_list/"

### Load Data
* Need to include LP2000
* Add in data source columns
* Add in phase:
    *  3 phases of projects: planned / partially funded / funded & under construction
    * state rail plan (planned)
    * LRTP (partially funded)
    * LP2000, Black Cat, TIRCP (under construction)

In [4]:
def load_lrtp():
    lrtp_df, lrtp_gdf = lrtp_utils.all_mpo(True)
    lrtp_df = lrtp_df.loc[lrtp_df.data_source != "Lost"].reset_index(drop=True)
    lrtp_df["program_col"] = "None"
    lrtp_df["dds_phase"] = "Partially Funded"
    return lrtp_df

In [5]:
lrtp_df = load_lrtp()

  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()


96 rows are headers


  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  sandag.cost2020m.str.replace("$", "")
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()


65 rows are headers


  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()


360 rows are headers


  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()


68 rows are headers


  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()


In [6]:
def load_srp():
    df = srp_utils.clean_state_rail_plan(srp_utils.state_rail_plan_file)
    df["source"] = "State Rail Plan"
    df["program"] = "State Rail Plan"
    df["dds_phase"] = "Planned"
    return df

In [7]:
srp_df = load_srp()

#### Sb1: Which col represents total project cost? cost, totalcosts, costfull
* Also filter out for ONLY TIRCP projects?

In [8]:
def load_sb1():
    # Only includes in progress/incomplete projects
    df = sb1_utils.load_sb1()
    df["source"] = "SB1 Feature Server"
    df["dds_phase"] = "Under Construction"
    df = df.loc[
        df.projprogram == "Transit And Intercity Rail Capital Program"
    ].reset_index(drop=True)
    return df

In [9]:
sb1_df = load_sb1()

  gdf[i]
  gdf[i]


0 rows are mising geometry
7917 rows contain invalid geography


In [10]:
sb1_df.projprogram.value_counts()

Transit And Intercity Rail Capital Program    60
Name: projprogram, dtype: int64

#### Blackcat: Ask what grant_encumbered_amount,local_encumbered_amount,total_encumbered_amount,activebalance mean?

In [11]:
def load_blackcat(file: str) -> pd.DataFrame:
    df = to_snakecase(pd.read_excel(f"{GCS_FILE_PATH}{file}"))
    df["data_source"] = "Blackcat"
    df["dds_phase"] = "Under Construction"
    return df

In [12]:
blackcat_df = load_blackcat("Grant+Projects_1_17_2024.xlsx")

In [13]:
blackcat_df.head(2)

Unnamed: 0,grant_fiscal_year,funding_program,grant_number,project_year,organization_name,upin,description,ali,contract_number,allocationamount,grant_encumbered_amount,local_encumbered_amount,total_encumbered_amount,expendedamount,activebalance,closedoutbalance,project_status,project_closed_by,project_closed_date,project_closed_time,data_source,dds_phase
0,2011,Section 5311,CA-18-X047 | 0012000083,2016,City of Chowchilla,BCG0000228,Operating Assistance,300902,64BO17-00368,53221.0,114511.0,211651.0,326162.0,53221.0,0.0,0,Open,,,,Blackcat,Under Construction
1,2011,Section 5311,CA-18-X047 | 0012000083,2016,Madera County,BCG0000283,Buy <30-Ft Bus For Expansion,111304,64BC17-00408,110663.0,110663.0,14337.0,125000.0,101352.02,9310.98,0,Open,,,,Blackcat,Under Construction


In [14]:
def aggregate_to_one_line(
    df: pd.DataFrame, column_to_group: str, column_to_summarize: str
):
    """
    Aggregate all values onto one line by one goruping val.
    Ex: project ABC has two rows because it has two values for the "fund column"
    as it receives money from fund 1 and fund 2. This function will
    combine fund 1 and fund 2 to fund 1, fund2 into one row.
    """
    df[f"new_{column_to_summarize}"] = df.groupby(column_to_group)[
        column_to_summarize
    ].transform(lambda x: ",".join(x))
    df2 = df[[f"new_{column_to_summarize}", column_to_group]]
    df2 = df2.drop_duplicates(subset=column_to_group).reset_index(drop=True)
    df2 = df2.rename(columns={f"new_{column_to_summarize}": column_to_summarize})
    return df2

In [15]:
def load_lp2000(file: str):
    df_project = to_snakecase(
        pd.read_excel(f"{GCS_FILE_PATH}{file}", sheet_name="project")
    )

    df_county = to_snakecase(
        pd.read_excel(f"{GCS_FILE_PATH}{file}", sheet_name="county")
    ).drop(columns=["project_label_name"])

    df_district = to_snakecase(
        pd.read_excel(f"{GCS_FILE_PATH}{file}", sheet_name="district")
    ).drop(columns=["project_label_name"])

    df_award = to_snakecase(
        pd.read_excel(f"{GCS_FILE_PATH}{file}", sheet_name="awards")
    )

    # Clean up awards so if project has multiple entries, this is all
    # on one line instead of mulitple lines
    df_award2 = aggregate_to_one_line(df_award, "project_id", "grant_program")

    # Merge
    lp2000_df = (
        pd.merge(df_project, df_county, on="project_id", how="left")
        .merge(df_district, on="project_id", how="left")
        .merge(df_award2, on="project_id", how="left")
    )

    # Create
    lp2000_df["postmile_combined"] = (
        "begin: "
        + lp2000_df.revised_post_mile_begin_ind
        + "end: "
        + lp2000_df.revised_post_mile_end_ind
    )
    lp2000_df_to_keep = [
        "comment_desc",
        "est_total_prj_costs",
        "location_name",
        "route_name",
        "state_hwy_ind",
        "senate_district_code",
        "update_date_time",
        "agency_name",
        "urban_area_code",
        "work_type_desc",
        "category_desc",
        "current_phase",
        "county_name",
        "project_label_name",
        "district_code",
        "grant_program",
        "postmile_combined",
    ]
    lp2000_df = lp2000_df[lp2000_df_to_keep]
    lp2000_df["dds_phase"] = "Under Construction"
    lp2000_df["source"] = "LP2000"
    return lp2000_df

In [16]:
lp2000_df = load_lp2000("LP2000.xlsx")

### Harmonize it

In [17]:
def create_notes(df, note_cols: list, new_col_name: str):
    """
    Concat multiple columns into one.
    """
    prefix = "_"
    for column in note_cols:
        df[f"{prefix}{column}"] = df[column].astype(str)
    note_cols = [prefix + sub for sub in note_cols]

    # https://stackoverflow.com/questions/65532480/how-to-combine-column-names-and-values
    def combine_notes(x):
        return ", ".join([col + ": " + x[col] for col in note_cols])

    df[new_col_name] = df.apply(combine_notes, axis=1)
    df[new_col_name] = df[new_col_name].str.replace("_", " ")
    df[new_col_name] = df[new_col_name].fillna("None")
    return df

In [18]:
def harmonizing(
    df,
    agency_name_col: str,
    project_name_col: str,
    project_description_col: str,
    phase_col: str,
    project_cost_col: str,
    geography_col: str,
    post_mile_col: str,
    county_col: str,
    city_col: str,
    district_col: str,
    project_start_year_col: str,
    project_completion_year_col: str,
    program_col: str,
    data_source: str,
    fund_cols: list,
    notes_cols: list,
):
    """
    Take a dataset and change the column names/types to
    default names and formats.
    """
    rename_columns = {
        agency_name_col: "lead_agency",
        project_name_col: "project_title",
        project_description_col: "project_description",
        project_cost_col: "total_project_cost",
        geography_col: "geometry",
        phase_col: "phase",
        post_mile_col: "post_mile",
        county_col: "county",
        city_col: "city",
        district_col: "ct_district",
        project_start_year_col: "project_start_year",
        project_completion_year_col: "project_completion_year",
        program_col: "grant_program",
        data_source: "source",
    }
    # Rename columns
    df = df.rename(columns=rename_columns)

    # Create columns even if they don't exist, just to harmonize
    # before concatting.
    create_columns = [
        "county",
        "city",
        "notes",
        "project_start_year",
        "project_completion_year",
        "post_mile",
        "phase",
        "project_title",
        "ct_district",
    ]
    for column in create_columns:
        if column not in df:
            df[column] = "None"
    if "geometry" not in df:
        df["geometry"] = None
    if "grant_program" not in df:
        df["grant_program"] = data_source
    if "total_project_cost" not in df:
        df["total_project_cost"] = 0

    # Clean up monetary columns to be interger
    cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
    for i in cost_columns:
        df[i] = df[i].apply(pd.to_numeric, errors="coerce").fillna(0)

    # Add new column with funding breakout
    # Since it's summarized above and the details are suppressed.
    df["total_available_funds"] = df[fund_cols].sum(axis=1)
    df = create_notes(df, fund_cols, "funding_notes")

    # Create notes - aka other columns that were supressed
    df = create_notes(df, notes_cols, "notes")

    # Fill in nones
    geometry_column = "geometry"

    # Create a dictionary to specify replacement values for each data type
    replacement_values = {"float64": 0.0, "object": "None"}

    # Exclude the geometry column from the fillna operation
    columns_to_fill = [col for col in df.columns if col != geometry_column]

    # Fill null values in selected columns
    df[columns_to_fill] = df[columns_to_fill].fillna(
        df.dtypes.replace(replacement_values)
    )

    columns_to_keep = [
        "project_title",
        "lead_agency",
        "project_description",
        "total_project_cost",
        "total_available_funds",
        "phase",
        "post_mile",
        "county",
        "city",
        "ct_district",
        "project_start_year",
        "project_completion_year",
        "geometry",
        "grant_program",
        "source",
        "notes",
        "funding_notes",
    ]

    # Only keep certain columns
    df = df[columns_to_keep]
    return df

In [19]:
lrtp_df_harmonized = harmonizing(
    df=lrtp_df,
    agency_name_col="lead_agency",
    project_name_col="project_title",
    project_description_col="project_description",
    phase_col="dds_phase",
    project_cost_col="total_project_cost",
    geography_col="geometry",
    post_mile_col="",
    county_col="county",
    city_col="city",
    district_col="",
    project_start_year_col="",
    project_completion_year_col="",
    program_col="program_col",
    data_source="data_source",
    fund_cols=[],
    notes_cols=["notes"],
)

  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()


In [20]:
lrtp_df_harmonized.sample()

Unnamed: 0,project_title,lead_agency,project_description,total_project_cost,total_available_funds,phase,post_mile,county,city,ct_district,project_start_year,project_completion_year,geometry,grant_program,source,notes,funding_notes
10758,No Title,Scag,Pacific Coast Highway Regional Traffic Message Systems. The Project Will Enable The City Of Malibu And Other Agencies\nTo Notify Travelers Of Critical Regional Traffic And Safety Information And Facilitate Traffic Flow Throughout The Region. The Project Will Install A Maximum Of 4 Permanent Changeable Message Signs At Strategic Locations Along Pch/Sr-1 Corridor In The City Of Malibu.,2500000.0,0.0,Partially Funded,,,,,,,,,Scag Lrtp,"notes: System: State Highway, Route #: 1, Route Name: Nan, From: Nan, To: Nan",


In [21]:
blackcat_df_harmonized = harmonizing(
    df=blackcat_df,
    agency_name_col="organization_name",
    project_name_col="",
    project_description_col="description",
    phase_col="dds_phase",
    project_cost_col="",
    geography_col="",
    post_mile_col="",
    county_col="",
    city_col="",
    district_col="",
    project_start_year_col="project_year",
    project_completion_year_col="project_closed_date",
    program_col="funding_program",
    data_source="data_source",
    fund_cols=["allocationamount"],
    notes_cols=[
        "grant_fiscal_year",
        "grant_encumbered_amount",
        "local_encumbered_amount",
        "total_encumbered_amount",
        "expendedamount",
        "activebalance",
        "closedoutbalance",
        "project_status",
    ],
)

  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()


In [22]:
sb1_df_harmonized = harmonizing(
    df=sb1_df,
    agency_name_col="projagency",
    project_name_col="projname",
    project_description_col="description",
    phase_col="dds_phase",
    project_cost_col="totalcosts",
    geography_col="geometry",
    post_mile_col="",
    county_col="countyname",
    city_col="cityname",
    district_col="",
    project_start_year_col="",
    project_completion_year_col="",
    program_col="projprogram",
    data_source="source",
    fund_cols=[],
    notes_cols=[
        "projcategory",
        "projprogcode",
        "multiprogfunded",
        "projstatus",
        "cost",
        "assemblydistrict",
        "senatedistrict",
        "assemblycode",
        "senatecode",
        "appagencyname",
        "impagencyname",
        "routes",
        "constyear",
        "costfull",
    ],
)

  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()


In [23]:
sb1_df_harmonized.grant_program.value_counts()

Transit And Intercity Rail Capital Program    60
Name: grant_program, dtype: int64

In [24]:
srp_df_harmonized = harmonizing(
    df=srp_df,
    agency_name_col="lead_agency",
    project_name_col="project_name",
    project_description_col="project_description",
    phase_col="",
    project_cost_col="total_project_cost",
    geography_col="",
    post_mile_col="",
    county_col="",
    city_col="",
    district_col="",
    project_start_year_col="",
    project_completion_year_col="",
    program_col="program",
    data_source="source",
    fund_cols=[],
    notes_cols=[
        "project_time_horizon",
        "srp_region",
        "corridor",
        "sub_corridor_node_1",
        "sub_corridor_node_2",
        "itsp_corridor",
        "project_category",
    ],
)

  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()


In [25]:
lp2000_df_harmonized = harmonizing(
    df=lp2000_df,
    agency_name_col="agency_name",
    project_name_col="project_label_name",
    project_description_col="work_type_desc",
    phase_col="dds_phase",
    project_cost_col="est_total_prj_costs",
    geography_col="",
    post_mile_col="",
    county_col="county_name",
    city_col="",
    district_col="",
    project_start_year_col="",
    project_completion_year_col="",
    program_col="grant_program",
    data_source="source",
    fund_cols=[],
    notes_cols=[
        "location_name",
        "route_name",
        "state_hwy_ind",
        "senate_district_code",
        "category_desc",
        "district_code",
        "comment_desc",
        "postmile_combined",
    ],
)

  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()


### Stack
Waiting
* Clarify the monetary cols of SB1 & BlackCat

Christian's Notes
* What amount of transit related projects are in this big list? 
* How big the projects are by cost? 
* Compare the cost of all the transit projects against all the projects in the list?
* Use percentages.

Done
* Harmonize county/city/lead agency names
* LRTP grant program should be none'

In [26]:
complete = pd.concat(
    [
        lrtp_df_harmonized,
        blackcat_df_harmonized,
        sb1_df_harmonized,
        srp_df_harmonized,
        lp2000_df_harmonized,
    ],
    ignore_index=True,
)

In [27]:
def clean_strings(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """
    Use to clean project names
    """
    # Convert to lowercase and strip leading/trailing whitespaces
    df[column] = df[column].str.title().str.strip()

    # Replace consecutive whitespaces with a single space
    df[column] = df[column].replace(r"\s+", " ", regex=True)

    # Remove specific characters
    chars_to_remove = ["-", "/", ")", "(", ".", "County", "Of", "District"]
    for char in chars_to_remove:
        df[column] = df[column].str.replace(char, "")

    df[column] = df[column].astype(str).replace("\d+", "", regex=True)
    return df

In [28]:
complete = clean_strings(complete, "lead_agency")

  df[column] = df[column].str.replace(char, "")


In [29]:
complete.lead_agency.nunique()

1056

In [30]:
len(complete)

29381

In [31]:
complete.source.value_counts()

LP2000                11233
Blackcat               3385
Fresno Cog Lrtp        3147
Scag Lrtp              2952
Sacog Lrtp             1601
Kern Cog Lrtp          1411
Scrtpa Lrtp            1066
Madera Ctc Lrtp         765
Stancog Lrtp            552
Slocog Lrtp             420
Sbcag Lrtp              419
Sandag Lrtp             416
Tcag Lrtp               337
Mtc Lrtp                282
Ambag Lrtp              280
State Rail Plan         276
Sjcog Lrtp              262
Bcag Lrtp               250
Mcagov Lrtp             108
Kcag Lrtp                84
Tmpo Lrtp                75
SB1 Feature Server       60
Name: source, dtype: int64

#### Try to find duplicated projects

In [38]:
project_names_only = complete[['project_title', 'source']]

In [39]:
len(project_names_only)

29381

In [42]:
# Drop dups if the project title has the same source
project_names_only2 = project_names_only.drop_duplicates()

In [43]:
len(project_names_only2)

14331

In [44]:
project_names_only2 = project_names_only2[['project_title']]

In [66]:
import numpy as np

In [69]:

num_chunks = 5
df_chunks = split_dataframe(project_names_only2, num_chunks)


In [80]:
type(df_chunks)

list

In [32]:
# complete.to_excel(f"{GCS_FILE_PATH}all_local_projects.xlsx", index=False)

### Filter for just transit

In [None]:
def filter_projects(
    df,
    columns_to_search: list,
    keywords_search: list,
    file_name: str,
    gcs_path: str,
    projects_to_del: list,
):

    # Filter out for Cordon
    df = _specific_list_utils.find_keywords(df, columns_to_search, keywords_search)
    df2 = (
        df[
            (df.lower_case_project_title_keyword_search != "keyword not found")
            | (df.lower_case_project_description_keyword_search != "keyword not found")
        ]
    ).reset_index(drop=True)

    # Delete out non HOV projects that were accidentally picked up
    projects_to_delete = []
    df2 = df2[~df2.project_title.isin(projects_to_delete)].reset_index(drop=True)

    # Change cases
    for i in ["project_title", "project_description"]:
        df2[i] = df2[i].str.title()

    columns_to_drop = ["lower_case_project_title", "lower_case_project_description"]
    df2 = df2.drop(columns=columns_to_drop)
    df2 = df2.fillna(df.dtypes.replace({"float64": 0.0, "object": "None"}))

    df2.to_excel(f"{gcs_path}{file_name}.xlsx", index=False)
    return df2

In [None]:
transit_terms = [
    "automated transit",
    "brt",
    "bus",
    "bus depot",
    "bus lane",
    "bus lanes",
    "bus network",
    "bus rapid transit",
    "bus route",
    "bus routes",
    "bus service",
    "bus shelter",
    "bus stop",
    "bus terminal",
    "buses",
    "commuter rail",
    "express bus",
    "facility",
    "fare card",
    "fare collection",
    "fare evasion",
    "fare integration",
    "fare payment",
    "fare structure",
    "fare subsidy",
    "fare technology",
    "farebox",
    "ferry",
    "ferrys",
    "high quality transit areas",
    "integrated transit",
    "intermodal",
    "light rail",
    "light rail vehicles",
    "locomotives",
    "mobility as a service",
    "mobility hub",
    "multimodal",
    "paratransit",
    "rail",
    "rail network",
    "railroad",
    "shuttle",
    "shuttles",
    "station",
    "terminal",
    "train",
    "trains",
    "transit",
    "transit agency",
    "transit center",
    "transit hub",
    "transit signal priority",
    "transit-oriented development",
    "transportation equity",
    "trolley",
    "trolleys",
    "van",
    "vans"
]


In [None]:
len(transit_terms)

In [None]:
transit_df = filter_projects(
    complete,
    [
        "project_title",
        "project_description",
    ],
    transit_terms,
    "sb125_transit",
    GCS_FILE_PATH,
    [],
)

In [None]:
transit_df.shape

### Insights

In [None]:
f"{(int(len(transit_df) / len(complete) * 100))}% or {len(transit_df)} of projects in this list are related to Transit"

In [None]:
transit_cost = int(transit_df.total_project_cost.sum())

In [None]:
formatted_total_cost = "{:,}".format(transit_cost)

In [None]:
total_projects_cost = int(complete.total_project_cost.sum())

In [None]:
formatted_projects_cost = "{:,}".format(total_projects_cost)

In [None]:
f"The total estimated cost is ${formatted_total_cost} compared to ${formatted_projects_cost} in all the compiled projects. This makes up {int((transit_cost/total_projects_cost) * 100)}% of the requested funding (that we have on file)."

In [None]:
transit_df.source.value_counts()

#### Keywords that were picked up

In [None]:
def count_categories(df: pd.DataFrame, column: str):
    # Convert the result to a DataFrame
    filtered_df = df.loc[df[column] != "keyword not found"][[column]].value_counts()
    result_df = pd.DataFrame(filtered_df, columns=["Count"])

    # Reset the index to make the keyword a regular column
    result_df = result_df.reset_index()

    # Rename the columns if needed
    result_df.columns = ["Keyword", "Count"]
    return result_df

In [None]:
proj_desc = count_categories(
    transit_df, "lower_case_project_description_keyword_search"
)

In [None]:
title = count_categories(transit_df, "lower_case_project_title_keyword_search")

In [None]:
categories = pd.merge(proj_desc, title, on="Keyword", how="outer", indicator=True)

In [None]:
categories["Total Projects"] = categories.Count_x.fillna(0) + categories.Count_y.fillna(
    0
)

In [None]:
categories = (
    categories.sort_values(by=["Total Projects"], ascending=False)
    .reset_index(drop=True)
    .drop(columns=["Count_x", "Count_y", "_merge"])
)

In [None]:
categories

In [None]:
transit_df.sort_values(by = ['total_project_cost'], ascending = False).drop(columns = ['geometry']).head()