## https://github.com/cal-itp/data-analyses/issues/890

In [1]:
import _harmonization_utils as har_utils
import _lrtp_utils as lrtp_utils
import _sb1_utils as sb1_utils
import _state_rail_plan_utils as srp_utils
import geopandas as gpd
import pandas as pd
from calitp_data_analysis import utils
from calitp_data_analysis.sql import to_snakecase

In [2]:
# Notebook display settings: show up to 100 columns, format floats with
# two decimals, and lift the row-count and column-width limits.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
for unlimited_option in ("display.max_rows", "display.max_colwidth"):
    pd.set_option(unlimited_option, None)

In [3]:
# Folder prefix prepended to the Excel file names read by load_blackcat()
# and load_lp2000().
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/sb125/"

### Load Data
* Need to include LP2000
* Add in data source columns
* Add in phase:
    *  3 phases of projects: planned / partially funded / funded & under construction
    * state rail plan (planned)
    * LRTP (partially funded)
    * LP2000, Black Cat, TIRCP (under construction)

In [4]:
def load_lrtp():
    """
    Load the LRTP projects from lrtp_utils.all_mpo, drop every row
    whose data_source is "Lost", and tag the remaining rows with a
    program column and the "Partially Funded" phase.
    """
    # all_mpo returns two objects; only the first one is used here.
    projects, _ = lrtp_utils.all_mpo(True)

    not_lost = projects["data_source"] != "Lost"
    projects = projects.loc[not_lost].reset_index(drop=True)

    # The program is the same value as the data source for LRTP rows.
    projects["program_col"] = projects["data_source"]
    projects["dds_phase"] = "Partially Funded"
    return projects

In [5]:
lrtp_df = load_lrtp()

  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()


96 rows are headers


  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  sandag.cost2020m.str.replace("$", "")
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()


65 rows are headers


  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()


360 rows are headers


  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()


68 rows are headers


  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()


In [6]:
def load_srp():
    """
    Load the State Rail Plan projects and tag every row with its
    source, program and the "Planned" phase.
    """
    rail_plan = har_utils.load_state_rail_plan()
    constant_tags = {
        "source": "State Rail Plan",
        "program": "State Rail Plan",
        "dds_phase": "Planned",
    }
    for column_name, tag in constant_tags.items():
        rail_plan[column_name] = tag
    return rail_plan

In [7]:
srp_df = load_srp()

#### SB1: Which column represents total project cost: `cost`, `totalcosts`, or `costfull`?
* Should this also be filtered down to ONLY TIRCP projects?

In [8]:
def load_sb1():
    """
    Load the SB1 projects and tag every row with its source and the
    "Under Construction" phase.
    """
    sb1_projects = sb1_utils.load_sb1()
    constant_tags = {
        "source": "SB1 Feature Server",
        "dds_phase": "Under Construction",
    }
    for column_name, tag in constant_tags.items():
        sb1_projects[column_name] = tag
    return sb1_projects

In [9]:
sb1_df = load_sb1()

  gdf[i]
  gdf[i]


0 rows are mising geometry
7917 rows contain invalid geography


In [40]:
sb1_df.projprogram.value_counts()

201718 Local Streets And Roads                3814
201819 Local Streets And Roads                2295
State Highway Projects                        1089
Local Streets And Roads                        539
State Of Good Repair                           429
State Transit Assistance                       370
Active Transportation Program                  318
Local Partnership Program                      174
Transit And Intercity Rail Capital Program      60
Trade Corridor Enhancement Program              57
State Rail Assistance                           23
Solutions For Congested Corridors Program       17
Solutions To Congested Corridors Program         1
Name: projprogram, dtype: int64

#### Blackcat: Ask what `grant_encumbered_amount`, `local_encumbered_amount`, `total_encumbered_amount`, and `activebalance` mean.

In [10]:
def load_blackcat(file: str) -> pd.DataFrame:
    """
    Read a Blackcat Excel export from GCS_FILE_PATH, snakecase its
    column names, and tag every row with its data source and the
    "Under Construction" phase.

    Args:
        file: name of the Excel file inside GCS_FILE_PATH.
    """
    workbook_path = f"{GCS_FILE_PATH}{file}"
    raw_export = pd.read_excel(workbook_path)

    blackcat = to_snakecase(raw_export)
    blackcat["data_source"] = "Blackcat"
    blackcat["dds_phase"] = "Under Construction"
    return blackcat

In [11]:
blackcat_df = load_blackcat("Grant+Projects_1_17_2024.xlsx")

In [12]:
def aggregate_to_one_line(
    df: pd.DataFrame, column_to_group: str, column_to_summarize: str
) -> pd.DataFrame:
    """
    Collapse a long table to one row per group, joining the values of
    one column into a single comma separated string.

    The input frame is left untouched (no helper column is added to it).

    Args:
        df: table with possibly several rows per group.
        column_to_group: column identifying a group (e.g. a project id).
        column_to_summarize: column whose values are joined with ",".

    Returns:
        A new DataFrame with the columns
        [column_to_summarize, column_to_group], one row per group, in
        order of first appearance. Missing values are skipped when
        joining; a group with no values at all gets a missing value.
    """

    def _join_values(values: pd.Series):
        # Skip NaN/None and stringify so mixed or numeric values cannot
        # make str.join raise a TypeError.
        joined = ",".join(values.dropna().astype(str))
        return joined if joined else None

    summarized = (
        # sort=False keeps first-appearance order; dropna=False keeps
        # rows whose group key is missing instead of silently dropping them.
        df.groupby(column_to_group, sort=False, dropna=False)[column_to_summarize]
        .agg(_join_values)
        .reset_index()
    )
    return summarized[[column_to_summarize, column_to_group]]

In [33]:
def load_lp2000(file: str) -> pd.DataFrame:
    """
    Load the LP2000 workbook and flatten it to one table.

    The "project", "county", "district" and "awards" sheets are read
    in a single pass, left-joined on project_id, trimmed to the columns
    used downstream, and tagged with the source and phase.

    Args:
        file: name of the Excel workbook inside GCS_FILE_PATH.

    Returns:
        A DataFrame holding the columns listed in lp2000_df_to_keep
        plus "dds_phase" and "source".
    """
    # Read every sheet with one call so the workbook is only fetched once.
    sheets = pd.read_excel(
        f"{GCS_FILE_PATH}{file}",
        sheet_name=["project", "county", "district", "awards"],
    )

    df_project = to_snakecase(sheets["project"])

    # project_label_name is dropped from county/district so the merges
    # below do not produce suffixed duplicates of the project sheet's column.
    df_county = to_snakecase(sheets["county"]).drop(columns=["project_label_name"])
    df_district = to_snakecase(sheets["district"]).drop(
        columns=["project_label_name"]
    )

    df_award = to_snakecase(sheets["awards"])

    # Clean up awards so if a project has multiple entries, they are all
    # on one line instead of multiple lines.
    df_award2 = aggregate_to_one_line(df_award, "project_id", "grant_program")

    # Merge everything onto the project sheet.
    lp2000_df = (
        pd.merge(df_project, df_county, on="project_id", how="left")
        .merge(df_district, on="project_id", how="left")
        .merge(df_award2, on="project_id", how="left")
    )

    # Combine the begin/end post miles into one readable column.
    # A separator is placed before "end:" so the two values do not run
    # together ("begin: Xend: Y").
    lp2000_df["postmile_combined"] = (
        "begin: "
        + lp2000_df.revised_post_mile_begin_ind
        + ", end: "
        + lp2000_df.revised_post_mile_end_ind
    )
    lp2000_df_to_keep = [
        "comment_desc",
        "est_total_prj_costs",
        "location_name",
        "route_name",
        "state_hwy_ind",
        "senate_district_code",
        "update_date_time",
        "agency_name",
        "urban_area_code",
        "work_type_desc",
        "category_desc",
        "current_phase",
        "county_name",
        "project_label_name",
        "district_code",
        "grant_program",
        "postmile_combined",
    ]
    # Copy the subset so the assignments below write to an independent
    # frame rather than to a slice of the merged one.
    lp2000_df = lp2000_df[lp2000_df_to_keep].copy()
    lp2000_df["dds_phase"] = "Under Construction"
    lp2000_df["source"] = "LP2000"
    return lp2000_df

In [34]:
lp2000_df = load_lp2000("LP2000.xlsx")

### Harmonize it

In [18]:
def create_notes(df, note_cols: list, new_col_name: str):
    """
    Concat multiple columns into one text column.

    Each column in note_cols contributes "<label>: <value>", where the
    label is the column name with underscores turned into spaces and
    the value is the cell cast to str (underscores in values are also
    turned into spaces). The pieces are joined with ", ".

    No helper columns are added to df; only new_col_name is written.
    new_col_name may be one of note_cols, in which case the original
    values are read before the column is overwritten.

    Args:
        df: frame holding the columns to combine; modified in place.
        note_cols: names of the columns to combine, in output order.
        new_col_name: name of the column that receives the text.

    Returns:
        df, with new_col_name set. When note_cols is empty every row
        gets an empty string.
    """
    combined = None
    for column in note_cols:
        label = column.replace("_", " ")
        values = df[column].astype(str).str.replace("_", " ", regex=False)
        piece = label + ": " + values
        combined = piece if combined is None else combined + ", " + piece

    if combined is None:
        # Nothing to combine: keep the column present but empty.
        combined = pd.Series("", index=df.index, dtype="object")

    df[new_col_name] = combined
    return df

In [19]:
def harmonizing(
    df: pd.DataFrame,
    agency_name_col: str,
    project_name_col: str,
    project_description_col: str,
    phase_col: str,
    project_cost_col: str,
    geography_col: str,
    post_mile_col: str,
    county_col: str,
    city_col: str,
    district_col: str,
    project_start_year_col: str,
    project_completion_year_col: str,
    program_col: str,
    data_source: str,
    fund_cols: list,
    notes_cols: list,
) -> pd.DataFrame:
    """
    Take a dataset and change the column names/types to
    default names and formats.

    Every *_col argument (and data_source) is the name of the column in
    df that holds that attribute; pass "" when the dataset has no such
    column and a placeholder is created instead.

    Args:
        df: the dataset to harmonize. It is not modified.
        agency_name_col: column renamed to "lead_agency".
        project_name_col: column renamed to "project_title".
        project_description_col: column renamed to "project_description".
        phase_col: column renamed to "phase".
        project_cost_col: column renamed to "total_project_cost".
        geography_col: column renamed to "geometry".
        post_mile_col: column renamed to "post_mile".
        county_col: column renamed to "county".
        city_col: column renamed to "city".
        district_col: column renamed to "ct_district".
        project_start_year_col: column renamed to "project_start_year".
        project_completion_year_col: column renamed to
            "project_completion_year".
        program_col: column renamed to "grant_program".
        data_source: column renamed to "source".
        fund_cols: columns summed into "total_available_funds" and
            listed in "funding_notes".
        notes_cols: columns concatenated into "notes".

    Returns:
        A new DataFrame restricted to the harmonized columns.
    """
    column_targets = [
        (agency_name_col, "lead_agency"),
        (project_name_col, "project_title"),
        (project_description_col, "project_description"),
        (project_cost_col, "total_project_cost"),
        (geography_col, "geometry"),
        (phase_col, "phase"),
        (post_mile_col, "post_mile"),
        (county_col, "county"),
        (city_col, "city"),
        (district_col, "ct_district"),
        (project_start_year_col, "project_start_year"),
        (project_completion_year_col, "project_completion_year"),
        (program_col, "grant_program"),
        (data_source, "source"),
    ]
    # Callers pass "" for attributes their dataset lacks; leave those
    # out of the mapping instead of registering "" as a column to rename.
    rename_columns = {old: new for old, new in column_targets if old}

    # Rename columns (returns a new frame, so the caller's df is untouched)
    df = df.rename(columns=rename_columns)

    # Create columns even if they don't exist, just to harmonize
    # before concatting.
    create_columns = [
        "county",
        "city",
        "notes",
        "project_start_year",
        "project_completion_year",
        "post_mile",
        "phase",
        "project_title",
        "ct_district",
    ]
    for column in create_columns:
        if column not in df:
            df[column] = "None"
    if "geometry" not in df:
        df["geometry"] = None
    if "grant_program" not in df:
        # data_source is the *name* of the source column (renamed to
        # "source" above), so fall back to its values rather than to
        # the literal column name.
        df["grant_program"] = df["source"] if "source" in df else data_source
    if "total_project_cost" not in df:
        df["total_project_cost"] = 0

    # Clean up monetary columns to be numeric.
    # The pattern has no capture group, which avoids the pandas
    # "pattern has match groups" UserWarning on every call.
    is_monetary = df.columns.str.contains("cost|funds", regex=True, na=False)
    cost_columns = df.columns[is_monetary].tolist()
    for column in cost_columns:
        df[column] = pd.to_numeric(df[column], errors="coerce").fillna(0)

    # Add new column with funding breakout
    # Since it's summarized above and the details are suppressed.
    df["total_available_funds"] = df[fund_cols].sum(axis=1)
    df = create_notes(df, fund_cols, "funding_notes")

    # Add column for unfunded needs
    # df["unfunded_needs_estimate"] = (
    #    df["total_project_cost"] - df["total_available_funds"]
    # )

    # Create notes - aka other columns that were suppressed
    df = create_notes(df, notes_cols, "notes")

    # Fill in nones: 0.0 for float columns, "None" for object columns.
    # The geometry column and columns of any other dtype are left alone.
    geometry_column = "geometry"
    fill_values = {}
    for column, dtype in df.dtypes.items():
        if column == geometry_column:
            continue
        if pd.api.types.is_float_dtype(dtype):
            fill_values[column] = 0.0
        elif pd.api.types.is_object_dtype(dtype):
            fill_values[column] = "None"

    if fill_values:
        columns_to_fill = list(fill_values)
        df[columns_to_fill] = df[columns_to_fill].fillna(value=fill_values)

    columns_to_keep = [
        "project_title",
        "lead_agency",
        "project_description",
        "total_project_cost",
        "total_available_funds",
        "phase",
        "post_mile",
        "county",
        "city",
        "ct_district",
        "project_start_year",
        "project_completion_year",
        "geometry",
        "grant_program",
        "source",
        "notes",
        "funding_notes",
    ]

    # Only keep certain columns; copy so callers get an independent frame.
    return df[columns_to_keep].copy()

In [20]:
# Harmonize the LRTP projects. Post mile, district and start/completion
# year are passed as "" (no source column), so harmonizing() creates
# them as "None" placeholders. No fund columns are supplied, and the
# existing "notes" column is the only input to the combined notes.
lrtp_df_harmonized = harmonizing(
    df=lrtp_df,
    agency_name_col="lead_agency",
    project_name_col="project_title",
    project_description_col="project_description",
    phase_col="dds_phase",
    project_cost_col="total_project_cost",
    geography_col="geometry",
    post_mile_col="",
    county_col="county",
    city_col="city",
    district_col="",
    project_start_year_col="",
    project_completion_year_col="",
    program_col="program_col",
    data_source="data_source",
    fund_cols=[],
    notes_cols=["notes"],
)

  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()


In [21]:
lrtp_df_harmonized.sample()

Unnamed: 0,project_title,lead_agency,project_description,total_project_cost,total_available_funds,phase,post_mile,county,city,ct_district,project_start_year,project_completion_year,geometry,grant_program,source,notes,funding_notes
7957,Sr 905 (Siempre Viva Rd),Sandag,North,42000000.0,0.0,Partially Funded,,,,,,,POINT (-116.939865968489 32.5554557291213),Sandag Lrtp,Sandag Lrtp,"notes: Category: Complete Corridor: Dar, Status: Proposed, Aqc 2016 1: 2050, Aqc 2020 1: 2050, Pricmcp: Sb2S, Conncmcp: Sr 125, Layer Name: Complete Corridors Connectors Intersections Ramps Rp2021, Corridor I: Nan, Type 1: Nan, Existing: Nan, Limits: Nan, Description 1: Nan, Route: Nan, Routetype: Nan, Route Desc: Nan, Rp 2021 Id: Nan, Rp 2021 Id 1: Nan, Capital Cost 2020 Millions: Nan",


In [22]:
# Harmonize the Blackcat grants. No project title, cost, geometry or
# location columns are supplied, so harmonizing() fills them with its
# placeholders (total_project_cost becomes 0). allocationamount is the
# only column summed into total_available_funds; the remaining amount
# and status columns are carried in the notes.
blackcat_df_harmonized = harmonizing(
    df=blackcat_df,
    agency_name_col="organization_name",
    project_name_col="",
    project_description_col="description",
    phase_col="dds_phase",
    project_cost_col="",
    geography_col="",
    post_mile_col="",
    county_col="",
    city_col="",
    district_col="",
    project_start_year_col="project_year",
    project_completion_year_col="project_closed_date",
    program_col="funding_program",
    data_source="data_source",
    fund_cols=["allocationamount"],
    notes_cols=[
        "grant_fiscal_year",
        "grant_encumbered_amount",
        "local_encumbered_amount",
        "total_encumbered_amount",
        "expendedamount",
        "activebalance",
        "closedoutbalance",
        "project_status",
    ],
)

  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()


In [23]:
# Harmonize the SB1 projects. "totalcosts" is used as the total project
# cost; the other cost columns ("cost", "costfull") are only carried in
# the notes. Post mile, district and start/completion year are passed
# as "" and become "None" placeholders.
sb1_df_harmonized = harmonizing(
    df=sb1_df,
    agency_name_col="projagency",
    project_name_col="projname",
    project_description_col="description",
    phase_col="dds_phase",
    project_cost_col="totalcosts",
    geography_col="geometry",
    post_mile_col="",
    county_col="countyname",
    city_col="cityname",
    district_col="",
    project_start_year_col="",
    project_completion_year_col="",
    program_col="projprogram",
    data_source="source",
    fund_cols=[],
    notes_cols=[
        "projcategory",
        "projprogcode",
        "multiprogfunded",
        "projstatus",
        "cost",
        "assemblydistrict",
        "senatedistrict",
        "assemblycode",
        "senatecode",
        "appagencyname",
        "impagencyname",
        "routes",
        "constyear",
        "costfull",
    ],
)

  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()


In [24]:
sb1_df_harmonized.grant_program.value_counts()

201718 Local Streets And Roads                3814
201819 Local Streets And Roads                2295
State Highway Projects                        1089
Local Streets And Roads                        539
State Of Good Repair                           429
State Transit Assistance                       370
Active Transportation Program                  318
Local Partnership Program                      174
Transit And Intercity Rail Capital Program      60
Trade Corridor Enhancement Program              57
State Rail Assistance                           23
Solutions For Congested Corridors Program       17
Solutions To Congested Corridors Program         1
Name: grant_program, dtype: int64

In [25]:
# Harmonize the State Rail Plan projects.
# phase_col points at "dds_phase" (set to "Planned" by load_srp) so the
# phase is carried through; with phase_col="" the phase was replaced by
# the "None" placeholder.
srp_df_harmonized = harmonizing(
    df=srp_df,
    agency_name_col="lead_agency",
    project_name_col="project_name",
    project_description_col="project_description",
    phase_col="dds_phase",
    project_cost_col="total_project_cost",
    geography_col="",
    post_mile_col="",
    county_col="",
    city_col="",
    district_col="",
    project_start_year_col="",
    project_completion_year_col="",
    program_col="program",
    data_source="source",
    fund_cols=[],
    notes_cols=[
        "project_time_horizon",
        "srp_region",
        "corridor",
        "sub_corridor_node_1",
        "sub_corridor_node_2",
        "itsp_corridor",
        "project_category",
    ],
)

  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()


In [26]:
lp2000_df.sample()

Unnamed: 0,comment_desc,est_total_prj_costs,location_name,route_name,state_hwy_ind,senate_district_code,update_date_time,agency_name,urban_area_code,work_type_desc,category_desc,current_phase,county_name,project_label_name,district_code,grant_program
9669,"I-15 Express Lanes (Contract 1): Construct 1 Exp. Lane in each direction between Cantu-Galleano Ranch Rd. and SR-60 and 2 Exp. Lanes in each direction between SR-60 and north of Foothill Blvd. Additional improvements to AUX LN widening, undercrossing, and reconstruction of ramps and lane transitions where needed.",,,1-SBCT,Y,,2023-05-30 14:19:03,San Bernardino County Transportation Authority,,,Road Related - Other,single phase,San Bernardino County,Road Related - Other,8.0,


In [36]:
lp2000_df.columns

Index(['comment_desc', 'est_total_prj_costs', 'location_name', 'route_name',
       'state_hwy_ind', 'senate_district_code', 'update_date_time',
       'agency_name', 'urban_area_code', 'work_type_desc', 'category_desc',
       'current_phase', 'county_name', 'project_label_name', 'district_code',
       'grant_program', 'postmile_combined', 'dds_phase', 'source'],
      dtype='object')

In [37]:
# Harmonize the LP2000 projects. est_total_prj_costs is used as the
# total project cost and comment_desc as the description.
# NOTE(review): district_code and postmile_combined are only carried in
# the notes while district_col / post_mile_col are passed as "", so
# ct_district and post_mile come out as "None" - confirm this is intended.
lp2000_df_harmonized = harmonizing(
    df=lp2000_df,
    agency_name_col="agency_name",
    project_name_col="project_label_name",
    project_description_col="comment_desc",
    phase_col="dds_phase",
    project_cost_col="est_total_prj_costs",
    geography_col="",
    post_mile_col="",
    county_col="county_name",
    city_col="",
    district_col="",
    project_start_year_col="",
    project_completion_year_col="",
    program_col="grant_program",
    data_source="source",
    fund_cols=[],
    notes_cols=[
        "location_name",
        "route_name",
        "state_hwy_ind",
        "senate_district_code",
        "work_type_desc",
        "category_desc",
        "district_code",
        "postmile_combined",
    ],
)

  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()


In [38]:
lp2000_df_harmonized.sample(2)

Unnamed: 0,project_title,lead_agency,project_description,total_project_cost,total_available_funds,phase,post_mile,county,city,ct_district,project_start_year,project_completion_year,geometry,grant_program,source,notes,funding_notes
649,Implement parking pricing pilot programs in Residential Preferential Permit (RPP) areas adjacent to,Berkeley,"(work description continued) ...existing Southside/telegraph and Elmwood goBerkeely areas, including comprehensive data collection using existing Automated License Plate Recognition system, extensive public outreach, introduction of a pay-by-phone payment system, and upgrades to existing parking signage. Implement targeted TDM strategies in pilot areas, including education/outreach and transit and biking incentives, focused on employees and residents in and around pilot areas.",0.0,0.0,Under Construction,,Alameda County,,,,,,Local Assistance,LP2000,"location name: Residential areas adjacent to Southside/Telegraph and Elmwood go Berkeley areas., route name: 0-BER, state hwy ind: N, senate district code: nan, work type desc: nan, category desc: Miscellaneous - Other, district code: 4.0, postmile combined: nan",
6744,,San Clemente,Road Related - Other,0.0,0.0,Under Construction,,Orange County,,,,,,,LP2000,"location name: REUSE THIS PROJECT NO., route name: 0-SCLE, state hwy ind: N, senate district code: nan, work type desc: Safety, category desc: Road Related - Other, district code: 12.0, postmile combined: nan",
