## Compile Projects
To-do
* Figure out how to version things b/c projects will get updated and we want to track any changes.
* This only needs to be done with data from lp2000 and ctips.
* Need to track changes across all the different dataframes
* Use merges to figure it out?

In [91]:
import hashlib
from datetime import datetime

import _harmonization_utils as har_utils
import _lrtp_utils as lrtp_utils
import _sb1_utils as sb1_utils
import _specific_list_utils
import _state_rail_plan_utils as srp_utils
import geopandas as gpd
import pandas as pd
from calitp_data_analysis.geography_utils import WGS84
from calitp_data_analysis.sql import to_snakecase

In [2]:
from calitp_data_analysis import get_fs

fs = get_fs()
import os

In [3]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [4]:
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/project_list/"

## General Functions

In [5]:
def create_notes(df, note_cols: list, new_col_name: str):
    """
    Concatenates multiple columns into one, with specified prefix and separator.

    Args:
        df (pandas.DataFrame): The DataFrame containing the columns to be concatenated.
        note_cols (list): A list of column names to be concatenated.
        new_col_name (str): The name of the new column that will store the concatenated values.

    Returns:
        pandas.DataFrame: The DataFrame with the new column containing concatenated values.

    Example:
        Suppose df is a DataFrame like this:

            |  A  |  B  |  C  |
            | --- | --- | --- |
            |  x  |  1  |  a  |
            |  y  |  2  |  b  |

        Using create_notes(df, ['A', 'B', 'C'], 'Concatenated'), the resulting DataFrame will be:

            |  A  |  B  |  C  |         Concatenated         |
            | --- | --- | --- | --------------------------- |
            |  x  |  1  |  a  | A: x, B: 1, C: a           |
            |  y  |  2  |  b  | A: y, B: 2, C: b           |
    """
    prefix = "_"  # Prefix for the newly created columns

    # Iterate over each column and create a new column with the prefix
    for column in note_cols:
        df[f"{prefix}{column}"] = df[column].astype(str)

    # Update note_cols to contain the prefixed column names
    note_cols = [prefix + sub for sub in note_cols]

    # Function to combine the notes from multiple columns
    def combine_notes(x):
        return ", ".join([col + ": " + x[col] for col in note_cols])

    # Apply the combine_notes function row-wise to create the new concatenated column
    df[new_col_name] = df.apply(combine_notes, axis=1)

    # Replace underscores with spaces in the concatenated column
    df[new_col_name] = df[new_col_name].str.replace("_", " ")

    return df

In [6]:
def separate_out_df(df: pd.DataFrame, columns_to_keep: list) -> pd.DataFrame:
    """
    Subset the DataFrame based on the specified columns to keep,
    drop any rows in which the values are NaN or "None."

    Parameters:
    df (pd.DataFrame): Input DataFrame.
    columns_to_keep (list): List of column names to keep in the DataFrame.

    Returns:
    pd.DataFrame: DataFrame with specified columns and cleaned rows.
    """
    # Subset DataFrame based on columns_to_keep
    df2 = df[columns_to_keep]

    # Fill in missing values with 'none' if possible
    try:
        df2 = df2.fillna("none")
    except:
        df2

    # Remove 'project_number' from columns_to_keep
    columns_to_keep.remove("project_number")

    # Drop rows that are NaN or "None" based on how many columns are being included in the
    # new dataframe
    if len(columns_to_keep) == 1:
        # Drop rows with any NaN values
        df2 = df2.dropna(how="any")
        # Drop rows where any value in the row is 'none'
        df2 = df2[
            df2.applymap(lambda x: x.lower() if isinstance(x, str) else x) != "none"
        ].dropna()
    else:
        # Keep any row that has a non-null value in at least one of the specified columns
        df2 = df2.dropna(how="all", subset=columns_to_keep)

    print(f"Number of rows left {len(df2)}")
    return df2

In [7]:
def explode_dataframe(df: pd.DataFrame, column_to_explode: str) -> pd.DataFrame:
    """
    Explode a DataFrame based on a specified column.

    This function converts a column of lists or strings separated by commas into multiple rows,
    with each element in the list or string becoming a separate row entry.

    Parameters:
    df (pd.DataFrame): Input DataFrame.
    column_to_explode (str): Name of the column to explode.

    Returns:
    pd.DataFrame: Exploded DataFrame.
    """
    # Apply a function to the specified column to split the strings into lists of elements
    df["Column2"] = df[column_to_explode].apply(
        lambda x: [int(i) if i.isdigit() else i for i in x.replace(",", "").split()]
    )

    # Drop the original column that was exploded
    df = df.drop(columns=[column_to_explode])

    # Explode the DataFrame based on the newly created 'Column2'
    df2 = df.explode("Column2")

    # Rename the exploded column back to the original column name
    df2 = df2.rename(columns={"Column2": column_to_explode})

    df2 = df2.sort_values(by=["project_number"]).reset_index(drop=True)
    return df2

In [8]:
def generate_hash(string) -> str:

    hash_object = hashlib.sha1(string.encode())
    hash_hex = hash_object.hexdigest()
    return hash_hex[:10]

In [9]:
def generate_project_id(df: pd.DataFrame, project_name_col: str) -> pd.DataFrame:
    """
    Generate a unique project ID based on the index number and a string column.

    Args:
        df (pd.DataFrame): The DataFrame containing the data.
        project_name_col (str): The name of the column containing project names.

    Returns:
        pd.DataFrame: The DataFrame with a new column 'project_number' containing the generated project IDs.
    """
    # Reset the index to ensure a clean index for generating project IDs
    df = df.reset_index(drop=True)

    # Create a column containing index numbers
    df["index_number"] = df.index

    # Combine index number and project name column into a single column
    # Convert to string and fill NaN values with 'none'
    df["combo"] = df.index_number.astype("str") + df[project_name_col].astype(
        "str"
    ).fillna("none")

    # Apply a function to generate a unique hash for each combination
    df["project_number"] = df["combo"].apply(generate_hash)

    # Drop intermediate columns used for generation
    df = df.drop(columns=["index_number", "combo"])

    # Check if there are the same number of unique project numbers as rows
    print("Checking that there are the same number of unique project numbers & rows")
    print(f"{len(df) == df.project_number.nunique()}")

    return df

## Separate out data based on schema

### LRTP/LOST

In [10]:
lrtp_lost_df, lrtp_lost_gdf = lrtp_utils.all_mpo(True)

  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()


96 rows are headers


  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  sandag.cost2020m.str.replace("$", "")
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()


65 rows are headers


  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()


360 rows are headers


  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()


68 rows are headers


  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()


In [11]:
lrtp_lost_df = generate_project_id(lrtp_lost_df, "project_title")

Checking that there are the same number of unique project numbers & rows
True


#### LRTP City

In [12]:
lrtp_lost_city = separate_out_df(lrtp_lost_df, ["project_number", "city"])

Number of rows left 745


In [292]:
lrtp_lost_city.sample()

Unnamed: 0,project_number,city
15057,9fa4a3968d,Brawley


#### LRTP County

In [13]:
lrtp_lost_county = separate_out_df(lrtp_lost_df, ["project_number", "county"])

Number of rows left 4012


#### LRTP Geo

In [14]:
lrtp_lost_geo = separate_out_df(lrtp_lost_df, ["project_number", "geometry"])

Number of rows left 1357


In [74]:
lrtp_lost_geo = lrtp_lost_geo.set_geometry("geometry")

In [293]:
lrtp_lost_geo.sample()

Unnamed: 0,project_number,geometry
13094,9f5dd6b9c7,"LINESTRING (-120.71428 35.55439, -120.71225 35.55438)"


In [75]:
type(lrtp_lost_geo)

geopandas.geodataframe.GeoDataFrame

#### LRTP Base Table

In [15]:
lrtp_to_drop = ["county", "city", "geometry"]

In [16]:
lrtp_lost_df = lrtp_lost_df.drop(columns=lrtp_to_drop)

In [17]:
lrtp_lost_df.sample()

Unnamed: 0,project_title,lead_agency,project_year,project_description,total_project_cost,data_source,notes,project_number
2162,Fourth Street Trail - Sierra Ave To Barstow Ave: Trail,Fresno Rtp/Scs Published In 2022,2022-2026,Trail,680000.0,Fresno Rtp/Scs Published In 2022 Lrtp,"Project Type: Bike & Ped, Financial Constraint: Constrained",69c5f9779e


### SB1

In [18]:
sb1_df = sb1_utils.load_sb1()

  gdf[i]
  gdf[i]


0 rows are mising geometry
7917 rows contain invalid geography


In [19]:
sb1_df = generate_project_id(sb1_df, "projectid")

Checking that there are the same number of unique project numbers & rows
True


In [20]:
sb1_df.sample()

Unnamed: 0,objectid,projectid,projname,projcatcode,projcategory,projprogcode,projprogram,multiprogfunded,projstatus,description,cost,assemblydistrict,senatedistrict,assemblycode,senatecode,countyname,cityname,countycode,citycode,appagencyname,impagencyname,popuptitle,popup,geometry,projcount,totalcosts,projlisturl,projlist2url,mponame,mpocode,assetcategory,bphotojpg,bphotohtml,aphotojpg,aphotohtml,routes,constyear,costfull,projagency,data_source,project_number
2703,172307,LsrFy18195958Pp404,Rood Road Project 2 1718 Ongoing,Local,Local And Regional,Lsr1819,201819 Local Streets And Roads,,PreConstruction,Repair Potholes And Other Damaged Road Areas Resurface The Road With Asphalt Slurry Seal Remove And Replace Concrete Curbs And Restriping As Necessary,,56,40,56,40,Imperial Submitted By County,,Imp,,,,,,,,,,,,,,,,,,,2019,0.0,Imperial County,SB1 Server,2b5daa7f6f


#### SB1 Assembly

In [21]:
sb1_assembly = separate_out_df(sb1_df, ["project_number", "assemblydistrict"])

Number of rows left 9185


In [22]:
sb1_assembly = explode_dataframe(sb1_assembly, "assemblydistrict")

In [294]:
sb1_assembly.sample()

Unnamed: 0,project_number,assembly_districts
11389,e3e997ed89,22


In [23]:
# sb1_assembly.loc[sb1_assembly.project_number == "001784c575"]

In [24]:
# sb1_df.loc[sb1_df.project_number == "001784c575"]

##### Check that assemblydistrict and assemblycode are the same values
* Yes they are, just padded with zeroes or spaced out differently.

In [25]:
sb1_df["assembly_same"] = sb1_df.assemblycode == sb1_df.assemblydistrict

In [26]:
sb1_df.assembly_same.value_counts()

True     7217
False    1969
Name: assembly_same, dtype: int64

In [27]:
sb1_df.loc[sb1_df.assembly_same == False][["assemblycode", "assemblydistrict"]].sample(
    10
)

Unnamed: 0,assemblycode,assemblydistrict
596,6266,62 66
6727,3,3
3163,68697374,68 69 73 74
8549,2,2
2976,506,56
4647,3,3
4313,2930,29 30
2890,3944,44 39
5391,1,1
3906,50535459,50 53 54 59


In [28]:
sb1_df["senate_same"] = sb1_df.senatedistrict == sb1_df.senatecode

In [29]:
sb1_df.senate_same.value_counts()

True     6950
False    2236
Name: senate_same, dtype: int64

#### SB1 Awards

In [30]:
sb1_awards = separate_out_df(sb1_df, ["project_number", "projprogram"])

Number of rows left 9186


In [295]:
sb1_awards.sample()

Unnamed: 0,project_number,grant_program,award_year
2113,bd71f7f0e9,201819 Local Streets And Roads,Not Available


#### SB1 City

In [31]:
sb1_city = separate_out_df(sb1_df, ["project_number", "cityname"])

Number of rows left 6696


In [296]:
sb1_city.sample()

Unnamed: 0,project_number,city
3676,480260800d,Guadalupe


#### SB1 County

In [32]:
sb1_county = separate_out_df(sb1_df, ["project_number", "countyname"])

Number of rows left 9184


In [235]:
sb1_county.sample(10)

Unnamed: 0,project_number,countyname
8677,5c7cfac65e,Alameda San Joaquin Santa Clara
3823,1722677420,Shasta
1987,f67a62bdbc,Colusa
5672,aee1fbd6de,Kings
8923,e5fd2419c2,Colusa
6599,d9ca4959c9,Tehama
6030,92e842524c,Marin Submitted By County
1350,a41c0f40ac,Alameda
292,cf7f751ce8,Amador
5373,d8b14965d3,Stanislaus


#### SB1 Geography

In [33]:
sb1_geo = separate_out_df(sb1_df, ["project_number", "geometry"])

Number of rows left 1585


#### SB1 Senate

In [34]:
sb1_senate = separate_out_df(sb1_df, ["project_number", "senatedistrict"])

Number of rows left 9185


In [35]:
sb1_senate = explode_dataframe(sb1_senate, "senatedistrict")

In [297]:
sb1_senate.sample()

Unnamed: 0,project_number,senatedistrict
11023,f271c621f2,8


#### SB1 Base Table

In [36]:
sb1_df = sb1_df.drop(
    columns=[
        "countyname",
        "assemblydistrict",
        "senatedistrict",
        "cityname",
        "geometry",
        "projprogram",
        "assemblycode",
        "senatecode",
        "countycode",
        "citycode",
        "assembly_same",
    ]
)

### LP2000

In [37]:
def load_lp2000(file: str):

    df_project = to_snakecase(
        pd.read_excel(f"{GCS_FILE_PATH}LP2000_CTIPS/{file}", sheet_name="project")
    )

    df_county = to_snakecase(
        pd.read_excel(f"{GCS_FILE_PATH}LP2000_CTIPS/{file}", sheet_name="county")
    ).drop(columns=["project_label_name"])

    df_district = to_snakecase(
        pd.read_excel(f"{GCS_FILE_PATH}LP2000_CTIPS/{file}", sheet_name="district")
    ).drop(columns=["project_label_name"])

    df_award = to_snakecase(
        pd.read_excel(f"{GCS_FILE_PATH}LP2000_CTIPS/{file}", sheet_name="awards")
    )

    df_phase = to_snakecase(
        pd.read_excel(f"{GCS_FILE_PATH}LP2000_CTIPS/{file}", sheet_name="phase_funding")
    )

    return df_project, df_county, df_district, df_award, df_phase

In [38]:
(
    lp2000_project,
    lp2000_county,
    lp2000_district,
    lp2000_award,
    lp2000_phase,
) = load_lp2000("LP2000_projects.xlsx")

In [39]:
lp2000_project = generate_project_id(lp2000_project, "project_id")

Checking that there are the same number of unique project numbers & rows
True


In [40]:
def add_project_number(
    df_with_project_id: pd.DataFrame, right_project: pd.DataFrame, merge_col: str
) -> pd.DataFrame:
    """
    Merge two DataFrames on a specified column and drop the merge column.
    This function is necessary to add the database specific project ID to
    datasets that are already split out based on the schema.

    Args:
        df_with_project_id (pd.DataFrame): The DataFrame with project IDs.
        right_project (pd.DataFrame): The DataFrame containing additional project information.
        merge_col (str): The column name on which the DataFrames will be merged.

    Returns:
        pd.DataFrame: The merged DataFrame without the merge column.
    """
    # Perform an inner merge on the specified column
    keep_cols = ["project_number", merge_col]
    merged_df = pd.merge(
        right_project, df_with_project_id[keep_cols], on=merge_col, how="inner"
    )

    # Drop the merge column
    merged_df = merged_df.drop(columns=merge_col)

    return merged_df

#### LP2000 Award

In [41]:
lp2000_award = add_project_number(lp2000_project, lp2000_award, "project_id")

In [42]:
lp2000_award.sample()

Unnamed: 0,grant_program,state_fiscal_awarded_year,project_number
4148,Local Assistance,1516,0421d91c64


#### LP2000 County

In [43]:
lp2000_county = add_project_number(lp2000_project, lp2000_county, "project_id")

In [298]:
lp2000_county.sample()

Unnamed: 0,county,project_number
1298,Ventura,aff7f24ce3


#### LP2000 District

In [44]:
lp2000_district = add_project_number(lp2000_project, lp2000_district, "project_id")

In [299]:
lp2000_district.sample()

Unnamed: 0,ct_districts,project_number
4761,6.0,d72d0c9f31


#### LP2000 Phase

In [45]:
lp2000_phase = add_project_number(lp2000_project, lp2000_phase, "project_id")

In [300]:
lp2000_phase.sample()

Unnamed: 0,single_phase_cost,single_phase_expenditure_amt,total_state_funds,total_federal_funds,is_state,is_federal,project_number
3705,222000.0,222000.0,222000.0,0.0,Yes,No,3c3d136615


### CTIPS

In [46]:
def load_ctips(file: str):

    df_project = to_snakecase(
        pd.read_excel(f"{GCS_FILE_PATH}LP2000_CTIPS/{file}", sheet_name="project")
    )

    df_agency = to_snakecase(
        pd.read_excel(f"{GCS_FILE_PATH}LP2000_CTIPS/{file}", sheet_name="agencies")
    )

    df_county = to_snakecase(
        pd.read_excel(f"{GCS_FILE_PATH}LP2000_CTIPS/{file}", sheet_name="county")
    )

    df_district = to_snakecase(
        pd.read_excel(f"{GCS_FILE_PATH}LP2000_CTIPS/{file}", sheet_name="district")
    )

    df_phase = to_snakecase(
        pd.read_excel(f"{GCS_FILE_PATH}LP2000_CTIPS/{file}", sheet_name="phase_funding")
    )

    df_award = to_snakecase(
        pd.read_excel(f"{GCS_FILE_PATH}LP2000_CTIPS/{file}", sheet_name="awards")
    )

    df_house = to_snakecase(
        pd.read_excel(f"{GCS_FILE_PATH}LP2000_CTIPS/{file}", sheet_name="us_house")
    )

    df_senate = to_snakecase(
        pd.read_excel(f"{GCS_FILE_PATH}LP2000_CTIPS/{file}", sheet_name="senate")
    )

    df_assembly = to_snakecase(
        pd.read_excel(f"{GCS_FILE_PATH}LP2000_CTIPS/{file}", sheet_name="assembly")
    )

    df_project["data_source"] = "CTIPS"
    return (
        df_project,
        df_agency,
        df_county,
        df_district,
        df_phase,
        df_award,
        df_house,
        df_senate,
        df_assembly,
    )

In [47]:
(
    ctips_project,
    ctips_agencies,
    ctips_county,
    ctips_district,
    ctips_phase,
    ctips_award,
    ctips_house,
    ctips_senate,
    ctips_assembly,
) = load_ctips("CTIPS_data.xlsx")

In [48]:
ctips_project = generate_project_id(ctips_project, "ctips_id")

Checking that there are the same number of unique project numbers & rows
True


#### CTIPS Assembly

In [49]:
ctips_assembly = add_project_number(ctips_project, ctips_assembly, "ctips_id")

In [50]:
ctips_assembly.sample()

Unnamed: 0,assembly,project_number
3443,53,9d76b2a0fc


#### CTIPS Base Table

In [51]:
ctips_agencies.sample()

Unnamed: 0,agency_name,implpaed_agency,implpse_agency,implrw_agency,implcon_agency,ctips_id
9448,Yuba County,,,,,20700001173


In [52]:
ctips_project = pd.merge(ctips_project, ctips_agencies, on="ctips_id", how="left")

In [53]:
ctips_project.sample()

Unnamed: 0,ctips_id,chg_offcl,chg_qual1,chg_qual2,districtid,document,docyear,ea_number,needpurpose,ppno,proj_desc,postmiles1,pm1b,pm2b,pm3b,pm1a,pm2a,pm3a,route1,route2,route3,title,data_source,project_number,agency_name,implpaed_agency,implpse_agency,implrw_agency,implcon_agency
21634,20800000059,1,1,0,5,FTIP,2002,,,,Operations/Maintenance Facility - Santa Maria - Contruction Of An Operations/Maintenace Facility For Transit Buses In The City Of Santa Maria.,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,Operations/Maintenace Facility,CTIPS,6966e100e4,"Santa Maria, City of",,,,


#### CTIPS Award

In [54]:
ctips_award = add_project_number(ctips_project, ctips_award, "ctips_id")

In [301]:
ctips_award.sample()

Unnamed: 0,award_year,grant_program,progdesc,project_number
36811,2013,SHOPP - Major Damage Restoration,Major Damage (Emergency Restoration),de8082842a


#### CTIPS County

In [55]:
ctips_county = add_project_number(ctips_project, ctips_county, "ctips_id")

In [56]:
ctips_county.sample()

Unnamed: 0,county,project_number
2544,El Dorado County,f655ddfdb7


#### CTIPS District

In [57]:
ctips_district = add_project_number(ctips_project, ctips_district, "ctips_id")

In [302]:
ctips_district.sample()

Unnamed: 0,ct_districts,project_number
18890,7.0,cddaab68c8


#### CTIPS House

In [58]:
ctips_house = add_project_number(ctips_project, ctips_house, "ctips_id")

In [303]:
ctips_house.sample()

Unnamed: 0,ushouse,project_number
1445,31,a6be955f44


#### CTIPS Phase

In [59]:
ctips_phase = add_project_number(ctips_project, ctips_phase, "ctips_id")

In [304]:
ctips_phase.sample()

Unnamed: 0,con,rw,pe_env,pe_rw,pe_con,pe_total,total_federal_funds,total_state_funds,total_local_funds,is_local,is_state,is_federal,projectid,pa_ed_begin,pa_ed_end,ps_e_begin,begin_row,end_row,con_start_date,con_end_date,begin_closeout,end_closeout,construction_completion_date,ready_to_list_date,projcomp_date,project_number
10695,3500900000.0,266000000.0,0.0,0.0,0.0,173100000.0,1240000.0,0.0,3938760000.0,Yes,No,Yes,,,,,,,,,,,,,,64f8c153a0


#### CTIPS Senate

In [60]:
ctips_senate = add_project_number(ctips_project, ctips_senate, "ctips_id")

In [305]:
ctips_senate.sample()

Unnamed: 0,ssenate,project_number
587,2,688d73e171


### State Rail Plan

In [61]:
srp_df = har_utils.load_state_rail_plan()

In [62]:
srp_df = generate_project_id(srp_df, "project_name")

Checking that there are the same number of unique project numbers & rows
True


## Stack

### Assembly Districts
* ctips_assembly
* sb1_assembly

In [63]:
def harmonize_assembly(df: pd.DataFrame, assembly_column: str) -> pd.DataFrame:
    df = df.rename(columns={assembly_column: "assembly_districts"})

    return df

In [64]:
ctips_assembly = harmonize_assembly(ctips_assembly, "assembly")

In [73]:
type(ctips_assembly)

pandas.core.frame.DataFrame

In [65]:
sb1_assembly = harmonize_assembly(sb1_assembly, "assemblydistrict")

In [72]:
type(sb1_assembly)

pandas.core.frame.DataFrame

#### Find the assembly district if we have the coordinates?
* Reading file from ArcGIS directly isn't working
* https://gis.data.ca.gov/datasets/waterboards::california-senate-and-assembly-district-boundaries/about?layer=3

In [66]:
assembly_file = "https://gispublic.waterboards.ca.gov/portalserver/rest/services/Hosted/California_Senate_and_Assembly_District_Boundaries/FeatureServer/3/query?outFields=*&where=1%3D1&f=geojson"

In [67]:
# assembly_gdf = gpd.read_file(assembly_file)

In [68]:
assembly_gcs_file = "gs://calitp-analytics-data/data-analyses/project_list/geometry/California_Senate_and_Assembly_District_Boundaries.geojson"

In [100]:
with get_fs().open(assembly_gcs_file) as f:
    assembly_gdf = to_snakecase(gpd.read_file(f))
    assembly_gdf = assembly_gdf[["district", "geometry"]]
    assembly_gdf = assembly_gdf.rename(columns={"district": "assembly_district"})

In [110]:
assembly_gdf.assembly_district.nunique()

80

In [111]:
assembly_gdf.assembly_district.unique()

array(['1', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19',
       '2', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29',
       '3', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39',
       '4', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49',
       '5', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59',
       '6', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69',
       '7', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79',
       '8', '80', '9'], dtype=object)

In [77]:
lrtp_lost_geo.shape

(1357, 2)

In [86]:
type(lrtp_lost_geo)

geopandas.geodataframe.GeoDataFrame

In [95]:
lrtp_lost_geo = lrtp_lost_geo.set_crs(WGS84)

In [97]:
def overlay_different_geotypes(left_gdf: gpd.GeoDataFrame, right_gdf: gpd.GeoDataFrame):

    full_gdf = pd.DataFrame()

    # Make sure the crs is the same
    left_gdf = left_gdf.to_crs(right_gdf.crs)

    # Have to compare each geom type to district separately
    geo_types_list = list(left_gdf.geometry.geom_type.unique())

    for i in geo_types_list:
        print(f"Overlaying {i} type")
        filtered_gdf = left_gdf.loc[left_gdf.geometry.geom_type == i]
        # Overlay
        gdf = filtered_gdf.overlay(right_gdf, how="intersection")
        full_gdf = pd.concat([full_gdf, gdf], axis=0)
    return full_gdf

In [101]:
lrtp_assembly_district = overlay_different_geotypes(lrtp_lost_geo, assembly_gdf)

Overlaying Point type
Overlaying MultiLineString type
Overlaying LineString type
Overlaying Polygon type
Overlaying MultiPolygon type


In [108]:
lrtp_assembly_district.assembly_district.unique()

array(['3', '18', '24', '11', '15', '16', '12', '4', '17', '14', '21',
       '23', '26', '2', '25', '19', '29', '20', '74', '76', '77', '78',
       '79', '75', '80', '28', '13', '9', '6', '30', '37', '71', '35',
       '1', '5', '7'], dtype=object)

In [103]:
lrtp_assembly_district = lrtp_assembly_district.drop(columns=["geometry"])

In [104]:
all_assembly_df = pd.concat([ctips_assembly, sb1_assembly, lrtp_assembly_district])

In [105]:
all_assembly_df.assembly_districts.nunique()

90

#### What to do about rows that have cojoined assembly districts?

In [106]:
all_assembly_df[["assembly_districts"]].drop_duplicates()

Unnamed: 0,assembly_districts
0,1
1,5
2,6
9,7
104,3
105,4
108,8
109,9
110,11
112,2


### Awards
* sb1_awards
* lp2000_award
* ctips_award

In [112]:
def harmonize_awards(
    df: pd.DataFrame, year: str, grant_program_col: str
) -> pd.DataFrame:
    rename_dict = {year: "award_year", grant_program_col: "grant_program"}
    df = df.rename(columns=rename_dict)

    create_cols = ["award_year", "grant_program"]
    for column in create_cols:
        if column not in df:
            df[column] = "Not Available"
    return df

In [113]:
sb1_awards = harmonize_awards(sb1_awards, "", "projprogram")

In [114]:
lp2000_award = harmonize_awards(
    lp2000_award, "state_fiscal_awarded_year", "grant_program"
)

In [115]:
ctips_award = harmonize_awards(ctips_award, "line_year", "program")

In [138]:
all_awards_df = pd.concat([sb1_awards, lp2000_award, ctips_award])

In [117]:
len(all_awards_df)

58937

#### Fix duplicates of award programs

In [139]:
def fix_duplicates(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """
    An attempt to remove string duplicates in a column
    """
    # Delete out any stuff after parantheses
    df[column] = df[column].str.split("(", n=1).str.get(0)

    # Replace consecutive whitespaces with a single space
    df[column] = df[column].replace(r"\s+", " ", regex=True)
    # Remove other misc characters
    chars_to_remove = ["-", "/", ")", "(", ".", "–"]
    for char in chars_to_remove:
        df[column] = df[column].str.replace(char, "")

    df[column] = df[column].str.strip()
    return df

In [140]:
all_awards_df = fix_duplicates(all_awards_df, "grant_program")

  df[column]  = df[column].str.replace(char, "")


##### I don't know these acronyms
Gf Iip
41006	Gf Iip Prior
7002	Gf Rip
5004	Gf Rip Prior
Local TEA

In [151]:
awards_replace = {
    "201718 Local Streets And Roads": "Local Streets And Roads",
    "201819 Local Streets And Roads": "Local Streets And Roads",
    "ATP": "Active Transportation Program",
    "CMAQ": "Congestion Mitigation and Air Quality",
    "CT Minor Pgm": "Caltrans Minor Program",
    "Federal Disc": "Federal Discretionary Funds",
    "FFTA ta Funds": "Federal Transit Administration Funds",
    "IIP": "Interregional Improvement Program (IIP)",
    "IIP Prior": "Interregional Improvement Program (IIP) Prior",
    "Local ER": "Local Emergency Relief",
    "Local HBRR": "Local The Highway Bridge Rehabilitation and Replacement Program",
    "RIP": "Regional Improvement Program",
    "RIP Prior": "Regional Improvement Program Prior",
    "RSTP": "Regional Surface Transportation Program",
    "Solutions To Congested Corridors Program": "Solutions For Congested Corridors Program",
    "State SB1": "State Senate Bill 1",
    "State SB1 ATP": "State Senate Bill 1 Active Transportation Program",
    "State SB1 LPP": "State Senate Bill 1  Local Partnership Program",
    "State SB1 SCCP": "State Senate Bill 1 Solutions For Congested Corridors Program",
    "State SB1 TCEP": "State Senate Bill 1 Trade Corridor Enhancement Program",
    "TCRP": "Traffic Congestion Relief Program",
}

In [152]:
all_awards_df.grant_program = all_awards_df.grant_program.replace(awards_replace)

In [153]:
# Delete none
all_awards_df = all_awards_df.loc[all_awards_df.grant_program != "None"].reset_index(
    drop=True
)

In [155]:
# all_awards_df[["grant_program"]].sort_values(by=["grant_program"]).drop_duplicates()

In [156]:
len(all_awards_df)

58922

In [160]:
all_awards_df.project_number.value_counts().head()

abc4a1da61    14
5a1f19bf2f    13
7d45c84bda    13
c8628f4a38    13
9a11f53af8    11
Name: project_number, dtype: int64

In [161]:
all_awards_df.loc[all_awards_df.project_number == "abc4a1da61"]

Unnamed: 0,project_number,grant_program,award_year,progdesc
39324,abc4a1da61,State Bond,2011,
39325,abc4a1da61,SHOPP Roadway Preservation,2012,
39326,abc4a1da61,SHOPP Roadside Preservation,2007,
39327,abc4a1da61,SHOPP Roadside,2007,
39328,abc4a1da61,SHOPP Mandates,2009,
39329,abc4a1da61,SHOPP Collision Reduction,2011,
39330,abc4a1da61,SHOPP Bridge Preservation,2012,
39331,abc4a1da61,Regional Surface Transportation Program (RSTP),2014,
39332,abc4a1da61,Regional Improvement Program,2009,
39333,abc4a1da61,Other Fed,2010,


### Cities
* lrtp_lost_city
* sb1_city
* Need to find a way to retain Daly City

In [189]:
def harmonize_cities(df: pd.DataFrame, city_column: str) -> pd.DataFrame:
    df = df.rename(columns={city_column: "city"})

    df.city = (
        df.city.str.title()
        .str.replace("City Of", "")
        .str.replace("Submitted By", "")
        .str.replace("City", "")
        .str.strip()
        .str.replace("Daly", "Daly City")
        .str.replace("Redwood", "Redwood City")
        .str.replace("Yuba", "Yuba City")
        .str.replace("Amador", "Amador City")
    )

    # Delete out empty cities
    df = df.loc[df.city != ""]
    return df

In [190]:
sb1_city = harmonize_cities(sb1_city, "cityname")

In [191]:
lrtp_lost_city = harmonize_cities(lrtp_lost_city, "city")

#### Find cities if geometry is available
* Where to find a shapefile with all the cities in California?
    * Can't find anything on our state's portal aside from incorporated cities
    * https://data.ca.gov/dataset/ca-geographic-boundaries
* Giving back errors

In [202]:
city_file = "gs://calitp-analytics-data/data-analyses/project_list/geometry/ca_places_boundaries.shp"

In [204]:
# with get_fs().open(city_file) as f:
#    city_gdf = to_snakecase(gpd.read_file(f))

####  What to do with rows that have tons of values that are separated only by spaces?
* Attempt to split out the name doesn't work...Cities like El Cerrito or San Pablo are always tricky

In [192]:
all_city_df = pd.concat([sb1_city, lrtp_lost_city])

In [193]:
all_city_df.city.nunique()

789

In [196]:
all_city_df.city.value_counts().head()

Eastvale      289
Brawley       191
Calipatria    147
Los Banos     127
Imperial      119
Name: city, dtype: int64

In [211]:
all_city_df.drop_duplicates(subset=["city"]).sort_values(by=["city"])

Unnamed: 0,project_number,city
2825,478f5d79b3,Adelanto
8512,cfd6d00fb2,Adelanto Apple Valley Barstow Hesperia Victorville
54,6939c74e6b,Adelanto Victorville
286,120f71abdd,Agoura Hills
8626,0560de6ce4,Agoura Hills Alhambra Arcadia Artesia Azusa Baldwin Park Bell Bell Gardens Bellflower Beverly Hills Bradbury Burbank Calabasas Carson Cerritos Commerce Compton Cudahy Culver Downey Duarte El Monte El Segundo Gardena Glendale
4091,a50099b468,Agoura Hills Calabasas Hidden Hills Los Angeles
804,05ea97b581,Alameda
8452,99b2d65b25,Alameda Albany Berkeley El Cerrito Emeryville Fremont Hayward Newark Oakland Piedmont Pinole Richmond San Leandro San Pablo Union
2081,e43aaf595a,Alameda Albany Berkeley Emeryville Fremont Hayward Newark Oakland Piedmont San Leandro Union El Cerrito Pinole Richmond San Pablo
2732,c4b0d4d82e,Albany


#### 2021 list of cities from state controller's office
* https://bythenumbers.sco.ca.gov/

In [216]:
state_controller = to_snakecase(
    pd.read_excel(
        "gs://calitp-analytics-data/data-analyses/project_list/geometry/CIX_EachDataSet_2021-22_20231127_V1.xlsx"
    )
)[["city"]]

In [217]:
state_controller.sample()

Unnamed: 0,city
339,Redondo Beach


In [206]:
def split_and_associate(row):
    cities = row["city"].split()
    return pd.DataFrame(
        {"project_number": [row["project_number"]] * len(cities), "city": cities}
    )

In [218]:
expanded_data = (
    pd.concat(
        all_city_df.apply(split_and_associate, axis=1).tolist(), ignore_index=True
    )
    .drop_duplicates()
    .reset_index(drop=True)
)

In [221]:
expanded_data.sample()

Unnamed: 0,project_number,city
6591,ef16d11d0a,Mission


In [220]:
# expanded_data.loc[expanded_data.project_number == "0560de6ce4"]

In [223]:
all_city_df2 = pd.merge(
    expanded_data, state_controller, how="left", on="city", indicator=True
)

In [224]:
all_city_df2.loc[all_city_df2._merge == "left_only"].sample(3)

Unnamed: 0,project_number,city,_merge
1091,86def861c1,El,left_only
5797,a83487fc3d,Marino,left_only
2484,3dcd038107,Hill,left_only


### Counties
* lrtp_lost_county
* sb1_counties
* lp2000_county
* ctips_county
* Clean the counties too, find a way to separate out counties separated out by spaces

In [225]:
def harmonize_counties(df: pd.DataFrame, county_column: str) -> pd.DataFrame:
    rename_columns = {
        county_column: "county",
    }

    df = df.rename(columns=rename_columns)

    df.county = (
        df.county.str.title().str.replace("County", "").str.replace("Submitted By", "")
    )

    return df

In [226]:
ctips_county = harmonize_counties(ctips_county, "county")

In [227]:
lp2000_county = harmonize_counties(lp2000_county, "county_name")

In [228]:
sb1_counties = harmonize_counties(sb1_county, "countyname")

In [229]:
lrtp_lost_county = harmonize_counties(lrtp_lost_county, "county")

In [230]:
all_counties_df = pd.concat(
    [ctips_county, lp2000_county, sb1_counties, lrtp_lost_county]
)

In [232]:
all_counties_df = fix_duplicates(all_counties_df, "county")

  df[column]  = df[column].str.replace(char, "")


In [233]:
all_counties_df.drop_duplicates(subset=["county"]).sort_values(by=["county"])

Unnamed: 0,county,project_number
816,Alameda,b1624e05e7
8447,Alameda And Santa Clara Contra Costa Placer Sacramento San Francisco Solano Yolo,545e490b77
1712,Alameda Contra Costa,136b2eb6e9
3000,Alameda Contra Costa Marin Napa San Francisco San Mateo Santa Clara Solano Sonoma,1666ec3785
8317,Alameda Contra Costa Marin Napa San Francisco San Mateo Solano,258119d7fd
3173,Alameda Contra Costa Marin San Francisco San Mateo Santa Clara Solano,dcb0203d8e
8333,Alameda Contra Costa Marin San Mateo,76b69b134b
8537,Alameda Contra Costa Placer Sacramento San Francisco Santa Clara Solano Yolo,40b0868e6d
3716,Alameda Contra Costa San Francisco San Mateo Santa Clara,7f84afe36d
4353,Alameda Contra Costa Santa Clara,b63065d9e5


### Congressional Districts
* None of the datasets have congressional district info

### CT Districts
* Missing this column in SB1, which is surprising.
* lp2000_district
* ctips_district
#### To do:
* Manually fill it in for LRTP/LOST (especially those with coordinates) in the original datasource


In [244]:
def harmonize_ct_districts(df: pd.DataFrame, district_column: str) -> pd.DataFrame:
    df = df.rename(columns={district_column: "ct_district"})

    df.ct_districts = df.ct_districts.apply(pd.to_numeric, errors="coerce").fillna(0)
    return df

In [245]:
lp2000_district = harmonize_ct_districts(lp2000_district, "district_code")

In [246]:
ctips_district = harmonize_ct_districts(ctips_district, "districtid")

#### Find SB1 districts using `overlay`

In [252]:
# Load districts
districts_url = "https://caltrans-gis.dot.ca.gov/arcgis/rest/services/CHboundary/District_Tiger_Lines/FeatureServer/0/query?outFields=*&where=1%3D1&f=geojson"
districts = to_snakecase(gpd.read_file(districts_url))[["geometry", "district"]]

In [251]:
districts.columns

Index(['objectid', 'district', 'region', 'shape__area', 'shape__length',
       'geometry'],
      dtype='object')

In [272]:
sb1_district = overlay_different_geotypes(sb1_geo, districts)

Overlaying LineString type
Overlaying MultiLineString type
Overlaying MultiPoint type
Overlaying Point type


In [273]:
sb1_district.columns

Index(['project_number', 'district', 'geometry'], dtype='object')

In [274]:
sb1_district = harmonize_ct_districts(sb1_district, "district")

In [265]:
# sb1_district.loc[sb1_district.project_number == 'cc4f326501'][['district']]

In [266]:
# sb1_district.loc[sb1_district.project_number == 'cc4f326501'].explore('district')

#### Stack

In [275]:
all_ct_dist_df = pd.concat([lp2000_district, ctips_district, sb1_district])

In [276]:
all_ct_dist_df.ct_districts.nunique()

17

#### There are districts that are beyond 12?

In [277]:
all_ct_dist_df.drop_duplicates(subset=["ct_districts"])

Unnamed: 0,ct_districts,project_number,geometry
0,1.0,ef0be36b7c,
1,10.0,5fbd41028d,
2,12.0,7cc011c5d1,
4,2.0,47b8cb8e36,
5,7.0,a09fd77df0,
16,6.0,6b8fdb400b,
19,4.0,3190949a69,
22,8.0,f950ad60e9,
23,11.0,451cbd6a6e,
28,5.0,71f9bd05d9,


### Geometry
* lrtp_lost_gdf
* sb1_geo

In [278]:
def harmonize_geo(gdf: gpd.GeoDataFrame, geography_col: str) -> gpd.GeoDataFrame:
    gdf = gdf.rename(columns={geography_col: "geometry"})

    gdf = gdf.set_geometry("geometry")

    gdf = gdf.set_crs(WGS84)
    return gdf

In [279]:
lrtp_lost_gdf = harmonize_geo(lrtp_lost_gdf, "geometry")

In [280]:
sb1_geo = harmonize_geo(sb1_geo, "geometry")

In [281]:
all_geo_df = pd.concat([lrtp_lost_gdf, sb1_geo])

In [282]:
type(all_geo_df)

geopandas.geodataframe.GeoDataFrame

In [283]:
len(all_geo_df)

2940

### Phase Funding
* ctips_phase
* lp2000_phase
* A function really needed because only CTIPS has all this information.

In [286]:
ctips_phase.columns

Index(['con', 'rw', 'pe_env', 'pe_rw', 'pe_con', 'pe_total',
       'total_federal_funds', 'total_state_funds', 'total_local_funds',
       'is_local', 'is_state', 'is_federal', 'projectid', 'pa_ed_begin',
       'pa_ed_end', 'ps_e_begin', 'begin_row', 'end_row', 'con_start_date',
       'con_end_date', 'begin_closeout', 'end_closeout',
       'construction_completion_date', 'ready_to_list_date', 'projcomp_date',
       'project_number'],
      dtype='object')

In [287]:
lp2000_phase.columns

Index(['single_phase_cost', 'single_phase_expenditure_amt',
       'total_state_funds', 'total_federal_funds', 'is_state', 'is_federal',
       'project_number'],
      dtype='object')

In [288]:
phase_funding_df = pd.concat([ctips_phase, lp2000_phase])

In [290]:
phase_funding_df = phase_funding_df.drop(columns=["projectid"])

In [291]:
phase_funding_df.sample()

Unnamed: 0,con,rw,pe_env,pe_rw,pe_con,pe_total,total_federal_funds,total_state_funds,total_local_funds,is_local,is_state,is_federal,pa_ed_begin,pa_ed_end,ps_e_begin,begin_row,end_row,con_start_date,con_end_date,begin_closeout,end_closeout,construction_completion_date,ready_to_list_date,projcomp_date,project_number,single_phase_cost,single_phase_expenditure_amt
13979,90000000.0,0.0,0.0,0.0,0.0,0.0,0.0,65000000.0,25000000.0,Yes,Yes,No,,,,,,,,,,,,,6c009e1693,,


### Project (aka the primary/base table)
* Agency_id: not available yet.

#### Find maps for
* DAC boundaries
* SHS

#### Need to add metrics/categories/enough info after stacking
#### Brainstorm best way to inject last_accessed/last_updated

In [372]:
def harmonizing(
    df: pd.DataFrame,
    agency_id: str,
    beg_pm: str,
    current_phase: str,  # figure out how to include only certain options,
    data_source: str,
    end_pm: str,
    funded_amount: str,
    funding_status: str,
    general_phase: str,
    grant_recipient: str,
    implementing_agency: str,
    isonshys_y_n: str,
    last_accessed: str,
    last_updated: str,
    located_in_dac: str,
    percentage_of_project_in_DAC: str,
    ppno: str,
    programmed_y_n: str,
    project_agency_sponsor: str,
    project_description: str,
    project_title: str,
    purpose_need: str,
    route: str,
    target_opening_year: str,
    total_cost: str,
    urban_or_rural: str,
    notes_cols: list,
):
    """
    Take a dataset and change the column names/types to
    default names and formats.
    """
    rename_columns = {
        agency_id: "agency_id",
        beg_pm: "beg_pm",
        current_phase: "current_phase",
        data_source: "data_source",
        end_pm: "end_pm",
        funded_amount: "funded_amount",
        funding_status: "funding_status",
        grant_recipient: "grant_recipient",
        implementing_agency: "implementing_agency",
        isonshys_y_n: "isonshys_y_n",
        last_accessed: "last_accessed",
        last_updated: "last_updated",
        located_in_dac: "located_in_dac",
        percentage_of_project_in_DAC: "percentage_of_project_in_DAC",
        ppno: "ppno",
        programmed_y_n: "programmed_y_n",
        project_agency_sponsor: "project_agency_sponsor",
        project_description: "project_description",
        project_title: "project_title",
        purpose_need: "purpose_need",
        route: "route",
        target_opening_year: "target_opening_year",
        total_cost: "total_cost",
        urban_or_rural: "urban_or_rural",
    }
    # Rename columns
    df = df.rename(columns=rename_columns)

    # Add data source
    df["data_source"] = data_source

    # Create columns even if they don't exist, just to harmonize
    # before concatting.
    all_cols = [
        "agency_id",
        "beg_pm",
        "current_phase",
        "data_source",
        "end_pm",
        "funded_amount",
        "funding_status",
        "grant_recipient",
        "implementing_agency",
        "isonshys_y_n",
        "last_accessed",
        "last_updated",
        "located_in_dac",
        "percentage_of_project_in_DAC",
        "ppno",
        "programmed_y_n",
        "project_agency_sponsor",
        "project_description",
        "project_title",
        "purpose_need",
        "route",
        "target_opening_year",
        "total_cost",
        "urban_or_rural",
    ]

    for column in all_cols:
        if column not in df:
            df[column] = "None"

    # Clean up monetary columns to be interger
    cost_columns = df.columns[df.columns.str.contains("(cost|funded)")].tolist()
    for i in cost_columns:
        df[i] = df[i].apply(pd.to_numeric, errors="coerce").fillna(0)

    # Add column for unfunded needs
    df["unfunded_amount"] = df.total_cost - df.funded_amount
    df["general_phase"] = general_phase

    # Create notes - aka other columns that don't belong in the schema
    # But there is still information of value
    df = create_notes(df, notes_cols, "notes")

    # Clean up string columns
    string_cols = df.select_dtypes(include=["object"]).columns.to_list()
    for i in string_cols:
        df[i] = df[i].str.replace("_", " ").str.strip().str.title()

    # Only keep certain columns
    all_cols = ["project_number", "unfunded_amount", "general_phase", "notes"] + all_cols
    df = df[all_cols]
    return df

In [351]:
lrtp_lost_df.sample()

Unnamed: 0,project_title,lead_agency,project_year,project_description,total_project_cost,data_source,notes,project_number
7192,East Commerce Way A,Sacog Rtp/Scs Published In 2019,2020-2025,"In Sacramento, East Commerce Way From Club Center Drive To Del Paso Rd,\nExtend As A 6-Lane Facility.",8142225.0,Sacog Rtp/Scs Published In 2019 Lrtp,"Budget Category: B- Road & Highway\nCapacity, Year Of Expenditure Cost For Planned Projects: 8554425.0, Status Planned, Programmed Or Project Development Only : Planned",bd634fd740


In [373]:
lrtp_lost_base = harmonizing(
    df=lrtp_lost_df,
    agency_id="",
    beg_pm="",
    current_phase="",
    data_source="data_source",
    end_pm="",
    funded_amount="",
    funding_status="",
    general_phase="",
    grant_recipient="lead_agency",
    implementing_agency="",
    isonshys_y_n="",
    last_accessed="",
    last_updated="",
    located_in_dac="",
    percentage_of_project_in_DAC="",
    ppno="",
    programmed_y_n="",
    project_agency_sponsor="",
    project_description="project_description",
    project_title="project_title",
    purpose_need="",
    route="",
    target_opening_year="",
    total_cost="total_project_cost",
    urban_or_rural="",
    notes_cols=["project_year", "notes"],
)

  cost_columns = df.columns[df.columns.str.contains("(cost|funded)")].tolist()


In [374]:
lrtp_lost_base.sample()

Unnamed: 0,project_number,unfunded_amount,general_phase,notes,agency_id,beg_pm,current_phase,data_source,end_pm,funded_amount,funding_status,grant_recipient,implementing_agency,isonshys_y_n,last_accessed,last_updated,located_in_dac,percentage_of_project_in_DAC,ppno,programmed_y_n,project_agency_sponsor,project_description,project_title,purpose_need,route,target_opening_year,total_cost,urban_or_rural
13520,Abddf976A0,3000000.0,,"Project Year: 2023, Notes: Jurisdiction: Hughson, Location: Euclid Ave, Project Limits: Hatch Rd To Whitmore Ave, Funding Source: Dev. Impact\nFees, Sb 1, System Preserv : Nan, Capacity Enhance : X, Safety: X, Oper : Nan, Complete Streets: X, Active\nTransporta Tion: Nan, Transit: Nan, Other: Nan",,,,Data Source,,0.0,,Stancog Rtp/Scs Published In 2022,,,,,,,,,,Install Complete Street Improvements,,,,,3000000.0,


### Senate District
* CTIPS
* SB1

In [331]:
def harmonize_senate_districts(df: pd.DataFrame, district_column: str) -> pd.DataFrame:

    df.senate_districts = df.senate_districts.apply(
        pd.to_numeric, errors="coerce"
    ).fillna(0)
    return df

In [332]:
ctips_senate.sample()

Unnamed: 0,senate_districts,project_number
671,2,e7856b5a16


In [333]:
ctips_senate = harmonize_senate_districts(ctips_senate, "ssenate")

In [334]:
sb1_senate = harmonize_senate_districts(sb1_senate, "senatedistrict")

#### Boundaries
* https://gis.data.ca.gov/datasets/CDEGIS::senate-districts/explore

In [335]:
senate_district_url = "https://services3.arcgis.com/fdvHcZVgB2QSRNkL/arcgis/rest/services/Legislative/FeatureServer/1/query?outFields=*&where=1%3D1&f=geojson"

In [336]:
senate_gdf = to_snakecase(gpd.read_file(senate_district_url))[["geoid", "geometry"]]

In [337]:
senate_gdf = senate_gdf.rename(columns={"geoid": "senate_districts"})

In [344]:
len(senate_gdf)

40

In [338]:
lrtp_senate = overlay_different_geotypes(lrtp_lost_geo, senate_gdf)

Overlaying Point type
Overlaying MultiLineString type
Overlaying LineString type
Overlaying Polygon type
Overlaying MultiPolygon type


In [339]:
lrtp_senate = lrtp_senate.drop(columns=["geometry"])

In [340]:
lrtp_senate = harmonize_senate_districts(lrtp_senate, "senate_districts")

In [341]:
senate_df = pd.concat([lrtp_senate, ctips_senate, sb1_senate])

In [346]:
len(senate_df)

17399

In [347]:
senate_df.project_number.nunique()

13717

In [348]:
senate_df.project_number.value_counts().head()

e09624d815    40
02c6083564    24
b137be7ff4    24
d1c26d27f3    24
86eb90e1dd    24
Name: project_number, dtype: int64

#### There are districts that are beyond 40?

In [345]:
senate_df.senate_districts.value_counts()

2                  1292
1                  1095
17                  940
12                  776
3                   773
4                   749
8                   696
31                  657
14                  644
40                  563
5                   445
16                  433
13                  418
38                  396
19                  387
9                   385
7                   370
21                  367
10                  360
18                  358
39                  350
23                  342
32                  323
28                  300
25                  299
15                  293
36                  274
37                  272
35                  270
22                  262
29                  252
6                   252
33                  246
27                  242
20                  239
11                  222
24                  222
34                  221
26                  212
30                  176
1416                  4
46              

In [343]:
senate_df.senate_districts.unique()

array([              1,               7,              10,               3,
                     9,               2,              11,              13,
                    15,               5,              38,              40,
                    39,              18,              32,               8,
                    17,              21,              36,              16,
                     4,               6,              12,              14,
                    19,              20,              22,              23,
                    24,              25,              26,              27,
                    28,              29,              30,              31,
                    33,              34,              35,              37,
                  1113,             358,              51,              45,
                    46,              43,              44,              41,
       237891011131517,            1416,              49,           81214,
                  5812,  