## Compile Projects
To-do
* Figure out how to version things b/c projects will get updated and we want to track any changes.
* This only needs to be done with data from lp2000 and ctips.
* Need to track changes across all the different dataframes
* Use merges to figure it out?

In [72]:
import hashlib
from datetime import datetime

import _harmonization_utils as har_utils
import _lrtp_utils as lrtp_utils
import _sb1_utils as sb1_utils
import _specific_list_utils
import _state_rail_plan_utils as srp_utils
import geopandas as gpd
import pandas as pd
from calitp_data_analysis.sql import to_snakecase

In [73]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [74]:
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/project_list/"

## General Functions

In [75]:
def create_notes(df, note_cols: list, new_col_name: str):
    """
    Concat multiple columns into one.
    """
    prefix = "_"
    for column in note_cols:
        df[f"{prefix}{column}"] = df[column].astype(str)
    note_cols = [prefix + sub for sub in note_cols]

    # https://stackoverflow.com/questions/65532480/how-to-combine-column-names-and-values
    def combine_notes(x):
        return ", ".join([col + ": " + x[col] for col in note_cols])

    df[new_col_name] = df.apply(combine_notes, axis=1)
    df[new_col_name] = df[new_col_name].str.replace("_", " ")

    return df

In [76]:
def separate_out_df(df: pd.DataFrame, columns_to_keep: list) -> pd.DataFrame:
    """
    Subset the DataFrame based on the specified columns to keep,
    drop any rows in which the values are NaN or "None."

    Parameters:
    df (pd.DataFrame): Input DataFrame.
    columns_to_keep (list): List of column names to keep in the DataFrame.

    Returns:
    pd.DataFrame: DataFrame with specified columns and cleaned rows.
    """
    # Subset DataFrame based on columns_to_keep
    df2 = df[columns_to_keep]

    # Fill in missing values with 'none' if possible
    try:
        df2 = df2.fillna("none")
    except:
        df2

    # Remove 'project_number' from columns_to_keep
    columns_to_keep.remove("project_number")

    # Drop rows that are NaN or "None" based on how many columns are being included in the
    # new dataframe
    if len(columns_to_keep) == 1:
        # Drop rows with any NaN values
        df2 = df2.dropna(how="any")
        # Drop rows where any value in the row is 'none'
        df2 = df2[
            df2.applymap(lambda x: x.lower() if isinstance(x, str) else x) != "none"
        ].dropna()
    else:
        # Keep any row that has a non-null value in at least one of the specified columns
        df2 = df2.dropna(how="all", subset=columns_to_keep)

    print(f"Number of rows left {len(df2)}")
    return df2

In [77]:
def explode_dataframe(df: pd.DataFrame, column_to_explode: str) -> pd.DataFrame:
    """
    Explode a DataFrame based on a specified column.

    This function converts a column of lists or strings separated by commas into multiple rows,
    with each element in the list or string becoming a separate row entry.

    Parameters:
    df (pd.DataFrame): Input DataFrame.
    column_to_explode (str): Name of the column to explode.

    Returns:
    pd.DataFrame: Exploded DataFrame.
    """
    # Apply a function to the specified column to split the strings into lists of elements
    df["Column2"] = df[column_to_explode].apply(
        lambda x: [int(i) if i.isdigit() else i for i in x.replace(",", "").split()]
    )

    # Drop the original column that was exploded
    df = df.drop(columns=[column_to_explode])

    # Explode the DataFrame based on the newly created 'Column2'
    df2 = df.explode("Column2")

    # Rename the exploded column back to the original column name
    df2 = df2.rename(columns={"Column2": column_to_explode})

    df2 = df2.sort_values(by=["project_number"]).reset_index(drop=True)
    return df2

In [78]:
def generate_hash(string) -> str:

    hash_object = hashlib.sha1(string.encode())
    hash_hex = hash_object.hexdigest()
    return hash_hex[:10]

In [79]:
def generate_project_id(df: pd.DataFrame, project_name_col: str) -> pd.DataFrame:
    df = df.reset_index(drop=True)
    df["index_number"] = df.index
    df["combo"] = df.index_number.astype("str") + df[project_name_col].astype('str').fillna("none")
    df["project_number"] = df["combo"].apply(generate_hash)
    df = df.drop(columns=["index_number", "combo"])

    print("Checking that there are the same number of unique project numbers & rows")
    print(f"{len(df) == df.project_number.nunique()}")
    return df

## Separate out data based on schema

### LRTP/LOST

In [80]:
lrtp_lost_df, lrtp_lost_gdf = lrtp_utils.all_mpo(True)

  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()


96 rows are headers


  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  sandag.cost2020m.str.replace("$", "")
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()


65 rows are headers


  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()


360 rows are headers


  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()


68 rows are headers


  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()


In [81]:
lrtp_lost_df = generate_project_id(lrtp_lost_df, "project_title")

Checking that there are the same number of unique project numbers & rows
True


In [82]:
lrtp_lost_county = separate_out_df(lrtp_lost_df, ["project_number", "county"])

Number of rows left 4012


In [83]:
lrtp_lost_city = separate_out_df(lrtp_lost_df, ["project_number", "city"])

Number of rows left 745


In [84]:
lrtp_lost_agency = separate_out_df(lrtp_lost_df, ["project_number", "lead_agency"])

Number of rows left 16276


In [85]:
lrtp_lost_geo = separate_out_df(lrtp_lost_df, ["project_number", "geometry"])

Number of rows left 1357


In [86]:
lrtp_to_drop = ["county", "city", "lead_agency", "geometry"]

In [87]:
lrtp_lost_df = lrtp_lost_df.drop(columns=lrtp_to_drop)

In [88]:
lrtp_lost_df.sample()

Unnamed: 0,project_title,project_year,project_description,total_project_cost,data_source,notes,project_number
5297,Various,,Rehab/Maint/Operations,19555000.0,Madera 2046 Rtp/Scs Lrtp,"Category: Maintenance, Location: Nan",16a4851029


### SB1

In [89]:
sb1_df = sb1_utils.load_sb1()

  gdf[i]
  gdf[i]


0 rows are mising geometry
7917 rows contain invalid geography


In [90]:
sb1_df.sample()

Unnamed: 0,objectid,projectid,projname,projcatcode,projcategory,projprogcode,projprogram,multiprogfunded,projstatus,description,cost,assemblydistrict,senatedistrict,assemblycode,senatecode,countyname,cityname,countycode,citycode,appagencyname,impagencyname,popuptitle,popup,geometry,projcount,totalcosts,projlisturl,projlist2url,mponame,mpocode,assetcategory,bphotojpg,bphotohtml,aphotojpg,aphotohtml,routes,constyear,costfull,projagency,data_source
2767,167060,LsrFy17185163Pp005,Franklin Avenue Rehabilitation,Local,Local And Regional,Lsr1718,201718 Local Streets And Roads,,PreConstruction,3 Mill And Ac Overlay Ada Ramp Improvements Striping Improvements,,3,4,3,4,Sutter,Yuba City Submitted By City,Sut,Yc,,,,,,,,,,,,,,,,,,2021,,Yuba City City,SB1 Server


In [91]:
sb1_df = generate_project_id(sb1_df, "projectid")

Checking that there are the same number of unique project numbers & rows
True


#### Check that assemblydistrict and assemblycode are the same values
* Yes they are, just padded with zeroes or spaced out differently.

In [92]:
sb1_df["assembly_same"] = sb1_df.assemblycode == sb1_df.assemblydistrict

In [93]:
sb1_df.assembly_same.value_counts()

True     7217
False    1969
Name: assembly_same, dtype: int64

In [94]:
sb1_df.loc[sb1_df.assembly_same == False][["assemblycode", "assemblydistrict"]].sample(
    10
)

Unnamed: 0,assemblycode,assemblydistrict
6747,3234,32 34
7001,1,1
3895,2,2
558,556568697274,55 65 68 69 72 74
6930,1,1
2586,111416,11 14 16
2502,4,4
4748,3,3
2213,434546505153,43 45 46 50 51 53
6133,709,07 09


In [95]:
sb1_df["senate_same"] = sb1_df.senatedistrict == sb1_df.senatecode

In [96]:
sb1_df.senate_same.value_counts()

True     6950
False    2236
Name: senate_same, dtype: int64

In [97]:
sb1_county = separate_out_df(sb1_df, ["project_number", "countyname"])

Number of rows left 9184


In [98]:
sb1_assembly = separate_out_df(sb1_df, ["project_number", "assemblydistrict"])

Number of rows left 9185


In [99]:
sb1_assembly = explode_dataframe(sb1_assembly, "assemblydistrict")

In [100]:
# sb1_assembly.loc[sb1_assembly.project_number == "001784c575"]

In [101]:
# sb1_df.loc[sb1_df.project_number == "001784c575"]

In [102]:
sb1_senate = separate_out_df(sb1_df, ["project_number", "senatedistrict"])

Number of rows left 9185


In [103]:
sb1_senate = explode_dataframe(sb1_senate, "senatedistrict")

In [104]:
sb1_city = separate_out_df(sb1_df, ["project_number", "cityname"])

Number of rows left 6696


In [105]:
sb1_geo = separate_out_df(sb1_df, ["project_number", "geometry"])

Number of rows left 1585


In [106]:
sb1_awards = separate_out_df(sb1_df, ["project_number", "projprogram"])

Number of rows left 9186


In [107]:
sb1_agencies = separate_out_df(
    sb1_df, ["project_number", "projagency", "appagencyname", "impagencyname"]
)

Number of rows left 9186


In [108]:
sb1_df = sb1_df.drop(
    columns=[
        "countyname",
        "assemblydistrict",
        "senatedistrict",
        "cityname",
        "geometry",
        "projprogram",
        "projagency",
        "appagencyname",
        "impagencyname",
        "assemblycode",
        "senatecode",
        "countycode",
        "citycode",
        "assembly_same",
    ]
)

In [109]:
sb1_df.sample()

Unnamed: 0,objectid,projectid,projname,projcatcode,projcategory,projprogcode,multiprogfunded,projstatus,description,cost,popuptitle,popup,projcount,totalcosts,projlisturl,projlist2url,mponame,mpocode,assetcategory,bphotojpg,bphotohtml,aphotojpg,aphotohtml,routes,constyear,costfull,data_source,project_number,senate_same
6126,166627,LsrFy17185123Pp007,BodegaFlorence Crosswalk Safety Improvements,Local,Local And Regional,Lsr1718,,PreConstruction,Relocate Crosswalk And Create A Raised Pedestrian Refuge Island At The Median,,,,,,,,,,,,,,,,2018,,SB1 Server,cb021880b7,True


### LP2000

In [110]:
def load_lp2000(file: str):

    df_project = to_snakecase(
        pd.read_excel(f"{GCS_FILE_PATH}LP2000_CTIPS/{file}", sheet_name="project")
    )

    df_county = to_snakecase(
        pd.read_excel(f"{GCS_FILE_PATH}LP2000_CTIPS/{file}", sheet_name="county")
    ).drop(columns=["project_label_name"])

    df_district = to_snakecase(
        pd.read_excel(f"{GCS_FILE_PATH}LP2000_CTIPS/{file}", sheet_name="district")
    ).drop(columns=["project_label_name"])

    df_award = to_snakecase(
        pd.read_excel(f"{GCS_FILE_PATH}LP2000_CTIPS/{file}", sheet_name="awards")
    )

    df_phase = to_snakecase(
        pd.read_excel(f"{GCS_FILE_PATH}LP2000_CTIPS/{file}", sheet_name="phase_funding")
    )

    return df_project, df_county, df_district, df_award, df_phase

In [111]:
(
    lp2000_project,
    lp2000_county,
    lp2000_district,
    lp2000_award,
    lp2000_phase,
) = load_lp2000("LP2000_projects.xlsx")

In [112]:
lp2000_project = generate_project_id(lp2000_project, "project_id")

Checking that there are the same number of unique project numbers & rows
True


In [113]:
def add_project_number(
    df_with_project_id: pd.DataFrame, right_project: pd.DataFrame, merge_col: str
) -> pd.DataFrame:
    m1 = pd.merge(df_with_project_id, right_project, on=merge_col, how="inner")
    m1 = m1.drop(columns = merge_col)
    return m1

In [114]:
lp2000_county = add_project_number(lp2000_project, lp2000_county, "project_id")

In [115]:
lp2000_district = add_project_number(lp2000_project, lp2000_district, "project_id")

In [116]:
lp2000_award = add_project_number(lp2000_project, lp2000_award, "project_id")

In [117]:
lp2000_phase = add_project_number(lp2000_project, lp2000_phase, "project_id")

### CTIPS

In [118]:
def load_ctips(file: str):

    df_project = to_snakecase(
        pd.read_excel(f"{GCS_FILE_PATH}LP2000_CTIPS/{file}", sheet_name="project")
    )

    df_agency = to_snakecase(
        pd.read_excel(f"{GCS_FILE_PATH}LP2000_CTIPS/{file}", sheet_name="agencies")
    )

    df_county = to_snakecase(
        pd.read_excel(f"{GCS_FILE_PATH}LP2000_CTIPS/{file}", sheet_name="county")
    )

    df_district = to_snakecase(
        pd.read_excel(f"{GCS_FILE_PATH}LP2000_CTIPS/{file}", sheet_name="district")
    )

    df_phase = to_snakecase(
        pd.read_excel(f"{GCS_FILE_PATH}LP2000_CTIPS/{file}", sheet_name="phase_funding")
    )

    df_award = to_snakecase(
        pd.read_excel(f"{GCS_FILE_PATH}LP2000_CTIPS/{file}", sheet_name="awards")
    )

    df_house = to_snakecase(
        pd.read_excel(f"{GCS_FILE_PATH}LP2000_CTIPS/{file}", sheet_name="us_house")
    )

    df_senate = to_snakecase(
        pd.read_excel(f"{GCS_FILE_PATH}LP2000_CTIPS/{file}", sheet_name="senate")
    )

    df_assembly = to_snakecase(
        pd.read_excel(f"{GCS_FILE_PATH}LP2000_CTIPS/{file}", sheet_name="assembly")
    )

    df_project["data_source"] = "CTIPS"
    return (
        df_project,
        df_agency,
        df_county,
        df_district,
        df_phase,
        df_award,
        df_house,
        df_senate,
        df_assembly,
    )

In [119]:
(
    ctips_project,
    ctips_agencies,
    ctips_county,
    ctips_district,
    ctips_phase,
    ctips_award,
    ctips_house,
    ctips_senate,
    ctips_assembly,
) = load_ctips("CTIPS_data.xlsx")

In [120]:
ctips_project = generate_project_id(ctips_project, "ctips_id")

Checking that there are the same number of unique project numbers & rows
True


In [121]:
ctips_project.sample()

Unnamed: 0,ctips_id,chg_offcl,chg_qual1,chg_qual2,districtid,document,docyear,ea_number,needpurpose,ppno,proj_desc,postmiles1,pm1b,pm2b,pm3b,pm1a,pm2a,pm3a,route1,route2,route3,title,data_source,project_number
11049,20600006818,2,4,0,4,FTIP,2022,,,,Solano County : Countywide : Develop A Countywide Connected Mobility Implementation Plan To Address How Solano Reacts To The Recommendations Of Blue Ribbon Task Force,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,,,Solano Connected Mobility Implementation Plan,CTIPS,d80317fcb2


In [122]:
ctips_project.shape

(27016, 24)

In [123]:
ctips_county = add_project_number(ctips_project, ctips_county, "ctips_id")

In [124]:
ctips_district = add_project_number(ctips_project, ctips_district, "ctips_id")

In [125]:
ctips_phase = add_project_number(ctips_project, ctips_phase, "ctips_id")

In [126]:
ctips_award = add_project_number(ctips_project, ctips_award, "ctips_id")

In [127]:
ctips_house = add_project_number(ctips_project, ctips_house, "ctips_id")

In [128]:
ctips_senate = add_project_number(ctips_project, ctips_senate, "ctips_id")

In [129]:
ctips_agencies = add_project_number(ctips_project, ctips_agencies, "ctips_id")

In [130]:
ctips_assembly = add_project_number(ctips_project, ctips_assembly, "ctips_id")

In [131]:
ctips_assembly.shape

(4679, 24)

### State Rail Plan

In [132]:
srp_df = har_utils.load_state_rail_plan()

In [133]:
srp_df.sample()

Unnamed: 0,project_time_horizon,project_name,project_description,lead_agency,total_project_cost,srp_region,corridor,sub_corridor_node_1,sub_corridor_node_2,itsp_corridor,project_category,data_source
12,Mid Term,Rengstorff Ave Grade Separation Project,The project will help Caltrain reduce local traffic congestion and train horn noise. A new pedestrian overcrossing will be constructed across Rengstorff Avenue to maintain east-west pedestrian and bicycle connectivity.,Caltrain,3500000,Northern California Megaregion,San Francisco Peninsula Corridor,San Francisco,San Jose,Central Coast - San Jose/ San Francisco Bay Area,Grade Separation,State Rail Plan


In [134]:
srp_df = generate_project_id(srp_df, "project_name")

Checking that there are the same number of unique project numbers & rows
True


In [135]:
srp_df_agency = separate_out_df(srp_df, ["project_number", "lead_agency"])

Number of rows left 263


In [136]:
srp_df.columns

Index(['project_time_horizon', 'project_name', 'project_description',
       'lead_agency', 'total_project_cost', 'srp_region', 'corridor',
       'sub_corridor_node_1', 'sub_corridor_node_2', 'itsp_corridor',
       'project_category', 'data_source', 'project_number'],
      dtype='object')

In [137]:
notes_cols = ['project_time_horizon','srp_region', 'corridor', 'sub_corridor_node_1',
       'sub_corridor_node_2', 'itsp_corridor', 'project_category',]

In [138]:
# srp_df = create_notes(srp_df, notes_cols, 'notes')

In [139]:
# srp_df = srp_df.drop(columns = notes_cols + ['lead_agency'])

## Stack

### Agencies
* lrtp_lost_agency
* sb1_agencies
* lp2000_agencies
* ctips_agencies
* srp_df_agency

* Agency should be attached back to projects...

In [140]:
def harmonize_agencies(df: pd.DataFrame, agency_column: str) -> pd.DataFrame:
    df = df.rename(columns={agency_column: "lead_agency"})

    return df

In [141]:
srp_df_agency = harmonize_agencies(srp_df_agency, "lead_agency")

In [142]:
ctips_agencies = harmonize_agencies(ctips_agencies, "lead_agency")

### Assembly_districts
* ctips_assembly
* sb1_assembly
* Find the assembly district if we have the coordinates?

In [143]:
def harmonize_assembly(df: pd.DataFrame, assembly_column: str) -> pd.DataFrame:
    df = df.rename(columns={assembly_column: "assembly_districts"})

    return df

In [144]:
ctips_assembly = harmonize_assembly(ctips_assembly, "assembly")

In [145]:
sb1_assembly = harmonize_assembly(sb1_assembly, "assemblydistrict")

In [146]:
all_assembly_df = pd.concat([ctips_assembly, sb1_assembly])

### Awards
* sb1_awards
* lp2000_award
* ctips_award

In [147]:
def harmonize_awards(
    df: pd.DataFrame, year: str, grant_program_col: str
) -> pd.DataFrame:
    rename_dict = {year: "award_year", grant_program_col: "grant_program"}
    df = df.rename(columns=rename_dict)

    create_cols = ["award_year", "grant_program"]
    for column in create_cols:
        if column not in df:
            df[column] = "Not Available"
    return df

In [148]:
sb1_awards = harmonize_awards(sb1_awards, "", "projprogram")

In [149]:
lp2000_award = harmonize_awards(
    lp2000_award, "state_fiscal_awarded_year", "grant_program"
)

In [150]:
ctips_award = harmonize_awards(ctips_award, "line_year", "program")

In [151]:
all_awards_df = pd.concat([sb1_awards, lp2000_award, ctips_award])

### Cities
* lrtp_lost_city
* sb1_city
* Need to find a way to retain Daly City

In [152]:
def harmonize_cities(df: pd.DataFrame, city_column: str) -> pd.DataFrame:
    df = df.rename(columns={city_column: "city"})

    df.city = (
        df.city.str.title()
        .str.replace("City Of", "")
        .str.replace("Submitted By", "")
        .str.strip()
    )

    return df

In [153]:
sb1_city = harmonize_cities(sb1_city, "cityname")

In [154]:
lrtp_lost_city = harmonize_cities(lrtp_lost_city, "city")

In [155]:
all_city_df = pd.concat([sb1_city, lrtp_lost_city])

In [156]:
all_city_df[["city"]].drop_duplicates().sort_values(by=["city"])

Unnamed: 0,city
832,
8964,Adelanto
2825,Adelanto City
8512,Adelanto Apple Valley Barstow Hesperia Victorville
54,Adelanto Victorville
8965,Agoura Hills
286,Agoura Hills City
8626,Agoura Hills Alhambra Arcadia Artesia Azusa Baldwin Park Bell Bell Gardens Bellflower Beverly Hills Bradbury Burbank Calabasas Carson Cerritos Commerce Compton Cudahy Culver City Downey Duarte El Monte El Segundo Gardena Glendale
4091,Agoura Hills Calabasas Hidden Hills Los Angeles
2884,Alameda


### Counties
* lrtp_lost_county
* sb1_counties
* lp2000_county
* ctips_county
* Clean the counties too, find a way to separate out counties separated out by spaces

In [157]:
def harmonize_counties(df: pd.DataFrame, county_column: str) -> pd.DataFrame:
    rename_columns = {
        county_column: "county",
    }

    df = df.rename(columns=rename_columns)

    df.county = (
        df.county.str.title().str.replace("County", "").str.replace("Submitted By", "")
    )

    return df

In [158]:
ctips_county = harmonize_counties(ctips_county, "county")

In [159]:
lp2000_county = harmonize_counties(lp2000_county, "county_name")

In [161]:
sb1_counties = harmonize_counties(sb1_county, "countyname")

In [162]:
lrtp_lost_county = harmonize_counties(lrtp_lost_county, "county")

In [163]:
all_counties_df = pd.concat(
    [ctips_county, lp2000_county, sb1_counties, lrtp_lost_county]
)

### Congressional Districts
* None of the datasets have congressional district info

### CT Districts
* Manually fill it in for LRTP/LOST? 
* Missing this column in SB1 but it shouldn't be...
* lp2000_district
* ctips_district

In [164]:
def harmonize_ct_districts(df: pd.DataFrame, district_column: str) -> pd.DataFrame:
    df = df.rename(columns={district_column: "ct_districts"})

    return df

In [165]:
lp2000_district = harmonize_ct_districts(lp2000_district, "district_code")

In [166]:
ctips_district = harmonize_ct_districts(ctips_district, "districtid")

In [167]:
all_ct_dist_df = pd.concat([lp2000_district, ctips_district])

### Geometry
* lrtp_lost_gdf
* sb1_geo
* 

In [168]:
def harmonize_geo(gdf: gpd.GeoDataFrame, geography_col: str) -> gpd.GeoDataFrame:
    gdf = gdf.rename(columns={geography_col: "geometry"})
    # Add CRS
    gdf = gdf.set_geometry('geometry')
    return gdf

In [169]:
lrtp_lost_gdf = harmonize_geo(lrtp_lost_gdf, "geometry")

In [170]:
sb1_geo = harmonize_geo(sb1_geo, "geometry")

In [171]:
all_geo_df = pd.concat([lrtp_lost_gdf, sb1_geo])

In [172]:
type(all_geo_df)

geopandas.geodataframe.GeoDataFrame

In [173]:
len(all_geo_df)

2940

### Phase Funding
* ctips_phase
* lp2000_phase

### Project (aka the primary/base table)

### Senate District