## Add SB1 geographic information.

In [1]:
import _utils
import geopandas as gpd
import numpy as np
import pandas as pd
from calitp.sql import to_snakecase



In [2]:
import fuzzywuzzy
from fuzzywuzzy import process



In [3]:
import fsspec
from calitp import *
from calitp.storage import get_fs

fs = get_fs()
import os

In [4]:
pd.options.display.max_columns = 200
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [5]:
def basic_cleaning(df, agency_col: str, project_name_col:str, project_id_col: str, project_desc_col:str):
    """
    Normalize the agency, project title, project description and project id
    columns so two project lists can be compared or merged on them.

    Args:
        df: DataFrame (or GeoDataFrame) to clean. It is first handed to
            `_utils.organization_cleaning`; the frame that helper returns is
            then modified column by column. Whether the caller's own frame is
            also modified depends on whether that helper copies, which is not
            visible from this notebook.
        agency_col: column passed to `_utils.organization_cleaning`.
        project_name_col: project title column (string values).
        project_id_col: project id column. Must hold strings; `.str` raises
            on a numeric column.
        project_desc_col: project description column (string values).

    Returns:
        The cleaned frame. Nulls in float64 columns become 0.0 and nulls in
        object columns become the string "None"; other dtypes are untouched.
    """
    df = _utils.organization_cleaning(df, agency_col)

    # Lowercase, drop every character that is neither a word character nor
    # whitespace, and trim. `regex=True` is explicit because the pandas default
    # for `str.replace` differs between major versions.
    for col in [project_name_col, project_desc_col]:
        df[col] = (
            df[col].str.lower().str.replace(r"[^\w\s]", "", regex=True).str.strip()
        )

    # Some project names contain the year: keep only the text before the first
    # "20", then trim the whitespace the split leaves behind.
    # NOTE(review): this also truncates titles that contain "20" for any other
    # reason (e.g. "sr 120 ..." becomes "sr 1").
    df[project_name_col] = df[project_name_col].str.split("20").str[0].str.strip()

    # Project ID: remove apostrophes, lowercase any letters, trim.
    df[project_id_col] = (
        df[project_id_col].str.replace("'", "", regex=False).str.lower().str.strip()
    )

    # Get rid of | in object cols
    # https://stackoverflow.com/questions/68152902/extracting-only-object-type-columns-in-a-separate-list-from-a-data-frame-in-pand
    string_cols = df.select_dtypes(include=["object"]).columns.to_list()
    for col in string_cols:
        try:
            df[col] = df[col].str.replace("|", "", regex=False)
        except AttributeError:
            # Object column that does not hold strings: leave it as is and
            # carry on with the remaining columns.
            continue

    # Fill in NA, per column, only for the two dtypes that have a fill value.
    fill_values = {
        col: 0.0 for col in df.select_dtypes(include=["float64"]).columns
    }
    fill_values.update(
        {col: "None" for col in df.select_dtypes(include=["object"]).columns}
    )
    df = df.fillna(value=fill_values)

    return df

### Non SHOPP-ATP-TIRCP
* No year information for projects.

In [6]:
# Read in 10 Year non SHOPP with ATP and TIRCP
nonshopp = to_snakecase(
    pd.read_excel(f"{_utils.GCS_FILE_PATH}cleaned_data_atp_tircp.xlsx")
)

In [7]:
# Columns of the non-SHOPP table to keep when joining.
# Each column is listed once: a repeated name would produce a duplicated
# column when the list is used to subset the frame.
non_shopp_subset = [
    "ppno",
    "ct_project_id",
    "ea",
    "project_name",
    "lead_agency",
    "previous_caltrans_nominations",
    "full_county_name",
    "county",
    "district_full_name",
    "district",
    "project_description",
    "current_phase",
    "primary_mode",
    "urban_rural",
    "total_project_cost__$1,000",
    "total_unfunded_need__$1,000",
    "shs_capacity_increase_detail",
]

In [8]:
# nonshopp = nonshopp[non_shopp_subset]

In [9]:
# Zero-pad district numbers to two characters (e.g. 7 -> "07").
# NOTE(review): assumes `district` holds integers when this cell runs — TODO confirm.
# Afterwards the column holds strings, so re-running this cell applies the format
# to strings, which does not left-pad with zeros.
nonshopp.district = nonshopp.district.map("{:02}".format)

In [10]:
nonshopp = basic_cleaning(nonshopp, "lead_agency", "project_name", "ct_project_id", "project_description")



### 9 Sample Non SHOPP 

In [11]:
nine_projects_names = [
    "LA-210 Median Concrete Barrier Renovation",
    "SR-14 Widening Project",
    "US 395 Freight Mobility and Safety Project",
    "East Bay Greenway Multimodal Corridor Project",
    "Watsonville-Santa Cruz Multimodal Corridor Program",
    "SM 101 Woodside Road Interchange and Port Access Project",
    "I-710 Integrated Corridor Management",
    "Five Cities Multimodal Transportation Network Enhancement Project",
    "SR-86/Avenue 50 New Interchange (Phase II)",
]

In [12]:
nine_projects_names = [x.lower() for x in nine_projects_names]

In [13]:
# Caltrans project ids of the nine sample projects, one entry per project.
nine_projects_id = [
    "0422000202",
    "0414000032",
    "0520000083",
    "0515000063",
    "0721000056",
    "0716000370",
    "0813000222",
    "0814000144",
    "0720000165",
]

In [14]:
nine_sample_projects = (nonshopp[nonshopp.ct_project_id.isin(nine_projects_id)].reset_index(drop=True))

* Solutions for Congested Corridors Program (SCCP): 1
* Trade Corridor Enhancement Program (TCEP): 3
* Only 3 projects seem to have been awarded. 
    * east bay greenway multimodal corridor project phase 1
    * us 101woodside road interchange and port access project
    * watsonvillesanta cruz multimodal corridor program wscmcp cycle 3 project contract 1 sr 1 freedom to state park aux lanes bus on shoulders and coastal rail trail segment 12

In [15]:
# nine_sample_projects[['project_name','project_description','county','previous_caltrans_nominations']]

### Sb1 Geo
* https://odpsvcs.dot.ca.gov/arcgis/rest/services/RCA/RCA_Projects_032022/FeatureServer

In [16]:
# Subset to preview SB1 vs Nonshopp. Nonshopp is on the left, sb1 on the right
preview_cols =  ['project_name','projecttitle','project_description','projectdescription', 'full_county_name', 'countynames']

#### Step 1: Read in files with geometry 

In [17]:
url_pt1 = "https://odpsvcs.dot.ca.gov/arcgis/rest/services/RCA/RCA_Projects_032022/FeatureServer/"
url_pt2 = "/query?where=1%3D1&objectIds=&time=&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&distance=&units=esriSRUnit_Foot&relationParam=&outFields=*+&returnGeometry=true&maxAllowableOffset=&geometryPrecision=&outSR=&gdbVersion=&historicMoment=&returnDistinctValues=false&returnIdsOnly=false&returnCountOnly=false&returnExtentOnly=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&returnZ=false&returnM=false&multipatchOption=&resultOffset=&resultRecordCount=&returnTrueCurves=false&sqlFormat=none&f=geojson"

In [18]:
def rest_server(layer_ids=range(0, 22)):
    """
    Download layers from the SB1 FeatureServer and stack them row-wise.

    Each layer id is placed between the module-level `url_pt1` and `url_pt2`
    strings to form a GeoJSON query URL, which is read with geopandas and
    passed through `to_snakecase`.

    Args:
        layer_ids: iterable of layer ids to request. Defaults to 0 through 21,
            the range this notebook originally hard-coded.

    Returns:
        All requested layers concatenated along the rows. Row labels are not
        reset, so they repeat across layers. An empty GeoDataFrame is returned
        when `layer_ids` is empty.
    """
    # Read every layer first and concatenate once, instead of growing a frame
    # inside the loop starting from an empty plain DataFrame.
    layers = [
        to_snakecase(gpd.read_file(f"{url_pt1}{layer_id}{url_pt2}"))
        for layer_id in layer_ids
    ]
    if not layers:
        return gpd.GeoDataFrame()
    return pd.concat(layers, axis=0)

In [19]:
sb1_geo1 = rest_server()

In [20]:
sb1_geo1.columns

Index(['objectid', 'agencyids', 'agencies', 'programcodes', 'iijaprogram',
       'iijacodes', 'projectstatuscodes', 'fiscalyears', 'fiscalyearcodes',
       'projectstatuses', 'sb1funds', 'iijafunds', 'totalcost', 'dateupdated',
       'projectcount', 'assemblydistricts', 'senatedistricts',
       'congressionaldistricts', 'assemblycodes', 'senatecodes',
       'congressionalcodes', 'countynames', 'citynames', 'countycodes',
       'citycodes', 'ct_codes', 'ct_districts', 'issb1', 'isiija', 'isonshs',
       'issb1codes', 'isiijacode', 'isonshscodes', 'popup', 'geometry',
       'projectid', 'projecttitle', 'projectdescription'],
      dtype='object')

In [21]:
sb1_geo2 = basic_cleaning(sb1_geo1, 'agencies','projecttitle','projectid',
                         'projectdescription')



In [22]:
# sb1_geo = sb1_geo.drop(columns = 'popup')

In [23]:
sb1_geo2.programcodes.value_counts()

SHOPP    2741
HM       1163
LSR       509
ATP       321
SHOPA     165
SGR       161
STIP      126
TIRCP      96
LPP-F      68
TCEP       63
LPP-C      57
STA        49
SCCP       40
FM         12
SRA        11
Name: programcodes, dtype: int64

In [24]:
# All geometry is valid
sb1_geo2.geometry.is_valid.sum() == len(sb1_geo2)

True

In [25]:
sb1_geo2.geometry.notna().sum()  == len(sb1_geo2)

Given a GeoSeries 's', you can use '~s.is_empty & s.notna()' to get back the old behaviour.



True

In [26]:
len(sb1_geo2[~sb1_geo2.geometry.is_empty]) 

5581

In [27]:
# Throw out missing geometry
missing_geo = sb1_geo2[sb1_geo2.geometry.is_empty]

In [28]:
sb1_geo2 = sb1_geo2[~sb1_geo2.geometry.is_empty].reset_index(drop = True)

In [29]:
len(sb1_geo2) == len(sb1_geo2[~sb1_geo2.geometry.is_empty]) 

True

In [30]:
sb1_geo2.projecttitle.value_counts().head()

None                        1321
major damage restoration     465
safety improvements          366
pavement rehabilitation      305
pavement  hm1                220
Name: projecttitle, dtype: int64

In [31]:
sb1_geo2.shape

(5581, 38)

In [32]:
# sb1_geo.countynames.sort_values().unique()

In [33]:
# sb1_geo.explore()

### Compare with 9 Sample Projects

In [34]:
# Subset sb1_geo2 to only the programs these 9 projects have applied for (TCEP and SCCP).
# NOTE(review): tcep_sccp1 is not referenced again in the visible cells; the
# comparison below is done with tcep_sccp2 instead.
tcep_sccp1 = sb1_geo2[sb1_geo2["programcodes"].str.contains(('TCEP|SCCP'))].reset_index(drop = True)

In [35]:
# Subset sb1_geo2 to only the counties these 9 projects are located in:
# keep rows whose countynames contains any of the listed county names.
tcep_sccp2 = sb1_geo2[sb1_geo2["countynames"].str.contains(('Alameda|San Mateo|Santa Cruz|San Luis Obispo|Los Angeles|San Bernardino|Riverside'))].reset_index(drop = True)

In [36]:
# Subset sb1_geo to only programs these 9 projects have applied for
tcep_sccp2 = tcep_sccp2[tcep_sccp2["programcodes"].str.contains(('TCEP|SCCP'))].reset_index(drop = True)

In [37]:
tcep_sccp2.shape

(37, 38)

In [38]:
tcep_sccp2.programcodes.value_counts()

TCEP    26
SCCP    11
Name: programcodes, dtype: int64

In [39]:
pd.merge(
    nine_sample_projects,
    tcep_sccp2,
    how="outer",
    left_on=["ct_project_id"],
    right_on=["projectid"],
    indicator=True,
)[["_merge"]].value_counts()

_merge    
right_only    37
left_only      9
both           0
dtype: int64

In [40]:
pd.merge(
    nine_sample_projects,
    tcep_sccp2,
    how="outer",
    left_on=["project_name"],
    right_on=["projecttitle"],
    indicator=True,
)[["_merge"]].value_counts()

_merge    
right_only    37
left_only      9
both           0
dtype: int64

* Eyeballing matches
    * route 395 widening from sr 18 to chamberlaine way in SB1 could match us 395 freight mobility and safety project in Non SHOPP
    * state route 1 state park to bayporter auxiliary lanes in SB1 is watsonvillesanta cruz multimodal corridor program wscmcp cycle 3 project contract 1 sr 1 freedom to state park aux lanes bus on shoulders and coastal rail trail segment 12 in non SHOPP

In [41]:
# tcep_sccp2[['projecttitle','agencies','countynames', 'projectdescription']].sort_values(['countynames','projecttitle'])

In [42]:
# nine_sample_projects[['project_name','lead_agency','full_county_name','project_description']].sort_values(['full_county_name','project_name'])

In [43]:
non_shopp_projects_sb1_list = ['route 395 widening from sr 18 to chamberlaine way', 
                             'state route 1  state park to bayporter auxiliary lanes']

In [44]:
non_shopp_projects_in_sb1 = tcep_sccp2[tcep_sccp2["projecttitle"].isin(non_shopp_projects_sb1_list)].reset_index(drop = True)

In [45]:
non_shopp_projects_in_sb1.projecttitle = non_shopp_projects_in_sb1.projecttitle.replace({
    'route 395 widening from sr 18 to chamberlaine way': 'us 395 freight mobility and safety project',
    'state route 1  state park to bayporter auxiliary lanes': 'watsonvillesanta cruz multimodal corridor program wscmcp cycle 3 project contract 1  sr 1 freedom to state park aux lanes bus on shoulders and coastal rail trail segment 12'
})

In [46]:
non_shopp_projects_in_sb1[['agencies','programcodes', 'countynames','projecttitle','projectdescription']]

Unnamed: 0,agencies,programcodes,countynames,projecttitle,projectdescription
0,Caltrans,SCCP,Santa Cruz,watsonvillesanta cruz multimodal corridor program wscmcp cycle 3 project contract 1 sr 1 freedom to state park aux lanes bus on shoulders and coastal rail trail segment 12,near capitola and aptos state route 1 from state park drive to bayporter interchanges construct auxiliary lanes between interchanges includes reconstruction of the capitola avenue overcrossing to accommodate new lanes on state route 1
1,,TCEP,San Bernardino,us 395 freight mobility and safety project,on us 395 between sr 18 and chamberlaine way in the city of adelanto widen this section of us 395 from two to four lanes proposed improvements also include operational improvements such as adding turn lanes and signal improvements at intersections


In [47]:
len(non_shopp_projects_in_sb1)

2

In [48]:
nine_sample_projects_geo = pd.merge(
    non_shopp_projects_in_sb1[['projecttitle','geometry', 'projectdescription']],
    nine_sample_projects,
    how="outer",
    left_on=["projecttitle"],
    right_on=["project_name"],
)

In [49]:
len(nine_sample_projects_geo)

9

In [50]:
type(nine_sample_projects_geo)

geopandas.geodataframe.GeoDataFrame

In [51]:
# nine_sample_projects_geo[['project_name','projecttitle','project_description','projectdescription', 'full_county_name']]

In [52]:
#  _utils.geojson_gcs_export(nine_sample_projects_geo,_utils.GCS_FILE_PATH, 'nine_sample_projects_geom')

### Compare with ALL Projects

#### Fuzzy Matches

##### Try with Project Titles

In [53]:
def replace_matches_in_column(df, column, new_col_name, string_to_match, min_ratio, scorer=None, limit=10):
    """
    Label the rows of `df` whose `column` value is a close fuzzy match to
    `string_to_match`.

    The unique non-null values of `df[column]` are scored against
    `string_to_match`; rows holding a value that scores strictly above
    `min_ratio` get `string_to_match` written into `new_col_name`.

    Args:
        df: frame to search. It is modified in place.
        column: column whose values are compared with `string_to_match`.
        new_col_name: column that receives `string_to_match` on matching rows.
            It is created if missing. An existing label on a matching row is
            overwritten, so the last matching call wins when this is run in a
            loop.
        string_to_match: the string to look for.
        min_ratio: score threshold (exclusive).
        scorer: fuzzywuzzy scorer. Defaults to `fuzzywuzzy.fuzz.token_sort_ratio`.
        limit: maximum number of candidates kept from the fuzzy search.

    Returns:
        None. The result is the side effect on `df`.
    """
    if scorer is None:
        scorer = fuzzywuzzy.fuzz.token_sort_ratio

    # Unique values only; nulls are dropped so that only real values are
    # handed to the fuzzy matcher.
    candidates = df[column].dropna().unique()

    # Get the closest matches to our input string (at most `limit` of them).
    if len(candidates) == 0:
        scored = []
    else:
        scored = fuzzywuzzy.process.extract(
            string_to_match, candidates, limit=limit, scorer=scorer
        )

    # Each result starts with (value, score); keep values above the threshold.
    close_matches = [result[0] for result in scored if result[1] > min_ratio]

    # Get the rows of all the close matches in our dataframe
    rows_with_matches = df[column].isin(close_matches)

    # Write the input string onto every row with a close match
    df.loc[rows_with_matches, new_col_name] = string_to_match

In [54]:
nonshopp_projects = nonshopp.project_name.unique().tolist()

In [55]:
len(nonshopp_projects)

901

In [56]:
# Delete project titles that are none
sb1_w_projectnames = sb1_geo2.loc[sb1_geo2.projecttitle != "None"].reset_index(drop = True)

In [57]:
len(sb1_w_projectnames)

4260

In [58]:
sb1_w_projectnames.projecttitle.nunique()

291

In [59]:
for i in nonshopp_projects:
    replace_matches_in_column(
        sb1_w_projectnames
        , "projecttitle", "project_title_fuzzy_match", i,90 
    )

In [60]:
fuzzy_match_results = sb1_w_projectnames.loc[sb1_w_projectnames.project_title_fuzzy_match.notnull()].reset_index(drop = True)

In [61]:
fuzzy_match_results[['projecttitle','project_title_fuzzy_match']].sort_values('projecttitle').drop_duplicates()

Unnamed: 0,projecttitle,project_title_fuzzy_match
1,caldwell interchange,caldwell interchange
10,etiwanda avenue grade separation,etiwanda ave grade separation
15,eurekaarcata corridor improvement,eurekaarcata corridor improvement
18,excelsior expressway,excelsior expressway ii
13,i680sr4 interchange improvements phases 1 2 a,i680sr4 interchange improvements phases 1 2 a
5,interstate 680 southbound express lane from state route 84 to alcosta blvd,interstate 680 southbound express lane from state route 84 to alcosta blvd
4,interstate 680 southbound express lane from state route 84 to alcosta blvdtoll system,interstate 680 southbound express lane from state route 84 to alcosta blvd
19,livingston widening northbound,livingston widening southbound
14,north county corridor project phase 1 from claribel road to clause road,north county corridor phase 1 from claribel road to claus road
16,redding to anderson six lane phase 1 2,redding to anderson six lane phase 1 and 2


In [62]:
outer_m_project_titles = pd.merge(
    fuzzy_match_results,
    nonshopp,
    how="outer",
    left_on=["project_title_fuzzy_match"],
    right_on=["project_name"],
    indicator=True)

In [63]:
type(outer_m_project_titles)

geopandas.geodataframe.GeoDataFrame

In [69]:
# outer_m_project_titles.loc[outer_m_project_titles._merge == 'both'][preview_cols]

##### Try with project description since titles are very vague.

In [65]:
def replace_matches_set_ratio(df, column, new_col_name, string_to_match, min_ratio, scorer=None, limit=10):
    """
    Label the rows of `df` whose `column` value is a close fuzzy match to
    `string_to_match`, scoring with fuzzywuzzy's token-set ratio by default.

    The unique non-null values of `df[column]` are scored against
    `string_to_match`; rows holding a value that scores strictly above
    `min_ratio` get `string_to_match` written into `new_col_name`.

    Args:
        df: frame to search. It is modified in place.
        column: column whose values are compared with `string_to_match`.
        new_col_name: column that receives `string_to_match` on matching rows.
            It is created if missing. An existing label on a matching row is
            overwritten, so the last matching call wins when this is run in a
            loop.
        string_to_match: the string to look for.
        min_ratio: score threshold (exclusive).
        scorer: fuzzywuzzy scorer. Defaults to `fuzzywuzzy.fuzz.token_set_ratio`.
        limit: maximum number of candidates kept from the fuzzy search.

    Returns:
        None. The result is the side effect on `df`.
    """
    if scorer is None:
        scorer = fuzzywuzzy.fuzz.token_set_ratio

    # Unique values only; nulls are dropped so that only real values are
    # handed to the fuzzy matcher.
    candidates = df[column].dropna().unique()

    # Get the closest matches to our input string (at most `limit` of them).
    if len(candidates) == 0:
        scored = []
    else:
        scored = fuzzywuzzy.process.extract(
            string_to_match, candidates, limit=limit, scorer=scorer
        )

    # Each result starts with (value, score); keep values above the threshold.
    close_matches = [result[0] for result in scored if result[1] > min_ratio]

    # Get the rows of all the close matches in our dataframe
    rows_with_matches = df[column].isin(close_matches)

    # Write the input string onto every row with a close match
    df.loc[rows_with_matches, new_col_name] = string_to_match

In [88]:
# Count the words in each project description so that descriptions of
# 10 words or fewer can be filtered out in the following cells.
# The pattern is a raw string so that \w is not read as a string escape.
sb1_geo2["project_str_count"] = sb1_geo2["projectdescription"].str.count(r"\w+")

In [89]:
len(sb1_geo2.loc[sb1_geo2.project_str_count > 10])

3638

In [94]:
sb1_geo2.loc[sb1_geo2.project_str_count > 10]['projectdescription'].nunique()

3501

In [95]:
sb1_w_projectdesc = sb1_geo2.loc[sb1_geo2.project_str_count > 10].head(500)["projectdescription"].unique().tolist()

In [96]:
len(sb1_w_projectdesc)

476

In [97]:
for i in sb1_w_projectdesc:
    replace_matches_set_ratio(
        nonshopp
        , "project_description", "project_desc_fuzzy_match", i, 95
    )

In [98]:
nonshopp.loc[nonshopp.project_desc_fuzzy_match.notnull()].reset_index(drop = True)[['project_description','project_desc_fuzzy_match']]

Unnamed: 0,project_description,project_desc_fuzzy_match
0,pavement preservation,this project will perform preservation activities on approximately 32 lane miles of pavement on route 90 in orange county
1,on route 101 in marin county in and near city of novato from just south of the franklin avenue overhead to 03 miles south of the marinsonoma county line the project will widen route 101 to construct a southbound hov lane from 03 miles south of the marinsonoma county line to just south of the franklin avenue overhead 60 miles and a northbound hov lane from 17 miles north of the atherton avenue overcrossing to 03 miles south of the marinsonoma county line 35 miles the project includes roadway and bridge widening for hov lanes and standard shoulders the project will also upgrade the horizontal and vertical roadway alignment for a 70 mph design speed modify the redwood landfill interchange ramps to conform with the new alignment and restripe a frontage road redwood boulevard for class ii bike lanes in novato\n\nconstruct 16 miles of mainline shoulder 9130 lf culverts 076 miles of bikeped facilities 95 miles of hothov mainline widen 4 shoulders modify 4 ramps and provide corrects to 10 curve and vertical alignments install 3 traffic monitoring detection stations 2 changeable message signs 1 extinguishable message signs 6 freeway ramp meters and 3 close circuit television cameras,on route 101 in marin county in and near city of novato from just south of the franklin avenue overhead to 03 miles south of the marinsonoma county line the project will widen route 101 to construct a southbound hov lane from 03 miles south of the marinsonoma county line to just south of the franklin avenue overhead 60 miles and a northbound hov lane from 17 miles north of the atherton avenue overcrossing to 03 miles south of the marinsonoma county line 35 miles the project includes roadway and bridge widening for hov lanes and standard shoulders the project will also upgrade the horizontal and vertical roadway alignment for a 70 mph design speed modify the redwood landfill interchange ramps to conform with the new alignment and restripe a frontage road redwood boulevard for class ii bike 
lanes in novato
2,on state route 68 from josselyn canyon road to spreckels blvd operational improvements replaces signalized intersections with roundabouts to achieve smooth traffic flow provide active transportation facilities at intersections and achieve transit benefits achieving these improvements on the existing corridor will preempt need for previouslyconsidered bypass alignment,on state route 68 from josselyn canyon road to spreckels blvd operational improvements
3,in monterey county at castroville boulevard from post mile r16 to 14 build a new interchange at castroville boulevard and highway 156,in monterey county at castroville boulevard from post mile r16 to 14 build a new interchange at castroville boulevard and highway 156
4,reconstruct interchange,on route 99 in tulare county between 03 miles south of the avenue 280 caldwell avenue overcrossing to 04 miles north of the avenue 280 overcrossing reconstruct interchange
5,reconstruct interchange,on route 99 in tulare county between 03 miles south of the avenue 280 caldwell avenue overcrossing to 04 miles north of the avenue 280 overcrossing reconstruct interchange
6,turnbull canyon road,in the city of industry and unincorporated los angeles county along the alameda corridoreast trade corridor at turnbull canyon road replace atgrade crossing with a new grade separated undercrossing add sidewalks and bike lanes
7,realign intersection,in stockton on the northern limits of the navy drive bridge at the san joaquin river to the port of stockton west complex entrance construct a grade separated crossing four lane overcrossing over the fyffe avenue rail line realign mccloy avenue south of its current location to form a stopcontrolled intersection
8,the north county corridor project phase 2 of 4,the north county corridor project consists of 4 separate phases of construction totaling 18 miles in length the scope of this project is for phase 1 the corridor will be a high capacity bypass around the cities of modesto riverbank and oakdale as shown in exhibit 1 the phase 1 project will be an ultimate 6lane divided expressway beginning at the intersection of claribel road oakdale road extending eastward to the intersection of claribel road claus road it will be access controlled with a 4070median with grade separations over roselle avenue terminal avenue and the burlington northern santa fe railroad tracks this new alignment will build a westeast expressway that will improve regional network circulation connecting from the western end of downtown modesto to the eastward end joining sr120 east of the city of oakdale segments 1 to 4
9,roadway grade separation,in elk grove at lent ranch parkway to interstate 5 construct a 2 lane roadway modification of the i5hood franklin interchange and grade separation at the uprr tracks class 2 bike lanes class 1 path and signalized intersections the project will design and acquire rightofway for 2 lanes that excludes the class 1 path


### Read in all projects
* Compare with CSV.
* Clean it up.

In [None]:
sb1_all_projects_url = "https://odpsvcs.dot.ca.gov/arcgis/rest/services/RCA/RCA_Projects_032022/FeatureServer/22/query?where=1%3D1&objectIds=&time=&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&distance=&units=esriSRUnit_Foot&relationParam=&outFields=*+&returnGeometry=true&maxAllowableOffset=&geometryPrecision=&outSR=&gdbVersion=&historicMoment=&returnDistinctValues=false&returnIdsOnly=false&returnCountOnly=false&returnExtentOnly=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&returnZ=false&returnM=false&multipatchOption=&resultOffset=&resultRecordCount=&returnTrueCurves=false&sqlFormat=none&f=geojson"

In [None]:
# Read in SB1 csv
# sb1_csv = to_snakecase(pd.read_csv(f"{_utils.GCS_FILE_PATH}RebuildingCA_map_Data.csv"))

In [None]:
sb1_all_projects = to_snakecase(gpd.read_file(sb1_all_projects_url))

In [None]:
sb1_all_projects.shape, sb1_all_projects.projecttitle.nunique()

In [None]:
# Same rows, different columns.
# sb1_all_projects.shape, sb1_csv.shape

In [None]:
# Project ID matches
# csv_projectid = set(sb1_csv.project_id.unique().tolist())
# geojson_projectid = set(sb1_all_projects.projectid.unique().tolist())
# csv_projectid - geojson_projectid

In [None]:
# set(sb1_all_projects.columns).difference(set(sb1_csv.columns))

In [None]:
# Align funding
sb1_all_projects = _utils.align_funding_numbers(
    sb1_all_projects,
    [
        "totalcost",
        "sb1funds",
    ],
)

In [None]:
# Lower case and clean project names
sb1_all_projects.projecttitle = (
    sb1_all_projects.projecttitle.str.lower().str.strip().str.split("20").str[0]
)

In [None]:
# No geometry, just drop it
sb1_all_projects = sb1_all_projects.drop(columns=["geometry"])

### Figure out why the rows differ between `sb1_all_projects` and `sb1_geo2`

In [None]:
# Subset to join back to the 9,000 projects above
# subset = ['objectid', 'agencyids', 'projecttitle','programcodes', 'projectid','geometry']

In [None]:
# NOTE(review): `full_gdf2` and `sb1_project_id` are not assigned anywhere in the
# visible notebook, so this cell raises NameError on a fresh kernel unless they
# are defined elsewhere — TODO define them or remove the cells that use them.
full_gdf2.projectid.nunique(), sb1_project_id.project_id.nunique(), len(sb1_project_id)

In [None]:
pd.merge(
    sb1_project_id,
    full_gdf2,
    how="outer",
    left_on=["project_id"],
    right_on=["projectid"],
    indicator=True,
)[["_merge"]].value_counts()

In [None]:
sb1_m = pd.merge(
    full_gdf2,
    sb1_project_id,
    how="left",
    left_on=["projectid"],
    right_on=["project_id"],
)

In [None]:
sb1_m.project_name = sb1_m.project_name.fillna(sb1_m.projecttitle)

In [None]:
pd.merge(nonshopp, sb1_m, how="outer", on=["project_name"], indicator=True)[
    ["_merge"]
].value_counts()

In [None]:
# NOTE(review): `sb1` is not assigned anywhere in the visible notebook (the CSV
# read above is commented out) — TODO confirm where it should come from.
sb1.shape, sb1.project_name.nunique()

In [None]:
# Titles are not necessarily specifically named.
# Tends to be very general
# sb1.project_name.value_counts()

In [None]:
sb1_subset = [
    "project_name",
    "implementing_agency",
    "sb1_program",
    "fiscal_year",
    "project_description",
    "total_cost",
    "sb1_funds",
    "is_sb1?",
    "project_status",
    "assembly_districts",
    "senate_districts",
    "congressional_districts",
    "counties",
    "cities",
    "caltrans_districts",
    "on_shs?",
]

In [None]:
sb1_2 = sb1[sb1_subset]

In [None]:
sb1.sb1_program.value_counts()

In [None]:
sb1_2.shape

In [None]:
# Align funding
sb1_2 = _utils.align_funding_numbers(
    sb1_2,
    [
        "total_cost",
        "sb1_funds",
    ],
)

In [None]:
# sb1_geo2 = sb1_geo[subset]

In [None]:
# NOTE(review): `sb1_geo` is not assigned anywhere in the visible notebook; the
# frames built above are named sb1_geo1 and sb1_geo2 — TODO confirm which one is meant.
sb1_all_projects.shape, sb1_geo.shape

In [None]:
# Objectid: no cleaning for either of dfs
# This is a int64 dtype...yet zero merges
pd.merge(
    sb1_all_projects,
    sb1_geo,
    how="outer",
    on=["objectid"],
    indicator=True,
)[["_merge"]].value_counts()

In [None]:
# NOTE(review): `clean_project_names_id` is neither defined nor imported by name in
# the visible notebook — TODO confirm whether it comes from a wildcard import or
# should be replaced with basic_cleaning.
sb1_all_projects = clean_project_names_id(sb1_all_projects, 'projectid', 'projecttitle')

In [None]:
sb1_geo2 = clean_project_names_id(sb1_geo, 'projectid','projecttitle')

In [None]:
pd.merge(
    sb1_all_projects,
    sb1_geo2,
    how="outer",
    left_on=["projectid"],
    right_on = ["projectid"],
    indicator=True,
)[["_merge"]].value_counts()

In [None]:
pd.merge(
    sb1_all_projects,
    sb1_geo2,
    how="outer",
    left_on=["projecttitle", "ct_districts", "agencyid", "fiscalyear"],
    right_on=["projecttitle", "ct_districts", "agencyids", "fiscalyears"],
    indicator=True,
)[["_merge"]].value_counts()

### Merge all other Projects

In [None]:
# 62 matches
pd.merge(
    nonshopp,
    sb1_all_projects,
    how="outer",
    left_on=["project_name"],
    right_on=["projecttitle"],
    indicator=True,
)[["_merge"]].value_counts()

### Tircp
* None of the projects from TIRCP are mapping, even though the names appear the same.

In [None]:
# sb1_tircp = sb1_2.loc[sb1_2.sb1_program == "Transit and Intercity Rail Capital Program"].reset_index(drop = True)

In [None]:
# sb1_tircp[['project_name','caltrans_districts','counties']].sort_values('project_name').head(2)

In [None]:
# tircp_sb[['project_name']].sort_values(by = 'project_name')

In [None]:
# tircp_shopp[['project_name']].sort_values(by = 'project_name')

In [None]:
# Keep the rows whose previous_caltrans_nominations mentions "TIRCP".
# NOTE(review): `df` is not assigned anywhere in the visible notebook; presumably
# the non-SHOPP frame (`nonshopp`) is meant, since it carries this column — TODO confirm.
tircp_shopp = df.loc[
    df.previous_caltrans_nominations.str.contains("TIRCP")
].reset_index(drop=True)

In [None]:
# Outer-merge the two TIRCP lists on project_name and count matches per side.
# Open question from the author: why are there no matches?
# NOTE(review): `tircp_sb` is not assigned anywhere in the visible notebook — TODO
# define it before this cell.
pd.merge(tircp_shopp, tircp_sb, how="outer", on=["project_name"], indicator=True)[
    ["_merge"]
].value_counts()