## Add SB1 geographic information.

In [None]:
import _utils
import geopandas as gpd
import numpy as np
import pandas as pd
from calitp.sql import to_snakecase

In [None]:
import fuzzywuzzy
from fuzzywuzzy import process

In [None]:
import fsspec
from calitp import *
from calitp.storage import get_fs

fs = get_fs()
import os

In [None]:
pd.options.display.max_columns = 200
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [None]:
def basic_cleaning(
    df,
    agency_col: str,
    project_name_col: str,
    project_id_col: str,
    project_desc_col: str,
    data: str,
):
    """Normalize the text columns of a project table so they can be matched.

    Args:
        df: DataFrame (or GeoDataFrame) holding one project per row.
        agency_col: Name of the agency column; handed to
            `_utils.organization_cleaning`.
        project_name_col: Name of the project title column.
        project_id_col: Name of the project ID column (must hold strings).
        project_desc_col: Name of the project description column.
        data: Label of the dataset being cleaned. Not used by this function;
            kept so existing callers keep working.

    Returns:
        The frame returned by `_utils.organization_cleaning`, with the title
        and description lowercased and stripped of punctuation, a
        `<column>_count` word count added for each of them, the project ID
        lowercased without apostrophes, `|` removed from every string column
        and, when a `popup` column exists, a title extracted from it.
    """
    df = _utils.organization_cleaning(df, agency_col)

    # Everything that is neither a word character nor whitespace.
    punctuation = r"[^\w\s]"

    # Lowercase, drop punctuation and trim titles & descriptions, then count
    # the words that are left. `regex=True` is explicit because the default
    # of `str.replace` differs between pandas versions.
    for col in (project_name_col, project_desc_col):
        df[col] = (
            df[col].str.lower().str.replace(punctuation, "", regex=True).str.strip()
        )
        df[f"{col}_count"] = df[col].str.count(r"\w+")

    # Some project names contain the year: keep only the text before the
    # first "20". This also truncates names that contain "20" for any other
    # reason (e.g. a route number).
    df[project_name_col] = df[project_name_col].str.split("20").str[0]

    # Project ID: remove apostrophes, lowercase any letters, trim.
    df[project_id_col] = (
        df[project_id_col].str.replace("'", "", regex=False).str.lower().str.strip()
    )

    # Get rid of | in object cols
    # https://stackoverflow.com/questions/68152902/extracting-only-object-type-columns-in-a-separate-list-from-a-data-frame-in-pand
    string_cols = df.select_dtypes(include=["object"]).columns.to_list()
    for col in string_cols:
        try:
            df[col] = df[col].str.replace("|", "", regex=False)
        except AttributeError:
            # Object column that does not hold strings: skip it, but keep
            # cleaning the remaining columns.
            continue

    # Try to extract titles from popups: second "<br  />"-separated chunk,
    # cut at the first "20", lowercased, trimmed, punctuation removed.
    if "popup" in df.columns:
        try:
            df["popup"] = (
                df["popup"]
                .str.split("<br  />")
                .str[1]
                .str.split("20")
                .str[0]
                .str.lower()
                .str.strip()
                .str.replace(punctuation, "", regex=True)
            )
        except AttributeError:
            # Best effort only: leave a non-string popup column untouched.
            pass

    return df

### Non SHOPP
* No year information for projects.

In [None]:
# Read in 10 Year non SHOPP with ATP and TIRCP
nonshopp = to_snakecase(
    pd.read_excel(f"{_utils.GCS_FILE_PATH}cleaned_data_atp_tircp.xlsx")
)

In [None]:
# Subset to join. Each column is listed once so that selecting with this
# list never produces a duplicated column.
non_shopp_subset = [
    "ppno",
    "ct_project_id",
    "ea",
    "project_name",
    "lead_agency",
    "previous_caltrans_nominations",
    "full_county_name",
    "county",
    "district_full_name",
    "district",
    "project_description",
    "current_phase",
    "primary_mode",
    "urban_rural",
    "total_project_cost__$1,000",
    "total_unfunded_need__$1,000",
    "shs_capacity_increase_detail",
]

In [None]:
# nonshopp = nonshopp[non_shopp_subset]

In [None]:
# Add a digit in front of single digits
nonshopp.district = nonshopp.district.map("{:02}".format)

In [None]:
nonshopp = basic_cleaning(nonshopp, "lead_agency", "project_name", "ct_project_id", "project_description", "nonshopp")

In [None]:
nonshopp[['project_name', 'project_name_count']].sample(3)

In [None]:
nine_projects_names = [
    "LA-210 Median Concrete Barrier Renovation",
    "SR-14 Widening Project",
    "US 395 Freight Mobility and Safety Project",
    "East Bay Greenway Multimodal Corridor Project",
    "Watsonville-Santa Cruz Multimodal Corridor Program",
    "SM 101 Woodside Road Interchange and Port Access Project",
    "I-710 Integrated Corridor Management",
    "Five Cities Multimodal Transportation Network Enhancement Project",
    "SR-86/Avenue 50 New Interchange (Phase II)",
]

In [None]:
# nine_projects_names = [x.lower() for x in nine_projects_names]

In [None]:
# Caltrans project IDs (matched against `ct_project_id`) of the nine sample
# projects, one entry per project.
nine_projects_id = [
    "0422000202",
    "0414000032",
    "0520000083",
    "0515000063",
    "0721000056",
    "0716000370",
    "0813000222",
    "0814000144",
    "0720000165",
]

In [None]:
 nine_sample_projects = (nonshopp[nonshopp.ct_project_id.isin(nine_projects_id)].reset_index(drop=True))

* Solutions for Congested Corridors Program (SCCP): 1
* Trade Corridor Enhancement Program (TCEP): 3
* Only 3 projects seem to have been awarded. 
    * east bay greenway multimodal corridor project phase 1
    * us 101woodside road interchange and port access project
    * watsonvillesanta cruz multimodal corridor program wscmcp cycle 3 project contract 1 sr 1 freedom to state park aux lanes bus on shoulders and coastal rail trail segment 12

In [None]:
# nine_sample_projects[['project_name','project_description','county','previous_caltrans_nominations']]

### SB1 Geo
* https://odpsvcs.dot.ca.gov/arcgis/rest/services/RCA/RCA_Projects_032022/FeatureServer

In [None]:
# Subset to preview SB1 vs Nonshopp. Nonshopp is on the left, sb1 on the right
preview_cols =  ['project_name','projecttitle','project_description','projectdescription', 'full_county_name', 'countynames']

In [None]:
url_pt1 = "https://odpsvcs.dot.ca.gov/arcgis/rest/services/RCA/RCA_Projects_032022/FeatureServer/"
url_pt2 = "/query?where=1%3D1&objectIds=&time=&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&distance=&units=esriSRUnit_Foot&relationParam=&outFields=*+&returnGeometry=true&maxAllowableOffset=&geometryPrecision=&outSR=&gdbVersion=&historicMoment=&returnDistinctValues=false&returnIdsOnly=false&returnCountOnly=false&returnExtentOnly=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&returnZ=false&returnM=false&multipatchOption=&resultOffset=&resultRecordCount=&returnTrueCurves=false&sqlFormat=none&f=geojson"

In [None]:
def rest_server(layers=range(0, 22)):
    """Download the given layers of the feature server and stack them.

    Each layer is read from `url_pt1 + <layer number> + url_pt2` with
    geopandas, its columns are snake-cased, and all layers are concatenated
    row-wise. Row labels are kept as read, so they repeat across layers.

    Args:
        layers: Iterable of layer numbers to download. Defaults to layers
            0 through 21.

    Returns:
        The concatenation of all downloaded layers, or an empty DataFrame
        when `layers` is empty.
    """
    # NOTE(review): the query string leaves resultOffset/resultRecordCount
    # blank; if a layer holds more features than the server returns per
    # request, the layer may come back truncated - confirm against the service.
    frames = [
        to_snakecase(gpd.read_file(f"{url_pt1}{layer}{url_pt2}"))
        for layer in layers
    ]

    if not frames:
        return pd.DataFrame()

    # Concatenate once: avoids re-copying the accumulated rows on every
    # iteration and does not depend on how pandas treats an empty seed frame.
    return pd.concat(frames, axis=0)

In [None]:
sb1_geo1 = rest_server()

In [None]:
# tircp = to_snakecase(gpd.read_file("https://odpsvcs.dot.ca.gov/arcgis/rest/services/RCA/RCA_Projects_032022/FeatureServer/5/query?where=1%3D1&objectIds=&time=&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&distance=&units=esriSRUnit_Foot&relationParam=&outFields=*+&returnGeometry=true&maxAllowableOffset=&geometryPrecision=&outSR=&gdbVersion=&historicMoment=&returnDistinctValues=false&returnIdsOnly=false&returnCountOnly=false&returnExtentOnly=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&returnZ=false&returnM=false&multipatchOption=&resultOffset=&resultRecordCount=&returnTrueCurves=false&sqlFormat=none&f=geojson"))

In [None]:
# atp = to_snakecase(gpd.read_file("https://odpsvcs.dot.ca.gov/arcgis/rest/services/RCA/RCA_Projects_032022/FeatureServer/12/query?where=1%3D1&objectIds=&time=&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&distance=&units=esriSRUnit_Foot&relationParam=&outFields=*+&returnGeometry=true&maxAllowableOffset=&geometryPrecision=&outSR=&gdbVersion=&historicMoment=&returnDistinctValues=false&returnIdsOnly=false&returnCountOnly=false&returnExtentOnly=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&returnZ=false&returnM=false&multipatchOption=&resultOffset=&resultRecordCount=&returnTrueCurves=false&sqlFormat=none&f=geojson"))

In [None]:
sb1_geo2 = basic_cleaning(sb1_geo1, 'agencies','projecttitle','projectid',
                         'projectdescription', 'sb1')

In [None]:
sb1_geo2['projecttitle'] = sb1_geo2['projecttitle'].fillna(sb1_geo2['popup'])

In [None]:
sb1_geo2.programcodes.value_counts()

#### Check geometries
* Remove rows with empty geometry.

In [None]:
# All geometry is valid
#sb1_geo2.geometry.is_valid.sum() == len(sb1_geo2)

In [None]:
#sb1_geo2.geometry.notna().sum()  == len(sb1_geo2)

In [None]:
#len(sb1_geo2[~sb1_geo2.geometry.is_empty]) 

In [None]:
# Throw out missing geometry
missing_geo = sb1_geo2[sb1_geo2.geometry.is_empty]

In [None]:
len(missing_geo)

In [None]:
sb1_geo2 = sb1_geo2[~sb1_geo2.geometry.is_empty].reset_index(drop = True)

In [None]:
# len(sb1_geo2) == len(sb1_geo2[~sb1_geo2.geometry.is_empty]) 

In [None]:
sb1_geo2.shape

In [None]:
# sb1_geo.countynames.sort_values().unique()

In [None]:
# sb1_geo.explore()

### Read in all projects
* Need this because not every file has project names.

In [None]:
sb1_all_projects_url = f"{url_pt1}22{url_pt2}"

In [None]:
sb1_all_projects = to_snakecase(gpd.read_file(sb1_all_projects_url))

In [None]:
sb1_all_projects.geometry.value_counts()

In [None]:
# No geometry, just drop it
sb1_all_projects = sb1_all_projects.drop(columns=["geometry"])

In [None]:
sb1_all_projects = basic_cleaning(sb1_all_projects, 'implementingagency','projecttitle','projectid',
                         'projectdescription', 'sb1')

In [None]:
sb1_all_projects.shape, sb1_all_projects.projecttitle.nunique()

In [None]:
# Project ID matches...
all_projects_project_id = set(sb1_all_projects.projectid.unique().tolist())
sb1_geo2_project_id = set(sb1_geo2.projectid.unique().tolist())

In [None]:
len(list(set(sb1_geo2_project_id).intersection(all_projects_project_id)))

In [None]:
len(sb1_geo2_project_id - all_projects_project_id)

In [None]:
len(sb1_geo2)

In [None]:
# None of the object id matches 
all_projects_object_id = set(sb1_all_projects.objectid.unique().tolist())
sb1_geo2_object_id = set(sb1_geo2.objectid.unique().tolist())

In [None]:
len(sb1_geo2_object_id - all_projects_object_id), len(sb1_geo2_object_id)

In [None]:
len(all_projects_object_id - sb1_geo2_object_id)

In [None]:
# set(sb1_all_projects.columns).difference(set(sb1_csv.columns))

In [None]:
m1 = pd.merge(
    sb1_geo2,
    sb1_all_projects[['projectid','projecttitle']],
    how="left",
    on=["projectid"],
)

In [None]:
m1.projecttitle_x.value_counts().head()

In [None]:
m1.projecttitle_y.isna().sum()

In [None]:
len(sb1_all_projects), len(sb1_geo2), 5513+4415

In [None]:
pd.merge(
    sb1_all_projects,
    sb1_geo2,
    how="left",
    left_on=[ "ct_districts", "agencyid", "totalcost", "fiscalyearcode"],
    right_on=["ct_districts", "agencyids", "totalcost", "fiscalyearcodes"],
    indicator=True,
)[["_merge"]].value_counts()

In [None]:
m1 = pd.merge(
    sb1_all_projects,
    sb1_geo2[['projecttitle', "programcodes", "totalcost", "agencies", "countynames"]],
    how="right",
    left_on=["programcodes", "totalcost", "implementingagency", "countynames"],
    right_on=["programcodes", "totalcost", "agencies", "countynames"],
    indicator=True,
)

In [None]:
m1.projecttitle_x = m1.projecttitle_x.fillna(m1.projecttitle_y)

In [None]:
m1._merge.value_counts(), len(m1)

In [None]:
m1[['projecttitle_x','programcodes',]].sort_values('programcodes').head()

In [None]:
m3 = pd.merge(
    sb1_geo2,
    sb1_all_projects[['projecttitle', "programcodes", "totalcost", "implementingagency", "fiscalyearcode"]],
    how="left",
    left_on=[ "programcodes", "totalcost", "agencies","fiscalyearcodes"],
    right_on=[ "programcodes", "totalcost", "implementingagency","fiscalyearcode"],
    indicator = True
)

In [None]:
m3._merge.value_counts()

In [None]:
m3.projecttitle_x = m3.projecttitle_x.fillna(m3.projecttitle_y)

In [None]:
m3.projecttitle_x = m3.projecttitle_x.fillna('None')

In [None]:
m3[['projecttitle_x','programcodes',]].sort_values('programcodes').head()

In [None]:
# m2[['projecttitle_x','projecttitle_y', 'programcodes',]].sort_values('programcodes')

### Compare with ALL Projects
* m3 is the merge with the most results.

In [None]:
nonshopp_sb1_m1 = pd.merge(
    m3.loc[m3.projecttitle_x != "None"].drop(columns = ["_merge"])[['projecttitle_x','projectdescription','countynames','geometry']],
    nonshopp,
    how="inner",
    left_on=[ "projecttitle_x"],
    right_on=["project_name",],
    indicator=True,
)

In [None]:
len(nonshopp_sb1_m1), type(nonshopp_sb1_m1)

In [None]:
# nonshopp_sb1_m1.explore()

In [None]:
# nonshopp_sb1_m1[['previous_caltrans_nominations','project_name','projecttitle_x','project_description','projectdescription', 'full_county_name', 'countynames']]

In [None]:
found_projects = nonshopp_sb1_m1.projecttitle_x.unique().tolist()

In [None]:
# Filter out results before fuzzy matching
m3 = m3[~m3["projecttitle_x"].isin(found_projects)].reset_index(drop = True)

In [None]:
# Filter out the projects already found above.
nonshopp2 = nonshopp[~nonshopp["project_name"].isin(found_projects)].reset_index(drop = True)

#### Fuzzy Matches

##### Try with Project Titles

In [None]:
def replace_matches_in_column(
    df, column, new_col_name, string_to_match, min_ratio, scorer=None, limit=10
):
    """Tag rows of `df` whose `column` value fuzzily matches `string_to_match`.

    The unique, non-missing values of `df[column]` are scored against
    `string_to_match`; every row whose value is among the `limit` best
    candidates and scores strictly above `min_ratio` gets `string_to_match`
    written to `df[new_col_name]`. `df` is modified in place (the column is
    created if needed) and nothing is returned.

    Args:
        df: DataFrame to search and annotate.
        column: Column holding the strings to compare.
        new_col_name: Column that receives `string_to_match` on matching rows.
        string_to_match: The string to look for.
        min_ratio: Score a candidate must exceed (strictly) to count.
        scorer: fuzzywuzzy scorer; defaults to `fuzz.token_sort_ratio`.
        limit: How many of the best-scoring candidates are considered.
    """
    # Imported here so the function does not depend on `fuzzywuzzy.fuzz`
    # happening to be loaded by another import.
    from fuzzywuzzy import fuzz, process

    if scorer is None:
        scorer = fuzz.token_sort_ratio

    # Unique strings to compare against; missing values are excluded so they
    # are never scored as text.
    candidates = df[column].dropna().unique()

    # Best `limit` candidates as (candidate, score) pairs.
    if len(candidates) == 0:
        scored = []
    else:
        scored = process.extract(
            string_to_match, candidates, limit=limit, scorer=scorer
        )

    # Keep only the candidates scoring above the threshold.
    close_matches = [candidate for candidate, score in scored if score > min_ratio]

    # Rows holding any of the close matches.
    rows_with_matches = df[column].isin(close_matches)

    # Always assign, even when nothing matched, so `new_col_name` exists
    # afterwards (filled with NaN where there is no match).
    df.loc[rows_with_matches, new_col_name] = string_to_match

In [None]:
nonshopp_projects = nonshopp2.project_name.unique().tolist()

In [None]:
# Number of words in each (possibly back-filled) project title. The pattern is
# a raw string so that `\w` is not parsed as an invalid escape sequence.
m3["projecttitle_x_count"] = m3["projecttitle_x"].str.count(r'\w+')

In [None]:
# Delete project titles that are short
sb1_w_projectnames = (m3.loc[m3.projecttitle_x_count > 3]).reset_index(drop = True)

In [None]:
sb1_w_projectnames.shape

In [None]:
for i in nonshopp_projects:
    replace_matches_in_column(
        sb1_w_projectnames
        , "projecttitle_x", "project_title_fuzzy_match", i,90 
    )

In [None]:
# Drop nulls and duplicates
fuzzy_match_results = (sb1_w_projectnames.loc[sb1_w_projectnames.project_title_fuzzy_match.notnull()]
                       .drop_duplicates(subset = ["projecttitle_x", "project_title_fuzzy_match", "projectdescription"])
                       .reset_index(drop = True)
                       .sort_values('projecttitle_x')
                      )

In [None]:
len(fuzzy_match_results)

In [None]:
# Last one isn't the same.
fuzzy_match_results[['projecttitle_x','project_title_fuzzy_match', 'countynames', 'fiscalyears']]

In [None]:
nonshopp_sb1_m2 = pd.merge(
    fuzzy_match_results.drop(columns = ["_merge"]).head(4)[['projecttitle_x','project_title_fuzzy_match','projectdescription','countynames','geometry']],
    nonshopp2,
    how="inner",
    left_on=["project_title_fuzzy_match", "countynames"],
    right_on=["project_name", "full_county_name"],
)

In [None]:
type(nonshopp_sb1_m2), len(nonshopp_sb1_m2)

In [None]:
found_projects2 = nonshopp_sb1_m2.projecttitle_x.unique().tolist()

In [None]:
found_projects3 = nonshopp_sb1_m2.project_title_fuzzy_match.unique().tolist()

In [None]:
# Filter out the projects already found above.
m3 = m3[~m3["projecttitle_x"].isin(found_projects2)].reset_index(drop = True)

In [None]:
# Filter out the projects already found above.
nonshopp2 = nonshopp2[~nonshopp2["project_name"].isin(found_projects3)].reset_index(drop = True)

##### Try with project description since titles are very vague.

In [None]:
def replace_matches_set_ratio(
    df, column, new_col_name, string_to_match, min_ratio, limit=10
):
    """Tag rows of `df` whose `column` value matches `string_to_match`.

    Scoring uses fuzzywuzzy's `token_set_ratio`. The unique, non-missing
    values of `df[column]` are scored against `string_to_match`; every row
    whose value is among the `limit` best candidates and scores strictly
    above `min_ratio` gets `string_to_match` written to `df[new_col_name]`.
    `df` is modified in place (the column is created if needed) and nothing
    is returned.

    Args:
        df: DataFrame to search and annotate.
        column: Column holding the strings to compare.
        new_col_name: Column that receives `string_to_match` on matching rows.
        string_to_match: The string to look for.
        min_ratio: Score a candidate must exceed (strictly) to count.
        limit: How many of the best-scoring candidates are considered.
    """
    # Imported here so the function does not depend on `fuzzywuzzy.fuzz`
    # happening to be loaded by another import.
    from fuzzywuzzy import fuzz, process

    # Unique strings to compare against; missing values are excluded so they
    # are never scored as text.
    candidates = df[column].dropna().unique()

    # Best `limit` candidates as (candidate, score) pairs.
    if len(candidates) == 0:
        scored = []
    else:
        scored = process.extract(
            string_to_match, candidates, limit=limit, scorer=fuzz.token_set_ratio
        )

    # Keep only the candidates scoring above the threshold.
    close_matches = [candidate for candidate, score in scored if score > min_ratio]

    # Rows holding any of the close matches.
    rows_with_matches = df[column].isin(close_matches)

    # Always assign, even when nothing matched, so `new_col_name` exists
    # afterwards (filled with NaN where there is no match).
    df.loc[rows_with_matches, new_col_name] = string_to_match

In [None]:
# Only include projects with a long enough desc
# Drop dups
sb1_w_projectdesc = ((m3.loc[m3.projectdescription_count > 10])
.drop_duplicates(subset = ["projecttitle_x", "projectdescription", "countynames"])
.reset_index(drop = True)
                    )

In [None]:
# Test with nonshopp 
nonshopp_with_desc = ((nonshopp2.loc[nonshopp2.project_description_count > 10])
                     .drop_duplicates(subset = ["project_name", "project_description", "full_county_name"]).reset_index(drop = True)
                    )

In [None]:
# Reverse -> replace 
#for i in sb1_w_projectdesc["projectdescription"].loc[1001:3000].unique().tolist():
#    replace_matches_set_ratio(
#        nonshopp_with_desc, "project_description", "project_desc_fuzzy_match", i, 95
#  )

In [None]:
# nonshopp_with_desc = nonshopp_with_desc.loc[nonshopp_with_desc.project_desc_fuzzy_match.notnull()].reset_index(drop = True)

In [None]:
# len(nonshopp_with_desc)

In [None]:
#nonshopp_with_desc[['project_description','project_desc_fuzzy_match']]

In [None]:
# fuzzy_desc_list = nonshopp_with_desc.project_description.unique().tolist()

In [None]:
fuzzy_desc_list = ['in humboldt county on us 101 between the 6th street interchange pm 984 and the trinidadmain street interchange pm 1007 and along scenic drive and westhaven drive between the community of westhaven and trinidad construction of a new interchange on us 101 and new road connections from us 101 to scenic drive and westhaven drive associated pedestrian and bicycle improvements through the new interchange and along scenic drive and westhaven drive other work will include associated grading drainage utility relocations signage and striping retaining walls etckim floyd 707 4415739',
 'on route 70 from laurellen to yubabutte county line continuous passing lanes\n\nconstruction 192 of passing lane miles on the state highway',
 'eastbound sr12 to eastbound i80 connector  this project would construct a new connector from eastbound sr12 to eastbound i80 which will also connect two previous cmia projects icp and jameson canyon widening projects reconstruction of the i80i680sr12 interchange complex phase 1 includes remaining packages 3 7\n\nmodifyimprovereconstruct 1 interchange and 1 bridge and construct 1 new bridge',
 'on state route 68 from josselyn canyon road to spreckels blvd operational improvements replaces signalized intersections with roundabouts to achieve smooth traffic flow provide active transportation facilities at intersections and achieve transit benefits achieving these improvements on the existing corridor will preempt need for previouslyconsidered bypass alignment',
 'in fresno and kings county about 6 miles north of lemoore from 03 mile north of excelsior avenue undercrossing to 10 mile north of elkhorn avenue widen from 2lane conventional highway to 4lane expressway this project will eliminate the only remaining bottleneck 2lane section of sr 41 between fresno and lemoore on this interregional route between the san joaquin valley and the central coast and reduce accident rates\n\nconstruct 24 miles of mixed flow lanes 24 miles of mixed flow mainline 8 new curb ramps and 2 intersectionsignal improvements',
 'in fresno county in and near fresno from 04 mile south of american avenue to 04 mile north of north avenue modify interchanges',
 'in and near lost hills from 1 mile west of browns material road to california aquaduct  convert from 2lane conventional highway to 4lane expressway\n\nnew roadway lane miles 69',
 'in diamond bar and the city of industry on route 60 from eb 60 to sb 57 connector overcrossing to near golden springs drive undercrossing and route 57 from nb 57 to wb 60 connector overcrossing to south 5760 separation interchange modifications including auxiliary lanes and three new bridges\n\npse completion target date  22822\n4 interchange modifications 15 miles of auxilary lanes 163929 sqft of new bridgestunnels',
 'add one eastbound auxiliary lane 086 miles on sr91 in the city of long beach on sr91 from i710 to cherry avenue undercrossing the proposed improvements consist of adding one auxiliary lane in the eastbound direction extending the outside 5 lane beyond the atlantic ave eb offramp to cherry ave then dropping it before the cherry ave undercrossing and widening the orange ave and walnut ave undercrossing',
 'replace eastbound rte 91 to northbound rte 71 loop connector with a direct connector ramp realign the green river road eastbound entrance ramp to rte 91 and construct a collectordistributor system on rte 91 in the eastbound direction between the green river road and serfas club drive\n1  traffic monitoring detection station\n1  interchange modification\n1 mile  aux lane constructed\n1  freeway ramp meter\n10000 sqft  new local road bridge structuretunnel\n1  grade separation  rail crossing improvements\n2  new curb ramp installed\n05 miles  aux lanes\n2000 lf  culvert\n25 miles  ramp and connector constructed\n2000000 sqft  modifiedimproved interchange\n1  ramp modification',
 'realign approximately 3 miles of a segment of sr 4 from west of hunt road to appaloosa way beginning approximately 38 miles east of copperopolis and ending approximately 52 miles west of the sr 4sr 49 junction in altaville calaveras county\n\nproject will provide 5 curve and vertical alignment corrections construct 7 turn pocket to improve operations construct 624 miles of mainline shoulders and construct 642 miles of new roadway lane miles',
 'located in the city of tracy at the i580 and international parkwaypatterson pass rd interchange the project proposes to modify the interchange from the existing compact diamond type l1 to a diverging diamond interchange ddi to accommodate increased commercial truck and traffic demand from existing fulfillment distribution center growth and planned future development in san joaquin county the project will improve interregional goods movement for trucks traveling between the port of oakland bay area local warehouses southern california and outofstate while reducing congestion increases truck throughput and eliminate bottleneck for trucks accessing i580 a key freight highway\n\nprovide 1 interchange modifications 2 intersectionsignal improvements 056 miles of communications fiber optics 14 miles of bikeped facilities and modifyimprove 18680 sqft of interchange']

In [None]:
# Non-SHOPP rows whose descriptions are in the hand-picked list of matches.
desc_matches = nonshopp_with_desc[
    nonshopp_with_desc.project_description.isin(fuzzy_desc_list)
].copy()

# `project_desc_fuzzy_match` is only written by the fuzzy-matching loop that is
# commented out above, so on a fresh run the column is missing and the merge
# below would raise a KeyError. When it is missing, rebuild it for just these
# rows: the best-scoring SB1 description (token_set_ratio above 95, the same
# threshold the commented-out loop used).
if "project_desc_fuzzy_match" not in desc_matches.columns:
    sb1_descriptions = sb1_w_projectdesc["projectdescription"].dropna().unique()

    best_sb1_descriptions = []
    for nonshopp_desc in desc_matches["project_description"]:
        best = None
        if len(sb1_descriptions) > 0:
            # Scores are integers, so a cutoff of 96 means "strictly above 95".
            best = fuzzywuzzy.process.extractOne(
                nonshopp_desc,
                sb1_descriptions,
                scorer=fuzzywuzzy.fuzz.token_set_ratio,
                score_cutoff=96,
            )
        best_sb1_descriptions.append(best[0] if best is not None else None)

    desc_matches["project_desc_fuzzy_match"] = best_sb1_descriptions
    # Rows without a match must not take part in the merge: missing keys
    # would otherwise pair up with SB1 rows that have no description.
    desc_matches = desc_matches.dropna(subset=["project_desc_fuzzy_match"])

nonshopp_sb1_m3 = pd.merge(
    m3,
    desc_matches,
    how="inner",
    left_on=["projectdescription", "countynames"],
    right_on=["project_desc_fuzzy_match", "full_county_name"],
)

In [None]:
len(nonshopp_sb1_m3)

In [None]:
type(nonshopp_sb1_m3)

In [None]:
# nonshopp_sb1_m3

### Compare with 9 Sample Projects

In [None]:
# Caltrans project IDs (matched against `ct_project_id`) of the nine sample
# projects, one entry per project.
nine_projects_id = [
    "0422000202",
    "0414000032",
    "0520000083",
    "0515000063",
    "0721000056",
    "0716000370",
    "0813000222",
    "0814000144",
    "0720000165",
]

In [None]:
nine_sample_projects = (nonshopp2[nonshopp2.ct_project_id.isin(nine_projects_id)].reset_index(drop=True))

In [None]:
len(nine_sample_projects)

In [None]:
# Subset sb1_geo to only programs these 9 projects have applied for
tcep_sccp1 = m3[m3["programcodes"].str.contains(('TCEP|SCCP'))].reset_index(drop = True)

In [None]:
# Subset the SB1 rows (m3) to the counties these 9 projects are located in.
# NOTE(review): the next cell reassigns tcep_sccp2 using the TCEP/SCCP program
# filter, so this county subset is never used - confirm which one was intended.
tcep_sccp2 = m3[m3["countynames"].str.contains(('Alameda|San Mateo|Santa Cruz|San Luis Obispo|Los Angeles|San Bernardino|Riverside'))].reset_index(drop = True)

In [None]:
# Subset sb1_geo to only programs these 9 projects have applied for
tcep_sccp2 = m3[m3["programcodes"].str.contains(('TCEP|SCCP'))].reset_index(drop = True)

In [None]:
tcep_sccp2.shape

In [None]:
tcep_sccp2.programcodes.value_counts()

In [None]:
tcep_sccp2.columns

In [None]:
pd.merge(
    nine_sample_projects,
    tcep_sccp2.drop(columns = ["_merge"]),
    how="outer",
    left_on=["ct_project_id"],
    right_on=["projectid"],
    indicator=True,
)[["_merge"]].value_counts()

In [None]:
pd.merge(
    nine_sample_projects,
    tcep_sccp2.drop(columns = ["_merge"]),
    how="outer",
    left_on=["project_name"],
    right_on=["projecttitle_x"],
    indicator=True,
)[["_merge"]].value_counts()

* Eyeballing matches
    * route 395 widening from sr 18 to chamberlaine way in SB1 could match us 395 freight mobility and safety project in Non SHOPP
    * state route 1 state park to bayporter auxiliary lanes in SB1 is watsonvillesanta cruz multimodal corridor program wscmcp cycle 3 project contract 1 sr 1 freedom to state park aux lanes bus on shoulders and coastal rail trail segment 12 in non SHOPP

In [None]:
# tcep_sccp2[['projecttitle_x','agencies','countynames', 'projectdescription']].sort_values(['countynames','projecttitle_x'])

In [None]:
# nine_sample_projects[['project_name','lead_agency','full_county_name','project_description']].sort_values(['full_county_name','project_name'])

In [None]:
non_shopp_projects_sb1_list = ['route 395 widening from sr 18 to chamberlaine way', 
                             'state route 1  state park to bayporter auxiliary lanes']

In [None]:
non_shopp_projects_in_sb1 = tcep_sccp2[tcep_sccp2["projecttitle_x"].isin(non_shopp_projects_sb1_list)].reset_index(drop = True)

In [None]:
non_shopp_projects_in_sb1.projecttitle_x = non_shopp_projects_in_sb1.projecttitle_x.replace({
    'route 395 widening from sr 18 to chamberlaine way': 'us 395 freight mobility and safety project',
    'state route 1  state park to bayporter auxiliary lanes': 'watsonvillesanta cruz multimodal corridor program wscmcp cycle 3 project contract 1  sr 1 freedom to state park aux lanes bus on shoulders and coastal rail trail segment 12'
})

In [None]:
non_shopp_projects_in_sb1[['agencies','programcodes', 'countynames','projecttitle_x','projectdescription']]

In [None]:
nonshopp_sb1_m4 = pd.merge(
    non_shopp_projects_in_sb1,
    nine_sample_projects,
    how="inner",
    left_on=["projecttitle_x"],
    right_on=["project_name"],
)

In [None]:
len(nonshopp_sb1_m4)

In [None]:
# nine_sample_projects_geo[['project_name','projecttitle','project_description','projectdescription', 'full_county_name']]

In [None]:
#  _utils.geojson_gcs_export(nine_sample_projects_geo,_utils.GCS_FILE_PATH, 'nine_sample_projects_geom')

### Concat all the matches

In [None]:
nonshopp_geo = pd.concat([nonshopp_sb1_m1, nonshopp_sb1_m2, nonshopp_sb1_m3, nonshopp_sb1_m4], sort=False)

In [None]:
type(nonshopp_geo)

In [None]:
len(nonshopp_geo)

In [None]:
sb1_cols = m3.columns.tolist()

In [None]:
sb1_cols.remove('geometry')

In [None]:
sb1_cols.extend(['project_name_count',
 'project_description_count',
 'project_title_fuzzy_match',
 'project_desc_fuzzy_match'])

In [None]:
# Delete out the columns from SB1 except geometry
nonshopp_geo2 = nonshopp_geo.drop(columns = sb1_cols)

In [None]:
nonshopp_geo2[['previous_caltrans_nominations']].drop_duplicates()

In [None]:
# nonshopp_geo2.explore(tooltip = ['project_name','project_description', 'lead_agency', 'previous_caltrans_nominations'], marker_type='marker', marker_kwds={"radius":3})

In [None]:
_utils.geojson_gcs_export(nonshopp_geo2,_utils.GCS_FILE_PATH, 'nonshopp_gdf')