## Add SB1 geographic information.

In [116]:
import _utils
import geopandas as gpd
import numpy as np
import pandas as pd
from calitp.sql import to_snakecase

In [117]:
import fuzzywuzzy
from fuzzywuzzy import process

In [118]:
import fsspec
from calitp import *
from calitp.storage import get_fs

fs = get_fs()
import os

In [119]:
# Notebook display settings: show up to 200 columns and never truncate
# rows or cell contents when a frame is rendered.
for _option, _value in (
    ("display.max_columns", 200),
    ("display.max_rows", None),
    ("display.max_colwidth", None),
):
    pd.set_option(_option, _value)

In [120]:
def basic_cleaning(df, agency_col: str,
                   project_name_col:str,
                   project_id_col: str, 
                   project_desc_col:str,
                   data:str):
    """
    Normalize the agency, title, description and ID columns of a project
    table so that two sources can be compared/merged on them.

    Steps, in order:
      1. Pass the frame through `_utils.organization_cleaning(df, agency_col)`
         (defined outside this notebook).
      2. For the title and description columns: lowercase, drop every
         character that is neither a word character nor whitespace, strip,
         and store the number of word tokens in a new "<col>_count" column.
      3. Cut the title at the first occurrence of "20".
      4. Project ID: remove apostrophes, lowercase, strip.
      5. Remove "|" from every object-dtype column that holds strings.
      6. If a "popup" column exists, replace it with a cleaned title taken
         from the second "<br  />"-separated segment of the popup text.

    Args:
        df: table to clean. Columns are assigned on the frame returned by
            `_utils.organization_cleaning`, so the input may be modified.
        agency_col: name of the agency column.
        project_name_col: name of the project title column.
        project_id_col: name of the project ID column (must hold strings).
        project_desc_col: name of the project description column.
        data: label for the dataset (e.g. "sb1", "nonshopp"). Currently
            unused; kept so existing calls keep working.

    Returns:
        The cleaned frame.
    """
    df = _utils.organization_cleaning(df, agency_col)

    # Lowercase, remove punctuation and strip whitespace from titles and
    # descriptions, then count the words. `regex=True` is passed explicitly
    # because the pandas default changed to False in 2.0, which would leave
    # the punctuation in place.
    for col in [project_name_col, project_desc_col]:
        df[col] = (
            df[col]
            .str.lower()
            .str.replace(r"[^\w\s]", "", regex=True)
            .str.strip()
        )
        df[f"{col}_count"] = df[col].str.count(r"\w+")

    # Some project names contain the year: keep only what precedes the first
    # "20". NOTE: this also truncates names that contain "20" for any other
    # reason (e.g. "route 20 ..."), and the "<col>_count" above is computed
    # before this truncation.
    df[project_name_col] = df[project_name_col].str.split("20").str[0]

    # Project ID: remove apostrophes, lowercase and strip whitespace.
    df[project_id_col] = (
        df[project_id_col]
        .str.replace("'", "", regex=False)
        .str.lower()
        .str.strip()
    )

    # Get rid of | in object cols.
    # https://stackoverflow.com/questions/68152902/extracting-only-object-type-columns-in-a-separate-list-from-a-data-frame-in-pand
    # The guard is per column: an object column that does not hold strings is
    # skipped on its own instead of aborting the clean-up of all later columns.
    for col in df.select_dtypes(include=["object"]).columns.to_list():
        try:
            df[col] = df[col].str.replace("|", "", regex=False)
        except (AttributeError, TypeError):
            continue

    # Try to extract titles from popups (only some tables have this column).
    if "popup" in df.columns:
        try:
            df["popup"] = (
                df["popup"]
                .str.split("<br  />").str[1]
                .str.split("20").str[0]
                .str.lower()
                .str.strip()
                .str.replace(r"[^\w\s]", "", regex=True)
            )
        except AttributeError:
            # Best effort: leave a non-string popup column untouched.
            pass

    return df

### Non SHOPP
* No year information for projects.

In [121]:
# Read in 10 Year non SHOPP with ATP and TIRCP
nonshopp = to_snakecase(
    pd.read_excel(f"{_utils.GCS_FILE_PATH}cleaned_data_atp_tircp.xlsx")
)

In [122]:
# Columns of the non-SHOPP table to keep when joining.
# Each column is listed once: a repeated name would produce a duplicated
# column when this list is used to subset a frame.
non_shopp_subset = [
    "ppno",
    "ct_project_id",
    "ea",
    "project_name",
    "lead_agency",
    "previous_caltrans_nominations",
    "full_county_name",
    "county",
    "district_full_name",
    "district",
    "project_description",
    "current_phase",
    "primary_mode",
    "urban_rural",
    "total_project_cost__$1,000",
    "total_unfunded_need__$1,000",
    "shs_capacity_increase_detail",
]

In [123]:
# nonshopp = nonshopp[non_shopp_subset]

In [124]:
# Format every district value with a minimum width of two, zero-filled
# (an integer 7 becomes "07").
nonshopp["district"] = nonshopp["district"].map("{:02}".format)

In [125]:
nonshopp = basic_cleaning(nonshopp, "lead_agency", "project_name", "ct_project_id", "project_description", "nonshopp")



In [126]:
nonshopp[['project_name', 'project_name_count']].sample(3)

Unnamed: 0,project_name,project_name_count
729,renewable diesel and aftertreatment,4
576,stanislaus river bridge mp 10439,5
687,rail crossing and grade separation safety improvements,7


### 9 Sample Non SHOPP 

In [127]:
nine_projects_names = [
    "LA-210 Median Concrete Barrier Renovation",
    "SR-14 Widening Project",
    "US 395 Freight Mobility and Safety Project",
    "East Bay Greenway Multimodal Corridor Project",
    "Watsonville-Santa Cruz Multimodal Corridor Program",
    "SM 101 Woodside Road Interchange and Port Access Project",
    "I-710 Integrated Corridor Management",
    "Five Cities Multimodal Transportation Network Enhancement Project",
    "SR-86/Avenue 50 New Interchange (Phase II)",
]

In [128]:
# nine_projects_names = [x.lower() for x in nine_projects_names]

In [129]:
# Caltrans project IDs of the nine sample projects, one entry per project.
# "0414000032" is listed once; repeating it does not change an `isin`
# lookup but made the list ten entries long.
nine_projects_id = [
    "0422000202",
    "0414000032",
    "0520000083",
    "0515000063",
    "0721000056",
    "0716000370",
    "0813000222",
    "0814000144",
    "0720000165",
]

In [130]:
# nine_sample_projects = (nonshopp[nonshopp.ct_project_id.isin(nine_projects_id)].reset_index(drop=True))

* Solutions for Congested Corridors Program (SCCP): 1
* Trade Corridor Enhancement Program (TCEP): 3
* Only 3 projects seem to have been awarded. 
    * east bay greenway multimodal corridor project phase 1
    * us 101woodside road interchange and port access project
    * watsonvillesanta cruz multimodal corridor program wscmcp cycle 3 project contract 1 sr 1 freedom to state park aux lanes bus on shoulders and coastal rail trail segment 12

In [131]:
# nine_sample_projects[['project_name','project_description','county','previous_caltrans_nominations']]

### SB1 Geo
* https://odpsvcs.dot.ca.gov/arcgis/rest/services/RCA/RCA_Projects_032022/FeatureServer

In [132]:
# Subset to preview SB1 vs Nonshopp. Nonshopp is on the left, sb1 on the right
preview_cols =  ['project_name','projecttitle','project_description','projectdescription', 'full_county_name', 'countynames']

In [133]:
url_pt1 = "https://odpsvcs.dot.ca.gov/arcgis/rest/services/RCA/RCA_Projects_032022/FeatureServer/"
url_pt2 = "/query?where=1%3D1&objectIds=&time=&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&distance=&units=esriSRUnit_Foot&relationParam=&outFields=*+&returnGeometry=true&maxAllowableOffset=&geometryPrecision=&outSR=&gdbVersion=&historicMoment=&returnDistinctValues=false&returnIdsOnly=false&returnCountOnly=false&returnExtentOnly=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&returnZ=false&returnM=false&multipatchOption=&resultOffset=&resultRecordCount=&returnTrueCurves=false&sqlFormat=none&f=geojson"

In [134]:
def rest_server(layers=range(22)):
    """
    Download layers of the feature server and stack them into one table.

    For each layer id the query URL is built as
    f"{url_pt1}{layer_id}{url_pt2}" (both parts are notebook-level
    variables), read with `gpd.read_file`, and passed through
    `to_snakecase`.

    Args:
        layers: iterable of layer ids to download. Defaults to 0 through 21,
            the range this notebook has always requested.

    Returns:
        The row-wise concatenation of all downloaded layers. Each layer keeps
        its own row index, so index values repeat across layers. An empty
        `pd.DataFrame` is returned when `layers` is empty.
    """
    # Collect every layer first and concatenate once: concatenating inside
    # the loop re-copies all earlier rows on each iteration, and seeding the
    # result with an empty DataFrame triggers a FutureWarning on recent pandas.
    layer_frames = [
        to_snakecase(gpd.read_file(f"{url_pt1}{layer_id}{url_pt2}"))
        for layer_id in layers
    ]

    if not layer_frames:
        return pd.DataFrame()

    return pd.concat(layer_frames, axis=0)

In [135]:
sb1_geo1 = rest_server()

In [136]:
# tircp = to_snakecase(gpd.read_file("https://odpsvcs.dot.ca.gov/arcgis/rest/services/RCA/RCA_Projects_032022/FeatureServer/5/query?where=1%3D1&objectIds=&time=&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&distance=&units=esriSRUnit_Foot&relationParam=&outFields=*+&returnGeometry=true&maxAllowableOffset=&geometryPrecision=&outSR=&gdbVersion=&historicMoment=&returnDistinctValues=false&returnIdsOnly=false&returnCountOnly=false&returnExtentOnly=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&returnZ=false&returnM=false&multipatchOption=&resultOffset=&resultRecordCount=&returnTrueCurves=false&sqlFormat=none&f=geojson"))

In [184]:
# atp = to_snakecase(gpd.read_file("https://odpsvcs.dot.ca.gov/arcgis/rest/services/RCA/RCA_Projects_032022/FeatureServer/12/query?where=1%3D1&objectIds=&time=&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&distance=&units=esriSRUnit_Foot&relationParam=&outFields=*+&returnGeometry=true&maxAllowableOffset=&geometryPrecision=&outSR=&gdbVersion=&historicMoment=&returnDistinctValues=false&returnIdsOnly=false&returnCountOnly=false&returnExtentOnly=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&returnZ=false&returnM=false&multipatchOption=&resultOffset=&resultRecordCount=&returnTrueCurves=false&sqlFormat=none&f=geojson"))

In [185]:
# atp.columns

In [138]:
sb1_geo2 = basic_cleaning(sb1_geo1, 'agencies','projecttitle','projectid',
                         'projectdescription', 'sb1')



In [139]:
sb1_geo2['projecttitle'] = sb1_geo2['projecttitle'].fillna(sb1_geo2['popup'])

In [140]:
sb1_geo2.programcodes.value_counts()

SHOPP    2741
HM       1163
LSR       509
ATP       321
SHOPA     165
SGR       161
STIP      126
TIRCP      96
LPP-F      68
TCEP       63
LPP-C      57
STA        49
SCCP       40
FM         12
SRA        11
Name: programcodes, dtype: int64

#### Check geometries
* Drop the one row whose geometry is empty.

In [141]:
# All geometry is valid
#sb1_geo2.geometry.is_valid.sum() == len(sb1_geo2)

In [142]:
#sb1_geo2.geometry.notna().sum()  == len(sb1_geo2)

In [143]:
#len(sb1_geo2[~sb1_geo2.geometry.is_empty]) 

In [144]:
# Throw out missing geometry
missing_geo = sb1_geo2[sb1_geo2.geometry.is_empty]

In [145]:
len(missing_geo)

1

In [146]:
sb1_geo2 = sb1_geo2[~sb1_geo2.geometry.is_empty].reset_index(drop = True)

In [147]:
# len(sb1_geo2) == len(sb1_geo2[~sb1_geo2.geometry.is_empty]) 

In [148]:
sb1_geo2.shape

(5581, 40)

In [149]:
# sb1_geo.countynames.sort_values().unique()

In [150]:
# sb1_geo.explore()

### Read in all projects
* Need this because not every file has project names.

In [151]:
sb1_all_projects_url = f"{url_pt1}22{url_pt2}"

In [152]:
sb1_all_projects = to_snakecase(gpd.read_file(sb1_all_projects_url))

In [153]:
sb1_all_projects.geometry.value_counts()

Series([], Name: geometry, dtype: int64)

In [154]:
# No geometry, just drop it
sb1_all_projects = sb1_all_projects.drop(columns=["geometry"])

In [155]:
sb1_all_projects = basic_cleaning(sb1_all_projects, 'implementingagency','projecttitle','projectid',
                         'projectdescription', 'sb1')



In [156]:
sb1_all_projects.shape, sb1_all_projects.projecttitle.nunique()

((9632, 38), 5041)

In [157]:
# Project ID matches...
all_projects_project_id = set(sb1_all_projects.projectid.unique().tolist())
sb1_geo2_project_id = set(sb1_geo2.projectid.unique().tolist())

In [158]:
len(list(set(sb1_geo2_project_id).intersection(all_projects_project_id)))

3954

In [159]:
len(sb1_geo2_project_id - all_projects_project_id)

95

In [160]:
len(sb1_geo2)

5581

In [161]:
# None of the object IDs match between the two tables.
all_projects_object_id = set(sb1_all_projects.objectid.unique().tolist())
sb1_geo2_object_id = set(sb1_geo2.objectid.unique().tolist())

In [162]:
len(sb1_geo2_object_id - all_projects_object_id), len(sb1_geo2_object_id)

(4614, 4614)

In [163]:
len(all_projects_object_id - sb1_geo2_object_id)

9632

In [164]:
# set(sb1_all_projects.columns).difference(set(sb1_csv.columns))

In [165]:
m1 = pd.merge(
    sb1_geo2,
    sb1_all_projects[['projectid','projecttitle']],
    how="left",
    on=["projectid"],
)

In [166]:
m1.projecttitle_x.value_counts().head()

major damage restoration    467
safety improvements         368
pavement rehabilitation     305
pavement  hm1               220
permanent restoration       149
Name: projecttitle_x, dtype: int64

In [167]:
m1.projecttitle_y.isna().sum()

1321

In [168]:
sb1_all_projects.columns, sb1_geo2.columns

(Index(['objectid', 'projectid', 'projecttitle', 'program', 'iijaprogram',
        'projectdescription', 'totalcost', 'fiscalyear', 'sb1funds',
        'iijafunds', 'agencyid', 'implementingagency', 'assemblydistricts',
        'senatedistricts', 'congressionaldistricts', 'countynames', 'citynames',
        'ct_districts', 'dateupdated', 'projectstatus', 'programcodes',
        'iijacodes', 'issb1', 'isiija', 'isonshs', 'isonshscode', 'issb1code',
        'isiijacode', 'assemblycodes', 'senatecodes', 'congressionalcodes',
        'countycodes', 'citycodes', 'ct_codes', 'fiscalyearcode',
        'projectstatuscodes', 'projecttitle_count', 'projectdescription_count'],
       dtype='object'),
 Index(['objectid', 'agencyids', 'agencies', 'programcodes', 'iijaprogram',
        'iijacodes', 'projectstatuscodes', 'fiscalyears', 'fiscalyearcodes',
        'projectstatuses', 'sb1funds', 'iijafunds', 'totalcost', 'dateupdated',
        'projectcount', 'assemblydistricts', 'senatedistricts',
    

In [192]:
len(sb1_all_projects), len(sb1_geo2), 5513+4415

(9632, 5581, 9928)

In [None]:
pd.merge(
    sb1_all_projects,
    sb1_geo2,
    how="outer",
    left_on=[ "ct_districts", "agencyid", "totalcost", "fiscalyearcode"],
    right_on=["ct_districts", "agencyids", "totalcost", "fiscalyearcodes"],
    indicator=True,
)[["_merge"]].value_counts()

_merge    
left_only     5513
both          4515
right_only    1212
dtype: int64

In [None]:
m1 = pd.merge(
    sb1_all_projects,
    sb1_geo2[['projecttitle', "ct_districts", "programcodes", "totalcost", "agencies"]],
    how="right",
    left_on=[ "ct_districts", "programcodes", "totalcost", "implementingagency"],
    right_on=["ct_districts", "programcodes", "totalcost", "agencies"],
    indicator=True,
)

In [209]:
# m1[['projecttitle_x','projecttitle_y', 'programcodes',]].sort_values('programcodes')

In [194]:
m2 = pd.merge(
     sb1_geo2,
    sb1_all_projects[['projecttitle',  "ct_districts", "agencyid", "totalcost", "fiscalyearcode"]],
    left_on=["ct_districts", "agencyids", "totalcost", "fiscalyearcodes"],
    how="left",
    right_on=[ "ct_districts", "agencyid", "totalcost", "fiscalyearcode"],

    indicator=True,
)

In [195]:
m2.shape

(5727, 44)

In [210]:
# m2[['projecttitle_x','projecttitle_y', 'programcodes',]].sort_values('programcodes')

In [221]:
# Left-join project titles from the all-projects table onto the geo rows,
# matching on program code, total cost and agency name. `_merge` records
# whether each geo row found a counterpart.
m3 = pd.merge(
    left=sb1_geo2,
    right=sb1_all_projects[
        ["projecttitle", "programcodes", "totalcost", "implementingagency"]
    ],
    left_on=["programcodes", "totalcost", "agencies"],
    right_on=["programcodes", "totalcost", "implementingagency"],
    how="left",
    indicator=True,
)

In [222]:
m3._merge.value_counts()

both          5853
left_only     1094
right_only       0
Name: _merge, dtype: int64

In [226]:
type(m3), m3.shape

(geopandas.geodataframe.GeoDataFrame, (6947, 43))

In [233]:
m3 = m3.drop_duplicates().reset_index(drop = True).drop(columns = ['_merge'])

In [229]:
m3.projecttitle_x = m3.projecttitle_x.fillna(m3.projecttitle_y)

In [230]:
m3[['projecttitle_x','programcodes',]].sort_values('programcodes')

Unnamed: 0,projecttitle_x,programcodes
1838,,ATP
1765,loomis town center implementation plan phase 2,ATP
1764,,ATP
1763,mammoth creek gap closure project,ATP
1762,east palo alto highway 101 pedestrian and bicycle overcrossing,ATP
1761,la quinta village complete streets a road diet,ATP
1760,downtown cathedral city connectors gap closure complete streets improvements,ATP
1759,,ATP
1766,santa fe drive corridor bike and pedestrian improvements,ATP
1758,alpine pedal path rathbun creek extension rce big bear lake,ATP


### TIRCP & ATP
* None of the TIRCP projects are matching, even though the names appear to be the same.

In [170]:
nonshopp_preview = ['project_description','project_name','previous_caltrans_nominations']

In [231]:
# Keep only the non-SHOPP projects whose previous nominations mention TIRCP
# or ATP. `na=False` treats a missing nomination as "no match"; without it
# the mask contains NaN and `.loc` refuses to index with it.
tircp_atp_shopp = nonshopp.loc[
    nonshopp.previous_caltrans_nominations.str.contains("TIRCP|ATP", na=False)
].reset_index(drop=True)

In [234]:
# why are there no matches??
m_test = pd.merge(tircp_atp_shopp, m3, how="outer",
         left_on=["project_name"], 
         right_on=["projecttitle_x"], indicator=True)

In [235]:
m_test._merge.value_counts()

right_only    6267
both           107
left_only       45
Name: _merge, dtype: int64

In [176]:
# NOTE(review): `safsd` is not defined anywhere in the visible notebook, so
# this cell raises NameError and stops a "Run All" at this point. Presumably
# a deliberate stop marker before the exploratory cells below - TODO confirm,
# and replace with a markdown note if so.
safsd

NameError: name 'safsd' is not defined

### Compare with 9 Sample Projects

In [None]:
# Subset sb1_geo to only programs these 9 projects have applied for
tcep_sccp1 = sb1_geo2[sb1_geo2["programcodes"].str.contains(('TCEP|SCCP'))].reset_index(drop = True)

In [None]:
# Subset sb1_geo to only the counties these 9 projects are located in
tcep_sccp2 = sb1_geo2[sb1_geo2["countynames"].str.contains(('Alameda|San Mateo|Santa Cruz|San Luis Obispo|Los Angeles|San Bernardino|Riverside'))].reset_index(drop = True)

In [None]:
# Subset sb1_geo to only programs these 9 projects have applied for
tcep_sccp2 = tcep_sccp2[tcep_sccp2["programcodes"].str.contains(('TCEP|SCCP'))].reset_index(drop = True)

In [None]:
tcep_sccp2.shape

In [None]:
tcep_sccp2.programcodes.value_counts()

In [None]:
pd.merge(
    nine_sample_projects,
    tcep_sccp2,
    how="outer",
    left_on=["ct_project_id"],
    right_on=["projectid"],
    indicator=True,
)[["_merge"]].value_counts()

In [None]:
pd.merge(
    nine_sample_projects,
    tcep_sccp2,
    how="outer",
    left_on=["project_name"],
    right_on=["projecttitle"],
    indicator=True,
)[["_merge"]].value_counts()

* Eyeballing matches
    * route 395 widening from sr 18 to chamberlaine way in SB1 could match us 395 freight mobility and safety project in Non SHOPP
    * state route 1 state park to bayporter auxiliary lanes in SB1 is watsonvillesanta cruz multimodal corridor program wscmcp cycle 3 project contract 1 sr 1 freedom to state park aux lanes bus on shoulders and coastal rail trail segment 12 in non SHOPP

In [None]:
# tcep_sccp2[['projecttitle','agencies','countynames', 'projectdescription']].sort_values(['countynames','projecttitle'])

In [None]:
# nine_sample_projects[['project_name','lead_agency','full_county_name','project_description']].sort_values(['full_county_name','project_name'])

In [None]:
non_shopp_projects_sb1_list = ['route 395 widening from sr 18 to chamberlaine way', 
                             'state route 1  state park to bayporter auxiliary lanes']

In [None]:
non_shopp_projects_in_sb1 = tcep_sccp2[tcep_sccp2["projecttitle"].isin(non_shopp_projects_sb1_list)].reset_index(drop = True)

In [None]:
non_shopp_projects_in_sb1.projecttitle = non_shopp_projects_in_sb1.projecttitle.replace({
    'route 395 widening from sr 18 to chamberlaine way': 'us 395 freight mobility and safety project',
    'state route 1  state park to bayporter auxiliary lanes': 'watsonvillesanta cruz multimodal corridor program wscmcp cycle 3 project contract 1  sr 1 freedom to state park aux lanes bus on shoulders and coastal rail trail segment 12'
})

In [None]:
non_shopp_projects_in_sb1[['agencies','programcodes', 'countynames','projecttitle','projectdescription']]

In [None]:
len(non_shopp_projects_in_sb1)

In [None]:
nine_sample_projects_geo = pd.merge(
    non_shopp_projects_in_sb1[['projecttitle','geometry', 'projectdescription']],
    nine_sample_projects,
    how="outer",
    left_on=["projecttitle"],
    right_on=["project_name"],
)

In [None]:
len(nine_sample_projects_geo)

In [None]:
type(nine_sample_projects_geo)

In [None]:
# nine_sample_projects_geo[['project_name','projecttitle','project_description','projectdescription', 'full_county_name']]

In [None]:
#  _utils.geojson_gcs_export(nine_sample_projects_geo,_utils.GCS_FILE_PATH, 'nine_sample_projects_geom')

### Compare with ALL Projects

#### Fuzzy Matches

##### Try with Project Titles

In [None]:
# For every row whose value in `column` fuzzy-matches `string_to_match` above `min_ratio`, write `string_to_match` into `new_col_name`.
def replace_matches_in_column(df, column, new_col_name, string_to_match, min_ratio,
                              scorer=None, limit=10):
    """
    Flag the rows of `df` whose `column` value is a close fuzzy match of
    `string_to_match`.

    The unique non-null values of `df[column]` are scored against
    `string_to_match` with `fuzzywuzzy.process.extract`; the `limit` best
    candidates whose score is strictly greater than `min_ratio` are kept, and
    every row holding one of them gets `string_to_match` written into
    `new_col_name`.

    Args:
        df: frame to search. Modified in place.
        column: name of the column holding the candidate strings.
        new_col_name: name of the column that receives `string_to_match`.
        string_to_match: the string to look for.
        min_ratio: score threshold; a candidate must score above it.
        scorer: fuzzywuzzy scoring function. Defaults to
            `fuzzywuzzy.fuzz.token_sort_ratio`.
        limit: how many top-scoring candidates to consider (default 10).

    Returns:
        None. The result is written to `df[new_col_name]`.
    """
    if scorer is None:
        scorer = fuzzywuzzy.fuzz.token_sort_ratio

    # Unique candidate strings. Nulls are dropped so that a missing title is
    # never handed to the scorer or treated as a match.
    candidates = df[column].dropna().unique()

    # Best `limit` candidates as (candidate, score) pairs.
    scored = fuzzywuzzy.process.extract(
        string_to_match, candidates, limit=limit, scorer=scorer
    )

    # Only keep candidates scoring above the threshold.
    close_matches = [match[0] for match in scored if match[1] > min_ratio]

    # Rows whose value is one of the close matches.
    rows_with_matches = df[column].isin(close_matches)

    # Record which input string those rows matched.
    df.loc[rows_with_matches, new_col_name] = string_to_match

In [None]:
nonshopp_projects = nonshopp.project_name.unique().tolist()

In [None]:
# Delete project titles that are short
sb1_w_projectnames = (sb1_geo2.loc[sb1_geo2.projecttitle_count > 4]).reset_index(drop = True)

In [None]:
sb1_w_projectnames.shape

In [None]:
for i in nonshopp_projects:
    replace_matches_in_column(
        sb1_w_projectnames
        , "projecttitle", "project_title_fuzzy_match", i,90 
    )

In [None]:
# Drop nulls and duplicates
fuzzy_match_results = (sb1_w_projectnames.loc[sb1_w_projectnames.project_title_fuzzy_match.notnull()]
                       .drop_duplicates(subset = ["projecttitle", "project_title_fuzzy_match", "projectdescription"])
                       .reset_index(drop = True)
                      )

In [None]:
len(fuzzy_match_results)

In [None]:
# fuzzy_match_results[['projecttitle','project_title_fuzzy_match', 'countynames', 'fiscalyears']].sort_values('projecttitle')

In [None]:
outer_m_project_titles = pd.merge(
    fuzzy_match_results,
    nonshopp,
    how="outer",
    left_on=["project_title_fuzzy_match", "countynames"],
    right_on=["project_name", "full_county_name"],
    indicator=True)

In [None]:
type(outer_m_project_titles)

In [None]:
outer_m_project_titles.shape

In [None]:
outer_m_project_titles.loc[outer_m_project_titles._merge == 'both'][preview_cols]

##### Try with project description since titles are very vague.

In [None]:
def replace_matches_set_ratio(df, column, new_col_name, string_to_match, min_ratio,
                              limit=10):
    """
    Flag the rows of `df` whose `column` value is a close fuzzy match of
    `string_to_match`, scoring with `fuzzywuzzy.fuzz.token_set_ratio`.

    Same procedure as `replace_matches_in_column`, which scores with
    `token_sort_ratio` instead: the unique non-null values of `df[column]`
    are scored against `string_to_match`, the `limit` best candidates whose
    score is strictly greater than `min_ratio` are kept, and every row
    holding one of them gets `string_to_match` written into `new_col_name`.

    Args:
        df: frame to search. Modified in place.
        column: name of the column holding the candidate strings.
        new_col_name: name of the column that receives `string_to_match`.
        string_to_match: the string to look for.
        min_ratio: score threshold; a candidate must score above it.
        limit: how many top-scoring candidates to consider (default 10).

    Returns:
        None. The result is written to `df[new_col_name]`.
    """
    # Unique candidate strings. Nulls are dropped so that a missing value is
    # never handed to the scorer or treated as a match.
    candidates = df[column].dropna().unique()

    # Best `limit` candidates as (candidate, score) pairs.
    scored = fuzzywuzzy.process.extract(
        string_to_match,
        candidates,
        limit=limit,
        scorer=fuzzywuzzy.fuzz.token_set_ratio,
    )

    # Only keep candidates scoring above the threshold.
    close_matches = [match[0] for match in scored if match[1] > min_ratio]

    # Rows whose value is one of the close matches.
    rows_with_matches = df[column].isin(close_matches)

    # Record which input string those rows matched.
    df.loc[rows_with_matches, new_col_name] = string_to_match

In [None]:
# Only include projects with a long enough desc
# Drop dups
sb1_w_projectdesc = ((sb1_geo2.loc[sb1_geo2.projectdescription_count > 10])
.drop_duplicates(subset = ["projecttitle", "projectdescription", "countynames"])
.reset_index(drop = True)
                    )

In [None]:
# Test with nonshopp 
nonshopp_with_desc = ((nonshopp.loc[nonshopp.project_description_count > 10])
                     .drop_duplicates(subset = ["project_name", "project_description", "full_county_name"]).reset_index(drop = True)
                    )

In [None]:
# Reverse -> replace 
#for i in sb1_w_projectdesc["projectdescription"].loc[3000:3515].unique().tolist():
#    replace_matches_set_ratio(
#        nonshopp_with_desc[['project_name','project_description','project_desc_fuzzy_match','previous_caltrans_nominations']]
#        , "project_description", "project_desc_fuzzy_match", i, 95
#    )

In [None]:
nonshopp_with_desc = nonshopp_with_desc.loc[nonshopp_with_desc.project_desc_fuzzy_match.notnull()].reset_index(drop = True)

In [None]:
fuzzy_descriptions_list = ['in placer county from highway 65 to rocklin road the project will add an auxiliary lane between highway 65 and the rocklin road interchanges providing improved travel time reliability for the more than 90 bus trips that currently pass through this area daily',
 'in sacramento county on watt avenue from i80 westbound ramps to roseville rd  between orange grove avenue and roseville rd construct buffered bike lanes separated pedestrianfriendly sidewalks landscaped medians improved transit facilities for pedestrians including bus turnouts improve street lighting improve signalized intersections and other streetscape amenities to encourage mobility by active modes of transportation and provide community identity  between orange grove avenue to i80 westbound ramps extend class 2 bike lane and sidewalk improvements',
 'from state route 84 alameda county to alcosta boulevard contra costa county project will add one high occupancy hovexpress lane and construct other improvements including replacementupgrade of median concrete barrier and lighting on interstate 680 in the southbound direction between state route sr84 and alcosta boulevard through alameda and contra costa counties',
 'on route 101 in marin county in and near city of novato from just south of the franklin avenue overhead to 03 miles south of the marinsonoma county line  the project will widen route 101 to construct a southbound hov lane from 03 miles south of the marinsonoma county line to just south of the franklin avenue overhead 60 miles and a northbound hov lane from 17 miles north of the atherton avenue overcrossing to 03 miles south of the marinsonoma county line 35 miles the project includes roadway and bridge widening for hov lanes and standard shoulders the project will also upgrade the horizontal and vertical roadway alignment for a 70 mph design speed modify the redwood landfill interchange ramps to conform with the new alignment and restripe a frontage road redwood boulevard for class ii bike lanes in novato',
 'near capitola and aptos state route 1 from state park drive to bayporter interchanges  construct auxiliary lanes between interchanges  includes reconstruction of the capitola avenue overcrossing to accommodate new lanes on state route 1 and improve bicycle and pedestrian facilities hybrid busonshoulderauxiliary lane facility between bay aveporter st and state park dr total distance 3 miles bicyclepedestrian overcrossing of hwy 1 at mar vista dr with sidewalk ada ramps and intersection improvements at bridge approaches additional project elements add emergency pullouts and enforcement areas sound wall retaining walls improved median barrier lighting overhead signs traffic monitoring stations drainage and droughttolerant landscapingsccplpp title watsonvillesanta cruz  multimodal improvements bay aveporter st to state park dr  contract 2',
 'in monterey county at castroville boulevard from post mile r16 to 14  build a new interchange at castroville boulevard and highway 156',
 'in tulare county near the city of tulare at commercial avenue and state route 99 between 09 mile north of avenue 200 oc and paige avenue oc construct new interchange and construct north and south bound auxiliary lanes']

In [None]:
outer_m_project_desc = pd.merge(
    sb1_geo2,
    nonshopp_with_desc[nonshopp_with_desc.project_desc_fuzzy_match.isin(fuzzy_descriptions_list)],
    how="outer",
    left_on=["projectdescription", "countynames"],
    right_on=["project_desc_fuzzy_match", "full_county_name"],
    indicator=True)

In [None]:
outer_m_project_desc._merge.value_counts()