## Add SB1 geographic information.

In [1]:
import _utils
import geopandas as gpd
import numpy as np
import pandas as pd
from calitp.sql import to_snakecase



In [2]:
import fsspec
from calitp import *
from calitp.storage import get_fs

fs = get_fs()
import os

In [3]:
# Notebook display settings: render up to 200 columns, and do not truncate
# the number of rows or the width of cell contents.
pd.set_option("display.max_columns", 200)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [4]:
def basic_cleaning(df, agency_col: str, project_name_col:str, project_id_col: str, project_desc_col:str):
    """
    Standardize the text columns of a projects table so that tables from
    different sources can be compared and merged.

    Steps, in order:
      1. Pass the frame through `_utils.organization_cleaning` for `agency_col`.
      2. Lowercase, strip punctuation and surrounding whitespace from the
         project name and project description.
      3. Truncate the project name at the first occurrence of "20".
      4. Remove apostrophes from the project ID, lowercase it and strip it.
      5. Remove "|" from every object-dtype column that holds strings.
      6. Fill missing values: 0.0 in float64 columns, the string "None" in
         object columns. Columns of any other dtype are left untouched.

    Args:
        df: DataFrame (or GeoDataFrame) of projects.
            NOTE(review): columns are assigned on whatever
            `_utils.organization_cleaning` returns; if that is the same object
            as the input, the caller's frame is modified in place - confirm.
        agency_col: name of the agency column handed to
            `_utils.organization_cleaning`.
        project_name_col: name of the project title column (string values).
        project_id_col: name of the project ID column (string values).
        project_desc_col: name of the project description column (string values).

    Returns:
        The cleaned frame.
    """
    df = _utils.organization_cleaning(df, agency_col)

    # Remove all punctuation, lowercase, and strip whitespace from
    # project titles & descriptions.
    # `regex=True` is explicit: pandas >= 2.0 treats the pattern as a literal
    # string by default, which would silently leave the punctuation in place.
    for col in [project_name_col, project_desc_col]:
        df[col] = (
            df[col].str.lower().str.replace(r"[^\w\s]", "", regex=True).str.strip()
        )

    # Some project names contain the year. Keep only the text before the first
    # "20". Note this cuts at ANY "20" in the name, not only at a year.
    df[project_name_col] = df[project_name_col].str.split("20").str[0]

    # Project ID: remove apostrophes, lowercase if there are letters, strip.
    df[project_id_col] = (
        df[project_id_col].str.replace("'", "", regex=False).str.lower().str.strip()
    )

    # Get rid of | in object cols
    # https://stackoverflow.com/questions/68152902/extracting-only-object-type-columns-in-a-separate-list-from-a-data-frame-in-pand
    string_cols = df.select_dtypes(include=["object"]).columns.to_list()
    for col in string_cols:
        try:
            # "|" is a regex metacharacter, so force a literal replacement.
            df[col] = df[col].str.replace("|", "", regex=False)
        except AttributeError:
            # Object column that does not hold strings (the `.str` accessor
            # refuses it). Skip just this column and keep cleaning the rest.
            continue

    # Fill in NA: 0.0 for float64 columns, "None" for object columns.
    fill_values = {
        col: 0.0 for col in df.select_dtypes(include=["float64"]).columns
    }
    fill_values.update(
        {col: "None" for col in df.select_dtypes(include=["object"]).columns}
    )
    if fill_values:
        df = df.fillna(value=fill_values)

    return df

### Non SHOPP-ATP-TIRCP
* No year information for projects.

In [5]:
# Read in 10 Year non SHOPP with ATP and TIRCP
nonshopp = to_snakecase(
    pd.read_excel(f"{_utils.GCS_FILE_PATH}cleaned_data_atp_tircp.xlsx")
)

In [6]:
# Columns of the Non-SHOPP table to keep when joining.
# Each column is listed exactly once: a repeated name would produce a
# duplicated column if this list is used to subset a DataFrame.
non_shopp_subset = [
    "ppno",
    "ct_project_id",
    "ea",
    "project_name",
    "lead_agency",
    "previous_caltrans_nominations",
    "full_county_name",
    "county",
    "district_full_name",
    "district",
    "project_description",
    "current_phase",
    "primary_mode",
    "urban_rural",
    "total_project_cost__$1,000",
    "total_unfunded_need__$1,000",
    "shs_capacity_increase_detail",
]

In [7]:
# nonshopp = nonshopp[non_shopp_subset]

In [8]:
# Add a digit in front of single digits
nonshopp.district = nonshopp.district.map("{:02}".format)

In [9]:
nonshopp = basic_cleaning(nonshopp, "lead_agency", "project_name", "ct_project_id", "project_description")



### 9 Sample Non SHOPP 

In [10]:
nine_projects_names = [
    "LA-210 Median Concrete Barrier Renovation",
    "SR-14 Widening Project",
    "US 395 Freight Mobility and Safety Project",
    "East Bay Greenway Multimodal Corridor Project",
    "Watsonville-Santa Cruz Multimodal Corridor Program",
    "SM 101 Woodside Road Interchange and Port Access Project",
    "I-710 Integrated Corridor Management",
    "Five Cities Multimodal Transportation Network Enhancement Project",
    "SR-86/Avenue 50 New Interchange (Phase II)",
]

In [11]:
nine_projects_names = [x.lower() for x in nine_projects_names]

In [12]:
# Caltrans project IDs of the nine sample projects, kept as strings so the
# leading zeros survive. Each ID is listed once ("0414000032" was previously
# repeated, making the list 10 entries long for nine projects).
nine_projects_id = [
    "0422000202",
    "0414000032",
    "0520000083",
    "0515000063",
    "0721000056",
    "0716000370",
    "0813000222",
    "0814000144",
    "0720000165",
]

In [13]:
# Keep only the Non-SHOPP rows whose Caltrans project ID is in the sample list.
nine_sample_projects = (
    nonshopp.loc[nonshopp["ct_project_id"].isin(nine_projects_id)]
    .reset_index(drop=True)
)

* Solutions for Congested Corridors Program (SCCP): 1
* Trade Corridor Enhancement Program (TCEP): 3
* Only 3 projects seem to have been awarded. 
    * east bay greenway multimodal corridor project phase 1
    * us 101woodside road interchange and port access project
    * watsonvillesanta cruz multimodal corridor program wscmcp cycle 3 project contract 1 sr 1 freedom to state park aux lanes bus on shoulders and coastal rail trail segment 12

In [14]:
# nine_sample_projects[['project_name','project_description','county','previous_caltrans_nominations']]

### SB1 Geo
* https://odpsvcs.dot.ca.gov/arcgis/rest/services/RCA/RCA_Projects_032022/FeatureServer

#### Step 1: Read in files with geometry 

In [15]:
url_pt1 = "https://odpsvcs.dot.ca.gov/arcgis/rest/services/RCA/RCA_Projects_032022/FeatureServer/"
url_pt2 = "/query?where=1%3D1&objectIds=&time=&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&distance=&units=esriSRUnit_Foot&relationParam=&outFields=*+&returnGeometry=true&maxAllowableOffset=&geometryPrecision=&outSR=&gdbVersion=&historicMoment=&returnDistinctValues=false&returnIdsOnly=false&returnCountOnly=false&returnExtentOnly=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&returnZ=false&returnM=false&multipatchOption=&resultOffset=&resultRecordCount=&returnTrueCurves=false&sqlFormat=none&f=geojson"

In [16]:
def rest_server(layer_ids=range(22)):
    """
    Download layers from the feature service and stack them into one frame.

    Each layer is read as GeoJSON from f"{url_pt1}{layer_id}{url_pt2}" with
    `gpd.read_file`, and its column names are normalized with `to_snakecase`.

    Args:
        layer_ids: iterable of layer numbers to request. Defaults to 0-21,
            the layers that were originally hard-coded.

    Returns:
        One frame holding the rows of every requested layer, in request order.
        Columns are the union of the layers' columns, and each layer keeps its
        own row index (the index is not reset). If `layer_ids` is empty, an
        empty pandas DataFrame is returned.
    """
    # Read every layer first and concatenate once; concatenating inside the
    # loop re-copies all previously downloaded rows on each iteration.
    layers = [
        to_snakecase(gpd.read_file(f"{url_pt1}{layer_id}{url_pt2}"))
        for layer_id in layer_ids
    ]
    if not layers:
        return pd.DataFrame()
    return pd.concat(layers, axis=0)

In [17]:
sb1_geo1 = rest_server()

In [18]:
sb1_geo1.columns

Index(['objectid', 'agencyids', 'agencies', 'programcodes', 'iijaprogram',
       'iijacodes', 'projectstatuscodes', 'fiscalyears', 'fiscalyearcodes',
       'projectstatuses', 'sb1funds', 'iijafunds', 'totalcost', 'dateupdated',
       'projectcount', 'assemblydistricts', 'senatedistricts',
       'congressionaldistricts', 'assemblycodes', 'senatecodes',
       'congressionalcodes', 'countynames', 'citynames', 'countycodes',
       'citycodes', 'ct_codes', 'ct_districts', 'issb1', 'isiija', 'isonshs',
       'issb1codes', 'isiijacode', 'isonshscodes', 'popup', 'geometry',
       'projectid', 'projecttitle', 'projectdescription'],
      dtype='object')

In [19]:
sb1_geo2 = basic_cleaning(sb1_geo1, 'agencies','projecttitle','projectid',
                         'projectdescription')



In [20]:
# sb1_geo = sb1_geo.drop(columns = 'popup')

In [21]:
sb1_geo2.programcodes.value_counts()

SHOPP    2741
HM       1163
LSR       509
ATP       321
SHOPA     165
SGR       161
STIP      126
TIRCP      96
LPP-F      68
TCEP       63
LPP-C      57
STA        49
SCCP       40
FM         12
SRA        11
Name: programcodes, dtype: int64

In [22]:
# All geometry is valid
sb1_geo2.geometry.is_valid.sum() == len(sb1_geo2)

True

In [23]:
sb1_geo2.geometry.notna().sum()  == len(sb1_geo2)

Given a GeoSeries 's', you can use '~s.is_empty & s.notna()' to get back the old behaviour.



True

In [24]:
len(sb1_geo2[~sb1_geo2.geometry.is_empty]) 

5581

In [25]:
# Throw out missing geometry
missing_geo = sb1_geo2[sb1_geo2.geometry.is_empty]

In [26]:
sb1_geo2 = sb1_geo2[~sb1_geo2.geometry.is_empty].reset_index(drop = True)

In [27]:
len(sb1_geo2) == len(sb1_geo2[~sb1_geo2.geometry.is_empty]) 

True

In [28]:
sb1_geo2.projecttitle.value_counts().head()

None                        1321
major damage restoration     465
safety improvements          366
pavement rehabilitation      305
pavement  hm1                220
Name: projecttitle, dtype: int64

In [29]:
# sb1_geo.countynames.sort_values().unique()

In [30]:
# sb1_geo.explore()

#### Compare with 9 Sample Projects

In [31]:
# Subset sb1_geo2 to rows whose program code contains TCEP or SCCP
# (presumably the programs the 9 sample projects applied for).
# NOTE(review): `tcep_sccp1` is not referenced by any later cell visible in
# this notebook; the county-filtered `tcep_sccp2` is used instead.
tcep_sccp1 = sb1_geo2[sb1_geo2["programcodes"].str.contains(('TCEP|SCCP'))].reset_index(drop = True)

In [32]:
# Subset sb1_geo2 to rows whose county names contain one of the counties
# listed below (presumably the counties the 9 sample projects are located in).
tcep_sccp2 = sb1_geo2[sb1_geo2["countynames"].str.contains(('Alameda|San Mateo|Santa Cruz|San Luis Obispo|Los Angeles|San Bernardino|Riverside'))].reset_index(drop = True)

In [33]:
# Subset sb1_geo to only programs these 9 projects have applied for
tcep_sccp2 = tcep_sccp2[tcep_sccp2["programcodes"].str.contains(('TCEP|SCCP'))].reset_index(drop = True)

In [34]:
tcep_sccp2.shape

(37, 38)

In [35]:
tcep_sccp2.programcodes.value_counts()

TCEP    26
SCCP    11
Name: programcodes, dtype: int64

In [36]:
pd.merge(
    nine_sample_projects,
    tcep_sccp2,
    how="outer",
    left_on=["ct_project_id"],
    right_on=["projectid"],
    indicator=True,
)[["_merge"]].value_counts()

_merge    
right_only    37
left_only      9
both           0
dtype: int64

In [37]:
pd.merge(
    nine_sample_projects,
    tcep_sccp2,
    how="outer",
    left_on=["project_name"],
    right_on=["projecttitle"],
    indicator=True,
)[["_merge"]].value_counts()

_merge    
right_only    37
left_only      9
both           0
dtype: int64

* Eyeballing matches
    * route 395 widening from sr 18 to chamberlaine way in SB1 could match us 395 freight mobility and safety project in Non SHOPP
    * state route 1 state park to bayporter auxiliary lanes in SB1 is watsonvillesanta cruz multimodal corridor program wscmcp cycle 3 project contract 1 sr 1 freedom to state park aux lanes bus on shoulders and coastal rail trail segment 12 in non SHOPP

In [57]:
# tcep_sccp2[['projecttitle','agencies','countynames', 'projectdescription']].sort_values(['countynames','projecttitle'])

In [53]:
# nine_sample_projects[['project_name','lead_agency','full_county_name','project_description']].sort_values(['full_county_name','project_name'])

In [58]:
non_shopp_projects_sb1_list = ['route 395 widening from sr 18 to chamberlaine way', 
                             'state route 1  state park to bayporter auxiliary lanes']

In [67]:
test = sb1_geo2[sb1_geo2["projecttitle"].isin(non_shopp_projects_sb1_list)].reset_index(drop = True)

In [72]:
# test[['geometry','projecttitle']]

In [59]:
non_shopp_projects_in_sb1 = tcep_sccp2[tcep_sccp2["projecttitle"].isin(non_shopp_projects_sb1_list)].reset_index(drop = True)

In [60]:
non_shopp_projects_in_sb1.projecttitle = non_shopp_projects_in_sb1.projecttitle.replace({
    'route 395 widening from sr 18 to chamberlaine way': 'us 395 freight mobility and safety project',
    'state route 1  state park to bayporter auxiliary lanes': 'watsonvillesanta cruz multimodal corridor program wscmcp cycle 3 project contract 1  sr 1 freedom to state park aux lanes bus on shoulders and coastal rail trail segment 12'
})

In [61]:
len(non_shopp_projects_in_sb1)

2

In [63]:
nine_sample_projects_geo = pd.merge(
    non_shopp_projects_in_sb1[['projecttitle','geometry', 'projectdescription']],
    nine_sample_projects,
    how="outer",
    left_on=["projecttitle"],
    right_on=["project_name"],
)

In [64]:
len(nine_sample_projects_geo)

9

In [74]:
type(nine_sample_projects_geo)

geopandas.geodataframe.GeoDataFrame

In [73]:
# nine_sample_projects_geo[['project_name','projecttitle','project_description','projectdescription', 'full_county_name']]

In [75]:
 _utils.geojson_gcs_export(nine_sample_projects_geo,_utils.GCS_FILE_PATH, 'nine_sample_projects_geom')

#### Step 2: Read in all projects
* Compare with CSV.
* Clean it up.

In [None]:
sb1_all_projects_url = "https://odpsvcs.dot.ca.gov/arcgis/rest/services/RCA/RCA_Projects_032022/FeatureServer/22/query?where=1%3D1&objectIds=&time=&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&distance=&units=esriSRUnit_Foot&relationParam=&outFields=*+&returnGeometry=true&maxAllowableOffset=&geometryPrecision=&outSR=&gdbVersion=&historicMoment=&returnDistinctValues=false&returnIdsOnly=false&returnCountOnly=false&returnExtentOnly=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&returnZ=false&returnM=false&multipatchOption=&resultOffset=&resultRecordCount=&returnTrueCurves=false&sqlFormat=none&f=geojson"

In [None]:
# Read in SB1 csv
# sb1_csv = to_snakecase(pd.read_csv(f"{_utils.GCS_FILE_PATH}RebuildingCA_map_Data.csv"))

In [None]:
sb1_all_projects = to_snakecase(gpd.read_file(sb1_all_projects_url))

In [None]:
sb1_all_projects.shape, sb1_all_projects.projecttitle.nunique()

In [None]:
# Same rows, different columns.
# sb1_all_projects.shape, sb1_csv.shape

In [None]:
# Project ID matches
# csv_projectid = set(sb1_csv.project_id.unique().tolist())
# geojson_projectid = set(sb1_all_projects.projectid.unique().tolist())
# csv_projectid - geojson_projectid

In [None]:
# set(sb1_all_projects.columns).difference(set(sb1_csv.columns))

In [None]:
# Align funding
sb1_all_projects = _utils.align_funding_numbers(
    sb1_all_projects,
    [
        "totalcost",
        "sb1funds",
    ],
)

In [None]:
# Lower case and clean project names
sb1_all_projects.projecttitle = (
    sb1_all_projects.projecttitle.str.lower().str.strip().str.split("20").str[0]
)

In [None]:
# No geometry, just drop it
sb1_all_projects = sb1_all_projects.drop(columns=["geometry"])

#### Step 3: Figure out why the rows differ between `sb1_all_projects` and `sb1_geo2`

In [None]:
# Subset to join back to the 9,000 projects above
# subset = ['objectid', 'agencyids', 'projecttitle','programcodes', 'projectid','geometry']

In [None]:
# Compare the number of unique project IDs in the two frames.
# NOTE(review): `full_gdf2` and `sb1_project_id` are not assigned in any cell
# visible in this notebook, so this cell will raise NameError on a fresh
# kernel unless they are defined elsewhere - TODO define them or remove.
full_gdf2.projectid.nunique(), sb1_project_id.project_id.nunique(), len(sb1_project_id)

In [None]:
pd.merge(
    sb1_project_id,
    full_gdf2,
    how="outer",
    left_on=["project_id"],
    right_on=["projectid"],
    indicator=True,
)[["_merge"]].value_counts()

In [None]:
sb1_m = pd.merge(
    full_gdf2,
    sb1_project_id,
    how="left",
    left_on=["projectid"],
    right_on=["project_id"],
)

In [None]:
sb1_m.project_name = sb1_m.project_name.fillna(sb1_m.projecttitle)

In [None]:
pd.merge(nonshopp, sb1_m, how="outer", on=["project_name"], indicator=True)[
    ["_merge"]
].value_counts()

In [None]:
# Row/column count and number of distinct project names.
# NOTE(review): `sb1` is not assigned in any cell visible in this notebook
# (the only CSV read, `sb1_csv`, is commented out) - TODO confirm its source.
sb1.shape, sb1.project_name.nunique()

In [None]:
# Titles are not necessarily specifically named.
# Tends to be very general
# sb1.project_name.value_counts()

In [None]:
sb1_subset = [
    "project_name",
    "implementing_agency",
    "sb1_program",
    "fiscal_year",
    "project_description",
    "total_cost",
    "sb1_funds",
    "is_sb1?",
    "project_status",
    "assembly_districts",
    "senate_districts",
    "congressional_districts",
    "counties",
    "cities",
    "caltrans_districts",
    "on_shs?",
]

In [None]:
sb1_2 = sb1[sb1_subset]

In [None]:
sb1.sb1_program.value_counts()

In [None]:
sb1_2.shape

In [None]:
# Align funding
sb1_2 = _utils.align_funding_numbers(
    sb1_2,
    [
        "total_cost",
        "sb1_funds",
    ],
)

In [None]:
# sb1_geo2 = sb1_geo[subset]

In [None]:
# Compare the sizes of the all-projects layer and the geometry layers.
# NOTE(review): `sb1_geo` only appears in commented-out code in the visible
# notebook; the cleaned geometry frame built above is named `sb1_geo2` -
# TODO confirm which frame is meant.
sb1_all_projects.shape, sb1_geo.shape

In [None]:
# Objectid: no cleaning for either of dfs
# This is a int64 dtype...yet zero merges
pd.merge(
    sb1_all_projects,
    sb1_geo,
    how="outer",
    on=["objectid"],
    indicator=True,
)[["_merge"]].value_counts()

In [None]:
# NOTE(review): `clean_project_names_id` is not defined in any cell visible in
# this notebook; the cleaning helper defined above is `basic_cleaning`, which
# takes different arguments - TODO define/import it or switch helpers.
sb1_all_projects = clean_project_names_id(sb1_all_projects, 'projectid', 'projecttitle')

In [None]:
# NOTE(review): neither `clean_project_names_id` nor `sb1_geo` is defined in
# any cell visible in this notebook. This assignment also replaces the
# `sb1_geo2` frame built earlier (cleaned, empty geometries removed) -
# TODO confirm that overwriting it is intended.
sb1_geo2 = clean_project_names_id(sb1_geo, 'projectid','projecttitle')

In [None]:
pd.merge(
    sb1_all_projects,
    sb1_geo2,
    how="outer",
    left_on=["projectid"],
    right_on = ["projectid"],
    indicator=True,
)[["_merge"]].value_counts()

In [None]:
pd.merge(
    sb1_all_projects,
    sb1_geo2,
    how="outer",
    left_on=["projecttitle", "ct_districts", "agencyid", "fiscalyear"],
    right_on=["projecttitle", "ct_districts", "agencyids", "fiscalyears"],
    indicator=True,
)[["_merge"]].value_counts()

### Merge all other Projects

In [None]:
# 62 matches
pd.merge(
    nonshopp,
    sb1_all_projects,
    how="outer",
    left_on=["project_name"],
    right_on=["projecttitle"],
    indicator=True,
)[["_merge"]].value_counts()

### TIRCP
* None of the projects from TIRCP are mapping, even though the names appear the same.

In [None]:
# sb1_tircp = sb1_2.loc[sb1_2.sb1_program == "Transit and Intercity Rail Capital Program"].reset_index(drop = True)

In [None]:
# sb1_tircp[['project_name','caltrans_districts','counties']].sort_values('project_name').head(2)

In [None]:
# tircp_sb[['project_name']].sort_values(by = 'project_name')

In [None]:
# tircp_shopp[['project_name']].sort_values(by = 'project_name')

In [None]:
# Rows whose previous Caltrans nominations mention TIRCP.
# NOTE(review): `df` is not assigned at notebook level in any visible cell;
# presumably the Non-SHOPP frame `nonshopp` is meant - TODO confirm.
tircp_shopp = df.loc[
    df.previous_caltrans_nominations.str.contains("TIRCP")
].reset_index(drop=True)

In [None]:
# Outer-merge on project name and count how many rows match on each side.
# Open question from the author: why are there no matches?
# NOTE(review): `tircp_sb` is not assigned in any cell visible in this
# notebook (it only appears in commented-out code) - TODO define it.
pd.merge(tircp_shopp, tircp_sb, how="outer", on=["project_name"], indicator=True)[
    ["_merge"]
].value_counts()