## SB1 
* Need to find out
    * What's the difference between RCA/RCA_Projects_092023 (FeatureServer) and RCA/RCA_Projects_102023 (FeatureServer) here https://odpsvcs.dot.ca.gov/arcgis/rest/services/RCA?
    * What's the right link to use to grab the projects: https://odpsvcs.dot.ca.gov/arcgis/rest/services/SB1/SB1_ProjectData/FeatureServer or https://odpsvcs.dot.ca.gov/arcgis/rest/services/RCA?
        * I think link 1 is related to the SB1 map online
        * Unsure what the second map is about?

In [1]:
import geopandas as gpd
import numpy as np
import pandas as pd
import _harmonization_utils 
from calitp_data_analysis.sql import to_snakecase
from calitp_data_analysis import utils

In [2]:
# Notebook-wide pandas display settings: up to 100 columns, floats shown
# with two decimals, and no truncation of rows or cell contents.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
# sb1, missing_geo_sb1 = load_sb1_rest_server()

In [4]:
# len(sb1), type(sb1)

In [5]:
# len(missing_geo_sb1)

In [None]:
# sb1_all_projects = load_sb1_all_projects()

In [None]:
# len(sb1_all_projects)

### Link above broken, try new link
* https://odpsvcs.dot.ca.gov/arcgis/rest/services/RCA/RCA_Projects_092023/FeatureServer

#### Link 1
* SB1 Only
* https://odpsvcs.dot.ca.gov/arcgis/rest/services/SB1/SB1_ProjectData/FeatureServer

In [71]:
def sb1_basic_cleaning(
    gdf: gpd.GeoDataFrame, 
    project_name_col:str,
    agency_col:str)-> gpd.GeoDataFrame:
    """
    Perform basic cleaning before joining
    SB1 & Non SHOPP data together.

    Args:
        gdf: frame to clean. It is modified in place and also returned.
        project_name_col: column holding project names. Everything from
            the first occurrence of "20" onward is dropped.
        agency_col: column holding the agency name. The phrase
            "Submitted By" is replaced with a single space.

    Returns:
        The same frame, with every object-dtype column stripped of "|"
        and other punctuation, title-cased, trimmed, and with missing
        values replaced by the string 'None'.
    """
    # Some project names contain the year. Remove anything after 20..
    # NOTE(review): this cuts at ANY "20" (e.g. "Route 120" -> "Route 1");
    # confirm that is acceptable for the names in this dataset.
    gdf[project_name_col] = gdf[project_name_col].str.split("20").str[0]

    # Get rid of | in object cols
    # https://stackoverflow.com/questions/68152902/extracting-only-object-type-columns-in-a-separate-list-from-a-data-frame-in-pand
    string_cols = gdf.select_dtypes(include=["object"]).columns.to_list()
    for col in string_cols:
        try:
            cleaned = (
                gdf[col]
                # "|" is a regex metacharacter, so match it literally.
                .str.replace("|", "", regex=False)
                .str.title()
                # Raw string avoids the invalid-escape warning; regex=True is
                # spelled out because the pandas default differs by version.
                .str.replace(r"[^\w\s]", "", regex=True)
                .str.strip()
            )
        except AttributeError:
            # The .str accessor rejects object columns that hold no strings.
            # Skip only that column instead of abandoning all remaining ones.
            continue
        gdf[col] = cleaned.fillna('None')

    # Project agency always says "Los Angeles submitted by county" 
    # or "Fremont submitted by city." Remove submitted by.
    gdf[agency_col] = gdf[agency_col].str.replace('Submitted By', ' ', regex=False)

    return gdf

In [82]:
def load_sb1(n_layers: int = 16) -> gpd.GeoDataFrame:
    """
    Load all the projects on the SB1 map from the Feature Server.

    https://odpsvcs.dot.ca.gov/arcgis/rest/services/SB1/SB1_ProjectData/FeatureServer

    Args:
        n_layers: number of Feature Server layers to read, starting at
            layer 0. Defaults to 16, i.e. layers 0 through 15.

    Returns:
        All layers stacked into one frame, passed through
        sb1_basic_cleaning, with rows whose projstatus is "Completed"
        removed and the index reset.

    Prints how many rows have no geometry (null or empty) and how many
    have a geometry that is present but invalid.

    NOTE(review): each layer is fetched with a single unpaginated query.
    ArcGIS services usually cap the rows returned per request, so large
    layers may be truncated; confirm against the layer's record limit.
    """
    sb1_pt1 = "https://odpsvcs.dot.ca.gov/arcgis/rest/services/SB1/SB1_ProjectData/FeatureServer/"
    sb1_pt2 = "/query?where=1%3D1&objectIds=&time=&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&distance=&units=esriSRUnit_Foot&relationParam=&outFields=*+&returnGeometry=true&maxAllowableOffset=&geometryPrecision=&outSR=&gdbVersion=&historicMoment=&returnDistinctValues=false&returnIdsOnly=false&returnCountOnly=false&returnExtentOnly=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&returnZ=false&returnM=false&multipatchOption=&resultOffset=&resultRecordCount=&returnTrueCurves=false&sqlFormat=none&f=geojson"

    # Read every layer first and concatenate once, rather than growing a
    # frame inside the loop starting from a plain (non-geo) DataFrame.
    layers = [
        to_snakecase(gpd.read_file(f"{sb1_pt1}{layer_id}{sb1_pt2}"))
        for layer_id in range(n_layers)
    ]
    full_gdf = pd.concat(layers, axis=0, ignore_index=True)

    # Basic cleaning
    full_gdf = sb1_basic_cleaning(
        full_gdf, "projname", "projagency",
    )

    # Find missing geo. is_empty alone does not flag null geometries, so
    # rows with no geometry at all were previously reported as "invalid".
    geometry = full_gdf.geometry
    missing_mask = geometry.isna() | geometry.is_empty
    print(f"{int(missing_mask.sum())} rows are missing geometry")

    # Find invalid geo among the rows that actually have a geometry
    invalid_mask = ~missing_mask & ~geometry.is_valid
    print(f"{int(invalid_mask.sum())} rows contain invalid geography")

    # Filter out completed
    full_gdf = full_gdf.loc[full_gdf.projstatus != "Completed"].reset_index(drop = True)
    return full_gdf

In [83]:
sb1_df1 = load_sb1()

  gdf[i] = gdf[i].str.replace("|", "").str.title().str.replace("[^\w\s]", "").str.strip()
  gdf[i] = gdf[i].str.replace("|", "").str.title().str.replace("[^\w\s]", "").str.strip()


0 rows are mising geometry
7917 rows contain invalid geography


In [74]:
sb1_df1.projname.nunique()

5430

In [75]:
sb1_df1.projectid.nunique()

7919

In [76]:
sb1_df1.projprogram.unique()

array(['Solutions To Congested Corridors Program',
       'Trade Corridor Enhancement Program',
       'Solutions For Congested Corridors Program',
       'State Rail Assistance',
       'Transit And Intercity Rail Capital Program',
       'State Of Good Repair', 'State Highway Projects',
       'State Transit Assistance', 'Local Partnership Program',
       'Active Transportation Program', 'Local Streets And Roads',
       '201718 Local Streets And Roads', '201819 Local Streets And Roads'],
      dtype=object)

In [77]:
 len(sb1_df1)

9536

In [79]:
sb1_df1.columns

Index(['objectid', 'projectid', 'projname', 'projcatcode', 'projcategory',
       'projprogcode', 'projprogram', 'multiprogfunded', 'projstatus',
       'description', 'cost', 'assemblydistrict', 'senatedistrict',
       'assemblycode', 'senatecode', 'countyname', 'cityname', 'countycode',
       'citycode', 'appagencyname', 'impagencyname', 'popuptitle', 'popup',
       'geometry', 'projcount', 'totalcosts', 'projlisturl', 'projlist2url',
       'mponame', 'mpocode', 'assetcategory', 'bphotojpg', 'bphotohtml',
       'aphotojpg', 'aphotohtml', 'routes', 'constyear', 'costfull',
       'projagency'],
      dtype='object')

In [81]:
sb1_df1.projstatus.value_counts()

PreConstruction    7093
Construction       1129
None                964
Completed           350
Name: projstatus, dtype: int64

In [80]:
sb1_df1.drop(columns = ['geometry']).sample(3)

Unnamed: 0,objectid,projectid,projname,projcatcode,projcategory,projprogcode,projprogram,multiprogfunded,projstatus,description,cost,assemblydistrict,senatedistrict,assemblycode,senatecode,countyname,cityname,countycode,citycode,appagencyname,impagencyname,popuptitle,popup,projcount,totalcosts,projlisturl,projlist2url,mponame,mpocode,assetcategory,bphotojpg,bphotohtml,aphotojpg,aphotohtml,routes,constyear,costfull,projagency
3838,169568,LsrFy17185944Pp012,Intergarrison Road,Local,Local And Regional,Lsr1718,201718 Local Streets And Roads,,PreConstruction,Overlay 08 Of A Mile Of Road,,29,17,29,17,Monterey Submitted By County,,Mon,,,,,,,,,,,,,,,,,,2020,0.0,Monterey County
2311,168032,LsrFy17185360Pp002,Alley Hawthorne,Local,Local And Regional,Lsr1718,201718 Local Streets And Roads,,PreConstruction,RepaveResurface 005 Miles Of Road Replace 600 Square Feet Of Cross Gutter,,62,35,62,35,Los Angeles,Lawndale Submitted By City,La,Lndl,,,,,,,,,,,,,,,,,2018,0.0,Lawndale City
3740,169470,LsrFy17185936Pp002,Bear Creek Rd,Local,Local And Regional,Lsr1718,201718 Local Streets And Roads,,PreConstruction,RoadwayEmbankment Settling 70 L Recommend Wb Traveled Way And Shoulder Reconstruction And Retaining Wall Soldier Pile Installation,,29,17,29,17,Santa Cruz Submitted By County,,Scr,,,,,,,,,,,,,,,,,,2018,0.0,Santa Cruz County


In [50]:
# sb1_df1.explore('projname', legend = False)

#### Link 2
* This includes IIJA
* https://odpsvcs.dot.ca.gov/arcgis/rest/services/RCA/RCA_Projects_092023/FeatureServer
* Why are there fewer projects in this layer than in the SB1 layer above?

In [101]:
url_pt1 = "https://odpsvcs.dot.ca.gov/arcgis/rest/services/RCA/RCA_Projects_092023/FeatureServer/"

In [102]:
url_pt2 = "/query?where=1%3D1&objectIds=&time=&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&distance=&units=esriSRUnit_Foot&relationParam=&outFields=*+&returnGeometry=true&maxAllowableOffset=&geometryPrecision=&outSR=&gdbVersion=&historicMoment=&returnDistinctValues=false&returnIdsOnly=false&returnCountOnly=false&returnExtentOnly=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&returnZ=false&returnM=false&multipatchOption=&resultOffset=&resultRecordCount=&returnTrueCurves=false&sqlFormat=none&f=geojson"

In [103]:
int_to_string = list(map(str, [*range(0, 28)]))

In [104]:
# Read each RCA layer once and concatenate in a single call, instead of
# re-copying an ever-growing frame (seeded from a plain DataFrame) on
# every loop iteration.
layer_frames = [
    to_snakecase(gpd.read_file(f"{url_pt1}{i}{url_pt2}")) for i in int_to_string
]
full_gdf = pd.concat(layer_frames, axis=0)

In [105]:
full_gdf.shape

(7631, 38)

In [106]:
full_gdf.shape, sb1_df1.shape

((7631, 38), (9186, 39))

#### Compare

In [109]:
full_gdf.columns

Index(['objectid', 'agencyids', 'agencies', 'programcodes', 'iijaprogram',
       'iijacodes', 'projectstatuscodes', 'fiscalyears', 'fiscalyearcodes',
       'projectstatuses', 'sb1funds', 'iijafunds', 'totalcost', 'dateupdated',
       'projectcount', 'assemblydistricts', 'senatedistricts',
       'congressionaldistricts', 'assemblycodes', 'senatecodes',
       'congressionalcodes', 'countynames', 'citynames', 'countycodes',
       'citycodes', 'ct_codes', 'ct_districts', 'issb1', 'isiija', 'isonshs',
       'issb1codes', 'isiijacode', 'isonshscodes', 'popup', 'geometry',
       'projectid', 'projecttitle', 'projectdescription'],
      dtype='object')

In [112]:
full_gdf2 = full_gdf[['projecttitle', 'projectdescription','countycodes', 'ct_districts']]

In [110]:
sb1_df1.columns

Index(['objectid', 'projectid', 'projname', 'projcatcode', 'projcategory',
       'projprogcode', 'projprogram', 'multiprogfunded', 'projstatus',
       'description', 'cost', 'assemblydistrict', 'senatedistrict',
       'assemblycode', 'senatecode', 'countyname', 'cityname', 'countycode',
       'citycode', 'appagencyname', 'impagencyname', 'popuptitle', 'popup',
       'geometry', 'projcount', 'totalcosts', 'projlisturl', 'projlist2url',
       'mponame', 'mpocode', 'assetcategory', 'bphotojpg', 'bphotohtml',
       'aphotojpg', 'aphotohtml', 'routes', 'constyear', 'costfull',
       'projagency'],
      dtype='object')

In [117]:
sb1_df2 = sb1_df1[['projname','projagency', 'description', 'countycode']]

In [118]:
# Outer-merge on project name + description + county code and count how
# many rows fall on each side.
# NOTE(review): sb1_df2 comes from sb1_df1, whose text columns were
# normalized by sb1_basic_cleaning (title case, punctuation stripped, names
# cut at "20"). full_gdf2 is taken from full_gdf with no such cleaning in
# the visible cells, so exact-key matches are unlikely and a 0 "both" count
# may be an artifact - confirm by cleaning both sides the same way.
pd.merge(sb1_df2, full_gdf2, left_on = ['projname','description', 'countycode'],
         right_on = ['projecttitle', 'projectdescription','countycodes'], how = 'outer',
         indicator = True)[['_merge']].value_counts()

_merge    
left_only     9186
right_only    7631
both             0
dtype: int64

In [119]:
# Same comparison with the county code dropped from the join keys.
# NOTE(review): the left keys went through sb1_basic_cleaning and the right
# keys appear to be raw, so this exact-match merge may find nothing for
# formatting reasons alone - confirm.
pd.merge(sb1_df2, full_gdf2, left_on = ['projname','description'],
         right_on = ['projecttitle', 'projectdescription'], how = 'outer',
         indicator = True)[['_merge']].value_counts()

_merge    
left_only     9186
right_only    7631
both             0
dtype: int64

In [120]:
# Same comparison on project name + county code only.
# NOTE(review): the left keys went through sb1_basic_cleaning and the right
# keys appear to be raw, so this exact-match merge may find nothing for
# formatting reasons alone - confirm.
pd.merge(sb1_df2, full_gdf2, left_on = ['projname','countycode'],
         right_on = ['projecttitle', 'countycodes'], how = 'outer',
         indicator = True)[['_merge']].value_counts()

_merge    
left_only     9186
right_only    7631
both             0
dtype: int64

### Original functions

In [None]:
url_pt1 = "https://odpsvcs.dot.ca.gov/arcgis/rest/services/RCA/RCA_Projects_032022/FeatureServer/"
url_pt2 = "/query?where=1%3D1&objectIds=&time=&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&distance=&units=esriSRUnit_Foot&relationParam=&outFields=*+&returnGeometry=true&maxAllowableOffset=&geometryPrecision=&outSR=&gdbVersion=&historicMoment=&returnDistinctValues=false&returnIdsOnly=false&returnCountOnly=false&returnExtentOnly=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&returnZ=false&returnM=false&multipatchOption=&resultOffset=&resultRecordCount=&returnTrueCurves=false&sqlFormat=none&f=geojson"

In [None]:
def sb1_basic_cleaning(
    gdf: gpd.GeoDataFrame, 
    project_name_col:str )-> gpd.GeoDataFrame:
    """
    Perform basic cleaning before joining
    SB1 & Non SHOPP data together.

    Args:
        gdf: frame to clean. It is modified in place and also returned.
        project_name_col: column holding project names. Everything from
            the first occurrence of "20" onward is dropped.

    Returns:
        The same frame, with every object-dtype column stripped of "|"
        and other punctuation, lower-cased and trimmed. If a "popup"
        column exists it is reduced to the text between its first and
        second "<br  />" markers, cut at the first "20".

    NOTE(review): this shares its name with the three-parameter
    sb1_basic_cleaning defined earlier in the notebook and replaces it
    once this cell runs; load_sb1 calls the three-parameter form.
    """
    # Some project names contain the year. Remove anything after 20..
    gdf[project_name_col] = gdf[project_name_col].str.split("20").str[0]

    # Try to extract titles from popups. This has to happen before the
    # punctuation pass below, which would otherwise destroy the "<br  />"
    # delimiter. It also has to act on gdf, not on an unrelated `df`.
    if "popup" in gdf.columns:
        try:
            gdf["popup"] = (
                gdf["popup"]
                .str.split("<br  />")
                .str[1]
                .str.split("20")
                .str[0]
            )
        except AttributeError:
            # popup holds no strings; leave it untouched.
            pass

    # Get rid of | in object cols
    # https://stackoverflow.com/questions/68152902/extracting-only-object-type-columns-in-a-separate-list-from-a-data-frame-in-pand
    string_cols = gdf.select_dtypes(include=["object"]).columns.to_list()
    for col in string_cols:
        try:
            gdf[col] = (
                gdf[col]
                # "|" is a regex metacharacter, so match it literally.
                .str.replace("|", "", regex=False)
                .str.lower()
                # Raw string avoids the invalid-escape warning; regex=True is
                # spelled out because the pandas default differs by version.
                .str.replace(r"[^\w\s]", "", regex=True)
                .str.strip()
            )
        except AttributeError:
            # The .str accessor rejects object columns that hold no strings.
            # Skip only that column instead of abandoning all remaining ones.
            continue

    return gdf

In [None]:
def load_sb1_all_projects() -> pd.DataFrame:
    """
    Load in all projects layer of SB1 because it
    contains a value for every row in the
    project title column.

    Reads layer 22 of the Feature Server addressed by the module-level
    url_pt1 / url_pt2, drops its geometry column and runs
    sb1_basic_cleaning on the result.

    Returns:
        The cleaned, geometry-free frame.
    """
    layer_url = f"{url_pt1}22{url_pt2}"

    df = to_snakecase(gpd.read_file(layer_url))

    # No geometry, just drop it
    df = df.drop(columns=["geometry"])

    # Basic cleaning
    # The previous call passed five positional arguments, which no
    # sb1_basic_cleaning definition in this notebook accepts.
    # NOTE(review): "projecttitle" is assumed to be the project-name column,
    # and this matches the two-parameter definition in the preceding cell -
    # confirm both.
    df = sb1_basic_cleaning(df, "projecttitle")

    return df

In [None]:
def sb1_final() -> gpd.GeoDataFrame:
    """
    Layers 0-21 with geographic information
    don't always have project titles for each
    of the projects. Merge these layers with
    layer 22, which does have title information.

    Takes no arguments. Returns the left-merged frame with missing
    titles back-filled, the duplicate title column dropped, a `notes`
    column added and remaining gaps filled.

    NOTE(review): load_sb1_rest_server is not defined in any cell visible
    in this notebook (its only other mention is commented out); confirm
    where it is expected to come from before running this.
    """
    # Columns kept from the all-projects layer: the title plus the fields
    # used as join keys below.
    all_projects_subset = [
        "projecttitle",
        "programcodes",
        "totalcost",
        "implementingagency",
        "fiscalyearcode",
    ]

    # missing_geo is unpacked but not used in this function.
    sb1_geo, missing_geo = load_sb1_rest_server()
    sb1_all_projects = load_sb1_all_projects()[all_projects_subset]

    # Merge
    # Left join, so every row of sb1_geo is kept whether or not it matches.
    merge1 = pd.merge(
        sb1_geo,
        sb1_all_projects,
        how="left",
        left_on=["programcodes", "totalcost", "agencies", "fiscalyearcodes"],
        right_on=["programcodes", "totalcost", "implementingagency", "fiscalyearcode"],
    )

    # Fill in missing project titles in sb1_geo with information from
    # sb1_all_projects
    # The _x / _y suffixes assume sb1_geo also carries a projecttitle
    # column - TODO confirm.
    merge1.projecttitle_x = merge1.projecttitle_x.fillna(merge1.projecttitle_y)

    # Fill missing titles with none
    merge1.projecttitle_x = merge1.projecttitle_x.fillna("None")
    
    # Drop columns
    merge1 = merge1.drop(columns = ['projecttitle_y'])
    
    # Add a new column detailing where this information comes from 
    merge1 = merge1.assign(
        notes = merge1.programcodes + '/' + 'SB1')
    
    # Per-column fill values derived from each column's dtype: 0.0 for
    # float64 and 'None' for object.
    # NOTE(review): columns of any other dtype appear to keep their dtype
    # object as the "fill value" - confirm this is harmless.
    merge1 = merge1.fillna(merge1.dtypes.replace({'float64': 0.0, 'object': 'None'}))
    return merge1

In [None]:
sb1_final_gdf = sb1_final()

In [None]:
sb1_final_gdf.projectstatuscodes.value_counts()

In [None]:
len(sb1_final_gdf), sb1_final_gdf.geometry.nunique()

In [None]:
sb1_final_gdf.info()

In [None]:
sb1_final_gdf.projecttitle_x.value_counts().head(10)

In [None]:
sb1_final_gdf[sb1_final_gdf.projecttitle_x == "major damage restoration"].groupby(['projecttitle_x', 'countynames']).agg({'objectid':'count'}).head()

In [None]:
sb1_final_gdf.projectstatuscodes.value_counts()

In [None]:
sb1_final_gdf.projectstatuscodes.unique()

In [None]:
sb1_final_gdf.projectstatuses.value_counts()

In [None]:
sb1_final_gdf[(sb1_final_gdf.projecttitle_x == "major damage restoration") & (sb1_final_gdf.countynames == "Alameda")]

In [None]:
sb1_final_gdf[(sb1_final_gdf.projecttitle_x == "major damage restoration") & (sb1_final_gdf.countynames == "Alameda")].plot()

In [None]:
# sb1_final_gdf[sb1_final_gdf.projecttitle_x == "major damage restoration"].explore()