In [1]:
import geopandas as gpd
import numpy as np
import pandas as pd
import _harmonization_utils 
from calitp_data_analysis.sql import to_snakecase
from calitp_data_analysis import utils

In [2]:
# Notebook display settings: up to 100 columns, every row, untruncated cell
# text, and floats rendered with two decimal places.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [3]:
# Base URL of the RCA_Projects_032022 ArcGIS FeatureServer. A layer number is
# inserted between url_pt1 and url_pt2 to address a single layer.
url_pt1 = "https://odpsvcs.dot.ca.gov/arcgis/rest/services/RCA/RCA_Projects_032022/FeatureServer/"
# Query string appended after the layer number: matches every row
# (where=1=1), asks for all fields (outFields=*) and returns GeoJSON.
url_pt2 = "/query?where=1%3D1&objectIds=&time=&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&distance=&units=esriSRUnit_Foot&relationParam=&outFields=*+&returnGeometry=true&maxAllowableOffset=&geometryPrecision=&outSR=&gdbVersion=&historicMoment=&returnDistinctValues=false&returnIdsOnly=false&returnCountOnly=false&returnExtentOnly=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&returnZ=false&returnM=false&multipatchOption=&resultOffset=&resultRecordCount=&returnTrueCurves=false&sqlFormat=none&f=geojson"

In [4]:
def sb1_basic_cleaning(
    df,
    agency_col: str,
    project_name_col: str,
    project_id_col: str,
    project_desc_col: str,
):
    """
    Perform basic cleaning before joining
    SB1 & Non SHOPP data together.

    The frame is modified in place and also returned.

    Args:
        df: frame holding the project columns named below.
        agency_col: name of the agency column (currently unused, kept
            so existing callers keep working).
        project_name_col: name of the project title column.
        project_id_col: name of the project id column (currently
            unused, kept so existing callers keep working).
        project_desc_col: name of the project description column.

    Returns:
        The same `df` with titles/descriptions lowercased, stripped of
        punctuation and surrounding whitespace, titles cut at the first
        "20", "|" removed from every string column and, when a "popup"
        column exists, the popup reduced to the title it contains.
    """
    # Raw string so \w and \s reach the regex engine untouched.
    punctuation = r"[^\w\s]"

    # Remove all punctuation, lowercase, and strip whitespaces from
    # project titles & descriptions. regex=True is spelled out because
    # pandas >= 2.0 treats the pattern as a literal string by default.
    for col in [project_name_col, project_desc_col]:
        df[col] = (
            df[col].str.lower().str.replace(punctuation, "", regex=True).str.strip()
        )

    # Some project names contain the year. Remove anything after 20..
    df[project_name_col] = df[project_name_col].str.split("20").str[0]

    # Get rid of | in object cols
    # https://stackoverflow.com/questions/68152902/extracting-only-object-type-columns-in-a-separate-list-from-a-data-frame-in-pand
    string_cols = df.select_dtypes(include=["object"]).columns.to_list()
    for col in string_cols:
        try:
            # "|" is a literal character here, not a regex alternation.
            df[col] = df[col].str.replace("|", "", regex=False)
        except AttributeError:
            # Object column that does not hold strings: leave it alone
            # and keep cleaning the remaining columns.
            continue

    # Try to extract titles from popups (only some layers carry one).
    if "popup" in df.columns:
        try:
            df["popup"] = (
                df["popup"]
                .str.split("<br  />")
                .str[1]
                .str.split("20")
                .str[0]
                .str.lower()
                .str.strip()
                .str.replace(punctuation, "", regex=True)
            )
        except AttributeError:
            # Popup column is not string-valued: keep it untouched.
            pass

    return df

In [5]:
def load_sb1_rest_server(
    layer_ids=range(0, 22),
) -> "tuple[gpd.GeoDataFrame, gpd.GeoDataFrame]":
    """
    Load all the projects on the SB1
    map from the Feature Server.

    https://odpsvcs.dot.ca.gov/arcgis/rest/services/RCA/RCA_Projects_032022/FeatureServer

    Each layer is read from f"{url_pt1}{layer}{url_pt2}", using the
    module-level `url_pt1` / `url_pt2` values.

    Args:
        layer_ids: layer numbers to download. Defaults to layers 0-21.

    Returns:
        (full_gdf, missing_geo): the cleaned rows that have a geometry
        (index reset) and the rows whose geometry is missing or empty.
    """
    # Read every layer first and concatenate once, instead of growing
    # the frame inside the loop.
    layer_gdfs = [
        to_snakecase(gpd.read_file(f"{url_pt1}{layer_id}{url_pt2}"))
        for layer_id in layer_ids
    ]
    full_gdf = pd.concat(layer_gdfs, axis=0)

    # Basic cleaning
    full_gdf = sb1_basic_cleaning(
        full_gdf, "agencies", "projecttitle", "projectid", "projectdescription"
    )

    # Fill in project titles that are empty with information
    # gleaned from the pop up.
    full_gdf["projecttitle"] = full_gdf["projecttitle"].fillna(full_gdf["popup"])

    # Throw out missing geometry. is_empty only flags empty shapes, so
    # None/NaN geometries are detected separately with isna().
    geometry = full_gdf.geometry
    no_geometry = geometry.isna() | geometry.is_empty
    missing_geo = full_gdf[no_geometry]
    full_gdf = full_gdf[~no_geometry].reset_index(drop=True)

    return full_gdf, missing_geo

In [6]:
# sb1, missing_geo_sb1 = load_sb1_rest_server()

In [7]:
# len(sb1), type(sb1)

In [8]:
# len(missing_geo_sb1)

In [9]:
def load_sb1_all_projects() -> pd.DataFrame:
    """
    Read layer 22 ("all projects") of the SB1 service, which
    carries a project title on every row, and return it
    cleaned and without its geometry column.
    """
    layer_url = f"{url_pt1}22{url_pt2}"
    all_projects = gpd.read_file(layer_url)

    # This layer has no usable geometry, so the column is dropped.
    all_projects = to_snakecase(all_projects).drop(columns=["geometry"])

    # Same basic cleaning as the geographic layers.
    return sb1_basic_cleaning(
        all_projects,
        "implementingagency",
        "projecttitle",
        "projectid",
        "projectdescription",
    )

In [10]:
# sb1_all_projects = load_sb1_all_projects()

In [11]:
# len(sb1_all_projects)

#### The RCA_Projects_032022 service used above is broken; try the newer RCA_Projects_092023 service
* https://odpsvcs.dot.ca.gov/arcgis/rest/services/RCA/RCA_Projects_092023/FeatureServer

In [30]:
# Point url_pt1 at the RCA_Projects_092023 FeatureServer. This overrides the
# RCA_Projects_032022 value assigned earlier in the notebook.
url_pt1 = "https://odpsvcs.dot.ca.gov/arcgis/rest/services/RCA/RCA_Projects_092023/FeatureServer/"

In [31]:
# Query string appended after the layer number: matches every row
# (where=1=1), asks for all fields (outFields=*) and returns GeoJSON.
url_pt2 = "/query?where=1%3D1&objectIds=&time=&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&distance=&units=esriSRUnit_Foot&relationParam=&outFields=*+&returnGeometry=true&maxAllowableOffset=&geometryPrecision=&outSR=&gdbVersion=&historicMoment=&returnDistinctValues=false&returnIdsOnly=false&returnCountOnly=false&returnExtentOnly=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&returnZ=false&returnM=false&multipatchOption=&resultOffset=&resultRecordCount=&returnTrueCurves=false&sqlFormat=none&f=geojson"

In [34]:
# Full query URL for layer 0 of the 092023 service, built from the shared
# URL parts instead of repeating the whole query string.
url1 = f"{url_pt1}0{url_pt2}"

In [28]:
# Full query URL for layer 1 of the 092023 service, built from the shared
# URL parts instead of repeating the whole query string.
url2 = f"{url_pt1}1{url_pt2}"

In [29]:
# Smoke test: read layer 1 of the 092023 service through its full URL.
test_df = gpd.read_file(url2)

In [32]:
# Layer number as a string, used below to check that the URL can be
# assembled from url_pt1 + layer + url_pt2.
test_0 = "0"

In [33]:
# Smoke test: read layer 0 using the URL assembled from its parts.
# Note this rebinds test_df, replacing the layer-1 frame read earlier.
test_df = gpd.read_file(f"{url_pt1}{test_0}{url_pt2}")

In [59]:
# Layer numbers 0-27 as strings, ready to be spliced into the layer URL.
int_to_string = [str(layer_id) for layer_id in range(0, 28)]

In [45]:
# Read every layer listed in int_to_string and stack them into one frame.
# full_gdf is built here from scratch, so the cell no longer depends on a
# value left over in the kernel and can be re-run without appending
# duplicates. The layers are concatenated once rather than inside the loop.
layer_frames = [
    to_snakecase(gpd.read_file(f"{url_pt1}{layer_id}{url_pt2}"))
    for layer_id in int_to_string
]
full_gdf = pd.concat(layer_frames, axis=0)

In [46]:
full_gdf.shape

(21759, 76)

In [54]:
full_gdf.objectid.nunique()

14814

In [63]:
full_gdf.projname.value_counts().head()

2017 Residential Slurry Seal Project    286
Fog Seal                                121
Crack Seal                              120
Citywide Slurry Seal Project            107
ANNUAL STREET RESURFACING                69
Name: projname, dtype: int64

In [65]:
full_gdf[['projname', 'projecttitle']].sample(50)
          

Unnamed: 0,projname,projecttitle
920,,Major Damage Restoration
460,,Pavement - HM1 -SB1
6160,Hazel Dell Road,
598,,Safety - HM4
1135,,Bridge Rail Replacement and Upgrade
1102,,Multiple Objective
1265,Crack Seal,
764,Locust St.,
1106,Fog Seal,
6682,Preventive Maintenance,


In [53]:
full_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 21759 entries, 0 to 51
Data columns (total 76 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   objectid                21759 non-null  float64 
 1   projectid               17269 non-null  object  
 2   projname                8755 non-null   object  
 3   projcatcode             9536 non-null   object  
 4   projcategory            9536 non-null   object  
 5   projprogcode            9536 non-null   object  
 6   projprogram             9536 non-null   object  
 7   multiprogfunded         5 non-null      object  
 8   projstatus              8572 non-null   object  
 9   description             8492 non-null   object  
 10  cost                    2181 non-null   object  
 11  assemblydistrict        9535 non-null   object  
 12  senatedistrict          9535 non-null   object  
 13  assemblycode            9535 non-null   object  
 14  senatecode       

In [47]:
# Keep only rows whose geometry passes is_valid, then renumber the index.
# NOTE(review): the row count falls from 21759 to 13826 here; presumably most
# dropped rows have no geometry at all rather than a malformed one - confirm.
full_gdf2 = full_gdf[full_gdf.geometry.is_valid].reset_index(drop=True)

In [48]:
full_gdf2.shape

(13826, 76)

In [60]:
# Project table
project_url = "https://odpsvcs.dot.ca.gov/arcgis/rest/services/RCA/RCA_Projects_092023/FeatureServer/28/query?where=1%3D1&objectIds=&time=&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&distance=&units=esriSRUnit_Foot&relationParam=&outFields=*+&returnGeometry=true&maxAllowableOffset=&geometryPrecision=&outSR=&gdbVersion=&historicMoment=&returnDistinctValues=false&returnIdsOnly=false&returnCountOnly=false&returnExtentOnly=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&returnZ=false&returnM=false&multipatchOption=&resultOffset=&resultRecordCount=&returnTrueCurves=false&sqlFormat=none&f=geojson"
projct_df = gpd.read_file(project_url)

In [61]:
projct_df.shape

(15384, 37)

In [62]:
projct_df.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 15384 entries, 0 to 15383
Data columns (total 37 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   OBJECTID                15384 non-null  int64   
 1   ProjectID               15384 non-null  object  
 2   ProjectTitle            15384 non-null  object  
 3   Program                 15384 non-null  object  
 4   IIJAProgram             15384 non-null  object  
 5   ProjectDescription      15384 non-null  object  
 6   TotalCost               14392 non-null  float64 
 7   FiscalYear              15384 non-null  object  
 8   SB1Funds                15356 non-null  float64 
 9   IIJAFunds               13729 non-null  float64 
 10  AgencyID                15384 non-null  object  
 11  ImplementingAgency      15384 non-null  object  
 12  AssemblyDistricts       15384 non-null  object  
 13  SenateDistricts         15384 non-null  object  
 14  CongressionalD

In [56]:
# full_gdf2.explore('projname')

In [12]:
def sb1_final() -> gpd.GeoDataFrame:
    """
    Layers 0-21 with geographic information
    don't always have project titles for each
    of the projects. Merge these layers with
    layer 22, which does have title information.

    Returns:
        The geographic layers left-merged with the all-projects layer.
        The title is kept in `projecttitle_x`, a `notes` column is added
        ("<programcodes>/SB1"), and remaining missing values are filled
        with 0.0 in float64 columns and "None" in object columns.
    """
    all_projects_subset = [
        "projecttitle",
        "programcodes",
        "totalcost",
        "implementingagency",
        "fiscalyearcode",
    ]

    # Rows without geometry are returned as well but are not needed here.
    sb1_geo, _ = load_sb1_rest_server()
    sb1_all_projects = load_sb1_all_projects()[all_projects_subset]

    # Merge
    # NOTE(review): the join keys are not guaranteed unique in the
    # all-projects layer, so this left merge may duplicate rows - confirm.
    merge1 = pd.merge(
        sb1_geo,
        sb1_all_projects,
        how="left",
        left_on=["programcodes", "totalcost", "agencies", "fiscalyearcodes"],
        right_on=["programcodes", "totalcost", "implementingagency", "fiscalyearcode"],
    )

    # Fill in missing project titles in sb1_geo with information from
    # sb1_all_projects, then fall back to the string "None".
    merge1["projecttitle_x"] = (
        merge1["projecttitle_x"].fillna(merge1["projecttitle_y"]).fillna("None")
    )

    # Drop columns
    merge1 = merge1.drop(columns=["projecttitle_y"])

    # Add a new column detailing where this information comes from
    merge1 = merge1.assign(notes=merge1.programcodes + "/" + "SB1")

    # Fill what is still missing, per dtype. Only float64 and object
    # columns get a fill value; every other column (geometry included)
    # is left untouched instead of being handed a dtype object as its
    # fill value.
    float_cols = merge1.select_dtypes(include=["float64"]).columns
    object_cols = merge1.select_dtypes(include=["object"]).columns
    fill_values = {
        **dict.fromkeys(float_cols, 0.0),
        **dict.fromkeys(object_cols, "None"),
    }
    if fill_values:
        merge1 = merge1.fillna(fill_values)
    return merge1

In [13]:
sb1_final_gdf = sb1_final()

HTTPError: HTTP Error 500: Internal Server Error

In [None]:
sb1_final_gdf.projectstatuscodes.value_counts()

In [None]:
len(sb1_final_gdf), sb1_final_gdf.geometry.nunique()

In [None]:
sb1_final_gdf.info()

In [None]:
sb1_final_gdf.projecttitle_x.value_counts().head(10)

In [None]:
sb1_final_gdf[sb1_final_gdf.projecttitle_x == "major damage restoration"].groupby(['projecttitle_x', 'countynames']).agg({'objectid':'count'}).head()

In [None]:
sb1_final_gdf.projectstatuscodes.value_counts()

In [None]:
sb1_final_gdf.projectstatuscodes.unique()

In [None]:
sb1_final_gdf.projectstatuses.value_counts()

In [None]:
sb1_final_gdf[(sb1_final_gdf.projecttitle_x == "major damage restoration") & (sb1_final_gdf.countynames == "Alameda")]

In [None]:
sb1_final_gdf[(sb1_final_gdf.projecttitle_x == "major damage restoration") & (sb1_final_gdf.countynames == "Alameda")].plot()

In [None]:
# sb1_final_gdf[sb1_final_gdf.projecttitle_x == "major damage restoration"].explore()