## SB1 
* Need to find out
    * What's the difference between RCA/RCA_Projects_092023 (FeatureServer) and RCA/RCA_Projects_102023 (FeatureServer) here https://odpsvcs.dot.ca.gov/arcgis/rest/services/RCA?
    * What's the right link to use to grab the projects: https://odpsvcs.dot.ca.gov/arcgis/rest/services/SB1/SB1_ProjectData/FeatureServer or https://odpsvcs.dot.ca.gov/arcgis/rest/services/RCA?
        * I think link 1 is related to the SB1 map online
        * Unsure what the second link (RCA) covers.

In [1]:
import _harmonization_utils
import geopandas as gpd
import numpy as np
import pandas as pd
from calitp_data_analysis import utils
from calitp_data_analysis.sql import to_snakecase

In [2]:
# Notebook-wide pandas display settings: show up to 100 columns, render
# floats with two decimals, and never truncate rows or cell contents.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

#### Link 1
* SB1 Only
* https://odpsvcs.dot.ca.gov/arcgis/rest/services/SB1/SB1_ProjectData/FeatureServer

In [3]:
def sb1_basic_cleaning(
    gdf: gpd.GeoDataFrame, project_name_col: str, agency_col: str
) -> gpd.GeoDataFrame:
    """
    Perform basic cleaning before joining
    SB1 & Non SHOPP data together.

    Steps, in order:
      1. Truncate the project name at the first occurrence of "20".
      2. For every object-dtype column: drop "|", title-case, strip all
         characters that are neither word characters nor whitespace,
         trim surrounding whitespace and fill missing values with the
         string "None".
      3. Replace "Submitted By" in the agency column with a single space.

    Args:
        gdf: frame to clean. It is not modified; a cleaned copy is returned.
        project_name_col: name of the column holding the project name.
        agency_col: name of the column holding the submitting agency.

    Returns:
        A cleaned copy of ``gdf``.
    """
    # Work on a copy so that re-running a calling cell never cleans
    # already-cleaned data and the caller's frame stays untouched.
    gdf = gdf.copy()

    # Some project names contain the year. Remove anything after 20..
    # NOTE(review): this cuts at the first "20" anywhere in the name, so
    # names such as "Route 120" are truncated as well.
    gdf[project_name_col] = gdf[project_name_col].str.split("20").str[0]

    # Get rid of | in object cols
    # https://stackoverflow.com/questions/68152902/extracting-only-object-type-columns-in-a-separate-list-from-a-data-frame-in-pand
    string_cols = gdf.select_dtypes(include=["object"]).columns.to_list()
    for col in string_cols:
        try:
            cleaned = (
                gdf[col]
                # Literal pipe: regex=False keeps "|" from being read as
                # a regex alternation.
                .str.replace("|", "", regex=False)
                .str.title()
                # Explicit regex=True so the character class keeps working
                # on pandas versions where the default is literal matching.
                .str.replace(r"[^\w\s]", "", regex=True)
                .str.strip()
            )
        except AttributeError:
            # The .str accessor rejects object columns that do not hold
            # strings. Skip only that column instead of silently
            # abandoning the cleaning of every remaining column.
            continue
        gdf[col] = cleaned.fillna("None")

    # Project agency always says "Los Angeles submitted by county"
    # or "Fremont submitted by city." Remove submitted by.
    # (Values are title-cased above, hence "Submitted By".)
    gdf[agency_col] = gdf[agency_col].str.replace("Submitted By", " ", regex=False)

    return gdf

In [4]:
def load_sb1(n_layers: int = 16) -> gpd.GeoDataFrame:
    """
    Load all the projects on the SB1 map from the Feature Server.

    https://odpsvcs.dot.ca.gov/arcgis/rest/services/SB1/SB1_ProjectData/FeatureServer

    Each layer ``0 .. n_layers - 1`` is queried as GeoJSON, its columns are
    snake-cased, and the layers are stacked into one frame. The stacked
    frame is run through ``sb1_basic_cleaning``, two geometry diagnostics
    are printed, and rows whose ``projstatus`` is "Completed" are dropped.

    Args:
        n_layers: number of consecutive layers to read, starting at 0.
            Defaults to 16, the value previously hard-coded here.

    Returns:
        The non-completed projects from all layers, with a fresh
        0..n-1 index.
    """
    sb1_pt1 = "https://odpsvcs.dot.ca.gov/arcgis/rest/services/SB1/SB1_ProjectData/FeatureServer/"
    sb1_pt2 = "/query?where=1%3D1&objectIds=&time=&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&distance=&units=esriSRUnit_Foot&relationParam=&outFields=*+&returnGeometry=true&maxAllowableOffset=&geometryPrecision=&outSR=&gdbVersion=&historicMoment=&returnDistinctValues=false&returnIdsOnly=false&returnCountOnly=false&returnExtentOnly=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&returnZ=false&returnM=false&multipatchOption=&resultOffset=&resultRecordCount=&returnTrueCurves=false&sqlFormat=none&f=geojson"

    # Read every layer first and concatenate once: growing a frame inside
    # the loop re-copies all earlier rows on each iteration, and seeding it
    # with an empty pd.DataFrame is unnecessary.
    layers = [
        to_snakecase(gpd.read_file(f"{sb1_pt1}{layer_id}{sb1_pt2}"))
        for layer_id in range(n_layers)
    ]
    full_gdf = pd.concat(layers, axis=0, ignore_index=True)

    # Basic cleaning
    full_gdf = sb1_basic_cleaning(
        full_gdf,
        "projname",
        "projagency",
    )

    # Find missing geo. A geometry can be missing either as an empty shape
    # or as a null value; is_empty alone does not flag nulls.
    geometry = full_gdf.geometry
    is_missing = geometry.isna() | geometry.is_empty
    print(f"{int(is_missing.sum())} rows are mising geometry")

    # Find invalid geo. Null geometries are reported as not valid by
    # is_valid, so exclude the rows already counted as missing to avoid
    # reporting them twice.
    is_invalid = ~geometry.is_valid & ~is_missing
    print(f"{int(is_invalid.sum())} rows contain invalid geography")

    # Filter out completed
    full_gdf = full_gdf.loc[full_gdf.projstatus != "Completed"].reset_index(drop=True)
    return full_gdf

In [5]:
sb1_df1 = load_sb1()

  gdf[i] = gdf[i].str.replace("|", "").str.title().str.replace("[^\w\s]", "").str.strip()
  gdf[i] = gdf[i].str.replace("|", "").str.title().str.replace("[^\w\s]", "").str.strip()


0 rows are mising geometry
7917 rows contain invalid geography


In [6]:
sb1_df1.projname.nunique()

5278

In [7]:
sb1_df1.projectid.nunique()

7603

In [8]:
len(sb1_df1)

9186

In [9]:
sb1_df1.projstatus.value_counts()

PreConstruction    7093
Construction       1129
None                964
Name: projstatus, dtype: int64

In [10]:
sb1_df1.projprogram.value_counts()

201718 Local Streets And Roads                3814
201819 Local Streets And Roads                2295
State Highway Projects                        1089
Local Streets And Roads                        539
State Of Good Repair                           429
State Transit Assistance                       370
Active Transportation Program                  318
Local Partnership Program                      174
Transit And Intercity Rail Capital Program      60
Trade Corridor Enhancement Program              57
State Rail Assistance                           23
Solutions For Congested Corridors Program       17
Solutions To Congested Corridors Program         1
Name: projprogram, dtype: int64

In [12]:
sb1_df1.objectid.nunique(), len(sb1_df1)

(9007, 9186)

### Separate out SB1 based on our table schema

In [13]:
def merge(df: pd.DataFrame, subset_cols: list):
    """
    Slice ``subset_cols`` out of ``df`` and sanity-check the slice.

    The slice is inner-joined back onto ``df`` on all of ``subset_cols``
    and the counts of the ``_merge`` indicator are displayed, so the
    reader can see how many rows matched.

    Args:
        df: the full frame.
        subset_cols: columns to keep; also used as the join keys.

    Returns:
        ``df[subset_cols]``.
    """
    subset = df[subset_cols]

    # Join the slice back to its source and tally the indicator column.
    joined = pd.merge(df, subset, on=subset_cols, how="inner", indicator=True)
    indicator_counts = joined[["_merge"]].value_counts()
    display(indicator_counts)

    return subset

#### Assembly Districts

In [14]:
assembly_subset = [
    "objectid",
    "projectid",
    "projname",
    "projagency",
    "assemblycode",
    "assemblydistrict",
]

In [37]:
sb1_df1.columns

Index(['objectid', 'projectid', 'projname', 'projcatcode', 'projcategory',
       'projprogcode', 'projprogram', 'multiprogfunded', 'projstatus',
       'description', 'cost', 'assemblydistrict', 'senatedistrict',
       'assemblycode', 'senatecode', 'countyname', 'cityname', 'countycode',
       'citycode', 'appagencyname', 'impagencyname', 'popuptitle', 'popup',
       'geometry', 'projcount', 'totalcosts', 'projlisturl', 'projlist2url',
       'mponame', 'mpocode', 'assetcategory', 'bphotojpg', 'bphotohtml',
       'aphotojpg', 'aphotohtml', 'routes', 'constyear', 'costfull',
       'projagency'],
      dtype='object')

In [15]:
assembly_df = merge(sb1_df1, assembly_subset)

_merge    
both          9186
left_only        0
right_only       0
dtype: int64

In [16]:
assembly_df.sample(3)

Unnamed: 0,objectid,projectid,projname,projagency,assemblycode,assemblydistrict
1000,2117,1016000053,San Joaquin 4 5 99 Changeable Message Signs Mobility Project,,13,13
1736,165871,LppfC1Fy17196406Pp17,MarinSonoma Narrows Design Contracts B1Ph1 And A4,Transportation Authority Marin County,10,10
6255,170679,LsrFy18195147Pp1546,9Th Street Rehabilitation Originally Listed In Fy,Upland City,41,41


#### Awards

In [38]:
sb1_df1.columns

Index(['objectid', 'projectid', 'projname', 'projcatcode', 'projcategory',
       'projprogcode', 'projprogram', 'multiprogfunded', 'projstatus',
       'description', 'cost', 'assemblydistrict', 'senatedistrict',
       'assemblycode', 'senatecode', 'countyname', 'cityname', 'countycode',
       'citycode', 'appagencyname', 'impagencyname', 'popuptitle', 'popup',
       'geometry', 'projcount', 'totalcosts', 'projlisturl', 'projlist2url',
       'mponame', 'mpocode', 'assetcategory', 'bphotojpg', 'bphotohtml',
       'aphotojpg', 'aphotohtml', 'routes', 'constyear', 'costfull',
       'projagency'],
      dtype='object')

In [17]:
awards_subset = [
    "objectid",
    "projectid",
    "projname",
    "projagency",
    "projprogcode",
    "projprogram",
    "multiprogfunded",
]

In [18]:
awards_df = merge(sb1_df1, awards_subset)

_merge    
both          9186
left_only        0
right_only       0
dtype: int64

In [19]:
awards_df.projprogram.value_counts()

201718 Local Streets And Roads                3814
201819 Local Streets And Roads                2295
State Highway Projects                        1089
Local Streets And Roads                        539
State Of Good Repair                           429
State Transit Assistance                       370
Active Transportation Program                  318
Local Partnership Program                      174
Transit And Intercity Rail Capital Program      60
Trade Corridor Enhancement Program              57
State Rail Assistance                           23
Solutions For Congested Corridors Program       17
Solutions To Congested Corridors Program         1
Name: projprogram, dtype: int64

#### County

In [20]:
county_subset = [
    "objectid",
    "projectid",
    "projname",
    "projagency",
    "countyname",
    "countycode",
]

In [21]:
county_df = merge(sb1_df1, county_subset)

_merge    
both          9186
left_only        0
right_only       0
dtype: int64

In [22]:
county_df.sample(3)

Unnamed: 0,objectid,projectid,projname,projagency,countyname,countycode
1477,4440,,,,Orange,Ora
2705,166997,LsrFy17185160Pp179,Crack Seal,Los Banos City,Merced,Mer
3868,168205,LsrFy17185386Pp001,,Clayton City,Contra Costa,Cc


#### City

In [23]:
city_subset = [
    "objectid",
    "projectid",
    "projname",
    "projagency",
    "cityname",
    "citycode",
]

In [24]:
city_df = merge(sb1_df1, city_subset)

_merge    
both          9186
left_only        0
right_only       0
dtype: int64

#### Senate District

In [25]:
sb1_df1.columns

Index(['objectid', 'projectid', 'projname', 'projcatcode', 'projcategory',
       'projprogcode', 'projprogram', 'multiprogfunded', 'projstatus',
       'description', 'cost', 'assemblydistrict', 'senatedistrict',
       'assemblycode', 'senatecode', 'countyname', 'cityname', 'countycode',
       'citycode', 'appagencyname', 'impagencyname', 'popuptitle', 'popup',
       'geometry', 'projcount', 'totalcosts', 'projlisturl', 'projlist2url',
       'mponame', 'mpocode', 'assetcategory', 'bphotojpg', 'bphotohtml',
       'aphotojpg', 'aphotohtml', 'routes', 'constyear', 'costfull',
       'projagency'],
      dtype='object')

In [26]:
senate_subset = [
    "objectid",
    "projectid",
    "projname",
    "projagency",
    "senatedistrict",
    "senatecode",
]

In [27]:
senate_df = merge(sb1_df1, senate_subset)

_merge    
both          9186
left_only        0
right_only       0
dtype: int64

#### Geometry

In [28]:
geo_subset = ["objectid", "projectid", "projname", "projagency", "geometry"]

In [29]:
gdf = merge(sb1_df1, geo_subset)

_merge    
both          9186
left_only        0
right_only       0
dtype: int64

In [30]:
type(gdf)

geopandas.geodataframe.GeoDataFrame

#### Project Base Table

In [31]:
project_subset = [
    "objectid",
    "projectid",
    "projname",
    "description",
    "cost",
    "totalcosts",
    "routes",
    "costfull",
    "projagency",
    "appagencyname",
    "impagencyname",
]

In [32]:
project_df = sb1_df1[project_subset]

In [33]:
# project_df.sample(100)

#### Phase Funding

In [34]:
phase_subset = [
    "objectid",
    "projectid",
    "projname",
    "projstatus",
    "projagency",
    "constyear",
]

In [35]:
phase_df = merge(sb1_df1, phase_subset)

_merge    
both          9186
left_only        0
right_only       0
dtype: int64

In [36]:
# Intentional: `stop` is an undefined name, so this cell raises NameError and
# halts "Run All" here. Everything below is scratch / legacy code that is not
# expected to run top-to-bottom.
stop

NameError: name 'stop' is not defined

### Scratch
#### Link 2
* This includes IIJA
* https://odpsvcs.dot.ca.gov/arcgis/rest/services/RCA/RCA_Projects_092023/FeatureServer
* Why are there fewer projects here than in the SB1 data above?

In [None]:
url_pt1 = "https://odpsvcs.dot.ca.gov/arcgis/rest/services/RCA/RCA_Projects_092023/FeatureServer/"

In [None]:
url_pt2 = "/query?where=1%3D1&objectIds=&time=&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&distance=&units=esriSRUnit_Foot&relationParam=&outFields=*+&returnGeometry=true&maxAllowableOffset=&geometryPrecision=&outSR=&gdbVersion=&historicMoment=&returnDistinctValues=false&returnIdsOnly=false&returnCountOnly=false&returnExtentOnly=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&returnZ=false&returnM=false&multipatchOption=&resultOffset=&resultRecordCount=&returnTrueCurves=false&sqlFormat=none&f=geojson"

In [None]:
# Layer ids 0-27 of the RCA FeatureServer, as strings for URL building.
int_to_string = [str(layer_id) for layer_id in range(28)]

In [None]:
# Read every RCA layer, snake-case its columns, then concatenate once.
# Growing the frame inside the loop re-copies all earlier rows on each
# iteration, and seeding it with an empty pd.DataFrame is unnecessary.
full_gdf = pd.concat(
    [to_snakecase(gpd.read_file(f"{url_pt1}{i}{url_pt2}")) for i in int_to_string],
    axis=0,
)

In [None]:
full_gdf.shape

In [None]:
full_gdf.shape, sb1_df1.shape

#### Compare

In [None]:
full_gdf.columns

In [None]:
full_gdf2 = full_gdf[
    ["projecttitle", "projectdescription", "countycodes", "ct_districts"]
]

In [None]:
sb1_df1.columns

In [None]:
sb1_df2 = sb1_df1[["projname", "projagency", "description", "countycode"]]

In [None]:
# Outer-join SB1 (link 1) against RCA (link 2) on name + description + county
# code and count how many rows fall in each side of the join.
# NOTE(review): sb1_df1 went through sb1_basic_cleaning (title-cased,
# punctuation stripped, "None" for missing) while full_gdf is only
# snake-cased, so these keys are formatted differently on each side and
# few exact matches should be expected - confirm before drawing conclusions.
pd.merge(
    sb1_df2,
    full_gdf2,
    left_on=["projname", "description", "countycode"],
    right_on=["projecttitle", "projectdescription", "countycodes"],
    how="outer",
    indicator=True,
)[["_merge"]].value_counts()

In [None]:
# Same outer-join comparison of SB1 vs RCA, relaxed to name + description only.
# NOTE(review): the SB1 side was cleaned by sb1_basic_cleaning and the RCA
# side was not, so exact string matches may be rare - confirm.
pd.merge(
    sb1_df2,
    full_gdf2,
    left_on=["projname", "description"],
    right_on=["projecttitle", "projectdescription"],
    how="outer",
    indicator=True,
)[["_merge"]].value_counts()

In [None]:
# Same outer-join comparison of SB1 vs RCA, this time on name + county code.
# NOTE(review): the SB1 side was cleaned by sb1_basic_cleaning (e.g. county
# codes are title-cased) and the RCA side was not - confirm the keys are
# comparable.
pd.merge(
    sb1_df2,
    full_gdf2,
    left_on=["projname", "countycode"],
    right_on=["projecttitle", "countycodes"],
    how="outer",
    indicator=True,
)[["_merge"]].value_counts()

### Original functions

In [None]:
url_pt1 = "https://odpsvcs.dot.ca.gov/arcgis/rest/services/RCA/RCA_Projects_032022/FeatureServer/"
url_pt2 = "/query?where=1%3D1&objectIds=&time=&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&distance=&units=esriSRUnit_Foot&relationParam=&outFields=*+&returnGeometry=true&maxAllowableOffset=&geometryPrecision=&outSR=&gdbVersion=&historicMoment=&returnDistinctValues=false&returnIdsOnly=false&returnCountOnly=false&returnExtentOnly=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&returnZ=false&returnM=false&multipatchOption=&resultOffset=&resultRecordCount=&returnTrueCurves=false&sqlFormat=none&f=geojson"

In [None]:
def sb1_basic_cleaning(
    gdf: gpd.GeoDataFrame, project_name_col: str
) -> gpd.GeoDataFrame:
    """
    Perform basic cleaning before joining
    SB1 & Non SHOPP data together.

    NOTE(review): this re-defines a function of the same name that appears
    earlier in the notebook with an extra ``agency_col`` parameter; running
    this cell shadows that earlier version.

    Steps, in order:
      1. Truncate the project name at the first occurrence of "20".
      2. If a ``popup`` column exists, replace it with the text between
         the first and second "<br  />", truncated at the first "20",
         lower-cased, trimmed and stripped of punctuation. Rows without
         a "<br  />" end up missing.
      3. For every object-dtype column: drop "|", lower-case, strip all
         characters that are neither word characters nor whitespace and
         trim surrounding whitespace.

    Args:
        gdf: frame to clean. It is not modified; a cleaned copy is returned.
        project_name_col: name of the column holding the project name.

    Returns:
        A cleaned copy of ``gdf``.
    """
    # Work on a copy so the caller's frame is left untouched.
    gdf = gdf.copy()

    # Some project names contain the year. Remove anything after 20..
    gdf[project_name_col] = gdf[project_name_col].str.split("20").str[0]

    # Try to extract titles from popups.
    # This previously referenced an undefined `df` (hidden by a bare
    # except) and ran after punctuation had been stripped, by which point
    # the "<br  />" delimiter no longer existed. It now operates on `gdf`
    # and runs on the raw popup text.
    if "popup" in gdf.columns:
        try:
            gdf["popup"] = (
                gdf["popup"]
                .str.split("<br  />")
                .str[1]
                .str.split("20")
                .str[0]
                .str.lower()
                .str.strip()
                .str.replace(r"[^\w\s]", "", regex=True)
            )
        except AttributeError:
            # popup does not hold strings; leave it as it is.
            pass

    # Get rid of | in object cols
    # https://stackoverflow.com/questions/68152902/extracting-only-object-type-columns-in-a-separate-list-from-a-data-frame-in-pand
    string_cols = gdf.select_dtypes(include=["object"]).columns.to_list()
    for col in string_cols:
        try:
            gdf[col] = (
                gdf[col]
                # Literal pipe, not a regex alternation.
                .str.replace("|", "", regex=False)
                .str.lower()
                # Explicit regex=True so the character class keeps working
                # on pandas versions where the default is literal matching.
                .str.replace(r"[^\w\s]", "", regex=True)
                .str.strip()
            )
        except AttributeError:
            # The .str accessor rejects object columns that do not hold
            # strings. Skip only that column rather than abandoning the
            # cleaning of every remaining column.
            continue

    return gdf

In [None]:
def load_sb1_all_projects() -> pd.DataFrame:
    """
    Load in all projects layer of SB1 because it
    contains a value for every row in the
    project title column.

    Reads layer 22 of the feature server addressed by the notebook-level
    ``url_pt1`` / ``url_pt2``, snake-cases the columns, drops the geometry
    column and applies ``sb1_basic_cleaning`` to the result.

    Returns:
        The cleaned layer without its geometry column.
    """
    # Keep the URL and the frame under separate names instead of reusing
    # `df` for both a string and a DataFrame.
    layer_url = f"{url_pt1}22{url_pt2}"

    df = to_snakecase(gpd.read_file(layer_url))

    # No geometry, just drop it
    df = df.drop(columns=["geometry"])

    # Basic cleaning
    # The previous call passed five positional arguments, but the
    # sb1_basic_cleaning defined directly above accepts only the frame and
    # the project-name column, so the call raised TypeError.
    df = sb1_basic_cleaning(df, "projecttitle")

    return df

In [None]:
def sb1_final() -> gpd.GeoDataFrame:
    """
    Combine the geographic SB1 layers with the all-projects layer.

    Layers 0-21 with geographic information
    don't always have project titles for each
    of the projects. Merge these layers with
    layer 22, which does have title information.

    NOTE(review): relies on ``load_sb1_rest_server``, which is not defined
    in the visible part of this notebook - confirm where it comes from.

    Returns:
        The left-merged frame with ``projecttitle_x`` back-filled from the
        all-projects layer, a ``notes`` column added and remaining missing
        values filled.
    """
    title_lookup_cols = [
        "projecttitle",
        "programcodes",
        "totalcost",
        "implementingagency",
        "fiscalyearcode",
    ]

    # The second element returned by the loader is not used here.
    sb1_geo, _ = load_sb1_rest_server()
    title_lookup = load_sb1_all_projects()[title_lookup_cols]

    # Left join so every geographic row is kept.
    merged = sb1_geo.merge(
        title_lookup,
        how="left",
        left_on=["programcodes", "totalcost", "agencies", "fiscalyearcodes"],
        right_on=["programcodes", "totalcost", "implementingagency", "fiscalyearcode"],
    )

    # Prefer the title from the geographic layer, fall back to the one from
    # the all-projects layer, and finally to the string "None".
    merged.projecttitle_x = merged.projecttitle_x.fillna(
        merged.projecttitle_y
    ).fillna("None")

    # The fallback column is no longer needed.
    merged = merged.drop(columns=["projecttitle_y"])

    # Record where this information comes from.
    merged = merged.assign(notes=merged.programcodes + "/" + "SB1")

    # Per-column fill values derived from each column's dtype.
    fill_values = merged.dtypes.replace({"float64": 0.0, "object": "None"})
    return merged.fillna(fill_values)

In [None]:
sb1_final_gdf = sb1_final()

In [None]:
sb1_final_gdf.projectstatuscodes.value_counts()

In [None]:
len(sb1_final_gdf), sb1_final_gdf.geometry.nunique()

In [None]:
sb1_final_gdf.info()

In [None]:
sb1_final_gdf.projecttitle_x.value_counts().head(10)

In [None]:
sb1_final_gdf[sb1_final_gdf.projecttitle_x == "major damage restoration"].groupby(
    ["projecttitle_x", "countynames"]
).agg({"objectid": "count"}).head()

In [None]:
sb1_final_gdf.projectstatuscodes.value_counts()

In [None]:
sb1_final_gdf.projectstatuscodes.unique()

In [None]:
sb1_final_gdf.projectstatuses.value_counts()

In [None]:
sb1_final_gdf[
    (sb1_final_gdf.projecttitle_x == "major damage restoration")
    & (sb1_final_gdf.countynames == "Alameda")
]

In [None]:
sb1_final_gdf[
    (sb1_final_gdf.projecttitle_x == "major damage restoration")
    & (sb1_final_gdf.countynames == "Alameda")
].plot()

In [None]:
# sb1_final_gdf[sb1_final_gdf.projecttitle_x == "major damage restoration"].explore()