## SB1 
* Open questions
    * What is the difference between `RCA/RCA_Projects_092023 (FeatureServer)` and `RCA/RCA_Projects_102023 (FeatureServer)` listed at https://odpsvcs.dot.ca.gov/arcgis/rest/services/RCA?
    * Which link is the right one for pulling the projects: (1) https://odpsvcs.dot.ca.gov/arcgis/rest/services/SB1/SB1_ProjectData/FeatureServer or (2) https://odpsvcs.dot.ca.gov/arcgis/rest/services/RCA?
        * I think link 1 is the service behind the online SB1 map.
        * It is unclear what link 2 (the RCA services) represents.

In [1]:
import _harmonization_utils
import _sb1_utils
import geopandas as gpd
import numpy as np
import pandas as pd
from calitp_data_analysis import utils
from calitp_data_analysis.sql import to_snakecase

In [2]:
# Notebook-wide pandas display settings: show up to 100 columns, every row,
# untruncated cell text, and floats with two decimal places.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

### Checking for districts

In [3]:
# Base URL of the SB1 project-data FeatureServer; the layer number is appended per request.
sb1_pt1 = "https://odpsvcs.dot.ca.gov/arcgis/rest/services/SB1/SB1_ProjectData/FeatureServer/"
# Query string that asks a layer for every record (where 1=1), all fields, as GeoJSON.
sb1_pt2 = "/query?where=1%3D1&objectIds=&time=&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&distance=&units=esriSRUnit_Foot&relationParam=&outFields=*+&returnGeometry=true&maxAllowableOffset=&geometryPrecision=&outSR=&gdbVersion=&historicMoment=&returnDistinctValues=false&returnIdsOnly=false&returnCountOnly=false&returnExtentOnly=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&returnZ=false&returnM=false&multipatchOption=&resultOffset=&resultRecordCount=&returnTrueCurves=false&sqlFormat=none&f=geojson"

# Read layers 0-15 and stack them with a single concat. Collecting the layers
# in a list first avoids re-copying the accumulated frame on every iteration
# and avoids seeding the result with an empty, non-geo DataFrame.
sb1_layers = [
    to_snakecase(gpd.read_file(f"{sb1_pt1}{layer}{sb1_pt2}")) for layer in range(16)
]
# Row labels are kept as-is (no ignore_index), matching the previous behaviour.
full_gdf = pd.concat(sb1_layers, axis=0)

In [4]:
full_gdf.sample()

Unnamed: 0,objectid,projectid,projname,projcatcode,projcategory,projprogcode,projprogram,multiprogfunded,projstatus,description,cost,assemblydistrict,senatedistrict,assemblycode,senatecode,countyname,cityname,countycode,citycode,appagencyname,impagencyname,popuptitle,popup,geometry,projcount,totalcosts,projlisturl,projlist2url,mponame,mpocode,assetcategory,bphotojpg,bphotohtml,aphotojpg,aphotohtml,routes,constyear,costfull,projagency
40,2349,,,Transit,State Transit,STA,State Transit Assistance,,,,,5,1,|05|,|01|,Placer,Auburn,|PLA|,|AUB|,,,,"<b>Agency:</b><br />City of Auburn<br /><br /><b>Estimated Award Increase (SB1):</b> $32,000<br /><br />",POINT (-121.07499 38.89992),1.0,"$32,000",,,,,,,,,,,,,


### Looking at duplicated projectid & objectid

In [None]:
project_table = _sb1_utils.load_sb1()

In [None]:
# Columns dropped before displaying rows in the lookups below
# (presumably because their values are too long to preview comfortably).
dont_preview = ["geometry", "description"]

In [None]:
project_table.loc[project_table.projectid == "0715000051"].drop(columns=dont_preview)

In [None]:
project_table.loc[project_table.projectid == "0717000060"].drop(columns=dont_preview)

In [None]:
# sb1_df1.loc[sb1_df1.projectid == "0200020286"].drop(columns = dont_preview)

In [None]:
len(project_table)

#### To concat SB1 with the other data sources, I need one row = one project.
* Right now, a project has multiple rows when it is being constructed in more than one area.
* For each project ID, keep the row with the fewest NaNs.

In [None]:
project_table2 = _sb1_utils.one_row_one_project(project_table)

In [None]:
len(project_table2)

In [None]:
project_table.projectid.nunique(), project_table.projname.nunique(), project_table.description.nunique()

In [None]:
project_table2.projectid.nunique(), project_table2.projname.nunique(), project_table2.description.nunique()

In [None]:
    project_table2.projectid.value_counts().head()

In [None]:
project_table2.projectid.nunique(), project_table2.projname.nunique(), project_table2.description.nunique()

In [None]:
project_table.loc[project_table.projectid == "0200020286"].drop(columns=dont_preview)

In [None]:
project_table2.loc[project_table2.projectid == "0200020286"].drop(columns="description")

In [None]:
project_table2.loc[project_table2.projectid == "LppcC1Fy17206507Pp19"].drop(
    columns="description"
)

In [None]:
project_table2.loc[project_table2.projectid == "TcepC1Fy17206249Pp10"].drop(
    columns="description"
)

### Separate out SB1 based on our table schema

In [None]:
def merge(df: pd.DataFrame, subset_cols: list):
    """
    Slice `df` down to `subset_cols` and sanity-check the slice.

    The slice is inner-joined back onto `df` using every subset column as a
    key, and the counts of the resulting `_merge` indicator are displayed.
    The slice itself is returned.
    """
    subset = df[subset_cols]

    joined = pd.merge(df, subset, on=subset_cols, how="inner", indicator=True)
    indicator_counts = joined[["_merge"]].value_counts()
    display(indicator_counts)

    return subset

#### Assembly Districts

In [None]:
assembly_subset = [
    "objectid",
    "projectid",
    "projname",
    "projagency",
    "assemblycode",
    "assemblydistrict",
]

In [None]:
# `sb1_df1` is not assigned anywhere in this notebook, so on a fresh kernel
# this cell (and every later cell that uses it) raises NameError.
# NOTE(review): assumed to be the old name of `project_table` -- the
# commented-out lookup for projectid "0200020286" uses `sb1_df1`, while the
# live version of the same lookup uses `project_table`. Confirm.
sb1_df1 = project_table
sb1_df1.columns

In [None]:
assembly_df = merge(sb1_df1, assembly_subset)

In [None]:
assembly_df.sample(3)

#### Awards

In [None]:
sb1_df1.columns

In [None]:
awards_subset = [
    "objectid",
    "projectid",
    "projname",
    "projagency",
    "projprogcode",
    "projprogram",
    "multiprogfunded",
]

In [None]:
awards_df = merge(sb1_df1, awards_subset)

In [None]:
awards_df.projprogram.value_counts()

#### County

In [None]:
county_subset = [
    "objectid",
    "projectid",
    "projname",
    "projagency",
    "countyname",
    "countycode",
]

In [None]:
county_df = merge(sb1_df1, county_subset)

In [None]:
county_df.sample(3)

#### City

In [None]:
city_subset = [
    "objectid",
    "projectid",
    "projname",
    "projagency",
    "cityname",
    "citycode",
]

In [None]:
city_df = merge(sb1_df1, city_subset)

#### Senate District

In [None]:
sb1_df1.columns

In [None]:
senate_subset = [
    "objectid",
    "projectid",
    "projname",
    "projagency",
    "senatedistrict",
    "senatecode",
]

In [None]:
senate_df = merge(sb1_df1, senate_subset)

#### Geometry

In [None]:
geo_subset = ["objectid", "projectid", "projname", "projagency", "geometry"]

In [None]:
gdf = merge(sb1_df1, geo_subset)

In [None]:
type(gdf)

#### Project Base Table

In [None]:
project_subset = [
    "objectid",
    "projectid",
    "projname",
    "description",
    "cost",
    "totalcosts",
    "routes",
    "costfull",
    "projagency",
    "appagencyname",
    "impagencyname",
]

In [None]:
project_df = sb1_df1[project_subset]

In [None]:
# project_df.sample(100)

#### Phase Funding

In [None]:
phase_subset = [
    "objectid",
    "projectid",
    "projname",
    "projstatus",
    "projagency",
    "constyear",
]

In [None]:
phase_df = merge(sb1_df1, phase_subset)

In [None]:
phase_df.head()

In [None]:
url_pt1 = "https://odpsvcs.dot.ca.gov/arcgis/rest/services/RCA/RCA_Projects_092023/FeatureServer/"

In [None]:
url_pt2 = "/query?where=1%3D1&objectIds=&time=&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&distance=&units=esriSRUnit_Foot&relationParam=&outFields=*+&returnGeometry=true&maxAllowableOffset=&geometryPrecision=&outSR=&gdbVersion=&historicMoment=&returnDistinctValues=false&returnIdsOnly=false&returnCountOnly=false&returnExtentOnly=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&returnZ=false&returnM=false&multipatchOption=&resultOffset=&resultRecordCount=&returnTrueCurves=false&sqlFormat=none&f=geojson"

In [None]:
# Layer numbers 0-27 as strings, used to build one request URL per layer.
int_to_string = [str(layer_number) for layer_number in range(28)]

In [None]:
# Read every layer listed in int_to_string and stack them with a single
# concat (a concat inside the loop re-copies the accumulated frame each time).
# Note: this rebinds `full_gdf`, which an earlier cell used for the SB1 layers.
rca_layers = [
    to_snakecase(gpd.read_file(f"{url_pt1}{layer}{url_pt2}"))
    for layer in int_to_string
]
# Row labels are kept as-is (no ignore_index), matching the previous behaviour.
full_gdf = pd.concat(rca_layers, axis=0)

In [None]:
full_gdf.shape

In [None]:
full_gdf.shape, sb1_df1.shape

#### Compare

In [None]:
full_gdf.columns

In [None]:
full_gdf2 = full_gdf[
    ["projecttitle", "projectdescription", "countycodes", "ct_districts"]
]

In [None]:
sb1_df1.columns

In [None]:
sb1_df2 = sb1_df1[["projname", "projagency", "description", "countycode"]]

In [None]:
pd.merge(
    sb1_df2,
    full_gdf2,
    left_on=["projname", "description", "countycode"],
    right_on=["projecttitle", "projectdescription", "countycodes"],
    how="outer",
    indicator=True,
)[["_merge"]].value_counts()

In [None]:
pd.merge(
    sb1_df2,
    full_gdf2,
    left_on=["projname", "description"],
    right_on=["projecttitle", "projectdescription"],
    how="outer",
    indicator=True,
)[["_merge"]].value_counts()

In [None]:
pd.merge(
    sb1_df2,
    full_gdf2,
    left_on=["projname", "countycode"],
    right_on=["projecttitle", "countycodes"],
    how="outer",
    indicator=True,
)[["_merge"]].value_counts()

In [None]:
# Rebind url_pt1/url_pt2 to the RCA_Projects_032022 FeatureServer.
# url_pt1 pointed at RCA_Projects_092023 in an earlier cell, so anything run
# after this cell that reads these globals (load_sb1_all_projects does)
# queries the 032022 service instead.
url_pt1 = "https://odpsvcs.dot.ca.gov/arcgis/rest/services/RCA/RCA_Projects_032022/FeatureServer/"
# Query string that asks a layer for every record (where 1=1), all fields, as GeoJSON.
url_pt2 = "/query?where=1%3D1&objectIds=&time=&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&distance=&units=esriSRUnit_Foot&relationParam=&outFields=*+&returnGeometry=true&maxAllowableOffset=&geometryPrecision=&outSR=&gdbVersion=&historicMoment=&returnDistinctValues=false&returnIdsOnly=false&returnCountOnly=false&returnExtentOnly=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&returnZ=false&returnM=false&multipatchOption=&resultOffset=&resultRecordCount=&returnTrueCurves=false&sqlFormat=none&f=geojson"

In [None]:
def sb1_basic_cleaning(
    gdf: gpd.GeoDataFrame, project_name_col: str
) -> gpd.GeoDataFrame:
    """
    Perform basic cleaning before joining
    SB1 & Non SHOPP data together.

    Steps:
      1. Truncate the project name at the first "20" (drops a trailing year).
      2. If a "popup" column exists, reduce it to the text that follows the
         first <br /> tag, truncated at "20", lowercased, stripped and with
         punctuation removed.
      3. For every object-dtype column: remove "|", lowercase, remove
         punctuation and strip surrounding whitespace.

    Args:
        gdf: frame to clean. It is modified in place and also returned.
        project_name_col: name of the column holding the project name.

    Returns:
        The same frame, cleaned.
    """
    # Raw string so \w and \s reach the regex engine instead of being treated
    # as (invalid) Python string escapes.
    punctuation = r"[^\w\s]"

    # Some project names contain the year. Remove anything after 20..
    # NOTE(review): this splits on ANY "20", so a name that contains "20" for
    # another reason (e.g. a route number) is truncated as well.
    gdf[project_name_col] = gdf[project_name_col].str.split("20").str[0]

    # Try to extract titles from popups. This runs before the generic cleaning
    # below because that step strips the "<", "/" and ">" characters the split
    # depends on.
    if "popup" in gdf.columns:
        try:
            gdf["popup"] = (
                gdf["popup"]
                # Accept "<br>", "<br/>", "<br />", "<br  />" ... alike.
                .str.replace(r"<br\s*/?>", "<br />", regex=True)
                .str.split("<br />")
                .str[1]
                .str.split("20")
                .str[0]
                .str.lower()
                .str.strip()
                .str.replace(punctuation, "", regex=True)
            )
        except AttributeError:
            # The column holds no string values; leave it untouched.
            pass

    # Get rid of | in object cols
    # https://stackoverflow.com/questions/68152902/extracting-only-object-type-columns-in-a-separate-list-from-a-data-frame-in-pand
    string_cols = gdf.select_dtypes(include=["object"]).columns.to_list()
    for col in string_cols:
        try:
            gdf[col] = (
                gdf[col]
                # "|" is a literal here, not regex alternation.
                .str.replace("|", "", regex=False)
                .str.lower()
                .str.replace(punctuation, "", regex=True)
                .str.strip()
            )
        except AttributeError:
            # Object column without string values: skip it, but keep cleaning
            # the remaining columns instead of abandoning the whole loop.
            continue

    return gdf

In [None]:
def load_sb1_all_projects() -> pd.DataFrame:
    """
    Load in all projects layer of SB1 because it
    contains a value for every row in the
    project title column.

    Reads layer 22 of the service that the notebook-level `url_pt1` /
    `url_pt2` currently point at, snake-cases the column names, drops the
    geometry column and runs `sb1_basic_cleaning` with "projecttitle" as the
    project-name column.

    Returns:
        The cleaned all-projects table without a geometry column.
    """
    all_projects_url = f"{url_pt1}22{url_pt2}"

    all_projects = to_snakecase(gpd.read_file(all_projects_url))

    # No geometry, just drop it
    all_projects = all_projects.drop(columns=["geometry"])

    # Basic cleaning. sb1_basic_cleaning takes exactly (frame, project_name_col)
    # and already cleans every object column, so only the title column is named;
    # passing the other column names positionally raised a TypeError.
    return sb1_basic_cleaning(all_projects, "projecttitle")

In [None]:
def sb1_final() -> gpd.GeoDataFrame:
    """
    Combine the geographic SB1 layers with the all-projects layer.

    The geographic layers (0-21) do not always carry a project title, while
    layer 22 does. The two are left-joined on program codes, total cost,
    agency and fiscal year; missing titles on the geographic side are then
    filled from layer 22, and any title still missing becomes "None".

    A `notes` column (program codes followed by "/SB1") records where the row
    came from. Remaining NaNs are filled using a per-column value derived from
    the column dtype (0.0 for float64, "None" for object).
    """
    title_layer_cols = [
        "projecttitle",
        "programcodes",
        "totalcost",
        "implementingagency",
        "fiscalyearcode",
    ]
    geo_keys = ["programcodes", "totalcost", "agencies", "fiscalyearcodes"]
    title_keys = ["programcodes", "totalcost", "implementingagency", "fiscalyearcode"]

    # The second element returned by the loader is not used here.
    geo_layers, _missing_geo = load_sb1_rest_server()
    title_layer = load_sb1_all_projects()[title_layer_cols]

    combined = pd.merge(
        geo_layers,
        title_layer,
        how="left",
        left_on=geo_keys,
        right_on=title_keys,
    )

    # Prefer the geographic layer's title, fall back to layer 22, then "None".
    combined["projecttitle_x"] = (
        combined["projecttitle_x"].fillna(combined["projecttitle_y"]).fillna("None")
    )
    combined = combined.drop(columns=["projecttitle_y"])

    # Record the data source for each row.
    combined = combined.assign(notes=combined.programcodes + "/" + "SB1")

    # One fill value per column, looked up from that column's dtype.
    fill_by_dtype = combined.dtypes.replace({"float64": 0.0, "object": "None"})
    return combined.fillna(fill_by_dtype)

In [None]:
sb1_final_gdf = sb1_final()

In [None]:
sb1_final_gdf.projectstatuscodes.value_counts()

In [None]:
len(sb1_final_gdf), sb1_final_gdf.geometry.nunique()

In [None]:
sb1_final_gdf.info()

In [None]:
sb1_final_gdf.projecttitle_x.value_counts().head(10)

In [None]:
sb1_final_gdf[sb1_final_gdf.projecttitle_x == "major damage restoration"].groupby(
    ["projecttitle_x", "countynames"]
).agg({"objectid": "count"}).head()

In [None]:
sb1_final_gdf.projectstatuscodes.value_counts()

In [None]:
sb1_final_gdf.projectstatuscodes.unique()

In [None]:
sb1_final_gdf.projectstatuses.value_counts()

In [None]:
sb1_final_gdf[
    (sb1_final_gdf.projecttitle_x == "major damage restoration")
    & (sb1_final_gdf.countynames == "Alameda")
]

In [None]:
sb1_final_gdf[
    (sb1_final_gdf.projecttitle_x == "major damage restoration")
    & (sb1_final_gdf.countynames == "Alameda")
].plot()

In [None]:
# sb1_final_gdf[sb1_final_gdf.projecttitle_x == "major damage restoration"].explore()