## SB1 
* Need to find out
    * What's the difference between RCA/RCA_Projects_092023 (FeatureServer) and RCA/RCA_Projects_102023 (FeatureServer) here https://odpsvcs.dot.ca.gov/arcgis/rest/services/RCA?
    * What's the right link to use to grab the projects: https://odpsvcs.dot.ca.gov/arcgis/rest/services/SB1/SB1_ProjectData/FeatureServer or https://odpsvcs.dot.ca.gov/arcgis/rest/services/RCA?
        * I think link 1 is related to the SB1 map online
        * Unsure what the second map is about?

In [23]:
import _harmonization_utils
import geopandas as gpd
import numpy as np
import pandas as pd
from calitp_data_analysis import utils
from calitp_data_analysis.sql import to_snakecase

In [24]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

### Link 1
* SB1 Only
* https://odpsvcs.dot.ca.gov/arcgis/rest/services/SB1/SB1_ProjectData/FeatureServer

In [25]:
def sb1_basic_cleaning(
    gdf: gpd.GeoDataFrame, project_name_col: str, agency_col: str
) -> gpd.GeoDataFrame:
    """
    Perform basic cleaning before joining
    SB1 & Non SHOPP data together.
    """
    # Some project names contain the year. Remove anything after 20..
    gdf[project_name_col] = gdf[project_name_col].str.split("20").str[0]

    # Get rid of | in object cols
    # https://stackoverflow.com/questions/68152902/extracting-only-object-type-columns-in-a-separate-list-from-a-data-frame-in-pand
    string_cols = gdf.select_dtypes(include=["object"]).columns.to_list()
    try:
        for i in string_cols:
            gdf[i] = (
                gdf[i]
                .str.replace("|", "")
                .str.title()
                .str.replace("[^\w\s]", "")
                .str.strip()
            )
            gdf[i] = gdf[i].fillna("None")
    except:
        pass

    # Project agency always says "Los Angeles submitted by county"
    # or "Fremont submitted by city." Remove submitted by.
    gdf[agency_col] = gdf[agency_col].str.replace("Submitted By", " ")

    return gdf

In [26]:
def load_sb1() -> gpd.GeoDataFrame:
    """
    Load all the projects on the SB1 map from the Feature Server.

    https://odpsvcs.dot.ca.gov/arcgis/rest/services/SB1/SB1_ProjectData/FeatureServer
    """
    sb1_pt1 = "https://odpsvcs.dot.ca.gov/arcgis/rest/services/SB1/SB1_ProjectData/FeatureServer/"
    sb1_pt2 = "/query?where=1%3D1&objectIds=&time=&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&distance=&units=esriSRUnit_Foot&relationParam=&outFields=*+&returnGeometry=true&maxAllowableOffset=&geometryPrecision=&outSR=&gdbVersion=&historicMoment=&returnDistinctValues=false&returnIdsOnly=false&returnCountOnly=false&returnExtentOnly=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&returnZ=false&returnM=false&multipatchOption=&resultOffset=&resultRecordCount=&returnTrueCurves=false&sqlFormat=none&f=geojson"

    full_gdf = pd.DataFrame()
    for i in list(map(str, [*range(0, 16)])):
        df = to_snakecase(gpd.read_file(f"{sb1_pt1}{i}{sb1_pt2}"))
        full_gdf = pd.concat([full_gdf, df], axis=0)

    # Basic cleaning
    full_gdf = sb1_basic_cleaning(
        full_gdf,
        "projname",
        "projagency",
    )

    # Find missing geo
    missing_geo = full_gdf[(full_gdf.geometry.is_empty)]
    print(f"{len(missing_geo)} rows are mising geometry")

    # Find invalid geo
    invalid_geo = full_gdf[~full_gdf.geometry.is_valid].reset_index(drop=True)
    print(f"{len(invalid_geo)} rows contain invalid geography")

    # Filter out completed
    full_gdf = full_gdf.loc[full_gdf.projstatus != "Completed"].reset_index(drop=True)
    
    # Cols to keep
    keep = ['projectid', 'projname', 'projcatcode', 'projcategory',
       'projprogcode', 'projprogram', 'multiprogfunded', 'projstatus',
       'description', 'cost', 'assemblydistrict', 'senatedistrict',
       'assemblycode', 'senatecode', 'countyname', 'cityname', 'countycode',
       'citycode', 'appagencyname', 'impagencyname',
       'geometry','totalcosts',
        'routes', 'constyear', 'costfull',
       'projagency']
    
    full_gdf = full_gdf[keep] 
    return full_gdf

In [27]:
sb1_df1 = load_sb1()

  gdf[i]
  gdf[i]


0 rows are mising geometry
7917 rows contain invalid geography


In [28]:
sb1_df1.projname.nunique()

5278

In [29]:
sb1_df1.projectid.nunique()

7603

### Looking at duplicated projectid & objectid

In [30]:
len(sb1_df1.drop_duplicates())

9186

In [31]:
sb1_df1.projectid.value_counts().head(5)

None          964
0715000051      3
0716000346      3
0717000054      3
0717000060      3
Name: projectid, dtype: int64

In [32]:
sb1_df1.projectid.value_counts().describe()

count   7603.00
mean       1.21
std       11.05
min        1.00
25%        1.00
50%        1.00
75%        1.00
max      964.00
Name: projectid, dtype: float64

#### Testing dropping duplicates

In [33]:
sb1_df1.columns

Index(['projectid', 'projname', 'projcatcode', 'projcategory', 'projprogcode',
       'projprogram', 'multiprogfunded', 'projstatus', 'description', 'cost',
       'assemblydistrict', 'senatedistrict', 'assemblycode', 'senatecode',
       'countyname', 'cityname', 'countycode', 'citycode', 'appagencyname',
       'impagencyname', 'geometry', 'totalcosts', 'routes', 'constyear',
       'costfull', 'projagency'],
      dtype='object')

In [34]:
df1 = sb1_df1.sort_values(['projectid','projname','description'])

In [35]:
df1 = sb1_df1.sort_values(['totalcosts', 'cost', 'costfull'], ascending = [False, False, False])

In [36]:
df1['costfull'] = df1.groupby(['projectid','projname','description'])['costfull'].ffill()

In [37]:
df1['totalcosts'] = df1.groupby(['projectid','projname','description'])['totalcosts'].ffill()

In [38]:
df1['costfull'] = df1.groupby(['projectid','projname','description'])['costfull'].ffill()

In [39]:
df1 = df1.sort_values(['projagency'], ascending = [True])

In [40]:
df1.projagency = df1.projagency.replace(regex='None', value=np.nan)

In [48]:
df1['projagency'] = df1.groupby(['projectid','projname','description'])['projagency'].ffill()

In [49]:
dont_preview = ['description','geometry']

In [50]:
df1.loc[df1.projectid == "0717000060"].drop(columns = dont_preview)

Unnamed: 0,projectid,projname,projcatcode,projcategory,projprogcode,projprogram,multiprogfunded,projstatus,cost,assemblydistrict,senatedistrict,assemblycode,senatecode,countyname,cityname,countycode,citycode,appagencyname,impagencyname,totalcosts,routes,constyear,costfull,projagency
9025,717000060,Los Angeles 2 5 91 405 Upgrade Traffic Management System Mobility Project,State,State,Shp,State Highway Projects,,PreConstruction,24700000,43 51 57 58 62 63 64 65 66,24 25 26 29 32 33 35,435157586263646566,24252629323335,Los Angeles,,La,,,,,,,24697000.0,Caltrans
941,717000060,Los Angeles 2 5 91 405 Upgrade Traffic Management System Mobility Project,State,State,Shp,State Highway Projects,,PreConstruction,24700000,43 51 57 58 62 63 64 65 66,24 25 26 29 32 33 35,435157586263646566,24252629323335,Los Angeles,,La,,,,,2591405.0,,24697000.0,Caltrans
377,717000060,Los Angeles 2 5 91 405 Upgrade Traffic Management System Mobility Project,State,State,Shp,State Highway Projects,,PreConstruction,24700000,43 51 57 58 62 63 64 65 66,24 25 26 29 32 33 35,435157586263646566,24252629323335,Los Angeles,,La,,,,,2591405.0,,24697000.0,Caltrans


In [62]:
df1 = df1.replace(regex='None', value=np.nan)

In [63]:
df1['geom_type'] = df1.geometry.geom_type

In [64]:
df1 = df1.sort_values(['geom_type']).reset_index(drop = True)

In [65]:
drop_dups = ['projectid', 'projname', 'projcatcode', 'projcategory', 'projprogcode',
       'projprogram', 'multiprogfunded', 'projstatus', 'description', 'cost',
       'assemblydistrict', 'senatedistrict', 'assemblycode', 'senatecode',
       'countyname', 'cityname', 'countycode', 'citycode', 'appagencyname',
       'impagencyname', 'totalcosts', 'routes', 'constyear',
       'costfull', 'projagency', ]

In [66]:
len(df1.drop_duplicates(subset = drop_dups))

9147

In [67]:
len(df1)

9186

In [68]:
df2 = df1.drop_duplicates(subset = drop_dups)

In [69]:
df2['number_of_nans'] = df2.isnull().sum(axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [74]:
df2 = df2.sort_values(['number_of_nans']).reset_index(drop = True)

In [75]:
df2.loc[df2.projectid == "0717000060"].drop(columns = dont_preview)

Unnamed: 0,projectid,projname,projcatcode,projcategory,projprogcode,projprogram,multiprogfunded,projstatus,cost,assemblydistrict,senatedistrict,assemblycode,senatecode,countyname,cityname,countycode,citycode,appagencyname,impagencyname,totalcosts,routes,constyear,costfull,projagency,geom_type,number_of_nans
521,717000060,Los Angeles 2 5 91 405 Upgrade Traffic Management System Mobility Project,State,State,Shp,State Highway Projects,,PreConstruction,24700000,43 51 57 58 62 63 64 65 66,24 25 26 29 32 33 35,435157586263646566,24252629323335,Los Angeles,,La,,,,,2591405.0,,24697000.0,Caltrans,LineString,5
6125,717000060,Los Angeles 2 5 91 405 Upgrade Traffic Management System Mobility Project,State,State,Shp,State Highway Projects,,PreConstruction,24700000,43 51 57 58 62 63 64 65 66,24 25 26 29 32 33 35,435157586263646566,24252629323335,Los Angeles,,La,,,,,,,24697000.0,Caltrans,,10


### Keep the row with geo
* How to keep the same project with geographies for 1+ row but not in another
* 

In [80]:
drop_dups = ['projectid', 'projname', 'projstatus', 'description', 'cost',
        'totalcosts', 
       'costfull', 'projagency', ]

In [81]:
df3 = df2.drop_duplicates(subset = drop_dups)

In [82]:
df3.loc[df3.projectid == "0717000060"].drop(columns = dont_preview)

Unnamed: 0,projectid,projname,projcatcode,projcategory,projprogcode,projprogram,multiprogfunded,projstatus,cost,assemblydistrict,senatedistrict,assemblycode,senatecode,countyname,cityname,countycode,citycode,appagencyname,impagencyname,totalcosts,routes,constyear,costfull,projagency,geom_type,number_of_nans
521,717000060,Los Angeles 2 5 91 405 Upgrade Traffic Management System Mobility Project,State,State,Shp,State Highway Projects,,PreConstruction,24700000,43 51 57 58 62 63 64 65 66,24 25 26 29 32 33 35,435157586263646566,24252629323335,Los Angeles,,La,,,,,2591405,,24697000.0,Caltrans,LineString,5


In [84]:
df3.loc[df3.projectid == "0715000051"].drop(columns = dont_preview)

Unnamed: 0,projectid,projname,projcatcode,projcategory,projprogcode,projprogram,multiprogfunded,projstatus,cost,assemblydistrict,senatedistrict,assemblycode,senatecode,countyname,cityname,countycode,citycode,appagencyname,impagencyname,totalcosts,routes,constyear,costfull,projagency,geom_type,number_of_nans
207,715000051,Los Angeles 110 2 5 405 Traffic Management System Installation Project,State,State,Shp,State Highway Projects,,PreConstruction,38500000,43 51 53 59 62 64 66 70,24 25 26 30 35,4351535962646670,2425263035,Los Angeles,Carson Glendale Hawthorne Los Angeles,La,Crsngndlhawla,,,,25110405,,38547000.0,Caltrans,LineString,5


In [85]:
df1.loc[df1.projectid == "0715000051"].drop(columns = dont_preview)

Unnamed: 0,projectid,projname,projcatcode,projcategory,projprogcode,projprogram,multiprogfunded,projstatus,cost,assemblydistrict,senatedistrict,assemblycode,senatecode,countyname,cityname,countycode,citycode,appagencyname,impagencyname,totalcosts,routes,constyear,costfull,projagency,geom_type
201,715000051,Los Angeles 110 2 5 405 Traffic Management System Installation Project,State,State,Shp,State Highway Projects,,PreConstruction,38500000,43 51 53 59 62 64 66 70,24 25 26 30 35,4351535962646670,2425263035,Los Angeles,Carson Glendale Hawthorne Los Angeles,La,Crsngndlhawla,,,,25110405.0,,38547000.0,Caltrans,LineString
391,715000051,Los Angeles 110 2 5 405 Traffic Management System Installation Project,State,State,Shp,State Highway Projects,,PreConstruction,38500000,43 51 53 59 62 64 66 70,24 25 26 30 35,4351535962646670,2425263035,Los Angeles,Carson Glendale Hawthorne Los Angeles,La,Crsngndlhawla,,,,25110405.0,,38547000.0,Caltrans,MultiPoint
2502,715000051,Los Angeles 110 2 5 405 Traffic Management System Installation Project,State,State,Shp,State Highway Projects,,PreConstruction,38500000,43 51 53 59 62 64 66 70,24 25 26 30 35,4351535962646670,2425263035,Los Angeles,Carson Glendale Hawthorne Los Angeles,La,Crsngndlhawla,,,,,2019.0,38547000.0,Caltrans,


In [86]:
len(sb1_df1)

9186

In [87]:
len(df3)

7984

In [88]:
df3.projectid.value_counts().head()

LppcC1Fy17206069Pp22    1
LsrFy17185322Pp011      1
LsrFy17185160Pp203      1
LsrFy17185377Pp008      1
LsrFy17185255Pp024      1
Name: projectid, dtype: int64

In [92]:
df3.projectid.nunique()

7602

In [93]:
sb1_df1.projectid.nunique()

7603

In [94]:
sb1_df1.projname.nuqnique()

5278

In [96]:
sb1_df1.projname.value_counts().head()

None                           781
                               705
Fy                             177
Crack Seal                     122
Maintenance  Rehabilitation    113
Name: projname, dtype: int64

In [95]:
df3.projname.nunique()

5277

In [97]:
df3.projname.value_counts().head()

                                701
Fy                              177
Crack Seal                      122
Maintenance  Rehabilitation     113
Citywide Slurry Seal Project    107
Name: projname, dtype: int64

In [91]:
df3.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 7984 entries, 0 to 9146
Data columns (total 28 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   projectid         7602 non-null   object  
 1   projname          7775 non-null   object  
 2   projcatcode       7984 non-null   object  
 3   projcategory      7984 non-null   object  
 4   projprogcode      7984 non-null   object  
 5   projprogram       7984 non-null   object  
 6   multiprogfunded   5 non-null      object  
 7   projstatus        7602 non-null   object  
 8   description       7517 non-null   object  
 9   cost              1493 non-null   object  
 10  assemblydistrict  7983 non-null   object  
 11  senatedistrict    7983 non-null   object  
 12  assemblycode      7983 non-null   object  
 13  senatecode        7983 non-null   object  
 14  countyname        7982 non-null   object  
 15  cityname          5756 non-null   object  
 16  countycode      

### Separate out SB1 based on our table schema

In [None]:
def merge(df: pd.DataFrame, subset_cols: list):
    df2 = df[subset_cols]
    display(
        pd.merge(df, df2, on=subset_cols, how="inner", indicator=True)[
            ["_merge"]
        ].value_counts()
    )
    return df2

#### Assembly Districts

In [None]:
assembly_subset = [
    "objectid",
    "projectid",
    "projname",
    "projagency",
    "assemblycode",
    "assemblydistrict",
]

In [None]:
sb1_df1.columns

In [None]:
assembly_df = merge(sb1_df1, assembly_subset)

In [None]:
assembly_df.sample(3)

#### Awards

In [None]:
sb1_df1.columns

In [None]:
awards_subset = [
    "objectid",
    "projectid",
    "projname",
    "projagency",
    "projprogcode",
    "projprogram",
    "multiprogfunded",
]

In [None]:
awards_df = merge(sb1_df1, awards_subset)

In [None]:
awards_df.projprogram.value_counts()

#### County

In [None]:
county_subset = [
    "objectid",
    "projectid",
    "projname",
    "projagency",
    "countyname",
    "countycode",
]

In [None]:
county_df = merge(sb1_df1, county_subset)

In [None]:
county_df.sample(3)

#### City

In [None]:
city_subset = [
    "objectid",
    "projectid",
    "projname",
    "projagency",
    "cityname",
    "citycode",
]

In [None]:
city_df = merge(sb1_df1, city_subset)

#### Senate District

In [None]:
sb1_df1.columns

In [None]:
senate_subset = [
    "objectid",
    "projectid",
    "projname",
    "projagency",
    "senatedistrict",
    "senatecode",
]

In [None]:
senate_df = merge(sb1_df1, senate_subset)

#### Geometry

In [None]:
geo_subset = ["objectid", "projectid", "projname", "projagency", "geometry"]

In [None]:
gdf = merge(sb1_df1, geo_subset)

In [None]:
type(gdf)

#### Project Base Table

In [None]:
project_subset = [
    "objectid",
    "projectid",
    "projname",
    "description",
    "cost",
    "totalcosts",
    "routes",
    "costfull",
    "projagency",
    "appagencyname",
    "impagencyname",
]

In [None]:
project_df = sb1_df1[project_subset]

In [None]:
# project_df.sample(100)

#### Phase Funding

In [None]:
phase_subset = [
    "objectid",
    "projectid",
    "projname",
    "projstatus",
    "projagency",
    "constyear",
]

In [None]:
phase_df = merge(sb1_df1, phase_subset)

In [None]:
phase_df.head()

In [None]:
url_pt1 = "https://odpsvcs.dot.ca.gov/arcgis/rest/services/RCA/RCA_Projects_092023/FeatureServer/"

In [None]:
url_pt2 = "/query?where=1%3D1&objectIds=&time=&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&distance=&units=esriSRUnit_Foot&relationParam=&outFields=*+&returnGeometry=true&maxAllowableOffset=&geometryPrecision=&outSR=&gdbVersion=&historicMoment=&returnDistinctValues=false&returnIdsOnly=false&returnCountOnly=false&returnExtentOnly=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&returnZ=false&returnM=false&multipatchOption=&resultOffset=&resultRecordCount=&returnTrueCurves=false&sqlFormat=none&f=geojson"

In [None]:
int_to_string = list(map(str, [*range(0, 28)]))

In [None]:
full_gdf = pd.DataFrame()
for i in int_to_string:
    df = to_snakecase(gpd.read_file(f"{url_pt1}{i}{url_pt2}"))
    full_gdf = pd.concat([full_gdf, df], axis=0)

In [None]:
full_gdf.shape

In [None]:
full_gdf.shape, sb1_df1.shape

#### Compare

In [None]:
full_gdf.columns

In [None]:
full_gdf2 = full_gdf[
    ["projecttitle", "projectdescription", "countycodes", "ct_districts"]
]

In [None]:
sb1_df1.columns

In [None]:
sb1_df2 = sb1_df1[["projname", "projagency", "description", "countycode"]]

In [None]:
pd.merge(
    sb1_df2,
    full_gdf2,
    left_on=["projname", "description", "countycode"],
    right_on=["projecttitle", "projectdescription", "countycodes"],
    how="outer",
    indicator=True,
)[["_merge"]].value_counts()

In [None]:
pd.merge(
    sb1_df2,
    full_gdf2,
    left_on=["projname", "description"],
    right_on=["projecttitle", "projectdescription"],
    how="outer",
    indicator=True,
)[["_merge"]].value_counts()

In [None]:
pd.merge(
    sb1_df2,
    full_gdf2,
    left_on=["projname", "countycode"],
    right_on=["projecttitle", "countycodes"],
    how="outer",
    indicator=True,
)[["_merge"]].value_counts()

In [None]:
url_pt1 = "https://odpsvcs.dot.ca.gov/arcgis/rest/services/RCA/RCA_Projects_032022/FeatureServer/"
url_pt2 = "/query?where=1%3D1&objectIds=&time=&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&distance=&units=esriSRUnit_Foot&relationParam=&outFields=*+&returnGeometry=true&maxAllowableOffset=&geometryPrecision=&outSR=&gdbVersion=&historicMoment=&returnDistinctValues=false&returnIdsOnly=false&returnCountOnly=false&returnExtentOnly=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&returnZ=false&returnM=false&multipatchOption=&resultOffset=&resultRecordCount=&returnTrueCurves=false&sqlFormat=none&f=geojson"

In [None]:
def sb1_basic_cleaning(
    gdf: gpd.GeoDataFrame, project_name_col: str
) -> gpd.GeoDataFrame:
    """
    Perform basic cleaning before joining
    SB1 & Non SHOPP data together.
    """
    # Some project names contain the year. Remove anything after 20..
    gdf[project_name_col] = gdf[project_name_col].str.split("20").str[0]

    # Get rid of | in object cols
    # https://stackoverflow.com/questions/68152902/extracting-only-object-type-columns-in-a-separate-list-from-a-data-frame-in-pand
    string_cols = gdf.select_dtypes(include=["object"]).columns.to_list()
    try:
        for i in string_cols:
            gdf[i] = (
                gdf[i]
                .str.replace("|", "")
                .str.lower()
                .str.replace("[^\w\s]", "")
                .str.strip()
            )
    except:
        pass

    # Try to extract titles from popups
    try:
        df["popup"] = (
            df["popup"]
            .str.split("<br  />")
            .str[1]
            .str.split("20")
            .str[0]
            .str.lower()
            .str.strip()
            .str.replace("[^\w\s]", "")
        )
    except:
        pass

    return gdf

In [None]:
def load_sb1_all_projects() -> pd.DataFrame:
    """
    Load in all projects layer of SB1 because it
    contains a value for every row in the
    project title column.
    """
    df = f"{url_pt1}22{url_pt2}"

    df = to_snakecase(gpd.read_file(df))

    # No geometry, just drop it
    df = df.drop(columns=["geometry"])

    # Basic cleaning
    df = sb1_basic_cleaning(
        df, "implementingagency", "projecttitle", "projectid", "projectdescription"
    )

    return df

In [None]:
def sb1_final() -> gpd.GeoDataFrame:
    """
    Layers 0-21 with geographic information
    don't always have project titles for each
    of the projects. Merge these layers with
    layer 22, which does have title information.
    """
    all_projects_subset = [
        "projecttitle",
        "programcodes",
        "totalcost",
        "implementingagency",
        "fiscalyearcode",
    ]

    sb1_geo, missing_geo = load_sb1_rest_server()
    sb1_all_projects = load_sb1_all_projects()[all_projects_subset]

    # Merge
    merge1 = pd.merge(
        sb1_geo,
        sb1_all_projects,
        how="left",
        left_on=["programcodes", "totalcost", "agencies", "fiscalyearcodes"],
        right_on=["programcodes", "totalcost", "implementingagency", "fiscalyearcode"],
    )

    # Fill in missing project titles in sb1_geo with information from
    # sb1_all_projects
    merge1.projecttitle_x = merge1.projecttitle_x.fillna(merge1.projecttitle_y)

    # Fill missing titles with none
    merge1.projecttitle_x = merge1.projecttitle_x.fillna("None")

    # Drop columns
    merge1 = merge1.drop(columns=["projecttitle_y"])

    # Add a new column detailing where this information comes from
    merge1 = merge1.assign(notes=merge1.programcodes + "/" + "SB1")

    merge1 = merge1.fillna(merge1.dtypes.replace({"float64": 0.0, "object": "None"}))
    return merge1

In [None]:
sb1_final_gdf = sb1_final()

In [None]:
sb1_final_gdf.projectstatuscodes.value_counts()

In [None]:
len(sb1_final_gdf), sb1_final_gdf.geometry.nunique()

In [None]:
sb1_final_gdf.info()

In [None]:
sb1_final_gdf.projecttitle_x.value_counts().head(10)

In [None]:
sb1_final_gdf[sb1_final_gdf.projecttitle_x == "major damage restoration"].groupby(
    ["projecttitle_x", "countynames"]
).agg({"objectid": "count"}).head()

In [None]:
sb1_final_gdf.projectstatuscodes.value_counts()

In [None]:
sb1_final_gdf.projectstatuscodes.unique()

In [None]:
sb1_final_gdf.projectstatuses.value_counts()

In [None]:
sb1_final_gdf[
    (sb1_final_gdf.projecttitle_x == "major damage restoration")
    & (sb1_final_gdf.countynames == "Alameda")
]

In [None]:
sb1_final_gdf[
    (sb1_final_gdf.projecttitle_x == "major damage restoration")
    & (sb1_final_gdf.countynames == "Alameda")
].plot()

In [None]:
# sb1_final_gdf[sb1_final_gdf.projecttitle_x == "major damage restoration"].explore()