In [1]:
import geopandas as gpd
import numpy as np
import pandas as pd
import _harmonization_utils 
from calitp_data_analysis.sql import to_snakecase
from shared_utils import utils


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


In [2]:
import re
import nltk
from nltk import ngrams
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
import re
from collections import Counter


In [3]:
# Notebook-wide pandas display settings: show up to 100 columns, print floats
# with two decimals, and lift the row / column-width truncation limits.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [4]:
# First half of a layer query URL: the SB1 RCA projects Feature Server root.
# A layer number is inserted between url_pt1 and url_pt2 by the loaders below.
url_pt1 = "https://odpsvcs.dot.ca.gov/arcgis/rest/services/RCA/RCA_Projects_032022/FeatureServer/"
# Second half of a layer query URL: selects every record (where=1=1, url-encoded),
# requests all fields with geometry, and asks for GeoJSON output (f=geojson).
url_pt2 = "/query?where=1%3D1&objectIds=&time=&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&distance=&units=esriSRUnit_Foot&relationParam=&outFields=*+&returnGeometry=true&maxAllowableOffset=&geometryPrecision=&outSR=&gdbVersion=&historicMoment=&returnDistinctValues=false&returnIdsOnly=false&returnCountOnly=false&returnExtentOnly=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&returnZ=false&returnM=false&multipatchOption=&resultOffset=&resultRecordCount=&returnTrueCurves=false&sqlFormat=none&f=geojson"

In [5]:
def sb1_basic_cleaning(
    df,
    agency_col: str,
    project_name_col: str,
    project_id_col: str,
    project_desc_col: str,
):
    """
    Perform basic cleaning before joining
    SB1 & Non SHOPP data together.

    The frame is modified in place and also returned.

    Args:
        df: frame holding the project records.
        agency_col: name of the agency column (currently unused; kept so
            existing callers keep working).
        project_name_col: name of the project title column.
        project_id_col: name of the project id column (currently unused;
            kept so existing callers keep working).
        project_desc_col: name of the project description column.

    Returns:
        The same `df` object with cleaned title/description columns, pipes
        stripped from object columns, and - when a `popup` column exists -
        `popup` reduced to a cleaned title candidate.
    """
    # Remove all punctuation, lowercase, and strip whitespace from
    # project titles & descriptions. `regex=True` is spelled out because the
    # pandas default for `regex` differs between major versions.
    for col in [project_name_col, project_desc_col]:
        df[col] = (
            df[col].str.lower().str.replace(r"[^\w\s]", "", regex=True).str.strip()
        )

    # Some project names contain the year. Keep only what precedes the first "20".
    # NOTE(review): this also truncates names that contain "20" for any other
    # reason (e.g. a route number such as 120) - confirm this is acceptable.
    # The trailing strip removes the space left in front of the year.
    df[project_name_col] = df[project_name_col].str.split("20").str[0].str.strip()

    # Get rid of | in object cols
    # https://stackoverflow.com/questions/68152902/extracting-only-object-type-columns-in-a-separate-list-from-a-data-frame-in-pand
    string_cols = df.select_dtypes(include=["object"]).columns.to_list()
    for col in string_cols:
        try:
            # "|" must be treated literally, never as regex alternation.
            df[col] = df[col].str.replace("|", "", regex=False)
        except AttributeError:
            # Object column that does not hold strings (no .str accessor):
            # skip it, but keep cleaning the remaining columns.
            continue

    # Try to extract titles from popups: take the text after the first
    # "<br  />", cut it at the first "20", then normalise like the titles.
    if "popup" in df.columns:
        try:
            df["popup"] = (
                df["popup"]
                .str.split("<br  />")
                .str[1]
                .str.split("20")
                .str[0]
                .str.lower()
                .str.strip()
                .str.replace(r"[^\w\s]", "", regex=True)
            )
        except AttributeError:
            # Best effort: leave `popup` untouched when it is not string-valued.
            pass

    return df

In [6]:
def load_sb1_rest_server() -> tuple:
    """
    Load all the projects on the SB1
    map from the Feature Server.

    https://odpsvcs.dot.ca.gov/arcgis/rest/services/RCA/RCA_Projects_032022/FeatureServer

    Layers 0 through 21 are downloaded, snake-cased, stacked, and cleaned.

    Returns:
        A 2-tuple `(full_gdf, missing_geo)`:
        full_gdf - rows whose geometry is not empty, with a fresh index.
        missing_geo - rows whose geometry is empty (index left as stacked).
    """
    # Read every layer first and concatenate once, instead of re-copying a
    # growing frame on every iteration.
    layers = [
        to_snakecase(gpd.read_file(f"{url_pt1}{layer_id}{url_pt2}"))
        for layer_id in range(0, 22)
    ]
    full_gdf = pd.concat(layers, axis=0)

    # Basic cleaning
    full_gdf = sb1_basic_cleaning(
        full_gdf, "agencies", "projecttitle", "projectid", "projectdescription"
    )

    # Fill in project titles that are empty with information
    # gleaned from the pop up.
    full_gdf["projecttitle"] = full_gdf["projecttitle"].fillna(full_gdf["popup"])

    # Throw out empty geometry.
    # NOTE(review): `is_empty` flags empty geometries; rows whose geometry is
    # missing (None) may need a separate `isna()` check - confirm.
    empty_geometry = full_gdf.geometry.is_empty
    missing_geo = full_gdf[empty_geometry]
    full_gdf = full_gdf[~empty_geometry].reset_index(drop=True)

    return full_gdf, missing_geo

In [7]:
# sb1, missing_geo_sb1 = load_sb1_rest_server()

In [8]:
# len(sb1), type(sb1)

In [9]:
# len(missing_geo_sb1)

In [10]:
def load_sb1_all_projects() -> pd.DataFrame:
    """
    Read layer 22 ("all projects") of the SB1 Feature Server.

    This layer is used because every row has a value in the
    project title column. The geometry column is discarded and the
    result is run through `sb1_basic_cleaning`.
    """
    layer_url = f"{url_pt1}22{url_pt2}"

    # The layer carries no usable geometry, so drop that column right away.
    all_projects = to_snakecase(gpd.read_file(layer_url)).drop(columns=["geometry"])

    return sb1_basic_cleaning(
        all_projects,
        "implementingagency",
        "projecttitle",
        "projectid",
        "projectdescription",
    )

In [11]:
# sb1_all_projects = load_sb1_all_projects()

In [12]:
# len(sb1_all_projects)

In [13]:
def sb1_final() -> gpd.GeoDataFrame:
    """
    Combine the geographic SB1 layers with the "all projects" layer.

    Layers 0-21 carry geometry but do not always have a project title.
    Layer 22 has titles, so it is left-joined onto the geographic rows
    (on program codes, total cost, agency and fiscal year code) and used
    to back-fill titles that are still missing.
    """
    title_layer_cols = [
        "projecttitle",
        "programcodes",
        "totalcost",
        "implementingagency",
        "fiscalyearcode",
    ]

    # Rows with empty geometry are returned separately and not used here.
    geo_layers, _ = load_sb1_rest_server()
    title_layer = load_sb1_all_projects()[title_layer_cols]

    merged = pd.merge(
        geo_layers,
        title_layer,
        how="left",
        left_on=["programcodes", "totalcost", "agencies", "fiscalyearcodes"],
        right_on=["programcodes", "totalcost", "implementingagency", "fiscalyearcode"],
    )

    # Prefer the title from the geographic layers, then the one from the
    # all-projects layer, and finally the literal string "None".
    merged["projecttitle_x"] = (
        merged["projecttitle_x"].fillna(merged["projecttitle_y"]).fillna("None")
    )

    # The all-projects title has served its purpose.
    merged = merged.drop(columns=["projecttitle_y"])

    # Record where each row comes from.
    merged["notes"] = merged["programcodes"] + "/" + "SB1"

    # Per-column fill values keyed off the column dtype: 0.0 for float64
    # columns and the string "None" for object columns.
    fill_values = merged.dtypes.replace({"float64": 0.0, "object": "None"})
    return merged.fillna(fill_values)

In [14]:
# Build the merged SB1 frame. This call downloads layers 0-22 from the
# Feature Server, so it needs network access.
sb1_final_gdf = sb1_final()



In [17]:
# Row count per project status code.
# NOTE(review): the output lists both "InProgress" and "In Progress" - this
# looks like one status spelled two ways; consider normalising before counting.
sb1_final_gdf.projectstatuscodes.value_counts()

Completed                     2693
InProgress                    1483
Planned                       1321
CompletedInProgress            231
CompletedInProgressPlanned     154
CompletedPlanned               128
InProgressPlanned               72
In Progress                     22
Name: projectstatuscodes, dtype: int64

In [19]:
# Total rows vs. number of distinct geometries; a gap means some rows share a geometry.
len(sb1_final_gdf), sb1_final_gdf.geometry.nunique()

(6104, 5448)

In [23]:
# Column names, non-null counts and dtypes of the merged frame.
sb1_final_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 6104 entries, 0 to 6103
Data columns (total 41 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   objectid                6104 non-null   int64   
 1   agencyids               6104 non-null   object  
 2   agencies                6104 non-null   object  
 3   programcodes            6104 non-null   object  
 4   iijaprogram             6104 non-null   object  
 5   iijacodes               6104 non-null   object  
 6   projectstatuscodes      6104 non-null   object  
 7   fiscalyears             6104 non-null   object  
 8   fiscalyearcodes         6104 non-null   object  
 9   projectstatuses         6104 non-null   object  
 10  sb1funds                6104 non-null   float64 
 11  iijafunds               6104 non-null   float64 
 12  totalcost               6104 non-null   float64 
 13  dateupdated             6104 non-null   object  
 14  projectcount    

In [None]:
# Natalie's function to clean and place words in a project description column
# into a list
def get_list_of_words(df, col: str) -> list:
    """
    Flatten one text column into a single cleaned list of words.

    All non-missing values of `col` are joined into one lowercase text blob,
    punctuation is removed, the blob is tokenized with `word_tokenize`, and
    English stopwords are dropped.

    Args:
        df: frame holding the text column.
        col: name of the text column.

    Returns:
        List of lowercase word tokens, in order of appearance, with
        stopwords removed. Empty when the column has no text.
    """
    # Select the column as a Series. (Squeezing a one-column frame collapses
    # a single-row frame to a bare scalar, which has no .tolist().)
    # Missing values are skipped and remaining values coerced to str so the
    # join below cannot fail on NaN or numbers.
    texts = df[col].dropna().astype(str).tolist()

    # Correct spelling
    # https://stackoverflow.com/questions/49364664/how-to-use-autocorrect-in-pandas-column-of-sentences
    # spell = Speller(lang='en')
    # df[col] = df[col].apply(lambda x: " ".join([spell(i) for i in x.split()]))

    # Join the whole column into one large text blob, lower text
    blob = " ".join(texts).lower()

    # remove punctuation
    blob = re.sub(r"[^\w\s]", "", blob)

    # Stopwords with punctuation removed (e.g. apostrophes) so they line up
    # with the punctuation-free blob. A set keeps membership checks cheap.
    # The class is A-Za-z on purpose: the range A-z would also admit the
    # characters that sit between "Z" and "a" ([, \, ], ^, _, `).
    swords = {
        re.sub(r"[^A-Za-z\s]", "", sword) for sword in stopwords.words("english")
    }

    # Remove stopwords
    return [word for word in word_tokenize(blob) if word not in swords]

In [None]:
def find_common_phrases(
    df, description_column: str, ngram_size: int = 2, min_count: int = 7
):
    """
    Find the word phrases that repeat most often in a description column.

    Args:
        df: frame holding the description column.
        description_column: name of the text column to scan.
        ngram_size: number of consecutive words per phrase (2, 3, 4, etc).
            Defaults to 2.
        min_count: a phrase is kept only when it occurs strictly more than
            this many times. Defaults to 7.

    Returns:
        List of phrases (words joined by a single space), ordered by first
        appearance in the text.
    """
    # Break apart every word in the description column into a list
    descriptions_list = get_list_of_words(df, description_column)

    # Count every run of `ngram_size` consecutive words
    phrase_counts = Counter(
        " ".join(gram) for gram in ngrams(descriptions_list, ngram_size)
    )

    # Keep the phrases that are repeated the most. Counter preserves
    # first-seen order, so the result order is stable.
    return [phrase for phrase, total in phrase_counts.items() if total > min_count]

In [None]:
def categorize_projects(df, 
                        description_column:str,
                        agency_column:str,
                        project_year_column:str,
                        title_column:str):
    """
    Tag each project with the common phrases found in its description.

    The most common phrases of the description column (see
    `find_common_phrases`) act as automatic categories. Every project gets
    the title-cased phrases that occur in its description, joined by commas,
    or "Other" when none occur.

    Args:
        df: frame holding the projects. It is not modified.
        description_column: name of the project description column.
        agency_column: name of the agency column.
        project_year_column: name of the project year column.
        title_column: name of the project title column.

    Returns:
        A frame with one row per (agency, year, title, description) group and
        a `project_category` column. Because of the group-by, rows with a
        missing value in any of those four columns are not in the result.
    """
    # Find most common phrases for some automatic project categories
    common_phrases_list = find_common_phrases(df, description_column)

    # Work on a copy so the caller's frame does not gain helper columns.
    df = df.copy()

    # Lower the descriptions and turn "-", "(", ")" and "." into spaces before
    # searching. One character class is used because each of "(", ")" and "."
    # on its own is either an invalid or a match-everything regular expression.
    df["clean_description"] = (
        df[description_column]
        .str.lower()
        .str.replace(r"[-().]", " ", regex=True)
        .str.strip()
    )

    if common_phrases_list:
        # Place all the phrases into one alternation named query
        # Ex: 'lrv|zero emission|electric'
        # https://stackoverflow.com/questions/64727090/extract-all-matching-keywords-from-a-list-of-words-and-create-a-new-dataframe-pa
        # Phrases are escaped so they are always matched literally.
        query = "|".join(re.escape(phrase) for phrase in common_phrases_list)

        # Search through description column for the most common phrases
        df["project_category"] = df["clean_description"].str.findall(
            r"\b({})\b".format(query)
        )
    else:
        # No common phrases: an empty alternation would match empty strings,
        # so leave every project uncategorized instead.
        df["project_category"] = pd.Series(
            [None] * len(df), index=df.index, dtype=object
        )

    # Explode to take categories out of a list
    # Drop duplicate project keywords by title
    df = (
        df.explode("project_category")
        .sort_values([project_year_column, title_column])
        .drop_duplicates(
            subset=[
                description_column,
                project_year_column,
                title_column,
                agency_column,
                "project_category",
            ]
        )
    )

    # Fill any uncategorized projects as "Other"
    df["project_category"] = df["project_category"].fillna("Other").str.title()

    # Summarize - put all the categories onto one line
    df = (
        df.groupby(
            [agency_column, project_year_column, title_column, description_column]
        )["project_category"]
        .apply(",".join)
        .reset_index()
    )

    return df

In [None]:
# sb1_final_gdf2 = categorize_projects(sb1_final_gdf, 'projectdescription', 'agencies', 'fiscalyears', 'projecttitle_x')

In [None]:
# len(sb1_final_gdf2)

In [None]:
# sb1_final_gdf2.project_category.value_counts()

In [None]:
# NOTE(review): `sb1_harmonized` is not assigned in any cell visible in this
# notebook, so this raises NameError on a fresh kernel - add the cell that
# builds it (or remove this check).
type(sb1_harmonized)

In [None]:
# NOTE(review): depends on `sb1_harmonized`, which no visible cell defines;
# this fails under Restart & Run All until that assignment is restored.
sb1_harmonized.head(5)

In [None]:
# sb1_harmonized = sb1_harmonized.set_geometry('location')

In [None]:
# sb1_harmonized.fully_funded.value_counts()