## General function to clean up data from various grants
TO DO
* Switch City of Berkeley to Berkeley City. https://github.com/cal-itp/data-analyses/blob/main/Agreement_Overlap/add_dla.ipynb
* Deduplicate projects.
* Rearrange counties in County column in alphabetical order.

In [1]:
import os

# GeoPandas reads USE_PYGEOS when it is first imported, so the variable has to
# be set before geopandas (or anything that may import it) is loaded.
os.environ["USE_PYGEOS"] = "0"

import _cleaning_utils
import _harmonization_utils as harmonization_utils
import _state_rail_plan_utils as srp_utils
import geopandas
import geopandas as gpd
import pandas as pd
import shapely
from calitp_data_analysis.sql import to_snakecase

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


In [2]:
import re
import nltk
from nltk import ngrams
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
import re
from collections import Counter
from autocorrect import Speller

In [3]:
# Notebook display settings: up to 100 columns, floats with two decimals,
# and no truncation of rows or of long cell text.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [4]:
# sb1 = harmonization_utils.load_sb1()

In [5]:
# lost = harmonization_utils.load_lost()

In [6]:
# lost.sample()

In [7]:
# srp = harmonization_utils.load_state_rail_plan()

In [8]:
# Shared output schema: every harmonized source is trimmed to these columns,
# in this order.
columns_to_keep = [
    # What the project is
    "project_title", "lead_agency", "project_year", "project_category",
    "grant_program", "project_description",
    # Cost and funding
    "total_project_cost", "fully_funded", "total_available_funds",
    # Where it is
    "location", "city", "county",
    # Provenance and notes
    "data_source", "notes", "funding_notes", "project_id",
]

In [9]:
def harmonizing(
    df,
    agency_name_col: str,
    project_name_col: str,
    project_description_col: str,
    project_category_col: str,
    project_cost_col: str,
    location_col: str,
    county_col: str,
    city_col: str,
    project_year_col: str,
    program_col: str,
    data_source: str,
    fund_cols: list,
    cost_in_millions: bool = True,
):
    """
    Take a dataset and change the column names/types to
    default names and formats so it can be concatenated with other sources.

    Also adds whether the project is fully funded or not, a text breakdown
    of the funding columns, and a per-source project id.

    Args:
        df: source dataframe.
        agency_name_col, project_name_col, project_description_col,
        project_category_col, project_cost_col, location_col, county_col,
        city_col, project_year_col, program_col: name of the source column
            holding each attribute. Pass "" when the source has no such column.
        data_source: label written to `data_source`; also the prefix of
            `project_id` and the fallback value for `grant_program`.
        fund_cols: funding columns, named as they are after renaming, that
            are summed into `total_available_funds`. May be empty.
        cost_in_millions: if True, every cost/funds column is divided by
            1,000,000.

    Returns:
        Dataframe restricted to `columns_to_keep`, with nulls filled
        (0.0 for float64 columns, "None" for object columns).
    """
    # Map source column names onto the shared schema. Sources that lack an
    # attribute pass "" for it, so those placeholders are skipped.
    schema_pairs = [
        (agency_name_col, "lead_agency"),
        (project_name_col, "project_title"),
        (project_description_col, "project_description"),
        (project_category_col, "project_category"),
        (project_cost_col, "total_project_cost"),
        (location_col, "location"),
        (county_col, "county"),
        (city_col, "city"),
        (project_year_col, "project_year"),
        (program_col, "grant_program"),
    ]
    rename_columns = {source: target for source, target in schema_pairs if source}
    df = df.rename(columns=rename_columns)

    # Coerce cost/fund columns to right type. The pattern deliberately has no
    # capture group: pandas warns when str.contains is given one.
    cost_columns = df.columns[df.columns.str.contains("cost|funds")].tolist()
    for column in cost_columns:
        df[column] = df[column].apply(pd.to_numeric, errors="coerce").fillna(0)

    # Clean up string columns
    string_cols = df.select_dtypes(include=["object"]).columns.to_list()
    for column in string_cols:
        df[column] = (
            df[column].str.replace("_", " ", regex=False).str.strip().str.title()
        )

    # Clean agency names
    df = harmonization_utils.organization_cleaning(df, "lead_agency")

    # Add data source
    df["data_source"] = data_source

    # Divide cost columns by millions
    # If bool is set to True
    if cost_in_millions:
        for column in cost_columns:
            df[column] = df[column].divide(1_000_000)

    # Create columns even if they don't exist, just to harmonize
    # before concatting.
    create_columns = ["county", "city", "notes", "project_year", "project_category"]
    for column in create_columns:
        if column not in df:
            df[column] = "None"
    if "grant_program" not in df:
        df["grant_program"] = data_source

    # Determine if the project completely funded or not?
    # Add up all available funds
    df["total_available_funds"] = df[fund_cols].sum(axis=1)
    df["fully_funded"] = df.apply(harmonization_utils.funding_vs_expenses, axis=1)

    # Add new column with funding breakout
    # Since it's summarized above and the details are suppressed.
    # The string copies live in temporary "_<column>" columns, which are
    # discarded by the `columns_to_keep` selection below.
    prefix = "_"
    for column in fund_cols:
        df[f"{prefix}{column}"] = df[column].astype(str)
    str_fund_cols = [prefix + sub for sub in fund_cols]

    # https://stackoverflow.com/questions/65532480/how-to-combine-column-names-and-values
    def combine_funding(x):
        return ", ".join([col + ": " + x[col] for col in str_fund_cols])

    df["funding_notes"] = df.apply(combine_funding, axis=1)
    df["funding_notes"] = df["funding_notes"].str.replace("_", " ", regex=False)

    # Create unique project id. cumcount() starts at 0, so the first LOST
    # project is LOST-0, the second LOST-1, then LOST-2, etc.
    df["project_id"] = (
        df["data_source"] + "-" + df.groupby("data_source").cumcount().astype("str")
    )

    # Only keep certain columns
    df = df[columns_to_keep]

    # Fill in any nulls: the dtypes Series maps each column to a fill value
    # chosen by its dtype.
    df = df.fillna(df.dtypes.replace({"float64": 0.0, "object": "None"}))

    return df

In [10]:
def harmonize_srp():
    """
    Load the State Rail Plan data and convert it to the shared schema.

    No county, city, year, program or funding columns are mapped for this
    source, so those are passed as "" / an empty list.
    """
    srp_settings = dict(
        agency_name_col="lead_agency",
        project_name_col="project_name",
        project_description_col="project_description",
        project_category_col="project_category",
        project_cost_col="total_project_cost",
        location_col="corridor",
        county_col="",
        city_col="",
        project_year_col="",
        program_col="",
        data_source="State Rail Plan",
        fund_cols=[],
        cost_in_millions=True,
    )
    raw_srp = harmonization_utils.load_state_rail_plan()
    return harmonizing(raw_srp, **srp_settings)

In [11]:
def harominze_lost():
    """
    Load the LOST data and convert it to the shared schema.

    `cost_in_millions` is False here, so cost/funds columns are not divided
    by 1,000,000 (presumably because the source cost column,
    `cost__in_millions_`, is already expressed in millions - confirm against
    the loader).
    """
    df = harmonization_utils.load_lost()
    df = harmonizing(
        df,
        agency_name_col="agency",
        project_name_col="project_title",
        project_description_col="project_description",
        project_category_col="project_category",
        project_cost_col="cost__in_millions_",
        location_col="location",
        county_col="county",
        city_col="city",
        project_year_col="",
        program_col="measure",
        data_source="LOST",
        fund_cols=[
            "estimated_lost_funds",
            "estimated_federal_funds",
            "estimated_state_funds",
            "estimated_local_funds",
            "estimated_other_funds",
        ],
        cost_in_millions=False,
    )

    return df


# Correctly spelled alias, consistent with harmonize_srp / harmonize_sb1.
# The misspelled name stays because existing callers use it.
harmonize_lost = harominze_lost

In [12]:
def harmonize_sb1():
    """
    Load the SB1 data and convert it to the shared schema.

    The source `geometry` column is mapped to `location`, and the SB1 and
    IIJA fund columns are the ones summed into the available funds.
    No project category column is mapped for this source.
    """
    sb1_settings = dict(
        agency_name_col="agencies",
        project_name_col="projecttitle_x",
        project_description_col="projectdescription",
        project_category_col="",
        project_cost_col="totalcost",
        location_col="geometry",
        county_col="countynames",
        city_col="citynames",
        project_year_col="fiscalyears",
        program_col="programcodes",
        data_source="SB1",
        fund_cols=["sb1funds", "iijafunds"],
        cost_in_millions=True,
    )
    raw_sb1 = harmonization_utils.load_sb1()
    return harmonizing(raw_sb1, **sb1_settings)

In [13]:
# Load the un-harmonized SB1 data into the notebook namespace.
# NOTE: add_all_projects() builds its own local `sb1`, so it does not read this variable.
sb1 = harmonization_utils.load_sb1()



In [14]:

def get_list_of_words(df, col: str) -> list:
    """
    Natalie's function to clean and place words in a project description column
    into a list

    The text of every row is joined into one blob, lowercased, stripped of
    punctuation, tokenized with nltk's `word_tokenize`, and filtered against
    nltk's English stopwords.

    Args:
        df: dataframe holding the text column.
        col: name of the text column.

    Returns:
        Flat list of lowercase word tokens across all rows, in order of
        appearance, with stopwords removed.
    """
    # Read the Series directly: `df[[col]].squeeze()` collapses a one-row frame
    # to a bare scalar, which has no `.tolist()`. Missing values are dropped and
    # the rest cast to str so the join below cannot fail on NaN or numbers.
    descriptions = df[col].dropna().astype(str).tolist()

    # Join all the column into one large text blob, lower text
    text_blob = " ".join(descriptions).lower()

    # remove punctuation
    text_blob = re.sub(r"[^\w\s]", "", text_blob)

    # Stopwords with non-letters stripped, so they match the
    # punctuation-free text (e.g. "don't" -> "dont"). A set gives constant-time
    # lookups. The class is A-Za-z rather than A-z, which would also span
    # the characters [ \ ] ^ _ ` that sit between "Z" and "a".
    swords = {
        re.sub(r"[^A-Za-z\s]", "", sword) for sword in stopwords.words("english")
    }

    # Remove stopwords
    return [word for word in word_tokenize(text_blob) if word not in swords]

In [15]:
def find_common_phrases(
    df,
    description_column: str,
    values_to_add: list,
    ngram_size: int = 2,
    count_threshold: int = 40,
):
    """
    Find frequently repeated phrases in a description column, for use as
    automatic project categories.

    Args:
        df: dataframe holding the description column.
        description_column: name of the text column to mine.
        values_to_add: extra keywords appended to the result as-is.
        ngram_size: number of words per phrase (default 2).
        count_threshold: keep phrases that occur MORE than this many times
            (default 40).

    Returns:
        List of phrases in order of first appearance in the text, followed by
        `values_to_add`. The order is deterministic; it matters to callers
        that join the phrases into a regex alternation.
    """
    # Break apart every word in the description column into a list
    descriptions_list = get_list_of_words(df, description_column)

    # Count phrases of the requested length. Counter keeps insertion order,
    # i.e. the order in which each phrase first appears.
    phrase_counts = Counter(
        " ".join(gram) for gram in ngrams(descriptions_list, ngram_size)
    )

    # Place names and filler phrases that are common but are not categories
    phrases_to_del = {
        "san bernardino", "los angeles", "contra costa", "el dorado",
        "san luis obispo", "luis obispo", "del norte", "san francisco",
        "improve approximately",
    }

    # Clean up the list to delete county information/etc.
    # These are matched as substrings, so e.g. "san" also removes "santa ...".
    words_to_delete = (
        "county", "route", "dollar", "mile", "santa", "project", "san",
        "lanes", "lane", "2", "4", "financial", "prop", "best", "approximately",
    )

    common_phrases_list = [
        phrase
        for phrase, total in phrase_counts.items()
        if total > count_threshold
        and phrase not in phrases_to_del
        and not any(word in phrase for word in words_to_delete)
    ]

    # ADD certain keywords here
    # Operating Additional Service
    common_phrases_list.extend(values_to_add)

    return common_phrases_list

In [16]:
def categorize_projects(
    df,
    description_column: str,
    project_id_column: str,
    title_column: str,
    values_to_add: list,
):
    """
    Tag each project with the common phrases found in its description.

    Side effect: the columns `clean_description` and `auto_project_category`
    are written onto the dataframe that is passed in.

    Args:
        df: dataframe of projects.
        description_column: name of the free-text description column.
        project_id_column: name of the project id column.
        title_column: name of the project title column.
        values_to_add: extra keywords to search for in addition to the
            automatically detected common phrases.

    Returns:
        Dataframe with one row per (description, project id, title) and an
        `auto_project_category` column holding the comma-joined, title-cased
        categories; "Other" when nothing matched.
    """
    # Find most common 2 word phrases for some automatic project categories
    common_phrases_list = find_common_phrases(df, description_column, values_to_add)

    # Place all the words in common_phrases_list into a blob named query.
    # Phrases are escaped so they are always matched literally.
    # https://stackoverflow.com/questions/64727090/extract-all-matching-keywords-from-a-list-of-words-and-create-a-new-dataframe-pa
    query = "|".join(re.escape(phrase) for phrase in common_phrases_list)

    # Remove punctuation and lower strings in original description column before searching.
    # regex=False: these are literal characters. As regexes, "(" and ")" are
    # invalid patterns and "." matches every character.
    df["clean_description"] = (
        df[description_column]
        .str.lower()
        .str.replace("-", " ", regex=False)
        .str.replace("(", " ", regex=False)
        .str.replace(")", " ", regex=False)
        .str.replace(".", " ", regex=False)
        .str.strip()
    )

    # Search through description column for the most common phrases
    # Input the results in the new column
    if query:
        df["auto_project_category"] = df["clean_description"].str.findall(
            rf"\b({query})\b"
        )
    else:
        # Nothing to search for: an empty alternation would match the empty
        # string at every word boundary, so record "no matches" instead.
        df["auto_project_category"] = df["clean_description"].map(lambda _text: [])

    # Explode to take categories out of a list
    # Drop duplicate project keywords by title
    df = (
        df.explode("auto_project_category")
        .sort_values([project_id_column, title_column])
        .drop_duplicates(
            subset=[
                description_column,
                project_id_column,
                title_column,
                "auto_project_category",
            ]
        )
    )

    # Fill any uncategorized projects as "Other"
    df["auto_project_category"] = df["auto_project_category"].fillna("Other").str.title()

    # Correct spelling, once per distinct category instead of once per row
    spell = Speller(lang="en")
    corrections = {
        category: " ".join([spell(word) for word in category.split()])
        for category in df["auto_project_category"].unique()
    }
    df["auto_project_category"] = df["auto_project_category"].map(corrections)

    # Summarize - put all the categories onto one line
    df = (
        df.groupby([description_column, project_id_column, title_column])[
            "auto_project_category"
        ]
        .apply(",".join)
        .reset_index()
    )

    return df

In [17]:
def add_all_projects():
    """
    Harmonize every source, stack them, and attach auto-tagged categories.

    Returns:
        Tuple of (all_projects_df, all_projects_gdf):
        all_projects_df - LOST, State Rail Plan and SB1 projects concatenated,
            with an `auto_tagged_project_categories` column merged on.
        all_projects_gdf - the harmonized SB1 projects with `location` set as
            the geometry column.
    """
    # Load dataframes
    state_rail_plan = harmonize_srp()
    lost = harominze_lost()
    sb1 = harmonize_sb1()

    # Concat for df
    all_projects_df = pd.concat([lost, state_rail_plan, sb1])

    # Categorize. Keyword arguments keep the id and title columns from being
    # swapped.
    categories = categorize_projects(
        all_projects_df,
        description_column="project_description",
        project_id_column="project_id",
        title_column="project_title",
        values_to_add=[
            "operating",
            "service",
            "zero emission vehicle",
            "zev",
            "maintain/repair",
            "repair/replace",
        ],
    )

    # categorize_projects writes working columns onto the frame it is given;
    # drop them (if present) so the merge below brings in only the summarized
    # categories.
    all_projects_df = all_projects_df.drop(
        columns=["clean_description", "auto_project_category"], errors="ignore"
    )

    # Merge categorized
    all_projects_df = pd.merge(
        all_projects_df,
        categories,
        how="left",
        on=["project_description", "project_title", "project_id"],
    )

    # Rename
    all_projects_df = all_projects_df.rename(
        columns={"auto_project_category": "auto_tagged_project_categories"}
    )

    # Concat for gdf
    all_projects_gdf = pd.concat([sb1])
    all_projects_gdf = all_projects_gdf.set_geometry("location")

    return all_projects_df, all_projects_gdf

In [18]:
# Build the combined project table and the SB1 geodataframe.
all_projects, all_projects_geo = add_all_projects()



In [19]:
# all_projects.drop(columns = ['location'])[['project_title','project_category', 'auto_tagged_project_categories','project_description','total_available_funds','funding_notes']].sample(100)

In [20]:
# Column dtypes and non-null counts of the combined table.
all_projects.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8284 entries, 0 to 8283
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   project_title                   8284 non-null   object 
 1   lead_agency                     8284 non-null   object 
 2   project_year                    8284 non-null   object 
 3   project_category                8284 non-null   object 
 4   grant_program                   8284 non-null   object 
 5   project_description             8284 non-null   object 
 6   total_project_cost              8284 non-null   float64
 7   fully_funded                    8284 non-null   object 
 8   total_available_funds           8284 non-null   float64
 9   location                        8284 non-null   object 
 10  city                            8284 non-null   object 
 11  county                          8284 non-null   object 
 12  data_source                     82

In [30]:
# Five random projects, with the location column left out of the display.
all_projects.drop(columns = ['location']).sample(5)

Unnamed: 0,project_title,lead_agency,project_year,project_category,grant_program,project_description,total_project_cost,fully_funded,total_available_funds,city,county,data_source,notes,funding_notes,project_id,auto_tagged_project_categories
7904,Americans With Disabilities Act New Curb Ramps,Caltrans,20/21,,Shopp,A $8.12 Million Dollar Project In Merced County On Route 165 Will Upgrade Americans With Disabilities Act (Ada) Facilities.,8.12,No available funding info,0.0,Los Banos,Merced,SB1,,"sb1funds: 0.0, iijafunds: 0.0",SB1-5779,Disabilities Act
6230,Roadway Rehabilitation,Caltrans,23/24,,Shopp,A $31.52 Million Dollar Project In Mendocino County On Route 101 Will Rehabilitate Roadway And Upgrade Guardrail And Transportation Management System (Tms) Elements.,31.52,Partially funded,0.82,,Mendocino,SB1,,"sb1funds: 0.094054, iijafunds: 0.725946",SB1-4105,"Rehabilitate Roadway,Upgrade Guardrail,Transportation Management"
8214,Safety Improvements,Caltrans,21/22,,Shopp,"A $3.04 Million Dollar Project In Orange County On Route 1 Will Improve Safety By Modifying The Traffic Signals, Adding Safety Lighting, Refreshing Pavement Striping, And Upgrading Pedestrian Facilities To Americans With Disabilities Act (Ada) Standards.",3.04,Partially funded,1.76,"Huntington Beach, Newport Beach",Orange,SB1,,"sb1funds: 0.0, iijafunds: 1.7597",SB1-6089,"Traffic Signals,Disabilities Act"
3916,Safe Routes To School Education And Encouragement In Santa Cruz County,Santa Cruz Health Services Agency,14/15,,Atp,,0.45,No available funding info,0.0,"Santa Cruz, Watsonville",Santa Cruz,SB1,,"sb1funds: 0.0, iijafunds: 0.0",SB1-1791,Other
1027,Paratransit Emergency Response Module,Mst,,,Monterey Salinas Transit Q 2016,,0.0,No project cost info,0.0,,Monterey,LOST,,"estimated lost funds: 0.0, estimated federal funds: 0.0, estimated state funds: 0.0, estimated local funds: 0, estimated other funds: 0.0",LOST-1027,Other


### Look at the data

In [22]:
# Ten lead agencies with the most distinct project ids.
all_projects.groupby(['lead_agency']).agg({'project_id':'nunique'}).sort_values('project_id', ascending = False).head(10)

Unnamed: 0_level_0,project_id
lead_agency,Unnamed: 1_level_1
Caltrans,4660
,2028
Multiple,46
Metrolink,42
,33
Sjrrc,25
Sandag,22
Lossan,18
Sjjpa,17
Bnsf,12


In [42]:
# Kern County projects whose description contains "Seal Coat" (location column hidden).
all_projects[(all_projects.county == "Kern") & (all_projects.project_description.str.contains("Seal Coat"))].drop(columns = ['location'])

Unnamed: 0,project_title,lead_agency,project_year,project_category,grant_program,project_description,total_project_cost,fully_funded,total_available_funds,city,county,data_source,notes,funding_notes,project_id,auto_tagged_project_categories
3181,Pavement - Hm1,Caltrans,20/21,,Hm,"In Kern County, This Project Will Maintain/Repair Pavement - Seal Coat.",2.61,No available funding info,0.0,,Kern,SB1,,"sb1funds: 0.0, iijafunds: 0.0",SB1-1056,"Maintain/Repair,Seal Coat"
3205,Pavement - Hm1,Caltrans,20/21,,Hm,"In Kern County, This Project Will Maintain/Repair Pavement - Seal Coat.",1.99,No available funding info,0.0,,Kern,SB1,,"sb1funds: 0.0, iijafunds: 0.0",SB1-1080,"Maintain/Repair,Seal Coat"
3363,Pavement - Hm1 -Sb1,Caltrans,20/21,,Hm,"In Kern County, This Project Will Maintain/Repair Pavement - Seal Coat.",1.04,Fully funded,1.04,,Kern,SB1,,"sb1funds: 1.04, iijafunds: 0.0",SB1-1238,"Maintain/Repair,Seal Coat"
5779,Pavement - Hm1,Caltrans,20/21,,Hm,"In Kern County, This Project Will Maintain/Repair Pavement - Seal Coat.",2.61,No available funding info,0.0,,Kern,SB1,,"sb1funds: 0.0, iijafunds: 0.0",SB1-3654,"Maintain/Repair,Seal Coat"
5791,Pavement - Hm1,Caltrans,20/21,,Hm,"In Kern County, This Project Will Maintain/Repair Pavement - Seal Coat.",1.99,No available funding info,0.0,,Kern,SB1,,"sb1funds: 0.0, iijafunds: 0.0",SB1-3666,"Maintain/Repair,Seal Coat"
5807,Pavement - Hm1,Caltrans,21/22,,Hm,Maintain/Repair Pavement - Seal Coat,1.25,Fully funded,1.25,"Arvin, Bakersfield",Kern,SB1,,"sb1funds: 1.25, iijafunds: 0.0",SB1-3682,"Maintain/Repair,Seal Coat"
5808,Pavement - Hm1,Caltrans,21/22,,Hm,Maintain/Repair Pavement - Seal Coat,1.25,Fully funded,1.25,"Arvin, Bakersfield",Kern,SB1,,"sb1funds: 1.25, iijafunds: 0.0",SB1-3683,"Maintain/Repair,Seal Coat"
5809,Pavement - Hm1,Caltrans,21/22,,Hm,Maintain/Repair Pavement - Seal Coat,1.25,Fully funded,1.25,"Arvin, Bakersfield",Kern,SB1,,"sb1funds: 1.25, iijafunds: 0.0",SB1-3684,"Maintain/Repair,Seal Coat"
5964,Pavement - Hm1 -Sb1,Caltrans,20/21,,Hm,"In Kern County, This Project Will Maintain/Repair Pavement - Seal Coat.",1.04,Fully funded,1.04,,Kern,SB1,,"sb1funds: 1.04, iijafunds: 0.0",SB1-3839,"Maintain/Repair,Seal Coat"


In [54]:
# all_projects.groupby(['project_category','auto_tagged_project_categories']).agg({'project_id':'nunique'})

In [55]:
# Ten most common auto-tagged category strings, by distinct project ids.
all_projects.groupby(['auto_tagged_project_categories']).agg({'project_id':'nunique'}).sort_values('project_id', ascending = False).head(10)

Unnamed: 0_level_0,project_id
auto_tagged_project_categories,Unnamed: 1_level_1
Other,4842
Perform Preservation,402
Maintain/Repair,315
Repair/Replace,260
Allocated By,185
"Maintain/Repair,Asphalt Overlay",136
"Maintain/Repair,Maintenance Station",128
Service,117
"Maintain/Repair,Transportation Management",101
Replace Bridge,60


In [56]:
# Ten most common source-provided project categories, by distinct project ids.
all_projects.groupby(['project_category']).agg({'project_id':'nunique'}).sort_values('project_id', ascending = False).head(10)

Unnamed: 0_level_0,project_id
project_category,Unnamed: 1_level_1
,7457
Capital,229
Highway,77
Transit,59
Candidate Grade Separation Projects,46
"Local Safety, Pothole And Congestion Relief Improvements",39
Bicycle And Pedestrian Improvements,35
Bicycle And Pedestrian,34
Grade Separation,29
"Local Road Maintenance, Safety And Congestion Relief\nProgram",24


In [43]:
# Ten most repeated project descriptions, by distinct project ids.
all_projects.groupby(['project_description']).agg({'project_id':'nunique'}).sort_values('project_id', ascending = False).head(10)

Unnamed: 0_level_0,project_id
project_description,Unnamed: 1_level_1
,2155
Resurface,191
Repair/Replace Culverts,157
Overlay,126
Repair/Maintenance,108
Maintain/Repair Transportaiton Management Systems,98
Maintain/Repair Maintenance Station,79
Maintain/Repair Pavement - Asphault Overlay,36
"In San Diego County, This Project Will Repair/Replace Culverts.",35
Reconstruction,29


In [44]:
# Ten county values with the most distinct project ids.
all_projects.groupby(['county']).agg({'project_id':'nunique'}).sort_values('project_id', ascending = False).head(10)

Unnamed: 0_level_0,project_id
county,Unnamed: 1_level_1
Imperial,780
Los Angeles,778
San Diego,390
San Bernardino,304
Alameda,293
San Mateo,282
Orange,276
,276
Santa Clara,216
Riverside,215


In [23]:
# Number of distinct lead agency names.
all_projects.lead_agency.nunique()

710

In [24]:
# Summary statistics of total project cost.
all_projects.total_project_cost.describe()

count    8284.00
mean       37.09
std       419.45
min         0.00
25%         0.40
50%         3.10
75%        12.00
max     30000.00
Name: total_project_cost, dtype: float64

In [25]:
# Distinct projects flagged "Fully funded", per data source.
all_projects.loc[all_projects.fully_funded == "Fully funded"].groupby(
    ["data_source"]
).agg({"project_id":"nunique"})

Unnamed: 0_level_0,project_id
data_source,Unnamed: 1_level_1
LOST,277
SB1,874


In [26]:
# Distinct projects flagged "Partially funded", per data source.
all_projects.loc[all_projects.fully_funded == "Partially funded"].groupby(
    ["data_source"]
).agg({"project_id":"nunique"})

Unnamed: 0_level_0,project_id
data_source,Unnamed: 1_level_1
LOST,65
SB1,2250


In [27]:
# Distinct projects per data source.
all_projects.groupby(['data_source']).agg({"project_id":"nunique"})

Unnamed: 0_level_0,project_id
data_source,Unnamed: 1_level_1
LOST,1849
SB1,6159
State Rail Plan,276


In [28]:
# Distinct projects per funding status, largest group first.
all_projects.groupby(['fully_funded']).agg({"project_id":"nunique"}).reset_index().sort_values('project_id', ascending = False)

Unnamed: 0,fully_funded,project_id
1,No available funding info,3729
3,Partially funded,2315
0,Fully funded,1151
2,No project cost info,1089


In [29]:
# Distinct projects per data source and funding status.
all_projects.groupby(['data_source','fully_funded']).agg({"project_id":"nunique"})

Unnamed: 0_level_0,Unnamed: 1_level_0,project_id
data_source,fully_funded,Unnamed: 2_level_1
LOST,Fully funded,277
LOST,No available funding info,423
LOST,No project cost info,1084
LOST,Partially funded,65
SB1,Fully funded,874
SB1,No available funding info,3035
SB1,Partially funded,2250
State Rail Plan,No available funding info,271
State Rail Plan,No project cost info,5
