## General functions to clean up and harmonize data from various grants

In [1]:
import _cleaning_utils
import _harmonization_utils as harmonization_utils
import _state_rail_plan_utils as srp_utils
import pandas as pd
from calitp_data_analysis.sql import to_snakecase

import shapely 
import geopandas as gpd


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


In [2]:
import re
import nltk
from nltk import ngrams
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
import re
from collections import Counter
from autocorrect import Speller

In [3]:
# Notebook-wide pandas display settings: show up to 100 columns, format
# floats with two decimals, and never truncate rows or long cell text.
for option_name, option_value in {
    "display.max_columns": 100,
    "display.float_format": "{:.2f}".format,
    "display.max_rows": None,
    "display.max_colwidth": None,
}.items():
    pd.set_option(option_name, option_value)

In [4]:
# sb1 = harmonization_utils.load_sb1()

In [5]:
# lost = harmonization_utils.load_lost()

In [6]:
# srp = harmonization_utils.load_state_rail_plan()

In [7]:
def harmonizing(
    df,
    agency_name_col: str,
    project_name_col: str,
    project_description_col: str,
    project_category_col: str,
    project_cost_col: str,
    location_col: str,
    county_col: str,
    city_col: str,
    project_year_col: str,
    program_col: str,
    data_source: str,
    fund_cols: list,
    cost_in_millions: bool = True,
):
    """
    Take a dataset and change the column names/types to
    default names and formats.

    Add metric if the project is fully funded or not.

    Args:
        df: raw project table for a single data source. It is not
            modified; a renamed copy is worked on.
        agency_name_col, project_name_col, project_description_col,
        project_category_col, project_cost_col, location_col, county_col,
        city_col, project_year_col, program_col: name of the column in
            `df` holding each attribute. Pass "" when the dataset has no
            such column. Missing county / city / notes / project_year /
            project_category columns are created and filled with "None";
            a missing grant_program defaults to `data_source`.
        data_source: label written to the `data_source` column and used
            as the prefix of `project_id`.
        fund_cols: funding columns (by their name after renaming) that are
            summed into `total_available_funds` and listed in
            `funding_notes`. May be empty.
        cost_in_millions: when True, every cost/funds column is divided
            by 1,000,000.

    Returns:
        A dataframe containing only the harmonized columns
        (project_title, lead_agency, project_year, project_category,
        grant_program, project_description, total_project_cost,
        fully_funded, total_available_funds, location, city, county,
        data_source, notes, funding_notes, project_id).

    Raises:
        KeyError: if, after renaming, any harmonized column that is not
            auto-created (e.g. project_title, location) is missing.
    """
    # Map each source column onto its harmonized name. Callers pass "" for
    # attributes their dataset lacks, so skip empty names instead of letting
    # them all collapse onto a single "" dictionary key.
    harmonized_names = [
        (agency_name_col, "lead_agency"),
        (project_name_col, "project_title"),
        (project_description_col, "project_description"),
        (project_category_col, "project_category"),
        (project_cost_col, "total_project_cost"),
        (location_col, "location"),
        (county_col, "county"),
        (city_col, "city"),
        (project_year_col, "project_year"),
        (program_col, "grant_program"),
    ]
    rename_columns = {old: new for old, new in harmonized_names if old}
    df = df.rename(columns=rename_columns)

    # Coerce cost/fund columns to numeric; unparseable values become 0.
    # (No capture group in the pattern: str.contains warns when it sees one.)
    cost_columns = df.columns[df.columns.str.contains("cost|funds")].tolist()
    for column in cost_columns:
        df[column] = pd.to_numeric(df[column], errors="coerce").fillna(0)

    # Clean up string columns
    for column in df.select_dtypes(include=["object"]).columns.to_list():
        df[column] = df[column].str.strip().str.title()

    # Clean agency names (logic lives in harmonization_utils)
    df = harmonization_utils.organization_cleaning(df, "lead_agency")

    # Add data source. Done after the title-casing above so labels such as
    # "LOST" keep their capitalization.
    df["data_source"] = data_source

    # Divide cost columns by millions if requested
    if cost_in_millions:
        for column in cost_columns:
            df[column] = df[column].divide(1_000_000)

    # Create columns even if they don't exist, just to harmonize
    # before concatting.
    for column in ["county", "city", "notes", "project_year", "project_category"]:
        if column not in df:
            df[column] = "None"
    if "grant_program" not in df:
        df["grant_program"] = data_source

    # Add up all available funds (an empty fund_cols list sums to 0).
    df["total_available_funds"] = df[fund_cols].sum(axis=1)

    # Keep the per-source funding breakout as text, since only the total
    # survives in the harmonized columns. Built column-wise so it also works
    # on an empty frame. Format: " <fund col>: <value>,  <fund col>: <value>"
    # with every underscore turned into a space.
    note_parts = [f"_{column}: " + df[column].astype(str) for column in fund_cols]
    if note_parts:
        funding_notes = note_parts[0]
        for part in note_parts[1:]:
            funding_notes = funding_notes + ", " + part
    else:
        funding_notes = pd.Series("", index=df.index, dtype="object")
    df["funding_notes"] = funding_notes.str.replace("_", " ", regex=False)

    # Compare available funds against total project cost
    # (classification is done by harmonization_utils.funding_vs_expenses).
    df["fully_funded"] = df.apply(harmonization_utils.funding_vs_expenses, axis=1)

    # Create unique project id - first LOST project is LOST-1,
    # second LOST is LOST-2, LOST-3, LOST-4, etc.
    # cumcount() is zero-based, hence the + 1.
    sequence = df.groupby("data_source").cumcount() + 1
    df["project_id"] = df["data_source"] + "-" + sequence.astype(str)

    # Only keep certain columns
    columns_to_keep = [
        "project_title",
        "lead_agency",
        "project_year",
        "project_category",
        "grant_program",
        "project_description",
        "total_project_cost",
        "fully_funded",
        "total_available_funds",
        "location",
        "city",
        "county",
        "data_source",
        "notes",
        "funding_notes",
        "project_id",
    ]
    df = df[columns_to_keep]

    # Fill in any nulls: 0.0 for float64 columns, "None" for object columns.
    # Columns of any other dtype (e.g. geometry) are left untouched.
    fill_values = {
        column: 0.0 for column in df.select_dtypes(include=["float64"]).columns
    }
    fill_values.update(
        {column: "None" for column in df.select_dtypes(include=["object"]).columns}
    )
    df = df.fillna(value=fill_values)

    return df

In [8]:
def harmonize_srp():
    """
    Load the State Rail Plan projects and run them through `harmonizing`.

    County, city, project year and grant program are passed as "" and no
    funding columns are supplied; costs are divided by one million.
    """
    srp_raw = harmonization_utils.load_state_rail_plan()

    srp_settings = {
        "agency_name_col": "lead_agency",
        "project_name_col": "project_name",
        "project_description_col": "project_description",
        "project_category_col": "project_category",
        "project_cost_col": "total_project_cost",
        "location_col": "corridor",
        "county_col": "",
        "city_col": "",
        "project_year_col": "",
        "program_col": "",
        "data_source": "State Rail Plan",
        "fund_cols": [],
        "cost_in_millions": True,
    }

    return harmonizing(srp_raw, **srp_settings)

In [9]:
def harominze_lost():
    """
    Load the LOST projects and run them through `harmonizing`.

    Project year and grant program are passed as "". Costs are not divided
    by one million (cost_in_millions=False).
    NOTE(review): the cost column is named "cost__in_millions_", so the
    values are presumably already in millions - confirm against the source.

    The function name is misspelled; it is kept because existing callers
    use it. Prefer the correctly spelled alias `harmonize_lost` in new code.
    """
    df = harmonization_utils.load_lost()
    df = harmonizing(
        df,
        agency_name_col="agency",
        project_name_col="project_title",
        project_description_col="project_description",
        project_category_col="project_category",
        project_cost_col="cost__in_millions_",
        location_col="location",
        county_col="county",
        city_col="city",
        project_year_col="",
        program_col="",
        data_source="LOST",
        fund_cols=[
            "estimated_lost_funds",
            "estimated_federal_funds",
            "estimated_state_funds",
            "estimated_local_funds",
            "estimated_other_funds",
        ],
        cost_in_millions=False,
    )

    return df


# Correctly spelled alias, matching harmonize_srp / harmonize_sb1.
harmonize_lost = harominze_lost

In [10]:
def harmonize_sb1():
    """
    Load the SB1 projects and run them through `harmonizing`.

    The `geometry` column is used as the location, project category is
    passed as "", and the two funding columns are `sb1funds` and
    `iijafunds`; costs are divided by one million.
    """
    sb1_raw = harmonization_utils.load_sb1()

    sb1_settings = {
        "agency_name_col": "agencies",
        "project_name_col": "projecttitle_x",
        "project_description_col": "projectdescription",
        "project_category_col": "",
        "project_cost_col": "totalcost",
        "location_col": "geometry",
        "county_col": "countynames",
        "city_col": "citynames",
        "project_year_col": "fiscalyears",
        "program_col": "programcodes",
        "data_source": "SB1",
        "fund_cols": ["sb1funds", "iijafunds"],
        "cost_in_millions": True,
    }

    return harmonizing(sb1_raw, **sb1_settings)

In [11]:
# Raw (un-harmonized) SB1 data, loaded at notebook level.
# NOTE(review): no later cell visible here reads this variable -
# add_all_projects() loads its own harmonized copy - confirm whether this
# cell is still needed.
sb1 = harmonization_utils.load_sb1()



In [12]:

def get_list_of_words(df, col: str) -> list:
    """
    Natalie's function to clean and place words in a project description column
    into a list

    Args:
        df: dataframe holding the text column.
        col: name of the text column.

    Returns:
        Every lower-cased, punctuation-free token of the column, in order,
        with English stopwords removed. Missing values are skipped and
        non-string values are converted with str().
    """
    # Take the column as a Series directly. (`df[[col]].squeeze()` turns a
    # one-row frame into a bare scalar, which has no .tolist().)
    # Drop nulls and force text so the join below cannot hit a non-string.
    descriptions = df[col].dropna().astype(str)

    # Join all the column into one large text blob, lower text
    text_blob = " ".join(descriptions).lower()

    # remove punctuation
    text_blob = re.sub(r"[^\w\s]", "", text_blob)

    # Stopwords with their punctuation stripped the same way (e.g. "don't"
    # -> "dont") so they match the cleaned text. A set gives O(1) lookups.
    # The class is A-Za-z, not A-z: the latter also spans [ \ ] ^ _ `.
    swords = {
        re.sub(r"[^A-Za-z\s]", "", sword) for sword in stopwords.words("english")
    }

    # Remove stopwords
    return [word for word in word_tokenize(text_blob) if word not in swords]

In [13]:
def find_common_phrases(df, description_column:str, values_to_add:list,
                        ngram_size: int = 2, min_count: int = 40):
    """
    Build the keyword list used to auto-categorize projects.

    Args:
        df: dataframe holding the description column.
        description_column: name of the text column to mine for phrases.
        values_to_add: extra keywords appended to the end of the result
            as-is (they are not filtered).
        ngram_size: number of consecutive words per phrase (default 2).
        min_count: a phrase is kept only if it occurs strictly more than
            this many times (default 40).

    Returns:
        Common phrases in order of first appearance, minus place names and
        phrases containing any blocked fragment, followed by
        `values_to_add`.
    """
    # Break apart every word in the description column into a list
    descriptions_list = get_list_of_words(df, description_column)

    # Count every phrase of the requested length
    phrase_counts = Counter(
        " ".join(gram) for gram in ngrams(descriptions_list, ngram_size)
    )

    # Place names and filler that should never become a category
    phrases_to_del = {"san bernardino","los angeles","contra costa", "el dorado",
                "san luis obispo", "luis obispo", "del norte", "san francisco",
                "improve approximately"}

    # Fragments that disqualify a phrase (county information/etc).
    # This is a substring test on the whole phrase, so e.g. 'san' also
    # removes phrases containing 'santa'.
    # NOTE(review): confirm whether whole-word matching was intended.
    words_to_delete = ['county','route', 'dollar','mile','santa','project','san','lanes','lane',
                      '2','4','financial','prop','best', 'approximately',]

    # Filter in a single pass over the counter, which keeps first-seen
    # order so the result is the same on every run.
    common_phrases_list = [
        phrase
        for phrase, total in phrase_counts.items()
        if total > min_count
        and phrase not in phrases_to_del
        and not any(word in phrase for word in words_to_delete)
    ]

    # ADD certain keywords here
    # Operating Additional Service
    common_phrases_list.extend(values_to_add)

    return common_phrases_list

In [14]:
def categorize_projects(df, 
                        description_column:str,
                        project_id_column:str,
                        title_column:str,
                        values_to_add:list):
    """
    Tag every project with the common phrases/keywords found in its
    description.

    Args:
        df: dataframe of projects.
        description_column: column holding the free-text description.
        project_id_column: column holding the project id.
        title_column: column holding the project title.
        values_to_add: extra keywords to search for, in addition to the
            automatically detected common phrases.

    Returns:
        A dataframe with one row per (description, project id, title) and
        an `auto_project_category` column holding the comma-joined,
        title-cased, spell-corrected categories ("Other" when nothing
        matched).

    Side effects:
        Adds `clean_description` and `auto_project_category` columns to the
        `df` object passed in (in place).
    """
    # Find most common 2 word phrases for some automatic project categories
    common_phrases_list = find_common_phrases(df,description_column,values_to_add)

    # Join the keywords into one alternation. Escape each keyword so any
    # regex metacharacter in it is matched literally, and skip empty strings,
    # which would make the pattern match everywhere.
    # https://stackoverflow.com/questions/64727090/extract-all-matching-keywords-from-a-list-of-words-and-create-a-new-dataframe-pa
    query = "|".join(re.escape(phrase) for phrase in common_phrases_list if phrase)

    # Lower-case the descriptions and turn "-", "(", ")" and "." into spaces
    # before searching. A single character class is used because pandas >= 2.0
    # treats one-character patterns as real regexes when regex=True: "("
    # would raise and "." would blank out every character.
    df["clean_description"] = (
        df[description_column]
        .str.lower()
        .str.replace(r"[-().]", " ", regex=True)
        .str.strip()
    )

    # Search through description column for the most common phrases
    # Input the results in the new column
    if query:
        df["auto_project_category"] = df["clean_description"].str.findall(
            r"\b({})\b".format(query)
        )
    else:
        # No keywords to look for: leave everything uncategorized.
        df["auto_project_category"] = None

    # Explode to take categories out of a list
    # Drop duplicate project keywords by title
    df = (
        df.explode("auto_project_category")
        .sort_values([project_id_column, title_column])
        .drop_duplicates(
            subset=[
                description_column,
                project_id_column,
                title_column,
                "auto_project_category",
            ]
        )
    )

    # Fill any uncategorized projects as "Other"
    df["auto_project_category"] = df["auto_project_category"].fillna("Other").str.title()

    # Correct spelling. Each distinct category is corrected once and mapped
    # back, rather than re-running the speller on every row.
    spell = Speller(lang='en')
    corrected = {
        category: " ".join([spell(word) for word in category.split()])
        for category in df["auto_project_category"].unique()
    }
    df["auto_project_category"] = df["auto_project_category"].map(corrected)

    # Summarize - put all the categories onto one line
    df = (
        df.groupby([description_column, project_id_column, title_column])[
            "auto_project_category"
        ]
        .apply(",".join)
        .reset_index()
    )

    return df

In [15]:
def add_all_projects():
    """
    Harmonize every data source, stack them, and attach automatically
    detected project categories.

    Returns:
        (all_projects_df, all_projects_gdf):
        - all_projects_df: LOST, State Rail Plan and SB1 projects stacked,
          with an extra `auto_tagged_project_categories` column.
        - all_projects_gdf: the harmonized SB1 projects with `location`
          set as the geometry column.
    """
    # Load dataframes
    state_rail_plan = harmonize_srp()
    lost = harominze_lost()
    sb1 = harmonize_sb1()

    # Concat for df
    all_projects_df = pd.concat([lost, state_rail_plan, sb1])

    # Categorize. Keyword arguments make sure the id and title columns land
    # in the right parameters.
    merge_keys = ['project_description', 'project_title', 'project_id']
    categories = categorize_projects(
        all_projects_df,
        description_column='project_description',
        project_id_column='project_id',
        title_column='project_title',
        values_to_add=['operating','service','zero emission vehicle','zev','maintain/repair',
                       'repair/replace'],
    )

    # categorize_projects adds its working columns to the frame it is given;
    # drop them (if present) so the merge brings in only the summarized
    # categories and no _x/_y suffixes are produced.
    all_projects_df = all_projects_df.drop(
        columns=['clean_description', 'auto_project_category'], errors='ignore'
    )

    # Merge categorized and rename
    all_projects_df = pd.merge(
        all_projects_df,
        categories,
        how="left",
        on=merge_keys,
    ).rename(columns={'auto_project_category': 'auto_tagged_project_categories'})

    # Concat for gdf (SB1 is the only source listed here)
    all_projects_gdf = pd.concat([sb1])
    all_projects_gdf = all_projects_gdf.set_geometry('location')

    return all_projects_df, all_projects_gdf

In [16]:
# Build the combined project table and the SB1 geodataframe used by the
# summary cells below.
all_projects, all_projects_geo = add_all_projects()



In [27]:
# all_projects.drop(columns = ['location'])[['project_title','project_category', 'auto_tagged_project_categories','project_description','total_available_funds','funding_notes']].sample(100)

### Look at the data

In [28]:
# Number of distinct projects per lead agency
all_projects.groupby("lead_agency").agg(project_id=("project_id", "nunique"))

Unnamed: 0_level_0,project_id
lead_agency,Unnamed: 1_level_1
,33
Ac Transit,4
Alameda County,5
Alameda County Cma,1
Alameda County Transportation Commission,2
Alameda-Contra Costa Transit District,3
Alpine County,1
Alpine County Transportation Commission,2
Altamont Corridor Express,1
Amador City,1


In [18]:
# (rows, columns) of the combined project table
all_projects.shape

(8229, 17)

In [19]:
# Share of projects (in percent) in each funding status
all_projects.fully_funded.value_counts() / len(all_projects) * 100

No available funding info   44.65
Partially funded            28.13
Fully funded                13.99
No project cost info        13.23
Name: fully_funded, dtype: float64

In [20]:
# Number of distinct lead agencies across all data sources
all_projects.lead_agency.nunique()

710

In [21]:
# Summary statistics of the harmonized total project cost
all_projects.total_project_cost.describe()

count    8229.00
mean       37.33
std       420.84
min         0.00
25%         0.39
50%         3.15
75%        12.15
max     30000.00
Name: total_project_cost, dtype: float64

In [22]:
# Distinct fully funded projects per data source
(
    all_projects
    .query("fully_funded == 'Fully funded'")
    .groupby("data_source")
    .agg(project_id=("project_id", "nunique"))
)

Unnamed: 0_level_0,project_id
data_source,Unnamed: 1_level_1
LOST,277
SB1,874


In [23]:
# Distinct partially funded projects per data source
(
    all_projects
    .query("fully_funded == 'Partially funded'")
    .groupby("data_source")
    .agg(project_id=("project_id", "nunique"))
)

Unnamed: 0_level_0,project_id
data_source,Unnamed: 1_level_1
LOST,65
SB1,2250


In [24]:
# Distinct projects per data source
all_projects.groupby("data_source").agg(project_id=("project_id", "nunique"))

Unnamed: 0_level_0,project_id
data_source,Unnamed: 1_level_1
LOST,1849
SB1,6104
State Rail Plan,276


In [25]:
# Distinct projects per data source and funding status
(
    all_projects
    .groupby(["data_source", "fully_funded"])
    .agg(project_id=("project_id", "nunique"))
)

Unnamed: 0_level_0,Unnamed: 1_level_0,project_id
data_source,fully_funded,Unnamed: 2_level_1
LOST,Fully funded,277
LOST,No available funding info,423
LOST,No project cost info,1084
LOST,Partially funded,65
SB1,Fully funded,874
SB1,No available funding info,2980
SB1,Partially funded,2250
State Rail Plan,No available funding info,271
State Rail Plan,No project cost info,5


In [26]:
# all_projects_df = harmonization_utils.add_all_projects()