## General function to clean up data from various grants
TO DO
* Switch City of Berkeley to Berkeley City. https://github.com/cal-itp/data-analyses/blob/main/Agreement_Overlap/add_dla.ipynb
* De duplicate projects
* Rearrange counties in County column in alphabetical order.

In [90]:
import _cleaning_utils
import _harmonization_utils as harmonization_utils
import _state_rail_plan_utils as srp_utils
import pandas as pd
from calitp_data_analysis.sql import to_snakecase

import shapely 
import geopandas as gpd

In [91]:
"""
import re
import nltk
from nltk import ngrams
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
import re
from collections import Counter
from autocorrect import Speller
"""

'\nimport re\nimport nltk\nfrom nltk import ngrams\nfrom nltk.corpus import stopwords\nfrom nltk.tokenize import sent_tokenize, word_tokenize\nimport re\nfrom collections import Counter\nfrom autocorrect import Speller\n'

In [92]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [93]:
# sb1 = harmonization_utils.load_sb1()

In [94]:
# lost = harmonization_utils.load_lost()

In [95]:
# lost.sample()

In [96]:
# srp = harmonization_utils.load_state_rail_plan()

In [97]:
columns_to_keep = [
        "project_title",
        "lead_agency",
        "project_year",
        "project_category",
        "grant_program",
        "phase",
        "project_description",
        "total_project_cost",
        "fully_funded",
        "total_available_funds",
        "city",
        "county",
        "location",
        "geometry",
        "data_source",
        "notes",
        "funding_notes",
        "project_id",
        "phase",
        
    ]

In [98]:
def create_notes(df, note_cols: list, new_col_name:str):
    """
    Concat multiple columns into one. 
    """
    prefix = "_"
    for column in note_cols:
        df[f"{prefix}{column}"] = df[column].astype(str)
    note_cols = [prefix + sub for sub in note_cols]

    # https://stackoverflow.com/questions/65532480/how-to-combine-column-names-and-values
    def combine_notes(x):
        return ", ".join([col + ": " + x[col] for col in note_cols])

    df[new_col_name] = df.apply(combine_notes, axis=1)
    df[new_col_name] = df[new_col_name].str.replace("_", " ")

    return df

#### Make sure this doesn't erase project cost detail
* Apply this to original dataframes and compare side by side

In [99]:
def correct_cost_fund(df):
    """
    Change columns with cost/fund in 
    its name to be integer datatype
    """
    cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
    for i in cost_columns:
        try:
            df[i] = (
                df[i]
                .str.replace("$", "")
                .str.replace(",", "")
                .apply(pd.to_numeric, errors="coerce")
                .fillna(0)
            )
        except:
            df[i] = df[i].apply(pd.to_numeric, errors="coerce").fillna(0)
    return df

#### Why is this not working?

In [100]:
def correct_project_cost(df, project_title_col:str, project_total_cost:str):
    """
    For some datasets, the same project 
    (as determined by the same project name, cost,
    and source) is split across multiple rows. 

    Ex: A project costs $500 million and is 
    split on 5 rows by phase/location. Each row still lists 
    the total  cost as $500 million, which is not accurate.
    This function will recalculate each of the row to list
    $100 mil as the total project cost
    """
    # Create a unique identifier
    df['unique_identifier'] =  df[project_title_col] + df[project_total_cost].astype(str)
    
    # Create count for each project
    df['how_many_times_same_proj_appears'] = df.groupby('unique_identifier').cumcount()+1
    
    # Find the total number of times a project title-cost appears. 
    # Sort by descending and keep only the row with the highest level
    df2 = (df[[project_title_col,'how_many_times_same_proj_appears',project_total_cost, 'unique_identifier']]
            .sort_values([project_title_col, 'how_many_times_same_proj_appears'], ascending = False)
            .drop_duplicates(subset = ['unique_identifier'])
           ) 
    # Create new funding estimate
    df2['new_proj_cost'] = df2[project_total_cost] / df2["how_many_times_same_proj_appears"]
    
    # Drop some columns
    df2 = df2.drop(columns = [project_title_col,project_total_cost,'how_many_times_same_proj_appears'])
    
    # Merge
    m1 = pd.merge(df, df2, how = "inner", on = "unique_identifier")
    
    # Clean up
    m1 = m1.drop(columns = ['unique_identifier', 'how_many_times_same_proj_appears', project_total_cost])
    
    # Replace project cost
    m1 = m1.rename(columns = {'new_proj_cost':'total_project_cost'})
    return m1

In [101]:
def harmonizing(
    df,
    agency_name_col: str,
    project_name_col: str,
    project_description_col: str,
    project_category_col: str,
    project_cost_col: str,
    location_col: str,
    geography_col:str,
    phase_col:str,
    county_col: str,
    city_col: str,
    project_year_col: str,
    program_col: str,
    data_source: str,
    fund_cols: list,
    cost_in_millions: bool = True,
):
    """
    Take a dataset and change the column names/types to
    default names and formats.
    
    Add metric if the project is fully funded or not.
    """
    rename_columns = {
        agency_name_col: "lead_agency",
        project_name_col: "project_title",
        project_description_col: "project_description",
        project_category_col: "project_category",
        project_cost_col: "total_project_cost",
        location_col: "location",
        geography_col: "geometry",
        phase_col: "phase",
        county_col: "county",
        city_col: "city",
        project_year_col: "project_year",
        program_col: "grant_program"
    }
    # Rename columns
    df = df.rename(columns=rename_columns)

    # Clean up monetary columns to be interger
    df = correct_cost_fund(df)
  
    # Clean up string columns
    string_cols = df.select_dtypes(include=["object"]).columns.to_list()
    for i in string_cols:
        df[i] = df[i].str.replace("_", " ").str.strip().str.title()
    
    # Clean agency names
    df = harmonization_utils.organization_cleaning(df, "lead_agency")

    # Add data source
    df["data_source"] = data_source

    # Divide cost columns by millions
    # If bool is set to True
    if cost_in_millions:
        for i in fund_cols + ['total_project_cost']:
            df[i] = df[i].divide(1_000_000)
            
    # Clean up project cost
    df = correct_project_cost(df, "project_title", "total_project_cost")
        
    # Create columns even if they don't exist, just to harmonize
    # before concatting.
    create_columns = ["county","city","notes", "project_year", 
    "project_category", "location", "phase"]
    for column in create_columns:
        if column not in df:
            df[column] = "None"
    if "grant_program" not in df:
        df["grant_program"] = data_source
    if "geometry" not in df:
        df["geometry"] = None

    # Determine if the project completely funded or not?
    # Add up all available funds
    df["total_available_funds"] = df[fund_cols].sum(axis=1)
    df["fully_funded"] = df.apply(harmonization_utils.funding_vs_expenses, axis=1)
    
    # Add new column with funding breakout 
    # Since it's summarized above and the details are suppressed.
    df = create_notes(df, fund_cols + ["total_project_cost_note"], "funding_notes")
    
    # Create unique project id - first LOST project is LOST-1, 
    # second LOST is LOST-2, LOST-3, LOST-4, etc 
    df['project_id'] =  df.data_source + '-' + df.groupby('data_source').cumcount().astype('str')
    
    # Only keep certain columns
    df = df[columns_to_keep]

    # Fill in any nulls
    df = df.fillna(df.dtypes.replace({"float64": 0.0, "object": "None"}))

    return df

In [102]:
def harmonize_srp():
    df = harmonization_utils.load_state_rail_plan()
    df = harmonizing(
        df,
        agency_name_col="lead_agency",
        project_name_col="project_name",
        project_description_col="project_description",
        project_category_col="project_category",
        project_cost_col="total_project_cost",
        location_col="corridor",
        geography_col="",
        phase_col="",
        county_col="",
        city_col="",
        project_year_col="",
        program_col= "",
        data_source = "State Rail Plan",
        fund_cols=[],
        cost_in_millions=True,)
        
    return df

In [103]:
def harominze_lost():
    df = harmonization_utils.load_lost()
    df = harmonizing(
        df,
        agency_name_col="agency",
         project_name_col = "project_title",
        project_description_col="project_description",
        project_category_col= "project_category",
        project_cost_col="cost__in_millions_",
        location_col = "location",
        geography_col="",
        phase_col="",
        county_col = "county",
        city_col = "city",
        project_year_col = "",
        program_col = "LOST",
        data_source = "measure",
        fund_cols = [
            "estimated_lost_funds",
            "estimated_federal_funds",
            "estimated_state_funds",
            "estimated_local_funds",
            "estimated_other_funds",
        ],
        cost_in_millions = False,
    )
        
    return df

In [104]:
def harmonize_sb1():
    df = harmonization_utils.load_sb1()
    df = harmonizing(
        df,
        agency_name_col="agencies",
        project_name_col="projecttitle_x",
        project_description_col="projectdescription",
        project_category_col="",
        project_cost_col="totalcost",
        location_col="",
        geography_col="geometry",
        phase_col="projectstatuses",
        county_col="countynames",
        city_col="citynames",
        project_year_col="fiscalyears",
        program_col="SB1",
        data_source = "programcodes",
        fund_cols=["sb1funds", "iijafunds"],
        cost_in_millions=True,
    )
        
    return df

In [105]:
def add_all_projects():

    # Load  dataframes
    state_rail_plan = harmonize_srp()
    lost = harominze_lost()
    sb1 = harmonize_sb1()
    
    # Concat for df
    all_projects_df = pd.concat([lost, state_rail_plan, sb1])
    
    return all_projects_df

In [106]:
all_projects = add_all_projects()



In [109]:
all_projects.drop(columns = ['geometry']).sample(5)

Unnamed: 0,project_title,lead_agency,project_year,project_category,grant_program,phase,project_description,total_project_cost,fully_funded,total_available_funds,city,county,location,data_source,notes,funding_notes,project_id,phase.1
472,Emerson Ave From 3 Street To Highway 98,,,,measure,,Reconstruction,0.0,No project cost info,0.0,City Of Calexico,Imperial,,measure,,"estimated lost funds: 0.0, estimated federal funds: 0.0, estimated state funds: 0.0, estimated local funds: 0, estimated other funds: 0.0, total project cost note: This is an estimate of how much the project cost, estimated by dividing the total project cost by how many times the project appears in the dataset.",measure-472,
1713,La Canada Flintridge Soundwalls Project,,21/22,,programcodes,In Progress,"In The City Of La Canada Flintridge, Adjacent To The I-210, From Hampton-Foothill Undercrossing To Berkshire Place Undercrossing. Construct Three Soundwalls.",5.0,Fully funded,5.0,"La Canada Flintridge, Pasadena",Los Angeles,,programcodes,,"sb1funds: 5.0, iijafunds: 0.0, total project cost note: This is an estimate of how much the project cost, estimated by dividing the total project cost by how many times the project appears in the dataset.",programcodes-1713,In Progress
1127,Pavement Resurfacing Project,City Of Jackson,19/20,,programcodes,Inprogress,,0.12,Fully funded,0.2,Jackson,Amador,,programcodes,,"sb1funds: 0.197598, iijafunds: 0.0, total project cost note: This is an estimate of how much the project cost, estimated by dividing the total project cost by how many times the project appears in the dataset.",programcodes-1127,Inprogress
1150,Fairview Road Heavy Rehab And Seal Cracks,,,,measure,,,0.0,No project cost info,0.0,,San Benito,,measure,,"estimated lost funds: 0.0, estimated federal funds: 0.0, estimated state funds: 0.0, estimated local funds: 0, estimated other funds: 0.0, total project cost note: This is an estimate of how much the project cost, estimated by dividing the total project cost by how many times the project appears in the dataset.",measure-1150,
1246,,City Of San Marcos,"18/19, 19/20",,programcodes,"Completed, Inprogress",,3.6,Partially funded,0.54,San Marcos,San Diego,,programcodes,,"sb1funds: 0.54037, iijafunds: 0.0, total project cost note: This is an estimate of how much the project cost, estimated by dividing the total project cost by how many times the project appears in the dataset.",programcodes-1246,"Completed, Inprogress"


### Metrics

### Categorization

In [None]:

def get_list_of_words(df, col: str) -> list:
    """
    Natalie's function to clean and place words in a project description column
    into a list
    """
    # get just the one col
    column = df[[col]]

    # remove single-dimensional entries from the shape of an array
    col_text = column.squeeze()
    # get list of words
    text_list = col_text.tolist()

    # Join all the column into one large text blob, lower text
    text_list = " ".join(text_list).lower()

    # remove punctuation
    text_list = re.sub(r"[^\w\s]", "", text_list)

    # List of stopwords
    swords = [re.sub(r"[^A-z\s]", "", sword) for sword in stopwords.words("english")]
    
    # Remove stopwords
    clean_text_list = [
        word for word in word_tokenize(text_list.lower()) if word not in swords
    ]

    return clean_text_list

In [None]:
def find_common_phrases(df, description_column:str, values_to_add:list):
    
    # Break apart every word in the description column into a list
    descriptions_list = get_list_of_words(df, description_column)
    
    # Get phrases of whatever length you want (2,3,4,etc)
    c = Counter([" ".join(y) for x in [2] for y in ngrams(descriptions_list, x)])
    
    # Make a dataframe out of the counter values
    df_phrases = pd.DataFrame({"phrases": list(c.keys()), "total": list(c.values())})
    
    # Take phrases that are repeated more than 40 times and turn it into a list
    df_phrases = ((df_phrases.loc[df_phrases["total"] > 40])).reset_index(drop = True)
    common_phrases_list = df_phrases.phrases.tolist()
    
    phrases_to_del = ["san bernardino","los angeles","contra costa", "el dorado",
                "san luis obispo", "luis obispo", "del norte", "san francisco",
                "improve approximately"]
    
    common_phrases_list = list(set(common_phrases_list) - set(phrases_to_del))
    
    # CLean up the list to delete county information/etc
    words_to_delete = ['county','route', 'dollar','mile','santa','project','san','lanes','lane',
                      '2','4','financial','prop','best', 'approximately',]
    
    for word in words_to_delete:
        common_phrases_list = [ x for x in common_phrases_list if word not in x ]

    # ADD certain keywords here 
    # Operating Additional Service
    common_phrases_list.extend(values_to_add)
    
    return common_phrases_list 

In [None]:
def categorize_projects(df, 
                        description_column:str,
                        project_id_column:str,
                        title_column:str,
                        values_to_add:list):
    
    # Find most common 2 word phrases for some automatic project categories
    common_phrases_list = find_common_phrases(df,description_column,values_to_add)
    
    # Place all the words in common_phrases_list into a blob named query
    # https://stackoverflow.com/questions/64727090/extract-all-matching-keywords-from-a-list-of-words-and-create-a-new-dataframe-pa
    query = "|".join(common_phrases_list)
    
    # Remove punctation and lower strings in original description column befores searching
    df["clean_description"] = (df[description_column]
                      .str.lower()
                      .str.replace("-", " ", regex=True)
                      .str.replace("("," ", regex=True)
                      .str.replace(")"," ", regex=True)
                      .str.replace("."," ", regex=True)
                      .str.strip()
                     )
    
    # Search through description column for the most common phrases 
    # Input the results in the new column
    df["auto_project_category"] = df["clean_description"].str.findall(
    r"\b({})\b".format(query))
    
    # Explode to take categories out of a list
    # Drop duplicate project keywords by title
    df = (df
          .explode("auto_project_category")
          .sort_values([project_id_column, title_column])
          .drop_duplicates(subset=[description_column, project_id_column, title_column, "auto_project_category"]))
    
    # Fill any uncategorized projects as "Other"
    df["auto_project_category"] = (df["auto_project_category"].fillna("Other").str.title())
    
    # Correct spelling
    spell = Speller(lang='en')
    df["auto_project_category"] = df["auto_project_category"].apply(lambda x: " ".join([spell(i) for i in x.split()]))
    
    # Summarize - put all the categories onto one line
    df = (df
          .groupby([description_column, project_id_column, title_column,])["auto_project_category"]
          .apply(",".join)
          .reset_index())
    
    return df

In [None]:
def add_all_projects2():

    # Load  dataframes
    state_rail_plan = harmonize_srp()
    lost = harominze_lost()
    sb1 = harmonize_sb1()
    
    # Concat for df
    all_projects_df = pd.concat([lost, state_rail_plan, sb1])
    
    # Categorize
    categories = categorize_projects(all_projects_df, 'project_description','project_title','project_id',
                                       ['operating','service','zero emission vehicle','zev','maintain/repair',
                                       'repair/replace'])
    
    # Merge categorized 
    all_projects_df = pd.merge(all_projects_df.drop(columns = ['clean_description']),
                                    categories, 
                                    how = "left",
                                    on = ['project_description','project_title','project_id']
                 )
    
    # Rename
    all_projects_df = (all_projects_df.drop(columns = ['auto_project_category_x'])
                                     .rename(columns = {'auto_project_category_y':'auto_tagged_project_categories'})
                       )
    # Concat for gdf
    all_projects_gdf = pd.concat([sb1])
    all_projects_gdf = all_projects_gdf.set_geometry('location')
    
    return all_projects_df, all_projects_gdf

In [None]:
all_projects, all_projects_geo = add_all_projects()

In [None]:
# all_projects.drop(columns = ['location'])[['project_title','project_category', 'auto_tagged_project_categories','project_description','total_available_funds','funding_notes']].sample(100)

In [None]:
all_projects.info()

In [None]:
all_projects.drop(columns = ['location']).sample(5)

### Look at the data

In [None]:
all_projects.groupby(['lead_agency']).agg({'project_id':'nunique'}).sort_values('project_id', ascending = False).head(10)

In [None]:
all_projects[(all_projects.county == "Kern") & (all_projects.project_description.str.contains("Seal Coat"))].drop(columns = ['location'])

In [None]:
# all_projects.groupby(['project_category','auto_tagged_project_categories']).agg({'project_id':'nunique'})

In [None]:
all_projects.groupby(['auto_tagged_project_categories']).agg({'project_id':'nunique'}).sort_values('project_id', ascending = False).head(10)

In [None]:
all_projects.groupby(['project_category']).agg({'project_id':'nunique'}).sort_values('project_id', ascending = False).head(10)

In [None]:
all_projects.groupby(['project_description']).agg({'project_id':'nunique'}).sort_values('project_id', ascending = False).head(10)

In [None]:
all_projects.groupby(['county']).agg({'project_id':'nunique'}).sort_values('project_id', ascending = False).head(10)

In [None]:
all_projects.lead_agency.nunique()

In [None]:
all_projects.total_project_cost.describe()

In [None]:
all_projects.loc[all_projects.fully_funded == "Fully funded"].groupby(
    ["data_source"]
).agg({"project_id":"nunique"})

In [None]:
all_projects.loc[all_projects.fully_funded == "Partially funded"].groupby(
    ["data_source"]
).agg({"project_id":"nunique"})

In [None]:
all_projects.groupby(['data_source']).agg({"project_id":"nunique"})

In [None]:
all_projects.groupby(['fully_funded']).agg({"project_id":"nunique"}).reset_index().sort_values('project_id', ascending = False)

In [None]:
all_projects.groupby(['data_source','fully_funded']).agg({"project_id":"nunique"})