## General function to clean up data from various grants
TO DO
* Switch City of Berkeley to Berkeley City. https://github.com/cal-itp/data-analyses/blob/main/Agreement_Overlap/add_dla.ipynb
* De duplicate projects
* Rearrange counties in County column in alphabetical order.

* What to do with "None" in project description? Replace it with title?

In [1]:
import _cleaning_utils
import _harmonization_utils as harmonization_utils
import _state_rail_plan_utils as srp_utils
import geopandas as gpd
import pandas as pd
import shapely
from calitp_data_analysis.sql import to_snakecase


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


In [2]:
"""
import re
import nltk
from nltk import ngrams
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
import re
from collections import Counter
from autocorrect import Speller
"""

'\nimport re\nimport nltk\nfrom nltk import ngrams\nfrom nltk.corpus import stopwords\nfrom nltk.tokenize import sent_tokenize, word_tokenize\nimport re\nfrom collections import Counter\nfrom autocorrect import Speller\n'

In [3]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [4]:
# sb1 = harmonization_utils.load_sb1()

In [5]:
# lost = harmonization_utils.load_lost()

In [6]:
columns_to_keep = [
    "project_title",
    "lead_agency",
    "project_year",
    "project_category",
    "grant_program",
    "phase",
    "project_description",
    "total_project_cost",
    "fully_funded",
    "total_available_funds",
    "city",
    "county",
    "location",
    "geometry",
    "data_source",
    "notes",
    "funding_notes",
    "project_id",
]

In [7]:
def create_notes(df, note_cols: list, new_col_name: str):
    """
    Concat multiple columns into one.
    """
    prefix = "_"
    for column in note_cols:
        df[f"{prefix}{column}"] = df[column].astype(str)
    note_cols = [prefix + sub for sub in note_cols]

    # https://stackoverflow.com/questions/65532480/how-to-combine-column-names-and-values
    def combine_notes(x):
        return ", ".join([col + ": " + x[col] for col in note_cols])

    df[new_col_name] = df.apply(combine_notes, axis=1)
    df[new_col_name] = df[new_col_name].str.replace("_", " ")

    return df

#### Why does erase this project cost detail
* For SRP...

In [8]:
srp = harmonization_utils.load_state_rail_plan()

In [9]:
srp_head = srp.head()

In [10]:
def correct_cost_fund(df):
    """
    Change columns with cost/fund in
    its name to be integer datatype
    """
    cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
    for i in cost_columns:
        try:
            df[i] = (
                df[i]
                .str.replace("$", "")
                .str.replace(",", "")
                .apply(pd.to_numeric, errors="coerce")
                .fillna(0)
            )
        except:
            df[i] = df[i].apply(pd.to_numeric, errors="coerce").fillna(0)
    return df

In [11]:
def correct_project_cost(df, project_title_col: str, project_total_cost: str):
    """
    For some datasets, the same project
    (as determined by the same project name, cost,
    and source) is split across multiple rows.

    Ex: A project costs $500 million and is
    split on 5 rows by phase/location. Each row still lists
    the total  cost as $500 million, which is not accurate.
    This function will recalculate each of the row to list
    $100 mil as the total project cost
    """
    # Create a unique identifier
    df["unique_identifier"] = df[project_title_col] + df[project_total_cost].astype(str)

    # Create count for each project
    df["how_many_times_same_proj_appears"] = (
        df.groupby("unique_identifier").cumcount() + 1
    )

    # Find the total number of times a project title-cost appears.
    # Sort by descending and keep only the row with the highest level
    df2 = (
        df[
            [
                project_title_col,
                "how_many_times_same_proj_appears",
                project_total_cost,
                "unique_identifier",
            ]
        ]
        .sort_values(
            [project_title_col, "how_many_times_same_proj_appears"], ascending=False
        )
        .drop_duplicates(subset=["unique_identifier"])
    )
    # Create new funding estimate
    df2["new_proj_cost"] = (
        df2[project_total_cost] / df2["how_many_times_same_proj_appears"]
    )

    # Drop some columns
    df2 = df2.drop(
        columns=[
            project_title_col,
            project_total_cost,
            "how_many_times_same_proj_appears",
        ]
    )

    # Merge
    m1 = pd.merge(df, df2, how="inner", on="unique_identifier")

    # Clean up
    m1 = m1.drop(
        columns=[
            "unique_identifier",
            "how_many_times_same_proj_appears",
            project_total_cost,
        ]
    )

    # Replace project cost
    m1 = m1.rename(columns={"new_proj_cost": "total_project_cost"})
    return m1

In [12]:
def harmonizing(
    df,
    agency_name_col: str,
    project_name_col: str,
    project_description_col: str,
    project_category_col: str,
    project_cost_col: str,
    location_col: str,
    geography_col: str,
    phase_col: str,
    county_col: str,
    city_col: str,
    project_year_col: str,
    program: str,
    data_source_col: str,
    fund_cols: list,
    cost_in_millions: bool = True,
):
    """
    Take a dataset and change the column names/types to
    default names and formats.

    Add metric if the project is fully funded or not.
    """
    rename_columns = {
        agency_name_col: "lead_agency",
        project_name_col: "project_title",
        project_description_col: "project_description",
        project_category_col: "project_category",
        project_cost_col: "total_project_cost",
        location_col: "location",
        geography_col: "geometry",
        phase_col: "phase",
        county_col: "county",
        city_col: "city",
        project_year_col: "project_year",
        data_source_col: "data_source",
    }
    # Rename columns
    df = df.rename(columns=rename_columns)

    # Clean up monetary columns to be interger
    df = correct_cost_fund(df)

    # Clean up string columns
    string_cols = df.select_dtypes(include=["object"]).columns.to_list()
    for i in string_cols:
        df[i] = df[i].str.replace("_", " ").str.strip().str.title()

    # Clean agency names
    df = harmonization_utils.organization_cleaning(df, "lead_agency")

    # Clean up monetary columns to be interger
    df = correct_cost_fund(df)

    # Divide cost columns by millions
    # If bool is set to True
    if cost_in_millions:
        for i in fund_cols + ["total_project_cost"]:
            df[i] = df[i].divide(1_000_000)

    # Determine if the project completely funded or not?
    # Add up all available funds
    df["total_available_funds"] = df[fund_cols].sum(axis=1)
    df["fully_funded"] = df.apply(harmonization_utils.funding_vs_expenses, axis=1)

    # Add new column with funding breakout
    # Since it's summarized above and the details are suppressed.
    df = create_notes(df, fund_cols, "funding_notes")

    # Add program
    df["grant_program"] = program

    # Create columns even if they don't exist, just to harmonize
    # before concatting.
    create_columns = [
        "county",
        "city",
        "notes",
        "project_year",
        "project_category",
        "location",
        "phase",
    ]
    for column in create_columns:
        if column not in df:
            df[column] = "None"
    if "geometry" not in df:
        df["geometry"] = None
    if "data_source" not in df:
        df["data_source"] = program

    # Create unique project id - first LOST project is LOST-1,
    # second LOST is LOST-2, LOST-3, LOST-4, etc
    df["project_id"] = (
        df.data_source + "-" + df.groupby("data_source").cumcount().astype("str")
    )

    # Only keep certain columns
    df = df[columns_to_keep]

    # Fill in any nulls
    df.project_description = df.project_description.fillna(df.project_title)
    df = df.fillna(df.dtypes.replace({"float64": 0.0, "object": "None"}))

    return df

In [13]:
def harmonize_srp():
    df = harmonization_utils.load_state_rail_plan()
    df = harmonizing(
        df,
        agency_name_col="lead_agency",
        project_name_col="project_name",
        project_description_col="project_description",
        project_category_col="project_category",
        project_cost_col="total_project_cost",
        location_col="corridor",
        geography_col="",
        phase_col="",
        county_col="",
        city_col="",
        project_year_col="",
        program="State Rail Plan",
        data_source_col="",
        fund_cols=[],
        cost_in_millions=True,
    )

    return df

In [14]:
test = harmonize_srp()



In [15]:
def harominze_lost():
    df = harmonization_utils.load_lost()
    df = harmonizing(
        df,
        agency_name_col="agency",
        project_name_col="project_title",
        project_description_col="project_description",
        project_category_col="project_category",
        project_cost_col="cost__in_millions_",
        location_col="location",
        geography_col="",
        phase_col="",
        county_col="county",
        city_col="city",
        project_year_col="",
        program="Local Options Sales Tax",
        data_source_col="measure",
        fund_cols=[
            "estimated_lost_funds",
            "estimated_federal_funds",
            "estimated_state_funds",
            "estimated_local_funds",
            "estimated_other_funds",
        ],
        cost_in_millions=False,
    )

    return df

In [16]:
def harmonize_sb1():
    df = harmonization_utils.load_sb1()
    df = harmonizing(
        df,
        agency_name_col="agencies",
        project_name_col="projecttitle_x",
        project_description_col="projectdescription",
        project_category_col="",
        project_cost_col="totalcost",
        location_col="",
        geography_col="geometry",
        phase_col="projectstatuses",
        county_col="countynames",
        city_col="citynames",
        project_year_col="fiscalyears",
        program="SB1",
        data_source_col="programcodes",
        fund_cols=["sb1funds", "iijafunds"],
        cost_in_millions=True,
    )

    return df

In [17]:
def add_all_projects():

    # Load  dataframes
    state_rail_plan = harmonize_srp()
    lost = harominze_lost()
    sb1 = harmonize_sb1()

    # Concat for df
    all_projects_df = pd.concat([lost, state_rail_plan, sb1])

    return all_projects_df

In [18]:
all_projects = add_all_projects()



In [19]:
all_projects.columns

Index(['project_title', 'lead_agency', 'project_year', 'project_category',
       'grant_program', 'phase', 'project_description', 'total_project_cost',
       'fully_funded', 'total_available_funds', 'city', 'county', 'location',
       'geometry', 'data_source', 'notes', 'funding_notes', 'project_id'],
      dtype='object')

In [20]:
all_projects.grant_program.value_counts()

SB1                        3305
Local Options Sales Tax    1849
State Rail Plan             276
Name: grant_program, dtype: int64

In [21]:
all_projects.total_project_cost.value_counts().head() / len(all_projects) * 100

0.00   39.13
0.33    2.65
0.25    1.20
1.25    0.50
0.33    0.48
Name: total_project_cost, dtype: float64

In [22]:
all_projects.project_description.value_counts().head()

None                       889
Resurface                  191
Repair/Replace Culverts    157
Overlay                    126
Repair/Maintenance         108
Name: project_description, dtype: int64

In [23]:
all_projects.project_title.value_counts().head()

None                       667
Pavement Rehabilitation    235
Safety Improvements        193
Safety - Hm4               151
Drainage - Hm251           146
Name: project_title, dtype: int64

### Metrics
* Correct spelling of descriptions?

* https://github.com/cal-itp/data-analyses/blob/29ed3ad1d107c6be09fecbc1a5f3d8ef5f2b2da6/dla/dla_utils/clean_data.py#L305

In [26]:
def add_metrics(df):
    # There are many projects that are 
    ACTIVE_TRANSPORTATION = ['bike', 'bicycle', 'cyclist', 
                             'pedestrian', 
                             ## including the spelling errors of `pedestrian`
                             'pedestrain',
                             'crosswalk', 
                             'bulb out', 'bulb-out', 
                             'active transp', 'traffic reduction', 
                             'speed reduction', 'ped', 'srts', 
                             'safe routes to school',
                             'sidewalk', 'side walk', 'Cl ', 'trail'
                            ]
    TRANSIT = ['bus', 'metro', 'station', #Station comes up a few times as a charging station and also as a train station
               'transit','fare', 'brt', 'yarts', 'railroad', 'highway-rail'
               # , 'station' in description and 'charging station' not in description
              ] 
    BRIDGE = ["bridge", 'viaduct']
    STREET = ['traffic signal', 'resurface', 'resurfacing', 'slurry', 'seal' 
              'sign', 'stripe', 'striping', 'median', 
              'guard rail', 'guardrail', 
              'road', 'street', 
              'sinkhole', 'intersection', 'signal', 'curb',
              'light', 'tree', 'pavement', 'roundabout'
             ]

    FREEWAY = ['hov ', 'hot ', 'freeway', 'highway', 'express lanes', 'hwy']

    INFRA_RESILIENCY_ER = ['repair', 'emergency', 'replace','retrofit', 'er',
                           'rehab', 'improvements', 'seismic', 'reconstruct', 'restoration']

    CONGESTION_RELIEF = ['congestion', 'rideshare','ridesharing', 'vanpool', 'car share']

    NOT_INC = ['charging', 'fueling', 'cng', 'bridge', 'trail',
           'k-rail', 'guardrails', 'bridge rail', 'guard', 'guarrail']

    def categorize_project_descriptions(row):
        """
        This function takes a individual type of work description (row of a dataframe)
        and returns a dummy flag of 1 if it finds keyword present in
        project categories (active transportation, transit, bridge, etc).
        A description can contain multiple keywords across categories.
        """
        # Make lowercase
        project_description = (row.project_description.lower()
                               .replace("-","")
                               .replace(".","")
                               .replace(":","")
                              )
    
        # Store a bunch of columns that will be flagged
        # A project can involve multiple things...also, not sure what's in the descriptions
        active_transp = ""
        transit = ""
        bridge = ""
        street = ""
        freeway = ""
        infra_resiliency_er = ""
        congestion_relief = ""
        
        if any(word in project_description for word in ACTIVE_TRANSPORTATION):
            active_transp = "active transportation"
        
        #if any(word in description if instanceof(word, str) else word(description) for word in TRANSIT)

        if (any(word in project_description for word in TRANSIT) and 
            not any(exclude_word in project_description for exclude_word in NOT_INC)
           ):
            transit = "transit"
        
        if any(word in project_description for word in BRIDGE):
            bridge = "bridge"
        if any(word in project_description for word in STREET):
            street = "street"
        if any(word in project_description for word in FREEWAY):
            freeway = "freeway" 
        if any(word in project_description for word in INFRA_RESILIENCY_ER):
            infra_resiliency_er = "infrastructure"
        if any(word in project_description for word in CONGESTION_RELIEF):
            congestion_relief = "congestion relief"    
        
        
        return pd.Series(
            [active_transp, transit, bridge, street, freeway, infra_resiliency_er, congestion_relief], 
            index=['active_transp', 'transit', 'bridge', 'street', 
                   'freeway', 'infra_resiliency_er', 'congestion_relief']
        )
    
    
    work_categories = df.apply(categorize_project_descriptions, axis=1)
    work_cols = list(work_categories.columns)
    df2 = pd.concat([df, work_categories], axis=1)
    
    df2['all_categories'] = df2[work_cols].agg(' '.join, axis=1)
    df2['all_categories'] = df2['all_categories'].str.replace("N/A","").str.strip()
    df2 = df2.drop(columns = work_cols)
    
    return df2



In [27]:
test2 = add_metrics(all_projects)

In [28]:
test2.all_categories.value_counts().head(10)

infrastructure                                    1477
                                                  1397
street  infrastructure                             800
street                                             383
bridge   infrastructure                            238
transit    infrastructure                          208
active transportation   street  infrastructure     136
freeway infrastructure                              92
street freeway infrastructure                       83
transit                                             76
Name: all_categories, dtype: int64

In [29]:
def apply_metrics(df):
    def categorize_metrics(row):
        # Store a bunch of columns that will be flagged
        # A project can involve multiple things...also, not sure what's in the descriptions
        # Make lowercase
        all_categories = row.all_categories.lower()
        safety = ""
        passenger_mode_shift = ""
        infill_development = ""
        
        if any(word in all_categories for word in ['infrastructure', 'bridge']):
            safety = "safety"
        if any(word in all_categories for word in ['active transportation']):
            passenger_mode_shift = "passenger_mode_shift"
        if any(word in all_categories for word in ['freeway', 'transit', 'active transportation', 'infrastructure']):
            infill_development = "infill_development" 
       
        return pd.Series(
            [safety,passenger_mode_shift,infill_development], 
            index=['safety', 'passenger_mode_shift', 'infill_development']
        )
    
    work_categories = df.apply(categorize_metrics, axis=1)
    work_cols = list(work_categories.columns)
    df2 = pd.concat([df, work_categories], axis=1)
    
    df2['all_metrics'] = df2[work_cols].agg(' '.join, axis=1)
    df2['all_metrics'] = df2['all_metrics'].str.replace("N/A","").str.strip()
    df2 = df2.drop(columns = work_cols)
    
    return df2

In [30]:
test3 = apply_metrics(test2)

In [31]:
test3.all_metrics.unique()

array(['infill_development', 'safety  infill_development', '',
       'passenger_mode_shift infill_development',
       'safety passenger_mode_shift infill_development', 'safety'],
      dtype=object)

In [32]:
test3[['grant_program','project_description','all_metrics']].sample(50)

Unnamed: 0,grant_program,project_description,all_metrics
2946,SB1,A $33.35 Million Dollar Project In San Bernardino County On Route 215 Will Replace Overcrossing.,safety infill_development
1230,Local Options Sales Tax,Sr-30/210 Widening From I-215 To I-10,
2852,SB1,A $5.84 Million Dollar Project In Los Angeles County On Route 2 Will Rehabilitate Culverts.,safety infill_development
1657,Local Options Sales Tax,Union Pacific Railroad Trail,passenger_mode_shift infill_development
2716,SB1,"A $2.4 Million Dollar Project In Kern County On Route 155 Will Remove Hazardous Trees, Debris, And Replace Fire Damaged Culverts And Signs.",safety infill_development
3107,SB1,"A $10.54 Million Dollar Project In Calaveras County On Route 4 Will Replace Bridge.(Additional $300,000 For Ps&E From Local Contribution.)",safety infill_development
569,Local Options Sales Tax,Resurface,
2736,SB1,A $9.99 Million Dollar Project In Ventura County On Route 150 Will Construct And Upgrade Curb Ramps And Sidewalks To Comply With Americans With Disabilities Act (Ada) Standards.,safety passenger_mode_shift infill_development
2267,SB1,A $15.4 Million Dollar Project In Sacramento County On Route 99 Will Rehabilitate And Improve Access To Five Pump Plants.,safety infill_development
1356,Local Options Sales Tax,"Construction Of New Wheelchair Curb Ramps And Related Roadway Work To\nPermit Ease Of Movement For The Mobility Impaired. Reconstruction Of\nExisting Ramps. Includes Project Development And Capital Costs. Sponsoring\nAgency: Dpw, Muni. The First $23.6M Is Priority 1. The Next $2.4M Is\nPriority 2 And The Remainder Is Priority 3. Total Funds: $66.0M; Prop K:\n$36.0M.",safety infill_development


### Does this project have enough information to be useful?

In [33]:
len(all_projects.columns)

18

In [34]:
string_cols = all_projects.select_dtypes(include=["object"]).columns.to_list()

In [35]:
all_projects2 = all_projects[string_cols]

In [36]:
# https://stackoverflow.com/questions/73839250/count-number-of-occurrences-of-text-over-row-python-pandas
all_projects2['none_count'] = all_projects2.astype(str).sum(axis=1).str.lower().str.count("none")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [37]:
all_projects2['none_count'].value_counts()

3    1397
7    1293
4    1124
5     760
6     581
8     274
9       1
Name: none_count, dtype: int64

In [38]:
all_projects2.drop(columns = ['geometry']).sample(3)

Unnamed: 0,project_title,lead_agency,project_year,project_category,grant_program,phase,project_description,fully_funded,city,county,location,data_source,notes,funding_notes,project_id,none_count
2532,Pavement Rehabilitation,Caltrans,22/23,,SB1,Planned,"A $19.55 Million Dollar Project In San Luis Obispo County On Route 1 Will Upgrade Americans With Disabilities Act (Ada) Curb Ramps, Cold Plane Pavement, Place Rubberized Hot Mix Asphalt (Rhma) Pavement And Widen Shoulders And Construct Bike Lanes As Complete Streets Elements.",Partially funded,"Arroyo Grande, Grover Beach, Pismo Beach, San Luis Obispo",San Luis Obispo,,Shopp,,"sb1funds: 4.9521, iijafunds: 0.0",Shopp-858,3
45,Dublinpleasanton Capacity Improvement And Congestion Reduction Program,,18/19,,SB1,Inprogress,,Partially funded,"Dublin, Pleasanton",Alameda,,Tircp,,"sb1funds: 11.411, iijafunds: 0.0",Tircp-1,5
3238,Roadside Safety Improvements/Freeway Maintenance Access,Caltrans,21/22,,SB1,In Progress,"A $2.63 Million Dollar Project In Orange County On Route 5 Will Enhance Highway Worker Safety By Paving Miscellaneous Areas, Install Access Gates And Relocate Irrigation Facilities Away From The Edge Of The Road.",Partially funded,"Anaheim, Fullerton",Orange,,Shopp,,"sb1funds: 0.0, iijafunds: 1.7386",Shopp-1564,3


In [39]:
stop

NameError: name 'stop' is not defined

### Categorization

In [None]:
def get_list_of_words(df, col: str) -> list:
    """
    Natalie's function to clean and place words in a project description column
    into a list
    """
    # get just the one col
    column = df[[col]]

    # remove single-dimensional entries from the shape of an array
    col_text = column.squeeze()
    # get list of words
    text_list = col_text.tolist()

    # Join all the column into one large text blob, lower text
    text_list = " ".join(text_list).lower()

    # remove punctuation
    text_list = re.sub(r"[^\w\s]", "", text_list)

    # List of stopwords
    swords = [re.sub(r"[^A-z\s]", "", sword) for sword in stopwords.words("english")]

    # Remove stopwords
    clean_text_list = [
        word for word in word_tokenize(text_list.lower()) if word not in swords
    ]

    return clean_text_list

In [None]:
def find_common_phrases(df, description_column: str, values_to_add: list):

    # Break apart every word in the description column into a list
    descriptions_list = get_list_of_words(df, description_column)

    # Get phrases of whatever length you want (2,3,4,etc)
    c = Counter([" ".join(y) for x in [2] for y in ngrams(descriptions_list, x)])

    # Make a dataframe out of the counter values
    df_phrases = pd.DataFrame({"phrases": list(c.keys()), "total": list(c.values())})

    # Take phrases that are repeated more than 40 times and turn it into a list
    df_phrases = ((df_phrases.loc[df_phrases["total"] > 40])).reset_index(drop=True)
    common_phrases_list = df_phrases.phrases.tolist()

    phrases_to_del = [
        "san bernardino",
        "los angeles",
        "contra costa",
        "el dorado",
        "san luis obispo",
        "luis obispo",
        "del norte",
        "san francisco",
        "improve approximately",
    ]

    common_phrases_list = list(set(common_phrases_list) - set(phrases_to_del))

    # CLean up the list to delete county information/etc
    words_to_delete = [
        "county",
        "route",
        "dollar",
        "mile",
        "santa",
        "project",
        "san",
        "lanes",
        "lane",
        "2",
        "4",
        "financial",
        "prop",
        "best",
        "approximately",
    ]

    for word in words_to_delete:
        common_phrases_list = [x for x in common_phrases_list if word not in x]

    # ADD certain keywords here
    # Operating Additional Service
    common_phrases_list.extend(values_to_add)

    return common_phrases_list

In [None]:
def categorize_projects(
    df,
    description_column: str,
    project_id_column: str,
    title_column: str,
    values_to_add: list,
):

    # Find most common 2 word phrases for some automatic project categories
    common_phrases_list = find_common_phrases(df, description_column, values_to_add)

    # Place all the words in common_phrases_list into a blob named query
    # https://stackoverflow.com/questions/64727090/extract-all-matching-keywords-from-a-list-of-words-and-create-a-new-dataframe-pa
    query = "|".join(common_phrases_list)

    # Remove punctation and lower strings in original description column befores searching
    df["clean_description"] = (
        df[description_column]
        .str.lower()
        .str.replace("-", " ", regex=True)
        .str.replace("(", " ", regex=True)
        .str.replace(")", " ", regex=True)
        .str.replace(".", " ", regex=True)
        .str.strip()
    )

    # Search through description column for the most common phrases
    # Input the results in the new column
    df["auto_project_category"] = df["clean_description"].str.findall(
        r"\b({})\b".format(query)
    )

    # Explode to take categories out of a list
    # Drop duplicate project keywords by title
    df = (
        df.explode("auto_project_category")
        .sort_values([project_id_column, title_column])
        .drop_duplicates(
            subset=[
                description_column,
                project_id_column,
                title_column,
                "auto_project_category",
            ]
        )
    )

    # Fill any uncategorized projects as "Other"
    df["auto_project_category"] = (
        df["auto_project_category"].fillna("Other").str.title()
    )

    # Correct spelling
    spell = Speller(lang="en")
    df["auto_project_category"] = df["auto_project_category"].apply(
        lambda x: " ".join([spell(i) for i in x.split()])
    )

    # Summarize - put all the categories onto one line
    df = (
        df.groupby(
            [
                description_column,
                project_id_column,
                title_column,
            ]
        )["auto_project_category"]
        .apply(",".join)
        .reset_index()
    )

    return df

In [None]:
def add_all_projects2():

    # Load  dataframes
    state_rail_plan = harmonize_srp()
    lost = harominze_lost()
    sb1 = harmonize_sb1()

    # Concat for df
    all_projects_df = pd.concat([lost, state_rail_plan, sb1])

    # Categorize
    categories = categorize_projects(
        all_projects_df,
        "project_description",
        "project_title",
        "project_id",
        [
            "operating",
            "service",
            "zero emission vehicle",
            "zev",
            "maintain/repair",
            "repair/replace",
        ],
    )

    # Merge categorized
    all_projects_df = pd.merge(
        all_projects_df.drop(columns=["clean_description"]),
        categories,
        how="left",
        on=["project_description", "project_title", "project_id"],
    )

    # Rename
    all_projects_df = all_projects_df.drop(columns=["auto_project_category_x"]).rename(
        columns={"auto_project_category_y": "auto_tagged_project_categories"}
    )
    # Concat for gdf
    all_projects_gdf = pd.concat([sb1])
    all_projects_gdf = all_projects_gdf.set_geometry("location")

    return all_projects_df, all_projects_gdf

In [None]:
# all_projects, all_projects_geo = add_all_projects()

In [None]:
# all_projects.drop(columns = ['location'])[['project_title','project_category', 'auto_tagged_project_categories','project_description','total_available_funds','funding_notes']].sample(100)

In [None]:
all_projects.info()

In [None]:
all_projects.drop(columns=["location"]).sample(5)

### Look at the data

In [None]:
all_projects.groupby(["lead_agency"]).agg({"project_id": "nunique"}).sort_values(
    "project_id", ascending=False
).head(10)

In [None]:
all_projects[
    (all_projects.county == "Kern")
    & (all_projects.project_description.str.contains("Seal Coat"))
].drop(columns=["location"])

In [None]:
# all_projects.groupby(['project_category','auto_tagged_project_categories']).agg({'project_id':'nunique'})

In [None]:
all_projects.groupby(["auto_tagged_project_categories"]).agg(
    {"project_id": "nunique"}
).sort_values("project_id", ascending=False).head(10)

In [None]:
all_projects.groupby(["project_category"]).agg({"project_id": "nunique"}).sort_values(
    "project_id", ascending=False
).head(10)

In [None]:
all_projects.groupby(["project_description"]).agg(
    {"project_id": "nunique"}
).sort_values("project_id", ascending=False).head(10)

In [None]:
all_projects.groupby(["county"]).agg({"project_id": "nunique"}).sort_values(
    "project_id", ascending=False
).head(10)

In [None]:
all_projects.lead_agency.nunique()

In [None]:
all_projects.total_project_cost.describe()

In [None]:
all_projects.loc[all_projects.fully_funded == "Fully funded"].groupby(
    ["data_source"]
).agg({"project_id": "nunique"})

In [None]:
all_projects.loc[all_projects.fully_funded == "Partially funded"].groupby(
    ["data_source"]
).agg({"project_id": "nunique"})

In [None]:
all_projects.groupby(["data_source"]).agg({"project_id": "nunique"})

In [None]:
all_projects.groupby(["fully_funded"]).agg(
    {"project_id": "nunique"}
).reset_index().sort_values("project_id", ascending=False)

In [None]:
all_projects.groupby(["data_source", "fully_funded"]).agg({"project_id": "nunique"})