## General function to clean up data from various grants
TO DO
* Switch City of Berkeley to Berkeley City. https://github.com/cal-itp/data-analyses/blob/main/Agreement_Overlap/add_dla.ipynb
* Deduplicate projects.
* Rearrange counties in County column in alphabetical order.
* What to do with "None" in project description? Replace it with title?

In [None]:
import _cleaning_utils
import _harmonization_utils as harmonization_utils
import _state_rail_plan_utils as srp_utils
import geopandas as gpd
import pandas as pd
import shapely
from calitp_data_analysis.sql import to_snakecase

In [None]:
"""
import re
import nltk
from nltk import ngrams
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
import re
from collections import Counter
from autocorrect import Speller
"""

In [None]:
# Notebook display preferences: show up to 100 columns, two-decimal floats,
# and do not truncate rows or cell contents.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [None]:
# lost = harmonization_utils.load_lost()

In [None]:
# Columns (and their order) that every harmonized dataset is reduced to
# before the datasets are concatenated.
columns_to_keep = [
    # What the project is
    "project_title", "lead_agency", "project_year", "project_category",
    "grant_program", "phase", "project_description",
    # Money
    "total_project_cost", "total_available_funds",
    # Where it is
    "city", "county", "location", "geometry",
    # Provenance and free-text notes
    "data_source", "notes", "funding_notes",
]

In [None]:
def create_notes(df, note_cols: list, new_col_name: str):
    """
    Collapse several columns into one "name: value, name: value" text column.

    For every column in `note_cols` a string copy named "_<column>" is added
    to `df` (and left there). The copies are joined per row into
    `new_col_name`, after which every underscore in the joined text is
    replaced by a space. `df` is modified in place and also returned.
    """
    marker = "_"
    helper_cols = []
    for name in note_cols:
        helper = f"{marker}{name}"
        df[helper] = df[name].astype(str)
        helper_cols.append(helper)

    # https://stackoverflow.com/questions/65532480/how-to-combine-column-names-and-values
    def _label_values(row):
        pieces = [f"{helper}: {row[helper]}" for helper in helper_cols]
        return ", ".join(pieces)

    df[new_col_name] = df.apply(_label_values, axis=1)
    # Underscores (including the helper prefix) become spaces in the final text.
    df[new_col_name] = df[new_col_name].str.replace("_", " ")

    return df

#### Why does this erase the project cost detail?
* For SRP...

In [None]:
# Load the SB1 data with the project helper into a notebook-level variable.
sb1 = harmonization_utils.load_sb1()

In [None]:
# srp = harmonization_utils.load_state_rail_plan()

In [None]:
def correct_cost_fund(df, monetary_cols: list):
    """
    Convert the given cost/fund columns to numeric values.

    Text values are lower-cased and stripped of "$", "," and "tbd" before
    parsing. Anything that still cannot be parsed (blank, missing, other
    text) becomes 0. Columns that are already numeric only get their
    missing values filled with 0.

    Args:
        df: dataframe holding the monetary columns; modified in place.
        monetary_cols: names of the columns to clean.

    Returns:
        The same dataframe with the listed columns numeric.
    """
    for column in monetary_cols:
        values = df[column]
        if not pd.api.types.is_numeric_dtype(values):
            # Strip currency formatting first and parse afterwards, so one
            # unparseable entry (e.g. "TBD") cannot wipe out the whole column.
            values = (
                values.astype(str)
                .str.lower()
                .str.replace("$", "", regex=False)
                .str.replace(",", "", regex=False)
                .str.replace("tbd", "", regex=False)
                .str.strip()
            )
        df[column] = pd.to_numeric(values, errors="coerce").fillna(0)
    return df

In [None]:
def harmonizing2(
    df,
    agency_name_col: str,
    project_name_col: str,
    project_description_col: str,
    project_category_col: str,
    project_cost_col: str,
    location_col: str,
    geography_col: str,
    phase_col: str,
    county_col: str,
    city_col: str,
    project_year_col: str,
    data_source_col: str,
    fund_cols: list,
    program: str,
    cost_in_millions: bool = True,
):
    """
    Rename a dataset's columns to the shared names and clean its money columns.

    Each `*_col` argument names the source column that maps to the matching
    standard column; pass "" when the dataset has no such column.
    `fund_cols` and `program` are accepted but not used by this function.

    Args:
        df: source dataframe (not modified; a renamed copy is returned).
        cost_in_millions: when True, every cost/fund column is divided by
            1,000,000 after cleaning.

    Returns:
        The renamed dataframe with numeric cost/fund columns.
    """
    rename_columns = {
        agency_name_col: "lead_agency",
        project_name_col: "project_title",
        project_description_col: "project_description",
        project_category_col: "project_category",
        project_cost_col: "total_project_cost",
        location_col: "location",
        geography_col: "geometry",
        phase_col: "phase",
        county_col: "county",
        city_col: "city",
        project_year_col: "project_year",
        data_source_col: "data_source",
    }
    # "" means "no such column in this dataset": drop those placeholders so
    # they never act as a rename key.
    rename_columns = {old: new for old, new in rename_columns.items() if old}
    df = df.rename(columns=rename_columns)

    # Any column with "cost" or "funds" in its name is treated as monetary.
    # No capture group in the pattern, so pandas does not warn about match groups.
    cost_columns = df.columns[df.columns.str.contains("cost|funds", na=False)].tolist()
    df = correct_cost_fund(df, cost_columns)

    # Divide the monetary columns by one million when requested.
    if cost_in_millions:
        for column in cost_columns:
            df[column] = df[column].divide(1_000_000)
    return df

In [None]:
# The cell that assigns `srp` is commented out, so load the State Rail Plan
# data directly instead of relying on leftover kernel state.
srp2 = harmonizing2(
    harmonization_utils.load_state_rail_plan(),
    agency_name_col="lead_agency",
    project_name_col="project_name",
    project_description_col="project_description",
    project_category_col="project_category",
    project_cost_col="total_project_cost",
    location_col="corridor",
    geography_col="",
    phase_col="",
    county_col="",
    city_col="",
    project_year_col="",
    data_source_col="",
    fund_cols=[],
    program="State Rail Plan",
    cost_in_millions=True,
)

In [None]:
def harmonizing(
    df,
    agency_name_col: str,
    project_name_col: str,
    project_description_col: str,
    project_category_col: str,
    project_cost_col: str,
    location_col: str,
    geography_col: str,
    phase_col: str,
    county_col: str,
    city_col: str,
    project_year_col: str,
    program: str,
    data_source_col: str,
    fund_cols: list,
    cost_in_millions: bool = True,
):
    """
    Take a dataset and change the column names/types to
    default names and formats.

    Each `*_col` argument names the source column that maps to the matching
    standard column; pass "" when the dataset has no such column.

    Args:
        df: source dataframe.
        program: value written to `grant_program` (and to `data_source`
            when the dataset has no data source column).
        fund_cols: funding columns that are summed into
            `total_available_funds` and spelled out in `funding_notes`.
        cost_in_millions: when True, `fund_cols` and `total_project_cost`
            are divided by 1,000,000.

    Returns:
        A dataframe with the columns in `columns_to_keep` plus
        `project_description2` (the description, falling back to the title
        where the description is missing).
    """
    rename_columns = {
        agency_name_col: "lead_agency",
        project_name_col: "project_title",
        project_description_col: "project_description",
        project_category_col: "project_category",
        project_cost_col: "total_project_cost",
        location_col: "location",
        geography_col: "geometry",
        phase_col: "phase",
        county_col: "county",
        city_col: "city",
        project_year_col: "project_year",
        data_source_col: "data_source",
    }
    # "" means "no such column in this dataset": drop those placeholders so
    # they never act as a rename key.
    rename_columns = {old: new for old, new in rename_columns.items() if old}
    df = df.rename(columns=rename_columns)

    # Clean up monetary columns (any column with "cost" or "funds" in its
    # name). No capture group, so pandas does not warn about match groups.
    cost_columns = df.columns[df.columns.str.contains("cost|funds", na=False)].tolist()
    df = correct_cost_fund(df, cost_columns)

    # Clean up string columns
    string_cols = df.select_dtypes(include=["object"]).columns.to_list()
    for column in string_cols:
        df[column] = df[column].str.replace("_", " ").str.strip().str.title()

    # Divide cost columns by millions if the flag is set
    if cost_in_millions:
        for column in fund_cols + ["total_project_cost"]:
            df[column] = df[column].divide(1_000_000)

    # Add a total of the funding columns, and keep the per-source breakout
    # as text since the individual columns are dropped below.
    df["total_available_funds"] = df[fund_cols].sum(axis=1)
    df = create_notes(df, fund_cols, "funding_notes")

    # Add program
    df["grant_program"] = program

    # Create columns even if they don't exist, just to harmonize
    # before concatting.
    create_columns = [
        "county",
        "city",
        "notes",
        "project_year",
        "project_category",
        "location",
        "phase",
    ]
    for column in create_columns:
        if column not in df:
            df[column] = "None"
    if "geometry" not in df:
        df["geometry"] = None
    if "data_source" not in df:
        df["data_source"] = program

    # Only keep certain columns. Copy so the assignment below writes to an
    # independent frame rather than to a slice of the wider one.
    df = df[columns_to_keep].copy()

    # Fall back to the title where the description is missing
    df["project_description2"] = df.project_description.fillna(df.project_title)

    # Fill remaining nulls: 0.0 for float64 columns, "None" for object
    # columns. Other dtypes are left untouched.
    fill_values = {}
    for column, dtype in df.dtypes.items():
        if dtype == "float64":
            fill_values[column] = 0.0
        elif dtype == "object":
            fill_values[column] = "None"
    df = df.fillna(fill_values)

    return df

In [None]:
def harmonize_srp():
    """
    Load the State Rail Plan data and pass it through `harmonizing`.

    Returns whatever `harmonizing` returns for this dataset.
    """
    raw_srp = harmonization_utils.load_state_rail_plan()

    # Empty strings mark fields for which no source column is named.
    column_settings = {
        "agency_name_col": "lead_agency",
        "project_name_col": "project_name",
        "project_description_col": "project_description",
        "project_category_col": "project_category",
        "project_cost_col": "total_project_cost",
        "location_col": "corridor",
        "geography_col": "",
        "phase_col": "",
        "county_col": "",
        "city_col": "",
        "project_year_col": "",
        "program": "State Rail Plan",
        "data_source_col": "",
        "fund_cols": [],
        "cost_in_millions": True,
    }
    return harmonizing(raw_srp, **column_settings)

In [None]:
# Run the State Rail Plan harmonization once and keep the result in `test`.
test = harmonize_srp()

In [None]:
def harominze_lost():
    """
    Load the Local Options Sales Tax (LOST) data and pass it through
    `harmonizing`.

    The function name keeps its existing spelling because other cells call
    it by this name.
    """
    raw_lost = harmonization_utils.load_lost()

    # Funding columns that get summed and written into the funding notes.
    lost_fund_cols = [
        "estimated_lost_funds",
        "estimated_federal_funds",
        "estimated_state_funds",
        "estimated_local_funds",
        "estimated_other_funds",
    ]

    # Empty strings mark fields for which no source column is named.
    column_settings = {
        "agency_name_col": "agency",
        "project_name_col": "project_title",
        "project_description_col": "project_description",
        "project_category_col": "project_category",
        "project_cost_col": "cost__in_millions_",
        "location_col": "location",
        "geography_col": "",
        "phase_col": "",
        "county_col": "county",
        "city_col": "city",
        "project_year_col": "",
        "program": "Local Options Sales Tax",
        "data_source_col": "measure",
        "fund_cols": lost_fund_cols,
        "cost_in_millions": False,
    }
    return harmonizing(raw_lost, **column_settings)

In [None]:
def harmonize_sb1():
    """
    Load the SB1 data and pass it through `harmonizing`.

    Returns whatever `harmonizing` returns for this dataset.
    """
    raw_sb1 = harmonization_utils.load_sb1()

    # Empty strings mark fields for which no source column is named.
    column_settings = {
        "agency_name_col": "agencies",
        "project_name_col": "projecttitle_x",
        "project_description_col": "projectdescription",
        "project_category_col": "",
        "project_cost_col": "totalcost",
        "location_col": "",
        "geography_col": "geometry",
        "phase_col": "projectstatuses",
        "county_col": "countynames",
        "city_col": "citynames",
        "project_year_col": "fiscalyears",
        "program": "SB1",
        "data_source_col": "programcodes",
        "fund_cols": ["sb1funds", "iijafunds"],
        "cost_in_millions": True,
    }
    return harmonizing(raw_sb1, **column_settings)

In [None]:
def add_all_projects():
    """
    Build one table out of the three harmonized datasets.

    The LOST, State Rail Plan and SB1 frames are stacked (in that order,
    keeping each frame's own index), agency names are cleaned with the
    project helper, and a `fully_funded` column is added by applying
    `harmonization_utils.funding_vs_expenses` to every row.
    """
    # Harmonize each source
    srp_projects = harmonize_srp()
    lost_projects = harominze_lost()
    sb1_projects = harmonize_sb1()

    # Stack them into one dataframe
    combined = pd.concat([lost_projects, srp_projects, sb1_projects])

    # Clean agency names
    combined = harmonization_utils.organization_cleaning(combined, "lead_agency")

    # Flag each project's funding status row by row
    combined["fully_funded"] = combined.apply(
        harmonization_utils.funding_vs_expenses, axis=1
    )
    return combined

In [None]:
# Build the combined projects table used by the cells below.
all_projects = add_all_projects()

In [None]:
# List the columns of the combined table.
all_projects.columns

In [None]:
# Number of rows per grant program.
all_projects.grant_program.value_counts()

In [None]:
# Share (as a percent of all rows) of the five most frequent
# total_project_cost values.
all_projects.total_project_cost.value_counts().head() / len(all_projects) * 100

In [None]:
# Five most frequent values of project_description2.
all_projects.project_description2.value_counts().head()

In [None]:
# Five most frequent project titles.
all_projects.project_title.value_counts().head()

### Does this project have enough information to be useful?

In [None]:
def categorize_info(df):
    """
    Flag whether each project carries enough information to be useful.

    A row gets "Yes" in the new `enough_info` column when its
    `project_description_count` is at or above the median of that column
    and its `total_percent_null_values` is at or below the median of that
    column; otherwise it gets "No".

    Args:
        df: dataframe with `project_description_count` and
            `total_percent_null_values` columns; modified in place.

    Returns:
        The same dataframe with the `enough_info` column added.
    """
    # Median (50th percentile) of each measure is the cutoff.
    p50_project_desc = float(df.project_description_count.quantile(0.50))
    p50_null_values = float(df.total_percent_null_values.quantile(0.50))

    # Comparisons against a missing value are False, so such rows get "No".
    has_enough = (df.project_description_count >= p50_project_desc) & (
        df.total_percent_null_values <= p50_null_values
    )
    df["enough_info"] = has_enough.map({True: "Yes", False: "No"})

    return df

In [None]:
def enough_info(df):
    """
    Score how complete each project row is and flag the usable ones.

    Counts, per row, how often "none" appears in the text columns and how
    many of the original columns equal 0, expresses the total as a percent
    of the number of original columns, counts the words in
    `project_description`, and passes the result to `categorize_info`.
    The intermediate columns are folded into a text column `counts` and
    then dropped.

    Args:
        df: dataframe to score; helper columns are added to it in place.

    Returns:
        A dataframe with the `enough_info` and `counts` columns added.
    """
    # Work from the columns the frame arrived with, so the helper columns
    # added below are not themselves counted as data.
    data_cols = list(df.columns)
    string_cols = df.select_dtypes(include=["object"]).columns.to_list()

    # https://stackoverflow.com/questions/73839250/count-number-of-occurrences-of-text-over-row-python-pandas
    # Count "none" occurrences across the text columns of each row
    df["none_counts"] = (
        df[string_cols].astype(str).sum(axis=1).str.lower().str.count("none")
    )

    # Count zeroes
    df["zero_counts"] = (df[data_cols] == 0).astype(int).sum(axis=1)

    # Total up all none/zeroes as a percent of the number of data columns
    df["total_percent_null_values"] = (
        df[["none_counts", "zero_counts"]].sum(axis=1) / len(data_cols) * 100
    )

    # Count the words in the project description
    df["project_description_count"] = df["project_description"].str.count(r"\w+")

    # Categorize whether it has enough info or not
    df = categorize_info(df)

    # Compress the helper columns into one text column to retain some info
    df["counts"] = (
        "number of strings in project desc: "
        + df.project_description_count.astype(str)
        + " % of null values:"
        + df.total_percent_null_values.astype(int).astype(str)
    )

    df = df.drop(
        columns=[
            "none_counts",
            "zero_counts",
            "project_description_count",
            "total_percent_null_values",
        ]
    )
    return df

In [None]:
# Add the enough_info flag and the counts summary; all_projects is replaced
# by the result.
all_projects = enough_info(all_projects)

In [None]:
# Tally of the enough_info values.
all_projects.enough_info.value_counts()

In [None]:
# Five random rows, shown without the geometry column.
all_projects.drop(columns = ['geometry']).sample(5)

### Metrics
* Rewrite to be shorter?
* Correct spelling of descriptions?
* https://github.com/cal-itp/data-analyses/blob/29ed3ad1d107c6be09fecbc1a5f3d8ef5f2b2da6/dla/dla_utils/clean_data.py#L305

In [None]:
# Keyword lists used to tag project descriptions. Matching elsewhere in this
# notebook is by lower-cased substring, so entries are lower case.
PASSENGER_MODE = ['non sov', 'high quality transit areas',
                  'hqta', 'hov']

# 'collisoins' is kept to catch that misspelling in source text; the correct
# spelling 'collisions' is listed as well so it is matched too.
SAFETY = ['fatalities', 'safe', 'speed management', 'signal coordination',
          'slow speeds', 'roundabouts', 'victims', 'collisoins', 'collisions',
          'protect', 'crash', 'modification factors', 'safety system']

CONGESTION_RELIEF = ['congestion', 'rideshare', 'ridesharing', 'vanpool', 'car share']

In [None]:
# The three keyword lists, in the same order as the labels in `my_values`.
my_list = [PASSENGER_MODE,SAFETY,CONGESTION_RELIEF]

In [None]:
# Labels for the keyword lists in `my_list`, in the same order.
my_values = ['passenger mode shift', 'safety', 'congestion relief']

In [None]:
# NOTE(review): `my_columns` is not assigned in any visible cell, so this
# raises NameError on a fresh kernel — possibly `my_values` was meant; confirm.
pd.Series(my_columns)

In [None]:
def categorize(df, keywords: list, columns: list):
    """
    Flag which keyword groups appear in each project's description.

    The text in `project_description2` is lower-cased and stripped of "-",
    "." and ":" before matching. A keyword matches when it occurs anywhere
    in that text (substring match).

    Args:
        df: dataframe with a `project_description2` column.
        keywords: list of keyword lists, one list per category.
        columns: category labels, parallel to `keywords`.

    Returns:
        A dataframe indexed like `df` with one column per label. A cell
        holds the label when any keyword of that category matched, and ""
        otherwise.

    Raises:
        ValueError: if `keywords` and `columns` differ in length.
    """
    if len(keywords) != len(columns):
        raise ValueError("keywords and columns must have the same length")

    # Clean project information
    descriptions = (
        df["project_description2"]
        .astype(str)
        .str.lower()
        .str.replace("-", "", regex=False)
        .str.replace(".", "", regex=False)
        .str.replace(":", "", regex=False)
    )

    flags = {}
    for label, words in zip(columns, keywords):
        flags[label] = [
            label if any(word in text for word in words) else ""
            for text in descriptions
        ]

    return pd.DataFrame(flags, index=df.index, columns=list(columns))
    

In [None]:
# Scratch dictionary; `d` is not referenced by any other visible cell.
d = {'a': 1, 'b': 2, 'c': 3}

In [None]:
def add_categories(df):
    """
    Tag each project with the work categories found in its description.

    The text in `project_description2` is lower-cased and stripped of "-",
    "." and ":"; the keywords are normalised the same way so that entries
    such as 'bulb-out' or 'Cl ' can actually match. Matching is by
    substring, so short keywords (e.g. 'er', 'ped', 'sign') also match
    inside longer words. A description can match several categories.
    "transit" is suppressed when any NOT_INC keyword is present.

    Args:
        df: dataframe with a `project_description2` column (not modified).

    Returns:
        A copy of `df` with an `all_categories` column: the matched
        category labels joined by spaces in a fixed category order
        (unmatched categories contribute an empty slot, so labels can be
        separated by more than one space), with outer whitespace stripped.
    """
    ACTIVE_TRANSPORTATION = [
        "bike", "bicycle", "cyclist",
        "pedestrian",
        # including the spelling errors of `pedestrian`
        "pedestrain",
        "crosswalk",
        "bulb out", "bulb-out",
        "active transp", "traffic reduction",
        "speed reduction", "ped", "srts",
        "safe routes to school",
        "sidewalk", "side walk", "Cl ", "trail",
        "atp",
    ]
    # "station" comes up both as a charging station and as a train station;
    # the NOT_INC list below is what filters the former out of transit.
    TRANSIT = [
        "bus", "metro", "station",
        "transit", "fare", "brt", "yarts", "railroad", "highway-rail",
    ]
    BRIDGE = ["bridge", "viaduct"]
    # 'seal' and 'sign' are separate keywords (a missing comma would fuse
    # them into 'sealsign').
    STREET = [
        "traffic signal", "resurface", "resurfacing", "slurry", "seal",
        "sign", "stripe", "striping", "median",
        "guard rail", "guardrail",
        "road", "street",
        "sinkhole", "intersection", "signal", "curb",
        "light", "tree", "pavement", "roundabout",
    ]
    FREEWAY = ["hov ", "hot ", "freeway", "highway", "express lanes", "hwy"]
    INFRA_RESILIENCY_ER = [
        "repair", "emergency", "replace", "retrofit", "er",
        "rehab", "improvements", "seismic", "reconstruct", "restoration",
    ]
    CONGESTION_RELIEF = ["congestion", "rideshare", "ridesharing", "vanpool", "car share"]
    # Words that cancel a transit match
    NOT_INC = [
        "charging", "fueling", "cng", "bridge", "trail",
        "k-rail", "guardrails", "bridge rail", "guard", "guarrail",
    ]
    PASSENGER_MODE = ["non sov", "high quality transit areas", "hqta", "hov"]
    # 'collisoins' catches that misspelling; 'collisions' is the correct form.
    SAFETY = [
        "fatalities", "safe", "speed management", "signal coordination",
        "slow speeds", "roundabouts", "victims", "collisoins", "collisions",
        "protect", "crash", "modification factors", "safety system",
    ]

    def _normalize(text) -> str:
        """Lower-case and drop the punctuation that is ignored when matching."""
        return str(text).lower().replace("-", "").replace(".", "").replace(":", "")

    # (flag name, label written to the output, keywords) in output order
    category_rules = [
        ("active_transp", "active transportation", ACTIVE_TRANSPORTATION),
        ("transit", "transit", TRANSIT),
        ("bridge", "bridge", BRIDGE),
        ("street", "street", STREET),
        ("freeway", "freeway", FREEWAY),
        ("infra_resiliency_er", "infrastructure", INFRA_RESILIENCY_ER),
        ("congestion_relief", "congestion relief", CONGESTION_RELIEF),
        ("passenger_mode_shift", "passenger_mode_shift", PASSENGER_MODE),
        ("safety", "safety", SAFETY),
    ]
    normalized_rules = [
        (name, label, [_normalize(word) for word in words])
        for name, label, words in category_rules
    ]
    transit_exclusions = [_normalize(word) for word in NOT_INC]

    def categorize_project_description(description) -> list:
        """Return one label (or "") per category for a single description."""
        text = _normalize(description)
        labels = []
        for name, label, words in normalized_rules:
            matched = any(word in text for word in words)
            if name == "transit" and matched:
                matched = not any(word in text for word in transit_exclusions)
            labels.append(label if matched else "")
        return labels

    all_categories = [
        " ".join(categorize_project_description(description)).strip()
        for description in df["project_description2"]
    ]

    df2 = df.copy()
    df2["all_categories"] = all_categories

    return df2



In [None]:
# `add_categories` is the function that adds the `all_categories` column the
# following cells rely on (no `add_metrics` is defined in this notebook).
all_projects_metric = add_categories(all_projects)

In [None]:
# 30 most frequent category combinations.
all_projects_metric.all_categories.value_counts().head(30)

In [None]:
def apply_metrics(df):
    """
    Derive metric labels from each row's `all_categories` text.

    A metric is assigned when any of its trigger strings occurs (as a
    substring) in the lower-cased `all_categories` value. The assigned
    metric names are joined by spaces into a new `all_metrics` column;
    unassigned metrics leave an empty slot, and outer whitespace is
    stripped. Returns a new dataframe; `df` itself is not modified.
    """
    # metric name -> category strings that trigger it (order = output order)
    metric_triggers = {
        "safety": ["infrastructure", "bridge", "safety", "street"],
        "passenger_mode_shift": ["active transportation", "passenger_mode_shift"],
        "infill_development": ["transit", "active transportation", "infrastructure"],
    }

    def categorize_metrics(row):
        categories_text = row.all_categories.lower()
        flags = {}
        for metric, triggers in metric_triggers.items():
            hit = any(trigger in categories_text for trigger in triggers)
            flags[metric] = metric if hit else ""
        return pd.Series(flags)

    metric_flags = df.apply(categorize_metrics, axis=1)
    metric_cols = metric_flags.columns.tolist()
    combined = pd.concat([df, metric_flags], axis=1)

    # Join the per-metric flags into one text column, then drop the flags.
    joined = combined[metric_cols].agg(" ".join, axis=1)
    combined["all_metrics"] = joined.str.replace("N/A", "").str.strip()

    return combined.drop(columns=metric_cols)

In [None]:
# Add the all_metrics column derived from all_categories.
all_projects_metric = apply_metrics(all_projects_metric)

In [None]:
# Distinct metric combinations that were assigned.
all_projects_metric.all_metrics.unique()

In [None]:
# 50 random rows showing program, description and assigned metrics side by side.
all_projects_metric[['grant_program','project_description','all_metrics']].sample(50)

### Categorization

In [None]:
def get_list_of_words(df, col: str) -> list:
    """
    Natalie's function to clean and place words in a project description column
    into a list.

    All non-missing values of `col` are joined into one lower-cased text,
    punctuation is removed, the text is tokenized, and English stop words
    are dropped.

    NOTE(review): `stopwords` and `word_tokenize` come from nltk; the cell
    that imports them is currently quoted out, so it has to be enabled
    before this function can run.

    Args:
        df: dataframe holding the text column.
        col: name of the text column.

    Returns:
        List of the remaining words, in their original order.
    """
    # Standard library; imported here because the notebook's import of `re`
    # sits in the quoted-out import cell.
    import re

    # Select the column directly so a one-row frame still yields a list
    # (squeezing a 1x1 frame would give a bare string).
    descriptions = df[col].dropna().astype(str).tolist()

    # Join all the column into one large text blob, lower text
    text_blob = " ".join(descriptions).lower()

    # remove punctuation
    text_blob = re.sub(r"[^\w\s]", "", text_blob)

    # Stop words, with punctuation stripped the same way (e.g. "don't" ->
    # "dont"); a set keeps the membership test cheap.
    swords = {
        re.sub(r"[^A-Za-z\s]", "", sword) for sword in stopwords.words("english")
    }

    # Remove stopwords
    return [word for word in word_tokenize(text_blob) if word not in swords]

In [None]:
def find_common_phrases(df, description_column: str, values_to_add: list):
    """
    Find the two-word phrases that occur most often in the descriptions.

    Phrases are built from consecutive words returned by
    `get_list_of_words`. A phrase is kept when it occurs more than 40
    times, is not one of the listed place names/filler phrases, and does
    not contain any of the listed words to delete (substring match).
    `values_to_add` is appended at the end.

    Args:
        df: dataframe holding the description column.
        description_column: name of the description column.
        values_to_add: extra keywords to always include.

    Returns:
        List of phrases in first-seen order, followed by `values_to_add`.
    """
    # Standard library; imported here because the notebook's import of
    # `Counter` sits in the quoted-out import cell.
    from collections import Counter

    # Break apart every word in the description column into a list
    descriptions_list = get_list_of_words(df, description_column)

    # Count every pair of consecutive words
    phrase_counts = Counter(
        " ".join(pair) for pair in zip(descriptions_list, descriptions_list[1:])
    )

    phrases_to_del = {
        "san bernardino",
        "los angeles",
        "contra costa",
        "el dorado",
        "san luis obispo",
        "luis obispo",
        "del norte",
        "san francisco",
        "improve approximately",
    }

    # Words that disqualify a phrase (county information etc.)
    words_to_delete = [
        "county",
        "route",
        "dollar",
        "mile",
        "santa",
        "project",
        "san",
        "lanes",
        "lane",
        "2",
        "4",
        "financial",
        "prop",
        "best",
        "approximately",
    ]

    # Filter in one ordered pass so the result does not depend on set
    # iteration order (the list later feeds a regex alternation).
    common_phrases_list = [
        phrase
        for phrase, total in phrase_counts.items()
        if total > 40
        and phrase not in phrases_to_del
        and not any(word in phrase for word in words_to_delete)
    ]

    # ADD certain keywords here
    # Operating Additional Service
    common_phrases_list.extend(values_to_add)

    return common_phrases_list

In [None]:
def categorize_projects(
    df,
    description_column: str,
    project_id_column: str,
    title_column: str,
    values_to_add: list,
):
    """
    Tag each project with the common phrases found in its description.

    Side effect: adds `clean_description` and `auto_project_category`
    columns to the caller's `df`.

    NOTE(review): `Speller` comes from the autocorrect package; the cell
    that imports it is currently quoted out, so it has to be enabled
    before this function can run.

    Args:
        df: dataframe of projects.
        description_column: column searched for phrases.
        project_id_column: column used, with the two below, to identify a project.
        title_column: see above.
        values_to_add: extra keywords passed on to `find_common_phrases`.

    Returns:
        A dataframe with one row per (description, id, title) combination
        and an `auto_project_category` column holding the matched phrases
        (title-cased, spell-corrected, comma-joined), or "Other" when
        nothing matched.
    """
    # Standard library; imported here because the notebook's import of `re`
    # sits in the quoted-out import cell.
    import re

    # Find most common 2 word phrases for some automatic project categories
    common_phrases_list = find_common_phrases(df, description_column, values_to_add)

    # Build one alternation out of the phrases; escape them so they are
    # matched literally.
    # https://stackoverflow.com/questions/64727090/extract-all-matching-keywords-from-a-list-of-words-and-create-a-new-dataframe-pa
    query = "|".join(re.escape(phrase) for phrase in common_phrases_list)

    # Lower-case and replace punctuation with spaces before searching.
    # regex=False: "(", ")" and "." are meant literally here.
    df["clean_description"] = (
        df[description_column]
        .str.lower()
        .str.replace("-", " ", regex=False)
        .str.replace("(", " ", regex=False)
        .str.replace(")", " ", regex=False)
        .str.replace(".", " ", regex=False)
        .str.strip()
    )

    # Search through description column for the most common phrases
    # Input the results in the new column
    if query:
        df["auto_project_category"] = df["clean_description"].str.findall(
            r"\b({})\b".format(query)
        )
    else:
        # No phrases to look for: every project ends up as "Other" below.
        df["auto_project_category"] = [[] for _ in range(len(df))]

    # Explode to take categories out of a list
    # Drop duplicate project keywords by title
    df = (
        df.explode("auto_project_category")
        .sort_values([project_id_column, title_column])
        .drop_duplicates(
            subset=[
                description_column,
                project_id_column,
                title_column,
                "auto_project_category",
            ]
        )
    )

    # Fill any uncategorized projects as "Other"
    df["auto_project_category"] = (
        df["auto_project_category"].fillna("Other").str.title()
    )

    # Correct spelling
    spell = Speller(lang="en")
    df["auto_project_category"] = df["auto_project_category"].apply(
        lambda x: " ".join([spell(i) for i in x.split()])
    )

    # Summarize - put all the categories onto one line
    df = (
        df.groupby(
            [
                description_column,
                project_id_column,
                title_column,
            ]
        )["auto_project_category"]
        .apply(",".join)
        .reset_index()
    )

    return df

In [None]:
def add_all_projects2():
    """
    Combine the harmonized datasets and attach auto-tagged categories.

    Returns a tuple: the combined dataframe with an
    `auto_tagged_project_categories` column, and a geo dataframe built
    from the SB1 data only.

    NOTE(review): this relies on a `project_id` column, which is not in
    `columns_to_keep` — confirm the harmonized frames still carry it.
    """

    # Load and harmonize each source
    state_rail_plan = harmonize_srp()
    lost = harominze_lost()
    sb1 = harmonize_sb1()

    # Concat for df
    all_projects_df = pd.concat([lost, state_rail_plan, sb1])

    # Categorize
    # NOTE(review): positionally, "project_title" lands in the
    # project_id_column parameter and "project_id" in title_column. Both are
    # only used together as keys, but confirm the order is intended.
    categories = categorize_projects(
        all_projects_df,
        "project_description",
        "project_title",
        "project_id",
        [
            "operating",
            "service",
            "zero emission vehicle",
            "zev",
            "maintain/repair",
            "repair/replace",
        ],
    )

    # Merge categorized
    # categorize_projects added clean_description (and auto_project_category)
    # to all_projects_df as a side effect; the former is dropped here.
    all_projects_df = pd.merge(
        all_projects_df.drop(columns=["clean_description"]),
        categories,
        how="left",
        on=["project_description", "project_title", "project_id"],
    )

    # Keep only the summarized categories from the merge, under a clearer name
    all_projects_df = all_projects_df.drop(columns=["auto_project_category_x"]).rename(
        columns={"auto_project_category_y": "auto_tagged_project_categories"}
    )
    # Concat for gdf
    # NOTE(review): the geometry is set from "location", while harmonizing
    # maps the geography column to "geometry" — confirm which is intended.
    all_projects_gdf = pd.concat([sb1])
    all_projects_gdf = all_projects_gdf.set_geometry("location")

    return all_projects_df, all_projects_gdf

In [None]:
# all_projects, all_projects_geo = add_all_projects()

In [None]:
# all_projects.drop(columns = ['location'])[['project_title','project_category', 'auto_tagged_project_categories','project_description','total_available_funds','funding_notes']].sample(100)

In [None]:
# Column dtypes and non-null counts of the combined table.
all_projects.info()

In [None]:
# Five random rows, shown without the location column.
all_projects.drop(columns=["location"]).sample(5)

### Look at the data

In [None]:
# Ten lead agencies with the most distinct project_id values.
# NOTE(review): assumes a `project_id` column exists — confirm.
all_projects.groupby(["lead_agency"]).agg({"project_id": "nunique"}).sort_values(
    "project_id", ascending=False
).head(10)

In [None]:
# Kern county rows whose description contains "Seal Coat", shown without
# the location column.
all_projects[
    (all_projects.county == "Kern")
    & (all_projects.project_description.str.contains("Seal Coat"))
].drop(columns=["location"])

In [None]:
# all_projects.groupby(['project_category','auto_tagged_project_categories']).agg({'project_id':'nunique'})

In [None]:
# Ten auto-tagged category values with the most distinct project_id values.
# NOTE(review): assumes `auto_tagged_project_categories` and `project_id`
# columns exist — confirm.
all_projects.groupby(["auto_tagged_project_categories"]).agg(
    {"project_id": "nunique"}
).sort_values("project_id", ascending=False).head(10)

In [None]:
# Ten project categories with the most distinct project_id values.
# NOTE(review): assumes a `project_id` column exists — confirm.
all_projects.groupby(["project_category"]).agg({"project_id": "nunique"}).sort_values(
    "project_id", ascending=False
).head(10)

In [None]:
# Ten descriptions shared by the most distinct project_id values.
# NOTE(review): assumes a `project_id` column exists — confirm.
all_projects.groupby(["project_description"]).agg(
    {"project_id": "nunique"}
).sort_values("project_id", ascending=False).head(10)

In [None]:
# Ten counties with the most distinct project_id values.
# NOTE(review): assumes a `project_id` column exists — confirm.
all_projects.groupby(["county"]).agg({"project_id": "nunique"}).sort_values(
    "project_id", ascending=False
).head(10)

In [None]:
# Number of distinct lead agencies.
all_projects.lead_agency.nunique()

In [None]:
# Summary statistics of total_project_cost.
all_projects.total_project_cost.describe()

In [None]:
# Distinct project_id count per data source, for rows marked "Fully funded".
# NOTE(review): assumes a `project_id` column exists — confirm.
all_projects.loc[all_projects.fully_funded == "Fully funded"].groupby(
    ["data_source"]
).agg({"project_id": "nunique"})

In [None]:
# Distinct project_id count per data source, for rows marked "Partially funded".
# NOTE(review): assumes a `project_id` column exists — confirm.
all_projects.loc[all_projects.fully_funded == "Partially funded"].groupby(
    ["data_source"]
).agg({"project_id": "nunique"})

In [None]:
# Distinct project_id count per data source.
# NOTE(review): assumes a `project_id` column exists — confirm.
all_projects.groupby(["data_source"]).agg({"project_id": "nunique"})

In [None]:
# Distinct project_id count per funding status, largest first.
# NOTE(review): assumes a `project_id` column exists — confirm.
all_projects.groupby(["fully_funded"]).agg(
    {"project_id": "nunique"}
).reset_index().sort_values("project_id", ascending=False)

In [None]:
# Distinct project_id count per data source and funding status.
# NOTE(review): assumes a `project_id` column exists — confirm.
all_projects.groupby(["data_source", "fully_funded"]).agg({"project_id": "nunique"})