In [None]:
# Normal packages
# String Manipulation
import re
from collections import Counter

# My utilities
import A1_data_prep
import A2_tableau
import nltk
import numpy as np
import pandas as pd

# Formatting
from babel.numbers import format_currency
from calitp import *
from nltk import ngrams
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Base Google Cloud Storage location (gs:// URI) of the "tircp" folder under
# "data-analyses" in the calitp-analytics-data bucket.
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/tircp/"

In [None]:
# Notebook-wide pandas display settings: no cap on rows or on cell text width,
# at most 100 columns, and floats rendered with two decimals.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)

In [None]:
# Load the frame returned by A2_tableau.tableau_dashboard() and pass it through
# to_snakecase (presumably normalizes column names to snake_case; confirm
# against the calitp helper).
df_tircp = to_snakecase(A2_tableau.tableau_dashboard())

In [None]:
# Lower-case the description text in place; the keyword patterns used later in
# this notebook are all lower-case.
df_tircp["description"] = df_tircp["description"].str.lower()

In [None]:
# Keep only the columns this analysis uses. The explicit .copy() makes df_tircp
# an independent frame, so assigning the "project_keywords" column to it later
# in the notebook cannot trigger pandas' SettingWithCopyWarning.
df_tircp = df_tircp[["award_year", "title", "description"]].copy()

### Grabbing keywords  

In [None]:
# Natalie's function
def get_list_of_words(df, col):
    """Return the cleaned word tokens of one text column as a flat list.

    All values of ``df[col]`` are joined into a single lower-cased string,
    punctuation is stripped, the string is tokenized with ``word_tokenize``
    and English stopwords are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column.
    col : str
        Name of the column to tokenize.

    Returns
    -------
    list of str
        Tokens in order of appearance, duplicates kept.
    """
    # Take the column as a Series. Going through ``df[[col]].squeeze()`` would
    # collapse a one-row frame to a bare string, which has no ``.tolist()``.
    # Missing values are skipped because ``" ".join`` only accepts strings.
    texts = df[col].dropna().astype(str).tolist()
    joined_text = " ".join(texts).lower()

    # Remove punctuation (anything that is neither a word character nor
    # whitespace).
    joined_text = re.sub(r"[^\w\s]", "", joined_text)

    # Strip non-letters from the stopwords as well so that e.g. "don't" lines
    # up with the punctuation-free text ("dont"). The class is A-Za-z rather
    # than A-z: the A-z range also spans the ASCII punctuation [ \ ] ^ _ `.
    # A set keeps the membership test below constant-time.
    swords = {
        re.sub(r"[^A-Za-z\s]", "", sword) for sword in stopwords.words("english")
    }

    # Remove stopwords.
    return [word for word in word_tokenize(joined_text) if word not in swords]

In [None]:
# Flatten every project description into one list of cleaned word tokens.
# Word-counting approach adapted from:
# https://stackoverflow.com/questions/64593557/how-to-find-most-common-word-from-the-entire-column-of-string-in-python
descriptions_list = get_list_of_words(df=df_tircp, col="description")

In [None]:
counter = Counter()  # Empty Counter; the next cell fills it with per-word tallies

In [None]:
# Tally every run of word characters found in each cleaned token.
counter.update(
    piece for token in descriptions_list for piece in re.findall(r"\w+", token)
)

In [None]:
# Call most_common() so this holds the list of (word, count) pairs ordered from
# most to least frequent, rather than the bound method object itself.
most_common = counter.most_common()

In [None]:
# Derive the word-frequency table from the live `counter` instead of a pasted
# snapshot of an earlier run, so the counts cannot drift out of sync with the
# descriptions that are actually loaded. most_common() yields the pairs from
# most to least frequent, and the resulting Counter keeps that order.
most_common_dict = Counter(dict(counter.most_common()))

In [None]:
# Two-column table: one row per word with the number of times it appears.
# NOTE(review): "apperance" is misspelled; the label is kept as-is because
# renaming the column would change the notebook's output.
df_common_words = pd.DataFrame(
    list(most_common_dict.items()), columns=["word", "total apperance"]
)

### Grabbing phrases #1 
* Find most common phrases. 
* https://stackoverflow.com/questions/60037924/how-count-the-most-frequently-repeated-phrases-in-pandas

In [None]:
# Count every pair of adjacent words (2-grams) in the cleaned token list.
c = Counter(" ".join(pair) for pair in ngrams(descriptions_list, 2))

In [None]:
# One row per two-word phrase with its number of occurrences.
df_phrases = pd.DataFrame(list(c.items()), columns=["phrases", "total"])

In [None]:
# Collect the distinct phrases that occur more than 4 times into a list.
most_common_phrases = (
    df_phrases.loc[df_phrases["total"] > 4, "phrases"].unique().tolist()
)

In [None]:
# Phrases to drop from the automatically detected list.
values_to_remove = (
    "per day",
    "green line",
    "san jose",
    "san diego",
    "los angeles",
    "also includes",
    "santa barbara",
    "pacific surfliner",
)

# Keywords appended to the list by hand.
values_to_add = (
    "zero-emission",
    "capacity improvements",
    "rider",
    "safety",
    "capacity",
    "battery-electric",
    "contactless payment",
    "buses",
    "light rail vehicles",
    "passengers",
    "increase ridership",
    "new routes",
    "mobile ticketing",
    "service expansion",
    "extension",
)

In [None]:
# Drop the unwanted phrases (place names and filler such as "also includes").
most_common_phrases = list(
    filter(lambda phrase: phrase not in values_to_remove, most_common_phrases)
)

# Then append the hand-picked keywords to the same list.
most_common_phrases += values_to_add

In [None]:
# https://stackoverflow.com/questions/64727090/extract-all-matching-keywords-from-a-list-of-words-and-create-a-new-dataframe-pa
# Build one regex alternation out of the phrase list.
#  * dict.fromkeys() drops repeated phrases while keeping their order.
#  * Alternatives are tried left to right and the first one that matches wins,
#    so longer phrases are placed first; otherwise "light rail" would always
#    pre-empt "light rail vehicles". sorted() is stable, so phrases of equal
#    length keep their original relative order.
#  * re.escape() makes every phrase match literally even if it contains a
#    regex metacharacter.
query = "|".join(
    re.escape(phrase)
    for phrase in sorted(dict.fromkeys(most_common_phrases), key=len, reverse=True)
)

In [None]:
# For each description, list every whole-word occurrence of a keyword phrase.
df_tircp["project_keywords"] = df_tircp["description"].str.findall(
    rf"\b({query})\b"
)

In [None]:
# Unpack each keyword list into one row per keyword, order the rows by award
# year and title, then keep each keyword only once per project title.
df_explode = (
    df_tircp.explode("project_keywords")
    .sort_values(by=["award_year", "title"])
    .drop_duplicates(subset=["title", "project_keywords"])
)

In [None]:
# Rows without a matched keyword get the label "Other"; every label is then
# title-cased.
df_explode["project_keywords"] = (
    df_explode["project_keywords"].fillna(value="Other").str.title()
)

In [None]:
# Number of distinct descriptions per keyword, largest count first.
(
    df_explode.groupby(["project_keywords"])
    .agg(description=("description", "nunique"))
    .sort_values(by="description", ascending=False)
)

In [None]:
# Summary: one row per project with all of its keywords joined by commas.
(
    df_explode.groupby(["award_year", "title", "description"])["project_keywords"]
    .apply(",".join)
    .reset_index()
)

In [None]:
# Hand-maintained keyword pattern: a single capture group whose alternatives
# are listed one per line, in the order the regex engine tries them.
extract = "(" + "|".join(
    [
        "electric buses",
        "construction new",
        "zero emission",
        "transit service",
        "service connecting",
        "light rail",
        "rail vehicles",
        "bus rapid",
        "rapid transit",
        "expand service",
        "battery electric",
        "transit services",
        "zeroemission buses",
        "bus service",
        "network integration",
        "rail service",
        "layover facility",
        "charging infrastructure",
        "travel time",
        "transit center",
        "zero-emission",
        "capacity improvements",
        "rider",
        "safety",
        "capacity",
        "battery-electric",
        "bus stops",
        "contactless",
        "buses",
        "light rail vehicles",
        "passengers",
        "passenger",
        "new routes",
    ]
) + ")"

In [None]:
# Pull every pattern hit out of each description, then move the per-row match
# number from the index into the columns.
matches = (
    df_tircp["description"]
    .str.extractall(extract)
    .unstack()
)

In [None]:
# Display the table of extracted keyword matches.
matches