# Add metrics of interest

In [1]:
import pandas as pd
from siuba import *

In [2]:
# Load the table whose `type_of_work` column is categorized in the cells below.
df = pd.read_parquet("dla_df.parquet")

In [None]:
# Keyword lists used to flag project categories in `type_of_work` descriptions.
# Every keyword is lowercase and is tested with `in` against the lowercased
# description, i.e. as a substring, so a short keyword also matches inside
# longer words.
ACTIVE_TRANSPORTATION = [
    'bike', 'bicycle', 'cyclist',
    'pedestrian', 'crosswalk',
    'bulb out', 'bulb-out',
    'active transp', 'traffic reduction',
    'speed reduction',
]
TRANSIT = ['bus', 'metro', 'station', 'transit']
BRIDGE = ["bridge", 'viaduct']
# NOTE: every entry needs its own trailing comma. Adjacent string literals are
# concatenated by Python, so a missing comma silently merges two keywords
# into one that never matches (previously 'seal' 'sign' became 'sealsign').
STREET = [
    'traffic signal', 'resurface', 'resurfacing', 'slurry', 'seal',
    'sign', 'stripe', 'striping', 'median',
    'guard rail', 'guardrail',
    'road', 'street',
    'sinkhole', 'intersection',
]

# The trailing space is part of the keyword (presumably so the acronym does
# not match inside longer words); as a consequence it also does not match
# when the acronym is the very last word of a description.
FREEWAY = ['hov ', 'hot ']

SIDEWALK_CURB = [
    'curb', 'sidewalk', 'side walk',
    'light', 'tree', 'pavement',
]

# Words that contain a TRANSIT keyword ('metro') without indicating transit;
# a description containing any of these is not flagged as transit.
NOT_TRANSIT = ['metropolitan']
    



def categorize_project_descriptions(row):
    """
    Flag the project categories mentioned in one type-of-work description.

    Takes a single row of the dataframe, lowercases its `type_of_work`
    text, and checks it against the module-level keyword lists. Keywords
    are matched as substrings, and a description can be flagged in
    several categories at once.

    Parameters
    ----------
    row : object with a `type_of_work` attribute (e.g. a dataframe row
        passed by `df.apply(..., axis=1)`).

    Returns
    -------
    pd.Series of 0/1 flags indexed by 'active_transp', 'transit',
    'bridge', 'street', 'freeway', 'sidewalk_curb'. A missing
    (non-string) description yields all zeros.
    """
    description = row.type_of_work
    # A missing description (None / NaN) has no .lower(); treat it as empty
    # text so the row simply gets no flags instead of raising.
    if not isinstance(description, str):
        description = ""
    description = description.lower()

    def _has_any(keywords):
        # True when at least one keyword occurs somewhere in the description
        return any(word in description for word in keywords)

    flags = {
        'active_transp': _has_any(ACTIVE_TRANSPORTATION),
        # 'metro' also occurs inside words listed in NOT_TRANSIT, so a
        # description containing one of those is not counted as transit.
        'transit': _has_any(TRANSIT) and not _has_any(NOT_TRANSIT),
        'bridge': _has_any(BRIDGE),
        'street': _has_any(STREET),
        'freeway': _has_any(FREEWAY),
        'sidewalk_curb': _has_any(SIDEWALK_CURB),
    }

    # Store the booleans as 0/1 dummies so the flags can be summed per row
    return pd.Series({name: int(hit) for name, hit in flags.items()})

In [None]:
# Run the keyword categorizer on each row; the result has one 0/1 column
# per category.
work_categories = df.apply(
    categorize_project_descriptions,
    axis="columns",
)

In [None]:
# Attach the category flag columns next to the original columns.
df2 = pd.concat(
    [df, work_categories],
    axis="columns",
)

In [None]:
# Names of the category flag columns
work_cols = work_categories.columns.tolist()
print(work_cols)

# Number of categories flagged for each row (sum of the 0/1 flags)
flag_count = df2[work_cols].sum(axis="columns")
df2 = df2.assign(work_categories=flag_count)

# Distribution of how many categories each description matched
df2["work_categories"].value_counts()

In [None]:
# Distinct descriptions for which no category keyword matched
list(df2.loc[df2["work_categories"] == 0, "type_of_work"].unique())

Once `type_of_work` is categorized:

* which category had the most funding across all years? by year? do active transportation or transit projects see more funding in recent years?
* is funding lopsided? across all years, what % of funding do the top 5, top 10 agencies account for? if it is lopsided, show a breakdown of these top 5, top 10 agencies across these categories  
* show who are the top 5, 10 agencies within each category (converse of the above). within transit projects, who are the top 5, 10 agencies? within bridge projects, who are the top 5, 10 agencies?
* calculate the average funding for each category across all years (inflation-adjusted). this normalizes between the very few active transportation projects and the many road repair projects, and gives an average amount for each type of project in the district 

Show some `processing_days` metrics by categories.
* are the columns used for processing days usable? is there a way to derive measures of project start / end or project approval timeframe at aggregated levels?