## TIRCP DRISI
* DRISI emailed a request asking for TIRCP outcomes for cycles 3-5.  
* [Cycles 1-6](https://calsta.ca.gov/subject-areas/transit-intercity-rail-capital-prog)
* Cycle 1: 2015
* Cycle 2: 2016
* Cycle 3: 2018
* Cycle 4: 2020
* Cycle 5: 2022
* Cycle 6: 2023

<b>Notes 2/7/23</b>
* Application is different than what we have in the dataset.
* Outputs: hydrogen-battery fuel cell.
* DRISI wants the data before end of this week.
* Outputs
    * Category for outputs: transit/multimodal or rail/multimodal. 
    * Pull it from `allocations` tab.
    * Filter out design categories. We only care about 'CONST'.
    * Transit/Multimodal
        * Buses breakout between mobile/school/etc. Unit is each.
            * Microtransit
            * Bus Conversion
            * Zero emission (don't distinguish between hydrogen and battery)
            * Battery Electric
            * Ferries
    * Rail/Multimodal
        * Trolleys
        * Trains/Cars/Coach/Rolling Stocks
        * Traffic Control
    * Track
        * Track Extension
        * Double Track
    * Network Integration improves multimodal network so everything is synced. 
    * Parking Lots/Parking Deck
    * Active Transportation
        * Bike Shelters/Shade Structures
        * Bike and Pedestrians Improvements
    * Facilities
        * Center/Facility/Station/Hub/Islands
    * Charging Infrastructure. 
    * Storm Drain Line
    * Street Extension
    * Charging
    * Signaling
* Outcomes
    * On Time Performance
        * Expanded Service
        * Routes 
        * Ridership

In [81]:
import A1_data_prep
import A2_tableau
import A8_strings
import numpy as np
import pandas as pd
from babel.numbers import format_currency
from calitp import *

In [82]:
# Notebook display settings: show up to 125 columns, every row, full cell
# text, and floats rendered with two decimals.
pd.set_option("display.max_columns", 125)
pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [83]:
# GCS File Path:
GCS_FILE_PATH = f"{A1_data_prep.GCS_FILE_PATH}DRISI/"

## Prep Functions

In [84]:
def read_in_files(file_name: str, columns_to_drop: list):
    """
    Load one of the DRISI Excel workbooks (award, implemented,
    or outcome project details) and tidy it for merging.

    The workbook is read from GCS_FILE_PATH, passed through
    to_snakecase, stripped of `columns_to_drop` and of any column
    whose name contains a descriptive keyword, and finally has its
    missing values filled based on each column's dtype.
    """
    workbook_path = f"{GCS_FILE_PATH}{file_name}"
    tidy = to_snakecase(pd.read_excel(workbook_path))
    tidy = tidy.drop(columns=columns_to_drop)

    # Discard every column whose name contains one of these substrings
    unwanted_pattern = "funding|recipient|date|cost|program|amount|name|description"
    is_unwanted = tidy.columns.str.contains(unwanted_pattern)
    tidy = tidy.loc[:, ~is_unwanted]

    # Fill value per column: float64 -> 0.0, object -> "None"
    fill_values = tidy.dtypes.replace({"float64": 0.0, "object": "None"})

    return tidy.fillna(fill_values)

In [85]:
def merge_value_counts(left_df, right_df, left_m_col: str, right_m_col: str):
    """
    Merge two dataframes without repeating shared columns.

    Columns that exist in both frames are dropped from the right
    frame before merging so the information is not repeated (and
    suffixed with _x/_y). The right merge key is never dropped, even
    when a column of the same name exists on the left; otherwise the
    merge itself would fail with a KeyError.

    Parameters
    ----------
    left_df, right_df : pandas.DataFrame
        Frames to merge. Neither is modified.
    left_m_col, right_m_col : str
        Merge key in the left and right frame respectively.

    Returns
    -------
    (outer, left) : tuple of pandas.DataFrame
        `outer` is an outer merge with a `_merge` indicator column,
        useful for checking both/left_only/right_only counts.
        `left` is the left merge used for further work.
    """
    # Some of these sheets have repeated columns: remove them from the
    # right frame, but keep the column we are about to merge on.
    shared_cols = [
        col
        for col in right_df.columns.intersection(left_df.columns)
        if col != right_m_col
    ]
    right_df = right_df.drop(columns=shared_cols)

    # Outer merge is only for inspecting how well the keys line up
    outer = pd.merge(
        left_df,
        right_df,
        how="outer",
        left_on=left_m_col,
        right_on=right_m_col,
        indicator=True,
    )

    left = pd.merge(
        left_df,
        right_df,
        how="left",
        left_on=left_m_col,
        right_on=right_m_col,
    )

    return outer, left

## Outputs #1 
### Manipulate TIRCP Project Sheet

In [86]:
# Columns requested from the allocation sheet; passed to output_tircp
# as `allocation_cols_to_keep`.
allocation_cols = [
    "allocation_ppno",
    "allocation_award_year",
    "allocation_components",
    "allocation_phase",
]
# Columns requested from the project sheet; passed to output_tircp
# as `project_cols_to_keep`.
project_cols = [
    "project_ppno",
    "project_award_year",
    "project_project_#",
    "project_project_title",
    "project_project_description",
    "project_grant_recipient",
]

In [87]:
def output_tircp(project_cols_to_keep:list, allocation_cols_to_keep:list):
    """
    Build the joined TIRCP allocation-project table that is later
    searched through its components and project descriptions.

    Rows are limited to award years 2018 onward, reduced to one row
    per PPNO/allocation-component pair, given a "project_number" key
    ("<award year>-<two digit project #>"), and any missing allocation
    components are filled in from the project description.
    """
    # Joined allocation-project sheet ("left" is handed to the helper)
    merged = A1_data_prep.merge_allocation_project(
        project_cols_to_keep, allocation_cols_to_keep, "left"
    )

    # Only the award year is filtered here; phases are handled by the
    # sort + de-duplication below rather than by an explicit filter.
    from_2018 = merged["project_award_year"] >= 2018
    merged = merged[from_2018].reset_index(drop=True)

    sort_keys = ["project_award_year", "project_project_#", "allocation_phase"]
    merged = merged.sort_values(sort_keys)

    # When the same component appears under several phases, only the
    # first phase in sort order survives (author's note: construction).
    dedupe_keys = ["project_ppno", "allocation_components"]
    merged = merged.drop_duplicates(subset=dedupe_keys).reset_index(drop=True)

    # Project number: zero-pad single digit project #s to two digits
    # https://stackoverflow.com/questions/20990863/python-pandas-add-leading-zero-to-make-all-months-2-digits
    award_year = merged["project_award_year"].astype(str)
    padded_number = merged["project_project_#"].map("{:02}".format).astype(str)
    merged["project_number"] = award_year + "-" + padded_number

    # Projects without allocation component info fall back to the
    # project description
    merged["allocation_components"] = merged["allocation_components"].fillna(
        merged["project_project_description"]
    )

    return merged

In [88]:
tircp3 = output_tircp(project_cols, allocation_cols)

### Extract Outputs

In [89]:
# Simplify allocation components before extracting digits
tircp3 = A8_strings.simplify_descriptions(
    tircp3,
    "allocation_components",
    "clean_components",
    A8_strings.description_words_to_delete,
)

In [90]:
# Keywords that will go under each category
# Keywords that will go under each category.
# Each list is one output category; a component description that contains
# any keyword of a list is counted under that category.
track = ["track", "tracking"]
rail = ["trolley", "train", "car", "coach", "rolling", "traffic", "light rail", "rail"]
bus = ["bus", "van", "buses", "microtransit", "micro transit", "zebs"]
ferry = ["ferry", "ferries", "vessel"]
active_transportation = [
    "bike",
    "shelter",
    "pedestrian",
    "cycle",
    "crosswalk",
    "sidewalk",
    "bicycle",
]
network = ["network"]
parking = ["parking", " lots ", "deck"]
facilities = [
    "center",
    "facility",
    "station",
    "hub",
    "islands",
    "shelter",
    "shade",
    "location",
    "canopies",
    "stations",
]
charging = ["charging", "charge"]
storm_drain = ["storm"]
street = [
    "street",
    "curve",
    "cross",
    "signal",
    "signaling",
    "traffic control",
    "road",
    "lane",
    "surface",
    "interchange",
    "intersection",
    "shoulder",
    "grade separations",
    "crossing",
    "border",
    "corridor",
]
other_vehicles = ["vehicle", "zemu", "lrv", "fleet"]
bridge_tunnel = ["bridge", "tunnel"]
infrastructure = [
    "infrastructure",
    "bluff",
    "operation",
    "rehabilitation",
    "stabilization",
    "electrification",
    "ramp",
    "mainline",
    "port",
    "fiber optic",
    "switches",
    "signals",
    "signs",
    "lights",
]
# NOTE(review): "ITS" is upper case while extract_keywords lower-cases the
# text before matching, so it cannot match there - confirm whether a
# lower-case variant is wanted ("its" would also match the pronoun).
ontime_transit_improvements = [
    "connect",
    "service",
    "mobility",
    "contactless",
    "rider",
    "integrated",
    " line ",
    "wayfinding",
    "loop",
    "modernization",
    "safety",
    "speed",
    "ITS",
    "signage",  # was misspelled "signange", which could never match
    "mobile app",
    "ticket",
    "ridership",
    "expanded",
    "time",
    "route",
]

In [91]:
my_keywords_list = [
    track,
    rail,
    bus,
    ferry,
    active_transportation,
    network,
    parking,
    facilities,
    charging,
    storm_drain,
    street,
    other_vehicles,
    bridge_tunnel,
    infrastructure,
    ontime_transit_improvements,
]

In [92]:
my_new_column_names_list = [
    "track",
    "rail",
    "bus",
    "ferry",
    "active_transportation",
    "network",
    "parking",
    "facilities",
    "charging",
    "storm_drain",
    "street",
    "other_vehicles",
    "bridge_tunnels",
    "infrastructure",
    "ontime_transit_improvements",
]

In [93]:
def fill_in_zeroes(df, keywords: list, description_column: str, new_col_name: str):
    """
    Flag rows whose description mentions any of the keywords.

    Adds `new_col_name` to `df` (the frame passed in is modified)
    holding 1 where a keyword is found in `description_column`
    and 0 where none is found, then returns `df`.
    """
    # One capture group with every keyword as an alternative: (a|b|c)
    pattern = "(" + "|".join(keywords) + ")"

    # Whatever keyword was captured gets recoded to 1
    hit_to_one = {keyword: 1 for keyword in keywords}

    first_hit = df[description_column].str.extract(pattern, expand=False)
    flags = first_hit.replace(hit_to_one)

    # No keyword found -> 0
    df[new_col_name] = flags.fillna(0)
    return df

In [94]:
def clean_procurements(df, description_col:str,  keywords:list, new_columns:list):
    """
    Estimate procurement totals per category and make sure a
    category is coded as at least 1 whenever one of its keywords
    appears in the description.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the description column.
    description_col : str
        Column searched for keywords. It is used both for the
        procurement estimates and for the keyword flags (previously
        the flags always searched "clean_components" regardless of
        this argument).
    keywords : list of list of str
        One keyword list per category.
    new_columns : list of str
        Category names, parallel to `keywords`. The columns
        "total_<name>" are read and updated here.

    Returns
    -------
    pandas.DataFrame
        The frame with the "total_<name>" columns updated and every
        column whose name matches the regex "new" removed.
    """
    # Presumably adds one "total_<name>" column per category; those
    # columns are read immediately below.
    df = A8_strings.total_procurement_estimates(df, description_col, keywords, new_columns)

    for i, keyword_group in enumerate(keywords):
        flag_col = f"new_{new_columns[i]}"
        total_col = f"total_{new_columns[i]}"

        df = fill_in_zeroes(df, keyword_group, description_col, flag_col)

        # Replace any zeroes in the original columns with 1 if a keyword is found
        # https://stackoverflow.com/questions/68243146/replace-zero-with-value-of-an-other-column-using-pandas
        df[total_col] = df[total_col].mask(df[total_col].eq(0), df[flag_col])

    # Drop the temporary flag columns. NOTE(review): this removes every
    # column whose name contains "new", not only the flags created above.
    df = df[df.columns.drop(list(df.filter(regex="new")))]

    return df

In [95]:
tircp4 = clean_procurements(tircp3,
    "clean_components",
    my_keywords_list,
    my_new_column_names_list,)

In [97]:
additional_keywords = [
    "microtransit",
    "emission",
    "conversion",
    "zero",
    "hydrogen",
    "battery",
    "electric",
    "hybrid",
    "zev",
    "zemu",
]

In [98]:
groupby_cols = [
    "project_project_title",
    "project_award_year",
    "project_number",
    "allocation_components",
    "total_track",
    "total_rail",
    "total_bus",
    "total_ferry",
    "total_active_transportation",
    "total_network",
    "total_parking",
    "total_facilities",
    "total_charging",
    "total_storm_drain",
    "total_street",
    "total_other_vehicles",
    "total_bridge_tunnels",
    "total_infrastructure",
    "total_ontime_transit_improvements",
]

In [99]:
def extract_keywords(df, list_of_words: list, more_keywords: list, column: str, 
                     unique_cols:list, cols_to_keep:list):
    """
    Categorize projects by searching `column` for keywords and
    recording every keyword found in a new "categories" column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to search. It is not modified.
    list_of_words : list of list of str
        Keyword lists, one per category.
    more_keywords : list of str
        Extra keywords searched in addition to `list_of_words`.
        The list is not modified.
    column : str
        Text column to search (lower-cased before matching).
    unique_cols : list of str
        Columns the exploded rows are sorted by. The list is not
        modified.
    cols_to_keep : list of str
        Columns to group by when collapsing the keywords of a row
        back into one comma separated string.

    Returns
    -------
    pandas.DataFrame
        `cols_to_keep` plus "categories", where "categories" is a
        comma separated string of the distinct keywords found, or
        "none" when nothing matched.
    """
    # Build the search vocabulary in a fresh list. Extending
    # `more_keywords` directly would grow the caller's list on every call.
    all_keywords = list(more_keywords)
    for words in list_of_words:
        all_keywords.extend(words)

    query = "|".join(all_keywords)

    # assign() returns a new frame, so the caller's df is left untouched
    df = df.assign(
        categories=df[column].str.lower().str.findall(r"\b({})\b".format(query))
    )

    # All the category values are packed into a list. Unpack and del duplicates.
    # Duplicates are judged on the whole row. The earlier code passed
    # `unique_cols.append('categories')` as the subset, which evaluates to
    # None (all columns) and also appended to the caller's list; the
    # whole-row behaviour is kept so the same rows are retained.
    df = (
        df.explode("categories")
        .sort_values(unique_cols)
        .drop_duplicates()
    )

    # Fill na
    df["categories"] = df["categories"].fillna("none")

    # Regroup
    df = df.groupby(cols_to_keep)["categories"].apply(",".join).reset_index()

    return df

In [100]:
tircp4 = extract_keywords(
    tircp4, my_keywords_list, additional_keywords, "allocation_components", ["project_award_year", "project_project_title"],groupby_cols
)

In [101]:
#tircp4 = tircp4.explode("categories").drop_duplicates(subset = ["project_award_year", "project_project_title","categories"])

In [102]:
#tircp4['categories'] = tircp4['categories'].fillna("none")

In [103]:
#tircp4.groupby(groupby_cols)["categories"].apply(",".join).reset_index()

### Add in info from TIRCP Battery and Hydrogen Fuel Cell Bus list 10-10-2022 (1).xlsx

In [104]:
battery_drop_cols = [
    "local_agency_contact",
    "local_agency_email",
    "local_agency_phone_number",
    "awarded_allocated",
    "procured_contracted",
    "components",
]

In [105]:
battery = read_in_files("TIRCP Battery and Hydrogen Fuel Cell Bus list 10-10-2022 (1).xlsx", battery_drop_cols)

In [106]:
battery["project_number"] = (
    battery["award_year"].astype(str) + "-" + battery["project_#"].map("{:02}".format).astype(str)
)

In [107]:
battery["hydrogen_battery_buses"] = battery['#_hydrogen_fuel_cell_buses'] + battery['#_battery_electric_buses']

In [108]:
tircp4.merge(battery[['project_number','hydrogen_battery_buses']], how="outer", on=["project_number"], indicator= True)[["_merge"]].value_counts()

_merge    
left_only     243
both           49
right_only      7
dtype: int64

In [109]:
tircp5 = tircp4.merge(battery[['project_number','hydrogen_battery_buses']], how="left", on=["project_number"])

In [110]:
# Where the keyword search found no buses (0), take the count from the
# battery/hydrogen workbook instead; anything still missing becomes 0.
tircp5["total_bus"] = (
    tircp5["total_bus"]
    .mask(tircp5["total_bus"].eq(0), tircp5["hydrogen_battery_buses"])
    .fillna(0)
)

In [111]:
# Dups appear: sort by project number and total buses.
# Keep only the row with the most buses. The sort is ascending, so the
# largest count is the LAST row of each duplicate group; the default
# keep="first" would retain the row with the fewest buses instead.
tircp5 = tircp5.sort_values(["project_number", "total_bus"]).drop_duplicates(
    subset=["project_number", "project_project_title", "allocation_components"],
    keep="last",
)

### Clean

In [112]:
# Subset for only relevant columns
outputs_cols = [
    "project_project_title",
    "project_award_year",
    "allocation_components",
    "categories",
    "total_track",
    "total_rail",
    "total_bus",
    "total_ferry",
    "total_active_transportation",
    "total_network",
    "total_parking",
    "total_facilities",
    "total_charging",
    "total_storm_drain",
    "total_street",
    "total_other_vehicles",
    "total_bridge_tunnels",
    "total_infrastructure",
    "total_ontime_transit_improvements",
]

In [113]:
outputs = tircp5[outputs_cols]

In [114]:
outputs = A1_data_prep.clean_up_columns(outputs)

In [115]:
agg_cols = ['Total Track',
       'Total Rail', 'Total Bus', 'Total Ferry', 'Total Active Transportation',
       'Total Network', 'Total Parking', 'Total Facilities', 'Total Charging',
       'Total Storm Drain', 'Total Street', 'Total Other Vehicles',
       'Total Bridge Tunnels', 'Total Infrastructure',
       'Total Ontime Transit Improvements',
]

* Merging in the battery/hydrogen bus data causes the number of buses to be double counted.
* Open question: how should these rows be de-duplicated?

In [116]:
# Largest value of every total within each title/component/category group
outputs_project = outputs.groupby(["Title", "Components", "Categories"]).agg(
    dict.fromkeys(agg_cols, "max")
)

In [166]:
# outputs_project

In [117]:
# Sum of every total per award year, transposed so years become columns
outputs_year = (
    outputs.groupby(["Award Year"]).agg(dict.fromkeys(agg_cols, "sum")).T
)

## Output 2
* Repeat the process above but based on table from Tyler instead of Linda's notes.

In [118]:
# Keyword groups for the second output table, one list per category.
# NOTE: `bridge_tunnel`, `rail` and `bus` rebind names that were already
# assigned earlier in the notebook; lists built from the earlier objects
# (e.g. my_keywords_list) still hold the earlier values.
bicycle_lane_miles = ["bicycle lane miles", "bicycle", "bike"]
# Keywords padded with spaces only match when surrounded by spaces
signs = [" sign ",' signs ','lights','greenway','safety']
crosswalk_sidewalk = ['sidewalk','crosswalk']
pedestrian  = ['ped','pedestrian'] 
interchange = ['interchange']
bridge_tunnel = ['bridge','bridges','tunnel','tunnels']
intersection_signal = ['signal','intersection', 'signals','signaling','intersections'] 
fiber_optics = ['fiber','optics','cable','communication']
its = [' its ']
port = [' port ']
rail = ["trolley", "train", "car", "coach", "rolling", "traffic", "light rail", "rail"]
bus = ["bus", "van", "buses", "microtransit", "micro transit", "zebs"]
# NOTE(review): "auxilary lane" looks like a misspelling of "auxiliary lane";
# the separate "lane" keyword would still catch such text - confirm.
freeway = ["freeway ramp", "hov", "shoulder", "roadway", "lane", "local road", "auxilary lane",
          "ramps", "connectors", "pocket", "curve"]
station = ["station"]
grade_seperation = ['grade separations']


In [119]:
my_keywords_list2 = [
   bicycle_lane_miles, signs, crosswalk_sidewalk,
    pedestrian, interchange, bridge_tunnel, intersection_signal,
    fiber_optics, its, port, rail, bus,freeway, station, grade_seperation
]

In [120]:
my_columns_list2 = [
   'bicycle_lane_miles', 'signs', 'crosswalk_sidewalk',
    'pedestrian', 'interchange', 'bridge_tunnel', 'intersection_signal',
    'fiber_optics', 'its', 'port', 'rail', 'bus','freeway', 'station', 'grade_seperation'
]

In [121]:
tircp6 = clean_procurements(tircp3,
    "clean_components",
    my_keywords_list2,
    my_columns_list2,)

In [122]:
outputs3 = A1_data_prep.clean_up_columns(tircp6)

In [123]:
agg_cols2 = ['Total Bicycle Lane Miles',
       'Total Signs', 'Total Crosswalk Sidewalk', 'Total Pedestrian',
       'Total Interchange', 'Total Bridge Tunnel', 'Total Intersection Signal',
       'Total Fiber Optics', 'Total Its', 'Total Port', 'Total Rail',
       'Total Bus', 'Total Freeway', 'Total Station',
       'Total Grade Seperation']

In [124]:
# Change from wide to long dataframe
outputs4 = pd.melt(outputs3, id_vars=['Components'], value_vars=agg_cols2).rename(columns = {'variable':'Variable', 'value':'Value'})

In [169]:
# Only retain rows with a value greater than 0
outputs5 = outputs4.loc[outputs4.Value > 0 ]

In [171]:
# outputs5

In [172]:
outputs6 = outputs5.groupby(['Variable']).agg({'Components':'nunique'})

In [173]:
outputs6

Unnamed: 0_level_0,Components
Variable,Unnamed: 1_level_1
Total Bicycle Lane Miles,5
Total Bridge Tunnel,2
Total Bus,56
Total Fiber Optics,2
Total Freeway,7
Total Intersection Signal,12
Total Pedestrian,4
Total Rail,29
Total Signs,5
Total Station,30


## Outcomes
### Clean up Projects Sheet

In [128]:
def clean_project():
    """
    Clean up TIRCP Project sheet
    to merge with other sheets.

    Keeps award years 2018 onward, adds a "project_number" key
    ("<award year>-<two digit project #>") and returns only the
    identifying columns.
    """
    sheet = A1_data_prep.clean_project()

    recent = sheet["project_award_year"] >= 2018
    sheet = sheet.loc[recent].reset_index(drop=True)

    # Zero-pad the project # so the key lines up with the other workbooks
    year_part = sheet["project_award_year"].astype(str)
    number_part = sheet["project_project_#"].map("{:02}".format).astype(str)
    sheet["project_number"] = year_part + "-" + number_part

    id_columns = [
        "project_award_year",
        "project_project_title",
        "project_grant_recipient",
        "project_ppno",
        "project_number",
    ]
    sheet = sheet[id_columns]

    return sheet

In [129]:
# Subset tircp
project = clean_project()

In [130]:
project.shape

(68, 5)

### AwardedProjectsDetail.xlsx

In [131]:
drisi_drop_cols = [
    "agency_code",
    "agency_short_name",
    "sub_program_description",
    "agency_name",
    "program_short_name",
    "program_name",
    "program_description",
    "sub_program_short_name",
    "sub_program_name",
    "project_type",
    "agency",
    "program",
    "date_imported",
    "contractor_or_awardee_admin_expenses",
    "voucher_id",
    "project_is_completed",
    "project_is_canceled"
]

In [132]:
drisi = read_in_files("AwardedProjectsDetail.xlsx", drisi_drop_cols)

In [133]:
drisi.shape

(63, 33)

In [134]:
outer_drisi, m1 = merge_value_counts(project, drisi, "project_number", "project_id",) 

In [135]:
m1 = m1.drop(columns = ['record_type', "project_id", "project_life_years", "project_status"])

In [136]:
m1.shape

(68, 34)

### ImplementedProjectsDetail.xlsx

In [137]:
implemented_drop_cols = [
    "programuniqueidentifier",
    "record_type",
    "reporting_cycle_name",
    "agency_short_name",
    "agency_name",
    "date_operational",
    "program_name",
    "program_description",
    "project_completion_date",
    "date_imported",
    "sub_program_name",
    "date_selected_for_award",
    "project_name",
    "project_type",
    "fiscal_year_funding_project",
    "census_tract",
    "address",
    "lat_long",
    "total_program_ggrffunding",
    "voucher_name",
    "voucher_description",
]

In [138]:
implemented = read_in_files("ImplementedProjectsDetail.xlsx", implemented_drop_cols)

In [139]:
outer_implemented, m2 = merge_value_counts(m1, implemented, "project_number", "project_idnumber") 

In [140]:
m2 = m2.drop(columns = ['project_count',"project_idnumber",'proj_rec_id','voucher_id'])

In [141]:
# Why does the df become so large?
m2.shape

(171, 90)

In [142]:
m2 = m2.drop_duplicates(subset = ["project_ppno", "project_number"])

In [143]:
m2.shape

(68, 90)

### OutcomeProjectsDetail.xlsx 

In [144]:
outcomes_drop_cols = [
    "unnamed:_0",
    "proj_rec_id",
    "reporting_cycle_name",
    "agency_short_name",
    "agency_name",
    "program_short_name",
    "program_name",
    "program_description",
    "record_type",
    "sub_program_short_name",
    "sub_program_name",
    "sub_program_description",
    "date_imported",
]

In [145]:
outcomes = read_in_files("OutcomeProjectsDetail.xlsx", outcomes_drop_cols)

In [146]:
# outcomes.sort_values('projectid_number')

In [147]:
# list(outcomes.columns)

In [148]:
outer_outcomes, m3 = merge_value_counts(m2, outcomes, "project_number", "projectid_number") 

In [149]:
outer_outcomes._merge.value_counts()

left_only     67
right_only    57
both           3
Name: _merge, dtype: int64

In [150]:
outer_outcomes.loc[outer_outcomes._merge == "right_only"]['projectid_number'].unique()

array(['2015-07', '2015-06', '2015-01', '2015-04', '2015-10', '2015-09',
       '2015-12', '2015-05', '2016-04', '2016-01', '2016-12', '2015-02',
       '2015-08', '2015-11', '2016-08', '2015-14', '2015-13', '2016-05'],
      dtype=object)

### TIRCP_AllProjects_12212022 (002).xlsx
* Doesn't have anything interesting.

In [158]:
# Fill in empty values with NA
all_projects = read_in_files("TIRCP_AllProjects_12212022 (002).xlsx", [])

In [159]:
# all_projects.sample()

### Clean merges

In [151]:
m3 = m3.drop(columns = ["projectid_number","ab1550choice"])

In [152]:
m3 = A1_data_prep.clean_up_columns(m3)

In [153]:
# Keep only numeric columns, leaving out the 'Award Year' grouping key.
# ('Award Year') without a trailing comma is a plain string, so `not in`
# was a substring test that would also exclude any column whose name is
# a substring of it (e.g. "Year"); compare against the name directly.
value_cols = [e for e in m3.select_dtypes(exclude=['object']).columns.tolist() if e != 'Award Year']

In [154]:
# m3.info(verbose=True)

In [155]:
outcomes_project = m3.groupby('Title').agg({**{e: "max" for e in value_cols}}).T

In [156]:
outcomes_year = m3.groupby('Award Year').agg({**{e: "sum" for e in value_cols}}).T

In [157]:
# outcomes_year

## Save

In [162]:
"""
with pd.ExcelWriter(f"{GCS_FILE_PATH}drisi_outcomes_outputs.xlsx") as writer:
    outputs_project.to_excel(writer, sheet_name="outputs_project", index=True)
    outputs_year.to_excel(writer, sheet_name="outputs_year", index=True)
    outcomes_project.to_excel(writer, sheet_name="outcomes_project", index=True)
    outcomes_year.to_excel(writer, sheet_name="outcomes_year", index=True)
    outputs5.to_excel(writer, sheet_name="investment_composition", index=True)
    outputs6.to_excel(writer, sheet_name="investment_composition_detail", index=True)"""

'\nwith pd.ExcelWriter(f"{GCS_FILE_PATH}drisi_outcomes_outputs.xlsx") as writer:\n    outputs_project.to_excel(writer, sheet_name="outputs_project", index=True)\n    outputs_year.to_excel(writer, sheet_name="outputs_year", index=True)\n    outcomes_project.to_excel(writer, sheet_name="outcomes_project", index=True)\n    outcomes_year.to_excel(writer, sheet_name="outcomes_year", index=True)\n    outputs5.to_excel(writer, sheet_name="investment_composition", index=True)\n    outputs6.to_excel(writer, sheet_name="investment_composition_detail", index=True)'