## TIRCP DRISI
* DRISI emailed a request asking for TIRCP outcomes for cycles 3-5.  
* [Cycles 1-6](https://calsta.ca.gov/subject-areas/transit-intercity-rail-capital-prog)
* Cycle 1: 2015
* Cycle 2: 2016
* Cycle 3: 2018
* Cycle 4: 2020
* Cycle 5: 2022
* Cycle 6: 2023

<b>Notes 2/7/23</b>
* Application is different than what we have in the dataset.
* Outputs: hydrogen-battery fuel cell.
* DRISI wants the data before end of this week.
* Outputs
    * Category for outputs: transit/multimodal or rail/multimodal. 
    * Pull it from `allocations` tab.
    * Filter out design categories. We only care about 'CONST'.
    * Transit/Multimodal
        * Buses breakout between mobile/school/etc. Unit is each.
            * Microtransit
            * Bus Conversion
            * Zero emission (don't distinguish between hydrogen vs battery)
            * Battery Electric
            * Ferries
    * Rail/Multimodal
        * Trolleys
        * Trains/Cars/Coach/Rolling Stocks
        * Traffic Control
    * Track
        * Track Extension
        * Double Track
    * Network Integration improves multimodal network so everything is synced. 
    * Parking Lots/Parking Deck
    * Active Transportation
        * Bike Shelters/Shade Structures
        * Bike and Pedestrians Improvements
    * Facilities
        * Center/Facility/Station/Hub/Islands
    * Charging Infrastructure. 
    * Storm Drain Line
    * Street Extension
    * Charging
    * Signaling
* Outcomes
    * On Time Performance
        * Expanded Service
        * Routes 
        * Ridership

In [55]:
import itertools

import A1_data_prep
import A2_tableau
import A8_strings
import numpy as np
import pandas as pd
from babel.numbers import format_currency
from calitp import *

In [56]:
# Notebook display settings: up to 125 columns, floats with two decimals,
# and no truncation of rows or of long cell text.
pd.set_option("display.max_columns", 125)
pd.set_option("display.float_format", "{:.2f}".format)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [57]:
# GCS folder for this request: the "DRISI/" subfolder of the TIRCP data path.
GCS_FILE_PATH = "{}DRISI/".format(A1_data_prep.GCS_FILE_PATH)

In [58]:
# Display the resolved folder as a sanity check.
GCS_FILE_PATH

'gs://calitp-analytics-data/data-analyses/tircp/DRISI/'

## Prep Functions

In [59]:
def read_in_files(file_name: str, columns_to_drop: list):
    """
    Load one Excel workbook from the DRISI folder and trim it down.

    Args:
        file_name: workbook name, appended to GCS_FILE_PATH.
        columns_to_drop: column names (as they appear after to_snakecase)
            to remove outright.

    Returns:
        The dataframe without `columns_to_drop`, without any column whose
        name contains one of the excluded substrings, and with missing
        values filled (0.0 in float64 columns, "None" in object columns).
    """
    raw = pd.read_excel(f"{GCS_FILE_PATH}{file_name}")
    trimmed = to_snakecase(raw).drop(columns=columns_to_drop)

    # Any column whose name contains one of these fragments is removed.
    unwanted_fragments = "funding|recipient|date|cost|program|amount|name|description"
    has_unwanted_fragment = trimmed.columns.str.contains(unwanted_fragments)
    trimmed = trimmed.loc[:, ~has_unwanted_fragment]

    # Per-column fill value chosen from the column dtype.
    fill_values = trimmed.dtypes.replace({"float64": 0.0, "object": "None"})
    return trimmed.fillna(fill_values)

In [60]:
def merge_value_counts(left_df, right_df, left_m_col: str, right_m_col: str):
    """
    Merge two sheets twice: once as an outer join for checking match
    rates, once as a left join for actual use.

    Columns of `right_df` that already exist in `left_df` are dropped
    before merging so the information is not repeated. The right merge
    key is always kept, even when it shares its name with a left column;
    otherwise the merge itself would fail with a KeyError.

    Args:
        left_df: dataframe whose rows are all kept in the left merge.
        right_df: dataframe supplying the additional columns.
        left_m_col: merge key in `left_df`.
        right_m_col: merge key in `right_df`.

    Returns:
        (outer, left): the outer merge carrying the `_merge` indicator
        column (both / left_only / right_only), and the left merge.
    """
    # Some of these sheets have repeated columns; drop them from the right
    # side, but never the column we still need to merge on.
    left_names = set(left_df.columns)
    repeated_cols = [
        col
        for col in right_df.columns.unique()
        if col in left_names and col != right_m_col
    ]
    right_df = right_df.drop(columns=repeated_cols)

    merge_keys = {"left_on": left_m_col, "right_on": right_m_col}

    # Outer merge is only for checking both/left only/right only values.
    outer = pd.merge(left_df, right_df, how="outer", indicator=True, **merge_keys)
    left = pd.merge(left_df, right_df, how="left", **merge_keys)

    return outer, left

## Outputs
### Manipulate TIRCP Project Sheet

In [61]:
# Columns requested from each sheet; every name carries its sheet prefix.
allocation_cols = [
    f"allocation_{field}"
    for field in ("ppno", "award_year", "components", "phase")
]
project_cols = [
    f"project_{field}"
    for field in (
        "ppno",
        "award_year",
        "project_#",
        "project_title",
        "project_description",
        "grant_recipient",
    )
]

In [62]:
# project2.groupby('project_award_year').size(), project2.groupby('project_award_year').project_ppno.nunique()

In [63]:
# Read in a joined allocation-project sheet, passing the two column lists
# defined above. "left" is presumably the join type - confirm against
# A1_data_prep.merge_allocation_project.
tircp = A1_data_prep.merge_allocation_project(project_cols, allocation_cols, "left")



In [64]:
# Keep only projects awarded in 2018 or later. Allocation phases are NOT
# filtered here; every phase is still present after this step.
tircp2 = tircp.loc[tircp["project_award_year"].ge(2018)].reset_index(drop=True)

In [65]:
# Double check that the PPNO are correct: count distinct PPNOs per award year.
tircp2.groupby("project_award_year").project_ppno.nunique()

project_award_year
2018    26
2020    17
2022    23
Name: project_ppno, dtype: int64

In [66]:
# Order rows by award year, then project number, then allocation phase.
tircp2 = tircp2.sort_values(
    by=["project_award_year", "project_project_#", "allocation_phase"]
)

In [67]:
# Build a "<award year>-<project #>" identifier, formatting the project
# number with the "02" spec so single digits get a leading zero.
# https://stackoverflow.com/questions/20990863/python-pandas-add-leading-zero-to-make-all-months-2-digits
tircp2["project_number"] = (
    tircp2["project_award_year"].astype(str)
    + "-"
    + tircp2["project_project_#"].map(lambda num: f"{num:02}")
)

In [68]:
# Count rows per allocation phase.
tircp2.allocation_phase.value_counts()

CONST    311
PS&E     134
PA&ED     40
R/W       28
Name: allocation_phase, dtype: int64

In [69]:
# Compare the total row count with the number of rows that would remain
# after de-duplicating on PPNO + allocation components.
len(tircp2), len(
    tircp2.drop_duplicates(subset=["project_ppno", "allocation_components"])
)

(515, 280)

In [70]:
# Keep one row per (PPNO, allocation components) pair.
# The first occurrence wins, and the frame was sorted by allocation phase
# above, so when the same components text appears under several phases the
# construction ("CONST") row is the one that is kept.
tircp3 = tircp2.drop_duplicates(
    subset=["project_ppno", "allocation_components"], keep="first"
).reset_index(drop=True)

In [71]:
# Re-check distinct PPNOs per award year after de-duplicating.
tircp3.groupby("project_award_year").project_ppno.nunique()

project_award_year
2018    26
2020    17
2022    23
Name: project_ppno, dtype: int64

In [72]:
# Projects without any allocation components text (Cp068 & cp106)
# fall back to the project description.
tircp3["allocation_components"] = tircp3["allocation_components"].fillna(
    tircp3["project_project_description"]
)

### Extract Outputs

In [73]:
# Derive "clean_components" from "allocation_components" with the A8_strings
# helper. Presumably it strips the words in description_words_to_delete -
# confirm in A8_strings.simplify_descriptions.
tircp3 = A8_strings.simplify_descriptions(
    tircp3,
    "allocation_components",
    "clean_components",
    A8_strings.description_words_to_delete,
)

In [74]:
# Drop the allocation-side PPNO / award year columns and the "_merge" column
# (presumably the merge indicator); the project-side columns are kept.
tircp3 = tircp3.drop(columns=["allocation_ppno", "allocation_award_year", "_merge"])

In [75]:
# Keyword groups used to tag each allocation component with output categories.
# Each list holds the words that signal one category.
track = ["track", "double track"]
rail = ["trolley", "train", "car", "coach", "rolling", "traffic", "light rail", "rail"]
bus = ["bus", "van", "buses", "microtransit", "micro transit", "zebs"]
ferry = ["ferry", "ferries", "vessel"]
active_transportation = [
    "bike",
    "shelter",
    "pedestrian",
    "cycle",
    "crosswalk",
    "sidewalk",
    "bicycle",
]
network = ["network"]
parking = ["parking", "lots", "deck"]
# NOTE(review): "center" is listed twice below; left as-is because the effect
# of a repeated keyword inside A8_strings.total_procurement_estimates is not
# visible from this notebook.
facilities = [
    "center",
    "facility",
    "station",
    "hub",
    "islands",
    "shelter",
    "shade",
    "location",
    "center",
    "stations"
]
charging = ["charging", "charge"]
storm_drain = ["storm"]
street = [
    "street",
    "signal",
    "signaling",
    "traffic control",
    "road",
    "lane",
    "surface",
    "interchange",
    "intersection",
    "shoulder",
    "grade separations",
    "crossing",
    "border",
    "corridor",
]
other_vehicles = ["vehicle", "zemu"]
bridge_tunnel = ["bridge", "tunnel"]
infrastructure = [
    "infrastructure",
    "bluff",
    "operation",
    "ramp",
    "mainline",
    "port",
    "fiber optic",
]
# NOTE(review): "ITS" is upper-case, but extract_keywords lower-cases the text
# before matching, so it can never match there. Lower-casing the keyword would
# also match the ordinary word "its", so it is left unchanged.
ontime_transit_improvements = [
    "ITS",
    "signage",  # was misspelled "signange", which could never match
    "mobile app",
    "ticket",
    "ridership",
    "expanded",
    "service",
    "time",
    "route",
]

In [76]:
# Keyword groups in processing order. Each entry is paired BY POSITION with
# the name at the same index in my_new_column_names_list, so the two lists
# must keep the same order and length.
my_keywords_list = [
    track,
    rail,
    bus,
    ferry,
    active_transportation,
    network,
    parking,
    facilities,
    charging,
    storm_drain,
    street,
    other_vehicles,
    bridge_tunnel,
    infrastructure,
    ontime_transit_improvements,
]

In [77]:
# Column-name stems, one per keyword group and in the same order as
# my_keywords_list. Later cells build "total_<stem>" and "new_<stem>" column
# names from these.
# NOTE: the stem is "bridge_tunnels" (plural) although the keyword list
# variable is named bridge_tunnel.
my_new_column_names_list = [
    "track",
    "rail",
    "bus",
    "ferry",
    "active_transportation",
    "network",
    "parking",
    "facilities",
    "charging",
    "storm_drain",
    "street",
    "other_vehicles",
    "bridge_tunnels",
    "infrastructure",
    "ontime_transit_improvements",
]

In [78]:
# Run the A8_strings helper over "clean_components" with the keyword groups
# and column stems above. Later cells read one "total_<stem>" column per
# group, so this call is presumably what creates them - confirm in A8_strings.
tircp3 = A8_strings.total_procurement_estimates(
    tircp3,
    "clean_components",
    my_keywords_list,
    my_new_column_names_list,
)

In [79]:
def fill_in_zeroes(df, keywords: list, description_column: str, new_col_name: str):
    """
    Flag rows whose description mentions at least one keyword.

    Adds `new_col_name` to `df` IN PLACE (callers rely on this) and also
    returns the same dataframe.

    Args:
        df: dataframe holding the description column.
        keywords: words/phrases to look for; they are joined with "|" and
            used as a regular expression, matching case-sensitively.
        description_column: name of the text column to search.
        new_col_name: name of the integer flag column to create.

    Returns:
        `df`, with `new_col_name` set to 1 where the description contains
        any keyword and 0 otherwise (missing descriptions count as 0, and
        an empty keyword list flags nothing).
    """
    if keywords:
        # Delineate items in the keywords list using |
        keywords_blob = "|".join(keywords)
        keyword_found = df[description_column].str.contains(keywords_blob, na=False)
        df[new_col_name] = keyword_found.astype(int)
    else:
        # An empty pattern would match every row, so handle it explicitly.
        df[new_col_name] = 0
    return df

In [80]:
# Turn this into a function later
# For every keyword group: flag the rows whose clean_components mention a
# keyword, then use that flag (0/1) wherever the existing total is 0.
for keywords, name in zip(my_keywords_list, my_new_column_names_list):
    total_col = f"total_{name}"
    flag_col = f"new_{name}"

    tircp3 = fill_in_zeroes(tircp3, keywords, "clean_components", flag_col)

    # Replace any zeroes in the original columns with 1
    # if a keyword is found
    # https://stackoverflow.com/questions/68243146/replace-zero-with-value-of-an-other-column-using-pandas
    tircp3[total_col] = tircp3[total_col].mask(
        tircp3[total_col].eq(0),
        tircp3[flag_col],
    )

# Drop the temporary flag columns by exact name (a regex on "new" would also
# hit any unrelated column containing those letters). drop() returns a new
# dataframe, so later assignments on tircp4 do not write into a slice.
tircp4 = tircp3.drop(columns=[f"new_{name}" for name in my_new_column_names_list])

In [81]:
# Inspect the columns that remain once the temporary "new_*" columns are gone.
tircp4.columns

Index(['project_ppno', 'project_award_year', 'project_project_#',
       'project_project_title', 'project_project_description',
       'project_grant_recipient', 'allocation_components', 'allocation_phase',
       'project_number', 'clean_components', 'total_track', 'total_rail',
       'total_bus', 'total_ferry', 'total_active_transportation',
       'total_network', 'total_parking', 'total_facilities', 'total_charging',
       'total_storm_drain', 'total_street', 'total_other_vehicles',
       'total_bridge_tunnels', 'total_infrastructure',
       'total_ontime_transit_improvements'],
      dtype='object')

In [82]:
# Extra terms searched for, in addition to the grouped keyword lists, when the
# "categories" column is built (passed to extract_keywords as more_keywords).
additional_keywords = [
    "microtransit",
    "emission",
    "conversion",
    "zero",
    "hydrogen",
    "battery",
    "electric",
    "hybrid",
    "zev",
    "zemu",
]

In [83]:
# Columns that identify one output row when categories are regrouped:
# project identifiers, the components text, and every total_* column.
groupby_cols = [
    "project_project_title",
    "project_award_year",
    "project_number",
    "allocation_components",
] + [
    f"total_{category}"
    for category in (
        "track",
        "rail",
        "bus",
        "ferry",
        "active_transportation",
        "network",
        "parking",
        "facilities",
        "charging",
        "storm_drain",
        "street",
        "other_vehicles",
        "bridge_tunnels",
        "infrastructure",
        "ontime_transit_improvements",
    )
]

In [84]:
def extract_keywords(df, list_of_words: list, more_keywords: list, column: str, 
                     unique_cols:list, cols_to_keep:list):
    """
    Extract keywords found in a certain column
    into a new column called "categories"

    Args:
        df: input dataframe; it is not modified.
        list_of_words: list of keyword lists (one list per category).
        more_keywords: extra individual keywords; not modified.
        column: text column to search (matched in lower case, whole words).
        unique_cols: columns used to sort rows and, together with `column`
            and the keyword itself, to decide which exploded rows are
            duplicates; not modified.
        cols_to_keep: columns that identify one output row; the keywords of
            all rows sharing these values are joined with ",".

    Returns:
        One row per distinct `cols_to_keep` combination with a "categories"
        string column. Rows where nothing matched get " ".
    """
    import re

    # Work on a fresh list so the caller's `more_keywords` does not grow
    # every time this function is called.
    all_keywords = list(more_keywords)
    for group in list_of_words:
        all_keywords.extend(group)

    # Keywords are literal words/phrases, so escape them before joining.
    query = "|".join(re.escape(word) for word in all_keywords)
    pattern = r"\b({})\b".format(query)

    # assign() returns a new frame, leaving the caller's dataframe untouched.
    tagged = df.assign(categories=df[column].str.lower().str.findall(pattern))

    # A keyword repeated within the same description should be listed once.
    # `column` is part of the key so rows with different descriptions under
    # the same project are never collapsed into one another.
    # (list.append returns None, so it cannot be used inline to build this.)
    dedupe_on = list(dict.fromkeys([*unique_cols, column, "categories"]))

    # All the category values are packed into a list. Unpack and del dups
    tagged = (
        tagged.explode("categories")
        .sort_values(unique_cols)
        .drop_duplicates(subset=dedupe_on)
        .fillna(" ")
    )

    # Regroup: join each row's keywords back into one comma-separated string.
    return tagged.groupby(cols_to_keep)["categories"].apply(",".join).reset_index()

In [85]:
# Tag every row with the keywords found in its allocation components text,
# then regroup on groupby_cols.
tircp4 = extract_keywords(
    df=tircp4,
    list_of_words=my_keywords_list,
    more_keywords=additional_keywords,
    column="allocation_components",
    unique_cols=["project_award_year", "project_project_title"],
    cols_to_keep=groupby_cols,
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


### Add in info from TIRCP Battery and Hydrogen Fuel Cell Bus list 10-10-2022 (1).xlsx

In [86]:
# Columns of the battery / hydrogen bus workbook that are dropped on load.
battery_drop_cols = [
    "local_agency_contact",
    "local_agency_email",
    "local_agency_phone_number",
    "awarded_allocated",
    "procured_contracted",
    "components",
]

In [87]:
# Load the battery / hydrogen fuel cell bus list without the columns above.
battery = read_in_files("TIRCP Battery and Hydrogen Fuel Cell Bus list 10-10-2022 (1).xlsx", battery_drop_cols)

In [88]:
# Inspect which columns survived the load-time filtering.
battery.columns

Index(['award_year', 'project_#', 'implementing_agency', 'in_operation',
       '#_hydrogen_fuel_cell_buses', '#_battery_electric_buses'],
      dtype='object')

In [89]:
# Same "<award year>-<project #>" identifier as on the TIRCP sheet, so the
# two tables can be merged on it.
battery["project_number"] = (
    battery["award_year"].astype(str)
    + "-"
    + battery["project_#"].map(lambda num: f"{num:02}")
)

In [92]:
# Total zero-emission buses per row: hydrogen fuel cell + battery electric.
battery["hydrogen_battery_buses"] = battery["#_hydrogen_fuel_cell_buses"].add(
    battery["#_battery_electric_buses"]
)

In [93]:
# Dry run as an outer merge: count how many rows match on project_number
# (both) versus appear on only one side (left_only / right_only).
tircp4.merge(battery[['project_number','hydrogen_battery_buses']], how="outer", on=["project_number"], indicator= True)[["_merge"]].value_counts()

_merge    
left_only     243
both           49
right_only      7
dtype: int64

In [94]:
# Attach the bus counts to every TIRCP row; rows without a match in the
# battery list get NaN in hydrogen_battery_buses.
tircp5 = tircp4.merge(battery[['project_number','hydrogen_battery_buses']], how="left", on=["project_number"])

In [95]:
# Where the keyword-based bus total is 0, use the count from the battery /
# hydrogen bus list instead; anything still missing becomes 0.
tircp5["total_bus"] = (
    tircp5["total_bus"]
    .where(tircp5["total_bus"].ne(0), tircp5["hydrogen_battery_buses"])
    .fillna(0)
)

In [96]:
# tircp4[['project_number','total_bus']].sort_values('project_number')

In [97]:
# tircp5[['project_number','total_bus','hydrogen_battery_buses']].sort_values('project_number')

### Clean

In [98]:
# Columns that go into the outputs tables: project title and year, the
# components text, the extracted categories, and every total_* column.
outputs_cols = [
    "project_project_title",
    "project_award_year",
    "allocation_components",
    "categories",
    *(
        f"total_{category}"
        for category in (
            "track",
            "rail",
            "bus",
            "ferry",
            "active_transportation",
            "network",
            "parking",
            "facilities",
            "charging",
            "storm_drain",
            "street",
            "other_vehicles",
            "bridge_tunnels",
            "infrastructure",
            "ontime_transit_improvements",
        )
    ),
]

In [99]:
# Keep only the columns needed for the outputs tables.
outputs = tircp5[outputs_cols]

In [100]:
# Tidy the column headers with the shared helper. The cells below refer to
# names such as "Award Year" and "Total Bus", so it presumably converts the
# snake_case names to title-style labels - confirm in A1_data_prep.
outputs = A1_data_prep.clean_up_columns(outputs)

In [101]:
# Cleaned-up names of the total columns that get aggregated below.
agg_cols = [
    f"Total {label}"
    for label in (
        "Track",
        "Rail",
        "Bus",
        "Ferry",
        "Active Transportation",
        "Network",
        "Parking",
        "Facilities",
        "Charging",
        "Storm Drain",
        "Street",
        "Other Vehicles",
        "Bridge Tunnels",
        "Infrastructure",
        "Ontime Transit Improvements",
    )
]

In [102]:
# Per-project view: within each (award year, title, components, categories)
# group, take the largest value of every total column.
outputs_project = outputs.groupby(
    ["Award Year", "Title", "Components", "Categories"]
).agg(dict.fromkeys(agg_cols, "max"))

In [103]:
# Per-year view: sum every total column by award year, then transpose so the
# years become columns and the totals become rows.
outputs_year = (
    outputs.groupby(["Award Year"])
    .agg(dict.fromkeys(agg_cols, "sum"))
    .transpose()
)

In [104]:
# Display the per-year totals (rows = output categories, columns = award years).
outputs_year

Award Year,2018,2020,2022
Total Track,13.0,1.0,6.0
Total Rail,27.0,40.0,12.0
Total Bus,409.0,64.0,182.0
Total Ferry,0.0,1.0,1.0
Total Active Transportation,4.0,0.0,9.0
Total Network,19.0,3.0,2.0
Total Parking,2.0,0.0,4.0
Total Facilities,30.0,7.0,43.0
Total Charging,7.0,0.0,11.0
Total Storm Drain,0.0,0.0,1.0


## Outcomes
### Clean up Projects Sheet

In [105]:
def clean_project():
    """
    Return the project sheet limited to award years 2018 onward.

    Starts from A1_data_prep.clean_project(), adds a
    "<award year>-<project #>" identifier in `project_number`, and keeps
    only the award year, title, grant recipient, PPNO and project number
    columns.
    """
    cleaned = A1_data_prep.clean_project()

    recent = cleaned[cleaned["project_award_year"] >= 2018].reset_index(drop=True)

    padded_number = recent["project_project_#"].map(lambda num: f"{num:02}")
    recent["project_number"] = recent["project_award_year"].astype(str) + "-" + padded_number

    wanted_cols = [
        "project_award_year",
        "project_project_title",
        "project_grant_recipient",
        "project_ppno",
        "project_number",
    ]
    return recent[wanted_cols]

In [106]:
# Subset tircp: project-level rows for award years 2018 onward, identifier
# columns only.
project = clean_project()



In [107]:
# Row/column count of the project subset, used as the baseline for the merges below.
project.shape

(68, 5)

### AwardedProjectsDetail.xlsx

In [108]:
# Columns of AwardedProjectsDetail.xlsx that are dropped on load.
drisi_drop_cols = [
    "agency_code",
    "agency_short_name",
    "sub_program_description",
    "agency_name",
    "program_short_name",
    "program_name",
    "program_description",
    "sub_program_short_name",
    "sub_program_name",
    "project_type",
    "agency",
    "program",
    "date_imported",
    "contractor_or_awardee_admin_expenses",
    "voucher_id",
    "project_is_completed",
    "project_is_canceled"
]

In [109]:
# Load the awarded-projects workbook without the columns listed above.
drisi = read_in_files("AwardedProjectsDetail.xlsx", drisi_drop_cols)

In [110]:
# Size of the awarded-projects table after filtering.
drisi.shape

(63, 33)

In [111]:
# Merge awarded-project details onto the project rows (project_number on the
# left, project_id on the right). outer_drisi is only for checking matches;
# m1 is the left merge that is carried forward.
outer_drisi, m1 = merge_value_counts(project, drisi, "project_number", "project_id",) 

In [112]:
# Remove the right-hand merge key and bookkeeping columns that are not reported.
m1 = m1.drop(columns = ['record_type', "project_id", "project_life_years", "project_status"])

In [113]:
# Row count should still equal the number of project rows after a left merge.
m1.shape

(68, 34)

### ImplementedProjectsDetail.xlsx

In [114]:
# Columns of ImplementedProjectsDetail.xlsx that are dropped on load.
implemented_drop_cols = [
    "programuniqueidentifier",
    "record_type",
    "reporting_cycle_name",
    "agency_short_name",
    "agency_name",
    "date_operational",
    "program_name",
    "program_description",
    "project_completion_date",
    "date_imported",
    "sub_program_name",
    "date_selected_for_award",
    "project_name",
    "project_type",
    "fiscal_year_funding_project",
    "census_tract",
    "address",
    "lat_long",
    "total_program_ggrffunding",
    "voucher_name",
    "voucher_description",
]

In [115]:
# Load the implemented-projects workbook without the columns listed above.
implemented = read_in_files("ImplementedProjectsDetail.xlsx", implemented_drop_cols)

In [116]:
# Add implemented-project details (project_number on the left,
# project_idnumber on the right); m2 is the left merge carried forward.
outer_implemented, m2 = merge_value_counts(m1, implemented, "project_number", "project_idnumber") 

In [117]:
# Remove the right-hand merge key and record-id columns that are not reported.
m2 = m2.drop(columns = ['project_count',"project_idnumber",'proj_rec_id','voucher_id'])

In [118]:
# Why does the df become so large?
# The left merge grew from 68 to 171 rows, which can only happen when some
# project numbers match more than one row in `implemented` (one-to-many).
m2.shape

(171, 90)

In [119]:
# Collapse back to one row per (PPNO, project number), keeping the first match.
# NOTE(review): values from the other matching `implemented` rows are
# discarded here - confirm the first row is the one that should be reported.
m2 = m2.drop_duplicates(subset = ["project_ppno", "project_number"])

In [120]:
# Confirm the row count is back to one row per project.
m2.shape

(68, 90)

### OutcomeProjectsDetail.xlsx 

In [121]:
# Columns of OutcomeProjectsDetail.xlsx that are dropped on load.
outcomes_drop_cols = [
    "unnamed:_0",
    "proj_rec_id",
    "reporting_cycle_name",
    "agency_short_name",
    "agency_name",
    "program_short_name",
    "program_name",
    "program_description",
    "record_type",
    "sub_program_short_name",
    "sub_program_name",
    "sub_program_description",
    "date_imported",
]

In [122]:
# Load the outcomes workbook without the columns listed above.
outcomes = read_in_files("OutcomeProjectsDetail.xlsx", outcomes_drop_cols)

In [123]:
# outcomes.sort_values('projectid_number')

In [124]:
# list(outcomes.columns)

In [125]:
# Add outcome details (project_number on the left, projectid_number on the
# right); outer_outcomes is for checking matches, m3 is carried forward.
outer_outcomes, m3 = merge_value_counts(m2, outcomes, "project_number", "projectid_number") 

In [126]:
# How many rows matched (both) versus exist on only one side of the merge.
outer_outcomes._merge.value_counts()

left_only     67
right_only    57
both           3
Name: _merge, dtype: int64

In [127]:
# List the outcome project ids that have no matching project row. Project
# rows were limited to award years >= 2018, so ids from earlier award years
# cannot match.
outer_outcomes.loc[outer_outcomes._merge == "right_only"]['projectid_number'].unique()

array(['2015-07', '2015-06', '2015-01', '2015-04', '2015-10', '2015-09',
       '2015-12', '2015-05', '2016-04', '2016-01', '2016-12', '2015-02',
       '2015-08', '2015-11', '2016-08', '2015-14', '2015-13', '2016-05'],
      dtype=object)

### Clean 

In [132]:
# Remove the right-hand merge key and the ab1550choice column before reporting.
m3 = m3.drop(columns = ["projectid_number","ab1550choice"])

In [134]:
# Tidy the column headers with the shared helper. The cells below refer to
# names such as "Award Year" and "Title", so it presumably produces
# title-style labels - confirm in A1_data_prep.
m3 = A1_data_prep.clean_up_columns(m3)

In [140]:
# Every column other than the project identifiers is treated as a value column.
value_cols = [
    col
    for col in m3.columns
    if col not in {"Award Year", "Number", "Title", "Grant Recipient", "Ppno"}
]

In [156]:
# m3.info(verbose=True)

In [157]:
# Per-project view: largest value of every value column for each title,
# transposed so projects become columns.
outcomes_project = (
    m3.groupby("Title")
    .agg(dict.fromkeys(value_cols, "max"))
    .transpose()
)

In [158]:
# Per-year view: sum of every value column by award year, transposed so the
# years become columns.
# NOTE(review): value_cols may include text columns (read_in_files fills
# missing text with "None"); "sum" concatenates strings - confirm intended.
outcomes_year = (
    m3.groupby("Award Year")
    .agg(dict.fromkeys(value_cols, "sum"))
    .transpose()
)

### TIRCP_AllProjects_12212022 (002).xlsx
* Doesn't have anything interesting.

In [159]:
# Load the all-projects workbook with no extra columns dropped.
# read_in_files fills empty values itself: 0.0 in float64 columns and the
# string "None" in object columns.
all_projects = read_in_files("TIRCP_AllProjects_12212022 (002).xlsx", [])

In [160]:
# all_projects.sample()

## Save

In [161]:
# Write the four summary tables to one workbook, each on its own sheet.
# outcomes_project previously reused the "outcomes_year" sheet name, so the
# two outcome tables collided on a single sheet.
with pd.ExcelWriter(f"{GCS_FILE_PATH}drisi_outcomes_outputs.xlsx") as writer:
    outputs_project.to_excel(writer, sheet_name="outputs_project", index=True)
    outputs_year.to_excel(writer, sheet_name="outputs_year", index=True)
    outcomes_project.to_excel(writer, sheet_name="outcomes_project", index=True)
    outcomes_year.to_excel(writer, sheet_name="outcomes_year", index=True)

### Save

In [162]:
"""
with pd.ExcelWriter(f"{GCS_FILE_PATH}calsta_draft.xlsx") as writer:
    outcomes.to_excel(writer, sheet_name="outcomes_unpivoted", index=True)
    outcomes_transformed.to_excel(writer, sheet_name="outcomes_transformed", index=True)
    projects.to_excel(writer, sheet_name="projects", index=True)
    year_summary.to_excel(writer, sheet_name="year_summary", index=True)
    GHG_by_year.to_excel(writer, sheet_name="GHG_reduction_year", index=True)
    """

'\nwith pd.ExcelWriter(f"{GCS_FILE_PATH}calsta_draft.xlsx") as writer:\n    outcomes.to_excel(writer, sheet_name="outcomes_unpivoted", index=True)\n    outcomes_transformed.to_excel(writer, sheet_name="outcomes_transformed", index=True)\n    projects.to_excel(writer, sheet_name="projects", index=True)\n    year_summary.to_excel(writer, sheet_name="year_summary", index=True)\n    GHG_by_year.to_excel(writer, sheet_name="GHG_reduction_year", index=True)\n    '