## TIRCP DRISI
* DRISI emailed a request asking for TIRCP outcomes for cycles 3-5.  
* [Cycles 1-6](https://calsta.ca.gov/subject-areas/transit-intercity-rail-capital-prog)
* Cycle 1: 2015
* Cycle 2: 2016
* Cycle 3: 2018
* Cycle 4: 2020
* Cycle 5: 2022
* Cycle 6: 2023

<b>Notes 2/7/23</b>
* The application is different from what we have in the dataset.
* Outputs: hydrogen-battery fuel cell.
* DRISI wants the data before end of this week.
* Outputs
    * Category for outputs: transit/multimodal or rail/multimodal. 
    * Pull it from `allocations` tab.
    * Filter out design categories. We only care about 'CONST'.
    * Transit/Multimodal
        * Buses breakout between mobile/school/etc. Unit is each.
            * Microtransit
            * Bus Conversion
            * Zero emission (don't distinguish between hydrogen and battery)
            * Battery Electric
            * Ferries
    * Rail/Multimodal
        * Trolleys
        * Trains/Cars/Coach/Rolling Stocks
        * Traffic Control
    * Track
        * Track Extension
        * Double Track
    * Network Integration improves multimodal network so everything is synced. 
    * Parking Lots/Parking Deck
    * Active Transportation
        * Bike Shelters/Shade Structures
        * Bike and Pedestrians Improvements
    * Facilities
        * Center/Facility/Station/Hub/Islands
    * Charging Infrastructure. 
    * Storm Drain Line
    * Street Extension
    * Charging
    * Signaling
* Outcomes
    * On Time Performance
        * Expanded Service
        * Routes 
        * Ridership
        * 

In [None]:
import A1_data_prep
import A2_tableau
import A8_strings
import numpy as np
import pandas as pd
from babel.numbers import format_currency
from calitp import *

In [None]:
# Notebook display settings: up to 125 columns, floats shown with two
# decimals, and no limit on the number of rows or on cell width.
pd.set_option("display.max_columns", 125)
pd.set_option("display.float_format", "{:.2f}".format)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [None]:
# GCS File Path:
GCS_FILE_PATH = A1_data_prep.GCS_FILE_PATH

In [None]:
def read_in_files(file_name: str, columns_to_drop: list):
    """Load an Excel file from the project bucket and tidy it.

    Args:
        file_name: Path of the workbook, appended to
            ``A1_data_prep.GCS_FILE_PATH``.
        columns_to_drop: Column names to remove after the column names have
            been passed through ``to_snakecase``.

    Returns:
        The loaded DataFrame without the dropped columns, with missing
        values in float64 columns set to 0.0 and missing values in object
        columns set to the string "None".
    """
    raw = pd.read_excel(f"{A1_data_prep.GCS_FILE_PATH}{file_name}")
    trimmed = to_snakecase(raw).drop(columns=columns_to_drop)

    # One fill value per column, chosen from that column's dtype.
    fill_values = trimmed.dtypes.replace({"float64": 0.0, "object": "None"})

    return trimmed.fillna(fill_values)

## Outputs
### Manipulate TIRCP Project Sheet

In [None]:
# Allocation-sheet columns handed to A1_data_prep.merge_allocation_project.
allocation_cols = [
    "allocation_ppno",
    "allocation_award_year",
    "allocation_components",
    "allocation_phase",
]

# Project-sheet columns handed to A1_data_prep.merge_allocation_project.
project_cols = [
    "project_ppno",
    "project_award_year",
    "project_project_#",
    "project_project_title",
    "project_project_description",
    "project_grant_recipient",
]

In [None]:
project = A1_data_prep.clean_project()

In [None]:
project2 = project.loc[project["project_award_year"] >=2018].reset_index(drop = True)

In [None]:
project2.groupby('project_award_year').size(), project2.groupby('project_award_year').project_ppno.nunique()

In [None]:
tircp = A1_data_prep.merge_allocation_project(project_cols, allocation_cols,"left")

In [None]:
# Keep only rows with an award year of 2018 or later.
# NOTE(review): no allocation-phase filter is applied here, so
# non-construction rows are still present after this step.
tircp2 = tircp[(tircp["project_award_year"] >= 2018)].reset_index(drop = True)

In [None]:
tircp2.groupby('project_award_year').project_ppno.nunique()

In [None]:
# Sort df by award year and number
tircp2 = tircp2.sort_values(["project_award_year", "project_project_#", "allocation_phase"])

In [None]:
# Build a "<award year>-<project #>" identifier for every row.
# NOTE(review): both parts are concatenated as-is; no zero-padding is applied
# to the project number here. Padding reference, if it is needed later:
# https://stackoverflow.com/questions/20990863/python-pandas-add-leading-zero-to-make-all-months-2-digits
award_year_text = tircp2["project_award_year"].astype(str)
project_num_text = tircp2["project_project_#"].astype(str)
tircp2["project_number"] = award_year_text + "-" + project_num_text

In [None]:
tircp2.allocation_phase.value_counts()

In [None]:
len(tircp2), len(tircp2.drop_duplicates(subset = ["project_ppno", "allocation_components"]))

In [None]:
# Keep one row per (ppno, allocation components) pair. drop_duplicates keeps
# the first occurrence, and tircp2 was sorted by allocation phase earlier, so
# when the same components repeat across phases the first-sorted phase is the
# one retained (intended to be construction).
deduped = tircp2.drop_duplicates(subset=["project_ppno", "allocation_components"])
tircp3 = deduped.reset_index(drop=True)

In [None]:
tircp3.groupby('project_award_year').project_ppno.nunique()

### Extract Outputs

In [None]:
# Simplify description and components
#tircp3 = A8_strings.simplify_descriptions(
#    tircp3, "project_project_description", "clean_description", A8_strings.description_words_to_delete
#)

In [None]:
tircp3 = A8_strings.simplify_descriptions(
    tircp3, "allocation_components", "clean_components", A8_strings.description_words_to_delete
)

In [None]:
tircp3 = tircp3.drop(columns = ["allocation_ppno","allocation_award_year", "_merge"])

In [None]:
# Keyword groups, one list per output category. Each list is later collected
# into my_keywords_list and matched against the "clean_components" text.
# need plurals? 
# NOTE(review): fill_in_zeroes matches these as plain substrings, so short
# keywords such as "car" or "van" also match inside longer words.
track = ["track","double track"]
# NOTE(review): "traffic" here also matches the "traffic control" entry under street.
rail = ["trolley","train","car","coach","rolling","traffic", "light rail", "rail"]
bus = ["bus","van","buses","microtransit","micro transit"]
ferry = ["ferry","ferries", "vessel"]
# NOTE(review): "shelter" appears both here and under facilities.
active_transportation = ["bike","shelter", "pedestrian","cycle", "crosswalk","sidewalk"]
network = ["network"]
parking = ["parking", "lots", "deck"]
# NOTE(review): "center" is listed twice in this list.
facilities = ["center","facility","station","hub","islands","shelter", "shade", "location","center"]
charging = ["charging", "charge"] 
storm_drain = ["storm"]
street = ["street", "signal", "signaling", "traffic control", "road","lane"]
other_vehicles = ["vehicles","emu","emus"]
bridge_tunnel = ["bridge","tunnel"]
infrastructure = ["infrastructure","bluff"]

In [None]:
my_keywords_list = [track,rail,bus,ferry,active_transportation,
                    network,parking,facilities, charging, 
                    storm_drain, street, other_vehicles, bridge_tunnel, infrastructure]

In [None]:
my_new_column_names_list = ["track","rail","bus","ferry","active_transportation",
                    "network","parking","facilities", "charging", 
                    "storm_drain", "street", "other_vehicles", "bridge_tunnels","infrastructure"]

In [None]:
tircp3 = A8_strings.total_procurement_estimates(tircp3,
    "clean_components",
    my_keywords_list,
    my_new_column_names_list,)

In [None]:
def fill_in_zeroes(df, keywords: list, description_column: str, new_col_name: str):
    """Flag rows whose description mentions any of the given keywords.

    Adds ``new_col_name`` to ``df`` in place: 1 where the text in
    ``description_column`` contains at least one keyword (case-sensitive
    substring match), 0 otherwise. Missing descriptions count as 0.

    Args:
        df: DataFrame holding the description column; it is modified in place.
        keywords: Literal strings to look for. They are escaped before being
            used in a regular expression, so characters such as "(" or "+"
            are matched literally. An empty list flags nothing.
        description_column: Name of the text column to search.
        new_col_name: Name of the 0/1 integer column to create or overwrite.

    Returns:
        The same ``df`` object, with ``new_col_name`` added.
    """
    import re

    if not keywords:
        # An empty alternation would match every row; nothing to look for
        # means nothing is flagged.
        df[new_col_name] = 0
        return df

    # Delineate the escaped keywords with | so that any one of them matches.
    keywords_pattern = "|".join(re.escape(word) for word in keywords)

    found = df[description_column].str.contains(
        keywords_pattern, regex=True, na=False
    )
    df[new_col_name] = found.astype(int)
    return df

In [None]:
# tircp3 = fill_in_zeroes(tircp3, network, "clean_components", "new_network")

In [None]:
# Flag each category using its own keyword list. my_keywords_list and
# my_new_column_names_list are parallel, so walk them in lockstep; nesting the
# two loops would overwrite every new_* column with each keyword list in turn.
for keywords, category in zip(my_keywords_list, my_new_column_names_list):
    tircp4 = fill_in_zeroes(tircp3, keywords, "clean_components", f"new_{category}")

In [None]:
# For every category, back-fill a total of 0 with the keyword flag so that a
# component mentioning the category is counted at least once.
for keywords, category in zip(my_keywords_list, my_new_column_names_list):
    new_col = f"new_{category}"
    total_col = f"total_{category}"

    # fill_in_zeroes adds new_col to tircp3 in place and returns that frame.
    tircp4 = fill_in_zeroes(tircp3, keywords, "clean_components", new_col)

    # https://stackoverflow.com/questions/68243146/replace-zero-with-value-of-an-other-column-using-pandas
    # Where the total is 0, take the value of the keyword flag instead.
    tircp4[total_col] = tircp4[total_col].mask(
        tircp4[total_col].eq(0), tircp4[new_col]
    )

    # Drop the temporary flag columns; the pattern is anchored so that only
    # columns starting with "new_" are removed.
    tircp4 = tircp4[tircp4.columns.drop(list(tircp4.filter(regex="^new_")))]

In [None]:
tircp4.columns

* Add in 
* signage, mobile app development
* service ticket integration
* What about projects that don't have an allocation yet? Their outputs have to be drawn from the project description instead.

In [None]:
tircp4[[ 'allocation_components', 'clean_components', 'total_track', 'total_rail',
       'total_bus', 'total_ferry', 'total_active_transportation',
       'total_network', 'total_parking', 'total_facilities', 'total_charging',
       'total_storm_drain', 'total_street', 'total_other_vehicles','total_bridge_tunnels', 'total_infrastructure']]

## Outcomes
### AwardedProjectsDetail.xlsx

In [None]:
drisi = to_snakecase(pd.read_excel(f"{GCS_FILE_PATH}DRISI/AwardedProjectsDetail.xlsx"))

In [None]:
drisi_drop_cols = [
    "agency_code",
    "agency_short_name",
    "sub_program_description",
    "agency_name",
    "program_short_name",
    "program_name",
    "program_description",
    "sub_program_short_name",
    "sub_program_name",
    "project_type",
    "agency",
    "program",
    "date_imported",
]

In [None]:
drisi = drisi.drop(columns=drisi_drop_cols)

In [None]:
drisi = drisi.fillna(drisi.dtypes.replace({"float64": 0.0, "object": "None"}))

In [None]:
# drisi.info()

In [None]:
drisi.sample()

### ImplementedProjectsDetail.xlsx

In [None]:
implemented = to_snakecase(
    pd.read_excel(f"{GCS_FILE_PATH}DRISI/ImplementedProjectsDetail.xlsx")
)

In [None]:
# list(implemented.columns)

In [None]:
# implemented.sample()

In [None]:
implemented_drop_cols = [
    "programuniqueidentifier",
    "record_type",
    "reporting_cycle_name",
    "agency_short_name",
    "agency_name",
    "date_operational",
    "program_name",
    "program_description",
    "project_completion_date",
    "date_imported",
    "sub_program_name",
    "date_selected_for_award",
    "project_name",
    "project_type",
    "fiscal_year_funding_project",
    "census_tract",
    "address",
    "lat_long",
    "total_program_ggrffunding",
    "voucher_name",
    "voucher_description",
]

In [None]:
implemented = implemented.drop(columns=implemented_drop_cols)

In [None]:
implemented = implemented.fillna(
    implemented.dtypes.replace({"float64": 0.0, "object": "None"})
)

In [None]:
# implemented.info(verbose=True, show_counts=True)

### OutcomeProjectsDetail.xlsx 

In [None]:
outcomes = to_snakecase(
    pd.read_excel(f"{GCS_FILE_PATH}DRISI/OutcomeProjectsDetail.xlsx")
)

In [None]:
outcomes_drop_cols = [
    "unnamed:_0",
    "proj_rec_id",
    "reporting_cycle_name",
    "agency_short_name",
    "agency_name",
    "program_short_name",
    "program_name",
    "program_description",
    "record_type",
    "sub_program_short_name",
    "sub_program_name",
    "sub_program_description",
    "date_imported",
]

In [None]:
outcomes = outcomes.drop(columns=outcomes_drop_cols)

In [None]:
# list(outcomes.columns)

In [None]:
outcomes = outcomes.fillna(outcomes.dtypes.replace({"float64": 0.0, "object": "None"}))

### TIRCP_AllProjects_12212022 (002).xlsx
* Doesn't have anything interesting.

In [None]:
# Fill in empty values: 0.0 for float64 columns, 0 for int64 columns and the
# string "None" for object columns.
# NOTE(review): `projects` is not assigned in any cell visible before this
# one; confirm where it is loaded before running.
projects = projects.fillna(
    projects.dtypes.replace({"float64": 0.0, "object": "None", "int64": 0})
)

### TIRCP Battery and Hydrogen Fuel Cell Bus list 10-10-2022 (1).xlsx

In [None]:
battery = to_snakecase(
    pd.read_excel(
        f"{GCS_FILE_PATH}DRISI/TIRCP Battery and Hydrogen Fuel Cell Bus list 10-10-2022 (1).xlsx"
    )
)

In [None]:
battery_drop_cols = [
    "local_agency_contact",
    "local_agency_email",
    "local_agency_phone_number",
    "awarded_allocated",
    "procured_contracted",
    "components",
]

In [None]:
battery = battery.drop(columns=battery_drop_cols)

In [None]:
battery = battery.fillna(battery.dtypes.replace({"float64": 0.0, "object": "None"}))

In [None]:
battery["project_#"] = battery["project_#"].map("{:02}".format)

In [None]:
# Build the "<award year>-0<project #>" key for the battery list.
# NOTE(review): project_# was already formatted with "{:02}" in the previous
# cell, so this prepends a further "0"; tircp2's project_number is built
# without padding. Confirm the two formats are meant to line up.
battery["project_number"] = (
    battery["award_year"].astype(str) + "-" + "0" + battery["project_#"].astype(str)
)

In [None]:
battery.sample()

In [None]:
# Count how many project numbers appear in both tables, only in tircp, or
# only in the battery list.
# NOTE(review): project_number is added to tircp2 in an earlier cell, not to
# tircp; confirm tircp carries this column before merging on it.
tircp.merge(battery, how="outer", on=["project_number"], indicator=True)[
    ["_merge"]
].value_counts()

In [None]:
# Format monetary cols as US-dollar strings.
monetary_cols = ["total__cost", "tircp"]
for money_col in monetary_cols:
    projects[money_col] = projects[money_col].map(
        lambda amount: format_currency(amount, currency="USD", locale="en_US")
    )

In [None]:
# Clean up column names
projects = A1_data_prep.clean_up_columns(projects)

In [None]:
# Rename selected columns to their final display names. Each source name is
# listed once; a repeated dictionary key would silently shadow the earlier one.
projects = projects.rename(
    columns={
        "Number Use": "Project Number",
        "Assembly\nDistricts": "Assembly Districts",
        "Senate\nDistricts": "Senate Districts",
        "Caltransdistrict": "CT Districts",
    }
)

In [None]:
# Rearrange columns
right_order = [
    "Award Year",
    "#",
    "Project Number",
    "Ppno",
    "Title",
    "Grant Recipient",
    "Tircp",
    "Total  Cost",
    "Description",
    "District",
    "County",
    "Status",
    "CT Districts",
    "Assembly Districts",
    "Senate Districts",
    "City Code",
    "County Code",
    "Implementing Agency  Id",
]

In [None]:
projects = projects[right_order]

### Outcomes Sheet

In [None]:
# Measure columns
measure_cols = [
    "estimated_tircp_ghg_reductions",
    "cost_per_ghg_ton_reduced",
    "increased_ridership",
    "service_integration",
    "improve_safety",
]

In [None]:
# Prepare estimated GHG reductions for numeric conversion: strip the unit
# label, the "None" placeholder and the thousands separators, in that order.
ghg_text = merge2["estimated_tircp_ghg_reductions"]
for unwanted in ("MTCO2e", "None", ","):
    ghg_text = ghg_text.str.replace(unwanted, "")
merge2["estimated_tircp_ghg_reductions"] = ghg_text

In [None]:
# Convert the cleaned text to numbers in a single vectorized call; values that
# cannot be parsed become NaN and are then set to 0.
merge2["estimated_tircp_ghg_reductions"] = pd.to_numeric(
    merge2["estimated_tircp_ghg_reductions"], errors="coerce"
).fillna(0)

In [None]:
# Subset to cols similar to SCCP, ordered by award year and then by title.
outcome_sort_keys = ["award_year", "detailed_title_col"]
outcome_measures = [
    "estimated_tircp_ghg_reductions",
    "increased_ridership",
    "service_integration",
    "improve_safety",
]
outcomes = merge2[outcome_sort_keys + outcome_measures].sort_values(outcome_sort_keys)

In [None]:
outcomes = A1_data_prep.clean_up_columns(outcomes)

In [None]:
outcomes.head(1)

##### Version 1

In [None]:
# Drop award year
outcomes_transformed = outcomes.drop(columns=["Award Year"]).T

In [None]:
# Make first row to column names
outcomes_transformed.columns = outcomes_transformed.iloc[0]

In [None]:
# Del first row
outcomes_transformed = outcomes_transformed.iloc[1:]

In [None]:
outcomes_transformed.head()

##### Outputs: Measures except GHG Reductions.

In [None]:
# Reshape to long form: one row per project and non-GHG measure.
outcomes_melt = outcomes.melt(
    id_vars=["Award Year", "Detailed Title Col"],
    value_vars=["Increased Ridership", "Service Integration", "Improve Safety"],
)

In [None]:
outcomes_melt = A1_data_prep.clean_up_columns(outcomes_melt)

In [None]:
# Count distinct projects for each (award year, measure, value) combination.
year_summary = (
    outcomes_melt.groupby(["Award Year", "Variable", "Value"])["Detailed Title Col"]
    .nunique()
    .to_frame("Number of Projects in this Value Category")
)

In [None]:
year_summary

In [None]:
# Total estimated GHG reductions per award year.
GHG_by_year = outcomes.groupby(["Award Year"])[
    ["Estimated Tircp Ghg Reductions"]
].sum()

In [None]:
GHG_by_year

#### Save

In [None]:
"""
with pd.ExcelWriter(f"{GCS_FILE_PATH}calsta_draft.xlsx") as writer:
    outcomes.to_excel(writer, sheet_name="outcomes_unpivoted", index=True)
    outcomes_transformed.to_excel(writer, sheet_name="outcomes_transformed", index=True)
    projects.to_excel(writer, sheet_name="projects", index=True)
    year_summary.to_excel(writer, sheet_name="year_summary", index=True)
    GHG_by_year.to_excel(writer, sheet_name="GHG_reduction_year", index=True)
    """