## TIRCP Calsta
* TIRCP outcomes for cycles 3-5 for California State Transportation Agency. 
* [Cycles 1-6](https://calsta.ca.gov/subject-areas/transit-intercity-rail-capital-prog)
* Cycle 1: 2015
* Cycle 2: 2016
* Cycle 3: 2018
* Cycle 4: 2020
* Cycle 5: 2022
* Cycle 6: 2023

In [324]:
import re
from collections import Counter

import A1_data_prep
import A2_tableau
import numpy as np
import pandas as pd
from babel.numbers import format_currency
from calitp import *

In [325]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [326]:
# GCS File Path:
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/tircp/"

### Filter out for cycles of interest

In [327]:
df_tircp = to_snakecase(A2_tableau.tableau_dashboard())

  warn(msg)


In [328]:
df_tircp2 = df_tircp.loc[df_tircp["award_year"] >= 2018].reset_index(drop=True)

In [329]:
df_tircp2.award_year.value_counts(), len(df_tircp2)

(2018    28
 2022    23
 2020    17
 Name: award_year, dtype: int64,
 68)

In [330]:
df_tircp2.ppno.nunique(), df_tircp2.title.nunique()

(59, 67)

### Add info based on SCCP's output example
Project ID	Project Name	Implementing Agency	Program	Project Description	 Total Cost 	 SB 1 Funds 	Fiscal Year	Is SB 1?	Project Status	Assembly Districts	Senate Districts	Counties	Cities	Caltrans Districts	Is on SHS?	Date Updated	Cycle


#### GIS Template has Assembly District/Senate District/City/Counties info

In [331]:
# Read in sheet with Assembly info.
gis = to_snakecase(
    pd.read_excel(
        f"{GCS_FILE_PATH}TIRCP_GIS_Template_Requirements 6-1-2022.xlsx",
        sheet_name="Projects Table",
    )
)

In [332]:
# Clean some column names
gis = gis.rename(
    columns={
        "ppno_": "ppno",
        "assembly\ndistricts": "assembly_districts",
        "senate\ndistricts": "senate_districts",
        "caltrans\ndistrict": "CT_district",
    }
)

In [333]:
# Clean PPNO
gis = A1_data_prep.ppno_slice(gis)

In [334]:
# Subset for only cols of interest
gis2 = gis[
    [
        "project_number",
        "ppno",
        "projecttitle",
        "projectstatus",
        "assembly_districts",
        "senate_districts",
        "city_code",
        "CT_district",
        "county_code",
    ]
]

In [335]:
gis2.ppno.nunique()

45

In [336]:
# There are mulitple entries for each ppno.
gis2.ppno.value_counts().head()

CP033    60
CP035    21
CP042    18
CP032    14
CP031    11
Name: ppno, dtype: int64

In [337]:
# Clean project_number, only keep year
gis2["project_number"] = gis2["project_number"].str.split(":").str[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gis2["project_number"] = gis2["project_number"].str.split(":").str[0]


In [338]:
gis2["project_number"] = gis2["project_number"].fillna(0).astype("int64")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gis2["project_number"] = gis2["project_number"].fillna(0).astype("int64")


In [339]:
# Place project status all on one row.
# Remove duplicate statuses
def summarize_rows(df, col_to_group: str, col_to_summarize: str):
    df = df.groupby(col_to_group)[col_to_summarize].apply(",".join).reset_index()

    df[col_to_summarize] = (
        df[col_to_summarize]
        .apply(lambda x: ", ".join(set([y.strip() for y in x.split(",")])))
        .str.strip()
    )
    return df

In [340]:
project_status_gis = summarize_rows(gis2, "ppno", "projectstatus")

In [341]:
# Check that each row matches the number of unique ppno
len(project_status_gis) == gis2.ppno.nunique()

True

In [342]:
# Drop old project status
gis2 = gis2.drop(columns=["projectstatus"])

In [343]:
# Merge with original gis, so there is only one row for each PPNO
final_gis = (
    pd.merge(project_status_gis, gis2, how="left", on=["ppno"])
    .drop_duplicates("ppno")
    .reset_index(drop=True)
)

#### Merge with TIRCP Tracking

In [344]:
# Merge with df_tircp2
merge1 = pd.merge(
    df_tircp2,
    final_gis,
    how="left",
    left_on=["ppno", "award_year"],
    right_on=["ppno", "project_number"],
    indicator=True,
)

In [345]:
merge1._merge.value_counts()

both          43
left_only     25
right_only     0
Name: _merge, dtype: int64

In [346]:
# Double Check
"""
merge1.loc[merge1['_merge'] == 'both'][
    [
        "award_year",
        'project_number',
        "title",
        "ppno",
        "projecttitle",
        "_merge",
        
    ]
]
"""

'\nmerge1.loc[merge1[\'_merge\'] == \'both\'][\n    [\n        "award_year",\n        \'project_number\',\n        "title",\n        "ppno",\n        "projecttitle",\n        "_merge",\n        \n    ]\n]\n'

### Project Sheet 

In [347]:
# Subset to cols simila to SCCP
projects = merge1[
    [
        "award_year",
        "ppno",
        "title",
        "grant_recipient",
        "district",
        "county",
        "description",
        "total__cost",
        "tircp",
        "expended_amount",
        "projectstatus",
        "assembly_districts",
        "county_code",
        "senate_districts",
        "city_code",
    ]
]

In [348]:
projects = projects.fillna(projects.dtypes.replace({"float64": 0.0, "object": "None", "int64": 0}))

In [349]:
monetary_cols = ['total__cost', 'tircp', 'expended_amount']

In [350]:
for i in monetary_cols:
    projects[i] = projects[i].apply(
    lambda x: format_currency(x, currency="USD", locale="en_US"))

In [375]:
projects = A1_data_prep.clean_up_columns(projects)

### Outcomes Sheet

In [352]:
# Create a detailed title column
merge1["award_year"] = merge1["award_year"].astype("object")

In [353]:
detailed_title_cols = [
    "award_year",
    "title",
    "grant_recipient",
]

In [354]:
# https://stackoverflow.com/questions/39291499/how-to-concatenate-multiple-column-values-into-a-single-column-in-pandas-datafra
merge1["detailed_title_cols"] = merge1[detailed_title_cols].apply(
    lambda row: "-".join(row.values.astype(str)), axis=1
)

In [355]:
# Measure columns 
measure_cols=[ "estimated_tircp_ghg_reductions",
        "cost_per_ghg_ton_reduced",
        "increased_ridership",
        "service_integration",
        "improve_safety",]

In [356]:
# Turn estimated GHG reductions into a number
merge1["estimated_tircp_ghg_reductions"] = (
    merge1["estimated_tircp_ghg_reductions"]
    .str.replace("MTCO2e", "")
    .str.replace("None", "")
    .str.replace(",", "")
)

In [357]:
merge1["estimated_tircp_ghg_reductions"] = merge1[
    "estimated_tircp_ghg_reductions"
].apply(pd.to_numeric, errors="coerce")

In [358]:
# Subset to cols simila to SCCP
outcomes = merge1[
    [
        "award_year",
        "detailed_title_cols",
        "estimated_tircp_ghg_reductions",
        "cost_per_ghg_ton_reduced",
        "increased_ridership",
        "service_integration",
        "improve_safety",
    ]
].sort_values(["award_year", "detailed_title_cols"])

##### Version 1

In [359]:
# Drop award year
outcomes_transformed = outcomes.drop(columns=["award_year"]).T

In [360]:
# Make first row to column names
outcomes_transformed.columns = outcomes_transformed.iloc[0]

In [361]:
# Del first row
outcomes_transformed = outcomes_transformed.iloc[1:]

##### Version 2

In [368]:
outcomes_melt = pd.melt(outcomes, id_vars=[ "award_year",
        "detailed_title_cols",], value_vars=[
        "cost_per_ghg_ton_reduced",
        "increased_ridership",
        "service_integration",
        "improve_safety",])

In [384]:
year_summary = (outcomes_melt
                .groupby(['award_year','variable', 'value'])
                .agg({'detailed_title_cols':'nunique'})
                .rename(columns = {'detailed_title_cols':'number of projects'}) 
               )

In [385]:
year_summary

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,number of projects
award_year,variable,value,Unnamed: 3_level_1
2018,cost_per_ghg_ton_reduced,High,16
2018,cost_per_ghg_ton_reduced,Medium,3
2018,cost_per_ghg_ton_reduced,Medium-High,8
2018,cost_per_ghg_ton_reduced,,1
2018,improve_safety,High,9
2018,improve_safety,Medium,12
2018,improve_safety,Medium-High,7
2018,increased_ridership,High,13
2018,increased_ridership,Medium,10
2018,increased_ridership,Medium-High,5


#### Save

In [396]:
with pd.ExcelWriter(f"{GCS_FILE_PATH}calsta_draft.xlsx") as writer:
    outcomes.to_excel(writer, sheet_name="outcomes_unpivoted", index=True)
    outcomes_transformed.to_excel(writer, sheet_name="outcomes_transformed", index=True)
    projects.to_excel(writer, sheet_name="projects", index=True)
    year_summary.to_excel(writer, sheet_name="year_summary", index=True)