## Add TIRCP

In [1]:
import numpy as np
import pandas as pd
from calitp.sql import to_snakecase
import _utils



In [14]:
pd.options.display.max_columns = 100
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

### ATP 

In [3]:
# Read in ATP-10 Year non SHOPP
atp_shopp = to_snakecase(pd.read_excel(f"{_utils.GCS_FILE_PATH}cleaned_data_with_fake_metrics_plus_atp.xlsx"))

In [22]:
# Subset
atp_shopp_subset = ['ppno','project_name', 'lead_agency', 'previous_caltrans_nominations', 'county',
 'district','project_description',
 'current_phase','primary_mode','urban_rural', 'notes',]

In [23]:
atp_shopp2 = atp_shopp[atp_shopp_subset]

In [36]:
# Lowercase previous caltrans nominations
atp_shopp2.previous_caltrans_nominations = atp_shopp2.previous_caltrans_nominations.str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [41]:
# Find projects that have DRMT in previous caltrans nominations
atp_shopp_drmt = (atp_shopp2[atp_shopp2["previous_caltrans_nominations"].str.contains(("tircp|drmt"))]).reset_index(drop = True)

In [43]:
len(atp_shopp_drmt)

167

In [72]:
atp_shopp_drmt.columns

Index(['ppno', 'project_name', 'lead_agency', 'previous_caltrans_nominations',
       'county', 'district', 'project_description', 'current_phase',
       'primary_mode', 'urban_rural', 'notes'],
      dtype='object')

### TIRCP 

In [57]:
# Read in TIRCP. Last updated December 2022
tircp = to_snakecase(pd.read_excel("gs://calitp-analytics-data/data-analyses/tircp/Tableau_Workbook.xlsx"))

In [58]:
# Subset TIRCP with only the basic information
tircp_subset = ['award_year','grant_recipient', 'title', 'ppno', 'district',
       'county', 'description','total__cost', 'tircp',
       'allocated_amount', 'unallocated_amount',
       'expended_amount', 'award_cycle','on_shs?']

In [59]:
tircp = tircp[tircp_subset]

In [60]:
# Create a column with cycle + tircp
tircp['award_cycle'] = 'TIRCP' + ' Cycle ' + tircp['award_cycle'].astype('str') 

In [61]:
# Perhaps to narrow down projects
# Figure out which TIRCP projects' total cost are completely covered by 
# TIRCP requested
tircp['total_cost_vs_tircp_req'] = tircp.total__cost/tircp.tircp

In [62]:
# Maybe filter these out
tircp = (tircp.loc[tircp.total_cost_vs_tircp_req != 1]).reset_index(drop = True)

In [63]:
tircp.shape

(93, 15)

### Try to match

In [65]:
# Function to clean agency/organization names
def organization_cleaning(df, column_wanted: str):
    df[column_wanted] = (
        df[column_wanted]
        .str.strip()
        .str.split(",")
        .str[0]
        .str.replace("/", "")
        .str.split("(")
        .str[0]
        .str.split("/")
        .str[0]
        .str.title()
        .str.replace("Trasit", "Transit")
        .str.strip()  # strip again after getting rid of certain things
    )
    return df

In [66]:
tircp = organization_cleaning(tircp, "grant_recipient")

In [67]:
atp_shopp_drmt = organization_cleaning(atp_shopp_drmt, "lead_agency")

In [75]:
# Merge on titles
m1 = pd.merge(tircp, atp_shopp_drmt, how="outer", left_on=["title"], right_on=["project_name"], indicator = True)

In [76]:
m1._merge.value_counts()

right_only    167
left_only      93
both            0
Name: _merge, dtype: int64