## Add TIRCP

In [1]:
import _utils
import pandas as pd
from calitp.sql import to_snakecase



In [2]:
pd.options.display.max_columns = 100
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
import fuzzywuzzy
from fuzzywuzzy import process



### ATP - 10 Year Non SHOPP 

In [4]:
# Read in 10 Year non SHOPP
atp_shopp = to_snakecase(
    pd.read_excel(f"{_utils.GCS_FILE_PATH}cleaned_data_with_fake_metrics_plus_atp.xlsx")
)

In [5]:
# Subset
atp_shopp_subset = [
    "ppno",
    "project_name",
    "lead_agency",
    "previous_caltrans_nominations",
    "full_county_name",
    "district",
    "project_description",
    "current_phase",
    "primary_mode",
    "urban_rural",
    'total_project_cost__$1,000',
    "total_unfunded_need__$1,000",
    "notes",
    'shs_capacity_increase_detail'
]

In [6]:
atp_shopp2 = atp_shopp[atp_shopp_subset]

In [7]:
# atp_shopp2.sample()

In [8]:
# Function to clean agency/organization names
def organization_cleaning(df, column_wanted: str):
    df[column_wanted] = (
        df[column_wanted]
        .str.strip()
        .str.split(",")
        .str[0]
        .str.replace("/", "")
        .str.split("(")
        .str[0]
        .str.split("/")
        .str[0]
        .str.title()
        .str.replace("Trasit", "Transit")
        .str.strip()  # strip again after getting rid of certain things
    )
    return df

In [9]:
# Lowercase previous caltrans nominations
atp_shopp2.previous_caltrans_nominations = (
    atp_shopp2.previous_caltrans_nominations.str.lower()
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [10]:
# Clean ATP 
atp_shopp2 = organization_cleaning(atp_shopp2, "lead_agency")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [11]:
# atp_shopp2.head(100)

### TIRCP 
* Filter out projects in which total cost equals total award 

In [12]:
# Read in TIRCP. Last updated November 2022.
tircp = to_snakecase(
    pd.read_excel(
        "gs://calitp-analytics-data/data-analyses/tircp/Tableau_Workbook.xlsx"
    )
)

In [13]:
# tircp.columns

In [14]:
# Subset TIRCP with only the basic information
tircp_subset = [
    "award_year",
    "grant_recipient",
    "title",
    "ppno",
    "district",
    "county",
    "description",
    "total__cost",
    "tircp",
    "award_cycle",
    "on_shs?",
    "comments_additional_contacts"
]

In [15]:
tircp2 = tircp[tircp_subset]

In [16]:
# Create a column with cycle + tircp for previous CT nominations
tircp2["previous_caltrans_nominations"] = "TIRCP" + " Cycle " + tircp2["award_cycle"].astype("str")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [17]:
# Create column for unmet needs
tircp2["total_unfunded_need__$1,000"] = tircp2["total__cost"] - tircp2["tircp"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [18]:
# Perhaps to narrow down projects
# Figure out which TIRCP projects' total cost are completely covered by TIRCP requested
tircp2["total_cost_vs_tircp_req"] =  tircp2["tircp"]/tircp2["total__cost"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [19]:
# tircp2.total_cost_vs_tircp_req.value_counts()

In [20]:
# Filter out projects in which total cost equals total award 
tircp2 = (tircp2.loc[tircp2.total_cost_vs_tircp_req != 1]).reset_index(drop=True)

In [21]:
# Drop some columns
tircp_cols_drop = ['tircp', 'total_cost_vs_tircp_req', 'award_cycle']

In [22]:
tircp2 = tircp2.drop(columns = tircp_cols_drop)

In [23]:
tircp2.shape

(93, 12)

### Find TIRCP projects that are already in the 10 year Non SHOPP. 

#### Harmonize the way columns are named/formatted.
* TIRCP: `Total_cost` and `unmet needs` should be floats and divided by 1000. `District` should be int64. 

In [24]:
atp_shopp2.previous_caltrans_nominations.nunique(), len(atp_shopp2)

(242, 814)

In [25]:
# Find projects that have DRMT in previous caltrans nominations
atp_shopp_drmt = (
    atp_shopp2[atp_shopp2["previous_caltrans_nominations"].str.contains(("tircp|drmt"))]
).reset_index(drop=True)

In [26]:
# Check out that the filtering was correct. 
# atp_shopp_drmt.previous_caltrans_nominations.value_counts()

In [27]:
# Natalie's function
def align_funding_numbers(df, list_of_cols):
    for col in list_of_cols:
        df[col] = df[col]/1000
        
    return df

In [28]:
tircp2 = align_funding_numbers(tircp2, ["total_unfunded_need__$1,000", "total__cost",])

In [29]:
# (tircp2[["total_unfunded_need__$1,000"]]/1000).astype('int64')*1000

In [30]:
tircp2[["total_unfunded_need__$1,000", "total__cost",]] = tircp2[["total_unfunded_need__$1,000", "total__cost",]].fillna(0)

In [31]:
tircp2["district_full_name"] = tircp2["district"]

In [32]:
# Extract digit from district. Fill "Various" districts with "0" and change to int64
tircp2["district"] = tircp2["district"].str.extract('(\d+)').fillna(0).astype('int64')

In [33]:
# Rename columns
tircp_new_cols = {'grant_recipient': 'lead_agency',
                  'title': 'project_name',
                  'county': 'full_county_name', 
                  'description': 'project_description',
                  'total__cost': 'total_project_cost__$1,000', 
                  'on_shs?': 'shs_capacity_increase_detail',
                  'comments_additional_contacts': 'notes',
                 }

In [34]:
# atp_shopp_drmt.info(), tircp2.info()

In [35]:
tircp2 = tircp2.rename(columns = tircp_new_cols)

#### Test with Merges
* Zero merges across the board. 

In [36]:
# Make a copy of tircp
# tircp_test = tircp2.copy()

In [37]:
# Round numbers
# tircp_test["total_unfunded_need__$1,000"] = (tircp_test[["total_unfunded_need__$1,000"]]/1000).astype('int64')*1000

In [38]:
# tircp_test["total_project_cost__$1,000"] = (tircp_test[["total_project_cost__$1,000"]]/1000).astype('int64')*1000

In [39]:
# atp_shopp_drmt["total_project_cost__$1,000"] = ((atp_shopp_drmt[["total_project_cost__$1,000"]]/1000).astype('int64')*1000).astype('int64')
# atp_shopp_drmt["total_project_cost__$1,000"] = ((atp_shopp_drmt[["total_project_cost__$1,000"]]/1000).astype('int64')*1000).astype('int64')

In [40]:
# Merge on district and unfunded needs
# test1 = pd.merge(atp_shopp_drmt, tircp2, how= "outer", on=["district", "total_unfunded_need__$1,000"], indicator = True)

In [41]:
# test1._merge.value_counts()

In [42]:
# Merge on county and unfunded needs
# test2 = pd.merge(atp_shopp_drmt, tircp2, how= "left", on=["full_county_name", "total_unfunded_need__$1,000"], indicator = True, suffixes = ["_shopp", "_tircp"])

In [43]:
# test2._merge.value_counts()

In [44]:
# test2.loc[test2._merge == "both"][["project_name_shopp","project_name_tircp", "full_county_name", "project_description_shopp","project_description_tircp"]]

In [45]:
# Merge on district and total costs 
# test2 = pd.merge(atp_shopp_drmt, tircp2, how= "outer", on=["district", "total_project_cost__$1,000"], indicator = True)

#### Observation: 
* Total Project Cost, project titles, lead agency, and districts are potentially listed differently across datasets.
* Inglewood Transit Connector Project	  is listed as $1,016,000.000 in TIRCP but 1,666,466.0 in non SHOPP.
    * City Of Inglewood is the grant recipient in TIRCP but Caltrans is the lead agency in Non SHOPP.
* Valley Rail Expansion: Altamont Corridor Express (ACE) Ceres to Turlock Extension (the TIRCP title) is listed as D6 in TIRCP but D10 in Stanislaus.
    * Fresno Subdivision (Ceres To Turlock) Double Tracking is the non SHOPP title.

In [46]:
tircp_already_entered = ['Inglewood Transit Connector Project','Valley Rail Expansion: Altamont Corridor Express (ACE) Ceres to Turlock Extension',]

In [47]:
# Projects with only TIRCP in the previous nomination.
len(atp_shopp_drmt[atp_shopp_drmt["previous_caltrans_nominations"].str.contains(("tircp"))])

3

In [48]:
# Delete TIRCP projects that are already in non SHOPP
# One project below in non SHOPP mentions TIRCP but that project
# was ultimately not nominated.
tircp2 = (tircp2[~tircp2["project_name"].isin(tircp_already_entered)]).reset_index(drop = True)

In [49]:
# atp_shopp_drmt[atp_shopp_drmt["previous_caltrans_nominations"].str.contains(("tircp"))].drop(columns = "notes")

#### Test with Project Names

In [50]:
tircp2["project_test"] = tircp2["project_name"]

In [51]:
atp_shopp_drmt["project_test"] = atp_shopp_drmt["project_name"]

In [52]:
def simplify_project_names(df, column_wanted: str):
    df[column_wanted] = (
        df[column_wanted]
        .str.strip()
        .str.lower()
        .str.replace("/", "")
        .str.replace("-","")
        .str.replace("!","")
        .str.replace("&","")
        .str.replace("#","")
        .str.replace("(", "")
        .str.replace(")", "")
        .str.replace(":","")
        .str.replace("the","")
        .str.strip() # strip again after getting rid of certain things
    )
    return df

In [53]:
tircp2 = simplify_project_names(tircp2, "project_test")



In [54]:
atp_shopp_drmt = simplify_project_names(atp_shopp_drmt, "project_test")



In [55]:
# atp_shopp_drmt[["project_name","project_test"]].sort_values(by = ["project_name"])

In [56]:
# Merge on project names
test3 = pd.merge(atp_shopp_drmt, tircp2, how= "outer", on=["project_test"], indicator = True, suffixes = ["_shopp", "_tircp"] )

In [57]:
test3._merge.value_counts()

left_only     167
right_only     91
both            0
Name: _merge, dtype: int64

In [58]:
# atp_shopp_drmt[["project_test"]].sort_values("project_test")

In [59]:
# tircp2[["project_test"]].sort_values("project_test")

#### Test 1 with Fuzzy Matching on Project Names
* Unsuccessful with project names. Only 2 matches and after looking at descriptions/districts manually, they aren't similar enough.

In [60]:
# Replace all rows in agency column with a min ratio with  "string_to_match value"
def replace_matches_in_column(df, column, new_col_name, string_to_match, min_ratio):
    # Get a list of unique strings
    strings = df[column].unique()

    # Get the top 10 closest matches to our input string
    matches = fuzzywuzzy.process.extract(
        string_to_match, strings, limit=10, scorer=fuzzywuzzy.fuzz.token_sort_ratio
    )

    # Only get matches with a  min ratio
    close_matches = [matches[0] for matches in matches if matches[1] > min_ratio]

    # Get the rows of all the close matches in our dataframe
    rows_with_matches = df[column].isin(close_matches)

    # replace all rows with close matches with the input matches
    df.loc[rows_with_matches, new_col_name] = string_to_match

In [61]:
# Create a list of all the TIRCP names 
tircp_projects = tircp2.project_name.unique().tolist()

In [62]:
atp_shopp_projects = atp_shopp_drmt.project_name.unique().tolist()

In [63]:
#for i in atp_shopp_projects:
#    replace_matches_in_column(
#        tircp2, "project_name", "project_name_fuzzy_match", i, 80
#    )

In [64]:
# tircp2[["project_description", "project_description","project_description_fuzzy_match"]]

In [65]:
# Merge on project names
# test4 = pd.merge(atp_shopp_drmt, tircp2, how= "outer", left_on=["project_test"], right_on = ["project_name_fuzzy_match"], indicator = True, suffixes = ["_shopp", "_tircp"] )

In [66]:
# test4._merge.value_counts()

In [67]:
# test4.loc[test4._merge == "both"][["district_shopp","district_tircp","project_name_tircp","project_name_shopp", "project_description_shopp", "project_description_tircp"]]

### Concat

In [68]:
# Drop columns
tircp2 = tircp2.drop(columns = ['project_test', 'award_year'])

In [69]:
# Add column for primary mode. 
tircp2["primary_mode"] = "Transit/Zev/Rail (Passenger)" 

In [70]:
atp_drop = ['unnamed:_0','update_complete','modified',
 'modified_by',] 

In [71]:
atp_shopp = atp_shopp.drop(columns = atp_drop)

In [72]:
# tircp2

In [73]:
concat1 = pd.concat([atp_shopp, tircp2])

In [74]:
# Fill NA based on dtypes
concat1 = concat1.fillna(
    concat1.dtypes.replace({"float64": 0.0, "object": "None"})
)

In [75]:
concat1 = _utils.clean_up_columns(concat1)

In [76]:
concat1.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 905 entries, 0 to 90
Data columns (total 101 columns):
 #    Column                                            Dtype  
---   ------                                            -----  
 0    Rail Project Id                                   object 
 1    10 Year Plan                                      float64
 2    Ct Project Id                                     object 
 3    Ea                                                object 
 4    Ppno                                              object 
 5    Project Name                                      object 
 6    District                                          int64  
 7    County                                            object 
 8    Route                                             object 
 9    Project Description                               object 
 10   Current Phase                                     object 
 11   Con Existing Source S  Of Funds                   object 

In [77]:
matched_vendors = []

for row in tircp2.index:
    tircp_projects = tircp2.get_value(row,"project_test")
    for columns in atp_shopp_drmt.index:
        shopp_projects=atp_shopp_drmt.get_value(columns,"project_test")
        matched_token=fuzz.partial_ratio(tircp_projects,shopp_projects)
        if matched_token> 80:
            matched_vendors.append([shopp_projects,tircp_projects,matched_token])

AttributeError: 'DataFrame' object has no attribute 'get_value'

In [79]:
concat1.tail()

Unnamed: 0,Rail Project Id,10 Year Plan,Ct Project Id,Ea,Ppno,Project Name,District,County,Route,Project Description,Current Phase,Con Existing Source S Of Funds,Con Anticipated Source Of Funds,Target Opening Year,Beg Pm,End Pm,Primary Mode,Previous Caltrans Nominations,Urban Rural,Notes,Lead Agency,Pid Approval Date M010,Target Pa Ed M200,Rtl Date M460,Con Start Date M500,Funding Need Phase S,"Pa Ed Cost $1,000","Ps E Cost $1,000","Row Cost $1,000","Con Support Cost $1,000","Non Infrastructure Plan Cost $1,000","Total Unfunded Need $1,000",Previous Funding Request Phase,Last Scored,Csis Alignment,Csis Total Score Out Of 45,Mode Shift Csis Score,Mode Shift Csis Comment,Vmt Csis Score,Vmt Csis Comment,Public Engagement Csis Score,Public Engagement Csis Comment,Dac Local Community Needs Csis Score,Dac Local Community Needs Csis Comment,Safety Csis Score,Safety Csis Comment,Zev Csis Score,Zev Csis Comment,Climate Resiliency Csis Score,Climate Resiliency Csis Comment,...,Natural Resources And Ecosystems Csis Comment,Infill Development And Land Use Csis Score,Infill Development And Land Use Csis Comment,Benefits To Dac And Advancing Equity Atp Score,Community Need Atp Score,Safety Atp Score,Public Participation Atp Score,Community Feedback Atp Score,Continued Engagement Atp Score,Context Sensitive And Innovation Atp Score,Transformative Atp Score,Atp Total Score Out Of 100,Atp Alignment,Access Alignment,Previous Funding Request,Purpose Need,Parcel Counts,"Total Project Cost $1,000","Con Capital Cost $1,000",Hq Priority,District Priority,Potential Funding Program S,Located In Dac,Shs Capacity Increase Detail,Secondary Mode S,Full County Name,Detailed Project Title,District Full Name,Increase Peak Person Throughput,Reduction In Peak Period Delay,Reduction In Fatal And Injury Crashes,Reduction In Injury Rates,Increase Access To Jobs,Increase Access Jobs To Dac,Commercial Dev Developed,Tons Of Goods Impacted,Improve Air Quality,Impact Natural Resources,Support Of Transportation,Fake Benefit Score,Statewide Rank,District Rank,Fake Benefit Score Statewide Percentile,District District Median,"Total Project Cost $1,000 District Median",Fake Benefit Score District Median,"Total Unfunded Need $1,000 District Median",Csis Total Score Out Of 45 District Median,Atp Total Score Out Of 100 District Median,Ppno1
86,,0.0,,,CP103,SFMTA Core Capacity Program,4,,,"Implements the Muni Forward program on three key corridors (K, N, and the 38R Geary lines) to enhance reliability, efficiency, travel times, and rider comfort. Also invests in Phase 9 and 1 of the Train Control Upgrade Project, including upgrades from Embarcadero and 3rd Street to Muni Metro East, improving the reliability of the overall Muni Metro service up to a key location of current system delay.",,,,,,,Transit/Zev/Rail (Passenger),TIRCP Cycle 5,,Primary Contact/ Joel Goldberg(Joel.Goldberg@sfmta.com) additional contact,San Francisco Municipal Transportation Agency,,,,,0.0,0.0,0.0,0.0,0.0,0.0,262215.062,,0.0,,0.0,0.0,,0.0,,0.0,,0.0,0.0,0.0,,0.0,,0.0,,...,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,,378291.062,0.0,0.0,0.0,,,0,,San Francisco,,District 4: Bay Area / Oakland,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,
87,,0.0,,,CP091,Next Wave: Expanding MTD's Electric Legacy on the South Coast,5,,,"Purchases eight battery-electric buses and 3 electric microtransit vans, continuing fleet conversion and allowing expansion of microtransit service into additional zones serving the City of Goleta, UC Santa Barbara, and the Goleta rail station. Funds general transit improvements including signal priority, contactless payment deployment, additional bike racks, and bus shelter improvements, and constructs facility improvements at two terminals including the construction of new ZEB infrastructure.",,,,,,,Transit/Zev/Rail (Passenger),TIRCP Cycle 5,,,Santa Barbara Metropolitan Transit District,,,,,0.0,0.0,0.0,0.0,0.0,0.0,18561.52,,0.0,,0.0,0.0,,0.0,,0.0,,0.0,0.0,0.0,,0.0,,0.0,,...,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,,33041.52,0.0,0.0,0.0,,,0,,San Barbara,,District 5: San Luis Obispo / Santa Barbara,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,
88,,0.0,,,CP100,Sonoma Regional Bus and Rail Connectivity Improvements,4,,,"Includes the purchase of 30 zero-emission buses and associated charging infrastructure and passenger amenities for Petaluma Transit, Santa Rosa CityBus and Sonoma County Transit, construction of the SMART Petaluma North commuter rail station, and improved network integration among all application partners and other transit operators in Sonoma County, including contactless payment equipment for Mendocino Transit Authority",,,,,,,Transit/Zev/Rail (Passenger),TIRCP Cycle 5,,,Sonoma County Transportation Authority,,,,,0.0,0.0,0.0,0.0,0.0,0.0,28944.0,,0.0,,0.0,0.0,,0.0,,0.0,,0.0,0.0,0.0,,0.0,,0.0,,...,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,,53769.0,0.0,0.0,0.0,,,0,,SON,,District 4: Bay Area / Oakland,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,
89,,0.0,,,CP081,Metrolink Perris Valley Line Capacity Improvements,7,,,"Improvements Completes the final design and construction of three capacity improvements on Metrolink’s 91/Perris Valley Line (91/PVL) that allow for bi-directional, peak-period service to be increased: 1) PerrisSouth Station Expansion, 2) Perris-South Layover 4th Track, and 3) CP Eastridge to Moreno Valley/March Field Double Track.",,,,,,,Transit/Zev/Rail (Passenger),TIRCP Cycle 5,,,Southern California Regional Rail Authority,,,,,0.0,0.0,0.0,0.0,0.0,0.0,32000.279,,0.0,,0.0,0.0,,0.0,,0.0,,0.0,0.0,0.0,,0.0,,0.0,,...,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,,57042.279,0.0,0.0,0.0,,,0,,Various,,District 7: Los Angeles,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,
90,,0.0,,,CP093,Tulare Cross-Valley Corridor ZEB Expansion,6,,,Supports the phased development of an east-west Cross Valley Corridor by purchasing 14 zero-emission feeder buses in multiple cities in and along the corridor (as well as 16 micro-transit vehicles to be operated in selected cities) that will provide comprehensive access to the future rail system for all these communities and will connect to the California High Speed Rail system.,,,,,,,Transit/Zev/Rail (Passenger),TIRCP Cycle 5,,,Tulare County Regional Transit Agency,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,,0.0,,0.0,,0.0,0.0,0.0,,0.0,,0.0,,...,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,,0.0,0.0,0.0,0.0,,,0,,TUL,,District 6: Fresno / Bakersfield,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,
