In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import shared_utils

# set_option to increase max rows displayed to 200, to see entire df in 1 go/
pd.set_option("display.max_rows", 200)
pd.set_option('display.max_colwidth', None)

## AGREEMENT ALLOCATIONS SHEET DATA

### Agreement Allocations - Read in Raw data

In [None]:
url = "gs://calitp-analytics-data/data-analyses/bus_procurement_cost/TIRCP Tracking Sheets 2_1-10-2024.xlsx"
sheet_name = "Agreement Allocations"

In [None]:
tircp = pd.read_excel(url, sheet_name)

### Agreement Allocations -Data Cleaning and QC

In [None]:
# reducing initialdf to first 11 columns.
tircp = tircp.iloc[:, :12]

In [None]:
# dictionary for column name update
new_col = [
    "award_year",
    "project_#",
    "grant_recipient",
    "implementing_agency",
    "ppno",
    "project_id",
    "ea",
    "components",
    "#_of_buses",
    "phase",
    "allocation_amount",
    "expended_amount",
]

In [None]:
tircp.columns = new_col
tircp.columns

In [None]:
tircp = tircp.drop("expended_amount", axis=1)

In [None]:
# fill NaN with zero?
# see if you can sum the bus column
tircp.agg({"#_of_buses": "sum"})
# nope this is correct

In [None]:
display(tircp.shape, list(tircp.columns), tircp.head())

In [None]:
tircp.grant_recipient.nunique()

In [None]:
# use strip to help combine names
tircp["grant_recipient"] = tircp["grant_recipient"].str.strip()

tircp.grant_recipient.nunique()

In [None]:
# see list of unique names
# may be able to consolidate a few
tircp.grant_recipient.sort_values().unique()

In [None]:
new_dict = {
    "Antelope Valley Transit Authority": "Antelope Valley Transit Authority (AVTA)",
    "Bay Area Rapid Transit District": "Bay Area Rapid Transit (BART)",
    "Capitol Corridor Joint Powers Authority": "Capitol Corridor Joint Powers Authority (CCJPA)",
    "Los Angeles County Metropolitan Transportation (LA Metro)": "Los Angeles County Metropolitan Transportation Authority (LA Metro)",
    "Los Angeles County Metropolitan Transportation Authority": "Los Angeles County Metropolitan Transportation Authority (LA Metro)",
    "Sacramento Regional Transit (SacRT)": "Sacramento Regional Transit District (SacRT)",
    "Sacramento Regional Transit District": "Sacramento Regional Transit District (SacRT)",
    "San Diego Metropolitan Transit System (SDMTS)": "San Diego Metropolitan Transit System (MTS)",
    "San Francisco Bay Area Water Emergency Transportation Authority": "San Francisco Bay Area Water Emergency Transportation Authority (WETA)",
    "San Francisco Municipal Transportation Agency": "San Francisco Municipal Transportation Authority (SFMTA)",
    "Santa Barbara County Association of Governments\n(SBCAG)": "Santa Barbara County Association of Governments (SBCAG)",
    "Santa Clara Valley Transportation Authority": "Santa Clara Valley Transportation Authority (VTA)",
    "Transportation Agency for Monterey County": "Transportation Agency for Monterey County (TAMC)",
}

In [None]:
# replace the values in grant_recipient using dict
# df.replace({'bus_desc': new_dict}, inplace=True)
tircp = tircp.replace({"grant_recipient": new_dict})

In [None]:
# see that some rows were consolidated
display(tircp.grant_recipient.nunique())

### Agreement Allocations-Export Cleaned data

In [None]:
tircp.to_csv(
    "gs://calitp-analytics-data/data-analyses/bus_procurement_cost/tircp_allocations_clean.csv"
)

### Agreement Allocations-Read in Cleaned data from GCS

In [None]:
tircp = pd.read_csv(
    "gs://calitp-analytics-data/data-analyses/bus_procurement_cost/tircp_allocations_clean.csv"
)

In [None]:
display(tircp.shape, tircp.columns, tircp.head())

### Agreement Allocations-Cost per Bus, per agency

In [None]:
# filer to project with bus count values
# caveat: some rows in "component" column state some variation of "purchased buses", but did not specify the amount of buses.
# only rows stating the specificy number of buses purchased are included
only_bus = tircp[tircp["#_of_buses"] > 0]

In [None]:
display(only_bus.shape)

In [None]:
# aggregate # of buses and allocation by transit agency
bus_cost = (
    only_bus.groupby("grant_recipient")
    .agg({"#_of_buses": "sum", "allocation_amount": "sum"})
    .reset_index()
)

In [None]:
bus_cost

In [None]:
bus_cost["cost_per_bus"] = (
    (bus_cost["allocation_amount"]) / (bus_cost["#_of_buses"])
).astype("int64")

In [None]:
display(bus_cost.dtypes, bus_cost)

In [None]:
# exporting cost per bus
bus_cost.to_csv(
    "gs://calitp-analytics-data/data-analyses/bus_procurement_cost/tircp_allocation_cost_per_bus.csv"
)

### Agreement Allocations - Stat analysis

In [None]:
bus_cost

In [None]:
plt.figure(
plt.hist(bus_cost['cost_per_bus'],density=True)
plt.show()

## PROJECT TRACKING SHEET DATA

### project tracking -  read raw data


In [None]:
gcs_path = 'gs://calitp-analytics-data/data-analyses/bus_procurement_cost/'
file_name = 'TIRCP Tracking Sheets 2_1-10-2024.xlsx'
sheet_name = 'Project Tracking'

def get_data(path, file, sheet):
    df = pd.read_excel(path+file, sheet_name=sheet)
    
    return df

project = get_data(gcs_path, file_name, sheet_name)

In [None]:
display(
    project.shape,
    project.columns,
    project.dtypes,
)

## Project Tracking- data cleaning

### data frame cleaning

In [None]:
# only keep first couple of columns
# tircp = tircp.iloc[:, :12]
project = project.iloc[:, :20]

In [None]:
list(project.columns)

In [None]:
# drop specific columns
drop_col = [
    "Master Agreement Expiration Date",
    "Project Manager",
    "Regional Coordinator",
    "Technical Assistance-CALITP (Y/N)",
    "Technical Assistance-Fleet (Y/N)",
    "Technical Assistance-Network Integration (Y/N)",
    "Technical Assistance-Priority Population (Y/N)",
]

In [None]:
project.drop(columns=drop_col, inplace=True)

In [None]:
len(project.columns)

In [None]:
# replace space with _ & lower everything
project.columns = project.columns.str.replace(" ", "_")
project.columns = project.columns.str.lower()

In [None]:
# check work
project.columns

### check columns
check values of all columns to see if:
-any duplicates values
-invalid int/str values


In [None]:
project.columns

In [None]:
# function to check column information
def col_checker(col):
    display(
        f"Displaying column: {col}",
        len(project[col]),
        list(project[col].sort_values(ascending=True).unique()),
    )

In [None]:
# col is OK, all numbers
col_checker("tircp_award_amount_($)")

In [None]:
# col is good, everything is a number
col_checker("total_project_cost")

In [None]:
# col is OK
col_checker("master_agreement_number")

In [None]:
# col is OK
col_checker("bus_count")

In [None]:
# column is OK
col_checker("project_description")

In [None]:
project[project["district"] == "VAR"]

In [None]:
# Project title OK,
col_checker("project_title")

In [None]:
# award year OK
col_checker("award_year")

In [None]:
# project num OK
col_checker("project_#")

---

In [None]:
# DROP COL
# Col is OK
col_checker("allocated_amount")

In [None]:
# NEEDS CLEANING grant_recipient need to clean
col_checker("grant_recipient")

In [None]:
# may need to clean, there are rows that say '3, 4'
col_checker("county")

In [None]:
# Move to cleaning, check what is 'VAR'. various?
# may be ok just check to make sure
project.district.unique()

In [None]:
# couldnt run col_checker, guessing because some PPNO numbers are inconsistent
# may need to clean, there is a ppno of CP052/CP053
project.ppno.unique()

### dropping allocated amount column

In [None]:
# dropping allocated amount column
project.drop(columns=["allocated_amount"], inplace=True)

In [None]:
# checking work
project.columns

### Clean `grant_recipient` column

In [None]:
list(project.grant_recipient.sort_values(ascending=True).unique())

In [None]:
agency_dict = {
    "Antelope Valley Transit Authority ": "Antelope Valley Transit Authority (AVTA)",
    "Humboldt Transit Authority": "Humboldt Transit Authority (HTA)",
    "Orange County Transportation Authority": "Orange County Transportation Authority (OCTA)",
    "Capitol Corridor Joint Powers Authority": "Capitol Corridor Joint Powers Authority (CCJPA)",
    "Los Angeles County Metropolitan Transportation Authority": "Los Angeles County Metropolitan Transportation Authority (LA Metro)",
    "Monterey-Salinas Transit": "Monterey-Salinas Transit District (MST)",
    "Sacramento Regional Transit (SacRT)": "Sacramento Regional Transit District (SacRT)",
    "Sacramento Regional Transit District": "Sacramento Regional Transit District (SacRT)",
    "Sacramento Regional Transit District (SacRT) ": "Sacramento Regional Transit District (SacRT)",
    "San Diego Association of Governments": "San Diego Association of Governments (SANDAG)",
    "Santa Clara Valley Transportation Authority (SCVTA)": "Santa Clara Valley Transportation Authority (VTA)",
    "Southern California  Regional Rail Authority (SCRRA)": "Southern California Regional Rail Authority (SCRRA - Metrolink)",
    "Southern California Regional Rail Authority": "Southern California Regional Rail Authority (SCRRA - Metrolink)",
}

In [None]:
# df.replace({'bus_desc': new_dict}, inplace=True)
project.replace({"grant_recipient": agency_dict}, inplace=True)

In [None]:
# check work. looks good
list(project["grant_recipient"].sort_values().unique())

### Cleaning `county` column

In [None]:
col_checker("county")

In [None]:
#checking specific row with '3,4' as county
project[project["county"] == "3, 4"]

In [None]:
# change county value from '3, 4' to 'VAR' like the other rows.
project.at[3, "county"] = "VAR"

In [None]:
# check work
project.iloc[3]

### Cleaning `district`column
This is good as is, no cleaning requried. All rows with VAR district has VAR in county as well.

In [None]:
#GTG
project.district.unique()

In [None]:
#GTG 
project[project["district"] == "VAR"]

### Clean `ppno` column
This should all be fine as is, no cleaning needed

In [None]:
list(project.ppno.unique())

In [None]:
#GTG 
project[project["ppno"] == "CP052/CP053"]

### Skim the project description column?
double check to ensure bus count is accurate to what the description says?

Saw that some rows mention procuring both zero and non-zero emission buses (count total buses in `bus count` and `VAR` in prop type and bus size?


In [None]:
project[
    project["project_title"]
    == "ATN FAST (Family of Advanced Solutions for Transit): Revolutionizing Transit for a Global Audience"
]

In [None]:
# iloc check
project.iloc[73]

In [None]:
# code to update value at specific index and column
project.loc[project['ppno'] == 'CP106', 'bus_count'] = 42


In [None]:
# check work
project.iloc[73]

---

## Export cleaned Project df 

In [None]:
# exproject cleaned project df
project.to_csv(
    "gs://calitp-analytics-data/data-analyses/bus_procurement_cost/tircp_project_clean.csv"
)

## Read in cleaned project data

In [None]:
project = pd.read_csv(
    "gs://calitp-analytics-data/data-analyses/bus_procurement_cost/tircp_project_clean.csv"
)

In [None]:
# ensure df is able to read in
display(project.shape, project.columns)

### filter df for project descriptions that contain bus

In [None]:
bus_only = project[project["bus_count"] > 0]

In [None]:
# this looks correct
display(project.shape, bus_only.shape)

## New column for propulsion type - `prop_type`
Use on `bus_only` df

In [None]:
prop_type = [
    "electric buses",
    "electric commuter",
    "Electric Buses",
    "battery electric",
    "Batery Electric",
    "battery-electric",
    "fuel-cell",
    "fuel cell",
    "Fuel Cell",
    "zero emission",
    "Zero Emission",
    "zero-emission electric buses",
    "zero-emission buses",
    "zero‐emission",
    "zero-emission",
    "zeroemission",
    "CNG",
    "cng",
]

In [None]:
type(prop_type)

In [None]:
# function to match keywords to list
def prop_type_finder(description):
    for keyword in prop_type:
        if keyword in description:
            return keyword
    return "not specified"

In [None]:
# add new col `prop_type`, fill it with values based on project_description using prop_type_finder function
bus_only["prop_type"] = bus_only["project_description"].apply(prop_type_finder)

In [None]:
# check work
display(
    bus_only.columns,
    bus_only["prop_type"].value_counts(),
)

In [None]:
# exploring the not specified rows
bus_only[bus_only["prop_type"] == "not specified"]
# coach-style buses, this row does not specify if buses are zero or non-zero emission bus. GOOD TO GO

In [None]:
# what is in CNG rows?
bus_only[bus_only["prop_type"] == "CNG"]
# was 4 rows, then adjusted prop list to have cng at the bottom. now showing 1 row thats actually CNG

In [None]:
# consolidate values
list(bus_only["prop_type"].sort_values(ascending=True).unique())

In [None]:
prop_dict = {
    "battery electric": "BEB",
    "battery-electric": "BEB",
    "electric buses": "electric (not specified)",
    "electric commuter": "electric (not specified)",
    "fuel cell": "FCEB",
    "fuel-cell": "FCEB",
    "zero-emission buses": "zero-emission bus (not specified)",
    "zero emission": "zero-emission bus (not specified)",
    "zero-emission": "zero-emission bus (not specified)",
    "zero‐emission": "zero-emission bus (not specified)",
}

In [None]:
# replacing prop_type values with dictionary
bus_only.replace({"prop_type": prop_dict}, inplace=True)

In [None]:
# check work
display(bus_only.prop_type.value_counts(), bus_only.head())

# looks good

## New column for bus size type - `bus_size_type`


In [None]:
bus_size = [
    "standard",
    "30-foot",
    "40 foot",
    "40-foot",
    "45-foot",
    "45 foot",
    "40ft",
    "60-foot",
    "articulated",
    "cutaway",
    "coach-style",
    "over-the-road",
    "feeder bus",
]

In [None]:
type(bus_size)

In [None]:
# re writing prop type funct for bus size
def bus_size_finder(description):
    for keyword in bus_size:
        if keyword in description:
            return keyword
    return "not specified"

In [None]:
# creating new column, filling the column using the function applied to project_desctiotion
bus_only["bus_size_type"] = bus_only["project_description"].apply(bus_size_finder)

In [None]:
# checking work
display(bus_only.columns, bus_only.bus_size_type.value_counts())

In [None]:
list(bus_only['bus_size_type'].sort_values().unique())

In [None]:
# expected that not a lot of rows specify a size type.
# will still take a random peek into some

bus_only[bus_only["bus_size_type"] == "not specified"].sample(5)

In [None]:
# consolidate
size_dict={'40 foot': 'conventional (40-ft like)' ,
 '40-foot': 'conventional (40-ft like)',
 '45-foot': 'conventional (40-ft like)',
 'coach-style':'over-the-road',
 'feeder bus': 'conventional (40-ft like)',
 }

In [None]:
type(size_dict)

In [None]:
# .replace() with size_dict to replace values in bus size col
bus_only.replace({"bus_size_type": size_dict}, inplace=True)

In [None]:
# check work
bus_only.bus_size_type.value_counts()

## export project- bus only df

In [None]:
bus_only.to_parquet(
    "gs://calitp-analytics-data/data-analyses/bus_procurement_cost/tircp_project_bus_only.parquet"
)

## Read in project bus only data


In [None]:
test = pd.read_parquet('gs://calitp-analytics-data/data-analyses/bus_procurement_cost/test_tircp.parquet')

In [2]:
bus_checker = pd.read_parquet(
    "gs://calitp-analytics-data/data-analyses/bus_procurement_cost/tircp_project_bus_only.parquet"
)

In [None]:
bus_checker

In [32]:
def project_type_checker(description: str) -> str:
    """
    function to match keywords to project description col to identy projects that only have bus procurement.
    used to identify projects into diffferent categories: bus only, bus + others, no bus procurement.
    use with .assign() to get a new col.
    """
    bus_list =[
        "bus"
    ]
    
    exclude_list =[
        "facility",
        #"station",
        "installation",
        "construct",
        "infrastructure",
        "signal priority",
        "improvements",
        #"associated infrastructure" may need to look at what is associated infrastructure is for ZEB 
        
    ]
    proj_description = description.lower().strip()

    if any(word in proj_description for word in bus_list) and not any(
        word in proj_description for word in exclude_list
    ):
        return "bus only"
    
    elif any(word in proj_description for word in exclude_list) and not any(
        word in proj_description for word in bus_list
    ):
        return "non-bus components"
    
    else:
        return "includes bus and non-bus components"

In [33]:
bus_checker = bus_checker.assign(
    project_type = bus_checker['project_description'].apply(project_type_checker)
)


In [34]:
bus_checker["project_type"].value_counts()

includes bus and non-bus components    24
bus only                               11
Name: project_type, dtype: int64

In [35]:
# just_bus rows are all good. 
just_bus = bus_checker[bus_checker['project_type'] == "bus only"]

# bus_non_bus rows are all good
bus_non_bus = bus_checker[bus_checker['project_type'] == "includes bus and non-bus components"]

In [36]:
just_bus

Unnamed: 0,award_year,project_#,grant_recipient,project_title,ppno,district,county,project_description,bus_count,master_agreement_number,total_project_cost,tircp_award_amount_($),prop_type,bus_size_type,project_type
0,2015,1,Antelope Valley Transit Authority (AVTA),Regional Transit Interconnectivity & Environmental Sustability,CP005,7,LA,Purchase 13 60-foot articulated BRT buses and 16 45-foot electric commuter buses,13.0,64AVTA2015MA,39478000,24403000,electric (not specified),conventional (40-ft like),bus only
5,2015,6,Orange County Transportation Authority (OCTA),Bravo! Route 560 Rapid Buses,CP004,12,ORA,Purchase five 40-foot CNG buses for BRT Route linking SARTC to Metrolink/Amtrak,40.0,64OCTAMA,2900000,2320000,CNG,conventional (40-ft like),bus only
16,2016,3,Foothill Transit,"Transforming California: Bus Electrification, Service Expansion and Rail Integration",CP076,7,LA,Purchase 20 zero-emission buses to extend Route 486 to the Pamona Metrolink station and increase frequencies,20.0,64FOOTHILLMA,16580000,5000000,zero-emission bus (not specified),not specified,bus only
34,2018,7,City of Los Angeles (LA DOT),Los Angeles City: Leading the Transformation to Zero-Emission Electric Bus Transit Service,CP029,7,LA,"Acquire 112 zero-emission buses to replace existing propane vehicles and add new vehicles, in order to increase frequency of all existing DASH routes to 15-minute service and add 4 new routes, serving communities throughout the City of Los Angeles as recommended in the comprehensive Transit Service Analysis.",112.0,64LADOTMA,102790000,36104000,zero-emission bus (not specified),not specified,bus only
51,2018,24,Shasta Regional Transportation Agency (SRTA),North State Intercity Bus System,CP045,2,VAR,Purchase 7 new coach-style buses to support a new intercity service that connects Redding to Sacramento. Purchase 7 new coach-style buses to support a new intercity service that connects Redding to Sacramento Shata Regional Transportation Agency Bus System to Sacramento International Airport,14.0,64SRTAMA,9516000,8641000,not specified,over-the-road,bus only
68,2020,13,Santa Monica Big Blue Bus,"For People, Place and Planet: Connecting Inglewood to Regional Opportunities",CP071,7,LA,"Purchase 7 zero emission buses to enhance and extend Route 14 from Playa Vista to Inglewood, bringing new transit opportunities to disadvantaged communities, while also integrating light rail and bus services.",7.0,64SANMONICAMA,6743000,1105000,zero-emission bus (not specified),not specified,bus only
70,2020,15,Torrance Transit Department,Torrance Transit Bus Service Enhancement Program,CP073,7,LA,"Purchase 7 electric buses to expand services on Line 4X (between Torrance and Downton LA), on an extended line 10 (serving the Metro Green Line Crenshaw station and the Inglewood Stadium and Entertainment District, an extended line 9 (newly serving the Kaiser Permanente South Bay Medical Center), and the acquisition of the western portion of LA Metro’s Route 130 between the Blue Line Artesia Station and the South Bay Galleria Mall.",7.0,64TTDMA,7200000,6000000,electric (not specified),not specified,bus only
71,2020,16,Transit Joint Powers Authority of Merced County,"Improving Air Quality & Economic Growth with Electric Buses in Merced County, the Gateway to Yosemite",CP074,10,MER,Purchases 3 zero-emission electric buses to increase fleet size and extend bus service levels on 2 fixed routes in Merced county. The proposed project allows for an expansion of service frequency on one existing inter-community route connecting rural communities to the city of Merced. The route currently operates on limited frequency and is not enough to keep up with existing demand. The project also expands local service on one local route to provide better bus service to a developed residential area currently with limited access to service.,3.0,64TJPAMCMA,3696513,3112000,electric (not specified),not specified,bus only
81,2022,9,City of Wasco,City of Wasco Improving Air Quality and Economic Growth with Bus Electrification,CP090,6,KER,Purchase of 3 zero-emission buses that will support Wasco's local Dia-a-Ride shuttle services to expand service to affordable housing projects and expand overall service availability by 50%.,3.0,Pending,1543000,1000000,zero-emission bus (not specified),not specified,bus only
95,2022,23,Tulare County Regional Transit Agency (TCRTA),Tulare Cross-Valley Corridor \n(CVC) ZEB Project,CP093,6,TUL,Supports the phased development of an east-west Cross Valley Corridor by purchasing 14 zero-emission feeder buses in multiple cities in and along the corridor (as well as 16 micro-transit vehicles to be operated in selected cities) that will provide comprehensive access to the future rail system for all these communities and will connect to the California High Speed Rail system.,14.0,Pending,53702693,33769000,zero-emission bus (not specified),conventional (40-ft like),bus only


## DEPRECATED - Data Analysis
see `cost_per_bus_analysis` notebook

### Consolidate up grant recipient name

### aggregate up

In [None]:
# aggregate # of buses and allocation by transit agency
# bus_cost = only_bus.groupby('grant_recipient').agg({
#    '#_of_buses':"sum",
#    'allocation_amount':'sum'
# }).reset_index()

bus_cost = (
    bus_only.groupby("grant_recipient")
    .agg({"bus_count": "sum", "tircp_award_amount_($)": "sum"})
    .reset_index()
)

In [None]:
# confirm aggregation worked
bus_cost

### create new cost per bus column

In [None]:
bus_cost["cost_per_bus"] = (
    bus_cost["tircp_award_amount_($)"] / bus_cost["bus_count"]
).astype("int64")

In [None]:
# confirm new column was created and values were populated
bus_cost.sort_values("cost_per_bus")

### Export cost per bus via project tracking sheet to gcs

In [None]:
bus_cost.to_csv(
    "gs://calitp-analytics-data/data-analyses/bus_procurement_cost/tircp_project_cost_per_bus.csv"
)