In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import shared_utils
from dgs_data_cleaner import project_type_checker

# set_option to increase max rows displayed to 200, to see entire df in 1 go/
pd.set_option("display.max_rows", 200)
pd.set_option('display.max_colwidth', None)

## AGREEMENT ALLOCATIONS SHEET DATA

### Agreement Allocations - Read in Raw data

In [None]:
url = "gs://calitp-analytics-data/data-analyses/bus_procurement_cost/TIRCP Tracking Sheets 2_1-10-2024.xlsx"
sheet_name = "Agreement Allocations"

In [None]:
tircp = pd.read_excel(url, sheet_name)

### Agreement Allocations -Data Cleaning and QC

In [None]:
# reducing initialdf to first 11 columns.
tircp = tircp.iloc[:, :12]

In [None]:
# dictionary for column name update
new_col = [
    "award_year",
    "project_#",
    "grant_recipient",
    "implementing_agency",
    "ppno",
    "project_id",
    "ea",
    "components",
    "#_of_buses",
    "phase",
    "allocation_amount",
    "expended_amount",
]

In [None]:
tircp.columns = new_col
tircp.columns

In [None]:
tircp = tircp.drop("expended_amount", axis=1)

In [None]:
# fill NaN with zero?
# see if you can sum the bus column
tircp.agg({"#_of_buses": "sum"})
# nope this is correct

In [None]:
display(tircp.shape, list(tircp.columns), tircp.head())

In [None]:
tircp.grant_recipient.nunique()

In [None]:
# use strip to help combine names
tircp["grant_recipient"] = tircp["grant_recipient"].str.strip()

tircp.grant_recipient.nunique()

In [None]:
# see list of unique names
# may be able to consolidate a few
tircp.grant_recipient.sort_values().unique()

In [None]:
new_dict = {
    "Antelope Valley Transit Authority": "Antelope Valley Transit Authority (AVTA)",
    "Bay Area Rapid Transit District": "Bay Area Rapid Transit (BART)",
    "Capitol Corridor Joint Powers Authority": "Capitol Corridor Joint Powers Authority (CCJPA)",
    "Los Angeles County Metropolitan Transportation (LA Metro)": "Los Angeles County Metropolitan Transportation Authority (LA Metro)",
    "Los Angeles County Metropolitan Transportation Authority": "Los Angeles County Metropolitan Transportation Authority (LA Metro)",
    "Sacramento Regional Transit (SacRT)": "Sacramento Regional Transit District (SacRT)",
    "Sacramento Regional Transit District": "Sacramento Regional Transit District (SacRT)",
    "San Diego Metropolitan Transit System (SDMTS)": "San Diego Metropolitan Transit System (MTS)",
    "San Francisco Bay Area Water Emergency Transportation Authority": "San Francisco Bay Area Water Emergency Transportation Authority (WETA)",
    "San Francisco Municipal Transportation Agency": "San Francisco Municipal Transportation Authority (SFMTA)",
    "Santa Barbara County Association of Governments\n(SBCAG)": "Santa Barbara County Association of Governments (SBCAG)",
    "Santa Clara Valley Transportation Authority": "Santa Clara Valley Transportation Authority (VTA)",
    "Transportation Agency for Monterey County": "Transportation Agency for Monterey County (TAMC)",
}

In [None]:
# replace the values in grant_recipient using dict
# df.replace({'bus_desc': new_dict}, inplace=True)
tircp = tircp.replace({"grant_recipient": new_dict})

In [None]:
# see that some rows were consolidated
display(tircp.grant_recipient.nunique())

### Agreement Allocations-Export Cleaned data

In [None]:
tircp.to_csv(
    "gs://calitp-analytics-data/data-analyses/bus_procurement_cost/tircp_allocations_clean.csv"
)

### Agreement Allocations-Read in Cleaned data from GCS

In [None]:
tircp = pd.read_csv(
    "gs://calitp-analytics-data/data-analyses/bus_procurement_cost/tircp_allocations_clean.csv"
)

In [None]:
display(tircp.shape, tircp.columns, tircp.head())

### Agreement Allocations-Cost per Bus, per agency

In [None]:
# filer to project with bus count values
# caveat: some rows in "component" column state some variation of "purchased buses", but did not specify the amount of buses.
# only rows stating the specificy number of buses purchased are included
only_bus = tircp[tircp["#_of_buses"] > 0]

In [None]:
display(only_bus.shape)

In [None]:
# aggregate # of buses and allocation by transit agency
bus_cost = (
    only_bus.groupby("grant_recipient")
    .agg({"#_of_buses": "sum", "allocation_amount": "sum"})
    .reset_index()
)

In [None]:
bus_cost

In [None]:
bus_cost["cost_per_bus"] = (
    (bus_cost["allocation_amount"]) / (bus_cost["#_of_buses"])
).astype("int64")

In [None]:
display(bus_cost.dtypes, bus_cost)

In [None]:
# exporting cost per bus
bus_cost.to_csv(
    "gs://calitp-analytics-data/data-analyses/bus_procurement_cost/tircp_allocation_cost_per_bus.csv"
)

### Agreement Allocations - Stat analysis

In [None]:
bus_cost

In [None]:
plt.figure(
plt.hist(bus_cost['cost_per_bus'],density=True)
plt.show()

## PROJECT TRACKING SHEET DATA

### project tracking -  read raw data


In [None]:
gcs_path = 'gs://calitp-analytics-data/data-analyses/bus_procurement_cost/'
file_name = 'TIRCP Tracking Sheets 2_1-10-2024.xlsx'
sheet_name = 'Project Tracking'

def get_data(path, file, sheet):
    df = pd.read_excel(path+file, sheet_name=sheet)
    
    return df

project = get_data(gcs_path, file_name, sheet_name)

In [None]:
display(
    project.shape,
    project.columns,
    project.dtypes,
)

## Project Tracking- data cleaning

### data frame cleaning

In [None]:
# only keep first couple of columns
# tircp = tircp.iloc[:, :12]
project = project.iloc[:, :20]

In [None]:
list(project.columns)

In [None]:
# drop specific columns
drop_col = [
    "Master Agreement Expiration Date",
    "Project Manager",
    "Regional Coordinator",
    "Technical Assistance-CALITP (Y/N)",
    "Technical Assistance-Fleet (Y/N)",
    "Technical Assistance-Network Integration (Y/N)",
    "Technical Assistance-Priority Population (Y/N)",
]

In [None]:
project.drop(columns=drop_col, inplace=True)

In [None]:
len(project.columns)

In [None]:
# replace space with _ & lower everything
project.columns = project.columns.str.replace(" ", "_")
project.columns = project.columns.str.lower()

In [None]:
# check work
project.columns

### check columns
check values of all columns to see if:
-any duplicates values
-invalid int/str values


In [None]:
project.columns

In [None]:
# function to check column information
def col_checker(col):
    display(
        f"Displaying column: {col}",
        len(project[col]),
        list(project[col].sort_values(ascending=True).unique()),
    )

In [None]:
# col is OK, all numbers
col_checker("tircp_award_amount_($)")

In [None]:
# col is good, everything is a number
col_checker("total_project_cost")

In [None]:
# col is OK
col_checker("master_agreement_number")

In [None]:
# col is OK
col_checker("bus_count")

In [None]:
# column is OK
col_checker("project_description")

In [None]:
project[project["district"] == "VAR"]

In [None]:
# Project title OK,
col_checker("project_title")

In [None]:
# award year OK
col_checker("award_year")

In [None]:
# project num OK
col_checker("project_#")

---

In [None]:
# DROP COL
# Col is OK
col_checker("allocated_amount")

In [None]:
# NEEDS CLEANING grant_recipient need to clean
col_checker("grant_recipient")

In [None]:
# may need to clean, there are rows that say '3, 4'
col_checker("county")

In [None]:
# Move to cleaning, check what is 'VAR'. various?
# may be ok just check to make sure
project.district.unique()

In [None]:
# couldnt run col_checker, guessing because some PPNO numbers are inconsistent
# may need to clean, there is a ppno of CP052/CP053
project.ppno.unique()

### dropping allocated amount column

In [None]:
# dropping allocated amount column
project.drop(columns=["allocated_amount"], inplace=True)

In [None]:
# checking work
project.columns

### Clean `grant_recipient` column

In [None]:
list(project.grant_recipient.sort_values(ascending=True).unique())

In [None]:
agency_dict = {
    "Antelope Valley Transit Authority ": "Antelope Valley Transit Authority (AVTA)",
    "Humboldt Transit Authority": "Humboldt Transit Authority (HTA)",
    "Orange County Transportation Authority": "Orange County Transportation Authority (OCTA)",
    "Capitol Corridor Joint Powers Authority": "Capitol Corridor Joint Powers Authority (CCJPA)",
    "Los Angeles County Metropolitan Transportation Authority": "Los Angeles County Metropolitan Transportation Authority (LA Metro)",
    "Monterey-Salinas Transit": "Monterey-Salinas Transit District (MST)",
    "Sacramento Regional Transit (SacRT)": "Sacramento Regional Transit District (SacRT)",
    "Sacramento Regional Transit District": "Sacramento Regional Transit District (SacRT)",
    "Sacramento Regional Transit District (SacRT) ": "Sacramento Regional Transit District (SacRT)",
    "San Diego Association of Governments": "San Diego Association of Governments (SANDAG)",
    "Santa Clara Valley Transportation Authority (SCVTA)": "Santa Clara Valley Transportation Authority (VTA)",
    "Southern California  Regional Rail Authority (SCRRA)": "Southern California Regional Rail Authority (SCRRA - Metrolink)",
    "Southern California Regional Rail Authority": "Southern California Regional Rail Authority (SCRRA - Metrolink)",
}

In [None]:
# df.replace({'bus_desc': new_dict}, inplace=True)
project.replace({"grant_recipient": agency_dict}, inplace=True)

In [None]:
# check work. looks good
list(project["grant_recipient"].sort_values().unique())

### Cleaning `county` column

In [None]:
col_checker("county")

In [None]:
#checking specific row with '3,4' as county
project[project["county"] == "3, 4"]

In [None]:
# change county value from '3, 4' to 'VAR' like the other rows.
project.at[3, "county"] = "VAR"

In [None]:
# check work
project.iloc[3]

### Cleaning `district`column
This is good as is, no cleaning requried. All rows with VAR district has VAR in county as well.

In [None]:
#GTG
project.district.unique()

In [None]:
#GTG 
project[project["district"] == "VAR"]

### Clean `ppno` column
This should all be fine as is, no cleaning needed

In [None]:
list(project.ppno.unique())

In [None]:
#GTG 
project[project["ppno"] == "CP052/CP053"]

### Skim the project description column?
double check to ensure bus count is accurate to what the description says?

Saw that some rows mention procuring both zero and non-zero emission buses (count total buses in `bus count` and `VAR` in prop type and bus size?


In [None]:
project[
    project["project_title"]
    == "ATN FAST (Family of Advanced Solutions for Transit): Revolutionizing Transit for a Global Audience"
]

In [None]:
# iloc check
project.iloc[73]

In [None]:
# code to update value at specific index and column
project.loc[project['ppno'] == 'CP106', 'bus_count'] = 42


In [None]:
# check work
project.iloc[73]

---

## Export cleaned Project df 

In [None]:
# exproject cleaned project df
project.to_csv(
    "gs://calitp-analytics-data/data-analyses/bus_procurement_cost/tircp_project_clean.csv"
)

## Read in cleaned project data

In [None]:
project = pd.read_csv(
    "gs://calitp-analytics-data/data-analyses/bus_procurement_cost/tircp_project_clean.csv"
)

In [None]:
# ensure df is able to read in
display(project.shape, project.columns)

### filter df for project descriptions that contain bus

In [None]:
bus_only = project[project["bus_count"] > 0]

In [None]:
# this looks correct
display(project.shape, bus_only.shape)

## New column for propulsion type - `prop_type`
Use on `bus_only` df

In [None]:
prop_type = [
    "electric buses",
    "electric commuter",
    "Electric Buses",
    "battery electric",
    "Batery Electric",
    "battery-electric",
    "fuel-cell",
    "fuel cell",
    "Fuel Cell",
    "zero emission",
    "Zero Emission",
    "zero-emission electric buses",
    "zero-emission buses",
    "zero‐emission",
    "zero-emission",
    "zeroemission",
    "CNG",
    "cng",
]

In [None]:
type(prop_type)

In [None]:
# function to match keywords to list
def prop_type_finder(description):
    for keyword in prop_type:
        if keyword in description:
            return keyword
    return "not specified"

In [None]:
# add new col `prop_type`, fill it with values based on project_description using prop_type_finder function
bus_only["prop_type"] = bus_only["project_description"].apply(prop_type_finder)

In [None]:
# check work
display(
    bus_only.columns,
    bus_only["prop_type"].value_counts(),
)

In [None]:
# exploring the not specified rows
bus_only[bus_only["prop_type"] == "not specified"]
# coach-style buses, this row does not specify if buses are zero or non-zero emission bus. GOOD TO GO

In [None]:
# what is in CNG rows?
bus_only[bus_only["prop_type"] == "CNG"]
# was 4 rows, then adjusted prop list to have cng at the bottom. now showing 1 row thats actually CNG

In [None]:
# consolidate values
list(bus_only["prop_type"].sort_values(ascending=True).unique())

In [None]:
prop_dict = {
    "battery electric": "BEB",
    "battery-electric": "BEB",
    "electric buses": "electric (not specified)",
    "electric commuter": "electric (not specified)",
    "fuel cell": "FCEB",
    "fuel-cell": "FCEB",
    "zero-emission buses": "zero-emission bus (not specified)",
    "zero emission": "zero-emission bus (not specified)",
    "zero-emission": "zero-emission bus (not specified)",
    "zero‐emission": "zero-emission bus (not specified)",
}

In [None]:
# replacing prop_type values with dictionary
bus_only.replace({"prop_type": prop_dict}, inplace=True)

In [None]:
# check work
display(bus_only.prop_type.value_counts(), bus_only.head())

# looks good

## New column for bus size type - `bus_size_type`


In [None]:
bus_size = [
    "standard",
    "30-foot",
    "40 foot",
    "40-foot",
    "45-foot",
    "45 foot",
    "40ft",
    "60-foot",
    "articulated",
    "cutaway",
    "coach-style",
    "over-the-road",
    "feeder bus",
]

In [None]:
type(bus_size)

In [None]:
# re writing prop type funct for bus size
def bus_size_finder(description):
    for keyword in bus_size:
        if keyword in description:
            return keyword
    return "not specified"

In [None]:
# creating new column, filling the column using the function applied to project_desctiotion
bus_only["bus_size_type"] = bus_only["project_description"].apply(bus_size_finder)

In [None]:
# checking work
display(bus_only.columns, bus_only.bus_size_type.value_counts())

In [None]:
list(bus_only['bus_size_type'].sort_values().unique())

In [None]:
# expected that not a lot of rows specify a size type.
# will still take a random peek into some

bus_only[bus_only["bus_size_type"] == "not specified"].sample(5)

In [None]:
# consolidate
size_dict={'40 foot': 'conventional (40-ft like)' ,
 '40-foot': 'conventional (40-ft like)',
 '45-foot': 'conventional (40-ft like)',
 'coach-style':'over-the-road',
 'feeder bus': 'conventional (40-ft like)',
 }

In [None]:
type(size_dict)

In [None]:
# .replace() with size_dict to replace values in bus size col
bus_only.replace({"bus_size_type": size_dict}, inplace=True)

In [None]:
# check work
bus_only.bus_size_type.value_counts()

## export project- bus only df

In [None]:
bus_only.to_parquet(
    "gs://calitp-analytics-data/data-analyses/bus_procurement_cost/tircp_project_bus_only.parquet"
)

## Read in project bus only data


In [2]:
bus_checker = pd.read_parquet(
    "gs://calitp-analytics-data/data-analyses/bus_procurement_cost/tircp_project_bus_only.parquet"
)

In [3]:
bus_checker.shape

(35, 14)

In [4]:
bus_checker = bus_checker.assign(
    project_type = bus_checker['project_description'].apply(project_type_checker)
)


In [5]:
bus_checker["project_type"].value_counts()

includes bus and non-bus components    24
bus only                               11
Name: project_type, dtype: int64

In [6]:
# just_bus rows are all good. 
just_bus = bus_checker[bus_checker['project_type'] == "bus only"]

# bus_non_bus rows are all good
bus_non_bus = bus_checker[bus_checker['project_type'] == "includes bus and non-bus components"]

In [8]:
just_bus[["project_description", "project_type", "bus_count", "prop_type"]]

Unnamed: 0,project_description,project_type,bus_count,prop_type
0,Purchase 13 60-foot articulated BRT buses and 16 45-foot electric commuter buses,bus only,13.0,electric (not specified)
5,Purchase five 40-foot CNG buses for BRT Route linking SARTC to Metrolink/Amtrak,bus only,40.0,CNG
16,Purchase 20 zero-emission buses to extend Route 486 to the Pamona Metrolink station and increase frequencies,bus only,20.0,zero-emission bus (not specified)
34,"Acquire 112 zero-emission buses to replace existing propane vehicles and add new vehicles, in order to increase frequency of all existing DASH routes to 15-minute service and add 4 new routes, serving communities throughout the City of Los Angeles as recommended in the comprehensive Transit Service Analysis.",bus only,112.0,zero-emission bus (not specified)
51,Purchase 7 new coach-style buses to support a new intercity service that connects Redding to Sacramento. Purchase 7 new coach-style buses to support a new intercity service that connects Redding to Sacramento Shata Regional Transportation Agency Bus System to Sacramento International Airport,bus only,14.0,not specified
68,"Purchase 7 zero emission buses to enhance and extend Route 14 from Playa Vista to Inglewood, bringing new transit opportunities to disadvantaged communities, while also integrating light rail and bus services.",bus only,7.0,zero-emission bus (not specified)
70,"Purchase 7 electric buses to expand services on Line 4X (between Torrance and Downton LA), on an extended line 10 (serving the Metro Green Line Crenshaw station and the Inglewood Stadium and Entertainment District, an extended line 9 (newly serving the Kaiser Permanente South Bay Medical Center), and the acquisition of the western portion of LA Metro’s Route 130 between the Blue Line Artesia Station and the South Bay Galleria Mall.",bus only,7.0,electric (not specified)
71,Purchases 3 zero-emission electric buses to increase fleet size and extend bus service levels on 2 fixed routes in Merced county. The proposed project allows for an expansion of service frequency on one existing inter-community route connecting rural communities to the city of Merced. The route currently operates on limited frequency and is not enough to keep up with existing demand. The project also expands local service on one local route to provide better bus service to a developed residential area currently with limited access to service.,bus only,3.0,electric (not specified)
81,Purchase of 3 zero-emission buses that will support Wasco's local Dia-a-Ride shuttle services to expand service to affordable housing projects and expand overall service availability by 50%.,bus only,3.0,zero-emission bus (not specified)
95,Supports the phased development of an east-west Cross Valley Corridor by purchasing 14 zero-emission feeder buses in multiple cities in and along the corridor (as well as 16 micro-transit vehicles to be operated in selected cities) that will provide comprehensive access to the future rail system for all these communities and will connect to the California High Speed Rail system.,bus only,14.0,zero-emission bus (not specified)


In [9]:
bus_non_bus[["project_description", "project_type", "bus_count", "prop_type"]]

Unnamed: 0,project_description,project_type,bus_count,prop_type
11,"Bus rapid transit infrastructure along the MLK Corridor and Crosstown Miner Corridor, including the acquisition of 12 new zero-emission electric vehicles",includes bus and non-bus components,12.0,zero-emission bus (not specified)
29,"Deploys 40 zero-emission electric buses to double service levels on up to 8 routes, add 2 new routes; Implements a new circulator/on-demand first-mile/last-mile service; and construction of a new maintenance facility with solar canopy structures.",includes bus and non-bus components,40.0,electric (not specified)
30,"Deploys 7 zero-emission battery electric buses and upgrades charging infrastructure serving AVTA local and commuter bus routes, bringing the entire AVTA system to fully electric status (the first in the nation) by 2019; Deploys 5 zero-emission battery electric buses and related infrastructure for Long Beach Transit services. Increased frequency on up to 5 local and community transit routes operated by LBT.",includes bus and non-bus components,7.0,electric (not specified)
33,Purchase of 6 zero-emission battery-electric buses and the construction of charging infrastructure to allow extension of 15-min service connecting Southwest Fresno to the northern part of Fresno and creating a new route providing access to job centers.,includes bus and non-bus components,6.0,electric (not specified)
35,Construction- Purchase 10- 40 foot battery electric buses,includes bus and non-bus components,10.0,electric (not specified)
52,"Purchases 13 electric buses and funds capital improvements including new bus stops, pedestrian crossings, and charging infrastructure",includes bus and non-bus components,13.0,electric (not specified)
56,Purchase of 11 zero emission battery electric buses and supportive charging infrastructure to allow for expansion of the zero-emission bus fleet and implement a new zero-emission microtransit service that is fully integrated into local and regional intermodal transit networks.,includes bus and non-bus components,11.0,electric (not specified)
60,"Construction of a new transit center in Clearlake and purchase 4 hydrogen fuel-cell buses with associated infrastructure. The project would expand service to out of county destinations, including the Sonoma County Airport and the Santa Rosa Bus Terminal in Downtown Santa Rosa. Hydrogen fuel cell technology is used in order to allow extended range services to be operated, contributing to increased ridership.",includes bus and non-bus components,4.0,FCEB
61,"Purchase of 5 zero-emission battery- electric buses and the construction of charging infrastructure to create a zero-emission over-the-road coach commuter route between the Greater Long Beach area and the University of California, Los Angeles (UCLA).",includes bus and non-bus components,5.0,electric (not specified)
73,"Creates a zero-emission transit ecosystem that offers end-to-end solutions for residents, employees and the global audience drawn by tourism/convention centers and the LA 2028 Summer Olympics events. Project components include (1) purchase of 7 zero-emission battery electric vans to implement a new service connecting John Wayne Airport to Anaheim, (2) purchase of 10 electric vehicles and associated infrastructure to expand on-demand micro transit services into new neighborhoods and service areas, (3) purchase of 15 zeroemission buses to replace existing buses and augment existing routes, including installation of photovoltaic electricity generation at two facilities, and (4) purchase of 10 additional zero-emission buses for a new east/west connector service.",includes bus and non-bus components,42.0,BEB


## DEPRECATED - Data Analysis
see `cost_per_bus_analysis` notebook

### Consolidate up grant recipient name

### aggregate up

In [None]:
# aggregate # of buses and allocation by transit agency
# bus_cost = only_bus.groupby('grant_recipient').agg({
#    '#_of_buses':"sum",
#    'allocation_amount':'sum'
# }).reset_index()

bus_cost = (
    bus_only.groupby("grant_recipient")
    .agg({"bus_count": "sum", "tircp_award_amount_($)": "sum"})
    .reset_index()
)

In [None]:
# confirm aggregation worked
bus_cost

### create new cost per bus column

In [None]:
bus_cost["cost_per_bus"] = (
    bus_cost["tircp_award_amount_($)"] / bus_cost["bus_count"]
).astype("int64")

In [None]:
# confirm new column was created and values were populated
bus_cost.sort_values("cost_per_bus")

### Export cost per bus via project tracking sheet to gcs

In [None]:
bus_cost.to_csv(
    "gs://calitp-analytics-data/data-analyses/bus_procurement_cost/tircp_project_cost_per_bus.csv"
)