## Intro

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import shared_utils

# set_option to increase max rows displayed to 200, to see entire df in 1 go/
pd.set_option("display.max_rows", 200)

## Agreement Allocations - Read in Raw data

In [None]:
url = "gs://calitp-analytics-data/data-analyses/bus_procurement_cost/TIRCP Tracking Sheets 2_1-10-2024.xlsx"
sheet_name = "Agreement Allocations"

In [None]:
tircp = pd.read_excel(url, sheet_name)

## Agreement Allocations -Data Cleaning and QC

In [None]:
# reducing initialdf to first 11 columns.
tircp = tircp.iloc[:, :12]

In [None]:
# dictionary for column name update
new_col = [
    "award_year",
    "project_#",
    "grant_recipient",
    "implementing_agency",
    "ppno",
    "project_id",
    "ea",
    "components",
    "#_of_buses",
    "phase",
    "allocation_amount",
    "expended_amount",
]

In [None]:
tircp.columns = new_col
tircp.columns

In [None]:
tircp = tircp.drop("expended_amount", axis=1)

In [None]:
# fill NaN with zero?
# see if you can sum the bus column
tircp.agg({"#_of_buses": "sum"})
# nope this is correct

In [None]:
display(tircp.shape, list(tircp.columns), tircp.head())

In [None]:
tircp.grant_recipient.nunique()

In [None]:
# use strip to help combine names
tircp["grant_recipient"] = tircp["grant_recipient"].str.strip()

tircp.grant_recipient.nunique()

In [None]:
# see list of unique names
# may be able to consolidate a few
tircp.grant_recipient.sort_values().unique()

In [None]:
new_dict = {
    "Antelope Valley Transit Authority": "Antelope Valley Transit Authority (AVTA)",
    "Bay Area Rapid Transit District": "Bay Area Rapid Transit (BART)",
    "Capitol Corridor Joint Powers Authority": "Capitol Corridor Joint Powers Authority (CCJPA)",
    "Los Angeles County Metropolitan Transportation (LA Metro)": "Los Angeles County Metropolitan Transportation Authority (LA Metro)",
    "Los Angeles County Metropolitan Transportation Authority": "Los Angeles County Metropolitan Transportation Authority (LA Metro)",
    "Sacramento Regional Transit (SacRT)": "Sacramento Regional Transit District (SacRT)",
    "Sacramento Regional Transit District": "Sacramento Regional Transit District (SacRT)",
    "San Diego Metropolitan Transit System (SDMTS)": "San Diego Metropolitan Transit System (MTS)",
    "San Francisco Bay Area Water Emergency Transportation Authority": "San Francisco Bay Area Water Emergency Transportation Authority (WETA)",
    "San Francisco Municipal Transportation Agency": "San Francisco Municipal Transportation Authority (SFMTA)",
    "Santa Barbara County Association of Governments\n(SBCAG)": "Santa Barbara County Association of Governments (SBCAG)",
    "Santa Clara Valley Transportation Authority": "Santa Clara Valley Transportation Authority (VTA)",
    "Transportation Agency for Monterey County": "Transportation Agency for Monterey County (TAMC)",
}

In [None]:
# replace the values in grant_recipient using dict
# df.replace({'bus_desc': new_dict}, inplace=True)
tircp = tircp.replace({"grant_recipient": new_dict})

In [None]:
#see that some rows were consolidated
display(tircp.grant_recipient.nunique())

## Agreement Allocations-Export Cleaned data

In [None]:
tircp.to_csv(
    "gs://calitp-analytics-data/data-analyses/bus_procurement_cost/tircp_allocations_clean.csv"
)

## Agreement Allocations-Read in Cleaned data from GCS

In [None]:
tircp = pd.read_csv(
    "gs://calitp-analytics-data/data-analyses/bus_procurement_cost/tircp_allocations_clean.csv"
)

In [None]:
display(tircp.shape, tircp.columns, tircp.head())

## Agreement Allocations-Cost per Bus, per agency

In [None]:
# filer to project with bus count values
# caveat: some rows in "component" column state some variation of "purchased buses", but did not specify the amount of buses. 
# only rows stating the specificy number of buses purchased are included
only_bus = tircp[tircp['#_of_buses']>0]


In [None]:
display(only_bus.shape)

In [None]:
#aggregate # of buses and allocation by transit agency
bus_cost = only_bus.groupby('grant_recipient').agg({
    '#_of_buses':"sum",
    'allocation_amount':'sum'
}).reset_index()

In [None]:
bus_cost

In [None]:
bus_cost['cost_per_bus']= ((bus_cost['allocation_amount'])/(bus_cost['#_of_buses'])).astype('int64')

In [None]:
display(bus_cost.dtypes,bus_cost)

In [None]:
#exporting cost per bus
bus_cost.to_csv("gs://calitp-analytics-data/data-analyses/bus_procurement_cost/tircp_allocation_cost_per_bus.csv")

## Agreement Allocations - Stat analysis

In [None]:
bus_cost

In [None]:
plt.figure(
plt.hist(bus_cost['cost_per_bus'],density=True)
plt.show()

## project tracking -  read raw data


In [2]:
url = "gs://calitp-analytics-data/data-analyses/bus_procurement_cost/TIRCP Tracking Sheets 2_1-10-2024.xlsx"
sheet_name = "Project Tracking"

In [None]:
project = pd.read_excel(url, sheet_name)

In [None]:
display(
    project.shape,
    project.columns,
    project.dtypes,
)

## Project Tracking- data cleaning

### initial data cleaning

In [None]:
#drop columns
#tircp = tircp.iloc[:, :12]
project = project.iloc[:, :20]

In [None]:
list(project.columns)

In [None]:
drop_col=[
 'Master Agreement Expiration Date',
 'Project Manager',
 'Regional Coordinator',
 'Technical Assistance-CALITP (Y/N)',
 'Technical Assistance-Fleet (Y/N)',
 'Technical Assistance-Network Integration (Y/N)',
 'Technical Assistance-Priority Population (Y/N)',]

In [None]:
project.drop(columns=drop_col, inplace=True)

In [None]:
list(project.columns)

In [None]:
#replace space with _
project.columns =project.columns.str.replace(' ','_')

In [None]:
#lower case everything
project.columns=project.columns.str.lower()

In [None]:
#check work
project.columns

### filter df for project descriptions that contain bus

In [None]:
bus_only = project[project['bus_count']>0]

In [None]:
#this looks correct
display(project.shape,
        bus_only.shape)

### export project- bus only df

In [None]:
bus_only.to_csv('gs://calitp-analytics-data/data-analyses/bus_procurement_cost/tircp_project_bus_only.csv')

### Read in project bus only data


In [3]:
bus_only= pd.read_csv('gs://calitp-analytics-data/data-analyses/bus_procurement_cost/tircp_project_bus_only.csv')

In [4]:
display(bus_only.shape,
        bus_only.columns,
        bus_only.head())

(35, 14)

Index(['Unnamed: 0', 'award_year', 'project_#', 'grant_recipient',
       'project_title', 'ppno', 'district', 'county', 'project_description',
       'bus_count', 'master_agreement_number', 'total_project_cost',
       'tircp_award_amount_($)', 'allocated_amount'],
      dtype='object')

Unnamed: 0.1,Unnamed: 0,award_year,project_#,grant_recipient,project_title,ppno,district,county,project_description,bus_count,master_agreement_number,total_project_cost,tircp_award_amount_($),allocated_amount
0,0,2015,1,Antelope Valley Transit Authority,Regional Transit Interconnectivity & Environme...,CP005,7,LA,Purchase 13 60-foot articulated BRT buses and ...,13.0,64AVTA2015MA,39478000,24403000,24403000
1,5,2015,6,Orange County Transportation Authority,Bravo! Route 560 Rapid Buses,CP004,12,ORA,Purchase five 40-foot CNG buses for BRT Route ...,40.0,64OCTAMA,2900000,2320000,2320000
2,11,2015,12,San Joaquin Regional Transit District (SJRTD),BRT Expansion: MLK Corridor and Crosstown Mine...,CP011,10,SJ,Bus rapid transit infrastructure along the MLK...,12.0,64SJRRCMA A1,19118776,6841000,6841000
3,16,2016,3,Foothill Transit,"Transforming California: Bus Electrification, ...",CP076,7,LA,Purchase 20 zero-emission buses to extend Rout...,20.0,64FOOTHILLMA,16580000,5000000,5000000
4,29,2018,2,Anaheim Transportation Network (ATN),#Electrify Anaheim: Changing the Transit Parad...,CP027,12,ORA,Deploys 40 zero-emission electric buses to dou...,40.0,64ATNMA A1,45201000,28617000,28617000


In [21]:
#inspect columns values.
list(bus_only['grant_recipient'].sort_values().unique())
#everything looks good

TypeError: 'int' object is not iterable

### Consolidate up grant recipient name

In [22]:
new_dict ={
 'Antelope Valley Transit Authority ':'Antelope Valley Transit Authority (AVTA)',
 'Humboldt Transit Authority':'Humboldt Transit Authority (HTA)',
 'Orange County Transportation Authority':'Orange County Transportation Authority (OCTA)',
}

In [26]:
#df.replace({'bus_desc': new_dict}, inplace=True)
bus_only.replace({'grant_recipient': new_dict}, inplace=True)

In [31]:
list(bus_only['grant_recipient'].sort_values().unique())

['Anaheim Transportation Network (ATN)',
 'Antelope Valley Transit Authority (AVTA)',
 'Antelope Valley Transit Authority (AVTA) & Long Beach Transit (LBT)',
 'City of Fresno',
 'City of Glendale and Arroyo Verdugo Communities',
 'City of Los Angeles (LA DOT)',
 'City of Pasadena',
 'City of Santa Monica',
 'City of Simi Valley',
 'City of Torrance',
 'City of Wasco',
 'Culver City',
 'Foothill Transit',
 'Humboldt Transit Authority (HTA)',
 'Humboldt Transit Authority (HTA) with Yurok Tribe and Redwood Coast Transit Authority',
 'Lake Transit Authority (LTA)',
 'Long Beach Transit (LBT)',
 'Los Angeles County Metropolitan Transportation Authority (LA Metro)',
 'Orange County Transportation Authority (OCTA)',
 'San Joaquin Regional Transit District (SJRTD)',
 'Santa Barbara Metropolitan Transit District (SBMTD) ',
 'Santa Cruz Metropolitan Transit District (Metro)',
 'Santa Monica Big Blue Bus',
 'Shasta Regional Transportation Agency (SRTA)',
 'Solano Transportation Authority (STA)',


### aggregate up

In [32]:
#aggregate # of buses and allocation by transit agency
#bus_cost = only_bus.groupby('grant_recipient').agg({
#    '#_of_buses':"sum",
#    'allocation_amount':'sum'
#}).reset_index()

bus_cost = bus_only.groupby('grant_recipient').agg({
    'bus_count':'sum',
    'tircp_award_amount_($)': 'sum'
}).reset_index()

In [33]:
#confirm aggregation worked
bus_cost

Unnamed: 0,grant_recipient,bus_count,tircp_award_amount_($)
0,Anaheim Transportation Network (ATN),65.0,51395000
1,Antelope Valley Transit Authority (AVTA),36.0,35735000
2,Antelope Valley Transit Authority (AVTA) & Lon...,7.0,13156000
3,City of Fresno,6.0,7798000
4,City of Glendale and Arroyo Verdugo Communities,27.0,34648000
5,City of Los Angeles (LA DOT),112.0,36104000
6,City of Pasadena,40.0,14424000
7,City of Santa Monica,113.0,26027000
8,City of Simi Valley,6.0,7053000
9,City of Torrance,10.0,96000000


## create new cost per bus column

In [34]:
bus_cost['cost_per_bus']= (bus_cost['tircp_award_amount_($)']/bus_cost['bus_count']).astype('int64')

In [36]:
#confirm new column was created and values were populated
bus_cost.sort_values('cost_per_bus')

Unnamed: 0,grant_recipient,bus_count,tircp_award_amount_($),cost_per_bus
22,Santa Monica Big Blue Bus,7.0,1105000,157857
7,City of Santa Monica,113.0,26027000,230327
12,Foothill Transit,20.0,5000000,250000
5,City of Los Angeles (LA DOT),112.0,36104000,322357
10,City of Wasco,3.0,1000000,333333
6,City of Pasadena,40.0,14424000,360600
19,San Joaquin Regional Transit District (SJRTD),12.0,6841000,570083
18,Orange County Transportation Authority (OCTA),73.0,41727000,571602
23,Shasta Regional Transportation Agency (SRTA),14.0,8641000,617214
11,Culver City,5.0,3247000,649400


## Export cost per bus via project tracking sheet to gcs

In [37]:
bus_cost.to_csv('gs://calitp-analytics-data/data-analyses/bus_procurement_cost/tircp_project_cost_per_bus')