In [3]:
import matplotlib.pyplot as plt
import pandas as pd
import shared_utils

# Raise the row display limit to 200 so an entire DataFrame can be viewed in one go.
pd.set_option("display.max_rows", 200)

## AGREEMENT ALLOCATIONS SHEET DATA

### Agreement Allocations - Read in Raw data

In [None]:
url = "gs://calitp-analytics-data/data-analyses/bus_procurement_cost/TIRCP Tracking Sheets 2_1-10-2024.xlsx"
sheet_name = "Agreement Allocations"

In [None]:
tircp = pd.read_excel(url, sheet_name)

### Agreement Allocations -Data Cleaning and QC

In [None]:
# Reduce the initial df to its first 12 columns (one per name in the new_col list).
tircp = tircp.iloc[:, :12]

In [None]:
# List (not a dictionary) of replacement column names.
# It is assigned positionally to tircp.columns, so the order must match the sheet's column order.
new_col = [
    "award_year",
    "project_#",
    "grant_recipient",
    "implementing_agency",
    "ppno",
    "project_id",
    "ea",
    "components",
    "#_of_buses",
    "phase",
    "allocation_amount",
    "expended_amount",
]

In [None]:
tircp.columns = new_col
tircp.columns

In [None]:
tircp = tircp.drop("expended_amount", axis=1)

In [None]:
# Open question: do NaN values need to be filled with zero before the bus column can be summed?
# Try summing the bus column directly to find out.
tircp.agg({"#_of_buses": "sum"})
# Author's conclusion: no fill is needed; the sum above is correct.

In [None]:
display(tircp.shape, list(tircp.columns), tircp.head())

In [None]:
tircp.grant_recipient.nunique()

In [None]:
# Trim surrounding whitespace so differently padded spellings of a name collapse together
tircp = tircp.assign(grant_recipient=tircp["grant_recipient"].str.strip())

tircp["grant_recipient"].nunique()

In [None]:
# see list of unique names
# may be able to consolidate a few
tircp.grant_recipient.sort_values().unique()

In [None]:
# Maps variant spellings of a grant recipient (key) to the single name used after cleaning (value).
new_dict = {
    "Antelope Valley Transit Authority": "Antelope Valley Transit Authority (AVTA)",
    "Bay Area Rapid Transit District": "Bay Area Rapid Transit (BART)",
    "Capitol Corridor Joint Powers Authority": "Capitol Corridor Joint Powers Authority (CCJPA)",
    "Los Angeles County Metropolitan Transportation (LA Metro)": "Los Angeles County Metropolitan Transportation Authority (LA Metro)",
    "Los Angeles County Metropolitan Transportation Authority": "Los Angeles County Metropolitan Transportation Authority (LA Metro)",
    "Sacramento Regional Transit (SacRT)": "Sacramento Regional Transit District (SacRT)",
    "Sacramento Regional Transit District": "Sacramento Regional Transit District (SacRT)",
    "San Diego Metropolitan Transit System (SDMTS)": "San Diego Metropolitan Transit System (MTS)",
    "San Francisco Bay Area Water Emergency Transportation Authority": "San Francisco Bay Area Water Emergency Transportation Authority (WETA)",
    # NOTE(review): the key says "Agency" but the replacement says "Authority" - confirm which spelling is intended.
    "San Francisco Municipal Transportation Agency": "San Francisco Municipal Transportation Authority (SFMTA)",
    # This key contains a literal newline between the name and the abbreviation.
    "Santa Barbara County Association of Governments\n(SBCAG)": "Santa Barbara County Association of Governments (SBCAG)",
    "Santa Clara Valley Transportation Authority": "Santa Clara Valley Transportation Authority (VTA)",
    "Transportation Agency for Monterey County": "Transportation Agency for Monterey County (TAMC)",
}

In [None]:
# Map each variant spelling in grant_recipient to its consolidated name from new_dict
tircp = tircp.assign(grant_recipient=tircp["grant_recipient"].replace(new_dict))

In [None]:
#see that some rows were consolidated
display(tircp.grant_recipient.nunique())

### Agreement Allocations-Export Cleaned data

In [None]:
tircp.to_csv(
    "gs://calitp-analytics-data/data-analyses/bus_procurement_cost/tircp_allocations_clean.csv"
)

### Agreement Allocations-Read in Cleaned data from GCS

In [None]:
# The cleaned file was exported with the DataFrame index as its first, unnamed column.
# Read that column back as the index so it does not reappear as a stray "Unnamed: 0" data column.
tircp = pd.read_csv(
    "gs://calitp-analytics-data/data-analyses/bus_procurement_cost/tircp_allocations_clean.csv",
    index_col=0,
)

In [None]:
display(tircp.shape, tircp.columns, tircp.head())

### Agreement Allocations-Cost per Bus, per agency

In [None]:
# Filter to projects that have a bus count value.
# Caveat: some rows in the "components" column state some variation of "purchased buses" but do not specify the number of buses.
# Only rows stating the specific number of buses purchased are included.
only_bus = tircp[tircp['#_of_buses']>0]


In [None]:
display(only_bus.shape)

In [None]:
# Total bus count and total allocation per transit agency, one row per grant recipient
bus_cost = (
    only_bus
    .groupby('grant_recipient', as_index=False)
    .agg({'#_of_buses': 'sum', 'allocation_amount': 'sum'})
)

In [None]:
bus_cost

In [None]:
# Allocation divided by bus count, truncated to a whole-number int64
bus_cost['cost_per_bus'] = bus_cost['allocation_amount'].div(bus_cost['#_of_buses']).astype('int64')

In [None]:
display(bus_cost.dtypes,bus_cost)

In [None]:
#exporting cost per bus
bus_cost.to_csv("gs://calitp-analytics-data/data-analyses/bus_procurement_cost/tircp_allocation_cost_per_bus.csv")

### Agreement Allocations - Stat analysis

In [None]:
bus_cost

In [None]:
# Histogram of cost per bus across agencies, normalised to a density.
# The original `plt.figure(` was missing its closing parenthesis, so the cell could not run.
plt.figure()
plt.hist(bus_cost['cost_per_bus'], density=True)
plt.title("Distribution of cost per bus (Agreement Allocations)")
plt.xlabel("cost_per_bus")
plt.ylabel("density")
plt.show()

## PROJECT TRACKING SHEET DATA

### project tracking -  read raw data


In [4]:
url = "gs://calitp-analytics-data/data-analyses/bus_procurement_cost/TIRCP Tracking Sheets 2_1-10-2024.xlsx"
sheet_name = "Project Tracking"

In [5]:
project = pd.read_excel(url, sheet_name)

  warn(msg)


In [6]:
display(
    project.shape,
    project.columns,
    project.dtypes,
)

(124, 49)

Index(['Award Year', 'Project #', 'Grant Recipient', 'Project Title', 'PPNO',
       'District', 'County', 'Project Description', 'bus_count',
       'Master Agreement Number', 'Master Agreement Expiration Date',
       'Project Manager', 'Regional Coordinator',
       'Technical Assistance-CALITP (Y/N)', 'Technical Assistance-Fleet (Y/N)',
       'Technical Assistance-Network Integration (Y/N)',
       'Technical Assistance-Priority Population (Y/N)', 'Total Project Cost',
       'TIRCP Award Amount ($)', 'Allocated Amount', 'Unallocated Amount',
       'Percentage Allocated', 'Expended Amount', 'Other Funds Involved',
       'Award Cycle', 'Is SB1?', 'Is GGRF?', 'Is IIJA?', 'ON SHS?', 'CalITP',
       'Estimated TIRCP GHG Reductions', 'Estemated Project Completion',
       'Estimated TIRCP GHG Reductions2', 'Increased Ridership',
       'Service Integration', 'Improve Safety', 'Project Readiness',
       'Funding Leverage', 'Multi-Agency Coordination/Integration',
       'AB 1550 Com

Award Year                                          int64
Project #                                           int64
Grant Recipient                                    object
Project Title                                      object
PPNO                                               object
District                                           object
County                                             object
Project Description                                object
bus_count                                         float64
Master Agreement Number                            object
Master Agreement Expiration Date                   object
Project Manager                                    object
Regional Coordinator                               object
Technical Assistance-CALITP (Y/N)                  object
Technical Assistance-Fleet (Y/N)                   object
Technical Assistance-Network Integration (Y/N)     object
Technical Assistance-Priority Population (Y/N)     object
Total Project 

## Project Tracking- data cleaning

### data frame cleaning

In [7]:
# Keep only the first 20 columns of the Project Tracking sheet.
project = project.iloc[:, :20]

In [8]:
list(project.columns)

['Award Year',
 'Project #',
 'Grant Recipient',
 'Project Title',
 'PPNO',
 'District',
 'County',
 'Project Description',
 'bus_count',
 'Master Agreement Number',
 'Master Agreement Expiration Date',
 'Project Manager',
 'Regional Coordinator',
 'Technical Assistance-CALITP (Y/N)',
 'Technical Assistance-Fleet (Y/N)',
 'Technical Assistance-Network Integration (Y/N)',
 'Technical Assistance-Priority Population (Y/N)',
 'Total Project Cost',
 'TIRCP Award Amount ($)',
 'Allocated Amount']

In [9]:
#drop specific columns
drop_col=[
 'Master Agreement Expiration Date',
 'Project Manager',
 'Regional Coordinator',
 'Technical Assistance-CALITP (Y/N)',
 'Technical Assistance-Fleet (Y/N)',
 'Technical Assistance-Network Integration (Y/N)',
 'Technical Assistance-Priority Population (Y/N)',]

In [10]:
# Remove the columns listed in drop_col, rebinding project to the trimmed frame
project = project.drop(columns=drop_col)

In [12]:
len(project.columns)

13

In [13]:
# Tidy the column names: spaces become underscores, then everything is lower-cased
project.columns = project.columns.str.replace(' ', '_').str.lower()

In [14]:
#check work
project.columns

Index(['award_year', 'project_#', 'grant_recipient', 'project_title', 'ppno',
       'district', 'county', 'project_description', 'bus_count',
       'master_agreement_number', 'total_project_cost',
       'tircp_award_amount_($)', 'allocated_amount'],
      dtype='object')

### check columns
Check the values of every column for:

- any duplicate values
- invalid int/str values


In [16]:
project.columns

Index(['award_year', 'project_#', 'grant_recipient', 'project_title', 'ppno',
       'district', 'county', 'project_description', 'bus_count',
       'master_agreement_number', 'total_project_cost',
       'tircp_award_amount_($)', 'allocated_amount'],
      dtype='object')

In [49]:
#function to check column information

def col_checker(col, df=None):
    """Display a quick summary of one column for manual QC.

    Shows three things: a label naming the column, the number of rows in the
    column (its length, not the number of distinct values), and the sorted
    list of its unique values.

    Parameters
    ----------
    col : str
        Name of the column to inspect.
    df : pandas.DataFrame, optional
        Frame to inspect. Defaults to the notebook-level ``project`` frame,
        which keeps existing ``col_checker('name')`` calls working.
    """
    frame = project if df is None else df
    values = frame[col]
    try:
        ordered = values.sort_values(ascending=True)
    except TypeError:
        # Columns mixing types (e.g. ints and strings) cannot be compared
        # directly; fall back to ordering by each value's string form.
        ordered = values.sort_values(ascending=True, key=lambda s: s.astype(str))
    display(f'Displaying column: {col}',
        len(values),
        list(ordered.unique())
           )

In [None]:
#Col is OK, but will drop
col_checker('allocated_amount')

In [None]:
#col is OK, all numbers
col_checker('tircp_award_amount_($)')

In [None]:
#col is good, everything is a number
col_checker('total_project_cost')

In [None]:
#col is OK
col_checker('master_agreement_number')

In [None]:
#col is OK
col_checker('bus_count')

In [None]:
# column is OK
col_checker('project_description')

In [None]:
#may need to clean, there are rows that say '3, 4' 
col_checker('county')

In [None]:
#Move to cleaning, check what is 'VAR'. various?
#may be ok just check to make sure
project.district.unique()

In [None]:
project[project['district']=='VAR']

In [None]:
# col_checker could not be run on this column, presumably because ppno mixes value types that cannot be sorted together (TODO confirm).
# May need cleaning: there is a ppno of CP052/CP053.
project.ppno.unique()

In [None]:
#Project title OK, 
col_checker('project_title')

In [None]:
# grant_recipient need to clean
col_checker('grant_recipient')

In [None]:
#award year OK
col_checker('award_year')

In [None]:
#project num OK
col_checker('project_#')

### Cleaning 

### Filter df to projects with a bus count greater than zero

In [None]:
bus_only = project[project['bus_count']>0]

In [None]:
#this looks correct
display(project.shape,
        bus_only.shape)

### export project- bus only df

In [None]:
bus_only.to_csv('gs://calitp-analytics-data/data-analyses/bus_procurement_cost/tircp_project_bus_only.csv')

### Read in project bus only data


In [2]:
# The bus-only file was exported with the DataFrame index as its first, unnamed column.
# Read that column back as the index so it does not reappear as a stray "Unnamed: 0" data column.
bus_only = pd.read_csv(
    'gs://calitp-analytics-data/data-analyses/bus_procurement_cost/tircp_project_bus_only.csv',
    index_col=0,
)

In [None]:
display(bus_only.shape,
        bus_only.columns,
        bus_only.head())

In [None]:
#inspect columns values.
list(bus_only['grant_recipient'].sort_values().unique())
#everything looks good

### Consolidate grant recipient names

In [3]:
# Maps variant spellings of a grant recipient (key) to the consolidated name with its abbreviation (value).
# NOTE: the first key ends with a trailing space, so it only matches values that carry the same trailing space.
new_dict ={
 'Antelope Valley Transit Authority ':'Antelope Valley Transit Authority (AVTA)',
 'Humboldt Transit Authority':'Humboldt Transit Authority (HTA)',
 'Orange County Transportation Authority':'Orange County Transportation Authority (OCTA)',
}

In [4]:
# Trim stray whitespace from both the recipient names and the lookup keys before mapping,
# so a match never depends on a trailing space (one key in new_dict ends with one).
# The result is rebound rather than mutated in place, which keeps the cell safe to re-run.
recipient_map = {variant.strip(): canonical for variant, canonical in new_dict.items()}
bus_only = bus_only.assign(
    grant_recipient=bus_only['grant_recipient'].str.strip().replace(recipient_map)
)

In [None]:
list(bus_only['grant_recipient'].sort_values().unique())

### aggregate up

In [5]:
# Total bus count and total TIRCP award amount per grant recipient, one row per recipient
bus_cost = (
    bus_only
    .groupby('grant_recipient', as_index=False)
    .agg({'bus_count': 'sum', 'tircp_award_amount_($)': 'sum'})
)

In [None]:
#confirm aggregation worked
bus_cost

## create new cost per bus column

In [6]:
# Award amount divided by bus count, truncated to a whole-number int64
bus_cost['cost_per_bus'] = bus_cost['tircp_award_amount_($)'].div(bus_cost['bus_count']).astype('int64')

In [7]:
#confirm new column was created and values were populated
bus_cost.sort_values('cost_per_bus')

Unnamed: 0,grant_recipient,bus_count,tircp_award_amount_($),cost_per_bus
22,Santa Monica Big Blue Bus,7.0,1105000,157857
7,City of Santa Monica,113.0,26027000,230327
12,Foothill Transit,20.0,5000000,250000
5,City of Los Angeles (LA DOT),112.0,36104000,322357
10,City of Wasco,3.0,1000000,333333
6,City of Pasadena,40.0,14424000,360600
19,San Joaquin Regional Transit District (SJRTD),12.0,6841000,570083
18,Orange County Transportation Authority (OCTA),73.0,41727000,571602
23,Shasta Regional Transportation Agency (SRTA),14.0,8641000,617214
11,Culver City,5.0,3247000,649400


## Export cost per bus via project tracking sheet to gcs

In [8]:
bus_cost.to_csv('gs://calitp-analytics-data/data-analyses/bus_procurement_cost/tircp_project_cost_per_bus.csv')