In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import shared_utils

# set_option to increase max rows displayed to 200, to see entire df in 1 go/
pd.set_option("display.max_rows", 200)

ModuleNotFoundError: No module named 'shared_utils'

## AGREEMENT ALLOCATIONS SHEET DATA

### Agreement Allocations - Read in Raw data

In [None]:
url = "gs://calitp-analytics-data/data-analyses/bus_procurement_cost/TIRCP Tracking Sheets 2_1-10-2024.xlsx"
sheet_name = "Agreement Allocations"

In [None]:
tircp = pd.read_excel(url, sheet_name)

### Agreement Allocations -Data Cleaning and QC

In [None]:
# reducing initialdf to first 11 columns.
tircp = tircp.iloc[:, :12]

In [None]:
# dictionary for column name update
new_col = [
    "award_year",
    "project_#",
    "grant_recipient",
    "implementing_agency",
    "ppno",
    "project_id",
    "ea",
    "components",
    "#_of_buses",
    "phase",
    "allocation_amount",
    "expended_amount",
]

In [None]:
tircp.columns = new_col
tircp.columns

In [None]:
tircp = tircp.drop("expended_amount", axis=1)

In [None]:
# fill NaN with zero?
# see if you can sum the bus column
tircp.agg({"#_of_buses": "sum"})
# nope this is correct

In [None]:
display(tircp.shape, list(tircp.columns), tircp.head())

In [None]:
tircp.grant_recipient.nunique()

In [None]:
# use strip to help combine names
tircp["grant_recipient"] = tircp["grant_recipient"].str.strip()

tircp.grant_recipient.nunique()

In [None]:
# see list of unique names
# may be able to consolidate a few
tircp.grant_recipient.sort_values().unique()

In [None]:
new_dict = {
    "Antelope Valley Transit Authority": "Antelope Valley Transit Authority (AVTA)",
    "Bay Area Rapid Transit District": "Bay Area Rapid Transit (BART)",
    "Capitol Corridor Joint Powers Authority": "Capitol Corridor Joint Powers Authority (CCJPA)",
    "Los Angeles County Metropolitan Transportation (LA Metro)": "Los Angeles County Metropolitan Transportation Authority (LA Metro)",
    "Los Angeles County Metropolitan Transportation Authority": "Los Angeles County Metropolitan Transportation Authority (LA Metro)",
    "Sacramento Regional Transit (SacRT)": "Sacramento Regional Transit District (SacRT)",
    "Sacramento Regional Transit District": "Sacramento Regional Transit District (SacRT)",
    "San Diego Metropolitan Transit System (SDMTS)": "San Diego Metropolitan Transit System (MTS)",
    "San Francisco Bay Area Water Emergency Transportation Authority": "San Francisco Bay Area Water Emergency Transportation Authority (WETA)",
    "San Francisco Municipal Transportation Agency": "San Francisco Municipal Transportation Authority (SFMTA)",
    "Santa Barbara County Association of Governments\n(SBCAG)": "Santa Barbara County Association of Governments (SBCAG)",
    "Santa Clara Valley Transportation Authority": "Santa Clara Valley Transportation Authority (VTA)",
    "Transportation Agency for Monterey County": "Transportation Agency for Monterey County (TAMC)",
}

In [None]:
# replace the values in grant_recipient using dict
# df.replace({'bus_desc': new_dict}, inplace=True)
tircp = tircp.replace({"grant_recipient": new_dict})

In [None]:
#see that some rows were consolidated
display(tircp.grant_recipient.nunique())

### Agreement Allocations-Export Cleaned data

In [None]:
tircp.to_csv(
    "gs://calitp-analytics-data/data-analyses/bus_procurement_cost/tircp_allocations_clean.csv"
)

### Agreement Allocations-Read in Cleaned data from GCS

In [None]:
tircp = pd.read_csv(
    "gs://calitp-analytics-data/data-analyses/bus_procurement_cost/tircp_allocations_clean.csv"
)

In [None]:
display(tircp.shape, tircp.columns, tircp.head())

### Agreement Allocations-Cost per Bus, per agency

In [None]:
# filer to project with bus count values
# caveat: some rows in "component" column state some variation of "purchased buses", but did not specify the amount of buses. 
# only rows stating the specificy number of buses purchased are included
only_bus = tircp[tircp['#_of_buses']>0]


In [None]:
display(only_bus.shape)

In [None]:
#aggregate # of buses and allocation by transit agency
bus_cost = only_bus.groupby('grant_recipient').agg({
    '#_of_buses':"sum",
    'allocation_amount':'sum'
}).reset_index()

In [None]:
bus_cost

In [None]:
bus_cost['cost_per_bus']= ((bus_cost['allocation_amount'])/(bus_cost['#_of_buses'])).astype('int64')

In [None]:
display(bus_cost.dtypes,bus_cost)

In [None]:
#exporting cost per bus
bus_cost.to_csv("gs://calitp-analytics-data/data-analyses/bus_procurement_cost/tircp_allocation_cost_per_bus.csv")

### Agreement Allocations - Stat analysis

In [None]:
bus_cost

In [None]:
plt.figure(
plt.hist(bus_cost['cost_per_bus'],density=True)
plt.show()

## PROJECT TRACKING SHEET DATA

### project tracking -  read raw data


In [None]:
url = "gs://calitp-analytics-data/data-analyses/bus_procurement_cost/TIRCP Tracking Sheets 2_1-10-2024.xlsx"
sheet_name = "Project Tracking"

In [None]:
project = pd.read_excel(url, sheet_name)

In [None]:
display(
    project.shape,
    project.columns,
    project.dtypes,
)

## Project Tracking- data cleaning

### data frame cleaning

In [None]:
#only keep first couple of columns
#tircp = tircp.iloc[:, :12]
project = project.iloc[:, :20]

In [None]:
list(project.columns)

In [None]:
#drop specific columns
drop_col=[
 'Master Agreement Expiration Date',
 'Project Manager',
 'Regional Coordinator',
 'Technical Assistance-CALITP (Y/N)',
 'Technical Assistance-Fleet (Y/N)',
 'Technical Assistance-Network Integration (Y/N)',
 'Technical Assistance-Priority Population (Y/N)',]

In [None]:
project.drop(columns=drop_col, inplace=True)

In [None]:
len(project.columns)

In [None]:
#replace space with _ & lower everything
project.columns =project.columns.str.replace(' ','_')
project.columns=project.columns.str.lower()

In [None]:
#check work
project.columns

### check columns
check values of all columns to see if:
-any duplicates values
-invalid int/str values


In [None]:
project.columns

In [None]:
#function to check column information

def col_checker(col):
    display(f'Displaying column: {col}',
        len(project[col]),
        list(project[col].sort_values(ascending=True).unique())
           )

In [None]:
#col is OK, all numbers
col_checker('tircp_award_amount_($)')

In [None]:
#col is good, everything is a number
col_checker('total_project_cost')

In [None]:
#col is OK
col_checker('master_agreement_number')

In [None]:
#col is OK
col_checker('bus_count')

In [None]:
# column is OK
col_checker('project_description')

In [None]:
project[project['district']=='VAR']

In [None]:
#Project title OK, 
col_checker('project_title')

In [None]:
#award year OK
col_checker('award_year')

In [None]:
#project num OK
col_checker('project_#')

---

In [None]:
# DROP COL
#Col is OK
col_checker('allocated_amount')

In [None]:
# NEEDS CLEANING grant_recipient need to clean
col_checker('grant_recipient')

In [None]:
#may need to clean, there are rows that say '3, 4' 
col_checker('county')

In [None]:
#Move to cleaning, check what is 'VAR'. various?
#may be ok just check to make sure
project.district.unique()

In [None]:
#couldnt run col_checker, guessing because some PPNO numbers are inconsistent
#may need to clean, there is a ppno of CP052/CP053
project.ppno.unique()

### dropping allocated amount column

In [None]:
#dropping allocated amount column
project.drop(columns=['allocated_amount'], inplace=True)

In [None]:
#checking work
project.columns

### Clean `grant_recipient` column

In [None]:
list(project.grant_recipient.sort_values(ascending=True).unique())

In [None]:
new_dict ={
 'Antelope Valley Transit Authority ':'Antelope Valley Transit Authority (AVTA)',
 'Humboldt Transit Authority':'Humboldt Transit Authority (HTA)',
 'Orange County Transportation Authority':'Orange County Transportation Authority (OCTA)',
 'Capitol Corridor Joint Powers Authority':'Capitol Corridor Joint Powers Authority (CCJPA)',
 'Los Angeles County Metropolitan Transportation Authority': 'Los Angeles County Metropolitan Transportation Authority (LA Metro)',
 'Monterey-Salinas Transit':'Monterey-Salinas Transit District (MST)',
 'Sacramento Regional Transit (SacRT)':'Sacramento Regional Transit District (SacRT)',
 'Sacramento Regional Transit District':'Sacramento Regional Transit District (SacRT)',
 'Sacramento Regional Transit District (SacRT) ':'Sacramento Regional Transit District (SacRT)',
 'San Diego Association of Governments': 'San Diego Association of Governments (SANDAG)',
 'Santa Clara Valley Transportation Authority (SCVTA)':'Santa Clara Valley Transportation Authority (VTA)',
 'Southern California  Regional Rail Authority (SCRRA)':'Southern California Regional Rail Authority (SCRRA - Metrolink)',
 'Southern California Regional Rail Authority':'Southern California Regional Rail Authority (SCRRA - Metrolink)',
}

In [None]:
#df.replace({'bus_desc': new_dict}, inplace=True)
project.replace({'grant_recipient': new_dict}, inplace=True)

In [None]:
#check work. looks good
list(project['grant_recipient'].sort_values().unique())

### Cleaning `county` column

In [None]:
col_checker('county')

In [None]:
project[project['county']=='3, 4']

In [None]:
#change county value from '3, 4' to 'VAR' like the other rows.
project.at[3,'county']='VAR'

In [None]:
#check work
project.iloc[3]

### Cleaning `district`column
This is good as is, no cleaning requried. All rows with VAR district has VAR in county as well.

In [None]:
project.district.unique()

In [None]:
project[project['district']=='VAR']

### Clean `ppno` column
This should all be fine as is, no cleaning needed

In [None]:
list(project.ppno.unique())

In [None]:
project[project['ppno']=='CP052/CP053']

### Skim the project description column?
double check to ensure bus count is accurate to what the description says?

Saw that some rows mention procuring both zero and non-zero emission buses (count total buses in `bus count` and `VAR` in prop type and bus size?


In [42]:
project[project['project_title']== 'ATN FAST (Family of Advanced Solutions for Transit): Revolutionizing Transit for a Global Audience']

Unnamed: 0.1,Unnamed: 0,award_year,project_#,grant_recipient,project_title,ppno,district,county,project_description,bus_count,master_agreement_number,total_project_cost,tircp_award_amount_($)
73,73,2022,1,Anaheim Transportation Network (ATN),ATN FAST (Family of Advanced Solutions for Tra...,CP106,12,ORA,Creates a zero-emission transit ecosystem that...,25.0,64ATNMA A1,48433722,22778000


In [43]:
#iloc check
project.iloc[73]

Unnamed: 0                                                                73
award_year                                                              2022
project_#                                                                  1
grant_recipient                         Anaheim Transportation Network (ATN)
project_title              ATN FAST (Family of Advanced Solutions for Tra...
ppno                                                                   CP106
district                                                                  12
county                                                                   ORA
project_description        Creates a zero-emission transit ecosystem that...
bus_count                                                               25.0
master_agreement_number                                           64ATNMA A1
total_project_cost                                                  48433722
tircp_award_amount_($)                                              22778000

In [44]:
#code to update value at specific index and column
project.loc[73, 'bus_count'] = 42

#check work
project.iloc[73]

Unnamed: 0                                                                73
award_year                                                              2022
project_#                                                                  1
grant_recipient                         Anaheim Transportation Network (ATN)
project_title              ATN FAST (Family of Advanced Solutions for Tra...
ppno                                                                   CP106
district                                                                  12
county                                                                   ORA
project_description        Creates a zero-emission transit ecosystem that...
bus_count                                                               42.0
master_agreement_number                                           64ATNMA A1
total_project_cost                                                  48433722
tircp_award_amount_($)                                              22778000

---

## Export cleaned Project df 

In [None]:
#exproject cleaned project df
project.to_csv('gs://calitp-analytics-data/data-analyses/bus_procurement_cost/tircp_project_clean.csv')

## Read in cleaned project data

In [3]:
project = pd.read_csv('gs://calitp-analytics-data/data-analyses/bus_procurement_cost/tircp_project_clean.csv')

In [4]:
#ensure df is able to read in
display(project.shape, project.columns)

(124, 13)

Index(['Unnamed: 0', 'award_year', 'project_#', 'grant_recipient',
       'project_title', 'ppno', 'district', 'county', 'project_description',
       'bus_count', 'master_agreement_number', 'total_project_cost',
       'tircp_award_amount_($)'],
      dtype='object')

### filter df for project descriptions that contain bus

In [None]:
bus_only = project[project['bus_count']>0]

In [None]:
#this looks correct
display(project.shape,
        bus_only.shape)

### export project- bus only df

In [None]:
bus_only.to_csv('gs://calitp-analytics-data/data-analyses/bus_procurement_cost/tircp_project_bus_only.csv')

### Read in project bus only data


In [None]:
bus_only= pd.read_csv('gs://calitp-analytics-data/data-analyses/bus_procurement_cost/tircp_project_bus_only.csv')

In [None]:
display(bus_only.shape,
        bus_only.columns,
        bus_only.head())

In [None]:
#inspect columns values.
list(bus_only['grant_recipient'].sort_values().unique())
#everything looks good

### Consolidate up grant recipient name

### aggregate up

In [None]:
#aggregate # of buses and allocation by transit agency
#bus_cost = only_bus.groupby('grant_recipient').agg({
#    '#_of_buses':"sum",
#    'allocation_amount':'sum'
#}).reset_index()

bus_cost = bus_only.groupby('grant_recipient').agg({
    'bus_count':'sum',
    'tircp_award_amount_($)': 'sum'
}).reset_index()

In [None]:
#confirm aggregation worked
bus_cost

## create new cost per bus column

In [None]:
bus_cost['cost_per_bus']= (bus_cost['tircp_award_amount_($)']/bus_cost['bus_count']).astype('int64')

In [None]:
#confirm new column was created and values were populated
bus_cost.sort_values('cost_per_bus')

## Export cost per bus via project tracking sheet to gcs

In [None]:
bus_cost.to_csv('gs://calitp-analytics-data/data-analyses/bus_procurement_cost/tircp_project_cost_per_bus.csv')