In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import shared_utils
from scipy.stats import zscore
import seaborn as sns
import numpy as np

# set_option to increase max rows displayed to 200, to see entire df in 1 go/
pd.set_option("display.max_rows", 200)

## Read in cost per bus data

In [41]:
# cleaned FTA Grant Award Press Release Data for buses only
fta = pd.read_csv('gs://calitp-analytics-data/data-analyses/bus_procurement_cost/fta_bus_cost_clean.csv')

In [42]:
# cleaned TIRCP Project Tracking Data project tracking sheet
tircp = pd.read_csv('gs://calitp-analytics-data/data-analyses/bus_procurement_cost/tircp_project_bus_only.csv')

## Game Plan
- bring in both data sets (FTA Press Release and TIRCP bus data
- FTA data, make sure it only has rows with bus count > 0
- may need to clean up the prop type and bus size type if there are any similar categories (completed at FTA notebook)
- Create shorten data frames for each. include the following columns:
    1. agency name (project_sponsor & grant_recipient)
    2. project title? (project_title)
    3. project award amount (funding and tircp_award_amount($))
    4. bus count (bus_count)
    5. propulsion type (prop_type)
    6. bus size type (bus_size_type)
<br>
<br>
- concat the short dataframes
- start aggregation. sum/count bus count, funding and project #
    * agg by agency name
    * agg by prop type
    * agg by bus size type

## Read in FTA and TIRCP Data

In [43]:
# peaking into each dataset
data=[fta, tircp]

for x in data:
    display(x.shape,
           x.columns,
           x.head(),
           x.dtypes)

(130, 16)

Index(['Unnamed: 0', 'state', 'project_sponsor', 'project_title',
       'description', 'funding', 'approx_#_of_buses', 'project_type',
       'propulsion_category', 'area_served', 'congressional_districts',
       'fta_region', 'bus/low-no_program', 'bus_count', 'prop_type',
       'bus_size_type'],
      dtype='object')

Unnamed: 0.1,Unnamed: 0,state,project_sponsor,project_title,description,funding,approx_#_of_buses,project_type,propulsion_category,area_served,congressional_districts,fta_region,bus/low-no_program,bus_count,prop_type,bus_size_type
0,0,DC,Washington Metropolitan Area Transit Authority...,Battery-Electric Metrobus Procurement and Elec...,WMATA will receive funding to convert its Cind...,104000000,100(beb),bus/chargers,zero,Large Urban,DC-001 ; MD-004 ; MD-008 ; VA-008 ; VA-011,3,Low-No,100,BEB,not specified
1,1,TX,Dallas Area Rapid Transit (DART),DART CNG Bus Fleet Modernization Project,Dallas Area Rapid Transit will receive funding...,103000000,90(estimated-CNGbuses),bus,low,Large Urban,TX-003 ; TX-004 ; TX-005 ; TX-006 ; TX-024 ; T...,6,Low-No,90,CNG,not specified
2,2,PA,Southeastern Pennsylvania Transportation Autho...,SEPTA Zero-Emission Bus Transition Facility Sa...,The Southeastern Pennsylvania Transportation A...,80000000,0,facility,zero,Large Urban,PA-002 ; PA-003 ; PA-004 ; PA-005,3,Low-No,0,,not specified
3,3,LA,New Orleans Regional Transit Authority,Accelerating Zero-Emissions Mobility for a Res...,The New Orleans Regional Transit Authority wil...,71439261,20(zero-emission),bus/chargers/equipment,zero,Large Urban,LA-002 ; LA-001,6,Low-No,20,zero-emission bus (not specified),not specified
4,4,NJ,New Jersey Transit Corporation,Hilton Bus Garage Modernization,New Jersey Transit will receive funding to mod...,47000000,0,facility/chargers,zero,Large Urban,nj-011,2,Bus,0,,not specified


Unnamed: 0                  int64
state                      object
project_sponsor            object
project_title              object
description                object
funding                     int64
approx_#_of_buses          object
project_type               object
propulsion_category        object
area_served                object
congressional_districts    object
fta_region                  int64
bus/low-no_program         object
bus_count                   int64
prop_type                  object
bus_size_type              object
dtype: object

(37, 17)

Index(['Unnamed: 0.2', 'Unnamed: 0.1', 'Unnamed: 0', 'award_year', 'project_#',
       'grant_recipient', 'project_title', 'ppno', 'district', 'county',
       'project_description', 'bus_count', 'master_agreement_number',
       'total_project_cost', 'tircp_award_amount_($)', 'prop_type',
       'bus_size_type'],
      dtype='object')

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,award_year,project_#,grant_recipient,project_title,ppno,district,county,project_description,bus_count,master_agreement_number,total_project_cost,tircp_award_amount_($),prop_type,bus_size_type
0,0,0,0,2015,1,Antelope Valley Transit Authority (AVTA),Regional Transit Interconnectivity & Environme...,CP005,7,LA,Purchase 13 60-foot articulated BRT buses and ...,29.0,64AVTA2015MA,39478000,24403000,electric (not specified),conventional (40-ft like)
1,5,5,5,2015,6,Orange County Transportation Authority (OCTA),Bravo! Route 560 Rapid Buses,CP004,12,ORA,Purchase five 40-foot CNG buses for BRT Route ...,40.0,64OCTAMA,2900000,2320000,CNG,conventional (40-ft like)
2,11,11,11,2015,12,San Joaquin Regional Transit District (SJRTD),BRT Expansion: MLK Corridor and Crosstown Mine...,CP011,10,SJ,Bus rapid transit infrastructure along the MLK...,12.0,64SJRRCMA A1,19118776,6841000,zero-emission bus (not specified),not specified
3,16,16,16,2016,3,Foothill Transit,"Transforming California: Bus Electrification, ...",CP076,7,LA,Purchase 20 zero-emission buses to extend Rout...,20.0,64FOOTHILLMA,16580000,5000000,zero-emission bus (not specified),not specified
4,29,29,29,2018,2,Anaheim Transportation Network (ATN),#Electrify Anaheim: Changing the Transit Parad...,CP027,12,ORA,Deploys 40 zero-emission electric buses to dou...,40.0,64ATNMA A1,45201000,28617000,electric (not specified),not specified


Unnamed: 0.2                 int64
Unnamed: 0.1                 int64
Unnamed: 0                   int64
award_year                   int64
project_#                    int64
grant_recipient             object
project_title               object
ppno                        object
district                     int64
county                      object
project_description         object
bus_count                  float64
master_agreement_number     object
total_project_cost           int64
tircp_award_amount_($)       int64
prop_type                   object
bus_size_type               object
dtype: object

## Dataset cleaning

### drop some columns

In [44]:
# examine columns
display(fta.columns,
tircp.columns)

Index(['Unnamed: 0', 'state', 'project_sponsor', 'project_title',
       'description', 'funding', 'approx_#_of_buses', 'project_type',
       'propulsion_category', 'area_served', 'congressional_districts',
       'fta_region', 'bus/low-no_program', 'bus_count', 'prop_type',
       'bus_size_type'],
      dtype='object')

Index(['Unnamed: 0.2', 'Unnamed: 0.1', 'Unnamed: 0', 'award_year', 'project_#',
       'grant_recipient', 'project_title', 'ppno', 'district', 'county',
       'project_description', 'bus_count', 'master_agreement_number',
       'total_project_cost', 'tircp_award_amount_($)', 'prop_type',
       'bus_size_type'],
      dtype='object')

In [45]:
fta = fta.drop(columns=['Unnamed: 0','area_served', 'congressional_districts', 'fta_region', 'bus/low-no_program'])

In [46]:
tircp = tircp.drop(columns=['Unnamed: 0.2', 'Unnamed: 0.1', 'Unnamed: 0', 'award_year','district', 'county','total_project_cost',])

In [47]:
display(fta.columns,
tircp.columns)

Index(['state', 'project_sponsor', 'project_title', 'description', 'funding',
       'approx_#_of_buses', 'project_type', 'propulsion_category', 'bus_count',
       'prop_type', 'bus_size_type'],
      dtype='object')

Index(['project_#', 'grant_recipient', 'project_title', 'ppno',
       'project_description', 'bus_count', 'master_agreement_number',
       'tircp_award_amount_($)', 'prop_type', 'bus_size_type'],
      dtype='object')

### drop zero bus counts from FTA

In [48]:
fta = fta[fta['bus_count'] >0]

In [49]:
#no more '0' values
fta.bus_count.unique()

array([100,  90,  20,  40,  35,  16,  30,  31,  69,  23,   7,  25,  13,
         4,  17,  18,  39,  12,   8, 160,  37, 134,  14,  50,  42,   6,
        11,  56,  10,   5,   9,  15,   2,   3,   1])

## Shorten data frames

FTA & TIRCP columns names respecitvely

    1. agency name (project_sponsor & grant_recipient)
    2. project title? (project_title)
    3. project award amount (funding and tircp_award_amount($))
    4. bus count (bus_count)
    5. propulsion type (prop_type)
    6. bus size type (bus_size_type)

In [74]:
fta_short = fta[['project_sponsor','project_title', 'funding', 'bus_count', 'prop_type', 'bus_size_type']]

In [78]:
len(fta_short) == len(fta)

True

In [79]:
tircp_short = tircp[['grant_recipient', 'project_title', 'tircp_award_amount_($)', 'bus_count', 'prop_type', 'bus_size_type']]

In [80]:
len(tircp_short) == len(tircp)

True

## rename columns to match eachother

In [81]:
#list of new column names. to be applied in same order as short df
new_col =['agency_name', 'project_title', 'project_award_amount', 'bus_count', 'prop_type', 'bus_size_type']


In [82]:
fta_short.columns=new_col
tircp_short.columns=new_col

In [83]:
fta_short.columns == tircp_short.columns

array([ True,  True,  True,  True,  True,  True])

In [84]:
# add new col to identify source
fta_short['source']='fta_press_release'
tircp_short['source']='tircp_project_tracking'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fta_short['source']='fta_press_release'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tircp_short['source']='tircp_project_tracking'


In [86]:
display(fta_short.head(),tircp_short.head())

Unnamed: 0,agency_name,project_title,project_award_amount,bus_count,prop_type,bus_size_type,source
0,Washington Metropolitan Area Transit Authority...,Battery-Electric Metrobus Procurement and Elec...,104000000,100,BEB,not specified,fta_press_release
1,Dallas Area Rapid Transit (DART),DART CNG Bus Fleet Modernization Project,103000000,90,CNG,not specified,fta_press_release
3,New Orleans Regional Transit Authority,Accelerating Zero-Emissions Mobility for a Res...,71439261,20,zero-emission bus (not specified),not specified,fta_press_release
5,Metropolitan Transit Authority of Harris Count...,FY 2023 Renewable Natural Gas Path to Zero Emi...,40402548,40,CNG,not specified,fta_press_release
6,"University of Maryland, College Park","35 Battery Electric Transit Buses, Infrastruct...",39863156,35,BEB,not specified,fta_press_release


Unnamed: 0,agency_name,project_title,project_award_amount,bus_count,prop_type,bus_size_type,source
0,Antelope Valley Transit Authority (AVTA),Regional Transit Interconnectivity & Environme...,24403000,29.0,electric (not specified),conventional (40-ft like),tircp_project_tracking
1,Orange County Transportation Authority (OCTA),Bravo! Route 560 Rapid Buses,2320000,40.0,CNG,conventional (40-ft like),tircp_project_tracking
2,San Joaquin Regional Transit District (SJRTD),BRT Expansion: MLK Corridor and Crosstown Mine...,6841000,12.0,zero-emission bus (not specified),not specified,tircp_project_tracking
3,Foothill Transit,"Transforming California: Bus Electrification, ...",5000000,20.0,zero-emission bus (not specified),not specified,tircp_project_tracking
4,Anaheim Transportation Network (ATN),#Electrify Anaheim: Changing the Transit Parad...,28617000,40.0,electric (not specified),not specified,tircp_project_tracking


## Concat both dataframes

In [87]:
concat = pd.concat([fta_short,tircp_short], axis=0).reset_index()

In [90]:
display(len(fta_short),
        len(tircp_short),
       len(fta_short)+len(tircp_short) == len(concat)
       )

96

37

True

In [92]:
concat.sample(3)

Unnamed: 0,index,agency_name,project_title,project_award_amount,bus_count,prop_type,bus_size_type,source
83,114,The Colorado Department of Transportation (CDO...,The Colorado Department of Transportation (CDO...,1145951,1.0,zero-emission bus (not specified),not specified,fta_press_release
127,31,City of Simi Valley,Simi Valley Regional Transit Center and Pathwa...,7053000,6.0,zero-emission bus (not specified),not specified,tircp_project_tracking
6,10,King County Metro Transit,Zero Emissions Battery Electric Bus Fleet Conv...,33552634,30.0,BEB,not specified,fta_press_release


In [93]:
concat = concat.drop('index', axis=1)

In [95]:
#concat looks good
list(concat.prop_type.sort_values().unique())

['BEB',
 'CNG',
 'FCEB',
 'electrc (not specified)',
 'electric (not specified)',
 'low emission (hybrid)',
 'low emission (propane)',
 'mix (BEB and FCEB)',
 'mix (low emission)',
 'mix (zero and low emission buses)',
 'not specified',
 'zero-emission bus (not specified)',
 nan]

## Export concat data

In [96]:
concat.to_csv('gs://calitp-analytics-data/data-analyses/bus_procurement_cost/fta_tircp_concat.csv')

### aggregate by agency?
- to get total funding and bus count

In [69]:
fta_agg = fta.groupby('project_sponsor').agg(
    total_project_count = ('project_title', 'count'),
    total_funds = ('funding', 'sum'),
    total_bus_count = ('bus_count','sum')).reset_index()

tircp_agg = tircp.groupby('grant_recipient').agg(
    total_project_count = ('project_#','count'),
    total_funds = ('tircp_award_amount_($)','sum'),
    total_bus_count = ('bus_count','sum')).reset_index()

In [70]:
display(
fta_agg.shape,
fta_agg.columns,
tircp_agg.shape,
tircp_agg.columns)

(95, 4)

Index(['project_sponsor', 'total_project_count', 'total_funds',
       'total_bus_count'],
      dtype='object')

(32, 4)

Index(['grant_recipient', 'total_project_count', 'total_funds',
       'total_bus_count'],
      dtype='object')

In [72]:
display(fta_agg, tircp_agg)

Unnamed: 0,project_sponsor,total_project_count,total_funds,total_bus_count
0,AUTORIDAD METROPOLITANA DE AUTOBUSES (PRMBA),1,10000000,8
1,Alameda-Contra Costa Transit District,1,25513684,25
2,Berkshire Regional Transit Authority,1,2212747,2
3,Brazos Transit District,1,9650646,11
4,Cape Fear Public Transportation Authority,1,2860250,5
5,Central Oklahoma Transportation and Parking Au...,1,4278772,9
6,Champaign-Urbana Mass Transit District,1,6635394,10
7,Charleston Area Regional Transportation Authority,1,25906730,7
8,City Of Tallahassee,1,20370793,8
9,City of Albuquerque,1,18262255,20


Unnamed: 0,grant_recipient,total_project_count,total_funds,total_bus_count
0,Anaheim Transportation Network (ATN),2,51395000,82.0
1,Antelope Valley Transit Authority (AVTA),3,35735000,52.0
2,Antelope Valley Transit Authority (AVTA) & Lon...,1,13156000,12.0
3,City of Fresno,1,7798000,6.0
4,City of Glendale and Arroyo Verdugo Communities,1,34648000,27.0
5,City of Los Angeles (LA DOT),1,36104000,112.0
6,City of Pasadena,1,14424000,40.0
7,City of Santa Monica,2,26027000,113.0
8,City of Simi Valley,1,7053000,6.0
9,City of Torrance,1,96000000,10.0


### aggregate by prop type
-funding and sum

In [68]:
fta_prop_agg = fta.groupby('prop_type').agg(
    total_project_count = ('project_title', 'count'),
    total_funds = ('funding', 'sum'),
    total_bus_count = ('bus_count', 'sum')).reset_index()

tircp_prop_agg = tircp.groupby('prop_type').agg(
    total_project_count =('project_#', 'count'),
    total_funds = ('tircp_award_amount_($)','sum'),
    total_bus_count =('bus_count','sum')).reset_index()

In [61]:
display(
fta_prop_agg.shape,
fta_prop_agg.columns,
tircp_prop_agg.shape,
tircp_prop_agg.columns)

(11, 4)

Index(['prop_type', 'project_title', 'funding', 'bus_count'], dtype='object')

(6, 4)

Index(['prop_type', 'project_#', 'tircp_award_amount_($)', 'bus_count'], dtype='object')

In [62]:
display(fta_prop_agg, tircp_prop_agg)

Unnamed: 0,prop_type,project_title,funding,bus_count
0,BEB,18,431933634,395
1,CNG,14,232601904,290
2,FCEB,4,84010483,66
3,electrc (not specified),6,68473822,36
4,low emission (hybrid),15,89918358,136
5,low emission (propane),5,7397087,38
6,mix (BEB and FCEB),1,8740728,6
7,mix (low emission),1,3303600,9
8,mix (zero and low emission buses),3,39650838,41
9,not specified,1,7443765,56


Unnamed: 0,prop_type,project_#,tircp_award_amount_($),bus_count
0,BEB,2,57426000,69.0
1,CNG,1,2320000,40.0
2,FCEB,2,21606000,8.0
3,electric (not specified),14,268701000,170.0
4,not specified,1,8641000,14.0
5,zero-emission bus (not specified),17,444855000,704.0


## Read in concat data (TIRCP and FTA data)

In [None]:
concat = pd.read_csv('gs://calitp-analytics-data/data-analyses/bus_procurement_cost/fta_tircp_concat.csv')

In [None]:
display(concat.shape,
        concat.dtypes,
        concat.head(),
       )
        

In [None]:
# add new column for z-score
concat['zscore_' + 'cost_per_bus'] = zscore(concat['cost_per_bus'])

In [None]:
concat.shape

In [None]:
# remove outliers
#filter df for zscores =>-3<=3
filtered = concat[(concat['zscore_cost_per_bus'] >= -3) & (concat['zscore_cost_per_bus'] <=3)]

In [None]:
# 2 rows were dropped
filtered.shape

In [None]:
filtered.zscore_cost_per_bus.max()

In [None]:
filtered.sort_values(by='cost_per_bus', ascending=False).head()

## Stats Analysis

In [None]:
mean = np.mean(filtered['cost_per_bus'])
std_dev = np.std(filtered['cost_per_bus'])

#zscore
mean2 = np.mean(filtered['zscore_cost_per_bus'])
std_dev2 = np.std(filtered['zscore_cost_per_bus'])


In [None]:
mean2 + (std_dev2*3)

## Summary

In [None]:
concat.head()

In [None]:
concat.sort_values(by='cost_per_bus',ascending=True).head()

In [None]:
# Variables
total_unique_projects = len(concat)
total_bus_count = sum(concat.bus_count)
total_funding = sum(concat.funding)
min_bus_cost = concat.cost_per_bus.min()
max_bus_cost = concat.cost_per_bus.max()
max_bus_count = concat.bus_count.max()

mean = np.mean(filtered['cost_per_bus'])
std_dev = np.std(filtered['cost_per_bus'])

agency_with_most_bus = concat.loc[concat['bus_count'].idxmax(), 'project_sponsor']
#how many buses do they have? already answered
agency_with_highest_funds = concat.loc[concat['funding'].idxmax(), 'project_sponsor']
#what is the highest amount? already answered
agency_max_cpb = concat.loc[concat['cost_per_bus'].idxmax(), 'project_sponsor']
agency_min_cpb = concat.loc[concat['cost_per_bus'].idxmin(), 'project_sponsor']

In [None]:
summary = f'''
As of today, data was scraped from mutltiple sources:
    1. FTA Bus and Low- and No-Emission Grant Awards press release (federaly funded, nationwide data)
    2. TIRCP project data (state-funded, California only)
    
Data from DGS usage reports, Georgia and Washington contracts to be analyzed next.

Note, some projects included additional compoments besides bus purchases (chargers, transit facilities, parts, training) which may cause project costs to increase dramaticly, whereas other projects specified only bus purcahses, and some did not include and bus purchases at all.

Datasets was filtered to only include data that specificed the number of buses to purchase. The compiled data was aggregated by agencies and a 'cost per_bus' metric was calculated by dividing the total funding the agency received by the total number of buses they specify.

In total:
    - {total_unique_projects} projects with bus purchases were analyzed.
    - ${total_funding:,.2f} was awarded to agencies for projects including bus purchases.
    - {total_bus_count} total buses are to be purchased.
    - The highest cost per bus for an agency was ${max_bus_cost:,.2f}, belonging too {agency_max_cpb} 
    - The lowest cost per bus for an agency was ${min_bus_cost:,.2f}, belonging too {agency_min_cpb}
    

The agency with the most buses specified was {agency_with_most_bus} with {max_bus_count} buses.

After removing outliers, the following was discovered:
    - the mean cost per bus is ${mean:,.2f}.
    - the standard deviation is ${std_dev:,.2f}. 

Below are charts that summarize the data.
'''

In [None]:
concat.head()

In [None]:
print(summary)

In [None]:
#bar chart of highest cost per bus
concat.sort_values(by='cost_per_bus', ascending=False).head(10).plot(x='project_sponsor', y='cost_per_bus', kind='bar', color='skyblue')
plt.title('Top 10 Agencies with highest Cost per bus')
plt.xlabel('Transit Agnecies')
plt.ylabel('$ (million)')
plt.show()

In [None]:
#bar chart of highest bus count
concat.sort_values(by='bus_count', ascending=False).head(10).plot(x='project_sponsor', y='bus_count', kind='bar', color='skyblue')
plt.title('Top 10 Agencies with most bus count')
plt.xlabel('Transit Agnecies')
plt.ylabel('# of buses')
plt.show()

In [None]:
# distribution curve of cost per bus. no outliers
sns.histplot(filtered['cost_per_bus'], kde=True, color='skyblue', bins=20)
plt.axvline(mean, color='red', linestyle='dashed', linewidth=2, label=f'Mean: ${mean:,.2f}')
plt.axvline(mean + std_dev, color='green', linestyle='dashed', linewidth=2, label=f'Standard Deviation: ${std_dev:,.2f}')
plt.axvline(mean - std_dev, color='green', linestyle='dashed', linewidth=2)
plt.axvline(mean + std_dev*2, color='green', linestyle='dashed', linewidth=2)
plt.axvline(mean + std_dev*3, color='green', linestyle='dashed', linewidth=2)

plt.title('Cost Per Bus Distribution with Mean and Standard Deviation')
plt.xlabel('cost per bus ($ million(s))')
plt.ylabel('Frequency')
plt.legend()
plt.show()

In [None]:
#distribution curve of zscore
sns.histplot(filtered['zscore_cost_per_bus'], kde=True, color='skyblue', bins=20)
#plt.axvline(mean2, color='red', linestyle='dashed', linewidth=2, label=f'Mean: {mean2:.2f}')
#plt.axvline(mean2 + std_dev2, color='green', linestyle='dashed', linewidth=2, label=f'Standard Deviation: {std_dev2:,.2f}')
#plt.axvline(mean2 - std_dev2, color='green', linestyle='dashed', linewidth=2)
#plt.axvline(mean2 + (std_dev2*2), color='green', linestyle='dashed', linewidth=2)
#plt.axvline(mean2 + (std_dev2*3), color='green', linestyle='dashed', linewidth=2)
#plt.axvline(mean2 - (std_dev2*2), color='green', linestyle='dashed', linewidth=2)
#plt.axvline(mean2 - (std_dev2*3), color='green', linestyle='dashed', linewidth=2)

plt.title('Cost Per Bus Z-Score Distribution')
plt.xlabel('zscore cost per bus')
plt.ylabel('Frequency')
plt.show()