In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import shared_utils
from scipy.stats import zscore
import seaborn as sns
import numpy as np

# set_option to increase max rows displayed to 200, to see entire df in 1 go/
pd.set_option("display.max_rows", 200)

## Read in cost per bus data

In [None]:
# cleaned FTA Grant Award Press Release Data for buses only
fta = pd.read_csv('gs://calitp-analytics-data/data-analyses/bus_procurement_cost/fta_bus_cost_clean.csv')

In [None]:
# cleaned TIRCP Project Tracking Data project tracking sheet
tircp = pd.read_csv('gs://calitp-analytics-data/data-analyses/bus_procurement_cost/tircp_project_bus_only.csv')

## Game Plan
- bring in both data sets (FTA Press Release and TIRCP bus data
- FTA data, make sure it only has rows with bus count > 0
- may need to clean up the prop type and bus size type if there are any similar categories (completed at FTA notebook)
- Create shorten data frames for each. include the following columns:
    1. agency name (project_sponsor & grant_recipient)
    2. project title? (project_title)
    3. project award amount (funding and tircp_award_amount($))
    4. bus count (bus_count)
    5. propulsion type (prop_type)
    6. bus size type (bus_size_type)
<br>
<br>
- concat the short dataframes
- start aggregation. sum/count bus count, funding and project #
    * agg by agency name
    * agg by prop type
    * agg by bus size type

- new column for Z-score of `cost_per_bus`
- 

In [39]:
# function to display df info
def df_peek(df):
    display(type(df),
            df.shape,
            df.dtypes,
            df.sample(2)
       )

## Read in FTA and TIRCP Data

In [None]:
# peaking into each dataset
data=[fta, tircp]

for x in data:
    display(x.shape,
           x.columns,
           x.head(),
           x.dtypes)

## Dataset cleaning

### drop some columns

In [None]:
# examine columns
display(fta.columns,
tircp.columns)

In [None]:
fta = fta.drop(columns=['Unnamed: 0','area_served', 'congressional_districts', 'fta_region', 'bus/low-no_program'])

In [None]:
tircp = tircp.drop(columns=['Unnamed: 0.2', 'Unnamed: 0.1', 'Unnamed: 0', 'award_year','district', 'county','total_project_cost',])

In [None]:
display(fta.columns,
tircp.columns)

### drop zero bus counts from FTA

In [None]:
fta = fta[fta['bus_count'] >0]

In [None]:
#no more '0' values
fta.bus_count.unique()

## Shorten data frames

FTA & TIRCP columns names respecitvely

    1. agency name (project_sponsor & grant_recipient)
    2. project title? (project_title)
    3. project award amount (funding and tircp_award_amount($))
    4. bus count (bus_count)
    5. propulsion type (prop_type)
    6. bus size type (bus_size_type)

In [None]:
fta_short = fta[['project_sponsor','project_title', 'funding', 'bus_count', 'prop_type', 'bus_size_type']]

In [None]:
len(fta_short) == len(fta)

In [None]:
tircp_short = tircp[['grant_recipient', 'project_title', 'tircp_award_amount_($)', 'bus_count', 'prop_type', 'bus_size_type']]

In [None]:
len(tircp_short) == len(tircp)

## rename columns to match eachother

In [None]:
#list of new column names. to be applied in same order as short df
new_col =['agency_name', 'project_title', 'project_award_amount', 'bus_count', 'prop_type', 'bus_size_type']


In [None]:
fta_short.columns=new_col
tircp_short.columns=new_col

In [None]:
fta_short.columns == tircp_short.columns

In [None]:
# add new col to identify source
fta_short['source']='fta_press_release'
tircp_short['source']='tircp_project_tracking'

In [None]:
display(fta_short.head(),tircp_short.head())

## Concat both dataframes

In [None]:
concat = pd.concat([fta_short,tircp_short], axis=0).reset_index()

In [None]:
display(len(fta_short),
        len(tircp_short),
       len(fta_short)+len(tircp_short) == len(concat)
       )

In [None]:
concat.sample(3)

In [None]:
concat = concat.drop('index', axis=1)

In [None]:
#concat looks good
list(concat.prop_type.sort_values().unique())

## Export concat data

In [None]:
concat.to_csv('gs://calitp-analytics-data/data-analyses/bus_procurement_cost/fta_tircp_concat.csv')

### Read in concat data (TIRCP and FTA data)

In [2]:
all_bus = pd.read_csv('gs://calitp-analytics-data/data-analyses/bus_procurement_cost/fta_tircp_concat.csv')

In [3]:
display(all_bus.shape,
        all_bus.dtypes,
        all_bus.columns,
        all_bus.bus_count.unique(),
       )

(133, 8)

Unnamed: 0                int64
agency_name              object
project_title            object
project_award_amount      int64
bus_count               float64
prop_type                object
bus_size_type            object
source                   object
dtype: object

Index(['Unnamed: 0', 'agency_name', 'project_title', 'project_award_amount',
       'bus_count', 'prop_type', 'bus_size_type', 'source'],
      dtype='object')

array([100.,  90.,  20.,  40.,  35.,  16.,  30.,  31.,  69.,  23.,   7.,
        25.,  13.,   4.,  17.,  18.,  39.,  12.,   8., 160.,  37., 134.,
        14.,  50.,  42.,   6.,  11.,  56.,  10.,   5.,   9.,  15.,   2.,
         3.,   1.,  29., 112.,  27., 261., 103.,  33.,  24.])

## create cost_per_bus_column
on all_bus df

In [4]:
all_bus['cost_per_bus'] =((all_bus['project_award_amount']/all_bus['bus_count']).astype('int64'))

In [5]:
display(all_bus.shape,
        all_bus.head()
       )

(133, 9)

Unnamed: 0.1,Unnamed: 0,agency_name,project_title,project_award_amount,bus_count,prop_type,bus_size_type,source,cost_per_bus
0,0,Washington Metropolitan Area Transit Authority...,Battery-Electric Metrobus Procurement and Elec...,104000000,100.0,BEB,not specified,fta_press_release,1040000
1,1,Dallas Area Rapid Transit (DART),DART CNG Bus Fleet Modernization Project,103000000,90.0,CNG,not specified,fta_press_release,1144444
2,2,New Orleans Regional Transit Authority,Accelerating Zero-Emissions Mobility for a Res...,71439261,20.0,zero-emission bus (not specified),not specified,fta_press_release,3571963
3,3,Metropolitan Transit Authority of Harris Count...,FY 2023 Renewable Natural Gas Path to Zero Emi...,40402548,40.0,CNG,not specified,fta_press_release,1010063
4,4,"University of Maryland, College Park","35 Battery Electric Transit Buses, Infrastruct...",39863156,35.0,BEB,not specified,fta_press_release,1138947


## Aggregate
To get total funding and bus count
- per agency
- per propulsion type
- per bus size type

In [19]:
## function to agg by X col by project title, award and bus count

def bus_aggregate(column):
    df_agg = all_bus.groupby(column).agg(
        total_project_count=('project_title', 'count'),
        total_funds=('project_award_amount', 'sum'),
        total_bus_count=('bus_count', 'sum')
    ).reset_index()
    return df_agg

In [37]:
agency_agg = bus_aggregate('agency_name')
prop_agg = bus_aggregate('prop_type')
size_agg = bus_aggregate('bus_size_type')

In [38]:
df_peek(size_agg)

pandas.core.frame.DataFrame

(4, 4)

bus_size_type           object
total_project_count      int64
total_funds              int64
total_bus_count        float64
dtype: object

Unnamed: 0,bus_size_type,total_project_count,total_funds,total_bus_count
0,conventional (40-ft like),4,63542000,93.0
1,cutaway,4,32861322,183.0


## test 

In [None]:
# add new column for z-score
concat['zscore_' + 'cost_per_bus'] = zscore(concat['cost_per_bus'])

In [None]:
concat.shape

In [None]:
# remove outliers
#filter df for zscores =>-3<=3
filtered = concat[(concat['zscore_cost_per_bus'] >= -3) & (concat['zscore_cost_per_bus'] <=3)]

In [None]:
# 2 rows were dropped
filtered.shape

In [None]:
filtered.zscore_cost_per_bus.max()

In [None]:
filtered.sort_values(by='cost_per_bus', ascending=False).head()

## Stats Analysis

In [None]:
mean = np.mean(filtered['cost_per_bus'])
std_dev = np.std(filtered['cost_per_bus'])

#zscore
mean2 = np.mean(filtered['zscore_cost_per_bus'])
std_dev2 = np.std(filtered['zscore_cost_per_bus'])


In [None]:
mean2 + (std_dev2*3)

## Summary

In [None]:
concat.head()

In [None]:
concat.sort_values(by='cost_per_bus',ascending=True).head()

In [None]:
# Variables
total_unique_projects = len(concat)
total_bus_count = sum(concat.bus_count)
total_funding = sum(concat.funding)
min_bus_cost = concat.cost_per_bus.min()
max_bus_cost = concat.cost_per_bus.max()
max_bus_count = concat.bus_count.max()

mean = np.mean(filtered['cost_per_bus'])
std_dev = np.std(filtered['cost_per_bus'])

agency_with_most_bus = concat.loc[concat['bus_count'].idxmax(), 'project_sponsor']
#how many buses do they have? already answered
agency_with_highest_funds = concat.loc[concat['funding'].idxmax(), 'project_sponsor']
#what is the highest amount? already answered
agency_max_cpb = concat.loc[concat['cost_per_bus'].idxmax(), 'project_sponsor']
agency_min_cpb = concat.loc[concat['cost_per_bus'].idxmin(), 'project_sponsor']

In [None]:
summary = f'''
As of today, data was scraped from mutltiple sources:
    1. FTA Bus and Low- and No-Emission Grant Awards press release (federaly funded, nationwide data)
    2. TIRCP project data (state-funded, California only)
    
Data from DGS usage reports, Georgia and Washington contracts to be analyzed next.

Note, some projects included additional compoments besides bus purchases (chargers, transit facilities, parts, training) which may cause project costs to increase dramaticly, whereas other projects specified only bus purcahses, and some did not include and bus purchases at all.

Datasets was filtered to only include data that specificed the number of buses to purchase. The compiled data was aggregated by agencies and a 'cost per_bus' metric was calculated by dividing the total funding the agency received by the total number of buses they specify.

In total:
    - {total_unique_projects} projects with bus purchases were analyzed.
    - ${total_funding:,.2f} was awarded to agencies for projects including bus purchases.
    - {total_bus_count} total buses are to be purchased.
    - The highest cost per bus for an agency was ${max_bus_cost:,.2f}, belonging too {agency_max_cpb} 
    - The lowest cost per bus for an agency was ${min_bus_cost:,.2f}, belonging too {agency_min_cpb}
    

The agency with the most buses specified was {agency_with_most_bus} with {max_bus_count} buses.

After removing outliers, the following was discovered:
    - the mean cost per bus is ${mean:,.2f}.
    - the standard deviation is ${std_dev:,.2f}. 

Below are charts that summarize the data.
'''

In [None]:
concat.head()

In [None]:
print(summary)

In [None]:
#bar chart of highest cost per bus
concat.sort_values(by='cost_per_bus', ascending=False).head(10).plot(x='project_sponsor', y='cost_per_bus', kind='bar', color='skyblue')
plt.title('Top 10 Agencies with highest Cost per bus')
plt.xlabel('Transit Agnecies')
plt.ylabel('$ (million)')
plt.show()

In [None]:
#bar chart of highest bus count
concat.sort_values(by='bus_count', ascending=False).head(10).plot(x='project_sponsor', y='bus_count', kind='bar', color='skyblue')
plt.title('Top 10 Agencies with most bus count')
plt.xlabel('Transit Agnecies')
plt.ylabel('# of buses')
plt.show()

In [None]:
# distribution curve of cost per bus. no outliers
sns.histplot(filtered['cost_per_bus'], kde=True, color='skyblue', bins=20)
plt.axvline(mean, color='red', linestyle='dashed', linewidth=2, label=f'Mean: ${mean:,.2f}')
plt.axvline(mean + std_dev, color='green', linestyle='dashed', linewidth=2, label=f'Standard Deviation: ${std_dev:,.2f}')
plt.axvline(mean - std_dev, color='green', linestyle='dashed', linewidth=2)
plt.axvline(mean + std_dev*2, color='green', linestyle='dashed', linewidth=2)
plt.axvline(mean + std_dev*3, color='green', linestyle='dashed', linewidth=2)

plt.title('Cost Per Bus Distribution with Mean and Standard Deviation')
plt.xlabel('cost per bus ($ million(s))')
plt.ylabel('Frequency')
plt.legend()
plt.show()

In [None]:
#distribution curve of zscore
sns.histplot(filtered['zscore_cost_per_bus'], kde=True, color='skyblue', bins=20)
#plt.axvline(mean2, color='red', linestyle='dashed', linewidth=2, label=f'Mean: {mean2:.2f}')
#plt.axvline(mean2 + std_dev2, color='green', linestyle='dashed', linewidth=2, label=f'Standard Deviation: {std_dev2:,.2f}')
#plt.axvline(mean2 - std_dev2, color='green', linestyle='dashed', linewidth=2)
#plt.axvline(mean2 + (std_dev2*2), color='green', linestyle='dashed', linewidth=2)
#plt.axvline(mean2 + (std_dev2*3), color='green', linestyle='dashed', linewidth=2)
#plt.axvline(mean2 - (std_dev2*2), color='green', linestyle='dashed', linewidth=2)
#plt.axvline(mean2 - (std_dev2*3), color='green', linestyle='dashed', linewidth=2)

plt.title('Cost Per Bus Z-Score Distribution')
plt.xlabel('zscore cost per bus')
plt.ylabel('Frequency')
plt.show()