In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import shared_utils
from matplotlib.ticker import ScalarFormatter
from scipy.stats import zscore

# Raise the display cap to 200 rows so an entire df can be viewed in one go.
pd.options.display.max_rows = 200

# TODO: DELETE LATER -- temporary debugging helper
def df_peek(df):
    """Display a quick overview of ``df``: its type, shape, dtypes and a random 2-row sample."""
    overview = (type(df), df.shape, df.dtypes, df.sample(2))
    display(*overview)

## Read in merged fta/tircp/dgs data from cleaner script

In [44]:
# Load the merged bus procurement dataset from the analytics bucket.
all_bus = pd.read_parquet(
    path="gs://calitp-analytics-data/data-analyses/bus_procurement_cost/cpb_analysis_data_merge.parquet"
)

In [48]:
# Sanity check of the loaded frame: type, shape, dtypes and a random 2-row sample.
df_peek(all_bus)

pandas.core.frame.DataFrame

(195, 7)

agency_name              object
project_title            object
project_award_amount      int64
bus_count               float64
prop_type                object
bus_size_type            object
source                   object
dtype: object

Unnamed: 0,agency_name,project_title,project_award_amount,bus_count,prop_type,bus_size_type,source
145,Torrance Transit Department,Torrance Transit Bus Service Enhancement Program,6000000,7.0,electric (not specified),not specified,tircp_project_tracking
27,The Brockton Area Transit Authority,Brockton Area Transit Replacement Battery Elec...,10694736,7.0,electric (not specified),not specified,fta_press_release


## Create the `cost_per_bus` column
on the `all_bus` df

In [None]:
# Award dollars divided by bus count, truncated to whole dollars.
# NOTE(review): the int64 cast cannot represent NaN/inf, so this presumably relies on
# every row having a non-zero, non-missing bus_count -- confirm against the cleaner script.
all_bus["cost_per_bus"] = (
    all_bus["project_award_amount"].div(all_bus["bus_count"]).astype("int64")
)

## Aggregate
To get total funding and bus count
- per agency
- per propulsion type
- per bus size type

In [None]:
# cpb is computed from the aggregated totals (outside of .agg()), not by summing per-row values.


def bus_aggregate(column, df=None):
    """Aggregate project count, funding and bus count for each value of ``column``.

    Parameters
    ----------
    column : str
        Name of the column to group by (e.g. "agency_name", "prop_type").
    df : DataFrame, optional
        Frame to aggregate. It must contain ``project_title``,
        ``project_award_amount`` and ``bus_count``. When omitted, the current
        module-level ``all_bus`` is used.

    Returns
    -------
    DataFrame
        One row per group with columns ``column``, ``total_project_count``,
        ``total_funds``, ``total_bus_count`` and ``cpb`` (total funds divided by
        total bus count, cast to int64).
    """
    if df is None:
        # Resolve the default at call time. A `df=all_bus` default is evaluated once,
        # when the def cell runs, and would keep pointing at the old frame if the
        # data-loading cell is re-run later.
        df = all_bus

    df_agg = (
        df.groupby(column)
        .agg(
            total_project_count=("project_title", "count"),
            total_funds=("project_award_amount", "sum"),
            total_bus_count=("bus_count", "sum"),
        )
        .reset_index()
    )
    # Cost per bus for the whole group: group funding over group bus count.
    df_agg["cpb"] = (df_agg["total_funds"] / df_agg["total_bus_count"]).astype("int64")
    return df_agg


# One aggregate frame per grouping column: agency, propulsion type and bus size type.
agency_agg, prop_agg, size_agg = (
    bus_aggregate(group_col)
    for group_col in ("agency_name", "prop_type", "bus_size_type")
)

In [None]:
# Standardize cost_per_bus across all rows of all_bus.
all_bus["zscore_cost_per_bus"] = zscore(all_bus["cost_per_bus"])

# Outlier removal: keep rows whose z-score lies within [-3, 3] (both ends inclusive).
zscore_bus = all_bus.loc[all_bus["zscore_cost_per_bus"].between(-3, 3)]

## Zeb Only DF

In [None]:
# Rows whose propulsion type is one of the zero-emission (ZEB) categories.
zeb_only = all_bus.loc[
    all_bus["prop_type"].isin(
        [
            "BEB",
            "zero-emission bus (not specified)",
            "FCEB",
            "electric (not specified)",
            "mix (BEB and FCEB)",
        ]
    )
]

# Totals per ZEB propulsion type.
zeb_agg = bus_aggregate("prop_type", df=zeb_only)


# ZEB rows with outliers dropped: z-score within [-3, 3], both ends inclusive.
zeb_only_no_outliers = zeb_only.loc[zeb_only["zscore_cost_per_bus"].between(-3, 3)]

## Non-ZEB only DF

In [None]:
# Rows that are neither a ZEB category nor "not specified"; rows with an
# unspecified propulsion type therefore land in neither the ZEB nor the non-ZEB frame.
non_zeb_only = all_bus.loc[
    ~all_bus["prop_type"].isin(
        [
            "BEB",
            "zero-emission bus (not specified)",
            "FCEB",
            "electric (not specified)",
            "mix (BEB and FCEB)",
            "not specified",
        ]
    )
]

# Totals per non-ZEB propulsion type.
non_zeb_agg = bus_aggregate("prop_type", df=non_zeb_only)

### Z-score for cost_per_bus 

## Chart functions 

In [None]:
# Distribution curve of cost per bus, annotated with mean / standard deviation markers.


def dist_curve(
    df,
    mean,
    std,
    title="Cost Per Bus Distribution",
    xlabel="cost per bus, $ million(s)",
):
    """Plot a histogram with KDE of ``df["cost_per_bus"]`` plus mean and std-dev markers.

    Parameters
    ----------
    df : DataFrame
        Frame containing a ``cost_per_bus`` column.
    mean, std : number
        Values used to position the vertical marker lines and shown in the legend.
        They are supplied by the caller and are not recomputed from ``df``.
    title : str
        Chart title prefix; " with Mean and Standard Deviation" is appended.
    xlabel : str
        Label for the x-axis.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    ax = sns.histplot(df["cost_per_bus"], kde=True, color="skyblue", bins=20)

    # Red dashed line at the mean.
    ax.axvline(
        mean, color="red", linestyle="dashed", linewidth=2, label=f"Mean: ${mean:,.2f}"
    )

    # Green dashed lines at -1, +1, +2 and +3 standard deviations from the mean.
    # Only the +1 line carries a legend label so the legend shows a single std entry.
    std_line_style = dict(color="green", linestyle="dashed", linewidth=2)
    ax.axvline(mean + std, label=f"Standard Deviation: ${std:,.2f}", **std_line_style)
    ax.axvline(mean - std, **std_line_style)
    ax.axvline(mean + std * 2, **std_line_style)
    ax.axvline(mean + std * 3, **std_line_style)

    ax.set_title(title + " with Mean and Standard Deviation")
    ax.set_xlabel(xlabel)
    ax.set_ylabel("Frequency")

    # NOTE(review): useMathText=False only controls how the axis offset/exponent text
    # is rendered; it does not by itself turn scientific notation off. Confirm whether
    # plain tick labels were intended here.
    ax.xaxis.set_major_formatter(ScalarFormatter(useMathText=False))

    ax.legend()
    plt.show()

In [None]:
# chart function


def make_chart(y_col, title, data=None, x_col="agency_name"):
    """Draw a bar chart of the 10 rows of ``data`` with the largest ``y_col`` values.

    Parameters
    ----------
    y_col : str
        Column to rank by (descending) and to plot on the y-axis.
    title : str
        Chart title.
    data : DataFrame, optional
        Frame to plot. When omitted, the current module-level ``zscore_bus`` is used.
    x_col : str
        Column used for the bar labels on the x-axis.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    if data is None:
        # Resolve the default at call time. A `data=zscore_bus` default is evaluated
        # once, when the def cell runs, and would go stale if the z-score cell is
        # re-run afterwards.
        data = zscore_bus

    top_rows = data.sort_values(by=y_col, ascending=False).head(10)
    ax = top_rows.plot(x=x_col, y=y_col, kind="bar", color="skyblue")
    ax.set_title(title)
    ax.set_xlabel(x_col)
    ax.set_ylabel(y_col)

    # Plain numbers on the y-axis instead of scientific notation.
    ax.ticklabel_format(style="plain", axis="y")
    plt.show()

## Summary

In [None]:
# Variables used by the summary narrative.
total_unique_projects = len(all_bus)

# Use the pandas reductions rather than the builtin sum(): they are vectorized and
# skip NaN instead of propagating it. bus_count is stored as float64, so bus counts
# are cast to int to keep the narrative from printing values such as "12.0".
total_bus_count = int(all_bus["bus_count"].sum())
total_funding = int(all_bus["project_award_amount"].sum())
min_bus_cost = all_bus["cost_per_bus"].min()
max_bus_cost = all_bus["cost_per_bus"].max()
max_bus_count = int(all_bus["bus_count"].max())

# overall cost-per-bus mean / std, outliers removed
cpb_mean = zscore_bus["cost_per_bus"].mean()
cpb_std = zscore_bus["cost_per_bus"].std()

# agency on the row with the highest bus count
agency_with_most_bus = all_bus.loc[all_bus["bus_count"].idxmax(), "agency_name"]

# propulsion type frequencies (rows per prop_type), computed once
prop_type_counts = all_bus["prop_type"].value_counts()

# most frequent propulsion type: name and count
prop_type_name_max_freq = prop_type_counts.idxmax()
prop_type_max = prop_type_counts.max()

# least frequent propulsion type: name and count
prop_type_name_min_freq = prop_type_counts.idxmin()
prop_type_min = prop_type_counts.min()

# agency on the row with the largest award amount
agency_with_highest_funds = all_bus.loc[
    all_bus["project_award_amount"].idxmax(), "agency_name"
]

# agency and propulsion type on the rows with the highest / lowest cost per bus
max_cpb_row = all_bus["cost_per_bus"].idxmax()
min_cpb_row = all_bus["cost_per_bus"].idxmin()
agency_max_cpb = all_bus.loc[max_cpb_row, "agency_name"]
agency_min_cpb = all_bus.loc[min_cpb_row, "agency_name"]
prop_type_max_cpb = all_bus.loc[max_cpb_row, "prop_type"]
prop_type_min_cpb = all_bus.loc[min_cpb_row, "prop_type"]

# The narrative reports these two values as numbers of buses, so sum bus_count;
# len() of the frame would report the number of rows (projects) instead.
zeb_count = int(zeb_only["bus_count"].sum())
non_zeb_count = int(non_zeb_only["bus_count"].sum())

# zeb only, no outliers: cpb mean and std dev
zeb_only_mean = zeb_only_no_outliers["cost_per_bus"].mean()
zeb_only_std = zeb_only_no_outliers["cost_per_bus"].std()

# non-zeb cpb mean and std dev
# NOTE(review): unlike the overall and ZEB figures, these use non_zeb_only without the
# z-score filter, although the narrative lists them under "after removing outliers".
# Confirm whether that is intended.
non_zeb_only_mean = non_zeb_only["cost_per_bus"].mean()
non_zeb_only_std = non_zeb_only["cost_per_bus"].std()
# Start of the summary narrative. Bus counts are formatted with `:,.0f` so they print
# as whole numbers whether the underlying value is a float or an int.
summary = f"""

This analysis examines the 'cost' of buses as it relates to grant award dollars. Specifically, transit agencies who were awarded grants to fund projects that include procuring buses and/or other transit related equipment, and how much variance there was in procuring buses. 

As of today, data was scraped from these sources:
    1. FTA Bus and Low- and No-Emission Grant Awards press release (federally funded, nationwide data)
    2. TIRCP project data (state-funded, California only)

Analyzing the dataset uncovered several nuances. Some projects included additional components besides bus purchases (chargers, transit facilities, parts, training), whereas other projects only purchased buses, and some did not include any bus purchases at all. The variety in these projects may contribute to high variances in “cost per bus”.
Additionally, some projects do not accurately describe the propulsion or bus size type. There are numerous instances where transit agencies reported procuring “zero-emission buses” (ZEBs) but did not specify if the buses are battery electric, fuel-cell, etc. Or transit agencies state conflicting information such as procuring “hybrid electric ZEBs”. In all cases, the dataset was examined for inconsistencies and data was validated to complete the analysis.
Datasets were filtered to only include data that specified the number of buses to purchase. The compiled data was aggregated by agencies and a 'cost per bus' metric was calculated by dividing the total funding received by the total number of buses they procured.
Initial findings uncovered some outliers where a transit agency’s cost per bus figure exceeded 3 standard deviations away from the mean. Deeper investigations conclude that these projects also include major infrastructure replacements.

Overall:
    - {total_unique_projects} projects with bus purchases were analyzed.
    - ${total_funding:,.2f} was awarded to agencies for projects including bus purchases.
    - {total_bus_count:,.0f} total buses are to be purchased.
    - The highest awarded dollars per bus for an agency was ${max_bus_cost:,.2f} for a {prop_type_max_cpb}, belonging to {agency_max_cpb}. 
    - The lowest awarded dollars per bus for an agency was ${min_bus_cost:,.2f} for a {prop_type_min_cpb}, belonging to {agency_min_cpb}.

The agency with the most buses procured was {agency_with_most_bus} with {max_bus_count:,.0f} buses.


Propulsion type values varied wildly amongst the datasets. Data was validated and grouped as best as possible based on project description or other indications of specific propulsion type.
The following is a summary of propulsion type metrics.
     - The most common propulsion type that was procured was "{prop_type_name_max_freq}".
     - The number of zero-emission buses procured (electric, battery-electric and fuel-cell electric) is {zeb_count}.
     - the number of non-zero emission buses procured (CNG, hybrids, other alternate fuels) is {non_zeb_count}.
     
The following was discovered after removing outliers:
    - overall the average awarded dollars per bus is ${cpb_mean:,.2f}, with a standard deviation of  ${cpb_std:,.2f}. 
    - the average awarded dollars per ZEB is ${zeb_only_mean:,.2f}, with a standard deviation of ${zeb_only_std:,.2f}.
    - the average awarded dollars per non-ZEB is ${non_zeb_only_mean:,.2f}, with a standard deviation of ${non_zeb_only_std:,.2f}.


Below are key charts that summarize the findings.

"""

In [None]:
# Print the narrative, then render each chart under a short caption.
# NOTE(review): apart from the ZEB standard deviation, the captions are hand-written
# findings and must be re-checked whenever the underlying data changes.
print(summary)


# CHARTS
# ZEB only, cpb distribution
# The standard deviation is interpolated from zeb_only_std so the caption cannot
# drift away from the value drawn on the chart.
print(
    f"""
ZEB only cost/bus Distribution Chart. 
The majority of the distribution is within +/-1 standard deviation of the mean, however the standard deviation is quite wide at ${zeb_only_std:,.0f}."""
)
dist_curve(
    zeb_only_no_outliers,
    zeb_only_mean,
    zeb_only_std,
    title="ZEB only cost/bus Distribution",
)

# non_zeb distribution
print(
    """
non-ZEB only cost/bus Distribution. This distribution is much more spread out and with a smaller standard deviation."""
)
dist_curve(
    non_zeb_only,
    non_zeb_only_mean,
    non_zeb_only_std,
    title="non-ZEB only cost/bus Distribution",
)

# Highest awarded funds by agency
# NOTE(review): make_chart defaults to zscore_bus, which is a row filter of all_bus,
# so this chart and the next rank individual rows labelled by agency rather than
# per-agency totals (agency_agg holds those). Confirm which is intended.
print(
    """
Most funds Awarded by Transit Agency. LA Metro was awarded almost double the next agency."""
)
make_chart("project_award_amount", "Most funds Awarded by Transit Agency")

# Highest bus count
print(
    """
Highest Bus Count by Agency. LA Metro plans to procure the most buses."""
)
make_chart("bus_count", "Highest Bus Count by Agency")

# COST PER BUS BY PROP TYPE
print(
    """
Cost per bus by propulsion type. the total cost per bus for ZEB categories do fall within a similar range of each other."""
)
make_chart("cpb", "Cost per bus by propulsion type", x_col="prop_type", data=prop_agg)

# bus count BY PROP TYPE
print(
    """
Bus count by propulsion type. The most common bus type procured were zero-emissions related."""
)
make_chart(
    "total_bus_count", "Bus count by propulsion type", x_col="prop_type", data=prop_agg
)

print(
    """
Based on the findings so far, there is data to support that bus procurement costs vary widely amongst transit agencies all over the country. 
More so with non-ZEB buses than ZEB buses. Upon reading deeper into the data, it appears transit agencies are still in the early stages of adopting ZEB and capital improvement (building improvement, initial charging infrastructure installation) are needed at this time. 
Cost per bus is suspected to normalize once infrastructure changes are no longer included in projects, and bus only purchases remain. """
)

---
# Deprecated

## Game Plan
- <s>bring in both data sets (FTA Press Release and TIRCP bus data)</s>
- <s>FTA data, make sure it only has rows with bus count > 0</s>
- may need to clean up the prop type and bus size type if there are any similar categories (completed at FTA notebook)
- <s>Create shortened data frames for each. Include the following columns:</s>
    1. agency name (project_sponsor & grant_recipient)
    2. project title? (project_title)
    3. project award amount (funding and tircp_award_amount($))
    4. bus count (bus_count)
    5. propulsion type (prop_type)
    6. bus size type (bus_size_type)
<br>
<br>
- <s>concat the short dataframes</s>
- <s>start aggregation. sum/count bus count, funding and project #</s>
    * agg by agency name
    * agg by prop type
    * agg by bus size type

- <s>new column for Z-score of `cost_per_bus`</s>
- rerun stats summary
- use functions to make charts

## Charts
By Agency
- most awarded dollars
- most bus count
- highest cpb
- Most frequent prop_type procured

By prop_type
- most awarded dollars
- most bus count
- highest cpb
- Most frequent prop_type

Just zero emission
(electric, beb, fceb, mix(beb and fceb))
- most awarded dollars
- most bus count
- highest cpb

Stats Curves
- distribution of cpb
- 


### Charts by Agency

In [None]:
# Top 10 rows by award amount, labelled by agency name (default data and x_col).
make_chart(y_col="project_award_amount", title="Most funds awarded by Transit Agency")

In [None]:
# Top 10 rows by bus count, labelled by agency name (default data and x_col).
make_chart(y_col="bus_count", title="Most buses procured by Transit Agency")

In [None]:
# Top 10 rows by cost per bus, labelled by agency name.
# The title abbreviation is "cpb" (cost per bus), matching the column name used elsewhere.
make_chart("cost_per_bus", "Highest overall cost per bus (cpb) by Transit Agency")

In [None]:
# How often each propulsion type appears across all rows of all_bus.
prop_type_freq = all_bus["prop_type"].value_counts().reset_index()
prop_type_freq.columns = ["prop_type", "freq"]
make_chart(
    y_col="freq",
    title="Most frequent propulsion type mentioned",
    data=prop_type_freq,
    x_col="prop_type",
)

### Charts By `prop_type`

In [None]:
# AWARD AMOUNT BY PROP TYPE
# plots 'total_funds' from the prop_agg df
make_chart(
    y_col="total_funds",
    title="award amount by propulsion type",
    data=prop_agg,
    x_col="prop_type",
)

In [None]:
# COST PER BUS BY PROP TYPE
# plots 'cpb' from the prop_agg df
make_chart(
    y_col="cpb",
    title="cost per bus by propulsion type",
    data=prop_agg,
    x_col="prop_type",
)

In [None]:
# bus count BY PROP TYPE
# The title now names the plotted metric (total_bus_count); it previously repeated
# the "cost per bus" title from the preceding chart.
make_chart(
    "total_bus_count",
    "bus count by propulsion type",
    x_col="prop_type",
    data=prop_agg,
)

In [None]:
# what is the cost per bus for each prop_type? (see the `cpb` column of the aggregate)
prop_agg

### ZEB Only chart
`zeb_only`

In [None]:
# ZEB-only charts by propulsion type.
# bus_aggregate names its per-bus column "cpb"; asking for "total_cpb" raises a KeyError.
for zeb_metric, zeb_title in (
    ("total_funds", "award amount for ZEBs, by propulsion type"),
    ("cpb", "ZEB cost per bus by propulsion type"),
    ("total_bus_count", "ZEB bus count by propulsion type"),
):
    make_chart(zeb_metric, zeb_title, x_col="prop_type", data=zeb_agg)

### Non ZEB df
`non_zeb_only`

In [None]:
# non-ZEB charts by propulsion type.
# bus_aggregate names its per-bus column "cpb"; asking for "total_cpb" raises a KeyError.
for non_zeb_metric, non_zeb_title in (
    ("total_funds", "award amount for non-ZEB by propulsion type"),
    ("cpb", "non-ZEB cost per bus by propulsion type"),
    ("total_bus_count", "non-ZEB bus count by propulsion type"),
):
    make_chart(non_zeb_metric, non_zeb_title, x_col="prop_type", data=non_zeb_agg)

### Stat Dist Curves

In [None]:
# Overall cost-per-bus distribution (outliers removed), default title and x-label.
dist_curve(df=zscore_bus, mean=cpb_mean, std=cpb_std)

In [None]:
# ZEB rows with z-score within [-3, 3] (both ends inclusive).
zeb_only_no_outliers = zeb_only.loc[zeb_only["zscore_cost_per_bus"].between(-3, 3)]

# mean / std of cost per bus for ZEB rows, outliers removed
zeb_only_mean = zeb_only_no_outliers["cost_per_bus"].mean()
zeb_only_std = zeb_only_no_outliers["cost_per_bus"].std()

# mean / std of cost per bus for non-ZEB rows
non_zeb_only_mean = non_zeb_only["cost_per_bus"].mean()
non_zeb_only_std = non_zeb_only["cost_per_bus"].std()

dist_curve(
    df=zeb_only_no_outliers,
    mean=zeb_only_mean,
    std=zeb_only_std,
    title="ZEB only cost/bus Distribution",
)

In [None]:
# Distribution of cost per bus for non-ZEB rows, with its own mean / std markers.
non_zeb_only_mean = non_zeb_only["cost_per_bus"].mean()
non_zeb_only_std = non_zeb_only["cost_per_bus"].std()

dist_curve(
    df=non_zeb_only,
    mean=non_zeb_only_mean,
    std=non_zeb_only_std,
    title="non-ZEB only cost/bus Distribution",
)

In [None]:
# Histogram + KDE of the cost-per-bus z-scores for the outlier-filtered rows.
sns.histplot(
    zscore_bus["zscore_cost_per_bus"], kde=True, color="skyblue", bins=20
).set(
    title="Cost Per Bus Z-Score Distribution",
    xlabel="zscore cost per bus",
    ylabel="Frequency",
)
plt.show()