In [1]:
import pandas as pd
pd.set_option("display.max_rows", 300)
pd.set_option("display.max_columns", 100)

In [2]:
# Old script imports
from fta_data_cleaner import *
from dgs_data_cleaner import *
from tircp_data_cleaner import *

In [3]:
GCS_PATH = "gs://calitp-analytics-data/data-analyses/bus_procurement_cost/"

In [4]:
# All Raw Data
fta_raw = pd.read_csv(f"{GCS_PATH}raw_data-analyses_bus_procurement_cost_fta_press_release_data_csv.csv")
tirp_raw = pd.read_excel(f"{GCS_PATH}raw_TIRCP Tracking Sheets 2_1-10-2024.xlsx", sheet_name="Project Tracking")
dgs17b_raw = pd.read_excel(f"{GCS_PATH}raw_17b compiled.xlsx", sheet_name = "Usage Report Template")
dgs17c_raw = pd.read_excel(f"{GCS_PATH}raw_17c compiled-Proterra Compiled Contract Usage Report .xlsx", sheet_name = "Proterra ")

  warn(msg)


In [66]:
# what does the final table look like again?
final = pd.read_parquet(f'{gcs_path}old/cpb_analysis_data_merge.parquet')

In [58]:
final.shape

(89, 11)

# Scripts to Save
for new `bus_cost_utils.py` script

In [6]:
# NEW PROP FINDER
def new_prop_finder(description: str) -> str:
    """
    function that matches keywords from each propulsion type list against the item description col, returns a standardized prop type
    now includes variable that make description input lowercase.
    to be used with .assign()
    """

    BEB_list = [
        "battery electric",
        "BEBs paratransit buses"
    ]

    cng_list = [
        "cng",
        "compressed natural gas"    
    ]

    electric_list = [
        "electric buses",
        "electric commuter",
        "electric",
    ]

    FCEB_list = [
        "fuel cell",
        "hydrogen",
        #"fuel cell electric",
        #"hydrogen fuel cell",
        #"fuel cell electric bus",
        #"hydrogen electric bus",
    ]

    # low emission (hybrid)
    hybrid_list = [
        #"diesel electric hybrids",
        #"diesel-electric hybrids",
        #"hybrid electric",
        #"hybrid electric buses",
        #"hybrid electrics",
        "hybrids",
        "hybrid",
    ]

    # low emission (propane)
    propane_list = [
        #"propane buses",
        #"propaned powered vehicles",
        "propane",
    ]

    mix_beb_list = [
        "2 BEBs and 4 hydrogen fuel cell buses",
    ]

    mix_lowe_list = [
        "diesel and gas",
    ]

    mix_zero_low_list = [
        "15 electic, 16 hybrid",
        "4 fuel cell / 3 CNG",
        "estimated-cutaway vans (PM- award will not fund 68 buses",
        "1:CNGbus ;2 cutaway CNG buses",
    ]

    zero_e_list = [
        #"zero emission buses",
        #"zero emission electric",
        #"zero emission vehicles",
        "zero-emission",
        "zero emission",
    ]

    item_description = description.lower().replace("‐", " ").strip()

    if any(word in item_description for word in BEB_list) and not any(
        word in item_description for word in ["diesel", "hybrid", "fuel cell"]
    ):
        return "BEB"

    elif any(word in item_description for word in FCEB_list):
        return "FCEB"

    elif any(word in item_description for word in hybrid_list):
        return "low emission (hybrid)"

    elif any(word in item_description for word in mix_beb_list):
        return "mix (BEB and FCEB)"

    elif any(word in item_description for word in mix_lowe_list):
        return "mix (low emission)"

    elif any(word in item_description for word in mix_zero_low_list):
        return "mix (zero and low emission)"

    elif any(word in item_description for word in zero_e_list):
        return "zero-emission bus (not specified)"

    elif any(word in item_description for word in propane_list):
        return "low emission (propane)"

    elif any(word in item_description for word in electric_list):
        return "electric (not specified)"
    
    elif any(word in item_description for word in cng_list):
        return "CNG"

    else:
        return "not specified"

In [7]:
def new_bus_size_finder(description: str) -> str:
    """
    Similar to prop_type_find, matches keywords to item description col and return standardized bus size type.
    now includes variable that make description input lowercase.
    To be used with .assign()
    """

    articulated_list = [
        "60 foot",
        "articulated",
    ]

    standard_bus_list = [
        "30 foot",
        "35 foot",
        "40 foot",
        "40ft",
        "45 foot",
        "standard",
    ]

    cutaway_list = [
        "cutaway",
    ]

    other_bus_size_list = ["feeder bus"]

    otr_bus_list = [
        "coach style",
        "over the road",
    ]

    item_description = description.lower().replace("-", " ").strip()

    if any(word in item_description for word in articulated_list):
        return "articulated"

    elif any(word in item_description for word in standard_bus_list):
        return "standard/conventional (30ft-45ft)"

    elif any(word in item_description for word in cutaway_list):
        return "cutaway"

    elif any(word in item_description for word in otr_bus_list):
        return "over-the-road"

    elif any(word in item_description for word in other_bus_size_list):
        return "other"

    else:
        return "not specified"

In [8]:
def project_type_finder(description: str) -> str:
    """
    function to match keywords to project description col to identify projects that only have bus procurement.
    used to identify projects into diffferent categories: bus only, bus + others, no bus procurement.
    use with .assign() to get a new col.
    """
    bus_list =[
        "bus",
        "transit vehicles",# for fta list
        "cutaway vehicles",# for fta list
        "zero-emission vehicles", # for tircp list
        "zero emission vehicles",
        "zero‐emissions vans",
        "hybrid-electric vehicles",
        "battery-electric vehicles",
        "buy new replacement vehicles", # specific string for fta list
    ]
    
    exclude_list =[
        "facility",
        #"station",
        "stops",
        "installation",
        "depot",
        "construct",
        "infrastructure",
        "signal priority",
        "improvements",
        "build",
        "chargers",
        "charging equipment",
        "install",
        "rail",
        "garage",
        "facilities",
        "bus washing system",
        "build a regional transit hub" # specific string needed for fta list
        #"associated infrastructure" may need to look at what is associated infrastructure is for ZEB 
        
    ]
    proj_description = description.lower().strip()

    if any(word in proj_description for word in bus_list) and not any(
        word in proj_description for word in exclude_list
    ):
        return "bus only"
    
    elif any(word in proj_description for word in exclude_list) and not any(
        word in proj_description for word in bus_list
    ):
        return "non-bus components"
    
    elif any(word in proj_description for word in exclude_list) and any(
        word in proj_description for word in bus_list
    ):
        return "includes bus and non-bus components"
    
    else:
        return "needs review"

In [9]:
def col_row_updater(df: pd.DataFrame, col1: str, val1, col2: str, new_val):
    """
    function used to update values at specificed columns and row value.
    """
    df.loc[df[col1] == val1, col2] = new_val
    
    return

# Chagnes to current scripts


In [10]:
# FTA
import numpy as np
import pandas as pd
import shared_utils
from calitp_data_analysis.sql import to_snakecase
from bus_cost_utils import *
# from dgs_data_cleaner import new_bus_size_finder, new_prop_finder, project_type_checker
#from tircp_data_cleaner import col_row_updater

def col_splitter(
    df: pd.DataFrame, 
    col_to_split: str, 
    new_col1: str, 
    new_col2: str, 
    split_char: str
)-> pd.DataFrame:
    """
    function to split a column into 2 columns by specific character.
    ex. split 100(beb) to "100" & "(beb)"
    """
    df[[new_col1, new_col2]] = df[col_to_split].str.split(
        pat=split_char, n=1, expand=True
    )

    df[new_col2] = df[new_col2].str.replace(")", "")

    return df

def fta_agg_bus_only(df: pd.DataFrame) -> pd.DataFrame:
    """
    filters FTA data to only show projects with bus procurement (bus count > 0).
    then filters projects for new_project_type = bus only
    then aggregates
    """
    df1 = df[(df["bus_count"] > 0) & (df["new_project_type"] == "bus only")]

    df2 = (
        df1.groupby(
            [
                "project_sponsor",
                "project_title",
                "new_prop_type_finder",
                "new_bus_size_type",
                "description",
                "new_project_type"
            ]
        )
        .agg(
            {
                "funding": "sum",
                "bus_count": "sum",
            }
        )
        .reset_index()
    )

    return df2

def clean_fta_columns() -> pd.DataFrame:
    """
    Main function to clean FTA data. Reads in data, changes datatypes, change specific values.
    """
    # params
    
    file = "data-analyses_bus_procurement_cost_fta_press_release_data_csv.csv"

    # read in data
    df = pd.read_csv(f"{gcs_path}{file}")

    # snakecase df
    df = to_snakecase(df)

    # clean funding values
    df["funding"] = (
        df["funding"]
        .str.replace("$", "")
        .str.replace(",", "")
        .str.strip()
    )

    # rename initial propulsion type col to propulsion category
    df = df.rename(columns={"propulsion_type": "prosulsion_category"})

    # splittign `approx_#_of_buses col to get bus count
    df1 = col_splitter(df, "approx_#_of_buses", "bus_count", "extract_prop_type", "(")

    # assign new columns via new_prop_finder and new_bus_size_finder
    df2 = df1.assign(
        new_prop_type_finder=df1["description"].apply(new_prop_finder),
        new_bus_size_type=df1["description"].apply(new_bus_size_finder),
        new_project_type=df1["description"].apply(project_type_checker)
    )

    # cleaning specific values
    col_row_updater(df2, "funding", "7443765", "bus_count", 56)
    col_row_updater(df2, "funding", "17532900", "bus_count", 12)
    col_row_updater(df2, "funding", "40402548", "new_prop_type_finder", "CNG")
    col_row_updater(df2, "funding", "30890413", "new_prop_type_finder", "mix (zero and low emission)")
    col_row_updater(df2, "funding", "29331665", "new_prop_type_finder", "mix (zero and low emission)")
    col_row_updater(df2, "funding", "7598425", "new_prop_type_finder", "mix (zero and low emission)")
    col_row_updater(df2, "funding", "7443765", "new_prop_type_finder", "mix (zero and low emission)")
    col_row_updater(df2, "funding", "3303600", "new_prop_type_finder", "mix (diesel and gas)")
    col_row_updater(df2, "funding", "2063160", "new_prop_type_finder", "low emission (hybrid)")
    col_row_updater(df2, "funding", "1760000", "new_prop_type_finder", "low emission (propane)")
    col_row_updater(df2, "funding", "1006750", "new_prop_type_finder", "ethanol")
    col_row_updater(df2, "funding", "723171", "new_prop_type_finder", "low emission (propane)")
    col_row_updater(df2, "funding", "23280546", "new_prop_type_finder", "BEB")

    # update data types
    update_cols = ["funding", "bus_count"]

    df2[update_cols] = df2[update_cols].astype("int64")

    return df2

#if __name__ == "__main__":

    # initial df (all projects)
#    all_projects = clean_fta_columns()

    # projects with bus count > 0 only.
#    just_bus = fta_agg_bus_only(all_projects)

    # export both DFs
#    all_projects.to_parquet(f"{gcs_path}clean_fta_all_projects.parquet")
#    just_bus.to_parquet(f"{gcs_path}clean_fta_bus_only.parquet")

In [11]:
# TIRCP
import numpy as np
import pandas as pd
import shared_utils
from calitp_data_analysis.sql import to_snakecase
from bus_cost_utils import *

def clean_tircp_columns() -> pd.DataFrame:
    """
    main function that reads in and cleans TIRCP data.
    """
    from fta_data_cleaner import gcs_path
    file_name = "TIRCP Tracking Sheets 2_1-10-2024.xlsx"
    tircp_name = "Project Tracking"

    # read in data
    df = pd.read_excel(f"{gcs_path}{file_name}", sheet_name=tircp_name)

    # keep specific columns
    keep_col = [
        "Award Year",
        "Project #",
        "Grant Recipient",
        "Project Title",
        "PPNO",
        "District",
        "County",
        "Project Description",
        "bus_count",
        "Master Agreement Number",
        "Total Project Cost",
        "TIRCP Award Amount ($)",
    ]

    df1 = df[keep_col]

    # snakecase
    df2 = to_snakecase(df1)

    # dict of replacement values
    value_replace_dict = {
        "Antelope Valley Transit Authority ": "Antelope Valley Transit Authority (AVTA)",
        "Humboldt Transit Authority": "Humboldt Transit Authority (HTA)",
        "Orange County Transportation Authority": "Orange County Transportation Authority (OCTA)",
        "Capitol Corridor Joint Powers Authority": "Capitol Corridor Joint Powers Authority (CCJPA)",
        "Los Angeles County Metropolitan Transportation Authority": "Los Angeles County Metropolitan Transportation Authority (LA Metro)",
        "Monterey-Salinas Transit": "Monterey-Salinas Transit District (MST)",
        "Sacramento Regional Transit (SacRT)": "Sacramento Regional Transit District (SacRT)",
        "Sacramento Regional Transit District": "Sacramento Regional Transit District (SacRT)",
        "Sacramento Regional Transit District (SacRT) ": "Sacramento Regional Transit District (SacRT)",
        "San Diego Association of Governments": "San Diego Association of Governments (SANDAG)",
        "Santa Clara Valley Transportation Authority (SCVTA)": "Santa Clara Valley Transportation Authority (VTA)",
        "Southern California  Regional Rail Authority (SCRRA)": "Southern California Regional Rail Authority (SCRRA - Metrolink)",
        "Southern California Regional Rail Authority": "Southern California Regional Rail Authority (SCRRA - Metrolink)",
        "3, 4": "VAR",
    }
    
    # replacing values in agency & county col
    df3 = df2.replace(
        {"grant_recipient": value_replace_dict}
    ).replace(
        {"county": value_replace_dict}
    )
    
    # using update function to update values at specific columns and rows
    col_row_updater(df3, 'ppno', 'CP106', 'bus_count', 42)
    col_row_updater(df3, 'ppno', 'CP005', 'bus_count', 29)
    col_row_updater(df3, 'ppno', 'CP028', 'bus_count', 12)
    col_row_updater(df3, 'ppno', 'CP048', 'bus_count', 5)
    col_row_updater(df3, 'ppno', 'CP096', 'bus_count', 6)
    col_row_updater(df3, 'ppno', 'CP111', 'bus_count', 5)
    col_row_updater(df3, 'ppno', 'CP130', 'bus_count', 7)
    col_row_updater(df3, 'total_project_cost', 203651000, 'bus_count', 8)
    
    # columns to change dtype to str
    dtype_update = [
        'ppno',
        'district'
    ]
    
    df3[dtype_update] = df3[dtype_update].astype('str')
    
    # assigning new columns using imported functions.
    df4 = df3.assign(
        prop_type = df3['project_description'].apply(new_prop_finder),
        bus_size_type = df3['project_description'].apply(new_bus_size_finder),
        new_project_type  = df3['project_description'].apply(project_type_checker)
    )

    return df4

def tircp_agg_bus_only(df: pd.DataFrame) -> pd.DataFrame:
    """
    filters df to only include projects with bus procurement and for project type = bus only 
    does not include engineering, planning or construction only projects.
    then, aggregates the df by agency name and ppno. Agencies may have multiple projects that procure different types of buses
    """
    df2 = df[
        (df["bus_count"] > 0) & (df["new_project_type"] == "bus only")
    ]
    
    df3 = (
        df2.groupby(
            [
                "grant_recipient",
                "ppno",
                "prop_type",
                "bus_size_type",
                "project_description",
                "new_project_type"
            ]
        )
        .agg({"total_project_cost": "sum", "bus_count": "sum"})
        .reset_index()
    )
    return df3

#if __name__ == "__main__":
    
    
    
    # initial df
#    df1 = clean_tircp_columns()
    
    # aggregate 
#    df2 = tircp_agg_bus_only(df1)
    
    # export both df's as parquets to GCS
#    df1.to_parquet(f'{gcs_path}clean_tircp_all_project.parquet')
#    df2.to_parquet(f'{gcs_path}clean_tircp_bus_only_clean.parquet')

In [12]:
# DGS
import numpy as np
import pandas as pd
import shared_utils
from calitp_data_analysis.sql import to_snakecase
from bus_cost_utils import *

def calculate_total_cost(row):
    """
    Calculate new column for total cost by checking if total_with_options_per_unit is present or not.
    if not, then calculate using contract_unit_price.
    to be used with .assign()
    """
    if row["total_with_options_per_unit"] > 0:
        return row["total_with_options_per_unit"] * row["quantity"]
    else:
        return row["contract_unit_price"] * row["quantity"]
    
def clean_dgs_columns() -> pd.DataFrame:
    """
    reads in 2 dgs sheets, adds source column, merges both DFs, snakecase columns, update dtypes for monetary columns.
    merged first becaues the snakecase function messes with the dtypes for some reason
    """
    
    from fta_data_cleaner import gcs_path
    
    # params
    file_17c = "17c compiled-Proterra Compiled Contract Usage Report .xlsx"
    file_17b = "17b compiled.xlsx"
    sheet_17c = "Proterra "
    sheet_17b = "Usage Report Template"

    # merge columns for dataframes
    merge_col = [
        "Agency Billing Code",
        "Contract Line Item Number (CLIN)                (RFP ID)",
        "Contract Unit Price",
        "Delivery Date",
        "Extended Contract Price Paid",
        "Index Date / Catalog Version",
        "Item Description",
        "List Price/MSRP",
        "Manufacturer (OEM)",
        "Manufacturer Part Number (OEM #)",
        "Ordering Agency Name",
        "Purchase Order Date",
        "Purchase Order Number",
        "Purchasing Authority Number                    (for State departments)",
        "Quantity in \nUnit of Measure\n",
        "Quantity",
        "source",
        "State (S) or Local (L) agency",
        "Unit of Measure",
        "UNSPSC Code\n(Version 10)",
        "Supplier Contract Usage ID",
    ]

    # columns to change dtype
    to_int64 = [
        "contract_unit_price",
        "extended_contract_price_paid",
        "total_with_options_per_unit",
        "grand_total",
    ]
    
    # read in data
    dgs_17c = pd.read_excel(f"{gcs_path}{file_17c}", sheet_name=sheet_17c)
    dgs_17b = pd.read_excel(f"{gcs_path}{file_17b}", sheet_name=sheet_17b)

    # add new column to identify source
    dgs_17c["source"] = "17c"
    dgs_17b["source"] = "17b"

    # merge
    dgs_17bc = pd.merge(dgs_17b, dgs_17c, how="outer", on=merge_col).fillna(0)

    # snakecase
    dgs_17bc = to_snakecase(dgs_17bc)

    # takes list of columns and updates to int64
    dgs_17bc[to_int64] = dgs_17bc[to_int64].astype("int64")

    # change purchase_order_number col to str
    dgs_17bc["purchase_order_number"] = dgs_17bc["purchase_order_number"].astype("str")

    # adds 3 new columns from functions
    dgs_17bc2 = dgs_17bc.assign(
        total_cost=dgs_17bc.apply(calculate_total_cost, axis=1),
        new_prop_type=dgs_17bc["item_description"].apply(new_prop_finder),
        new_bus_size=dgs_17bc["item_description"].apply(new_bus_size_finder),
    )

    return dgs_17bc2

def dgs_agg_by_agency(df: pd.DataFrame) -> pd.DataFrame:
    """
    function that aggregates the DGS data frame by transit agency and purchase order number (PPNO) to get total cost of just buses without options.
    first, dataframe is filtered for rows containing buses (does not include rows with 'not specified').
    then, group by agency, PPNO, prop type and bus size. and aggregate the quanity and total cost of just buses.
    Possible for agencies to have multiple PPNOs for different bus types and sizes.
    """
    # filter for rows containing bus, does not include accessories/warranties/parts/etc.
    agg_agency_bus_count = df[~df["new_prop_type"].str.contains("not specified")]

    agg_agency_bus_count2 = agg_agency_bus_count[
        [
            "ordering_agency_name",
            "purchase_order_number",
            "item_description",
            "quantity",
            "source",
            "total_cost",
            "new_prop_type",
            "new_bus_size",
        ]
    ]

    agg_agency_bus_count3 = (
        agg_agency_bus_count2.groupby(
            [
                "ordering_agency_name",
                "purchase_order_number",
                "new_prop_type",
                "new_bus_size",
            ]
        )
        .agg(
            {
                "quantity": "sum",
                "total_cost": "sum",
                "source": "max",
            }
        )
        .reset_index()
    )

    return agg_agency_bus_count3

def dgs_agg_by_agency_w_options(df: pd.DataFrame) -> pd.DataFrame:
    """
    similar to the previous function, aggregates the DGS dataframe by transit agency to get total cost of buses with options.
    agencies may order buses with different configurations, resulting in different total cost.
    function creates 1 df of only buses to retain initial proulsion type, size type and quanity of buses.
    then, creates 2nd df of aggregated total cost of buses+options, by transit agency.
    lastly, both df's are merged together.
    """
    # filter df for rows NOT containing 'not specified'. only returns rows with buses
    dfa = df[~df["new_prop_type"].str.contains("not specified")]

    # keep specific columns
    df2 = dfa[
        [
            "ordering_agency_name",
            "purchase_order_number",
            "quantity",
            "new_prop_type",
            "new_bus_size",
            "source",
        ]
    ]

    # aggregate by agency and PPNO, get total cost of buses with options
    df3 = (
        df.groupby(["ordering_agency_name", "purchase_order_number"])
        .agg({"total_cost": "sum"})
        .reset_index()
    )

    # merge both dataframes on agency and PPNO to get bus only rows & total cost with options.
    merge = pd.merge(
        df2, df3, on=["ordering_agency_name", "purchase_order_number"], how="left"
    )

    return merge

#if __name__ == "__main__":
    

    # initial df
#    df1 = clean_dgs_columns()
    
    #df of just bus cost (no options)
#    just_bus = dgs_agg_by_agency(df1)
    
    #df of bus cost+options
#    bus_w_options = dgs_agg_by_agency_w_options(df1)
    
    #export serperate df's as parquet to GCS
#    just_bus.to_parquet(f'{gcs_path}clean_dgs_all_projects.parquet')
#    bus_w_options.to_parquet(f'{gcs_path}clean_dgs_bus_only_options.parquet')

In [13]:
# cost per bus cleaner
# rename to all_bus_cost_cleaner?

import pandas as pd
from bus_cost_utils import *
from scipy.stats import zscore

def prepare_all_data() ->pd.DataFrame:
    """
    primary function to read-in, merge data across FTA, TIRCP and DGS data.
    standardizes columns names, then exports as parquet.
    """
    # variables for file names


    
    # dictionary to update columns names 
    col_dict = {
        "funding": "total_cost",
        "grant_recipient": "transit_agency",
        "new_bus_size": "bus_size_type",
        "new_bus_size_type": "bus_size_type",
        "new_prop_type": "prop_type",
        "new_prop_type_finder": "prop_type",
        "ordering_agency_name": "transit_agency",
        "purchase_order_number": "ppno",
        "quantity": "bus_count",
        "total_project_cost": "total_cost",
        "project_sponsor": "transit_agency",
    }

    # reading in data
    # bus only projects for each datase
    fta = pd.read_parquet(f"{gcs_path}clean_fta_bus_only.parquet")
    tircp = pd.read_parquet(f"{gcs_path}clean_tircp_bus_only_clean.parquet")
    dgs = pd.read_parquet(f"{gcs_path}clean_dgs_bus_only_options.parquet")
    
    # adding new column to identify source
    fta["source"] = "fta"
    tircp["source"] = "tircp"
    dgs["source"] = "dgs"

    # using .replace() with dictionary to update column names
    fta2 = fta.rename(columns=col_dict)
    tircp2 = tircp.rename(columns=col_dict)
    dgs2 = dgs.rename(columns=col_dict)
    
    # merging fta2 and tircp 2
    merge1 = pd.merge(fta2,
        tircp2,
        on=[
            "transit_agency",
            "prop_type",
            "bus_size_type",
            "total_cost",
            "bus_count",
            "source",
            "new_project_type"
        ],
        how="outer",
    )
    
    # mergeing merge1 and dgs2
    merge2 = pd.merge(merge1,
        dgs2,
        on=[
            "transit_agency",
            "prop_type",
            "bus_size_type",
            "total_cost",
            "bus_count",
            "source",
            "ppno",
        ],
        how="outer",
    )
    
    return merge2

def cpb_zscore_outliers(df: pd.DataFrame, zscore_col: str) -> pd.DataFrame:
    """
    function that calculated cost per bus col, z-score col, then flags outliers
    """
    #calculate cost per bus (aka unit cost per bus)
    df['cpb'] = (df['total_cost'] / df['bus_count']).astype("int64")
    
    #calculate zscore
    df["zscore_cost_per_bus"] = zscore(df["cpb"])
    
    #flag outliers


    
    return df


#if __name__ == "__main__":
    
    # initial df
    #df1 = prepare_all_data()
    
    # export to gcs
    #df1.to_parquet(f'{gcs_path}cpb_analysis_data_merge.parquet')


In [14]:
# new function to tag rows with an outlier flag
def outlier_flag(col):
    """
    function to flag rows 
    """
    return col <= -3 or col >= 3

#df["is cpb outlier?"] = df["zscore_cost_per_bus"].apply(outlier_flag)

In [67]:
# initial final df from old code
# 89 rows and 11 columns
display(
    final.shape,
    final.columns
)

(89, 11)

Index(['transit_agency', 'project_title', 'prop_type', 'bus_size_type',
       'description', 'new_project_type', 'total_cost', 'bus_count', 'source',
       'ppno', 'project_description'],
      dtype='object')

In [68]:
# making copy of final 
# test = final #THIS DOES NOT WORK! this is just assigning a new name to final
test = final.copy()

In [69]:
test.shape

(89, 11)

In [70]:
# using function to on test df to get zscore 
test = cpb_zscore_outliers(test,"zscore_cost_per_bus")

In [72]:
# why does the shape of `final` change after calling `cpb_zscore_outlier` on `test` df?
display(
    final.shape,
    test.shape,
    final.columns,
    test.columns
)

(89, 11)

(89, 13)

Index(['transit_agency', 'project_title', 'prop_type', 'bus_size_type',
       'description', 'new_project_type', 'total_cost', 'bus_count', 'source',
       'ppno', 'project_description'],
      dtype='object')

Index(['transit_agency', 'project_title', 'prop_type', 'bus_size_type',
       'description', 'new_project_type', 'total_cost', 'bus_count', 'source',
       'ppno', 'project_description', 'cpb', 'zscore_cost_per_bus'],
      dtype='object')

In [79]:
test["is cpb outlier?"] = test["zscore_cost_per_bus"].apply(outlier_flag)

In [78]:
test["is cpb outlier?"] = ((test["zscore_cost_per_bus"] <= -3) or (test["zscore_cost_per_bus"] >= 3))

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [80]:
display(
    final.shape,
    test.shape,
    test.head(),
    test["zscore_cost_per_bus"].value_counts(),
    test["is cpb outlier?"].value_counts()
)

(89, 11)

(89, 14)

Unnamed: 0,transit_agency,project_title,prop_type,bus_size_type,description,new_project_type,total_cost,bus_count,source,ppno,project_description,cpb,zscore_cost_per_bus,is cpb outlier?
0,AUTORIDAD METROPOLITANA DE AUTOBUSES (PRMBA),Puerto Rico Initiative Minimizing Emissions Pl...,electric (not specified),not specified,The Metropolitan Bus Authority will receive fu...,bus only,10000000,8.0,fta,,,1250000,0.917956,False
1,Cape Fear Public Transportation Authority,Wave Transit Low Emissions Replacement Vehicles,CNG,not specified,Wave Transit will receive funding to buy compr...,bus only,2860250,5.0,fta,,,572050,-0.529139,False
2,Central Oklahoma Transportation and Parking Au...,"COTPA, dba EMBARK Elimination of Fixed Route D...",CNG,not specified,The Central Oklahoma Transportation and Parkin...,bus only,4278772,9.0,fta,,,475419,-0.735399,False
3,Champaign-Urbana Mass Transit District,MTD 40-Foot Hybrid Replacement Buses,low emission (hybrid),not specified,The Champaign-Urbana Mass Transit District wil...,bus only,6635394,10.0,fta,,,663539,-0.333854,False
4,City of Beaumont,Beaumont Municipal Transit Zips to Improve Low...,CNG,not specified,Beaumont Municipal Transit will receive fundin...,bus only,2819460,5.0,fta,,,563892,-0.546552,False


 0.917956    1
-0.652337    1
 0.227613    1
 0.684585    1
 0.268993    1
 0.293092    1
 0.308767    1
 1.586696    1
 0.579979    1
 0.202398    1
 0.688137    1
 0.239078    1
 0.183435    1
 0.174359    1
 0.175072    1
 0.706705    1
 0.879900    1
 0.445312    1
-0.299328    1
-1.595436    1
 0.019325    1
 0.229141    1
 0.557801    1
 0.257558    1
 0.420721    1
 0.217728    1
 0.237321    1
 0.807602    1
 5.130048    1
 1.689929    1
 0.083937    1
 0.464745    1
 0.355695    1
 0.806018    1
 0.421807    1
 0.168741    1
 2.661856    1
 0.455833    1
 0.469117    1
 0.150764    1
 0.713839    1
 0.393747    1
 0.058203    1
-0.235963    1
 0.208800    1
-0.529139    1
 1.155549    1
-0.597550    1
 0.341421    1
-0.547645    1
 0.281905    1
-1.549482    1
-0.522767    1
 1.359348    1
 0.692645    1
 0.651142    1
-1.332774    1
-0.573985    1
-1.473871    1
-1.318859    1
-0.612123    1
-0.064318    1
-0.355957    1
-0.546552    1
-0.333854    1
-0.735399    1
-0.923420 

False    88
True      1
Name: is cpb outlier?, dtype: int64

In [82]:
test[test["is cpb outlier?"] == True]


Unnamed: 0,transit_agency,project_title,prop_type,bus_size_type,description,new_project_type,total_cost,bus_count,source,ppno,project_description,cpb,zscore_cost_per_bus,is cpb outlier?
84,Transit Joint Powers Authority for Merced County,,BEB,standard/conventional (30ft-45ft),,,3223324,1.0,dgs,EBUS002,,3223324,5.130048,True


In [81]:
# need to compare the results from outlier_flag to initial method
# can i read in the df from the old NB with outliers identified?

display(
    final.columns,
    test.columns
)

Index(['transit_agency', 'project_title', 'prop_type', 'bus_size_type',
       'description', 'new_project_type', 'total_cost', 'bus_count', 'source',
       'ppno', 'project_description'],
      dtype='object')

Index(['transit_agency', 'project_title', 'prop_type', 'bus_size_type',
       'description', 'new_project_type', 'total_cost', 'bus_count', 'source',
       'ppno', 'project_description', 'cpb', 'zscore_cost_per_bus',
       'is cpb outlier?'],
      dtype='object')