In [1]:
import matplotlib.pyplot as plt
from matplotlib.ticker import ScalarFormatter
import numpy as np
import pandas as pd
import seaborn as sns
import shared_utils
from scipy.stats import zscore

# set_option to increase max rows displayed to 200, to see the entire df in one go
pd.set_option("display.max_rows", 200)

# function to display df info
def df_peek(df):
    """Display a quick structural summary of ``df``: its type, its shape, and its column dtypes."""
    overview = (type(df), df.shape, df.dtypes)
    display(*overview)

## Read in Raw Data
17c and 17b, each read from an Excel workbook.

In [2]:
# Params to read in dgs data
# Cloud-storage (gs://) locations of the 17c and 17b workbooks.
url_17c = 'gs://calitp-analytics-data/data-analyses/bus_procurement_cost/17c compiled-Proterra Compiled Contract Usage Report .xlsx'
url_17b = 'gs://calitp-analytics-data/data-analyses/bus_procurement_cost/17b compiled.xlsx'

# Worksheet to load from each workbook.
# NOTE(review): 'Proterra ' ends with a space — presumably it matches the tab name exactly; confirm before trimming.
sheet_17c = 'Proterra '
sheet_17b = 'Usage Report Template'

def read_excel(url, sheet):
    """Load a single worksheet of an Excel workbook into a DataFrame.

    url: location of the workbook, handed straight to ``pd.read_excel``.
    sheet: worksheet to load, passed as ``sheet_name``.
    """
    return pd.read_excel(url, sheet_name=sheet)

# Load each usage report and tag every row with the report it came from,
# via a constant `source` column ('17c' / '17b').
dgs_17c = read_excel(url_17c, sheet_17c).assign(source='17c')
dgs_17b = read_excel(url_17b, sheet_17b).assign(source='17b')

## Merge data frames

In [3]:
# Join keys for combining the 17b and 17c frames.
merge_col = [
    'Supplier Contract Usage ID',
    'Ordering Agency Name',
    'State (S) or Local (L) agency',
    'Purchasing Authority Number                    (for State departments)',
    'Agency Billing Code',
    'Purchase Order Number',
    'Purchase Order Date',
    'Delivery Date',
    'Contract Line Item Number (CLIN)                (RFP ID)',
    'UNSPSC Code\n(Version 10)',
    'Manufacturer Part Number (OEM #)',
    'Manufacturer (OEM)',
    'Item Description',
    'Unit of Measure',
    'Quantity in \nUnit of Measure\n',
    'Quantity',
    'List Price/MSRP',
    'Index Date / Catalog Version',
    'Contract Unit Price',
    'Extended Contract Price Paid',
    'source',
]

# `source` is one of the keys and holds a different constant in each frame, so a
# 17b row can never pair with a 17c row: the outer join keeps every row of both.
# Columns outside merge_col that exist in both frames come back with _x/_y suffixes.
# Missing cells are then filled with 0 in every column, whatever its dtype.
dgs_17bc = dgs_17b.merge(dgs_17c, how='outer', on=merge_col).fillna(0)

In [4]:
# check work
# number of rows is correct (107 rows)
# will need to work on merge columns

#df_peek(dgs_17bc)


pandas.core.frame.DataFrame

(107, 31)

Supplier Contract Usage ID                                                  int64
Ordering Agency Name                                                       object
State (S) or Local (L) agency                                              object
Purchasing Authority Number                    (for State departments)    float64
Agency Billing Code                                                       float64
Purchase Order Number                                                      object
Purchase Order Date                                                        object
Delivery Date                                                              object
Contract Line Item Number (CLIN)                (RFP ID)                  float64
UNSPSC Code\n(Version 10)                                                 float64
Manufacturer Part Number (OEM #)                                           object
Manufacturer (OEM)                                                         object
SKU # / Item #_x

## Data Cleaning and QC

In [5]:
#snake case columns
def snake_case(df):
    """Rename the columns of ``df`` in place to lower-case, underscore-separated labels.

    Each label is first stripped of leading/trailing whitespace, then lower-cased,
    then has every remaining space replaced with an underscore. Stripping has to
    happen before the replacement: once the spaces have become underscores there
    is nothing left for ``strip`` to remove, and padded labels would end up with
    stray leading/trailing underscores.

    Returns None; ``df.columns`` is reassigned on the frame that was passed in.
    """
    df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_', regex=False)
    
snake_case(dgs_17bc)

In [6]:
# check work
dgs_17bc.columns

Index(['supplier_contract_usage_id', 'ordering_agency_name',
       'state_(s)_or_local_(l)_agency',
       'purchasing_authority_number____________________(for_state_departments)',
       'agency_billing_code', 'purchase_order_number', 'purchase_order_date',
       'delivery_date',
       'contract_line_item_number_(clin)________________(rfp_id)',
       'unspsc_code\n(version_10)', 'manufacturer_part_number_(oem_#)',
       'manufacturer_(oem)', 'sku_#_/_item_#_x', 'item_description',
       'unit_of_measure', 'quantity_in_\nunit_of_measure', 'epp_(y/n)_x',
       'quantity', 'list_price/msrp', 'index_date_/_catalog_version',
       'contract_unit_price', 'contract_discount_x',
       'extended_contract_price_paid', 'total_with_options_per_unit',
       'grand_total', 'source', 'sku_#_/_item_#_y', 'epp_(y/n)_y',
       'contract_discount_y', 'core/_noncore', 'group_id/_segment_id'],
      dtype='object')

In [7]:
# Financial columns that should be stored as `int64`.
money = [
    'contract_unit_price',
    'extended_contract_price_paid',
    'total_with_options_per_unit',
    'grand_total',
]

# Cast all of them in a single astype call.
# NOTE(review): if any of these columns hold fractional amounts, casting to int64
# drops the fraction rather than rounding — confirm that is acceptable.
dgs_17bc = dgs_17bc.astype(dict.fromkeys(money, 'int64'))

In [58]:
# check work. GTG
# the columns are int64 dtype
dgs_17bc[money].dtypes


contract_unit_price             int64
extended_contract_price_paid    int64
total_with_options_per_unit     int64
grand_total                     int64
dtype: object

In [9]:
# Columns treated as unnecessary from here on and removed from the merged frame.
drops = [
    'supplier_contract_usage_id',
    'state_(s)_or_local_(l)_agency',
    'purchasing_authority_number____________________(for_state_departments)',
    'agency_billing_code',
    'unspsc_code\n(version_10)',
    'unit_of_measure',
    'epp_(y/n)_x',
    'epp_(y/n)_y',
    'list_price/msrp',
    'index_date_/_catalog_version',
    'core/_noncore',
    'group_id/_segment_id',
]

dgs_17bc = dgs_17bc.drop(columns=drops)

In [10]:
# check work. GTG
# initial merge had like 30 columns

df_peek(dgs_17bc)

pandas.core.frame.DataFrame

(107, 19)

ordering_agency_name                                         object
purchase_order_number                                        object
purchase_order_date                                          object
delivery_date                                                object
contract_line_item_number_(clin)________________(rfp_id)    float64
manufacturer_part_number_(oem_#)                             object
manufacturer_(oem)                                           object
sku_#_/_item_#_x                                             object
item_description                                             object
quantity_in_\nunit_of_measure                                 int64
quantity                                                      int64
contract_unit_price                                           int64
contract_discount_x                                          object
extended_contract_price_paid                                  int64
total_with_options_per_unit                     

In [11]:
# new column for total cost
# 17b >> `grand_total` = total_with_options_per_unit * quantity
# 17c >> `extended_contract_price_paid` = contract_unit_price * quantity

# what I'm trying to do: create a new column called "total_cost". For each row, if `total_with_options_per_unit` is > 0, multiply `total_with_options_per_unit` by `quantity`;
# otherwise (it is 0), multiply `contract_unit_price` by `quantity`.

# df['total_cost'] = df['quantity'] * df['total_with_options_per_unit'] or df['contract_unit_price']???

def calculate_total_cost(row):
    """Return the total cost of one line item.

    The unit price is ``total_with_options_per_unit`` when that value is greater
    than zero, otherwise ``contract_unit_price``; it is multiplied by ``quantity``.

    row: any mapping-like object indexable by those column names.
    """
    with_options = row['total_with_options_per_unit']
    unit_price = with_options if with_options > 0 else row['contract_unit_price']
    return unit_price * row['quantity']


In [12]:
# new column for total cost: calculate_total_cost is applied to each row (axis=1)

dgs_17bc['total_cost'] = dgs_17bc.apply(calculate_total_cost, axis=1)

In [13]:
# Side-by-side view of the reported totals and the derived `total_cost`,
# to check that the calculation logic holds.
# 17b reports `grand_total`; 17c reports `extended_contract_price_paid`.
keep_col = [
    'ordering_agency_name',
    'purchase_order_number',
    'item_description',
    'source',
    'grand_total',
    'extended_contract_price_paid',
    'total_cost',
]

col_compare = dgs_17bc.loc[:, keep_col]
col_compare.shape

(107, 7)

In [59]:
# check work. GTG
# found some zero values, but manuals and warranty items that didnt have a cost initially.
# should be good to go.

dgs_17bc.loc[dgs_17bc['total_cost'] == 0, 'item_description'].unique()

array(['Extended warranty for Propulsion System and Battery', 'Manuals',
       'Extended warranty for Propulsion System, Battery and Air Compressor',
       'Manuals, Diagnostic equipment and tooling',
       'Extended warranty for Battery and Air Compressor',
       'Manuals and Training',
       'Training, Manuals, Diagnostic equipment and tooling',
       'Extended warranty for Propulsion System, Battery, HVAC, APC and Camera system',
       'Extended warranty for Air Compressor',
       'Diagnostic equipment and tooling', 'Other - Tools'], dtype=object)

In [16]:
# new column for prop_type

# Propulsion-type phrases to look for in an item description. Matching is a
# case-sensitive substring test, so each capitalisation is listed separately
# ('Electic' is kept as-is because it matches descriptions in the data).
# Order matters: the first phrase found wins, so the shorter 'battery electric'
# has to stay after 'battery electric bus'.
prop_list = ['Battery Electric Bus',
            'battery electric bus',
            'Fuel Cell Electric Bus',
            'fuel cell electric bus',
            'Hydrogen Electic Bus',
            'hydrogen electric bus',
            'battery electric',
            ]

# function to match keywords to list
def prop_type_finder(description):
    """Return the first entry of ``prop_list`` found inside ``description``.

    description: item description text. Non-string values (for example the 0 that
        ``fillna(0)`` puts in place of a missing description) cannot contain a
        phrase and are reported as "not specified" instead of raising TypeError.

    Returns the matching phrase exactly as spelled in ``prop_list``, or
    "not specified" when no phrase occurs in the description.
    """
    if not isinstance(description, str):
        return "not specified"
    return next(
        (keyword for keyword in prop_list if keyword in description),
        "not specified",
    )

# add new col `prop_type`, fill it with values based on `item_description` using the prop_type_finder function
dgs_17bc["prop_type"] = dgs_17bc["item_description"].apply(prop_type_finder)

In [17]:
# check work. GTG
# all good. no more mention of buses in value counts. 
display(dgs_17bc.prop_type.value_counts(),
        dgs_17bc[dgs_17bc['prop_type']=='not specified'].item_description.value_counts()
       )

not specified             70
Battery Electric Bus      20
battery electric bus       8
Hydrogen Electic Bus       4
battery electric           3
Fuel Cell Electric Bus     1
hydrogen electric bus      1
Name: prop_type, dtype: int64

Options - Configurables                                                          13
Additional 220 kWH battery pack                                                  13
Other - Tools                                                                     8
Options - Charging Equipment                                                      5
Extended warranty for Propulsion System and Battery                               3
Options - Configurables (DuoPower)                                                3
Other - Spare Parts                                                               3
Other - Training, manuals and Tools                                               3
Options - Configurables (non taxable)                                             2
Extended Warranty                                                                 2
Options - Warranty                                                                2
Other - Warranty                                                            

In [18]:
# new column for bus size type

# Bus-length phrases to look for in an item description. Matching is a
# case-sensitive substring test, so each capitalisation is listed separately;
# the first phrase found wins.
size_list =['35 Foot',
           '40 Foot',
           '60 foot',
           '40 foot',
           '35 foot',
           ]

def bus_size_finder(description):
    """Return the first entry of ``size_list`` found inside ``description``.

    description: item description text. Non-string values (for example the 0 that
        ``fillna(0)`` puts in place of a missing description) cannot contain a
        phrase and are reported as "not specified" instead of raising TypeError.

    Returns the matching phrase exactly as spelled in ``size_list``, or
    "not specified" when no phrase occurs in the description.
    """
    if not isinstance(description, str):
        return "not specified"
    return next(
        (keyword for keyword in size_list if keyword in description),
        "not specified",
    )

# add new col `bus_size_type`, filled from `item_description` using the bus_size_finder function
dgs_17bc["bus_size_type"] = dgs_17bc["item_description"].apply(bus_size_finder)

In [19]:
# check work. GTG
# good to go, included more str in list, checked value counts and no more mentions of bus.
display(dgs_17bc.shape,
        dgs_17bc.bus_size_type.value_counts(),
        #value counts `item_descrip` column, filtered by 'bus_size_type' == not specified'
        dgs_17bc[dgs_17bc['bus_size_type'] =='not specified'].item_description.value_counts()
       )



(107, 22)

not specified    70
40 Foot          16
35 Foot           9
40 foot           8
35 foot           3
60 foot           1
Name: bus_size_type, dtype: int64

Options - Configurables                                                          13
Additional 220 kWH battery pack                                                  13
Other - Tools                                                                     8
Options - Charging Equipment                                                      5
Extended warranty for Propulsion System and Battery                               3
Options - Configurables (DuoPower)                                                3
Other - Spare Parts                                                               3
Other - Training, manuals and Tools                                               3
Options - Configurables (non taxable)                                             2
Extended Warranty                                                                 2
Options - Warranty                                                                2
Other - Warranty                                                            

## Aggregate by agency name?
PPNO isn't stable: UCSD has a different PPNO for each line item (bus, manuals, other), even though they are all part of the same purchase.

1. Get a df of prop_type by PPNO for 17c, keeping rows whose item description includes 'bus'
2. Get a df of PPNO and prop_type

In [62]:
list(dgs_17bc.columns)

['ordering_agency_name',
 'purchase_order_number',
 'purchase_order_date',
 'delivery_date',
 'contract_line_item_number_(clin)________________(rfp_id)',
 'manufacturer_part_number_(oem_#)',
 'manufacturer_(oem)',
 'sku_#_/_item_#_x',
 'item_description',
 'quantity_in_\nunit_of_measure',
 'quantity',
 'contract_unit_price',
 'contract_discount_x',
 'extended_contract_price_paid',
 'total_with_options_per_unit',
 'grand_total',
 'source',
 'sku_#_/_item_#_y',
 'contract_discount_y',
 'total_cost',
 'prop_type',
 'bus_size_type']

In [87]:
# 17c rows whose item description contains the substring 'bus'.
# NOTE(review): str.contains is case-sensitive by default, so descriptions that
# only spell it 'Bus' are not selected — confirm that is intended.
prop_type_ppno = dgs_17bc[(dgs_17bc['source']=='17c') & (dgs_17bc['item_description'].str.contains('bus'))]

prop_type_ppno

Unnamed: 0,ordering_agency_name,purchase_order_number,purchase_order_date,delivery_date,contract_line_item_number_(clin)________________(rfp_id),manufacturer_part_number_(oem_#),manufacturer_(oem),sku_#_/_item_#_x,item_description,quantity_in_\nunit_of_measure,...,contract_discount_x,extended_contract_price_paid,total_with_options_per_unit,grand_total,source,sku_#_/_item_#_y,contract_discount_y,total_cost,prop_type,bus_size_type


In [94]:
# One row per ordering agency, with quantity and total cost summed over its line items.
# Despite its name, `ppno` is keyed by `ordering_agency_name`.
ppno = (
    dgs_17bc
    .groupby('ordering_agency_name', as_index=False)
    .agg({'quantity': 'sum', 'total_cost': 'sum'})
)

# total cost looks good.
display(ppno.shape, ppno.head())

(32, 3)

Unnamed: 0,ordering_agency_name,quantity,total_cost
0,Alameda County Transit Authority,40,22846640
1,"CITY OF PORTERVILLE (PORTERVILLE, CA)",3,2781891
2,CULVER CITY TRANSPORTATION DEPARTMENT (CULVER ...,12,3623536
3,City of Roseville,32,9651507
4,City of San Luis Obispo,5,859270


In [99]:
# Rows whose description named a propulsion type, i.e. everything except 'not specified'.
has_prop_type = dgs_17bc['prop_type'].ne('not specified')
just_prop = dgs_17bc.loc[has_prop_type]

# 32 unique agency names in `just_prop`, the same count as the 32 agencies in `ppno`.
display(just_prop.shape, just_prop.ordering_agency_name.nunique(), just_prop.head())


(37, 22)

32

Unnamed: 0,ordering_agency_name,purchase_order_number,purchase_order_date,delivery_date,contract_line_item_number_(clin)________________(rfp_id),manufacturer_part_number_(oem_#),manufacturer_(oem),sku_#_/_item_#_x,item_description,quantity_in_\nunit_of_measure,...,contract_discount_x,extended_contract_price_paid,total_with_options_per_unit,grand_total,source,sku_#_/_item_#_y,contract_discount_y,total_cost,prop_type,bus_size_type
0,SUNLINE TRANSIT AGENCY (THOUSAND PALMS),11819,2020-07-21 00:00:00,ALL 5 HAVE BEEN DELIVERED,10.2,XHE40,New Flyer of America Inc,SR-2559,40 Foot Low Floor Fuel Cell Electric Bus,1,...,Volume Discount Applied,5059893,1151031,5755155,17b,0.0,0.0,5755155,Fuel Cell Electric Bus,40 Foot
3,Lane Transit (Oregon),2020-061,0,ALL 11 HAVE BEEN DELIVERED,3.3,XE40,New Flyer of America Inc,SR-2506,40 Foot Low Floor Battery Electric Bus,1,...,0,9738237,901966,9921636,17b,0.0,0.0,9921626,Battery Electric Bus,40 Foot
6,VICTOR VALLEY TRANSIT AUTHORITY (VVTA),1416,2020-07-17 00:00:00,ALL 5 HAVE BEEN DELIVERED,3.2,XE40,New Flyer of America Inc,SR-2554,40 Foot Low Floor Battery Electric Bus,1,...,Volume Discount Applied,3703843,901632,4508162,17b,0.0,0.0,4508160,Battery Electric Bus,40 Foot
8,CULVER CITY TRANSPORTATION DEPARTMENT (CULVER ...,22100367 - 00,2020-08-19 00:00:00,ALL 4 HAVE BEEN DELIVERED,3.1,XE40,New Flyer of America Inc,SR-2564,40 Foot Low Floor Battery Electric Bus,1,...,Volumn Discount Not Applicable,2967072,905884,3623536,17b,0.0,0.0,3623536,Battery Electric Bus,40 Foot
11,ORANGE COUNTY TRANSPORTATION AUTHORITY (ORANGE...,CO2165,2020-12-01 00:00:00,ALL 10 HAVE BEEN DELIVERED,3.3,XE40,New Flyer of America Inc,SR-2588,40 Foot Low Floor Battery Electric Bus,1,...,Volume Discount Applied,7387686,931952,9319524,17b,0.0,0.0,9319520,Battery Electric Bus,40 Foot


In [None]:
# !!!!DEPRECATED!!!!

# Goal: consolidate all the individual line items of an agency up into its bus row.
# Observe what happens to the prop_type column — the hope is that it collapses to the bus row's value.

# Per-agency roll-up: number of line items, summed total_cost, and the `min` of prop_type.
# NOTE(review): `min`/`max` on a text column presumably pick the first/last value in
# sort order, not "the bus row" — verify before relying on either result.
groupby1= dgs_17bc.groupby('ordering_agency_name').agg({
    'purchase_order_number':'count',
'total_cost': 'sum',
'prop_type':'min'}).reset_index()

# Same roll-up, but taking the `max` of prop_type, for comparison with groupby1.
groupby2=dgs_17bc.groupby('ordering_agency_name').agg({
    'purchase_order_number':'count',
'total_cost': 'sum',
'prop_type':'max'}).reset_index()

# Show both roll-ups and how prop_type is distributed in each.
display(groupby1.head(),
        groupby2.head(),
        groupby1.prop_type.value_counts(),
        groupby2.prop_type.value_counts()
       )

In [None]:
# !!!!DEPRECATED!!!!

# comparing the prop types of the aggregated df and dgs_17bc

display(
    groupby1.prop_type.value_counts(),
    dgs_17bc.prop_type.value_counts()
)


In [None]:
# !!!!DEPRECATED!!!!

# maybe compare the count of agency names from the 2 instead?
# when looking at unique agency names, both have 32. values match everything. 
groupby1.ordering_agency_name.sort_values().unique() == dgs_17bc.ordering_agency_name.sort_values().unique()


## Export Cleaned data
save out as parquet

Initial Summary Stats