In [1]:
import matplotlib.pyplot as plt
from matplotlib.ticker import ScalarFormatter
import numpy as np
import pandas as pd
import seaborn as sns
import shared_utils
from scipy.stats import zscore

# Raise pandas' display limit to 200 rows so an entire df can be viewed in one go.
pd.set_option("display.max_rows", 200)

# quick structural summary of a df
def df_peek(df):
    """Display the type, shape and dtypes of ``df``.

    The three values are passed, in that order, to a single call of the
    notebook's ``display`` function. Nothing is returned.
    """
    summary = (type(df), df.shape, df.dtypes)
    display(*summary)

## Read in Raw Data
Read the 17C and 17B usage reports from their Excel workbooks.

In [25]:
# Params to read in dgs data
# Cloud-storage (gs://) paths of the two Excel workbooks, one per contract (17c and 17b).
url_17c = 'gs://calitp-analytics-data/data-analyses/bus_procurement_cost/17c compiled-Proterra Compiled Contract Usage Report .xlsx'
url_17b = 'gs://calitp-analytics-data/data-analyses/bus_procurement_cost/17b compiled.xlsx'

# Worksheet to read from each workbook.
# NOTE(review): 'Proterra ' ends with a space; presumably that matches the sheet's
# actual name in the workbook -- confirm before "cleaning" it.
sheet_17c = 'Proterra '
sheet_17b = 'Usage Report Template'

def read_excel(url, sheet):
    """Load one worksheet of an Excel workbook into a DataFrame.

    Parameters
    ----------
    url : path or URL of the workbook, forwarded to ``pd.read_excel``.
    sheet : worksheet selector, forwarded as ``sheet_name``.

    Returns
    -------
    Whatever ``pd.read_excel`` returns for that sheet.
    """
    return pd.read_excel(url, sheet_name=sheet)

# Read each contract's sheet and tag every row with the contract it came from,
# so the origin of a row is still known after the frames are combined.
dgs_17c = read_excel(url_17c, sheet_17c).assign(source='17c')
dgs_17b = read_excel(url_17b, sheet_17b).assign(source='17b')

## Merge data frames

In [33]:
# Key columns for the merge. The labels are the raw Excel headers, so the
# embedded runs of spaces and the "\n" escapes must be kept exactly as written.
merge_col = [
    'Supplier Contract Usage ID',
    'Ordering Agency Name',
    'State (S) or Local (L) agency',
    'Purchasing Authority Number                    (for State departments)',
    'Agency Billing Code',
    'Purchase Order Number',
    'Purchase Order Date',
    'Delivery Date',
    'Contract Line Item Number (CLIN)                (RFP ID)',
    'UNSPSC Code\n(Version 10)',
    'Manufacturer Part Number (OEM #)',
    'Manufacturer (OEM)',
    'Item Description',
    'Unit of Measure',
    'Quantity in \nUnit of Measure\n',
    'Quantity',
    'List Price/MSRP',
    'Index Date / Catalog Version',
    'Contract Unit Price',
    'Extended Contract Price Paid',
    'source',
]

# Outer merge keeps every row from both frames. 'source' is one of the keys and
# is '17b' on one side and '17c' on the other, so no rows pair up and the frames
# are effectively stacked. Columns present in both frames but absent from
# merge_col come back with _x / _y suffixes.
merged_17bc = dgs_17b.merge(dgs_17c, how='outer', on=merge_col)

# Every missing value becomes 0 -- including in text and date columns.
dgs_17bc = merged_17bc.fillna(0)

In [None]:
# check work
# number of rows is correct (107 rows)
# TODO: still need to work on the merge columns

df_peek(dgs_17bc)


## Data Cleaning and QC

In [34]:
# snake case columns
def snake_case(df):
    """Rename ``df``'s columns to lower-case, underscore-separated labels, in place.

    Each label is stripped of leading/trailing whitespace, lower-cased, and then
    has every remaining space character replaced by an underscore. Only the
    space character is replaced: interior newlines are kept, and a run of n
    spaces becomes n underscores.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are strings. Its ``columns`` attribute is
        reassigned; the data is untouched.

    Returns
    -------
    None
    """
    # Strip first: once spaces have been turned into underscores, strip() can
    # no longer remove leading/trailing padding (' Foo ' would become '_foo_').
    df.columns = (
        df.columns
        .str.strip()
        .str.lower()
        .str.replace(' ', '_', regex=False)
    )
    
# Rename the merged frame's columns in place (snake_case reassigns df.columns and returns nothing).
snake_case(dgs_17bc)

In [35]:
# check work: list the column labels after renaming
dgs_17bc.columns

Index(['supplier_contract_usage_id', 'ordering_agency_name',
       'state_(s)_or_local_(l)_agency',
       'purchasing_authority_number____________________(for_state_departments)',
       'agency_billing_code', 'purchase_order_number', 'purchase_order_date',
       'delivery_date',
       'contract_line_item_number_(clin)________________(rfp_id)',
       'unspsc_code\n(version_10)', 'manufacturer_part_number_(oem_#)',
       'manufacturer_(oem)', 'sku_#_/_item_#_x', 'item_description',
       'unit_of_measure', 'quantity_in_\nunit_of_measure', 'epp_(y/n)_x',
       'quantity', 'list_price/msrp', 'index_date_/_catalog_version',
       'contract_unit_price', 'contract_discount_x',
       'extended_contract_price_paid', 'total_with_options_per_unit',
       'grand_total', 'source', 'sku_#_/_item_#_y', 'epp_(y/n)_y',
       'contract_discount_y', 'core/_noncore', 'group_id/_segment_id'],
      dtype='object')

In [39]:
# Financial columns that should be stored as int64.
money = [
    'contract_unit_price',
    'extended_contract_price_paid',
    'total_with_options_per_unit',
    'grand_total',
]

# Cast every money column to int64 with a single multi-column assignment.
dgs_17bc[money] = dgs_17bc[money].astype('int64')

In [40]:
# check work
# the money columns should now be int64 dtype
dgs_17bc.dtypes


supplier_contract_usage_id                                                  int64
ordering_agency_name                                                       object
state_(s)_or_local_(l)_agency                                              object
purchasing_authority_number____________________(for_state_departments)    float64
agency_billing_code                                                       float64
purchase_order_number                                                      object
purchase_order_date                                                        object
delivery_date                                                              object
contract_line_item_number_(clin)________________(rfp_id)                  float64
unspsc_code\n(version_10)                                                 float64
manufacturer_part_number_(oem_#)                                           object
manufacturer_(oem)                                                         object
sku_#_/_item_#_x

In [41]:
# Columns to discard (open question: are all of these really unnecessary?).
drops = [
    'supplier_contract_usage_id',
    'state_(s)_or_local_(l)_agency',
    'purchasing_authority_number____________________(for_state_departments)',
    'agency_billing_code',
    'unspsc_code\n(version_10)',
    'unit_of_measure',
    'epp_(y/n)_x',
    'epp_(y/n)_y',
    'list_price/msrp',
    'index_date_/_catalog_version',
    'core/_noncore',
    'group_id/_segment_id',
]

# Remove them from the frame itself; a missing label raises KeyError.
dgs_17bc.drop(drops, axis='columns', inplace=True)

In [43]:
# check work
# initial merge had about 30 columns; fewer should remain after the drop

df_peek(dgs_17bc)

pandas.core.frame.DataFrame

(107, 19)

ordering_agency_name                                         object
purchase_order_number                                        object
purchase_order_date                                          object
delivery_date                                                object
contract_line_item_number_(clin)________________(rfp_id)    float64
manufacturer_part_number_(oem_#)                             object
manufacturer_(oem)                                           object
sku_#_/_item_#_x                                             object
item_description                                             object
quantity_in_\nunit_of_measure                                 int64
quantity                                                      int64
contract_unit_price                                           int64
contract_discount_x                                          object
extended_contract_price_paid                                  int64
total_with_options_per_unit                     

In [51]:
# New column for total cost.
# Author's notes on how the source sheets price a line item:
#   17b >> `grand_total` = total_with_options_per_unit * quantity
#   17c >> `extended_contract_price_paid` = contract_unit_price * quantity
# Goal: one "total_cost" value per row that works for both sources.

def calculate_total_cost(row):
    """Return the total cost of one line item.

    The per-unit price is ``total_with_options_per_unit`` when that value is
    strictly greater than 0; otherwise it is ``contract_unit_price``. The
    chosen price is multiplied by ``quantity``.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row) indexable by the three column
        names above. ``contract_unit_price`` is only read when the
        with-options price is not positive.

    Returns
    -------
    The product of the chosen per-unit price and ``quantity``.
    """
    per_unit = row['total_with_options_per_unit']
    if not per_unit > 0:
        # No with-options price recorded: fall back to the contract unit price.
        per_unit = row['contract_unit_price']
    return per_unit * row['quantity']


In [52]:
# NOTE(review): `test` was not assigned in any visible cell, so this cell raised
# NameError on Restart & Run All. The columns it displays match the cleaned
# dgs_17bc frame, so it is assumed to be a working copy of that frame -- confirm.
test = dgs_17bc.copy()

# Row-wise total cost; the pricing rule lives in calculate_total_cost.
test['total_cost'] = test.apply(calculate_total_cost, axis=1)

In [56]:
# check work
# found some zero values, but they are manuals and warranty items that didn't have a cost initially.
# should be good to go.

test[test['total_cost'] ==0]

Unnamed: 0,ordering_agency_name,purchase_order_number,purchase_order_date,delivery_date,contract_line_item_number_(clin)________________(rfp_id),manufacturer_part_number_(oem_#),manufacturer_(oem),sku_#_/_item_#_x,item_description,quantity_in_\nunit_of_measure,quantity,contract_unit_price,contract_discount_x,extended_contract_price_paid,total_with_options_per_unit,grand_total,source,sku_#_/_item_#_y,contract_discount_y,total_cost
1,SUNLINE TRANSIT AGENCY (THOUSAND PALMS),11819,2020-07-21 00:00:00,ALL 5 HAVE BEEN DELIVERED,10.2,XHE40,New Flyer of America Inc,0,Extended warranty for Propulsion System and Ba...,1,5,0,Price included in the base bus price,0,0,0,17b,0.0,0.0,0
2,SUNLINE TRANSIT AGENCY (THOUSAND PALMS),11819,2020-07-21 00:00:00,ALL 5 HAVE BEEN DELIVERED,10.2,XHE40,New Flyer of America Inc,0,Manuals,1,5,0,Price included in the base bus price,0,0,0,17b,0.0,0.0,0
4,Lane Transit (Oregon),2020-061,0,ALL 11 HAVE BEEN DELIVERED,3.3,XE40,New Flyer of America Inc,0,"Extended warranty for Propulsion System, Batte...",1,11,0,Price included in the base bus price,0,0,0,17b,0.0,0.0,0
5,Lane Transit (Oregon),2020-061,0,ALL 11 HAVE BEEN DELIVERED,3.3,XE40,New Flyer of America Inc,0,"Manuals, Diagnostic equipment and tooling",1,11,0,Price included in the base bus price,0,0,0,17b,0.0,0.0,0
7,VICTOR VALLEY TRANSIT AUTHORITY (VVTA),1416,2020-07-17 00:00:00,ALL 5 HAVE BEEN DELIVERED,3.2,XE40,New Flyer of America Inc,0,Extended warranty for Battery and Air Compressor,1,5,0,Price included in the base bus price,0,0,0,17b,0.0,0.0,0
9,CULVER CITY TRANSPORTATION DEPARTMENT (CULVER ...,22100367 - 00,2020-08-19 00:00:00,ALL 4 HAVE BEEN DELIVERED,3.1,XE40,New Flyer of America Inc,0,Extended warranty for Propulsion System and Ba...,1,4,0,Price included in the base bus price,0,0,0,17b,0.0,0.0,0
10,CULVER CITY TRANSPORTATION DEPARTMENT (CULVER ...,22100367 - 00,2020-08-19 00:00:00,ALL 4 HAVE BEEN DELIVERED,3.1,XE40,New Flyer of America Inc,0,Manuals and Training,1,4,0,Price included in the base bus price,0,0,0,17b,0.0,0.0,0
12,ORANGE COUNTY TRANSPORTATION AUTHORITY (ORANGE...,CO2165,2020-12-01 00:00:00,ALL 10 HAVE BEEN DELIVERED,3.3,XE40,New Flyer of America Inc,0,Extended warranty for Propulsion System and Ba...,1,10,0,Price included in the base bus price,0,0,0,17b,0.0,0.0,0
13,ORANGE COUNTY TRANSPORTATION AUTHORITY (ORANGE...,CO2165,2020-12-01 00:00:00,ALL 10 HAVE BEEN DELIVERED,3.3,XE40,New Flyer of America Inc,0,"Training, Manuals, Diagnostic equipment and to...",1,10,0,Price included in the base bus price,0,0,0,17b,0.0,0.0,0
15,Alameda County Transit Authority,57071,2021-03-12 00:00:00,ALL 20 HAVE BEEN DELIVERED,10.4,XHE40,New Flyer of America Inc,0,"Extended warranty for Propulsion System, Batte...",1,20,0,Price included in the base bus price,0,0,0,17b,0.0,0.0,0


In [None]:
# new column for prop_type

In [None]:
# new column for bus size type


In [None]:
# aggregate by PPNO number?
# to consolidate all the individual line items up to bus row?


## Export Cleaned data
Save the cleaned data out as parquet.

Initial Summary Stats