In [1]:
import matplotlib.pyplot as plt
from matplotlib.ticker import ScalarFormatter
import numpy as np
import pandas as pd
import seaborn as sns
import shared_utils
from scipy.stats import zscore

# Raise the displayed-row limit to 200 so an entire df can be viewed in one go.
pd.options.display.max_rows = 200

# function to display df info
def df_peek(df):
    """Display the type, shape, and per-column dtypes of ``df``.

    Relies on the notebook's ``display`` function; nothing is returned.
    """
    summary = (type(df), df.shape, df.dtypes)
    display(*summary)

## Read in Raw Data
Read the DGS 17C and 17B contract usage reports from their Excel workbooks.

In [2]:
# Params to read in dgs data
# Cloud-storage locations of the two Excel workbooks (17c and 17b reports).
url_17c = 'gs://calitp-analytics-data/data-analyses/bus_procurement_cost/17c compiled-Proterra Compiled Contract Usage Report .xlsx'
url_17b = 'gs://calitp-analytics-data/data-analyses/bus_procurement_cost/17b compiled.xlsx'

# Worksheet to read from each workbook.
# NOTE(review): 'Proterra ' ends with a space -- presumably it mirrors the
# actual sheet name in the workbook; confirm before "cleaning" it up.
sheet_17c = 'Proterra '
sheet_17b = 'Usage Report Template'

def read_excel(url, sheet):
    """Read one worksheet of an Excel workbook into a DataFrame.

    Parameters
    ----------
    url : path or URL of the workbook, passed straight to ``pd.read_excel``.
    sheet : worksheet to load, passed as ``sheet_name``.

    Returns
    -------
    Whatever ``pd.read_excel`` returns for that sheet.
    """
    return pd.read_excel(url, sheet_name=sheet)

# Load each report, then tag every row with the report it came from so the
# origin is still identifiable after the two frames are combined.
dgs_17c = read_excel(url_17c, sheet_17c)
dgs_17c['source'] = '17c'

dgs_17b = read_excel(url_17b, sheet_17b)
dgs_17b['source'] = '17b'

## Merge data frames

In [3]:
# Columns the 17b and 17c reports are joined on.
merge_col = [
    'Supplier Contract Usage ID',
    'Ordering Agency Name',
    'State (S) or Local (L) agency',
    'Purchasing Authority Number                    (for State departments)',
    'Agency Billing Code',
    'Purchase Order Number',
    'Purchase Order Date',
    'Delivery Date',
    'Contract Line Item Number (CLIN)                (RFP ID)',
    'UNSPSC Code\n(Version 10)',
    'Manufacturer Part Number (OEM #)',
    'Manufacturer (OEM)',
    'Item Description',
    'Unit of Measure',
    'Quantity in \nUnit of Measure\n',
    'Quantity',
    'List Price/MSRP',
    'Index Date / Catalog Version',
    'Contract Unit Price',
    'Extended Contract Price Paid',
    'source',
]

# Outer-join the two reports on the key columns, then replace every missing
# value (in any column, whatever its dtype) with 0.
# NOTE(review): `source` is one of the join keys and the two frames are tagged
# with different `source` values, so no rows should ever match -- this looks
# like it behaves as a row-wise stack. Columns outside `merge_col` that exist
# in both frames get `_x` / `_y` suffixes. Confirm whether a concat was intended.
dgs_17bc = dgs_17b.merge(dgs_17c, how='outer', on=merge_col).fillna(0)

In [4]:
# check work
# number of rows is correct (107 rows)
# will need to work on merge columns

#df_peek(dgs_17bc)


## Data Cleaning and QC

In [5]:
#snake case columns
def snake_case(df):
    """Rename the columns of ``df`` in place to a lower-case, underscored form.

    Steps, in order: lower-case each name, replace every space with ``_``,
    then strip remaining leading/trailing whitespace. Because spaces are
    replaced before stripping, a leading or trailing space becomes ``_``;
    the strip only removes other edge whitespace such as a trailing newline.
    Newlines inside a name are kept.

    Mutates ``df.columns`` and returns ``None``.
    """
    renamed = df.columns.str.lower()
    renamed = renamed.str.replace(' ', '_')
    df.columns = renamed.str.strip()
    
# Rename the merged frame's columns in place (snake_case assigns to df.columns and returns nothing).
snake_case(dgs_17bc)

In [6]:
# check work
dgs_17bc.columns

Index(['supplier_contract_usage_id', 'ordering_agency_name',
       'state_(s)_or_local_(l)_agency',
       'purchasing_authority_number____________________(for_state_departments)',
       'agency_billing_code', 'purchase_order_number', 'purchase_order_date',
       'delivery_date',
       'contract_line_item_number_(clin)________________(rfp_id)',
       'unspsc_code\n(version_10)', 'manufacturer_part_number_(oem_#)',
       'manufacturer_(oem)', 'sku_#_/_item_#_x', 'item_description',
       'unit_of_measure', 'quantity_in_\nunit_of_measure', 'epp_(y/n)_x',
       'quantity', 'list_price/msrp', 'index_date_/_catalog_version',
       'contract_unit_price', 'contract_discount_x',
       'extended_contract_price_paid', 'total_with_options_per_unit',
       'grand_total', 'source', 'sku_#_/_item_#_y', 'epp_(y/n)_y',
       'contract_discount_y', 'core/_noncore', 'group_id/_segment_id'],
      dtype='object')

In [7]:
# Financial columns that should be stored as `int64`.
money = [
    'contract_unit_price',
    'extended_contract_price_paid',
    'total_with_options_per_unit',
    'grand_total',
]

# Cast every money column to int64 in a single astype call.
# NOTE(review): casting to int64 drops any fractional part (cents) -- confirm
# that whole-dollar precision is acceptable for these columns.
dgs_17bc = dgs_17bc.astype({column: 'int64' for column in money})

In [8]:
# check work. GTG
# the columns are int64 dtype
dgs_17bc[money].dtypes


contract_unit_price             int64
extended_contract_price_paid    int64
total_with_options_per_unit     int64
grand_total                     int64
dtype: object

In [9]:
# Columns that are not needed for the cost analysis.
drops = [
    'supplier_contract_usage_id',
    'state_(s)_or_local_(l)_agency',
    'purchasing_authority_number____________________(for_state_departments)',
    'agency_billing_code',
    'unspsc_code\n(version_10)',
    'unit_of_measure',
    'epp_(y/n)_x',
    'epp_(y/n)_y',
    'list_price/msrp',
    'index_date_/_catalog_version',
    'core/_noncore',
    'group_id/_segment_id',
]

# Rebind to a frame without those columns (raises KeyError if any is missing,
# e.g. when this cell is run a second time).
dgs_17bc = dgs_17bc.drop(columns=drops)

In [10]:
# check work. GTG
# initial merge had like 30 columns

df_peek(dgs_17bc)

pandas.core.frame.DataFrame

(107, 19)

ordering_agency_name                                         object
purchase_order_number                                        object
purchase_order_date                                          object
delivery_date                                                object
contract_line_item_number_(clin)________________(rfp_id)    float64
manufacturer_part_number_(oem_#)                             object
manufacturer_(oem)                                           object
sku_#_/_item_#_x                                             object
item_description                                             object
quantity_in_\nunit_of_measure                                 int64
quantity                                                      int64
contract_unit_price                                           int64
contract_discount_x                                          object
extended_contract_price_paid                                  int64
total_with_options_per_unit                     

In [11]:
# new column for total cost
# 17b >> `grand total` = total_with_options * quantity
# 17c >> `extended contract price paid` = contract unit price * quantity

# goal: create a new column called "total_cost". for each row, if `total_with_options_per_unit` is > 0, multiply `total_with_options_per_unit` by `quantity`;
# otherwise (i.e. it is 0), multiply `contract_unit_price` by `quantity`.

# df['total_cost'] = df['quanity'] * df['total_with_options'] or df['contract unit price']???

def calculate_total_cost(row):
    """Return the total cost for one row.

    Uses ``total_with_options_per_unit`` as the unit price when it is
    strictly greater than 0, otherwise falls back to ``contract_unit_price``;
    the chosen unit price is multiplied by ``quantity``.

    ``row`` is anything indexable by those three keys (for example a row
    passed by ``DataFrame.apply(..., axis=1)``).
    """
    per_unit_with_options = row['total_with_options_per_unit']
    unit_price = (
        per_unit_with_options
        if per_unit_with_options > 0
        else row['contract_unit_price']
    )
    return unit_price * row['quantity']


In [12]:
# New column `total_cost`: evaluate calculate_total_cost once per row.
row_totals = dgs_17bc.apply(calculate_total_cost, axis=1)
dgs_17bc['total_cost'] = row_totals

In [13]:
# Put the reported totals next to the new `total_cost` column to see whether the logic works:
#   17b rows report their total in `grand_total`
#   17c rows report their total in `extended_contract_price_paid`
keep_col = [
    'ordering_agency_name',
    'purchase_order_number',
    'item_description',
    'source',
    'grand_total',
    'extended_contract_price_paid',
    'total_cost',
]

col_compare = dgs_17bc.loc[:, keep_col]
col_compare.shape

(107, 7)

In [14]:
# check work. GTG
# Some rows have a total_cost of zero, but they are manuals and warranty items
# that did not have a cost to begin with, so this should be good to go.

zero_cost_rows = dgs_17bc['total_cost'] == 0
dgs_17bc.loc[zero_cost_rows, 'item_description'].unique()

array(['Extended warranty for Propulsion System and Battery', 'Manuals',
       'Extended warranty for Propulsion System, Battery and Air Compressor',
       'Manuals, Diagnostic equipment and tooling',
       'Extended warranty for Battery and Air Compressor',
       'Manuals and Training',
       'Training, Manuals, Diagnostic equipment and tooling',
       'Extended warranty for Propulsion System, Battery, HVAC, APC and Camera system',
       'Extended warranty for Air Compressor',
       'Diagnostic equipment and tooling', 'Other - Tools'], dtype=object)

In [15]:
# new column for prop_type

# Propulsion-type keywords looked for in item descriptions. Matching is a
# case-sensitive substring test, so each capitalisation is listed separately,
# and order matters: the first keyword found wins, which is why the shorter
# 'battery electric' comes last.
# NOTE(review): 'Hydrogen Electic Bus' is kept as written -- presumably it
# mirrors a misspelling in the source descriptions; verify before correcting it.
prop_list = ['Battery Electric Bus',
            'battery electric bus',
            'Fuel Cell Electric Bus',
            'fuel cell electric bus',
            'Hydrogen Electic Bus',
            'hydrogen electric bus',
            'battery electric',
            ]

# function to match keywords to list
def prop_type_finder(description):
    """Return the first keyword from ``prop_list`` contained in ``description``.

    Parameters
    ----------
    description : item description text.

    Returns
    -------
    The matching keyword exactly as spelled in ``prop_list``, or
    ``"not specified"`` when no keyword is found or ``description`` is not a
    string.
    """
    # Missing descriptions arrive as non-strings (the merged frame is filled
    # with 0 for missing values); `keyword in 0` would raise TypeError.
    if not isinstance(description, str):
        return "not specified"
    return next(
        (keyword for keyword in prop_list if keyword in description),
        "not specified",
    )

# New column `prop_type`: run prop_type_finder over every item_description value.
dgs_17bc["prop_type"] = dgs_17bc["item_description"].map(prop_type_finder)

In [16]:
# check work. GTG
# Show how often each prop_type occurs, then the item descriptions that were
# left as 'not specified' (none of them should mention a bus).
unspecified_prop = dgs_17bc['prop_type'] == 'not specified'
display(
    dgs_17bc['prop_type'].value_counts(),
    dgs_17bc.loc[unspecified_prop, 'item_description'].value_counts(),
)

not specified             70
Battery Electric Bus      20
battery electric bus       8
Hydrogen Electic Bus       4
battery electric           3
Fuel Cell Electric Bus     1
hydrogen electric bus      1
Name: prop_type, dtype: int64

Options - Configurables                                                          13
Additional 220 kWH battery pack                                                  13
Other - Tools                                                                     8
Options - Charging Equipment                                                      5
Extended warranty for Propulsion System and Battery                               3
Options - Configurables (DuoPower)                                                3
Other - Spare Parts                                                               3
Other - Training, manuals and Tools                                               3
Options - Configurables (non taxable)                                             2
Extended Warranty                                                                 2
Options - Warranty                                                                2
Other - Warranty                                                            

In [17]:
# new column for bus size type

# Bus-length keywords looked for in item descriptions. Matching is a
# case-sensitive substring test, so each capitalisation is listed separately.
# NOTE(review): '60 foot' has no capitalised '60 Foot' counterpart -- confirm
# that spelling never occurs in the descriptions.
size_list =['35 Foot',
           '40 Foot',
           '60 foot',
           '40 foot',
           '35 foot',
           ]

def bus_size_finder(description):
    """Return the first keyword from ``size_list`` contained in ``description``.

    Parameters
    ----------
    description : item description text.

    Returns
    -------
    The matching keyword exactly as spelled in ``size_list``, or
    ``"not specified"`` when no keyword is found or ``description`` is not a
    string.
    """
    # Missing descriptions arrive as non-strings (the merged frame is filled
    # with 0 for missing values); `keyword in 0` would raise TypeError.
    if not isinstance(description, str):
        return "not specified"
    return next(
        (keyword for keyword in size_list if keyword in description),
        "not specified",
    )

# New column `bus_size_type`: run bus_size_finder over every item_description value.
dgs_17bc["bus_size_type"] = dgs_17bc["item_description"].map(bus_size_finder)

In [18]:
# check work. GTG
# Show the frame's shape, how often each bus_size_type occurs, and the item
# descriptions left as 'not specified' (none of them should mention a bus).
unspecified_size = dgs_17bc['bus_size_type'] == 'not specified'
display(
    dgs_17bc.shape,
    dgs_17bc['bus_size_type'].value_counts(),
    dgs_17bc.loc[unspecified_size, 'item_description'].value_counts(),
)


(107, 22)

not specified    70
40 Foot          16
35 Foot           9
40 foot           8
35 foot           3
60 foot           1
Name: bus_size_type, dtype: int64

Options - Configurables                                                          13
Additional 220 kWH battery pack                                                  13
Other - Tools                                                                     8
Options - Charging Equipment                                                      5
Extended warranty for Propulsion System and Battery                               3
Options - Configurables (DuoPower)                                                3
Other - Spare Parts                                                               3
Other - Training, manuals and Tools                                               3
Options - Configurables (non taxable)                                             2
Extended Warranty                                                                 2
Options - Warranty                                                                2
Other - Warranty                                                            

## Aggregate by Agency
We need a DataFrame in which:
1. each row is an agency
2. the quantity is the total number of buses only (not manuals, equipment, parts, or warranties)
3. the total cost is aggregated for the agency (bus, manuals, etc.)
4. the prop_type of the bus is kept (there should be no 'not specified')
5. the bus_size_type of the bus is kept (there should be no 'not specified')

In [75]:
# agency bus count
# Keep only rows whose item description contains 'bus' or 'Bus' (case-sensitive).
is_bus_row = (
    dgs_17bc['item_description'].str.contains('bus')
    | dgs_17bc['item_description'].str.contains('Bus')
)

agency_columns = [
    'ordering_agency_name',
    'item_description',
    'quantity',
    'source',
    'total_cost',
    'prop_type',
    'bus_size_type',
]

# One row per agency: quantity and total_cost are summed; prop_type,
# bus_size_type and source take the greatest value in string order.
# NOTE(review): total_cost is summed over the bus rows only, so manuals,
# warranties, etc. are not included -- confirm this is the intended total.
# NOTE(review): with 'max', an agency that bought several propulsion types or
# bus sizes keeps just one label (and differently-capitalised labels count as
# different values) -- confirm this is acceptable.
agg_agency_bus_count = (
    dgs_17bc.loc[is_bus_row, agency_columns]
    .groupby('ordering_agency_name')
    .agg({
        'quantity': 'sum',
        'total_cost': 'sum',
        'prop_type': 'max',
        'bus_size_type': 'max',
        'source': 'max',
    })
    .reset_index()
)

# Looks good: manually double-checked against pivot tables in Excel.
agg_agency_bus_count.shape
       

(30, 6)

## Export Cleaned data
save out as parquet

In [76]:
# Write the per-agency aggregate to cloud storage as a parquet file.
agg_agency_bus_count.to_parquet('gs://calitp-analytics-data/data-analyses/bus_procurement_cost/dgs_agg_clean.parquet')

## Test to read in parquet

In [77]:
# Read the exported parquet back in to confirm the file is readable.
# NOTE(review): this path duplicates the literal used in the export cell; keep the two in sync.
url= 'gs://calitp-analytics-data/data-analyses/bus_procurement_cost/dgs_agg_clean.parquet'
test = pd.read_parquet(url)

# Show the first three rows as a quick sanity check.
test.head(3)

Unnamed: 0,ordering_agency_name,quantity,total_cost,prop_type,bus_size_type,source
0,Alameda County Transit Authority,20,22846640,hydrogen electric bus,40 foot,17b
1,"CITY OF PORTERVILLE (PORTERVILLE, CA)",3,2781891,battery electric bus,35 foot,17b
2,CULVER CITY TRANSPORTATION DEPARTMENT (CULVER ...,4,3623536,Battery Electric Bus,40 Foot,17b
