In [1]:
import matplotlib.pyplot as plt
from matplotlib.ticker import ScalarFormatter
import numpy as np
import pandas as pd
import seaborn as sns
import shared_utils
from scipy.stats import zscore

# Raise the pandas display limit to 200 rows so a whole DataFrame can be viewed in one go.
pd.set_option("display.max_rows", 200)

# function to display df info
def df_peek(df):
    """Show the type, ``shape`` and ``dtypes`` of ``df`` with one ``display`` call.

    Relies on the ``display`` function that the notebook environment provides.
    Nothing is returned.
    """
    summary = (type(df), df.shape, df.dtypes)
    display(*summary)

## Read in Raw Data
17c and 17b, each read from its Excel workbook.

In [None]:
# GCS folder that holds the 17b / 17c workbooks (ends with "/" because file
# names are appended to it by plain string concatenation).
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/bus_procurement_cost/"

# Workbook file names. The space before ".xlsx" in the 17c name is part of the literal.
file_17c = "17c compiled-Proterra Compiled Contract Usage Report .xlsx"
file_17b = "17b compiled.xlsx"

# Worksheet read from each workbook. The 17c sheet name ends with a space.
sheet_17c = "Proterra "
sheet_17b = "Usage Report Template"

def get_data(path, file, sheet):
    """Read one worksheet of an Excel workbook.

    Args:
        path: Prefix prepended to ``file`` by plain string concatenation, so it
            must already end with the path separator.
        file: Workbook file name.
        sheet: Forwarded to ``pd.read_excel`` as ``sheet_name``.

    Returns:
        Whatever ``pd.read_excel`` returns for that worksheet.
    """
    workbook_location = path + file
    return pd.read_excel(workbook_location, sheet_name=sheet)

# Load the 17c and 17b usage reports from the GCS folder.
dgs_17c = get_data(path=GCS_FILE_PATH, file=file_17c, sheet=sheet_17c)
dgs_17b = get_data(path=GCS_FILE_PATH, file=file_17b, sheet=sheet_17b)


In [6]:
# Show the shape of each raw frame. `dgs_17c` and `dgs_17b` are created by the
# load cell above, so that cell must have been run first.
display(dgs_17c.shape, dgs_17b.shape)

NameError: name 'dgs_17c' is not defined

## Merge data frames

In [3]:
# Columns the 17b and 17c usage reports are joined on. The unusual spacing and
# the "\n" escapes are part of the column names and must stay as written.
merge_col = [
    'Supplier Contract Usage ID',
    'Ordering Agency Name',
    'State (S) or Local (L) agency',
    'Purchasing Authority Number                    (for State departments)',
    'Agency Billing Code',
    'Purchase Order Number',
    'Purchase Order Date',
    'Delivery Date',
    'Contract Line Item Number (CLIN)                (RFP ID)',
    'UNSPSC Code\n(Version 10)',
    'Manufacturer Part Number (OEM #)',
    'Manufacturer (OEM)',
    'Item Description',
    'Unit of Measure',
    'Quantity in \nUnit of Measure\n',
    'Quantity',
    'List Price/MSRP',
    'Index Date / Catalog Version',
    'Contract Unit Price',
    'Extended Contract Price Paid',
    'source',
]

# The outer merge keeps the rows of both reports. Every missing cell in the
# merged frame is then replaced with 0.
# NOTE(review): fillna(0) is applied to the whole frame, so empty cells in text
# columns also become the integer 0.
dgs_17bc = dgs_17b.merge(dgs_17c, how='outer', on=merge_col).fillna(0)

NameError: name 'dgs_17c' is not defined

## Data Cleaning and QC

In [None]:
#snake case columns
def snake_case(df):
    """Rename the columns of ``df`` in place to lower case with underscores.

    Each column name is lower-cased, every space is replaced by ``_`` and the
    result is stripped of surrounding whitespace. Nothing is returned.

    NOTE(review): stripping happens after the space replacement, so leading or
    trailing spaces end up as underscores. Only other edge whitespace, such as
    a trailing newline, is actually removed.
    """
    lowered = df.columns.str.lower()
    underscored = lowered.str.replace(' ', '_')
    df.columns = underscored.str.strip()
    
# Rename the merged frame's columns in place; later cells use the snake_case names.
snake_case(dgs_17bc)

In [None]:
# Financial columns that are cast to `int64`.
money = [
    'contract_unit_price',
    'extended_contract_price_paid',
    'total_with_options_per_unit',
    'grand_total',
]

# Cast every column listed in `money` to int64 in a single call.
# NOTE(review): if any of these columns holds fractional amounts, the integer
# cast drops the fraction. Confirm that whole-dollar values are intended.
dgs_17bc = dgs_17bc.astype({column: 'int64' for column in money})

In [None]:
# Columns removed from the merged frame before the analysis.
# The `_x` / `_y` entries are the suffixed duplicates that the merge creates for
# a column present in both reports but not used as a join key.
drops = [
    'supplier_contract_usage_id',
    'state_(s)_or_local_(l)_agency',
    'purchasing_authority_number____________________(for_state_departments)',
    'agency_billing_code',
    'unspsc_code\n(version_10)',
    'unit_of_measure',
    'epp_(y/n)_x',
    'epp_(y/n)_y',
    'list_price/msrp',
    'index_date_/_catalog_version',
    'core/_noncore',
    'group_id/_segment_id',
]

dgs_17bc = dgs_17bc.drop(columns=drops)

In [None]:
# new column for total cost
# 17b >> `grand_total` = total_with_options_per_unit * quantity
# 17c >> `extended_contract_price_paid` = contract_unit_price * quantity

# Goal: create a new column called `total_cost`. For each row, if `total_with_options_per_unit` is > 0, multiply `total_with_options_per_unit` by `quantity`;
# otherwise (it is 0), multiply `contract_unit_price` by `quantity`.

# i.e. total_cost = quantity * (total_with_options_per_unit if it is > 0, else contract_unit_price)

def calculate_total_cost(row):
    """Return the total cost for one row: a unit price multiplied by ``quantity``.

    The unit price is ``total_with_options_per_unit`` when that value is
    greater than 0; otherwise ``contract_unit_price`` is used.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with the keys
            ``total_with_options_per_unit``, ``contract_unit_price`` and
            ``quantity``.
    """
    options_price = row['total_with_options_per_unit']
    unit_price = options_price if options_price > 0 else row['contract_unit_price']
    return unit_price * row['quantity']


In [None]:
# Add the `total_cost` column by applying calculate_total_cost to every row.
total_cost_by_row = dgs_17bc.apply(calculate_total_cost, axis=1)
dgs_17bc['total_cost'] = total_cost_by_row

In [None]:
# Put the reported totals next to the computed `total_cost` column so the
# calculation can be checked.
# 17b total: grand_total
# 17c total: extended_contract_price_paid
keep_col = [
    'ordering_agency_name',
    'purchase_order_number',
    'item_description',
    'source',
    'grand_total',
    'extended_contract_price_paid',
    'total_cost',
]

col_compare = dgs_17bc.loc[:, keep_col]


In [None]:
# new column for prop_type

# Propulsion-type phrases searched for in item descriptions. Matching is
# case-sensitive, which is why capitalisation variants are listed separately,
# and the first phrase that matches wins, so the order matters.
# NOTE(review): 'Hydrogen Electic Bus' is kept exactly as written; it presumably
# mirrors a misspelling in the source data. Confirm before "correcting" it.
prop_list = [
    'Battery Electric Bus',
    'battery electric bus',
    'Fuel Cell Electric Bus',
    'fuel cell electric bus',
    'Hydrogen Electic Bus',
    'hydrogen electric bus',
    'battery electric',
]


def prop_type_finder(description):
    """Return the first phrase from ``prop_list`` found in ``description``.

    Args:
        description: Item description text. A non-string value (for example the
            ``0`` that a frame-wide ``fillna(0)`` leaves in an empty cell, or
            ``NaN``) is treated as having no match instead of raising
            ``TypeError`` on the substring test.

    Returns:
        The matching phrase exactly as it is written in ``prop_list``, or
        ``"not specified"`` when no phrase is contained in the description.
    """
    # Guard: `keyword in description` only works for strings.
    if not isinstance(description, str):
        return "not specified"
    return next(
        (keyword for keyword in prop_list if keyword in description),
        "not specified",
    )

# Add the `prop_type` column, filled from `item_description` by prop_type_finder.
prop_source_text = dgs_17bc["item_description"]
dgs_17bc["prop_type"] = prop_source_text.apply(prop_type_finder)

In [None]:
# new column for bus size type

# Bus-length phrases searched for in item descriptions. Matching is
# case-sensitive and the first phrase that matches wins.
# NOTE(review): there is no '60 Foot' (capital F) entry, so that spelling is
# reported as "not specified". Confirm whether the data ever uses it.
size_list = [
    '35 Foot',
    '40 Foot',
    '60 foot',
    '40 foot',
    '35 foot',
]


def bus_size_finder(description):
    """Return the first phrase from ``size_list`` found in ``description``.

    Args:
        description: Item description text. A non-string value (for example the
            ``0`` that a frame-wide ``fillna(0)`` leaves in an empty cell, or
            ``NaN``) is treated as having no match instead of raising
            ``TypeError`` on the substring test.

    Returns:
        The matching phrase exactly as it is written in ``size_list``, or
        ``"not specified"`` when no phrase is contained in the description.
    """
    # Guard: `keyword in description` only works for strings.
    if not isinstance(description, str):
        return "not specified"
    return next(
        (keyword for keyword in size_list if keyword in description),
        "not specified",
    )

# Add the `bus_size_type` column, filled from `item_description` by bus_size_finder.
size_source_text = dgs_17bc["item_description"]
dgs_17bc["bus_size_type"] = size_source_text.apply(bus_size_finder)

## Aggregate by Agency
need a df that:
1. each row is an agency
2. total quantity of buses only (not manuals, equipment, part, warranty)
3. aggregates total cost for the agency (bus, manuals, etc.)
4. keep the prop_type of the bus (should be no 'not specified')
5. keep the bus_size_type for the bus (no 'not specified')

In [75]:
# Agency bus count.
# Keep only the rows whose item description contains 'bus' or 'Bus'
# (case-sensitive, so an all-caps 'BUS' is not matched).
descriptions = dgs_17bc['item_description']
is_bus_row = descriptions.str.contains('bus') | descriptions.str.contains('Bus')

bus_columns = [
    'ordering_agency_name',
    'item_description',
    'quantity',
    'source',
    'total_cost',
    'prop_type',
    'bus_size_type',
]
bus_rows = dgs_17bc.loc[is_bus_row, bus_columns]

# One row per agency: quantities and costs are summed, the text columns take
# their per-agency maximum.
# NOTE(review): 'max' on a text column returns the value that sorts last, so an
# agency with several different prop_type / bus_size_type values keeps only one.
# NOTE(review): the bus filter runs before the sum, so `total_cost` covers the
# bus rows only, not manuals / equipment / warranty rows.
agency_aggregations = {
    'quantity': 'sum',
    'total_cost': 'sum',
    'prop_type': 'max',
    'bus_size_type': 'max',
    'source': 'max',
}
agg_agency_bus_count = (
    bus_rows
    .groupby('ordering_agency_name')
    .agg(agency_aggregations)
    .reset_index()
)

# Author's check: the numbers were manually double-checked against pivot tables in Excel.
       

(30, 6)

## Export Cleaned data
save out as parquet

In [76]:
# Write the per-agency aggregate as parquet into the project's GCS folder.
# The folder comes from GCS_FILE_PATH (defined with the raw-data file names)
# rather than a second hard-coded copy; the resulting path is unchanged.
agg_agency_bus_count.to_parquet(f'{GCS_FILE_PATH}dgs_agg_clean.parquet')

## Test to read in parquet

In [4]:
# Read the exported parquet file back from GCS to confirm it can be loaded.
# The full path is spelled out here, so this cell does not need any variable
# from the cells above.
url= 'gs://calitp-analytics-data/data-analyses/bus_procurement_cost/dgs_agg_clean.parquet'
dgs = pd.read_parquet(url)

# Shape of the re-loaded frame, for comparison with the frame that was exported.
dgs.shape

(30, 6)

In [5]:
# Display the re-loaded per-agency table.
dgs

Unnamed: 0,ordering_agency_name,quantity,total_cost,prop_type,bus_size_type,source
0,Alameda County Transit Authority,20,22846640,hydrogen electric bus,40 foot,17b
1,"CITY OF PORTERVILLE (PORTERVILLE, CA)",3,2781891,battery electric bus,35 foot,17b
2,CULVER CITY TRANSPORTATION DEPARTMENT (CULVER ...,4,3623536,Battery Electric Bus,40 Foot,17b
3,City of Roseville,10,6990000,Battery Electric Bus,35 Foot,17c
4,City of San Luis Obispo,1,689000,Battery Electric Bus,35 Foot,17c
5,City of Santa Rosa(Santa Rosa CityBus),5,3495000,Battery Electric Bus,40 Foot,17c
6,City of Visalia - Visalia City Coach(Visalia T...,4,2756000,Battery Electric Bus,35 Foot,17c
7,"Foothill Transit, West Covina, CA",33,37642044,Hydrogen Electic Bus,40 Foot,17b
8,"GOLDEN EMPIRE TRANSIT (BAKERSFIELD, CA)",5,5406355,Hydrogen Electic Bus,40 Foot,17b
9,Golden Empire Transit,5,5458305,Hydrogen Electic Bus,40 Foot,17b
