In [None]:
import matplotlib.pyplot as plt
from matplotlib.ticker import ScalarFormatter
import numpy as np
import pandas as pd
import seaborn as sns
import shared_utils
from scipy.stats import zscore

# Raise pandas' row display limit to 200 so an entire DataFrame can be viewed in one go.
pd.options.display.max_rows = 200

# Quick-look helper for a DataFrame.
def df_peek(df):
    """Display the type, shape and per-column dtypes of ``df``.

    Args:
        df: object exposing ``.shape`` and ``.dtypes`` (a pandas DataFrame).

    Returns:
        None. The three summaries are rendered through ``display``.
    """
    overview = (type(df), df.shape, df.dtypes)
    display(*overview)

## Read in Raw Data
Read the 17C and 17B usage reports from their Excel workbooks.

In [None]:
# Params to read in dgs data
# Cloud Storage (gs://) locations of the compiled 17c and 17b Excel workbooks.
url_17c = 'gs://calitp-analytics-data/data-analyses/bus_procurement_cost/17c compiled-Proterra Compiled Contract Usage Report .xlsx'
url_17b = 'gs://calitp-analytics-data/data-analyses/bus_procurement_cost/17b compiled.xlsx'

# Worksheet to read from each workbook.
# NOTE(review): 'Proterra ' ends with a trailing space; presumably it matches the
# sheet name in the workbook exactly — confirm before tidying it up.
sheet_17c = 'Proterra '
sheet_17b = 'Usage Report Template'

def read_excel(url, sheet):
    """Read one worksheet of an Excel workbook.

    Args:
        url: path or URL of the workbook, passed straight to ``pd.read_excel``.
        sheet: worksheet selector, forwarded as ``sheet_name``.

    Returns:
        Whatever ``pd.read_excel`` returns for that sheet.
    """
    return pd.read_excel(url, sheet_name=sheet)

# Load each workbook and tag every row with the report it came from
# via a new `source` column.
dgs_17c = read_excel(url_17c, sheet_17c).assign(source='17c')
dgs_17b = read_excel(url_17b, sheet_17b).assign(source='17b')

## Merge data frames

In [None]:
# Columns used as the join keys when combining the 17b and 17c frames.
merge_col = [
    'Supplier Contract Usage ID',
    'Ordering Agency Name',
    'State (S) or Local (L) agency',
    'Purchasing Authority Number                    (for State departments)',
    'Agency Billing Code',
    'Purchase Order Number',
    'Purchase Order Date',
    'Delivery Date',
    'Contract Line Item Number (CLIN)                (RFP ID)',
    'UNSPSC Code\n(Version 10)',
    'Manufacturer Part Number (OEM #)',
    'Manufacturer (OEM)',
    'Item Description',
    'Unit of Measure',
    'Quantity in \nUnit of Measure\n',
    'Quantity',
    'List Price/MSRP',
    'Index Date / Catalog Version',
    'Contract Unit Price',
    'Extended Contract Price Paid',
    'source',
]

# Outer merge keeps every row from both frames.
# NOTE(review): 'source' is one of the keys and is '17b' for one frame and '17c'
# for the other, so no row of one frame can match a row of the other; the merge
# effectively stacks the two reports.
dgs_17bc = dgs_17b.merge(dgs_17c, on=merge_col, how='outer')

# Replace every missing value left by the outer merge with 0.
dgs_17bc = dgs_17bc.fillna(0)

In [None]:
# Check work: df_peek displays the type, shape and dtypes of the merged frame.
# The number of rows is correct (107 rows).
# TODO: the merge columns still need work.

df_peek(dgs_17bc)


## Data Cleaning and QC

In [None]:
# Snake-case the column names.
def snake_case(df):
    """Rename the columns of ``df`` in place to a snake_case-like form.

    Each column name is lower-cased, every space is replaced with an
    underscore, and then leading/trailing whitespace is stripped. Because the
    strip runs after the replacement, only non-space whitespace (such as a
    newline) at the ends of a name is removed.

    Args:
        df: DataFrame whose ``columns`` attribute is overwritten.

    Returns:
        None.
    """
    lowered = df.columns.str.lower()
    underscored = lowered.str.replace(' ', '_')
    df.columns = underscored.str.strip()

# Rename the merged frame's columns in place (snake_case assigns to df.columns and returns nothing).
snake_case(dgs_17bc)

In [None]:
# Check work: list the column names after the snake_case rename.
dgs_17bc.columns

In [None]:
# Financial columns that should be stored as int64.
money = [
    'contract_unit_price',
    'extended_contract_price_paid',
    'total_with_options_per_unit',
    'grand_total',
]

# Cast every column in `money` to int64 with a single astype call.
dgs_17bc = dgs_17bc.astype({name: 'int64' for name in money})

In [None]:
# Check work: list the dtype of every column after the int64 cast.
# Original note: "the columns are not int64 dtype".
# NOTE(review): unclear whether that note meant "now" or "not" — confirm against the displayed dtypes.
dgs_17bc.dtypes


In [None]:
# Columns considered unnecessary for the analysis (open question: drop them?).
drops = [
    'supplier_contract_usage_id',
    'state_(s)_or_local_(l)_agency',
    'purchasing_authority_number____________________(for_state_departments)',
    'agency_billing_code',
    'unspsc_code\n(version_10)',
    'unit_of_measure',
    'epp_(y/n)_x',
    'epp_(y/n)_y',
    'list_price/msrp',
    'index_date_/_catalog_version',
    'core/_noncore',
    'group_id/_segment_id',
]

# Remove the listed columns and rebind the name to the slimmer frame.
dgs_17bc = dgs_17bc.drop(columns=drops)

In [None]:
# Check work: the initial merge had roughly 30 columns, so the shape displayed
# here should show fewer columns after the drop.

df_peek(dgs_17bc)

In [None]:
# New column for total cost.
#   17b: `grand_total` = total_with_options_per_unit * quantity
#   17c: `extended_contract_price_paid` = contract_unit_price * quantity
def calculate_total_cost(row):
    """Return the total cost for one row.

    The per-unit price is ``total_with_options_per_unit`` when that value is
    greater than zero; otherwise ``contract_unit_price`` is used. The chosen
    price is multiplied by ``quantity``.

    Args:
        row: mapping-like row (e.g. a Series passed by
            ``DataFrame.apply(..., axis=1)``) with the keys
            ``total_with_options_per_unit``, ``contract_unit_price`` and
            ``quantity``.

    Returns:
        The per-unit price multiplied by the quantity.
    """
    with_options = row['total_with_options_per_unit']
    per_unit = with_options if with_options > 0 else row['contract_unit_price']
    return per_unit * row['quantity']


In [None]:
# `test` is not defined anywhere earlier in the notebook, so this cell raised a
# NameError on a fresh kernel (Restart & Run All). Build it from the cleaned frame.
# NOTE(review): presumably `test` was meant to be a scratch copy of dgs_17bc — confirm.
test = dgs_17bc.copy()

# Row-wise total: calculate_total_cost picks the per-unit price and multiplies it by quantity.
test['total_cost'] = test.apply(calculate_total_cost, axis=1)

In [None]:
# Check work: show the rows whose total_cost is zero.
# Found some zero values, but they are manuals and warranty items that did not have a cost initially.
# Should be good to go.

test[test['total_cost'] ==0]

In [None]:
# new column for prop_type

In [None]:
# new column for bus size type


In [None]:
# aggregate by PPNO number?
# to consolidate all the individual line items up to bus row?


## Export Cleaned data
Save the cleaned data out as a Parquet file.

## Initial Summary Stats