In [None]:
import sys
sys.path.insert(0,'/Users/jarad/fake_folder/Python Libraries/')

from jb_libraries import *
%matplotlib inline

# ---- reporting windows: ISO 'YYYY-MM-DD' strings, interpolated into the SQL below ----

# overall analysis window
date_start, date_end = '2017-09-09', '2018-10-09'

# window used by the worldwide (WW) services sections
ww_date_start, ww_date_end = '2018-09-09', '2018-10-09'

# first day of the month that lies three months before date_end
a = (pd.Timestamp(date_end) - pd.DateOffset(months = 3)).strftime('%Y-%m-%d')
three_months_ago = '{}-01'.format(a[:7])

# first day of date_end's own month
one_month_ago = '{}-01'.format(date_end[:7])

# it was actually 2018-03-25 but start it in april
predictor_date_start = '2018-04-01'

# the "losses only" section looks at the trailing three months
loss_date_end = date_end
loss_date_start = three_months_ago

### Stuff
* Info in email with subject line "Jupyter notebook for shipping box predictor"
* And in email with subject line "Box Predictor Assessment"
* And Shipping Audit (Monthly and Weekly) Basecamp

*****

### Links
* [UPS Actual vs Dim Weight](https://www.ups.com/us/en/help-center/packaging-and-supplies/determine-billable-weight.page)
* [Quick Reference Guide to Avoid Shipping Charge Corrections](https://www.ups.com/us/en/help-center/billing-payment/avoid-charges.page)

# Current state of UPS shipping

### Get UPS data

In [None]:
# every UPS billing line for non-adabox orders purchased inside the analysis window
ups_super_main = pd.read_sql(
'''
SELECT
*
FROM ups_billing
WHERE orders_id IN (SELECT orders_id FROM orders WHERE DATE(date_purchased) BETWEEN ' '''+ date_start +''' ' AND ' '''+ date_end +''' ')
AND orders_id NOT IN (SELECT orders_id FROM subscriptions_history) # exclude adabox
''', db)

col_fix(ups_super_main)
ups_super_main['charge description'] = ups_super_main['charge description'].str.lower()

# clean up charge descriptions
# split on spaces into one column per word (integer column labels 0, 1, 2, ...)
df = ups_super_main['charge description'].str.split(' ', expand = True)
ups_super_main = ups_super_main.join(df)

# get the first two words of each charge description
# a one-word description (e.g. "qst") has no second word, so the concatenation is null there;
# fall back to the single word so that clean() always receives a string.
# this replaces the former manual fix that only handled "qst"
if 1 in df.columns:
    ups_super_main['jb charge description'] = (df[0] + ' ' + df[1]).fillna(df[0])
else:
    # every description is a single word
    ups_super_main['jb charge description'] = df[0]

# clean further
def clean(x):
    """Map a row's 'jb charge description' to a short category label.

    x is anything indexable by 'jb charge description' (a DataFrame row when
    used with apply(axis = 1)). Returns 'returns', 'ground', 'brokerage fees'
    or 'shipping charge correction' when the matching keyword is present,
    otherwise the description itself, unchanged.
    """
    label = x['jb charge description']

    # a return wins over everything else, including "ground"
    if 'return' in label:
        return 'returns'
    if 'ground' in label:
        return 'ground'
    if 'broker' in label:
        return 'brokerage fees'
    if 'shipping charge' in label:
        return 'shipping charge correction'

    # no keyword matched: keep the description as it is
    return label
    
# this just shortens the original UPS charge description, doesn't change anything else
# clean() is called once per row (axis = 1) and its return value overwrites the column
ups_super_main['jb charge description'] = ups_super_main.apply(clean, axis = 1)

### Get orders data

In [None]:
# one row per order: purchase date, year-month bucket and shipping revenue,
# where shipping revenue = the 'ot_shipping' total plus the 'ot_ddp' total when the order has one
# NOTE(review): the date-range and adabox filters sit in the RIGHT JOIN's ON clause;
# the WHERE on ot1.class then discards the unmatched (NULL ot1) rows,
# so the result looks equivalent to an inner join - confirm that this is intended
ot_main = pd.read_sql(
'''
SELECT
DATE(date_purchased) AS date_purchased,
DATE_FORMAT(date_purchased, '%Y-%m') AS year_and_month,
ot1.orders_id,
ot1.value + IFNULL(ot2.value,0) AS shipping_revenue
FROM orders_total ot1

RIGHT JOIN orders o ON ot1.orders_id = o.orders_id
AND DATE(date_purchased) BETWEEN ' '''+ date_start +''' ' AND ' '''+ date_end +''' '
AND o.orders_id NOT IN (SELECT orders_id FROM subscriptions_history) # exclude adabox

LEFT JOIN orders_total ot2 ON o.orders_id = ot2.orders_id
AND ot2.class = 'ot_ddp'

WHERE ot1.class = 'ot_shipping'
AND ot1.value > 0 # exclude "free shipping"
''', db)

# col_fix is applied to every frame read from SQL; later cells address the columns with spaces ('orders id')
col_fix(ot_main)

### Join the two and clean it

In [None]:
# join the two
# note that this is a left join onto orders_total
# recall that we excluded adabox and free shipping
df1 = ot_main.set_index('orders id')
df2 = ups_super_main.groupby('orders id')[['netAmount']].sum()

ups_main = df1.join(df2)

# find nulls, i.e. orders that have no UPS billing lines
# (axis is passed by keyword; DataFrame.any() does not accept it positionally in pandas 2.x)
nulls = ups_main[ups_main.isnull().any(axis = 1)].copy()

# view shipping_module_codes of nulls
# the IN list is joined by hand because str(tuple(...)) renders one id as "(1,)" and none as "()",
# both of which are invalid SQL; "IN (NULL)" matches no rows, which is right for an empty list
in_list = ', '.join(repr(x) for x in nulls.index.tolist()) or 'NULL'
a = pd.read_sql(
'''
SELECT
shipping_module_code,
COUNT(shipping_module_code) AS count
FROM orders
WHERE orders_id IN (''' + in_list + ''')
GROUP BY shipping_module_code
''', db)

print('shipping module codes of nulls\n')
print(a)
print('we only care about UPS and where we have UPS billing data, so drop these nulls')
print('assume the UPS nulls are because we do not yet have shipping billing data')

# clean it up
ups_main.reset_index(inplace = True)
ups_main.dropna(inplace = True)
ups_main.rename(columns = {'netAmount':'ups charge'}, inplace = True)

# create cols
# an "overcharge" is an order where UPS billed more than the customer paid for shipping
ups_main['overcharge'] = np.where(ups_main['ups charge'] > ups_main['shipping revenue'], 'yes', 'no')
ups_main['shipping profit'] = ups_main['shipping revenue'] - ups_main['ups charge']

# put all charges on one line per OID
# omit fuel because it's pretty much in every order
a = ups_super_main.groupby(['orders id','jb charge description'], as_index = False)[['tracking']].count() # we're not really counting anything; this is just a groupby trick to get unique pairs
a.drop(a[a['jb charge description'] == 'fuel surcharge'].index, inplace = True)

b = a.groupby('orders id')['jb charge description'].apply(lambda x: ', '.join(x))

c = pd.DataFrame(b)
c.reset_index(inplace = True)

# this is just a sentence that lists the ups_main charges, so no changes to any values
# note that this is an inner merge: an order whose only charge was the fuel surcharge has no row in c and is dropped
ups_main = pd.merge(ups_main, c, on = 'orders id')

# get date purchased
ups_main['date purchased'] = pd.to_datetime(ups_main['date purchased'])

In [None]:
# preview the joined per-order data
ups_main.head()

### Overcharges by year and month

In [None]:
# monthly shipping profit, one column per overcharge flag ('no' / 'yes'); months missing a flag get 0
profit_by_flag = ups_main.groupby(['year and month','overcharge'])['shipping profit'].sum()
by_month = profit_by_flag.unstack('overcharge').fillna(0)

# net = profit on the non-overcharged orders plus the (negative) profit on the overcharged ones
by_month['net'] = by_month.sum(axis = 1)

print(by_month.describe())
by_month.format_(['m0'] * len(by_month.columns))

In [None]:
# one bar chart per by_month column, stacked vertically
bar_style = dict(kind = 'bar',
                 subplots = True,
                 figsize = (5,7),
                 alpha = 0.65,
                 grid = True,
                 title = 'UPS shipping profit ($)',
                 legend = False,
                 rot = 45,
                 width = 0.45,
                 edgecolor = 'black')
axes = by_month.plot(**bar_style)

# panel titles follow the by_month column order: 'no', 'yes', 'net'
panel_titles = ['did not incur a loss', 'did incur a loss', 'overall profit']
for i in range(3):
    axes[i].set_title(panel_titles[i])
    axes[i].set_yticklabels('') # hide the y tick labels
plt.show()

### Losses only

In [None]:
# orders that lost money on shipping, purchased inside the loss window
is_loss = ups_main['overcharge'] == 'yes'
in_loss_window = ups_main['date purchased'].between(loss_date_start, loss_date_end)
losses_main = ups_main[is_loss & in_loss_window].copy()

In [None]:
# 'jb charge description' is the per-order sentence listing all UPS charges except fuel;
# grouping on it changes no values
# sorted ascending, so the biggest losses (most negative profit) come first
by_charge = (losses_main
             .groupby('jb charge description')[['shipping profit']]
             .sum()
             .sort_values('shipping profit'))

# ten worst combinations, reversed so the worst ends up at the top of the horizontal bars,
# and sign-flipped so the bars point to the right
for_chart = -by_charge.head(10).iloc[::-1]

chart_title = 'UPS profit losses by charge descriptions per order\nfrom {} to {}'.format(loss_date_start, loss_date_end)
ax = for_chart.plot(kind = 'barh',
                    grid = True,
                    color = 'red',
                    alpha = 0.45,
                    title = chart_title,
                    edgecolor = 'black',
                    legend = False)
ax.set_ylabel('')
# the plotted values are positive, so the minus sign is put back in the tick labels
ax.set_xticklabels(['-${:,.0f}'.format(tick) for tick in ax.get_xticks()])
plt.show()

# restore the original (negative) sign for the printed table
for_chart = -for_chart
print(for_chart.iloc[::-1].format_(['m0']))

### Frequency of charge combinations

In [None]:
# how often each charge combination occurs among the loss-making orders
v = losses_main['jb charge description'].value_counts()

# number of distinct OIDs among the loss-making orders
t = len(set(losses_main['orders id']))

# share of loss-making orders that carry each charge combination
df = (v/t).to_frame('frequency')

# top N combinations, reversed so the most frequent ends up at the top of the horizontal bars
top = 10
for_chart = df.head(top).iloc[::-1]

chart_title = 'frequency of top ten charges per order which incurred a loss\nfrom {} to {}'.format(loss_date_start, loss_date_end)
ax = for_chart.plot(kind = 'barh',
                    grid = True,
                    color = 'green',
                    alpha = 0.45,
                    title = chart_title,
                    edgecolor = 'black',
                    legend = False)
ax.set_ylabel('')
ax.set_xticklabels(['{:,.0f}%'.format(tick * 100) for tick in ax.get_xticks()])
plt.show()

print(for_chart.iloc[::-1].format_(['p2']))

### Get box predictor data

In [None]:
# state the window that the box predictor queries below use
print('predictor data from {} to {}'.format(predictor_date_start, date_end))

In [None]:
# get shipping quotes
# per the "Entered vs Billed Weight" notes in this notebook, shipping_quotes stores the box and weight
# that the box predictor picked at checkout; "service" is the order's shipping_module_code
# NOTE(review): the LEFT JOIN is filtered on date_purchased in the WHERE, so quotes without a matching order are dropped
sq_main = pd.read_sql(
'''
SELECT
DATE(date_purchased) AS date_purchased,
DATE_FORMAT(date_purchased, '%Y-%m') AS year_and_month,
s.*,
o.shipping_module_code AS service
FROM shipping_quotes s
LEFT JOIN orders o ON s.orders_id = o.orders_id
WHERE DATE(date_purchased) BETWEEN ' '''+ predictor_date_start +''' ' AND ' '''+ date_end +''' '
''', db)

col_fix(sq_main)

# get actual
# ship_log holds what was logged when the package was shipped (sl_weight, sl_box)
# NOTE(review): this window is on shipped_date while the quotes above are windowed on date_purchased
sl_main = pd.read_sql(
'''
SELECT
orders_id,
sl_weight,
sl_box
FROM ship_log
WHERE shipped_date BETWEEN ' '''+ predictor_date_start +''' ' AND ' '''+ date_end +''' '
''', db)

col_fix(sl_main)

In [None]:
# put the data together
# note the left join onto sl_main
box_main = pd.merge(sl_main, sq_main, how = 'left', on = 'orders id')

# rename cols
# ship_log (sl_*) is what the shipper actually weighed and packed;
# shipping_quotes (weight, box) is what the box predictor chose at checkout
# (see the "Entered vs Billed Weight" notes in this notebook).
# these labels used to be swapped, which put the predicted weight in the error denominator
box_main.rename(columns = {'sl weight':'actual weight',
                           'sl box':'actual box',
                           'weight':'pred weight',
                           'box':'pred box'}, inplace = True)

# rearrange them: push the four comparison columns to the end, weights first and then boxes
ls = box_main.columns.tolist()
for x in ['pred weight','pred box','actual weight','actual box']:
    ls.remove(x)
ls = ls + ['pred weight','actual weight','pred box','actual box']

box_main = box_main[ls]

# find nulls (ship_log rows with no matching quote, or any other missing field)
# axis is passed by keyword; DataFrame.any() does not accept it positionally in pandas 2.x
nulls = box_main[box_main.isnull().any(axis = 1)]
print('{:,.0f} nulls after joining ship_log and shipping_quotes'.format(len(nulls)))
print('or {:,.2f}% of total lines'.format(len(nulls)/len(box_main) * 100))
print('remove these')
box_main = box_main.dropna()

# find zeros; a zero actual weight cannot serve as the denominator of the error %
zero = box_main[box_main['actual weight'] == 0]
print('\n{:,.0f} lines where actual weight is zero'.format(len(zero)))
print('drop these')
box_main = box_main[box_main['actual weight'] > 0].copy()

# to int; round up
for col in ['actual','pred']:
    box_main[col + ' box'] = box_main[col + ' box'].astype(int)
    box_main[col + ' weight rounded'] = np.ceil(box_main[col + ' weight'])

# get summary columns
# both error columns are absolute errors relative to the actual weight
box_main['weight error %'] = np.abs((box_main['actual weight'] - box_main['pred weight'])/box_main['actual weight'])
box_main['weight error % rounded'] = np.abs((box_main['actual weight rounded'] - box_main['pred weight rounded'])/box_main['actual weight rounded'])
box_main['correct box'] = np.where(box_main['actual box'] == box_main['pred box'], 1, 0)

# exclude these services
ls = ['','resellershipping','sameday']
box_main = box_main[~box_main['service'].isin(ls)].copy()

# view results
print('\ncount of services in this data:')
print(box_main['service'].value_counts())

# get box-qty
print('\nvalue counts of box-qty per OID:')
print(box_main['qty'].value_counts())
print('\nthe majority of box-qty\'s are x1\nexclude anything greater than one\n')
box_main = box_main[box_main['qty'] == 1].copy()

dupes = np.sum(box_main['orders id'].duplicated())
print('\n{} dupe OIDs\nremove them'.format(dupes))
box_main.drop_duplicates('orders id', inplace = True)

print('\ntotal count of lines in dataset: {:,.0f}'.format(len(box_main)))

### Choose your service

In [None]:
# keep only the rows shipped with the chosen service(s)
service = ['upsxml']

not_chosen = box_main[~box_main['service'].isin(service)]
box_main.drop(not_chosen.index, inplace = True)

# purchase-date span of what is left
m1 = box_main['date purchased'].min()
m2 = box_main['date purchased'].max()
summary = 'service is {}\ncount of lines is {:,.0f}\ndata is from {} to {}'
print(summary.format(service, len(box_main), m1, m2))

In [None]:
# preview the cleaned predictor data for the chosen service
box_main.head()

### Weight Error %

In [None]:
# descriptive statistics of the weight error, one row per month
weight_summary = box_main.groupby('year and month')['weight error %'].describe()

# overall mean across every month
mean = box_main['weight error %'].mean()
print('mean weight error % for {} is {:,.2f}%'.format(service, mean * 100))

# first column (count) as a plain number, all the others as percentages
n_stat_cols = len(weight_summary.columns) - 1
weight_summary.format_(['n0'] + ['p2'] * n_stat_cols)

In [None]:
# distribution of the per-box weight error; the x axis is cut off at 4 (i.e. 400%)
ax = box_main['weight error %'].hist(bins = 900)
ax.set(xlim = (0, 4),
       title = 'Histogram of "weight error %" per box')
plt.show()

### Correct box?

In [None]:
# per month: how many OIDs there were and how many of them got the correct box
monthly = box_main.groupby('year and month')
box_summary = pd.DataFrame({'unique OID count': monthly['orders id'].count(),
                            'correct box': monthly['correct box'].sum()})
box_summary['% correct'] = box_summary['correct box']/box_summary['unique OID count']

# overall hit rate over the whole predictor window
val1 = box_main['correct box'].sum()
val2 = len(box_main)
print('from {} to {}'.format(predictor_date_start, date_end))
print('overall correct-box result is {:,.2f}% correct'.format(val1/val2 * 100))

box_summary.format_(['n0','n0','p2'])

In [None]:
# value_counts() orders the bars by frequency, so the hard-coded tick labels below were only right
# while correct boxes outnumbered incorrect ones; pin the order to 1 (correct), 0 (incorrect) instead
box_counts = box_main['correct box'].value_counts().reindex([1, 0], fill_value = 0)

ax = box_counts.plot(kind = 'bar', color = 'purple', alpha = 0.65, edgecolor = 'black')
ax.set_xticklabels(['Correct Box','Incorrect Box'], rotation = 0)
ax.set_title('Box Prediction Results\nfrom {} to {}'.format(predictor_date_start, date_end))
plt.show()

### Only WW Services

In [None]:
charge = 'worldwide'

# get data: orders inside the WW window whose charge sentence mentions the chosen charge
in_ww_window = ups_main['date purchased'].between(ww_date_start, ww_date_end)
has_charge = ups_main['jb charge description'].str.contains(charge)
ww_main = ups_main[in_ww_window & has_charge].copy()

# create this flag: did the order also carry a residential charge?
is_residential = ww_main['jb charge description'].str.contains('residential')
ww_main['residential charge'] = np.where(is_residential, 'yes', 'no')

# view some descriptors
dupes = np.sum(ww_main['orders id'].duplicated())
print('{} dupe OIDs'.format(dupes))

print('\ntotal count of OIDs: {:,.0f}\n'.format(len(ww_main)))

print('\ndata is from {} to {}'.format(ww_date_start, ww_date_end))

print('\nthese are all of the charge-combinations and their counts for the WW services only:\n')
print(ww_main['jb charge description'].value_counts())

print('\ncount of overcharges within this dataset:\n')
print(ww_main['overcharge'].value_counts())

print('\ncount of overcharges WITH residential charges within this dataset:\n')
residential_only = ww_main[ww_main['residential charge'] == 'yes']
print(residential_only['overcharge'].value_counts())

### Service and Accessorial Charge, and Shipping Charge Correction flag

In [None]:
# every UPS billing line that belongs to one of the WW orders
df = ups_super_main[ups_super_main['orders id'].isin(ww_main['orders id'].tolist())].copy()

# isolate "service charges"
service = ['fuel surcharge',
          'worldwide expedited',
          'worldwide expedited shipment',
          'worldwide express',
          'worldwide saver',
          'worldwide saver shipment']

# isolate shipping charge corrections
correction = ['shipping charge correction expedited',
              'shipping charge correction express',
              'shipping charge correction fuel surcharge',
              'shipping charge correction worldwide saver']

# isolate accessorial charges: every description in this data that is neither a service charge nor a correction
# kept in order of first appearance so the printed list and the Excel breakdown are reproducible
known = set(service + correction)
accessorial = [x for x in df['charge description'].unique() if x not in known]

# store them in this dict
d = {'service charge':service,
    'shipping charge correction':correction,
    'accessorial charge':accessorial}

print('these are all of the charges in this dataset')
for k,v in d.items():
    print('\n' + k + '\n')
    for v2 in v:
        print(v2)

# create this column: look up the charge type that lists each charge description
# (every description belongs to exactly one list by construction)
charge_type_of = {desc: kind for kind, descs in d.items() for desc in descs}
df['charge type'] = df['charge description'].map(charge_type_of)

# groupby OID: one row per order, one column per charge type, holding the summed netAmount
df2 = df.groupby(['orders id','charge type'])[['netAmount']].sum().unstack(1).fillna(0)
df2.columns = df2.columns.droplevel(0)
df2.reset_index(inplace = True)

# map these "charge types" to ww_main
for col in df2.columns:
    ww_main[col] = ww_main['orders id'].map(dict(zip(df2['orders id'], df2[col])))

# axis is passed by keyword; DataFrame.any() does not accept it positionally in pandas 2.x
print('\n{} null(s)'.format(np.sum(ww_main.isnull().any(axis = 1))))

# push some columns to the end of the data
cols = ww_main.columns.tolist()
ls = ['ups charge','shipping revenue','shipping profit']
for x in ls:
    cols.remove(x)
cols = cols + ls

# 'ups charge' is the total over all charge types, so name it accordingly
ww_main = ww_main[cols].rename(columns = {'ups charge':'ups total charge'})

### Did or did not cover service charge?

In [None]:
# set to 'yes' to export the two groups below to an Excel workbook
write_book = 'no'

# this data consists of OIDs that:
# went via some WW service
# AND incurred an overcharge
# AND incurred a residential charge

# then we go further and ask: of these OIDs,
    # which ones did the shipping revenue cover the service charge,
    # and which ones did not
for_x = ww_main[(ww_main['overcharge'] == 'yes')
              & (ww_main['jb charge description'].str.contains('residential'))].copy()

d1 = for_x['date purchased'].min().date()
d2 = for_x['date purchased'].max().date()
print('data is from {} to {}\n'.format(d1,d2))

a = for_x[for_x['shipping revenue'] >= for_x['service charge']] # covered service charge
b = for_x[for_x['shipping revenue'] < for_x['service charge']] # did not cover service charge
t = len(for_x)
print('count of all lines: {}\ncovered service charge: {}\ndid not cover service charge: {}'.format(t, len(a), len(b)))

if write_book == 'yes':
    m1 = str(for_x['date purchased'].min().date())
    m2 = str(for_x['date purchased'].max().date())

    title = 'Worldwide and Residential Charge data from {} to {}'.format(m1,m2)

    # to show which UPS charges are in each "charge type"
    c = pd.DataFrame.from_dict(d, orient = 'index').T.fillna('')

    # the context manager saves and closes the workbook; ExcelWriter.save() no longer exists in pandas 2.x
    with pd.ExcelWriter(title + '.xlsx', engine = 'xlsxwriter') as writer:
        a.to_excel(writer, sheet_name = 'covered basis cost', index = False)
        b.to_excel(writer, sheet_name = 'did not cover basis cost', index = False)
        c.to_excel(writer, sheet_name = 'breakdown of charge types', index = False)

### Entered vs Billed Weight 
* From Daigo via Shipping Audit (Monthly and Weekly) Basecamp
* 1.) The box predictor picks a weight and a box.
* 2.) We send the weight and the box size to UPS's API, and they return a quoted price to us.
* 3.) This price shows up at checkout, and if the customer places an order then the box and weight are saved into shipping_quotes.
* 4.) The shipper actually ships it, which involves weighing the actual box. This weight and box get logged to ship_log, and hob sends that weight and box size to a different UPS API.
* 5.) UPS returns a shipping label to us, which has the price and dim weight recorded. This is the weight that will eventually be returned to us as 'entered weight' as well.
* 6.) UPS sends us an invoice with 'entered weight' and 'billed weight', where 'entered weight' is what we sent them for the package in step 4 (supposedly), and 'billed weight' is the value that they got when they weighed and measured it independently.

In [None]:
# the entered-vs-billed weight queries reuse the WW window
s = ww_date_start
e = ww_date_end

In [None]:
# one row per UPS billing line for paid-shipping upsxml orders purchased inside the window
weight_main = pd.read_sql(
'''
SELECT

DATE(date_purchased) AS date_purchased,
DATE_FORMAT(date_purchased, '%Y-%m') AS year_and_month,
ups.orders_id,
tracking,
LOWER(charge_description) AS charge_description,
entered_weight,
billed_weight

FROM orders o

LEFT JOIN ups_billing ups ON o.orders_id = ups.orders_id

WHERE DATE(date_purchased) BETWEEN ' '''+ s +''' ' AND ' '''+ e +''' '
AND shipping_module_code = 'upsxml'
AND shippingcost > 0 # exclude free shipping
AND ups.orders_id IS NOT NULL
AND ups.billed_weight > 0
''', db)

# this function replaces the underscore in the column header with a space
col_fix(weight_main)

# find nulls, any and all
# axis is passed by keyword; DataFrame.any() does not accept it positionally in pandas 2.x
print('{} null(s)'.format(np.sum(weight_main.isnull().any(axis = 1))))

# get rid of the words "residential" and "commercial"
# the pattern is a regular expression, so regex = True is required:
# since pandas 2.0 str.replace treats the pattern as a literal string by default
weight_main['charge description'] = weight_main['charge description'].str.replace('residential|commercial', '', regex = True)

# get rid of "returns", "delivery intercepts", and "undeliverable"
ls = ['return','intercept','undeliverable']
weight_main.drop(weight_main[weight_main['charge description'].str.contains('|'.join(ls))].index, inplace = True)

d1 = weight_main['date purchased'].min()
d2 = weight_main['date purchased'].max()

print('\ndata is from {} to {}\n'.format(d1,d2))

# view all the charge descriptions and their counts
print('\ncharge description counts over all UPS packages:\n')

v1 = weight_main['charge description'].value_counts()
v2 = weight_main['charge description'].value_counts().sum()
v = pd.DataFrame(v1)
v.columns = ['count']
v['%'] = v1/v2

print(v.format_(['n0','p2']))

# groupby year/month and tracking
# grouping by tracking and not OID ensures that we capture multi-box shipments
# get the min entered weight per tracking
# get the max billed weight per tracking
    # if there was a shipping charge correction,
    # then this max weight is the final billed weight value after this correction
weight = weight_main.groupby(['year and month','tracking'], as_index = False).agg({'entered weight':'min',
                                                                                    'billed weight':'max'})
# map OIDs to tracking
weight['orders id'] = weight['tracking'].map(dict(zip(weight_main['tracking'], weight_main['orders id'])))

# flag tracking for shipping charge corrections
ls = weight_main[weight_main['charge description'].str.contains('shipping charge correction')]['tracking'].tolist()
weight['shipping charge correction'] = np.where(weight['tracking'].isin(ls), 'yes', 'no')

# get the weight difference
weight['weight difference'] = weight['billed weight'] - weight['entered weight']

# get the date purchased
weight['date purchased'] = weight['orders id'].map(dict(zip(ot_main['orders id'], ot_main['date purchased'])))

# OIDs for the two lookups below, de-duplicated (weight_main has one row per billing line)
# the IN list is joined by hand because str(tuple(...)) renders one id as "(1,)" and none as "()",
# both of which are invalid SQL; "IN (NULL)" matches no rows
oid_in_list = ', '.join(repr(x) for x in weight_main['orders id'].unique().tolist()) or 'NULL'

# get shipping revenue
ship_revenue = pd.read_sql(
'''
SELECT
ot1.orders_id,
ot1.value + IFNULL(ot2.value, 0) AS shipping_revenue
FROM orders_total ot1
LEFT JOIN orders_total ot2 ON ot1.orders_id = ot2.orders_id
AND ot2.class = 'ot_ddp'
WHERE ot1.class = 'ot_shipping'
AND ot1.orders_id IN (''' + oid_in_list + ''')
''', db)

col_fix(ship_revenue)

# get shipping charges
ship_charge = pd.read_sql(
'''
SELECT
orders_id,
SUM(netAmount) AS shipping_charge
FROM ups_billing
WHERE orders_id IN (''' + oid_in_list + ''')
GROUP BY orders_id
''', db)

col_fix(ship_charge)

# merge the two on OID
df = pd.merge(ship_revenue, ship_charge, on = 'orders id')

# flag overcharges
df['overcharge'] = np.where(df['shipping revenue'] < df['shipping charge'], 'yes','no')
ls = df[df['overcharge'] == 'yes']['orders id'].tolist()

# map this result to your weight data
# note that the data in the "weight" df is by tracking,
    # so there are dupe OIDs,
    # that's why we map the yes/no flag and not the actual rev/charge amounts
weight['overcharge'] = np.where(weight['orders id'].isin(ls), 'yes', 'no')

### Summary and stats

In [None]:
def _share_line(count, total):
    """Format a count as '<pct>%' on one line and 'or <count> tracking numbers' on the next."""
    return '{:,.2f}%\nor {:,.0f} tracking numbers'.format(count/total * 100, count)

# recall that the weight df is by tracking, not by OID
t = len(weight)
entered = weight['entered weight']
billed = weight['billed weight']

print('To ensure that we catch multi-box shipments, these counts are by tracking number and not by OID.')
print('\n{:,.0f} total tracking numbers considered\nfrom dates {} to {}'.format(t, d1, d2))

print('\nWHERE entered weight == 0\nAND billed weight == 1')
df1 = weight[(entered == 0) & (billed == 1)]
val1 = len(df1)
print(_share_line(val1, t))

print('\nWHERE entered weight != billed weight\nAND billed weight > 1')
df2 = weight[(entered != billed) & (billed > 1)]
val2 = len(df2)
print(_share_line(val2, t))

print('\nWHERE entered weight = billed weight')
df3 = weight[billed == entered]
val3 = len(df3)
print(_share_line(val3, t))

# NOTE(review): the three groups above do not cover every case
# (e.g. entered != billed with billed <= 1 and entered != 0), so this can be below 100%
print('\nproportions check: they sum to {:,.0f}%'.format((val1 + val2 + val3)/t * 100))

print('\n=====\n\nGeneral Stats')

is_overcharge = weight['overcharge'] == 'yes'
is_correction = weight['shipping charge correction'] == 'yes'

print('\nCount of overcharges')
val6 = len(weight[is_overcharge])
print(_share_line(val6, t))

print('\nCount of Shipping Charge Corrections')
val4 = len(weight[is_correction])
print(_share_line(val4, t))

print('\nCount of Shipping Charge Corrections AND Overcharges')
val5 = len(weight[is_correction & is_overcharge])
print(_share_line(val5, t))

### To Excel

In [None]:
# set to 'yes' to export the per-tracking weight data
write = 'no'

if write == 'yes':
    # d1 / d2 are the first and last purchase dates of the weight data;
    # m1 / m2, used here before, belong to the box predictor data
    title = 'UPS Weight Data from {} to {}'.format(d1, d2)

    # the context manager saves and closes the workbook; ExcelWriter.save() no longer exists in pandas 2.x
    with pd.ExcelWriter(title + '.xlsx', engine = 'xlsxwriter') as writer:
        weight.to_excel(writer, sheet_name = 'data', index = False)

### See how many mismatched weights resulted in Shipping Charge Corrections

In [None]:
print('data is from {} to {}'.format(d1,d2))

# billing lines whose billed weight differs from the entered weight and exceeds one pound
df = weight_main[(weight_main['entered weight'] != weight_main['billed weight'])
               & (weight_main['billed weight'] > 1)].copy()

otb = pd.read_sql(
'''
SELECT 
orders_id,
COUNT(orders_id) AS count
FROM orders_to_boxes
GROUP BY orders_id HAVING COUNT(orders_id) = 1 # only consider tracking numbers that were shipped using a single box, for simplicity
''', db)

col_fix(otb)

# de-duplicated tracking numbers for the two lookups below
# the IN list is joined by hand because str(tuple(...)) renders one value as "('x',)" and none as "()",
# both of which are invalid SQL; "IN (NULL)" matches no rows
tracking_in_list = ', '.join(repr(x) for x in df['tracking'].unique().tolist()) or 'NULL'

# total UPS charge and a readable list of charges per tracking number
# fuel is left out of the description only (GROUP_CONCAT skips the NULLs that the CASE produces);
# filtering fuel in the WHERE clause, as before, also removed it from SUM(netAmount)
# and so understated the UPS charge
c = pd.read_sql(
'''
SELECT
tracking,
SUM(netAmount) AS ups_charge,
GROUP_CONCAT(CASE WHEN charge_description NOT LIKE '%fuel%' THEN LOWER(charge_description) END SEPARATOR ', ') AS charge_description
FROM ups_billing
WHERE tracking IN (''' + tracking_in_list + ''')
GROUP BY tracking
''', db)

col_fix(c)

t_to_o = pd.read_sql(
'''
SELECT
orders_id,
tracking
FROM ups_billing
WHERE tracking IN (''' + tracking_in_list + ''')
''', db)
col_fix(t_to_o)

c['orders id'] = c['tracking'].map(dict(zip(t_to_o['tracking'], t_to_o['orders id'])))

# only consider tracking numbers that were shipped using a single box, for simplicity
c = c[c['orders id'].isin(otb['orders id'].tolist())].copy()

df['charge description'] = df['tracking'].map(dict(zip(c['tracking'], c['charge description'])))
df['billed minus entered'] = df['billed weight'] - df['entered weight']

# na = False: a tracking number without a description (multi-box, dropped below) is not a correction
df['shipping charge correction'] = np.where(df['charge description'].str.contains('shipping charge correction', na = False),
                                           'yes', 'no')
df['ups charge'] = df['tracking'].map(dict(zip(c['tracking'],c['ups charge'])))
df['shipping revenue'] = df['orders id'].map(dict(zip(ot_main['orders id'],ot_main['shipping revenue'])))
df['shipping profit'] = df['shipping revenue'] - df['ups charge']
df['overcharge'] = np.where(df['shipping profit'] < 0,'yes','no')

# rows that could not be matched above (multi-box orders, missing revenue) carry nulls; drop them
df.dropna(inplace = True)

In [None]:
# distribution of the weight gap for each (shipping charge correction, overcharge) combination
df.groupby(['shipping charge correction','overcharge'])[['billed minus entered']].describe()

In [None]:
# lines where the billed weight exceeds the entered weight by more than 35
df[df['billed minus entered'] > 35]

### For Daigo:
* From basecamp: "...pick an order that we can look at the measurements for and confirm whether they're getting rounded down"

In [None]:
# Whelp, nice find! I've made a push to update these columns to have decimal accuracy, but unfortunately it'll
    # only take effect for invoices that we import going forward. 
    # I'm thinking we can ignore those in the context of this conversation, 
    # except when the billed_weight is != 1 (ex 1809911), since that still 
    # isn't explained away by the rounding issue. 
    # this was on aug 9

d = '2018-08-09'

# get this data: tracking numbers with a non-zero entered weight, purchased after the date above
df1 = weight[(weight['entered weight'] > 0)
           & (pd.to_datetime(weight['date purchased']) > d)]

if df1.empty:
    print('from {} onwards all entered_weight > zero'.format(d))
else:

    # make a list of OIDs
    ls = df1['orders id'].tolist()

    # joined by hand (and de-duplicated): str(tuple(ls)) renders a single id as "(1,)", which is invalid SQL
    in_list = ', '.join(repr(x) for x in dict.fromkeys(ls))

    # get the actual weight of the package
    s = pd.read_sql(
    '''
    SELECT
    DATE(shipped_date) AS shipped_date,
    orders_id,
    sl_weight
    FROM ship_log 
    WHERE orders_id IN (''' + in_list + ''')
    ORDER BY orders_id
    ''', db)

    col_fix(s)

    # join the two
    df2 = df1[['orders id','entered weight']].set_index('orders id').join(s.set_index('orders id'))

    # organize and rename
    df2 = df2[['sl weight','entered weight','shipped date']]
    df2.rename(columns = {'entered weight':'ups_billing.entered weight',
                          'sl weight':'ship_log.sl_weight'}, inplace = True)

    df2.reset_index(inplace = True)

    # view this info
    a = df2['shipped date'].min()
    b = df2['shipped date'].max()
    print('data is from {} to {}'.format(a,b))
    print('WHERE ups_billing.entered_weight > 0 AND orders_total.date_purchased > 2018-08-09\n')

    # make this column: how does the entered weight relate to the logged weight?
    # np.select takes the first condition that holds, in the order listed
    logged = df2['ship_log.sl_weight']
    entered = df2['ups_billing.entered weight']
    df2['result'] = np.select([np.abs(logged - entered) < 0.01,
                               np.ceil(logged) == entered,
                               np.floor(logged) == entered],
                              ['equal', 'rounded up', 'rounded down'],
                              default = 'off by more than one pound')

    # get a nice result
    v1 = df2['result'].value_counts()
    v2 = df2['result'].value_counts().sum()
    v = pd.DataFrame(v1)
    v.columns = ['count']
    v['% of total'] = v1/v2
    v.loc['total'] = v.sum()
    print(v.format_(['n0','p2']))

    print('\ntop five "rounded down" by "shipped date"')
    # printed explicitly: a bare expression inside an if/else block is not displayed by the notebook
    print(df2[df2['result'] == 'rounded down'].sort_values('shipped date', ascending = False).head())

In [None]:
# set to 'yes' to export the rounding comparison
close = 'no'

if close == 'yes':
    # NOTE(review): a, b and df2 are only set by the previous cell when its data was not empty - confirm before exporting
    # the context manager saves and closes the workbook; ExcelWriter.save() no longer exists in pandas 2.x
    with pd.ExcelWriter('UPS weight rounded down or up? from {} to {}.xlsx'.format(a,b), engine = 'xlsxwriter') as writer:
        df2.to_excel(writer, sheet_name = 'data', index = False)

In [None]:
# progress marker: the cells above have finished running
print('done')

### For Daigo, again, now with more conditions

In [None]:
# OIDs for which the predicted box matched the actual box
ls = box_main.loc[box_main['correct box'] == 1, 'orders id'].tolist()

# tracking numbers that lost money although the box was right,
# there was no shipping charge correction and an entered weight was recorded
keep = ((weight['overcharge'] == 'yes')
        & (weight['shipping charge correction'] == 'no')
        & (weight['orders id'].isin(ls))
        & (weight['entered weight'] != 0))
df = weight[keep].copy()

# bring in per-order columns from the box predictor data and from the UPS order data
lookups = [(box_main, ['pred box','actual box','correct box']),
           (ups_main, ['shipping revenue','ups charge','shipping profit','jb charge description'])]
for source, wanted in lookups:
    for col in wanted:
        df[col] = df['orders id'].map(dict(zip(source['orders id'], source[col])))

# biggest losses first
df = df.sort_values('shipping profit')

In [None]:
# the five worst rows by shipping profit (df was sorted ascending above)
df.head()

# FTP weight data
### Be sure to check the CSV that you are uploading!!!

### Get and clean CSV

In [None]:
#path = r'/Users/jarad/fake_folder/Shipping/Projects/Box Predictor/Docs/FTP data for WW weights.csv'
path = r'/Users/jarad/fake_folder/Shipping/Projects/Box Predictor/Docs/08_22_2018_ftp_csv.csv'

# the file is read without a header row, so columns come back labelled by integer position
csv_main = pd.read_csv(path, low_memory = False, header = None)

# name only the positional columns used below; all others keep their integer labels
cols = {11:'transaction date',
        15:'orders id',
        20:'tracking',
        26:'entered weight',
        28:'billed weight',        
        35:'service',
        44:'charge description code',
        45:'charge description'}

csv_main.rename(columns = cols, inplace = True)

# parse the dates and lower-case the descriptions
# .str.lower() leaves a missing description as NaN instead of raising on a non-string value
csv_main['transaction date'] = pd.to_datetime(csv_main['transaction date'])
csv_main['charge description'] = csv_main['charge description'].str.lower()

### Make a copy and clean it
* This data has exclusions; be sure to view them below

In [None]:
# rows whose charge description contains any of these fragments are dropped
# note that we include Shipping Charge Corrections
ls = ['return','undeliverable','not previously','collect','shipment']
ls2 = '|'.join(ls)

# keep rows with an entered weight and without an excluded description, renamed columns only
has_entered_weight = csv_main['entered weight'] > 0
is_excluded = csv_main['charge description'].str.contains(ls2)
csv_clean = csv_main.loc[has_entered_weight & ~is_excluded, list(cols.values())].copy()

# UPS should bill fractions of a pound as the next whole pound, so round the entered weight up
csv_clean['entered weight (rounded)'] = np.ceil(csv_clean['entered weight'])

# positive means UPS billed more than the rounded entered weight
csv_clean['weight difference'] = csv_clean['billed weight'] - csv_clean['entered weight (rounded)']

### Compare CSV with database

In [None]:
# FTP side, keyed on tracking number
ftp_names = {'entered weight':'ftp entered weight',
             'billed weight':'ftp billed weight'}
a = (csv_clean
     .set_index('tracking')[['orders id','entered weight','billed weight']]
     .rename(columns = ftp_names))

# database side, keyed on tracking number
db_names = {'entered weight':'db entered weight',
            'billed weight':'db billed weight'}
b = (df1
     .set_index('tracking')[['entered weight','billed weight','year and month']]
     .rename(columns = db_names))

# side by side; rows with any missing value are dropped
c = a.join(b).dropna()

c.head()

In [None]:
# joined rows whose FTP billed weight is greater than 1
c[c['ftp billed weight'] > 1].head()

### View all charge descriptions after cleanup

In [None]:
# row count per charge description remaining after the exclusions
csv_clean['charge description'].value_counts()

In [None]:
# collapse to one row per tracking number
# the largest billed weight is the shipping charge correction's weight when there is one,
    # otherwise it is simply the largest billed weight on that tracking number
per_tracking = csv_clean.groupby('tracking', as_index = False)
csv = per_tracking.agg({'entered weight':'min',
                        'billed weight':'max'})

# attach OID and date from the unfiltered file
# (when a tracking number repeats there, its last row wins)
tracking_keys = csv_main['tracking']
for col in ['orders id','transaction date']:
    lookup = dict(zip(tracking_keys, csv_main[col]))
    csv[col] = csv['tracking'].map(lookup)

# UPS rounds up to the nearest whole pound
csv['entered weight (rounded)'] = np.ceil(csv['entered weight'])

# billed minus rounded entered weight
csv['weight difference'] = csv['billed weight'] - csv['entered weight (rounded)']

# mark tracking numbers that have at least one shipping charge correction line
is_scc = csv_clean['charge description'].str.contains('shipping charge correction')
tracking = csv_clean.loc[is_scc, 'tracking'].tolist()
csv['correction'] = np.where(csv['tracking'].isin(tracking), 'yes', 'no')

### Entered weight (rounded) vs billed weight

In [None]:
# summary statistics for the two weights, one row per tracking number
csv[['entered weight (rounded)','billed weight']].describe()

### Proportions of weight differences

In [None]:
# tracking numbers per weight difference, and their total
val1 = csv['weight difference'].value_counts()
val2 = val1.sum()

# name the proportion column explicitly: newer pandas names a value_counts() result 'count',
# which would otherwise break the lookups on 'weight difference' below and in the next section
d = (val1/val2).to_frame('weight difference')
d['count'] = val1 # exact counts, not proportion * total

# check your calcs
if np.abs(d.iloc[:,0].sum() - 1) < 0.1 and np.abs(d.iloc[:,1].sum() - val2) < 0.1:
    print('match\n')
else:
    print('mismatch\n')

# get the min and max date, for reference
d1 = str((csv['transaction date'].min()).date())
d2 = str((csv['transaction date'].max()).date())

# print some info
print('data is from {} to {}'.format(d1,d2))
print('{:,.0f} unique tracking numbers'.format(len(set(csv['tracking']))))

print('\nweight difference proportions and counts\n')
print(d.format_(['p2','n0']))

# value_counts() sorts by frequency, so the first five rows are the five most common differences
ax = d.iloc[:,0].iloc[:5].plot(kind = 'barh',
                               color = 'purple',
                               title = 'Difference between billed and entered weight (rounded)\nTop 5',
                               grid = True)
ax.set_xticklabels(['{:,.0f}%'.format(x * 100) for x in ax.get_xticks()])
ax.set_xlabel('proportion of whole')
ax.set_ylabel('weight difference')
plt.show()

### 95% Confidence interval for proportion of orders where entered != billed

In [None]:
# sample size: total tracking numbers
n = d['count'].sum()

# share of tracking numbers with a weight difference above zero
# NOTE(review): negative differences are not counted as mismatches here — confirm that is intended
p = d.loc[d.index > 0, 'weight difference'].sum()

# normal approximation: p +/- 1.96 standard errors
phi = 1.96
s = np.sqrt((p * (1-p))/n)
lower, upper = p - phi * s, p + phi * s

print('95% confidence interval for proportion of orders with mismatched entered and billed weights\n')
print('lower: {:,.2f}%\nmiddle: {:,.2f}%\nupper: {:,.2f}%'.format(lower * 100, p * 100, upper * 100))
print('sample size: {:,.0f} tracking numbers'.format(n))

### Shipping Charge Corrections

In [None]:
# tracking numbers flagged with a correction vs all tracking numbers
was_corrected = csv['correction'] == 'yes'
val1 = len(csv[was_corrected])
val2 = len(csv)

share = val1/val2 * 100
print('{:,.2f}% tracking numbers incurred a Shipping Charge Correction'.format(share))
print('or {:,.0f} out of {:,.0f}'.format(val1, val2))

### Weight differences of trackings that incurred a shipping charge correction

In [None]:
# get only tracking numbers that incurred this charge
scc = csv[csv['correction'] == 'yes'].copy()

# get the value counts per weight difference
val1 = scc['weight difference'].value_counts()
val2 = val1.sum()

# name the proportion column explicitly: newer pandas names a value_counts() result 'count',
# so assigning df['count'] below would overwrite the proportions instead of adding a second column
df = (val1/val2).to_frame('weight difference')
df['count'] = val1

# check totals
if np.abs(df.iloc[:,0].sum() - 1) < 0.1 and np.abs(df.iloc[:,1].sum() - val2) < 0.1:
    print('match')
else:
    print('mismatch')

# view it
df.format_(['p2','n0'])

In [None]:
# progress marker for the section above
print('done')

# Follow-up check on 2018-10-01

On Aug 09 from Daigo via basecamp: Whelp, nice find! I've made a push to update these columns to have decimal accuracy, but unfortunately it'll only take effect for invoices that we import going forward. 

Below we check to see when the last orders_id had an entered_weight = zero

In [None]:
# replaces csv_main with a different invoice file, read without a header row and decoded as ISO-8859-1
csv_main = pd.read_csv(r'/Users/jarad/fake_folder/Shipping/Projects/Box Predictor/Docs/US_A71EY05 8.csv', encoding = 'ISO-8859-1', low_memory = False, header = None)

In [None]:
# name only the positional columns used below; all others keep their integer labels
cols = {11:'transaction date',
        15:'orders id',
        20:'tracking',
        26:'ftp entered weight',
        28:'ftp billed weight',        
        35:'service',
        44:'charge description code',
        45:'charge description'}

csv_main.rename(columns = cols, inplace = True)

# parse the dates and lower-case the descriptions
# .str.lower() leaves a missing description as NaN instead of raising on a non-string value
csv_main['transaction date'] = pd.to_datetime(csv_main['transaction date'])
csv_main['charge description'] = csv_main['charge description'].str.lower()

# total FTP entered weight per tracking number (indexed by tracking)
a = csv_main.groupby('tracking')[['ftp entered weight']].sum()

In [None]:
# total ups_billing entered weight per tracking number, from d1 onward
# the date must be quoted: left bare, SQL evaluates 2018-08-01 as arithmetic rather than as a date
# NOTE(review): d1 is not set in this cell; on a top-to-bottom run it still holds the earliest
# transaction date of the previous CSV — confirm that is the cutoff wanted here
b = pd.read_sql(
'''
SELECT
DATE(o.date_purchased) AS date_purchased,
ups.tracking,
SUM(ups.entered_weight) AS ups_billing_entered_weight
FROM ups_billing ups
LEFT JOIN orders o ON ups.orders_id = o.orders_id
WHERE DATE(transaction_date) >= '{}'
GROUP BY ups.tracking
'''.format(d1), db)
col_fix(b)

In [None]:
# put the database entered weight next to the FTP total for each tracking number in a
c = a.join(b.set_index('tracking')[['ups billing entered weight']])

In [None]:
# look up OID and transaction date by tracking number (c's index) from the CSV
# when a tracking number repeats in csv_main, dict(zip(...)) keeps its last row
for col in ['orders id','transaction date']:
    c[col] = c.index.to_series().map(dict(zip(csv_main['tracking'], csv_main[col])))

In [None]:
# count rows with at least one missing value, then drop them
# axis is passed by keyword because pandas 2.x no longer accepts it positionally for any()
print('{:,.0f} nulls'.format(c.isnull().any(axis = 1).sum()))
c.dropna(inplace = True)

# date range covered by the CSV, for reference
d1 = csv_main['transaction date'].min().date()
d2 = csv_main['transaction date'].max().date()
print('ftp data from {} to {}'.format(d1,d2))

In [None]:
# tracking numbers whose database entered weight equals the FTP entered weight rounded down
floored_match = np.floor(c['ftp entered weight']) == c['ups billing entered weight']

print(len(c[floored_match]))
print('out of')
print(len(c))

In [None]:
# orders id of the first ten rows where the database weight equals the floored FTP weight
c[np.floor(c['ftp entered weight']) == c['ups billing entered weight']].head(10)['orders id']

### Order progression, for reference

In [None]:
# order id looked up by the queries in the following cells
oid = 1860847

### Entered vs Billed Weight 
* From Daigo via Shipping Audit (Monthly and Weekly) Basecamp
* 1.) box predictor picks a weight and a box. 
* 2.) We send the weight and the box size to UPS's api, and they return to us a quoted price.
* 3.)  this price shows up on checkout, and if the customer makes an order then the box and weight is saved into shipping_quotes. 
* 4.) shipper actually ships it, which involves weighing the actual box. this weight and box gets logged to ship_log, and hob sends that weight and box size to a different UPS api
* 5.) UPS returns us a shipping label, which has price and dim weight recorded. This is the weight that will eventually be returned to us as 'entered weight' as well
* 6.) UPS sends us an invoice with 'entered weight' and 'billed weight', where entered weight is what we sent them for the package in step 4 (supposedly),  and 'billed weight' is the value that they got when they weighed and measured it independently.

In [None]:
# shipping_quotes rows for this order, with the quoted box's dimensions and volume/139 as dim weight
query = '''
SELECT
s.*,
b.name,
b.length,
b.width,
b.height,
b.volume/139 AS dim_weight,
CEILING(b.volume/139) AS dim_weight_rounded
FROM shipping_quotes s
JOIN boxes b ON s.box = b.sku_id
WHERE orders_id = '''+ str(oid) +'''
'''
pd.read_sql(query, db)

In [None]:
# ship_log rows for this order, with the logged box's dimensions and volume/139 as dim weight
query = '''
SELECT
s.*,
b.name,
b.length,
b.width,
b.height,
b.volume/139 AS dim_weight,
CEILING(b.volume/139) AS dim_weight_rounded
FROM ship_log s
JOIN boxes b ON s.sl_box = b.sku_id
WHERE orders_id = '''+ str(oid) +'''
'''
pd.read_sql(query, db)

In [None]:
# every ups_billing line for this order: charge type plus entered and billed weight
query = '''
SELECT
charge_description,
entered_weight,
billed_weight
FROM ups_billing
WHERE orders_id = '''+ str(oid) +'''
'''
pd.read_sql(query, db)

In [None]:
# the same order in the FTP CSV; the id is compared as a string here
csv_main[csv_main['orders id'] == str(oid)][['orders id','ftp entered weight','ftp billed weight']]

### Compare dim weights by rounding as explained here:
https://www.ups.com/us/en/help-center/packaging-and-supplies/determine-billable-weight.page#contentBlock-12

In [None]:
# boxes logged as shipped on or after 2018-09-01, with the box dimensions
query = '''
SELECT
s.orders_id,
DATE(s.shipped_date) AS shipped_date,
CEILING(s.sl_weight) AS sl_weight, # round up to the nearest pound, as per UPS
b.name,
b.length,
b.width,
b.height
FROM ship_log s
JOIN boxes b ON s.sl_box = b.sku_id
WHERE DATE(s.shipped_date) >= '2018-09-01'
'''
ship_log = pd.read_sql(query, db)

col_fix(ship_log)

# NOTE(review): this also collapses two identical boxes logged on the same order and date — confirm intended
ship_log = ship_log.drop_duplicates()

In [None]:
# worked example of half-up rounding with the decimal module
# adapted from https://gist.github.com/jackiekazil/6201722

from decimal import Decimal, ROUND_HALF_UP
# rounding modes the decimal module offers:
# ROUND_05UP       ROUND_DOWN       ROUND_HALF_DOWN  ROUND_HALF_UP
# ROUND_CEILING    ROUND_FLOOR      ROUND_HALF_EVEN  ROUND_UP

# quantizing to Decimal('1') rounds to a whole number; ROUND_HALF_UP sends 2.5 to 3
our_value = Decimal(2.5)
whole_number = Decimal('1')
output = our_value.quantize(whole_number, rounding = ROUND_HALF_UP)

print(output)

In [None]:
def ups_rounding(x):
    """Round x to a whole number, with exact halves rounded away from zero.

    x is anything Decimal() accepts (int, float, str, Decimal).
    Returns a Decimal with no fractional digits.
    """
    whole_number = Decimal('1')
    rounded = Decimal(x).quantize(whole_number, rounding = ROUND_HALF_UP)
    return Decimal(rounded)

# round each box dimension to a whole number with ups_rounding
# (the stray second positional argument was dropped: Series.apply read it as convert_dtype,
# not as a number of decimals)
for col in ['length','width','height']:
    ship_log[col + ' rounded'] = ship_log[col].apply(ups_rounding)

In [None]:
# build the IN list by hand: str(tuple(...)) yields invalid SQL for a single id ("(5,)"),
# for no ids ("()"), and under NumPy 2 where scalars repr as "np.int64(5)"
# 'NULL' keeps the clause valid, matching nothing, when there are no orders
oid_list = ', '.join(str(x) for x in ship_log['orders id'].unique()) or 'NULL'

# ups_billing lines for every order present in ship_log
ups_main = pd.read_sql(
'''
SELECT
orders_id,
charge_description,
billed_weight
FROM ups_billing
WHERE orders_id IN (''' + oid_list + ''')
''', db)

col_fix(ups_main)

### Rounding example

In [None]:
# order id used for the rounding example in the next cells
oid = 1808679

In [None]:
# UPS billing lines for the example order, heaviest billed weight first
ups_main[ups_main['orders id'] == oid].sort_values('billed weight', ascending = False)

In [None]:
# ship_log rows for the example order
ex = ship_log[ship_log['orders id'] == oid].copy()

# dim weight: product of the rounded dimensions over 139, rounded up
rounded_volume = ex[['length rounded','width rounded','height rounded']].product(axis = 1)
ex['dim weight'] = np.ceil(rounded_volume/139)

# billable weight is the larger of the logged weight and the dim weight
ex['billable weight'] = ex[['sl weight','dim weight']].max(axis = 1)
ex = ex.sort_values('billable weight', ascending = False)

In [None]:
# display the example order with its computed dim and billable weights
ex

In [None]:
# now the billable weights from ship_log match those from ups_billing

### Test this ups rounding on the entire data set

In [None]:
# dim weight from the rounded dimensions; billed weight is the larger of logged weight and dim weight
rounded_volume = ship_log[['length rounded','width rounded','height rounded']].product(axis = 1)
ship_log['dim weight'] = np.ceil(rounded_volume/139)
ship_log['ship log billed weight'] = ship_log[['sl weight','dim weight']].max(axis = 1)

# one total per OID on the ship_log side
ship_log_billed = ship_log.groupby('orders id', as_index = False)[['ship log billed weight']].sum()

# one total per OID on the UPS side, leaving out shipping charge correction lines
is_correction = ups_main['charge description'].str.contains('Shipping Charge Correction')
ups_billed = (ups_main[~is_correction]
              .groupby('orders id', as_index = False)[['billed weight']]
              .sum()
              .rename(columns = {'billed weight':'ups billed weight'}))

# line UPS up against ship_log, keeping only OIDs that already have UPS data
billed = (pd.merge(ship_log_billed, ups_billed, on = 'orders id', how = 'left')
          .dropna(subset = ['ups billed weight']))

In [None]:
# share of OIDs where the ship_log billed weight is below, equal to, or above the UPS billed weight
t = len(billed)
print('count of unique OIDs considered: {:,.0f}'.format(t))

logged, invoiced = billed['ship log billed weight'], billed['ups billed weight']

a = np.sum(logged < invoiced)
print('ship_log < ups: {:,.2f}%'.format(a/t * 100))

b = np.sum(logged == invoiced)
print('ship_log = ups: {:,.2f}%'.format(b/t * 100))

c = np.sum(logged > invoiced)
print('ship_log > ups: {:,.2f}%'.format(c/t * 100))

### Results without rounding

In [None]:
# same comparison, but dim weight comes from the unrounded box dimensions
no_round = ship_log[['orders id','sl weight','name','length','width','height']].copy()
raw_volume = no_round[['length','width', 'height']].prod(axis = 1)
no_round['dim weight'] = np.ceil(raw_volume/139)
no_round['ship log billed weight'] = no_round[['sl weight','dim weight']].max(axis = 1)

# one total per OID
no_round_ship_log = no_round.groupby('orders id', as_index = False)[['ship log billed weight']].sum()

# line up against UPS, keeping only OIDs that already have UPS data
no_round_billed = (pd.merge(no_round_ship_log, ups_billed, on = 'orders id', how = 'left')
                   .dropna(subset = ['ups billed weight']))

In [None]:
# share of OIDs where the unrounded ship_log billed weight is below, equal to, or above UPS
t = len(no_round_billed)
print('count of unique OIDs considered: {:,.0f}'.format(t))

logged, invoiced = no_round_billed['ship log billed weight'], no_round_billed['ups billed weight']

a = np.sum(logged < invoiced)
print('ship_log < ups: {:,.2f}%'.format(a/t * 100))

b = np.sum(logged == invoiced)
print('ship_log = ups: {:,.2f}%'.format(b/t * 100))

c = np.sum(logged > invoiced)
print('ship_log > ups: {:,.2f}%'.format(c/t * 100))

# Follow-up on 2018-10-03

In [None]:
# order id looked up by the three queries that follow
oid = 1865347

In [None]:
# quoted box for this order; dim weight uses the rounded dimensions over 139, rounded up
query = '''
SELECT
s.*,
b.name,
b.length,
b.width,
b.height,
CEILING((ROUND(b.length) * ROUND(b.height) * ROUND(b.width))/139) AS dim_weight
FROM shipping_quotes s
JOIN boxes b ON s.box = b.sku_id
WHERE orders_id = '''+ str(oid) +'''
'''
pd.read_sql(query, db)

In [None]:
# logged box for this order; weight rounded up, dim weight from the rounded dimensions
query = '''
SELECT
sl.orders_id,
CEILING(sl.sl_weight) AS sl_weight,
sl.sl_box,
b.name,
b.length,
b.width,
b.height,
CEILING((ROUND(b.length) * ROUND(b.height) * ROUND(b.width))/139) AS dim_weight
FROM ship_log sl
JOIN boxes b ON sl.sl_box = b.sku_id
WHERE sl.orders_id = '''+ str(oid) +'''
'''
pd.read_sql(query, db)

In [None]:
# what UPS invoiced for this order
query = '''
SELECT
charge_description,
entered_weight,
billed_weight
FROM ups_billing
WHERE orders_id = '''+ str(oid) +'''
'''
pd.read_sql(query, db)

### Get some recent overcharges and see what's up

In [None]:
import sys
sys.path.insert(0,'/Users/jarad/fake_folder/Python Libraries/')

from jb_libraries import *
%matplotlib inline

In [None]:
# earliest purchase date for the overcharge audit below
d = '2018-09-15'

In [None]:
# shipping revenue per order against the summed UPS charge and billed weight, for orders from d onward
# the date must be quoted: left bare, SQL evaluates 2018-09-15 as arithmetic rather than as a date
ot = pd.read_sql(
'''
SELECT
DATE(o.date_purchased) AS date_purchased, 
ot.orders_id,
ot.value AS shipping_revenue,
ups.ups_charge,
ot.value - ups.ups_charge AS shipping_profit,
#ups.entered_weight,
ups.billed_weight AS ups_billed_weight

FROM orders o

LEFT JOIN orders_total ot ON o.orders_id = ot.orders_id

LEFT JOIN
(SELECT
orders_id,
SUM(netAmount) AS ups_charge,
SUM(entered_weight) AS entered_weight,
SUM(billed_weight) AS billed_weight
FROM ups_billing
GROUP BY orders_id) ups ON ot.orders_id = ups.orders_id

WHERE ot.class = 'ot_shipping'
AND ot.value != 0
AND DATE(o.date_purchased) >= '{}'
'''.format(d), db)

col_fix(ot)

In [None]:
# parse the purchase date, then keep only orders purchased on or after d
purchased = pd.to_datetime(ot['date purchased'])
ot['date purchased'] = purchased
ot = ot.loc[purchased >= d].copy()

In [None]:
# build the IN list by hand: str(tuple(...)) yields invalid SQL for a single id ("(5,)"),
# for no ids ("()"), and under NumPy 2 where scalars repr as "np.int64(5)"
# 'NULL' keeps the clause valid, matching nothing, when there are no orders
oid_list = ', '.join(str(x) for x in ot['orders id'].unique()) or 'NULL'

# quoted box, rounded-up quoted weight and dim weight for every order in ot
ship_quotes = pd.read_sql(
'''
SELECT
sq.orders_id,
sq.box AS sq_box,
CEILING(sq.weight) AS sq_weight,
#b.name,
#b.length,
#b.width,
#b.height,
CEILING((ROUND(b.length) * ROUND(b.height) * ROUND(b.width))/139) AS sq_dim_weight
FROM shipping_quotes sq
JOIN boxes b ON sq.box = b.sku_id
WHERE sq.orders_id IN (''' + oid_list + ''')
''', db)

col_fix(ship_quotes)

# billed weight is the larger of the actual and dim weight
ship_quotes['sq billed weight'] = ship_quotes[['sq weight','sq dim weight']].max(axis = 1)

In [None]:
# build the IN list by hand: str(tuple(...)) yields invalid SQL for a single id ("(5,)"),
# for no ids ("()"), and under NumPy 2 where scalars repr as "np.int64(5)"
# 'NULL' keeps the clause valid, matching nothing, when there are no orders
oid_list = ', '.join(str(x) for x in ot['orders id'].unique()) or 'NULL'

# logged box, rounded-up logged weight and dim weight for every order in ot
ship_log = pd.read_sql(
'''
SELECT
sl.orders_id,
sl.sl_box,
CEILING(sl.sl_weight) AS sl_weight,
#b.name,
#b.length,
#b.width,
#b.height,
CEILING((ROUND(b.length) * ROUND(b.height) * ROUND(b.width))/139) AS sl_dim_weight
FROM ship_log sl
JOIN boxes b ON sl.sl_box = b.sku_id
WHERE sl.orders_id IN (''' + oid_list + ''')
''', db)

col_fix(ship_log)

# billed weight is the larger of the actual and dim weight
ship_log['sl billed weight'] = ship_log[['sl weight','sl dim weight']].max(axis = 1)

In [None]:
# build the IN list by hand: str(tuple(...)) yields invalid SQL for a single id ("(5,)"),
# for no ids ("()"), and under NumPy 2 where scalars repr as "np.int64(5)"
# 'NULL' keeps the clause valid, matching nothing, when there are no orders
oid_list = ', '.join(str(x) for x in ot['orders id'].unique()) or 'NULL'

# every UPS charge line for the orders in ot
charges = pd.read_sql(
'''
SELECT
orders_id,
charge_description
FROM ups_billing
WHERE orders_id IN (''' + oid_list + ''')
''', db)

col_fix(charges)

# one comma-separated string of charge descriptions per order
charges2 = charges.groupby('orders id')['charge description'].apply(lambda x: ', '.join(x))
charges3 = pd.DataFrame(charges2).reset_index()

In [None]:
# order totals joined to the quote and to the ship log on orders id (only orders present in all three)
audit = ot.merge(ship_quotes, on = 'orders id').merge(ship_log, on = 'orders id')

# attach the combined charge descriptions
charge_lookup = dict(zip(charges3['orders id'], charges3['charge description']))
audit['ups charges'] = audit['orders id'].map(charge_lookup)

# remove where we do not yet have UPS billing data
audit = audit.dropna(subset = ['ups charge'])

In [None]:
# audit rows with negative shipping profit: how many, and the total lost
lost_money = audit['shipping profit'] < 0
loss = audit[lost_money].copy()

print(len(loss))
print(loss['shipping profit'].sum())

In [None]:
# losses where the quoted billed weight was below what UPS billed
quoted_under_ups = loss['sq billed weight'] < loss['ups billed weight']
a = loss[quoted_under_ups].copy()

print(len(a))
a['shipping profit'].describe()

In [None]:
# losses where the quoted billed weight was at or above what UPS billed
quoted_covers_ups = loss['sq billed weight'] >= loss['ups billed weight']
b = loss[quoted_covers_ups].copy()

print(len(b))
b['shipping profit'].describe()

In [None]:
# of those, rows where the ship_log billed weight was at or below what UPS billed
c = b[b['sl billed weight'] <= b['ups billed weight']].copy()

# print the count like the sibling cells do; a bare len(c) here is never displayed
print(len(c))
c['shipping profit'].describe()

In [None]:
# total shipping profit per combination of UPS charges, smallest total first
profit_by_charge = (c.groupby('ups charges')[['shipping profit']]
                     .sum()
                     .sort_values('shipping profit'))

ax = profit_by_charge.plot(kind = 'barh')
ax.legend(loc = 'upper left')
ax.grid()

# show the x axis as whole dollars
dollar_labels = ['${:,.0f}'.format(x) for x in ax.get_xticks()]
ax.set_xticklabels(dollar_labels)
plt.show()

In [None]:
# the five rows of c with the lowest shipping profit
c.sort_values('shipping profit').head()

### To excel, per Daigo request

In [None]:
# orders where the quoted billed weight was below what UPS billed
under_quoted = audit['sq billed weight'] < audit['ups billed weight']
xl_main = audit[under_quoted]

# split into zero-or-positive profit and negative profit
profit = xl_main['shipping profit']
xl_gain = xl_main[profit >= 0].copy()
xl_loss = xl_main[profit < 0].copy()

# the two parts should add back up to the whole
print(len(xl_main) == len(xl_gain) + len(xl_loss))

In [None]:
# under-quoted orders, all
len(xl_main)

In [None]:
# under-quoted orders with shipping profit >= 0
len(xl_gain)

In [None]:
# under-quoted orders with shipping profit < 0
len(xl_loss)

In [None]:
# flip to 'yes' to export the three frames to one workbook, one sheet each
write = 'no'

if write == 'yes':
    # the context manager saves and closes the workbook (ExcelWriter.save() was removed in pandas 2.x)
    with pd.ExcelWriter('2018-09-15 to 2018-10-03 - shipping_quotes.billed_weight and ups_billing.billed_weight.xlsx', engine = 'xlsxwriter') as writer:
        xl_main.to_excel(writer, sheet_name = 'all data', index = False)
        xl_gain.to_excel(writer, sheet_name = 'zero profit or gain', index = False)
        xl_loss.to_excel(writer, sheet_name = 'profit loss', index = False)

In [None]:
# progress marker for the section above
print('done')

### "Under Prediction" comment

In [None]:
# fraction of audited rows where the quoted billed weight was below what UPS billed
under_quoted = audit[audit['sq billed weight'] < audit['ups billed weight']]
a, b = len(audit), len(under_quoted)
b/a

### 2018-10-04 follow-up

In [None]:
# the under-quoted export rows for one specific order
xl_main[xl_main['orders id'] == 1853173]

In [None]:
# quoted box and weight for order 1853173, with the box dimensions and dim weight
query = '''
SELECT
sq.orders_id,
sq.box AS sq_box,
sq.weight,
b.name,
b.length,
b.width,
b.height,
CEILING((ROUND(b.length) * ROUND(b.height) * ROUND(b.width))/139) AS sq_dim_weight
FROM shipping_quotes sq
JOIN boxes b ON sq.box = b.sku_id
WHERE sq.orders_id = 1853173
'''
pd.read_sql(query, db)

In [None]:
# logged box and weight for order 1853173, with the box dimensions and dim weight
query = '''
SELECT
sl.orders_id,
sl.sl_box,
sl.sl_weight,
#b.name,
b.length,
b.width,
b.height,
CEILING((ROUND(b.length) * ROUND(b.height) * ROUND(b.width))/139) AS sl_dim_weight
FROM ship_log sl
JOIN boxes b ON sl.sl_box = b.sku_id
WHERE sl.orders_id = 1853173
'''
pd.read_sql(query, db)

### 2018-10-09: Create interval for additional non-box packaging weight, like bubble wrap, etc.
* This is the difference in dim weight between shipping_quotes and ship_log 
* [Confidence interval for a proportion](https://onlinecourses.science.psu.edu/stat100/node/56/)

In [None]:
import sys
sys.path.insert(0,'/Users/jarad/fake_folder/Python Libraries/')

from jb_libraries import *
%matplotlib inline

In [None]:
# purchase-date window for this section; replaces the date_start/date_end set at the top of the notebook
date_start = '2018-09-08'
date_end = '2018-10-08'

In [None]:
# every box with its dim weight: rounded dimensions multiplied, divided by 139, rounded up
query = '''
SELECT
sku_id,
name,
CEILING((ROUND(length) * ROUND(width) * ROUND(height))/139) AS box_dim_weight
FROM boxes
'''
boxes = pd.read_sql(query, db)

col_fix(boxes)

In [None]:
# display the full box table with dim weights
boxes

In [None]:
# the orders_status table, ordered by id, as a reference for the status ids excluded in the queries below
query = '''
SELECT
*
FROM orders_status
ORDER BY orders_status_id
'''
pd.read_sql(query, db)

In [None]:
# quotes for orders purchased in the window, leaving out order statuses 8 through 15
query = '''
SELECT
DATE(o.date_purchased) AS date_purchased,
sq.orders_id,
sq.box AS sq_box,
sq.weight AS sq_weight,
CEILING(sq.weight) AS sq_weight_rounded,
b.name,
b.length,
b.width,
b.height,
CEILING((ROUND(b.length) * ROUND(b.height) * ROUND(b.width))/139) AS sq_dim_weight
FROM shipping_quotes sq
JOIN boxes b ON sq.box = b.sku_id
JOIN orders o ON sq.orders_id = o.orders_id
WHERE DATE(o.date_purchased) BETWEEN ' '''+ date_start +''' ' AND ' '''+ date_end +''' '
AND o.orders_status NOT IN (8,9,10,11,12,13,14,15)
'''
sq = pd.read_sql(query, db)

col_fix(sq)

# billed weight is the larger of the rounded actual weight and the dim weight
sq['sq billed weight'] = sq[['sq weight rounded','sq dim weight']].max(axis = 1)

# drop every order that has more than one quote row
dupes1 = sq.loc[sq['orders id'].duplicated(), 'orders id'].tolist()
sq = sq[~sq['orders id'].isin(dupes1)].copy()

In [None]:
# ship_log rows for orders purchased in the window, leaving out order statuses 8 through 15
query = '''
SELECT
DATE(o.date_purchased) AS date_purchased,
sl.orders_id,
sl.sl_box,
sl.sl_weight AS sl_weight,
CEILING(sl.sl_weight) AS sl_weight_rounded,
b.name,
b.length,
b.width,
b.height,
CEILING((ROUND(b.length) * ROUND(b.height) * ROUND(b.width))/139) AS sl_dim_weight
FROM ship_log sl
JOIN boxes b ON sl.sl_box = b.sku_id
JOIN orders o ON sl.orders_id = o.orders_id
WHERE DATE(o.date_purchased) BETWEEN ' '''+ date_start +''' ' AND ' '''+ date_end +''' '
AND o.orders_status NOT IN (8,9,10,11,12,13,14,15)
'''
sl = pd.read_sql(query, db)

col_fix(sl)

# billed weight is the larger of the rounded actual weight and the dim weight
sl['sl billed weight'] = sl[['sl weight rounded','sl dim weight']].max(axis = 1)

# drop every order that has more than one ship_log row
dupes2 = sl.loc[sl['orders id'].duplicated(), 'orders id'].tolist()
sl = sl[~sl['orders id'].isin(dupes2)].copy()

In [None]:
print('data is between %s and %s' % (date_start, date_end))

# dupes2 lists an order once per extra ship_log row, so count distinct order ids
print('{:,.0f} OIDs were shipped in multiple boxes, and these have been excluded here\n'.format(len(set(dupes2))))

# how many quoted orders also appear in ship_log
a = len(sq)
b = len(sq[sq['orders id'].isin(sl['orders id'].tolist())])
c = a-b

print('{:,.0f} OIDs in shipping_quotes\n{:,.0f} of these OIDs are in ship_log'.format(a,b))
print('which leaves {:,.0f} boxes ({:,.2f}% of shipping_quotes total) that were not scanned by shippers'.format(c, c/a*100))

# one row per order present in both tables, shipped columns next to quoted columns
ls1 = ['date purchased','orders id','sl box','sl weight','sl weight rounded','sl billed weight']
ls2 = ['orders id','sq box','sq weight','sq weight rounded','sq billed weight']
main = pd.merge(sl[ls1], sq[ls2], how = 'inner', on = 'orders id')

# orders where the shipped box is the quoted box
d = len(main[main['sq box'] == main['sl box']])
print('\n{:,.0f} OIDs were shipped in the same box as quoted'.format(d))
print('this is {:,.2f}% of ship_log total'.format(d/b*100))

In [None]:
# move the ship_log columns to the end, keeping every other column in its current order
ls = ['sl box','sl weight','sl weight rounded','sl billed weight']
cols = [name for name in main.columns.tolist() if name not in ls] + ls
main = main[cols]

### Same box quoted and shipped

In [None]:
# orders where the shipped box equals the quoted box
same_box = main['sl box'] == main['sq box']
w1 = main[same_box].copy()

print('{:,.0f} OIDs were shipped in the same box as quoted'.format(len(w1)))
print('or {:,.2f}% of total'.format(len(w1)/len(main) * 100))

In [None]:
# share of same-box orders whose quoted billed weight is at least the shipped billed weight
n = len(w1)
quote_covers = w1['sq billed weight'] >= w1['sl billed weight']
s1 = len(w1[quote_covers])/n

# normal approximation: s1 +/- 1.96 standard errors
z = 1.96
st_error = np.sqrt((s1 * (1-s1))/n)
lower, upper = s1 - z * st_error, s1 + z * st_error

print('out of all OIDs where the same box was quoted and shipped')
print('confidence interval for proportion of orders where shipping_quotes billed weight >= ship_log billed weight')
print('\nlower {:,.2f}%\nactual: {:,.2f}%\nupper: {:,.2f}%'.format(lower * 100, s1 * 100, upper * 100))

### Different box quoted and shipped

In [None]:
# orders where the shipped box differs from the quoted box
different_box = main['sl box'] != main['sq box']
w2 = main[different_box].copy()

print('{:,.0f} OIDs were NOT shipped in the same box as quoted'.format(len(w2)))
print('this is {:,.2f}% of ship_log total'.format(len(w2)/len(main) * 100))

In [None]:
# dim weight of the quoted box and of the shipped box, looked up by box sku id
dim_weight_by_box = dict(zip(boxes['sku id'], boxes['box dim weight']))
w2['sq box dim weight'] = w2['sq box'].map(dim_weight_by_box)
w2['sl box dim weight'] = w2['sl box'].map(dim_weight_by_box)

In [None]:
# share of different-box orders whose quoted billed weight is below the shipped billed weight
n = len(w2)
quote_short = w2['sq billed weight'] < w2['sl billed weight']
s1 = len(w2[quote_short])/n

# normal approximation: s1 +/- 1.96 standard errors
z = 1.96
st_error = np.sqrt((s1 * (1-s1))/n)
lower, upper = s1 - z * st_error, s1 + z * st_error

print('out of all OIDs where the same box was NOT quoted and shipped')
print('confidence interval for proportion of orders where quoted billed weight < shipped billed weight')
print('\nlower {:,.2f}%\nactual: {:,.2f}%\nupper: {:,.2f}%'.format(lower * 100, s1 * 100, upper * 100))

In [None]:
# different-box orders quoted below the shipped billed weight, as a fraction of all orders in main
len(w2[w2['sq billed weight'] < w2['sl billed weight']])/len(main)

In [None]:
# number of orders present in both shipping_quotes and ship_log
len(main)

In [None]:
# different-box orders whose quoted billed weight is below the shipped billed weight
under_quoted = w2['sq billed weight'] < w2['sl billed weight']
a = len(w2[under_quoted])

# ...of which the rounded actual weight was quoted too low
b = len(w2[under_quoted
  & (w2['sq weight rounded'] < w2['sl weight rounded'])])

# ...of which the quoted box's dim weight was below the shipped box's
c = len(w2[under_quoted
  & (w2['sq box dim weight'] < w2['sl box dim weight'])])

# the two shares below can overlap, so they need not add up to 100%
print('total OID count of the orders where a different box was quoted and shipped: {:,.0f}'.format(len(w2)))
print('of this count, {:,.0f} or {:,.2f}% of OIDs had shipping_quote billed weight < ship_log billed weight'.format(a,a/len(w2) * 100))
print('\n{:,.2f}% of this count had an under-quoted rounded actual weight'.format(b/a*100))
print('{:,.2f}% of this count had an under-quoted box dim weight'.format(c/a * 100))

# Follow-up on numerical truncation issue
* Fix was pushed 2018-10-01

In [1]:
import sys
sys.path.insert(0,'/Users/jarad/fake_folder/Python Libraries/')

from jb_libraries import *
%matplotlib inline

In [None]:
# entered and billed weights on UPS billing lines for orders purchased on or after 2018-10-01
query = '''
SELECT
ups.orders_id,
entered_weight,
billed_weight
FROM ups_billing ups
JOIN orders o ON ups.orders_id = o.orders_id
AND DATE(date_purchased) >= '2018-10-01'
'''
trunc = pd.read_sql(query, db)

col_fix(trunc)

In [None]:
# first rows, to eyeball whether entered weights now carry decimals
trunc.head()