# REMINDER:
* consider making one .py file for the DHL billing data import
* furthermore, consider just using SUM(weight charge, extra charges) = SUM(total charge) to compare against the Adafruit db
* see very bottom of this script

In [None]:
import sys
sys.path.insert(0,'/Users/jarad')

import pandas as pd
import numpy as np
from db2 import *
import xlsxwriter

import datetime as dt
import calendar
from df_format import *

import glob

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 100)
pd.set_option('display.float_format', lambda x: '{:,.4f}'.format(x))


import matplotlib.pyplot as plt
%matplotlib inline

# Set stuff

In [None]:
# Reporting window as 'YYYY-MM-DD' strings; later cells interpolate these
# into SQL date filters and reuse them for the pandas date filters.
date_start, date_end = '2016-12-01', '2017-12-31'

#===== colors http://www.color-hex.com/color-palette/50389
# Keys are 'color01' .. 'color05', numbered in the order the hex codes appear.
colors = {
    'color{:02d}'.format(position): hex_code
    for position, hex_code in enumerate(
        ['#f2f2f2', '#cccccc', '#b0d8da', '#007897', '#0a406e'], start = 1)
}

# Adafruit data

In [None]:
# Load order-level data from the shop database:
#   A - orders (excluding status 9 and 10 and 'Replacement Order' payments),
#       with shipping_module_code mapped to a short 'service' label and an
#       adabox flag set when the order id appears in subscriptions_history
#       with action 'Create Shipment Order' for an 'adabox' subscription
#   B - the 'ot_subtotal' row of orders_total
#   C - the 'ot_shipping' row of orders_total
#   D - the 'ot_ddp' row of orders_total
# B, C and D are LEFT JOINed, so their values are NULL for orders that have
# no such row.
# NOTE(review): presumably one row per order - that holds only if
# orders_total has at most one row per class per order; confirm.
# A 'service' value of 'YOU MISSED ONE' marks a shipping_module_code that the
# CASE expression does not map.
orders_total_main = pd.read_sql(
'''
SELECT
DATE(A.date_purchased) AS date,
DATE_FORMAT(A.date_purchased, '%Y-%m') AS 'year and month',
A.orders_id AS 'orders id',
A.service,
B.order_subtotal AS 'order subtotal',
C.shipping AS 'service revenue',
D.ddp AS 'ddp revenue',
A.adabox,
A.delivery_city AS 'delivery city',
A.delivery_country AS 'delivery country'

FROM

(SELECT
date_purchased,
orders_id,

CASE 
WHEN shipping_module_code = 'upsxml' THEN 'ups'
WHEN shipping_module_code = 'usps' THEN 'usps'
WHEN shipping_module_code = 'dhlexpress' THEN 'dhl'
WHEN shipping_module_code = 'resellershipping' THEN 'reseller shipping'
WHEN shipping_module_code IN ('free','','sameday','----- NO SHIPPING SELECTED -----') THEN 'free/sameday/other'
ELSE 'YOU MISSED ONE' END AS 'service',

LOWER(delivery_city) AS delivery_city,
LOWER(delivery_state) AS delivery_state,
LOWER(delivery_country) AS delivery_country,

IF(orders_id IN 
(SELECT
sh.orders_id
FROM subscriptions s
JOIN subscriptions_history sh ON s.subscriptions_id = sh.subscriptions_id 
AND sh.action = 'Create Shipment Order'
WHERE s.subscriptions_type = 'adabox'), 'yes','no') AS 'adabox'

FROM orders

WHERE orders_status != 9
AND orders_status != 10
AND payment_method != 'Replacement Order') A

LEFT JOIN

(SELECT
orders_id,
value AS order_subtotal
FROM orders_total
WHERE class = 'ot_subtotal') B ON A.orders_id = B.orders_id

LEFT JOIN

(SELECT
orders_id,
value AS shipping
FROM orders_total
WHERE class = 'ot_shipping') C ON A.orders_id = C.orders_id

LEFT JOIN

(SELECT
orders_id,
value AS ddp
FROM orders_total
WHERE class = 'ot_ddp') D ON A.orders_id = D.orders_id
''', db)

# Total shipping revenue per order = service revenue + DDP revenue.
# sum(axis = 1) skips NULLs, so an order with neither value gets 0 and is
# therefore flagged as free shipping on the next line.
orders_total_main['shipping revenue'] = orders_total_main[['service revenue','ddp revenue']].sum(axis = 1)
orders_total_main['free shipping'] = np.where(orders_total_main['shipping revenue'] == 0, 'yes','no')

# Assign the filled column back instead of calling fillna(inplace = True) on
# the column selection: the chained in-place form is not guaranteed to write
# through to the parent frame, and never does under pandas copy-on-write.
orders_total_main['ddp revenue'] = orders_total_main['ddp revenue'].fillna(0)

# Keep only the reporting columns, in display order.
orders_total_main = orders_total_main[['date',
                                       'year and month',
                                       'orders id',
                                       'order subtotal',
                                       'service revenue',
                                       'ddp revenue',
                                       'shipping revenue',
                                       'service',
                                       'adabox',
                                       'free shipping',
                                       'delivery city',
                                       'delivery country']]

In [None]:
orders_total_main.head()

# Latest date in ups_billing table

In [None]:
pd.read_sql(
'''
SELECT
MAX(transaction_date) AS 'latest date'
FROM ups_billing
''', db)

# Missing UPS orders

In [None]:
# Count, per purchase month, the UPS ('upsxml') orders inside the reporting
# window whose orders_id does not appear in ups_billing.
# The date literals are now emitted without surrounding spaces
# ('2016-12-01' rather than ' 2016-12-01 '), so the comparison no longer
# depends on the server tolerating padded date strings.
# NOTE(review): NOT IN returns no rows at all if ups_billing.orders_id
# contains a NULL - confirm that column is never NULL.
# Months with no missing orders produce no row in the result.
missing = pd.read_sql(
'''
SELECT
DATE_FORMAT(date_purchased, '%Y-%m') AS 'year and month',
COUNT(*) AS missing
FROM orders
WHERE shipping_module_code = 'upsxml'
AND DATE(date_purchased) BETWEEN '{start}' AND '{end}'
AND orders_id NOT IN (SELECT orders_id FROM ups_billing)
GROUP BY DATE_FORMAT(date_purchased, '%Y-%m')
'''.format(start = date_start, end = date_end), db)

In [None]:
# Bar chart of the monthly count of UPS orders with no ups_billing row.
month_labels = [str(month)[:7] for month in missing['year and month']]

ax = missing.plot(kind = 'bar', figsize = (20,5), use_index = False)

# One tick per row of `missing`, labelled with its 'YYYY-MM' value.
ax.set_xticks(np.arange(0, len(missing.index)))
ax.set_xticklabels(month_labels, rotation = 45)

# Re-label the existing y ticks with thousands separators.
vals = ax.get_yticks()
ax.set_yticklabels(['{:,.0f}'.format(tick) for tick in vals])

ax.set_title('Count of UPS Orders Missing From ups_billing table', fontsize = 15)
ax.set_xlabel('Year and Month', fontsize = 15)
ax.set_ylabel('Count', fontsize = 15)
ax.legend(['Missing UPS Count'], fontsize = 12)
ax.grid()
plt.show()

In [None]:
# Report the missing-order count for the last month of the reporting window.
# The row is looked up by its 'year and month' label instead of a fixed
# position (previously row 12): months with no missing orders have no row in
# `missing`, so a fixed position can point at the wrong month or past the end.
current_month = date_end[:7]
current_missing = missing.loc[missing['year and month'] == current_month, 'missing']
current_missing_count = current_missing.iloc[0] if not current_missing.empty else 0
print ('count of orders missing from current month: {}'.format(current_missing_count))

# Get data

### UPS

In [None]:
# UPS charge descriptions that are counted as DDP charges; the ups_billing
# query uses this list to split netAmount into 'ddp charge' (description in
# the list) and 'service charge' (description not in the list).
ups_ddp_charge_list = [
    'Agri Processing',
    'Broker Fee',
    'Brokerage Fees',
    'Brokerage GST',
    'Ca British Columbia Pst',
    'Ca Customs Hst',
    'Complex Entry',
    'Customs Gst',
    'Customs Warehouse',
    'DGoods Air Inaccessible',
    'Duty Amount',
    'Pst Quebec',
    'QST',
]

In [None]:
# Build the SQL IN-list explicitly instead of relying on str(tuple(...)):
# the tuple repr emits "('x',)" for a one-item list (invalid SQL) and
# switches to double quotes when a description contains an apostrophe.
ups_ddp_in_list = '(' + ', '.join("'" + description.replace("'", "''") + "'"
                                  for description in ups_ddp_charge_list) + ')'

# One row per ups_billing line on or after date_start (no upper bound).
# netAmount lands in 'ddp charge' when the description is in the DDP list and
# in 'service charge' otherwise; the other column of the pair is NULL.
# The date literal is emitted without surrounding spaces.
ups_main = pd.read_sql(
'''
SELECT
DATE_FORMAT(transaction_date, '%Y-%m') AS 'year and month',
DATE(transaction_date) AS 'date',
orders_id AS 'orders id',
CASE WHEN charge_description IN {ddp} THEN netAmount END AS 'ddp charge',
CASE WHEN charge_description NOT IN {ddp} THEN netAmount END AS 'service charge',
charge_description AS 'charge description',
'ups' AS 'service'

FROM ups_billing

WHERE DATE(transaction_date) >= '{start}'
'''.format(ddp = ups_ddp_in_list, start = date_start), db)

In [None]:
# Assign back rather than fillna(inplace = True) on a column selection, which
# is not guaranteed to modify ups_main (and never does under copy-on-write).
ups_main['ddp charge'] = ups_main['ddp charge'].fillna(0)

# Order ids known to the shop database, and the subsets flagged there.
known_orders = orders_total_main['orders id']
adabox_orders = known_orders[orders_total_main['adabox'] == 'yes']
free_shipping_orders = known_orders[orders_total_main['free shipping'] == 'yes']

ups_main['adabox'] = np.where(ups_main['orders id'].isin(adabox_orders), 'yes','no')
ups_main['free shipping'] = np.where(ups_main['orders id'].isin(free_shipping_orders), 'yes','no')
# A billing line whose orders id is not in orders_total_main is labelled incoming.
ups_main['incoming/outgoing'] = np.where(ups_main['orders id'].isin(known_orders), 'outgoing','incoming')

# sum(axis = 1) skips the NULL half of each service/ddp pair.
ups_main['shipping charge'] = ups_main[['service charge','ddp charge']].sum(axis = 1)

# .str.lower() leaves NULL descriptions as missing values; the previous
# x.lower() comprehension raised AttributeError on them.
ups_main['charge description'] = ups_main['charge description'].str.lower()

### DHL

In [None]:
# get all CSVs
path = r'/Users/jarad/fake_folder/Shipping/Recurring/DHL Tax Analysis/Docs/DHL Invoices/'
# sorted() makes the row order of the combined frame reproducible; glob
# returns matches in arbitrary order.
all_files = sorted(glob.glob(path + '/*.csv'))
frame = pd.DataFrame()  # not used in this cell; kept in case another cell references it

# Fail with a clear message instead of pd.concat's "No objects to concatenate".
if not all_files:
    raise FileNotFoundError('no DHL invoice CSVs found in {}'.format(path))

# One frame per invoice file; the first line of each file is the header.
list_ = [pd.read_csv(file_, index_col = None, header = 0) for file_ in all_files]

# Row labels restart at 0 for every file, so the combined index has repeats.
dhl_super_raw = pd.concat(list_)

# fix columns
dhl_super_raw.columns = [x.lower() for x in dhl_super_raw.columns]
dhl_super_raw.rename(columns = {'shipment reference 1':'orders id',
                               'dest name':'delivery city',
                               'dest country name':'delivery country'}, inplace = True)

# check for dupes
if dhl_super_raw.duplicated().any():
    print('main CSV does have some fully duplicated lines')
else:
    print('main CSV has no fully duplicated lines')

# fix orders id data type
# References that are not numeric become 0, then everything is cast to int.
dhl_super_raw['orders id'] = pd.to_numeric(dhl_super_raw['orders id'], errors = 'coerce').fillna(0).astype(int)

# fix weight charge
# Assigned back rather than filled in place on a column selection, which is
# not guaranteed to modify the frame (and never does under copy-on-write).
dhl_super_raw['weight charge'] = dhl_super_raw['weight charge'].fillna(0)

# drop any column with all zeros
# NaN != 0 is True, so a column that is entirely NaN is kept.
dhl_super_raw = dhl_super_raw.loc[:, (dhl_super_raw != 0).any(axis = 0)]

# get count of columns with charge names and amounts
# NOTE(review): the "- 1" and "/3" presumably mean each extra-charge group
# contributes three 'xc<i> ...' columns and one further column name contains
# 'xc' without belonging to a group - confirm against the invoice layout.
# Only the 'xc<i> name' and 'xc<i> charge' columns are read below.
total_columns = sum(dhl_super_raw.columns.str.contains('xc')) - 1
total_charge_groups = total_columns/3

print('count of charge-type columns: %i' % total_charge_groups)

# For every charge group, pivot the charge names into columns so each row is
# one (shipment number, orders id) pair with one column per charge name.
charge_frames = []

for i in np.arange(1, int(total_charge_groups) + 1):
    name_column = 'xc' + str(i) + ' name'
    charge_column = 'xc' + str(i) + ' charge'

    df = dhl_super_raw.groupby(['shipment number','orders id', name_column])[[charge_column]].sum().unstack(2).fillna(0)
    df.columns = df.columns.get_level_values(1)
    df.reset_index(inplace = True)
    charge_frames.append(df)

# DataFrame.append was removed in pandas 2.0; a single concat stacks the
# per-group frames the same way (union of columns, fresh row index).
dhl_main = pd.concat(charge_frames, ignore_index = True) if charge_frames else pd.DataFrame()

dhl_test = dhl_main.copy()

# fill nulls
dhl_main.fillna(0, inplace = True)

# drop columns with all zeros
dhl_main = dhl_main.loc[:, (dhl_main != 0).any(axis = 0)]

# convert column names to lower case
dhl_main.columns = [x.lower() for x in dhl_main.columns]

# consolidate charges by shipment number
dhl_main = dhl_main.groupby(['shipment number','orders id'], as_index = False).sum()

# add the weight charge
# Right join: pairs that only have a weight charge are kept, with NaN in the
# extra-charge columns.
dhl_main = pd.merge(dhl_main,
                    dhl_super_raw.groupby(['shipment number','orders id'], as_index = False)[['weight charge']].sum(),
                    how = 'right',
                    on = ['shipment number','orders id'])

# find and fix shipment number dupes
# dhl_main has one row per (shipment number, orders id) pair at this point,
# so a shipment number that appears more than once is tied to several orders
# ids. Those rows get orders id 0, the same value given to references that
# could not be parsed as a number.
shipment_number_dupes = dhl_main[dhl_main['shipment number'].duplicated(keep = False)].sort_values('shipment number')

# Built once so the per-row check is a hash lookup instead of rebuilding and
# scanning a list for every row.
_dupe_shipment_numbers = set(shipment_number_dupes['shipment number'])

def shipment_number_dupe_clean(x):
    """Return 0 if the row's shipment number is duplicated, else its orders id.

    x is one row of dhl_main, as passed by DataFrame.apply(axis = 1).
    """
    if x['shipment number'] in _dupe_shipment_numbers:
        return 0
    return x['orders id']

dhl_main['orders id'] = dhl_main.apply(shipment_number_dupe_clean, axis = 1)

# find and fix orders id dupes
# each set of orders id's have different shipment numbers
# we replace the different shipment numbers with a single shipment number
# we keep track of them to check our work at the end
orders_id_dupes = dhl_main.copy()
repeated_orders_ids = orders_id_dupes['orders id'][(orders_id_dupes['orders id'].duplicated()) & (orders_id_dupes['orders id'] != 0)]
orders_id_dupes = orders_id_dupes[orders_id_dupes['orders id'].isin(repeated_orders_ids)].sort_values('orders id')

# Rows reset to orders id 0 above may now share a (shipment number, orders id)
# pair, so collapse them.
dhl_main = dhl_main.groupby(['shipment number','orders id'], as_index = False).sum()

# orders id -> the shipment number all rows of that order are relabelled
# with: the first one listed for it in orders_id_dupes. Built once so the
# per-row step is a dict lookup rather than a list scan plus a frame filter.
_kept_shipment_number = (orders_id_dupes.drop_duplicates('orders id')
                         .set_index('orders id')['shipment number']
                         .to_dict())

def orders_id_dupe_clean(x):
    """Return the shipment number to use for row x of dhl_main.

    Rows whose orders id is in orders_id_dupes get that order's kept shipment
    number; every other row keeps its own shipment number.
    """
    return _kept_shipment_number.get(x['orders id'], x['shipment number'])

dhl_main['shipment number'] = dhl_main.apply(orders_id_dupe_clean, axis = 1)

# Shipment numbers that were replaced: every shipment number seen in
# orders_id_dupes minus the ones still present on rows with a repeated orders id.
excluded_shipment_numbers = list(set(orders_id_dupes['shipment number'].tolist()) - set(dhl_main['shipment number'][dhl_main['orders id'].duplicated()]))

# now we have created a one-to-one between shipment numbers and orders id
dhl_main = dhl_main.groupby(['shipment number','orders id'], as_index = False).sum()

# check it out
raw_shipment_count = len(dhl_super_raw['shipment number'].unique())
clean_shipment_count = len(dhl_main['shipment number'].unique())

print('')
print('count of unique shipment numbers in dhl_super_raw: ', raw_shipment_count)
print('count of unique shipment numbers in dhl_main after dupe clean up: ', clean_shipment_count)
print('count of shipment numbers we excluded', len(excluded_shipment_numbers))
print('so {} total unique shipment numbers after dupe clean up'.format(clean_shipment_count + len(excluded_shipment_numbers)))

if raw_shipment_count == clean_shipment_count + len(excluded_shipment_numbers):
    print('your dupe clean up worked!')
else:
    print('your dupe clean up DID NOT work')
    
# clean columns
# Each target column is the row-wise sum of the listed source columns
# (different spellings of the same charge), which are then dropped.
charge_column_groups = [
    ('ddp charge', ['import / export duties',
                    'import export duties',
                    'import export taxes',
                    'import/ export duties']),
    ('shipment value protect', ['shipment value protection']),
    ('licenses and permits', ['obtaining permits & licenses',
                              'obtaining permits &amp; licenses']),
]

for target_column, source_columns in charge_column_groups:
    # Use only the spellings present in the loaded invoices; selecting a
    # missing one raised KeyError. With none present the target is all zeros.
    present_columns = [column for column in source_columns if column in dhl_main.columns]
    dhl_main[target_column] = dhl_main[present_columns].sum(axis = 1)
    # drop(labels, 1) relied on a positional axis argument that pandas 2.0
    # removed; columns= is the keyword form.
    dhl_main.drop(columns = present_columns, inplace = True)

# create new columns
# Every column after the two key columns ('shipment number', 'orders id') is
# a charge amount - including 'weight charge', merged in earlier, and
# 'ddp charge' itself - so the ddp charge is subtracted back out.
dhl_main['service charge'] = dhl_main.iloc[:, 2:].sum(axis = 1) - dhl_main['ddp charge']
dhl_main['service'] = 'dhl'

# check out the totals
dhl_raw_total = dhl_super_raw['total charge'].sum()
dhl_main_total = dhl_main['service charge'].sum() + dhl_main['ddp charge'].sum()

print('')
print('dhl_super_raw total: ', dhl_raw_total)
print('dhl_main total: ', dhl_main_total)
if np.abs(dhl_raw_total - dhl_main_total) < 1.00:
    print('total charge from dhl_super_raw MATCHES total charge from dhl_main')
else:
    print('your totals DON\'T match!!')

# add other data
# NOTE(review): a shipment number listed with more than one sender
# name/city/country yields several rows here and duplicates its dhl_main row.
# NOTE(review): max() runs on 'shipment date' as read from the CSV, before it
# is parsed as a date - confirm the raw format sorts chronologically.
dhl_main = pd.merge(dhl_main,
                    dhl_super_raw.groupby(['shipment number','senders name','senders city','senders country'], as_index = False)[['shipment date']].max(),
                    how = 'left',
                    on = 'shipment number')

# .str.lower() leaves missing sender values (shipments without a match in the
# left merge) as NaN; the previous x.lower() comprehension raised on them.
for sender_column in ['senders name','senders city','senders country']:
    dhl_main[sender_column] = dhl_main[sender_column].str.lower()

dhl_main['adabox'] = np.where(dhl_main['orders id'].isin(orders_total_main['orders id'][orders_total_main['adabox'] == 'yes']), 'yes','no')

dhl_main.rename(columns = {'shipment date':'date'}, inplace = True)
dhl_main['date'] = pd.to_datetime(dhl_main['date'])
# 'YYYY-MM' label, matching the format of the SQL-derived frames.
dhl_main['year and month'] = dhl_main['date'].dt.strftime('%Y-%m')

dhl_main['shipping charge'] = dhl_main[['service charge','ddp charge']].sum(axis = 1)

# determine incoming/outgoing
# A shipment whose orders id is not in orders_total_main (including the 0
# placeholder) is labelled incoming.
dhl_main['incoming/outgoing'] = np.where(dhl_main['orders id'].isin(orders_total_main['orders id']),
                                         'outgoing',
                                         'incoming')
dhl_main['free shipping'] = np.where(dhl_main['orders id'].isin(orders_total_main['orders id'][orders_total_main['free shipping'] == 'yes']), 'yes','no')

### Check
* service charge is all fees, including weight, and excluding ddp charge
* ddp charge is the sum of any import/export charges
* shipping charge is the total shipping charge (service charge + ddp charge)

In [None]:
dhl_main[['service charge','ddp charge','shipping charge']].head()

In [None]:
dhl_main[dhl_main['orders id'] != 0].iloc[:, 2:23].head()

In [None]:
# Cross-check for DHL rows that carry an orders id: the first figure is
# service charge + ddp charge, the second is the sum of the column totals
# for column positions 2..21.
# NOTE(review): the 2:22 slice is a fixed position range - presumably the
# individual charge columns; confirm it still matches the column layout.
matched_dhl = dhl_main[dhl_main['orders id'] != 0]
totals_check = pd.DataFrame(matched_dhl.iloc[:, 2:22].sum()).rename(columns = {0:'total charge'}).sort_values('total charge', ascending = False)
print('{:,.0f}'.format(matched_dhl['service charge'].sum() + matched_dhl['ddp charge'].sum()))
# Select the column by name; .sum()[0] relied on positional lookup on a
# label-indexed Series, which pandas has deprecated.
print('{:,.0f}'.format(totals_check['total charge'].sum()))

### USPS

In [None]:
# USPS orders purchased inside the reporting window, joined to ship_log for
# the cost (sl_cost); the ddp charge is a constant 0.0 for every row.
# Orders are filtered the same way as in orders_total_main (status not 9/10,
# not a 'Replacement Order').
# The date literals are emitted without surrounding spaces.
usps_main = pd.read_sql(
'''
SELECT
DATE(o.date_purchased) AS 'date',
DATE_FORMAT(o.date_purchased, '%Y-%m') AS 'year and month',
o.orders_id AS 'orders id',
s.sl_cost AS 'service charge',
0.0 AS 'ddp charge'

FROM orders o
JOIN ship_log s ON o.orders_id = s.orders_id

WHERE o.shipping_module_code = 'usps'
AND DATE(o.date_purchased) BETWEEN '{start}' AND '{end}'
AND o.orders_status != 9
AND o.orders_status != 10
AND o.payment_method != 'Replacement Order'
'''.format(start = date_start, end = date_end), db)

usps_main['service'] = 'usps'

# Same derived flags as the UPS and DHL frames.
known_orders = orders_total_main['orders id']
usps_main['incoming/outgoing'] = np.where(usps_main['orders id'].isin(known_orders), 'outgoing','incoming')
usps_main['adabox'] = np.where(usps_main['orders id'].isin(known_orders[orders_total_main['adabox'] == 'yes']), 'yes','no')
usps_main['free shipping'] = np.where(usps_main['orders id'].isin(known_orders[orders_total_main['free shipping'] == 'yes']), 'yes','no')

usps_main['shipping charge'] = usps_main[['service charge','ddp charge']].sum(axis = 1)

In [None]:
usps_main.head()

# Outgoing df

In [None]:
# Per-order carrier charges: total each carrier frame by orders id, then
# stack the three results.
charge_columns = ['service charge','ddp charge','shipping charge']
outgoing_charges_main = pd.concat([carrier_frame.groupby('orders id', as_index = False)[charge_columns].sum()
                                   for carrier_frame in (ups_main, dhl_main, usps_main)])

# Attach the charges to every order; orders without a charge row get NaN.
outgoing_charges = pd.merge(orders_total_main,
                           outgoing_charges_main,
                           how = 'left',
                           on = 'orders id')

# find nulls
# A row counts as null if ANY column is missing, not just the charge columns.
# any(axis = 1): the positional form any(1) is rejected by pandas 2.0+.
outgoing_nulls = outgoing_charges[(outgoing_charges.isnull().any(axis = 1)) & (outgoing_charges['service'].isin(['usps','ups','dhl']))]
print('count of nulls where service is UPS, USPS, or DHL: {:,.0f}'.format(len(outgoing_nulls.index)))

# remove them
# .copy() makes the filtered result an independent frame, so the in-place
# fill and the column assignments below act on it directly rather than on a
# view of the merged frame (SettingWithCopyWarning).
outgoing_charges = outgoing_charges[~outgoing_charges['orders id'].isin(outgoing_nulls['orders id'])].copy()

# fill nulls for sameday/free/other and reseller shipping
outgoing_charges.fillna(0, inplace = True)

# get shipping profit
outgoing_charges['shipping profit'] = outgoing_charges['shipping revenue'] - outgoing_charges['shipping charge']

# get domestic vs international
outgoing_charges['region'] = np.where(outgoing_charges['delivery country'] == 'united states', 'domestic','international')

# convert to datetime
outgoing_charges['date'] = pd.to_datetime(outgoing_charges['date'])

# grab all lines before excluding by date
outgoing_charges_all_lines = outgoing_charges.copy()

# isolate dates
outgoing_charges = outgoing_charges[outgoing_charges['date'].between(date_start,date_end)]

In [None]:
outgoing_nulls[outgoing_nulls['adabox'] == 'no'].groupby('service')[['service']].count()

In [None]:
outgoing_nulls[outgoing_nulls['adabox'] == 'no'].groupby(['year and month','service'])[['service']].count().unstack(1).fillna(0)

# Dict of dfs

In [None]:
services_list = outgoing_charges['service'].unique().tolist()

# One 'YYYY-MM' index entry per month start in the reporting window; used to
# give every summary frame the same rows even when a month has no orders.
dates = pd.DataFrame({'year and month':pd.date_range(date_start, date_end, freq = 'MS')})
dates['year and month'] = [str(x)[:7] for x in dates['year and month']]
dates.set_index('year and month', inplace = True)

def _monthly_summary(orders):
    """Return monthly revenue, charge, profit and order count for `orders`.

    The result is indexed by the months in `dates`; months with no orders
    are filled with 0.
    """
    summary = orders.groupby('year and month').agg({'shipping revenue':'sum',
                                                    'shipping charge':'sum',
                                                    'shipping profit':'sum',
                                                    'orders id':'count'}).rename(columns = {'orders id':'order count'})
    summary = pd.merge(dates, summary, how = 'left', left_index = True, right_index = True)
    return summary.fillna(0)

# Keys: '<service> with free', '<service> without free', 'all with free',
# 'all without free'. "without free" keeps only orders whose
# 'free shipping' flag is 'no'.
paid_shipping = outgoing_charges['free shipping'] == 'no'

outgoing_dfs = {}

for service in services_list:
    is_service = outgoing_charges['service'] == service
    outgoing_dfs[service + ' with free'] = _monthly_summary(outgoing_charges[is_service])
    outgoing_dfs[service + ' without free'] = _monthly_summary(outgoing_charges[is_service & paid_shipping])

# The "all" frames go through the same helper so they share the per-service
# frames' month index; previously they kept only months that had orders,
# while the charts mix "all" and per-service frames by position.
outgoing_dfs['all with free'] = _monthly_summary(outgoing_charges)
outgoing_dfs['all without free'] = _monthly_summary(outgoing_charges[paid_shipping])

### No Exclusions

In [None]:
pd.DataFrame(outgoing_dfs['all with free'].mean()).rename(columns = {0:'monthly mean'})

In [None]:
outgoing_dfs['all with free'].format_(['m0','m0','m0','n0'])

In [None]:
outgoing_dfs['all with free'].pct_change().format_(['p0','p0','p0','p0'])

### No Free Shipping

In [None]:
pd.DataFrame(outgoing_dfs['all without free'].mean()).rename(columns = {0:'monthly mean'})

In [None]:
outgoing_dfs['all without free'].format_(['m0','m0','m0','n0'])

In [None]:
outgoing_dfs['all without free'].pct_change().format_(['p0','p0','p0','p0'])

In [None]:
# Line chart of monthly shipping revenue (left axis) and shipping profit
# (right axis) for orders that paid for shipping.
paid_monthly = outgoing_dfs['all without free']
month_positions = np.arange(len(paid_monthly))
bold_arial = {'fontname': 'Arial', 'fontweight': 'bold'}

all_revenue = plt.figure(figsize = (20,5))
ax = all_revenue.add_subplot(1,1,1)

plot01, = ax.plot(month_positions,
        paid_monthly['shipping revenue'],
        '--o',
        color = 'white',
        linewidth = 5,
        markersize = 10,
        label = 'Shipping Revenue')

# Second y axis sharing the same x axis, for profit.
ax2 = ax.twinx()
plot02, = ax2.plot(month_positions,
        paid_monthly['shipping profit'],
        '--o',
        color = 'black',
        linewidth = 5,
        markersize = 10,
        label = 'Shipping Profit')

# Re-label both y axes as whole dollars.
vals = ax.get_yticks()
ax.set_yticklabels(['${:,.0f}'.format(x) for x in vals],
                  fontsize = 15,
                  **bold_arial)

vals2 = ax2.get_yticks()
ax2.set_yticklabels(['${:,.0f}'.format(x) for x in vals2],
                  fontsize = 15,
                  **bold_arial)

ax.set_ylabel('Revenue',
              fontsize = 25,
              **bold_arial)

ax2.set_ylabel('Profit',
              fontsize = 25,
              rotation = 270,
              labelpad = 25,
              **bold_arial)

# Tick positions and labels come from the frame that was plotted. They were
# previously taken from 'all with free', which can have a different set of
# months and would then mislabel the points.
ax.set_xticks(month_positions)
ax.set_xticklabels([calendar.month_abbr[int(x[-2:])] + ' ' + x[:4] for x in paid_monthly.index],
                  fontsize = 20,
                  rotation = 45,
                  **bold_arial)

ax.set_title('Shipping Revenue and Profit\nExcluding Free Shipping',
              fontsize = 30,
              y = 1.02,
              **bold_arial)
ax.grid(color = 'white')
ax.set_facecolor(colors['color03'])
ax.legend(handles = [plot01,plot02],
         fontsize = 20)

In [None]:
# Grouped bar chart of monthly paid-shipping order counts for UPS, DHL, USPS.
ups_paid = outgoing_dfs['ups without free']
dhl_paid = outgoing_dfs['dhl without free']
usps_paid = outgoing_dfs['usps without free']
bold_arial = {'fontname': 'Arial', 'fontweight': 'bold'}

fig = plt.figure(figsize = (20,5))
ax = fig.add_subplot(1,1,1)

# Positions are sized from the frame that supplies the bar heights (the
# per-service frames all share the `dates` month index). They were sized from
# 'all without free', which makes ax.bar fail if that frame has fewer months.
x = np.arange(len(ups_paid))
y1 = ups_paid['order count']
y2 = dhl_paid['order count']
y3 = usps_paid['order count']

width = 0.30

# Three bars per month, offset by one bar width each.
plot01 = ax.bar(x, y1, width, color = colors['color01'], edgecolor = 'black', label = 'UPS')
plot02 = ax.bar(x + width, y2, width, color = colors['color02'], edgecolor = 'black', label = 'DHL')
plot03 = ax.bar(x + 2 * width, y3, width, color = colors['color04'], edgecolor = 'black', label = 'USPS')

vals = ax.get_yticks()
ax.set_yticklabels(['{:,.0f}'.format(tick) for tick in vals],
                  fontsize = 15,
                  **bold_arial)

ax.set_xticks(x)
ax.set_xticklabels([calendar.month_abbr[int(month[-2:])] + ' ' + month[:4] for month in ups_paid.index],
                  fontsize = 20,
                  rotation = 45,
                  **bold_arial)

ax.set_title('Order Count by Service',
              fontsize = 30,
              y = 1.02,
              **bold_arial)
ax.grid(color = 'white')
ax.legend(handles = [plot01,plot02,plot03], fontsize = 20)
ax.set_facecolor(colors['color03'])

### UPS

In [None]:
pd.DataFrame(outgoing_dfs['ups without free'].mean()).rename(columns = {0:'monthly mean'})

In [None]:
outgoing_dfs['ups without free'].format_(['m0','m0','m0','n0',])

In [None]:
outgoing_dfs['ups without free'].pct_change().format_(['p2','p2','p2','p2',])

In [None]:
outgoing_charges[(outgoing_charges['service'] == 'ups')
                & (outgoing_charges['free shipping'] == 'no')].groupby('year and month')[['shipping revenue','shipping charge','shipping profit']].mean()

In [None]:
outgoing_dfs['ups without free'][['shipping revenue','shipping charge']].plot(figsize = (20,5))

### DHL

In [None]:
pd.DataFrame(outgoing_dfs['dhl without free'].mean()).rename(columns = {0:'monthly mean'})

In [None]:
outgoing_dfs['dhl without free'].format_(['m0','m0','m0','n0',])

In [None]:
outgoing_dfs['dhl without free'].pct_change().format_(['p2','p2','p2','p2'])

In [None]:
outgoing_charges[(outgoing_charges['service'] == 'dhl')
                & (outgoing_charges['free shipping'] == 'no')].groupby('year and month')[['shipping revenue','shipping charge','shipping profit']].mean()

In [None]:
outgoing_dfs['dhl without free'][['shipping revenue','shipping charge']].plot(figsize = (20,5))

### USPS

In [None]:
pd.DataFrame(outgoing_dfs['usps without free'].mean()).rename(columns = {0:'monthly mean'})

In [None]:
outgoing_dfs['usps without free'].format_(['m0','m0','m0','n0',])

In [None]:
outgoing_dfs['usps without free'].pct_change().format_(['p2','p2','p2','p2',])

### Margins

In [None]:
(outgoing_dfs['all without free']['shipping profit']/outgoing_dfs['all without free']['shipping revenue']).mean()

In [None]:
(outgoing_dfs['ups without free']['shipping profit']/outgoing_dfs['ups without free']['shipping revenue']).mean()

In [None]:
(outgoing_dfs['dhl without free']['shipping profit']/outgoing_dfs['dhl without free']['shipping revenue']).mean()

In [None]:
(outgoing_dfs['usps without free']['shipping profit']/outgoing_dfs['usps without free']['shipping revenue']).mean()

### Free Shipping

In [None]:
free_shipping = outgoing_charges[outgoing_charges['free shipping'] == 'yes'].groupby('year and month').agg({'order subtotal':'sum',
                                                                                                           'shipping charge':'sum',
                                                                                                           'orders id':'count'}).rename(columns = {'orders id':'order count'})

In [None]:
pd.DataFrame(free_shipping.mean()).rename(columns = {0:'monthly mean'})

In [None]:
free_shipping.format_(['m0','m0','n0'])

In [None]:
free_shipping.pct_change().format_(['p2','p2','p2'])

In [None]:
outgoing_charges[(outgoing_charges['free shipping'] == 'yes')
                & (outgoing_charges['adabox'] == 'yes')].groupby('year and month').agg({'order subtotal':'sum',
                                                                                                           'shipping charge':'sum',
                                                                                                           'orders id':'count'}).rename(columns = {'orders id':'order count'}).format_(['m0','m0','n0'])

In [None]:
outgoing_charges[(outgoing_charges['free shipping'] == 'yes')
                & (outgoing_charges['adabox'] == 'yes')].groupby('year and month').agg({'order subtotal':'sum',
                                                                                                           'shipping charge':'sum',
                                                                                                           'orders id':'count'}).rename(columns = {'orders id':'order count'}).pct_change().format_(['p2','p2','p2'])

### Inbound

In [None]:
def _incoming_monthly(carrier_frame, label):
    """Return the monthly 'shipping charge' total of a carrier's incoming rows, named `label`."""
    inbound = carrier_frame[carrier_frame['incoming/outgoing'] == 'incoming']
    return inbound.groupby('year and month')['shipping charge'].sum().rename(label)

# Side-by-side monthly inbound cost per carrier. concat(axis = 1) aligns on
# the union of months; the previous chain of left joins started from the UPS
# months, so a month with DHL or USPS inbound charges but no UPS ones was
# dropped.
incoming = pd.concat([_incoming_monthly(ups_main, 'ups charge'),
                      _incoming_monthly(dhl_main, 'dhl charge'),
                      _incoming_monthly(usps_main, 'usps charge')],
                     axis = 1).sort_index()

incoming = incoming.fillna(0)

# Keep the months from one year before date_end's month through date_end's
# month; 'YYYY-MM' strings compare chronologically.
first_month = str((pd.to_datetime(date_end[:7]) - pd.DateOffset(years = 1)).date())[:7]
last_month = date_end[:7]
# .copy() so the new column is added to an independent frame, not a view.
incoming = incoming[(incoming.index >= first_month) & (incoming.index <= last_month)].copy()
incoming['total cost'] = incoming.sum(axis = 1)

In [None]:
pd.DataFrame(incoming.mean()).rename(columns = {0:'monthly mean'})

In [None]:
incoming.format_(['m0','m0','m0','m0'])

In [None]:
incoming.pct_change().format_(['p2','p2','p2','p2'])

# Charts for Limor for Oct 2017 (DEFUNCT)

* add chart showing usps,ups,dhl order count over time

In [None]:
# Grouped bar chart of monthly paid-shipping order counts for UPS, DHL, USPS
# (the section heading marks these charts as defunct).
ups_paid = outgoing_dfs['ups without free']
dhl_paid = outgoing_dfs['dhl without free']
usps_paid = outgoing_dfs['usps without free']
bold_arial = {'fontname': 'Arial', 'fontweight': 'bold'}

fig = plt.figure(figsize = (20,5))
ax = fig.add_subplot(1,1,1)

# Positions are sized from the frame that supplies the bar heights; sizing
# them from 'all without free' makes ax.bar fail if that frame has a
# different number of months than the per-service frames.
x = np.arange(len(ups_paid))
y1 = ups_paid['order count']
y2 = dhl_paid['order count']
y3 = usps_paid['order count']

width = 0.30

plot01 = ax.bar(x, y1, width, color = colors['color01'], edgecolor = 'black', label = 'UPS')
plot02 = ax.bar(x + width, y2, width, color = colors['color02'], edgecolor = 'black', label = 'DHL')
plot03 = ax.bar(x + 2 * width, y3, width, color = colors['color04'], edgecolor = 'black', label = 'USPS')

vals = ax.get_yticks()
ax.set_yticklabels(['{:,.0f}'.format(tick) for tick in vals],
                  fontsize = 15,
                  **bold_arial)

ax.set_xticks(x)
ax.set_xticklabels([calendar.month_abbr[int(month[-2:])] + ' ' + month[:4] for month in ups_paid.index],
                  fontsize = 15,
                  **bold_arial)

ax.set_title('UPS, DHL, and USPS Order Counts\nby month and for all countries',
              fontsize = 30,
              y = 1.02,
              **bold_arial)
ax.legend(handles = [plot01,plot02,plot03])
ax.set_facecolor(colors['color03'])

* add chart showing dhl, ups in canada, EU, and england

In [None]:
dhl_date_start = str(outgoing_charges['date'][outgoing_charges['service'] == 'dhl'].min().date())
dhl_date_start

In [None]:
# Region name -> delivery countries (lower case, as stored in
# outgoing_charges['delivery country']) that make up the region.
countries = {
    'canada': ['canada'],
    'uk': ['united kingdom'],
    'europe': ['france',
               'germany',
               'belgium',
               'ireland',
               'italy',
               'netherlands',
               'spain',
               'portugal'],
}

# Monthly UPS vs DHL order counts per region, from dhl_date_start onwards.
# Each value is a frame indexed by 'year and month' with one column per service.
by_country_for_limor = {}
for region, region_countries in countries.items():
    ups_dhl_orders = outgoing_charges[(outgoing_charges['date'] >= dhl_date_start) & (outgoing_charges['service'].isin(['ups','dhl']))]
    # NOTE(review): this rebinds outgoing_charges_main, the name the
    # 'Outgoing df' cell uses for the stacked carrier charges.
    outgoing_charges_main = ups_dhl_orders[ups_dhl_orders['delivery country'].isin(region_countries)]

    order_count = outgoing_charges_main.groupby(['year and month','service'])[['orders id']].count()
    order_count = order_count.rename(columns ={'orders id':'order count'})
    order_count = order_count.unstack(1).fillna(0)
    # Drop the 'order count' level so the columns are just the service names.
    order_count.columns = order_count.columns.get_level_values(1)

    by_country_for_limor[region] = order_count

In [None]:
# Three stacked bar charts (Canada, UK, Europe) of monthly order counts per
# service, read from by_country_for_limor.
fig = plt.figure(figsize = (20,10))
ax1 = fig.add_subplot(3,1,1)
ax2 = fig.add_subplot(3,1,2)
ax3 = fig.add_subplot(3,1,3)

label_font = {'fontsize':25, 'fontweight':'bold', 'fontname':'Arial'}
tick_font = {'fontsize':15, 'fontweight':'bold', 'fontname':'Arial'}

# Color and legend text are looked up per column, so a region that is missing
# one of the two services is still colored and labelled correctly.
service_colors = {'dhl':colors['color02'], 'ups':colors['color01']}

panels = [(ax1, 'canada', 'Canada'),
          (ax2, 'uk', 'UK'),
          (ax3, 'europe', 'Europe')]

for panel_ax, region, region_label in panels:
    region_counts = by_country_for_limor[region]
    region_counts.plot(ax = panel_ax,
                       kind = 'bar',
                       edgecolor = 'black',
                       color = [service_colors[service] for service in region_counts.columns])

    panel_ax.set_ylabel(region_label, **label_font)

    # Re-label the y ticks matplotlib chose with thousands separators.
    tick_values = panel_ax.get_yticks()
    panel_ax.set_yticklabels(['{:,.0f}'.format(x) for x in tick_values], **tick_font)

    panel_ax.set_facecolor(colors['color03'])
    panel_ax.legend([service.upper() for service in region_counts.columns], fontsize = 20)

ax1.set_title('UPS and DHL Order Counts Since Feb 2017', **label_font)

# Only the bottom panel shows month labels. An empty list hides the labels on
# the upper panels; a one-element list does not match the number of bar ticks
# and is rejected by current matplotlib.
for upper_ax in (ax1, ax2):
    upper_ax.set_xticklabels([])
    upper_ax.set_xlabel('')

# Labels come from the frame that is actually drawn on ax3 (Europe), so they
# always line up with its bars.
ax3.set_xticklabels([calendar.month_abbr[int(str(x)[-2:])] for x in by_country_for_limor['europe'].index],
                   rotation = 0,
                   **tick_font)
ax3.set_xlabel('2017', **label_font)

plt.show()

# Stats for Limor (DEFUNCT)

In [None]:
# One row per UPS ('upsxml') or DHL ('dhlexpress') order purchased between
# 2017-07-26 and 2017-11-29: orders id, lowercased delivery country, and a
# 'service' label ('ups' for upsxml, otherwise 'dhl').
# Orders with orders_status 9 or 10 and 'Replacement Order' payments are excluded.
limor_db = pd.read_sql(
'''
SELECT
orders_id AS 'orders id',
LOWER(delivery_country) AS 'delivery country',
IF(shipping_module_code = 'upsxml', 'ups','dhl') AS 'service'
FROM orders
WHERE DATE(date_purchased) BETWEEN '2017-07-26' AND '2017-11-29'
AND orders_status != 9
AND orders_status != 10
AND payment_module_code != 'Replacement Order'
AND shipping_module_code IN ('upsxml','dhlexpress')
''', db)

In [None]:
# For every DHL order in the 2017-07-26..2017-11-29 window (same status/payment
# exclusions as limor_db), left-join its orders_total rows of class
# 'ot_shipping' and 'ot_ddp'.
# Result columns: 'shipping' and 'ddp' hold the orders id when a matching
# orders_total row exists and NULL when it does not.
# NOTE(review): an order with more than one row of the same class would appear
# more than once here — confirm orders_total has at most one row per class.
percent = pd.read_sql(
'''
SELECT
B.orders_id AS shipping,
C.orders_id AS ddp

FROM

(SELECT
orders_id
FROM orders
WHERE shipping_module_code = 'dhlexpress'
AND DATE(date_purchased) BETWEEN '2017-07-26' AND '2017-11-29'
AND orders_status != 9
AND orders_status != 10
AND payment_module_code != 'Replacement Order') A

LEFT JOIN

(SELECT
orders_id
FROM orders_total
WHERE class = 'ot_shipping') B ON A.orders_id = B.orders_id

LEFT JOIN

(SELECT
orders_id
FROM orders_total
WHERE class = 'ot_ddp') C ON A.orders_id = C.orders_id
''', db)

In [None]:
# Share of DHL orders with a shipping line that also have a DDP line.
# Columns are addressed by label: integer keys on the label-indexed result of
# percent.count() rely on deprecated positional fallback.
shipping_order_count = percent['shipping'].count()
ddp_order_count = percent['ddp'].count()
print('{:,.2f}% of dhl orders received ddp'.format(ddp_order_count / shipping_order_count * 100))

In [None]:
# Keep only the rows of `percent` where neither column is missing.
ddp = percent.dropna(axis = 0, how = 'any')

In [None]:
# --- DHL: outgoing lines whose DDP charge agrees with whether the order has a DDP line ---
ddp_order_ids = ddp['ddp'].tolist()

limor_dhl = dhl_main.copy()
limor_dhl['is ddp?'] = np.where(limor_dhl['orders id'].isin(ddp_order_ids), 'yes','no')

ddp_and_charged = (limor_dhl['is ddp?'] == 'yes') & (limor_dhl['ddp charge'] > 0)
no_ddp_and_not_charged = (limor_dhl['is ddp?'] == 'no') & (limor_dhl['ddp charge'] == 0)
dhl_outgoing = limor_dhl['incoming/outgoing'] == 'outgoing'

limor_dhl = limor_dhl.loc[(ddp_and_charged | no_ddp_and_not_charged) & dhl_outgoing,
                          ['orders id','shipping charge']]

# --- UPS: outgoing lines, shipping charge summed per order ---
ups_outgoing = ups_main[ups_main['incoming/outgoing'] == 'outgoing']
limor_ups = ups_outgoing.groupby('orders id', as_index = False)[['shipping charge']].sum()

# Stack both carriers into one (orders id, shipping charge) table.
limor_dhl_and_ups = pd.concat([limor_dhl, limor_ups])

In [None]:
# Attach the carrier-side shipping charge to each database order, then discard
# every row that still has a missing value (e.g. orders without a charge).
limor = limor_db.merge(limor_dhl_and_ups, how = 'left', on = 'orders id').dropna()

In [None]:
# Distinct delivery countries that have at least one DHL row in `limor`.
shipped_by_dhl = limor['service'] == 'dhl'
limor_countries = limor.loc[shipped_by_dhl, 'delivery country'].unique().tolist()
limor_countries[:5]

In [None]:
# Restrict the comparison to countries that DHL shipped to.
in_dhl_country = limor['delivery country'].isin(limor_countries)
limor = limor.loc[in_dhl_country]

In [None]:
# Number of rows per service remaining in `limor` after the country filter.
limor['service'].value_counts()

In [None]:
# count / mean / sum of 'shipping charge' for each aggregation below
charge_stats = {'shipping charge':['count','mean','sum']}

# Per delivery country, with one column group per service.
by_country = limor.groupby(['delivery country','service']).agg(charge_stats).unstack(1)

# Same statistics per service, restricted to this fixed set of European countries.
europe_countries = ['france',
                    'germany',
                    'belgium',
                    'ireland',
                    'italy',
                    'netherlands',
                    'spain',
                    'portugal']
europe_rows = limor[limor['delivery country'].isin(europe_countries)]
by_europe = europe_rows.groupby('service').agg(charge_stats)

In [None]:
# Preview the first rows of the per-country, per-service shipping-charge statistics.
by_country.head()

In [None]:
# Display the per-service shipping-charge statistics for the European subset.
by_europe

In [None]:
# Earliest DHL purchase date per delivery country, limited to limor_countries
# and ordered by that date.
#
# The country list is bound through driver placeholders instead of being pasted
# in as str(tuple(...)): a one-element tuple renders as ('x',) and an empty one
# as (), both of which are invalid SQL, and names containing quotes are no
# longer spliced into the statement.
# NOTE(review): assumes `db` is backed by a MySQL driver using the %s
# paramstyle (the queries use MySQL functions) — confirm.
if limor_countries:
    in_placeholders = ', '.join(['%s'] * len(limor_countries))
    first_dhl_by_country = pd.read_sql(
'''
SELECT
MIN(DATE(date_purchased)) AS 'min date',
delivery_country AS 'delivery country'
FROM orders
WHERE shipping_module_code = 'dhlexpress'
AND delivery_country IN (''' + in_placeholders + ''')
GROUP BY delivery_country
ORDER BY MIN(DATE(date_purchased))
''', db, params = tuple(limor_countries))
else:
    # Nothing to look up; show an empty table with the same columns.
    first_dhl_by_country = pd.DataFrame(columns = ['min date','delivery country'])

first_dhl_by_country

# Check against Weekly Audit 

In [None]:
# Audit window (both ends inclusive). Change these two values to check another week.
week_start = '2018-01-01'
week_end = '2018-01-15'

# conditions match those in the weekly audit script, for dhl
weekly_check = outgoing_charges_all_lines.copy()

# Derived columns are added on the full copy, before any row filtering, so no
# assignment is made on a filtered slice.
weekly_check['shipping profit'] = weekly_check['shipping revenue'] - weekly_check['shipping charge']
weekly_check['did we charge ddp'] = np.where(weekly_check['ddp revenue'] > 0, 'yes','no')

# The window now uses week_start / week_end; previously the same dates were
# repeated as literals and the two variables were never read.
in_week = weekly_check['date'].between(week_start, week_end)
is_dhl = weekly_check['service'] == 'dhl'

# Keep a row if we did not charge DDP, or if we did and the line carries a
# non-zero ddp charge.
ddp_consistent = (((weekly_check['did we charge ddp'] == 'yes') & (weekly_check['ddp charge'] != 0))
                  | (weekly_check['did we charge ddp'] == 'no'))

weekly_check = weekly_check[in_week & is_dhl & ddp_consistent]

In [None]:
# Total shipping profit (revenue minus charge) over the filtered DHL rows,
# for comparison with the weekly audit figure.
weekly_check['shipping profit'].sum()

# Charts for report

In [None]:
# Grouped bar chart of monthly shipping revenue for UPS, USPS and DHL.
fig, ax = plt.subplots(figsize = (20,5))

# Monthly revenue per service: rows are 'year and month', one column per service.
carrier_rows = outgoing_charges[outgoing_charges['service'].isin(['ups','usps','dhl'])]
chart01 = carrier_rows.groupby(['year and month','service']).agg({'shipping revenue':'sum'}).unstack(1).fillna(0)
chart01.columns = [service.upper() for service in chart01.columns.get_level_values(1)]

bold_arial = {'fontweight':'bold', 'fontname':'Arial'}
bar_colors = [colors['color02'],colors['color03'],colors['color04']]

plot01 = chart01.plot(kind = 'bar', ax = ax, color = bar_colors, edgecolor = 'black')

# x axis: "Mon" over "YYYY", parsed from the 'year and month' index values
month_labels = [calendar.month_abbr[int(str(x)[-2:])] + '\n' + str(x)[:4] for x in chart01.index]
ax.set_xticklabels(month_labels, rotation = 0, fontsize = 15, **bold_arial)
ax.set_xlabel('')

# y axis: dollars shown in thousands
vals = ax.get_yticks()
ax.set_yticklabels(['${:,.0f}'.format(x/1000) for x in vals], fontsize = 20, **bold_arial)
ax.set_ylabel('Thousands', fontsize = 20, **bold_arial)

ax.set_title('UPS, DHL, and USPS\nShipping Revenue', fontsize = 30, y = 1.02, **bold_arial)

ax.set_facecolor(colors['color01'])
ax.legend(fontsize = 15)
ax.grid()
plt.show()

# Numbers check

In [None]:
# Monthly shipping profit per service, leaving out free-shipping rows;
# month/service pairs with no rows become 0.
paid_shipping = outgoing_charges[outgoing_charges['free shipping'] == 'no']
profit_by_month_service = paid_shipping.groupby(['year and month','service'])[['shipping profit']].sum()
numbers = profit_by_month_service.unstack(1).fillna(0)

In [None]:
# Flatten the two-level column header down to the service names, then add a
# per-row total across all services.
numbers.columns = numbers.columns.droplevel(0)
numbers['totals'] = numbers.sum(axis = 1)

In [None]:
# Display monthly shipping profit per service with the row totals.
numbers

# DHL overcharges for DHL Rep

In [None]:
import sys
sys.path.insert(0,'/Users/jarad')

import pandas as pd
import numpy as np
from db2 import *
import xlsxwriter

import datetime as dt
import calendar
from df_format import *

import glob

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 100)
pd.set_option('display.float_format', lambda x: '{:,.4f}'.format(x))


import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# get all CSVs
path = r'/Users/jarad/fake_folder/Shipping/Recurring/DHL Tax Analysis/Docs/DHL Invoices/'
# sorted() makes the row order of the combined frame reproducible between runs
all_files = sorted(glob.glob(path + '/*.csv'))
if not all_files:
    # pd.concat on an empty list fails with an unhelpful message; say what is missing
    raise FileNotFoundError('no DHL invoice CSVs found in ' + path)

dhl_super_raw = pd.concat([pd.read_csv(file_, index_col = None, header = 0) for file_ in all_files])

# fix columns
dhl_super_raw.columns = [x.lower() for x in dhl_super_raw.columns]
dhl_super_raw = dhl_super_raw.rename(columns = {'shipment reference 1':'orders id',
                                                'dest name':'delivery city',
                                                'dest country name':'delivery country'})

# check for dupes
if dhl_super_raw.duplicated().any():
    print('main CSV does have some fully duplicated lines')
else:
    print('main CSV has no fully duplicated lines')

# fix orders id data type: anything that is not a number becomes 0.
# Done as a single assignment: calling fillna(inplace = True) on a selected
# column does not write back to the frame under pandas copy-on-write.
dhl_super_raw['orders id'] = (pd.to_numeric(dhl_super_raw['orders id'], errors = 'coerce')
                              .fillna(0)
                              .astype('int64'))

In [None]:
# Replace every remaining missing value in the invoice data with 0.
dhl_super_raw = dhl_super_raw.fillna(0)

In [None]:
# DHL orders purchased on or after 2017-10-01, with what the store charged:
#   'ada service charge'    - value of the order's 'ot_shipping' orders_total row
#   'ada ddp charge'        - value of the order's 'ot_ddp' row (NULL when absent)
#   'ada total ship charge' - service charge plus ddp charge, with a missing
#                             ddp charge counted as 0
# Both orders_total lookups are LEFT JOINs, so an order without a shipping row
# still appears, with NULL service and total charges.
dhl_db = pd.read_sql(
'''
SELECT
A.orders_id AS 'orders id',
A.delivery_country AS 'delivery country',
B.value AS 'ada service charge',
C.value AS 'ada ddp charge',
B.value + IFNULL(C.value, 0) AS 'ada total ship charge'

FROM

(SELECT
orders_id,
delivery_country
FROM orders
WHERE shipping_module_code = 'dhlexpress'
AND DATE(date_purchased) >= '2017-10-01') A

LEFT JOIN

(SELECT
orders_id,
value 
FROM orders_total
WHERE class = 'ot_shipping') B ON A.orders_id = B.orders_id

LEFT JOIN

(SELECT
orders_id,
value 
FROM orders_total
WHERE class = 'ot_ddp') C ON A.orders_id = C.orders_id
''', db)

In [None]:
# Sum the DHL invoice charges per (shipment number, orders id) and prefix the
# charge columns with 'dhl' so they can sit next to the store-side columns.
charge_renames = {'weight charge':'dhl weight charge',
                  'total extra charges (xc)':'dhl extra charge',
                  'total charge':'dhl total charge'}

charges_per_shipment = dhl_super_raw.groupby(['shipment number','orders id'], as_index = False)[list(charge_renames)].sum()
dhl_by_oid = charges_per_shipment.rename(columns = charge_renames)

In [None]:
# Pair every store-side DHL order with its invoiced charges.
dhl = pd.merge(dhl_db,
              dhl_by_oid,
              how = 'left',
              on = 'orders id')

# Keep only rows with no missing value in any column (orders with no ddp
# charge or no matching invoice line drop out here). dropna() replaces the
# earlier isnull().any(1) test, whose positional axis argument is rejected by
# pandas >= 2.0; .copy() makes the result an independent frame before columns
# are added to it.
dhl = dhl.dropna().copy()

# store-side charge minus DHL-side charge
dhl['ship profit'] = dhl['ada total ship charge'] - dhl['dhl total charge']
dhl['ddp diff'] = dhl['ada ddp charge'] - dhl['dhl extra charge']

In [None]:
# Rows with the smallest 'ddp diff' (store ddp charge minus DHL extra charge) first.
dhl.sort_values('ddp diff').head()

In [None]:
# Row counts before and after the merge, then the 0%..100% quantiles (5% steps)
# of the two difference columns.
quantile_steps = np.arange(0,1.05,0.05)

print(len(dhl_by_oid))
print('')
print(len(dhl))
for diff_column in ('ddp diff', 'ship profit'):
    print('')
    print(dhl[diff_column].quantile(quantile_steps))

In [None]:
# Collect the distinct extra-charge names that mention 'import' from the eight
# 'xcN name' columns. The same name can be collected from more than one column.
import_name_list = []
for i in np.arange(1,9):
    name_col = 'xc' + str(i) + ' name'
    # normalise to lowercase strings so the substring match is case-insensitive
    dhl_super_raw[name_col] = [str(x).lower() for x in dhl_super_raw[name_col]]
    mentions_import = dhl_super_raw[name_col].str.contains('import', na = False)
    # extend() instead of a list comprehension that was run only for its append side effect
    import_name_list.extend(dhl_super_raw[name_col][mentions_import].unique())

In [None]:
# Show the collected import-related charge names in alphabetical order.
sorted(import_name_list)

In [None]:
# Collect the orders ids of invoice lines where any of the eight 'xcN name'
# columns mentions 'import'. An id can be collected more than once.
oid_list = []
for i in np.arange(1,9):
    name_col = 'xc' + str(i) + ' name'
    # normalise to lowercase strings so the substring match is case-insensitive
    dhl_super_raw[name_col] = [str(x).lower() for x in dhl_super_raw[name_col]]
    mentions_import = dhl_super_raw[name_col].str.contains('import', na = False)
    # extend() instead of a list comprehension that was run only for its append side effect
    oid_list.extend(dhl_super_raw['orders id'][mentions_import].unique())

In [None]:
# Narrow dhl to orders that carry an import-related extra charge.
dhl = dhl[dhl['orders id'].isin(oid_list)]

# Rows where the store's ddp charge is below DHL's extra charge, lowest ship
# profit first; keep one row per delivery country and then the first ten.
ddp_shortfall = (dhl['ada ddp charge'] - dhl['dhl extra charge']) < 0
rep_columns = ['shipment number',
               'orders id',
               'delivery country',
               'ada service charge',
               'ada ddp charge',
               'ada total ship charge',
               'dhl total charge',
               'ship profit']

for_rep = dhl[ddp_shortfall].sort_values('ship profit').drop_duplicates('delivery country').head(10).copy()
for_rep = for_rep[rep_columns]
for_rep

### Simple check

In [None]:
# Compare the total ddp charged by the store with the total extra charges
# invoiced by DHL, over the orders currently in `dhl`.
check_oids = dhl['orders id'].tolist()

ada_ddp_sum = dhl_db[dhl_db['orders id'].isin(check_oids)]['ada ddp charge'].sum()
dhl_ddp_sum = dhl['dhl extra charge'].sum()

print('ada ddp charge sum: {:,.0f}'.format(ada_ddp_sum))
# label fixed: it previously read 'dhl ddp sumL'
print('dhl ddp sum: {:,.0f}'.format(dhl_ddp_sum))
print('ada ddp minus dhl ddp: {:,.0f}'.format(ada_ddp_sum - dhl_ddp_sum))