In [None]:
import sys
sys.path.insert(0, '/Users/jarad/fake_folder/Python Libraries')

from jb_libraries import *
%matplotlib inline

import glob
import re

colors = {'facecolor':'#146eb4',
          'color01':'white',
          'color02':'#000000',
          'color03':'#ff9900'}

date_start = '2017-09-01'
date_end = '2018-09-30'

csv_path = r'/Users/jarad/fake_folder/Finance/Recurring/Amazon Monthly Audit/Docs/Statements/'

ym = pd.DataFrame({'year and month':pd.date_range(date_start, date_end, freq = 'MS')})
ym['year and month'] = [str(x)[:7] for x in ym['year and month']]
ym.set_index('year and month', inplace = True)

pretty_dates = list(ym.index)
pretty_dates = [calendar.month_abbr[int(x[-2:])] + '\n' + x[:4] for x in pretty_dates]

### Get database data

In [None]:
lapwa_log_main = pd.read_sql(
'''
SELECT
timestamp,
order_id AS orders_id,
LOWER(action) AS action,
order_reference_id,
amount AS lapwa_log_amount
FROM lapwa_log
''', db)

orders_main = pd.read_sql(
'''
SELECT
DATE(o.date_purchased) AS date_purchased,
DATE_FORMAT(o.date_purchased, '%Y-%m') AS year_and_month_purchased,
o.orders_id,
ot.value AS ot_amount,
os.orders_status_name AS orders_status,
IF(o.orders_id IN (SELECT orders_id FROM orders_deleted),'yes','no') AS deleted
FROM orders o 
JOIN orders_total ot ON o.orders_id = ot.orders_id
AND ot.class = 'ot_total'
JOIN orders_status os ON o.orders_status = os.orders_status_id
WHERE o.payment_module_code = 'lapwa'
''', db)

for df in [lapwa_log_main, orders_main]:
    col_fix(df)
    
lapwa_log_main.sort_values(['order reference id','timestamp'], inplace = True)
lapwa_log_main.drop_duplicates(['orders id','action'], keep = 'last', inplace = True)    

In [None]:
ot = pd.read_sql(
'''
SELECT
DATE_FORMAT(o.date_purchased, '%Y-%m') As year_and_month,
SUM(ot.value) AS monthly_total
FROM orders_total ot
JOIN orders o ON ot.orders_id = o.orders_id
AND DATE(o.date_purchased) BETWEEN ' '''+ date_start +''' ' AND ' '''+ date_end +''' '
AND orders_status NOT IN (8,9,10,11,12,14,15)
AND payment_method != 'Replacement Order'
WHERE ot.class = 'ot_subtotal'
GROUP BY DATE_FORMAT(o.date_purchased, '%Y-%m')
''', db)

col_fix(ot)

### Reconcile tables in database

In [6]:
# assemble main df
db_rec_main = pd.merge(lapwa_log_main, orders_main, how = 'left', on = 'orders id')

# get it by OID
by_oid = db_rec_main.groupby(['orders id','action'])[['ot amount']].sum().unstack(1).fillna(0)
by_oid.columns = by_oid.columns.droplevel(0)
by_oid.reset_index(inplace = True)

# join some info
df = orders_main[['orders status','date purchased','year and month purchased','orders id']]

by_oid = pd.merge(by_oid, df, how = 'left', on = 'orders id')

# get mismatches
mismatch = by_oid[np.abs(by_oid['authorize'] - by_oid['capture']) > 1].copy()
mismatch['difference'] = mismatch['authorize'] - mismatch['capture']

# get current mismatches
mismatch_current = mismatch[mismatch['date purchased'].between(pd.to_datetime(date_start).date(), pd.to_datetime(date_end).date())].groupby(['year and month purchased','orders status'])[['difference']].sum().unstack(1).fillna(0)
mismatch_current.columns = mismatch_current.columns.droplevel(0)
mismatch_current = ym.join(mismatch_current).fillna(0)

In [7]:
print('if any mismatch is NOT "fraud" or "voided" then you must investigate because this means that we authorized but never captured.')

mismatch_current.format_(['m0'] * len(mismatch_current.columns))

if any mismatch is NOT "fraud" or "voided" then you must investigate because this means that we authorized but never captured.


Unnamed: 0_level_0,Fraud - Void,Voided
year and month,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-09,$0,$0
2017-10,$0,$0
2017-11,$0,$0
2017-12,$0,$0
2018-01,$0,$0
2018-02,$0,$0
2018-03,$0,$0
2018-04,$55,$0
2018-05,"$1,029","$1,162"
2018-06,"$2,357","$1,071"


### Get CSV data

In [8]:
# get all CSVs
all_files = glob.glob(csv_path + '/*.csv')
frame = pd.DataFrame()
list_ = []

for file_ in all_files:
    df = pd.read_csv(file_, index_col = None, header = 0)
    list_.append(df)
    
csv_main = pd.concat(list_)    

# clean column headers
columns_list = []
for y in csv_main.columns:
    columns = '_'.join(x.lower() for x in re.findall('[A-Z][^A-Z]*', y))
    columns_list.append(columns)
    
csv_main.columns = columns_list 

# fix column headers
csv_main.columns = [x.replace('_',' ') for x in csv_main.columns]
csv_main.columns = [x.lstrip().rstrip() for x in csv_main.columns]

# convert to datetime and get rid of time units
csv_main['transaction posted date'] = pd.to_datetime(csv_main['transaction posted date'])
csv_main['posted date'] = pd.to_datetime(csv_main['transaction posted date']).dt.date
csv_main['posted date'] = pd.to_datetime(csv_main['posted date'])
csv_main['year and month posted'] = [str(x)[:7] for x in csv_main['posted date']]

# convert to numeric
ls = ['transaction amount',
      'transaction percentage fee',
      'transaction fixed fee',
      'total transaction fee',
      'net transaction amount']

for col in ls:
    try:
        csv_main[col] = csv_main[col].str.replace(',','')
    except:
        pass
    csv_main[col] = pd.to_numeric(csv_main[col])
    
# rename
csv_main.rename(columns = {'amazon order reference id':'order reference id',
                          'transaction type':'action',
                          'transaction amount':'csv amount',
                          'total transaction fee':'csv transaction fee',
                          'net transaction amount':'csv net amount'}, inplace = True)    

# keep only the columns you want
csv = csv_main.copy()
csv = csv[[
     'transaction posted date', 
     'posted date',
     'year and month posted',
     'amazon transaction id',
     'action',
     'order reference id',
     'csv amount',
     'csv transaction fee',
     'csv net amount']]

# change the wording of this label
csv['action'] = np.where(csv['action'] == 'authorization', 'authorize', csv['action'])

# lower case
csv['action'] = [x.lower() for x in csv['action']]

# dupes
print('%i dupe(s)\nremove them' % np.sum(csv.duplicated()))
csv.drop_duplicates(inplace = True)

print('if a line in the CSV has no order reference id, then the actions are:', csv['action'][(csv['order reference id'].isnull())].unique())

5604 dupe(s)
remove them
if a line in the CSV has no order reference id, then the actions are: ['reserve' 'transfer' 'debt']


### Reconcile CSV with database

In [9]:
# we know now that the db tables are reconciled
# so join this db table to the CSV statement
csv_recon_main = pd.merge(csv, lapwa_log_main, how = 'left', on = ['order reference id','action'])

# get some dates
for col in ['date purchased','year and month purchased']:
    csv_recon_main[col] = csv_recon_main['orders id'].map(dict(zip(orders_main['orders id'], orders_main[col])))

# get last 13 months by date_purchased    
csv_recon_main = csv_recon_main[csv_recon_main['posted date'].between(date_start, date_end)].copy()    

# get values of all actions between CSV and db
df = csv_recon_main.groupby(['year and month posted','action'])[['csv amount','lapwa log amount']].sum()

# refunds are negative in the CSV, positive in the db, take the abs value
df['csv amount'] = np.abs(df['csv amount'])

# take the differemce
df['difference'] = df['csv amount'] - df['lapwa log amount']

# now we have the differences between the CSV and the db
# if the difference is small for any given month then just move on
# these are usually manual refunds that are created in some backwards way
df[df.index.get_level_values(0) == date_end[:7]].format_(['m0'] * 3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Csv Amount,Lapwa Log Amount,Difference
year and month posted,action,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-09,authorization,$0,$0,$0
2018-09,capture,"$147,369","$147,369",$0
2018-09,refund,$645,$645,$0
2018-09,transfer,"$139,498",$0,"$139,498"


### Explore for report

In [10]:
# make a copy 
explore = csv.copy()

# get OID
explore['orders id'] = explore['order reference id'].map(dict(zip(lapwa_log_main['order reference id'], lapwa_log_main['orders id'])))

# get some dates
for col in ['date purchased','year and month purchased']:
    explore[col] = explore['orders id'].map(dict(zip(orders_main['orders id'], orders_main[col])))
    
# restrict by date
explore = explore[explore['posted date'].between(date_start, date_end)]

# create one big dataframe
df1 = explore.groupby(['year and month posted','action'])[['csv amount']].sum().unstack(1).fillna(0)
df1.columns = df1.columns.droplevel(0)

df2 = explore.groupby('year and month posted').agg({'csv transaction fee':'sum', 'orders id':'nunique'})
df2.columns = ['fee amount','oid count']

df3 = df1.join(df2)
df3['fees as % of capture'] = (df3['fee amount'] * -1)/df3['capture']
df3['avg capture per oid'] = df3['capture']/df3['oid count']

df4 = explore[explore['action'] == 'refund'].groupby('year and month posted')[['order reference id']].count()
df4.columns = ['refund count']

df5 = df3.join(df4)
df5['avg refunded'] = df5['refund']/df5['refund count']

df6 = df5.copy()

df7 = df6.join(ot.set_index('year and month'))
df7['capture as % of monthly total'] = df7['capture']/df7['monthly total']

m = pd.DataFrame(df7.mean()).rename(columns = {0:'mean'}).T.format_(['m0'] * 9 + ['n0','p2','m2','n0','m2','m0','p2']).T
print(m)
print('\nyoy')
yoy = pd.DataFrame((df7.pct_change(periods = 12).iloc[-1])).format_(['p2']).replace('nan%','')
print(yoy)
print('\n')
df7.format_(['m0'] * 9 + ['n0','p2','m2','n0','m2','m0','p2'])

IndexError: single positional indexer is out-of-bounds

### Refunds

In [None]:
ls = ['orders id','csv amount']
r = explore[(explore['year and month posted'] == date_end[:7])
          & (explore['action'] == 'refund')].sort_values('csv amount').head(3)
r['orders id'] = [int(x) for x in r['orders id']]
r[ls]

### Charts for report

In [None]:
d = {'capture':'amazon revenue',
    'refund':'amazon refunds',
    'fee amount':'amazon fees',
    'avg capture per oid':'amazon avg order value'}

for k,v in d.items():
    fig, ax = plt.subplots(figsize = (20,5))
    
    x = range(len(df))
    y1 = np.abs(df[k])
    y2 = df['oid count']
    
    ax.bar(x, y1, color = colors['color01'], edgecolor = 'black', label = d[k])
    ax.axhline(y1.mean(), color = colors['color03'], ls = '--', lw = 4, label = 'monthly average')
        
    ax.set_title(d[k].title(), fontsize = 20)
    ax.set_facecolor(colors['facecolor'])
    ax.set_yticklabels(['${:,.0f}'.format(x) for x in ax.get_yticks()], fontsize = 15)
    ax.set_xticks(x)
    ax.set_xticklabels(pretty_dates, fontsize = 15)
    ax.grid(color = 'white', alpha = 0.35)
    ax.legend(['Average'], fontsize = 15)
    
    plt.savefig(ax.get_title(), bbox_inches = 'tight')
    plt.show()

### Payment types by count

In [None]:
pd.read_sql(
'''
SELECT
*
FROM orders_status
ORDER BY orders_status_id
''', db)

In [None]:
payment_types = pd.read_sql(
'''
SELECT
DATE_FORMAT(date_purchased, '%Y-%m') AS 'year and month',
payment_method AS 'payment type'
FROM orders
WHERE DATE(date_purchased) BETWEEN ' '''+ date_start +''' ' AND ' '''+ date_end +''' '
AND orders_status NOT IN (8,9,10,11,12,14,15)
''', db)

In [None]:
df = payment_types.groupby(['year and month','payment type'])[['payment type']].count().unstack(1).fillna(0)
df.columns = df.columns.get_level_values(1)
df.sort_values(df.index[-1], ascending = False, axis = 1,inplace = True)
df = df.iloc[-1,:5]

fig, ax = plt.subplots(figsize = (20,5))

x = range(len(df))
y1 = df

ax.bar(x, y1, color = [x for x in list(colors.values())[1:]], edgecolor = 'white', width = 0.5)

ax.set_xticks(x)
ax.set_xticklabels(df.index, rotation = 0,fontsize = 15)

vals = ax.get_yticks()
ax.set_yticklabels(['{:,.0f}'.format(x) for x in vals], fontsize = 15)

ax.set_title('Top 5 Payment Types by Count', fontsize = 20, y = 1.02)
ax.set_facecolor(colors['facecolor'])
ax.grid(color = 'white', alpha = 0.35)

plt.savefig(ax.get_title(), bbox_inches = 'tight')

In [None]:
print('done')