### Some info
From Kelly in email with subject line "CSP Monthly Chargeback/Fraud Report":
* Fraud stopped
  * Fraud - Void (orders_status = 15)


* Fraud not stopped
  * Fraud - Confirmed (orders_status = 9)

### For reference

[CSP Shift Report Google Doc](https://docs.google.com/spreadsheets/d/1RfqZtU-qZY7_Xn1gAZ15LBS2nshO5RBwPLRDmfFTx4M/edit#gid=300944739); note that the numbers are different from the db  
[Features of a Fraud Order Brainstorm](https://docs.google.com/spreadsheets/d/1HBddFw03NCuwJ3P1QPuSFQBaRXluQVA2uPecj38Ardc/edit?usp=sharing)

### Libraries

In [6]:
import sys
sys.path.insert(0,'/Users/jarad/Fake Folder/Python Libraries')

from jb_libraries import *
%matplotlib inline

from osh_data import *

### Script settings

In [7]:
# Export switch: the Excel write cell only runs when this is exactly 'yes'.
r_write = 'yes'

In [8]:
# the first month we started marking orders as "fraud - void" instead of deleting them
# (used as the lower bound for both the status-history query and the orders subset)
fraud_date_start = '2018-05-01' 
# today's date as an ISO string; no cell visible in this notebook reads it
date_end = str(dt.datetime.now().date())

# folder the Excel export is written to; the file name is appended by plain string
# concatenation, so the trailing slash matters
# NOTE(review): absolute, machine-specific path - consider making it configurable
csv_path = '/Users/jarad/Fake Folder/CSP/Recurring/Fraud and Chargeback Report/CSVs/'

### Chart settings

In [9]:
# Load the shared matplotlib stylesheet before any figure is created.
style_sheet_path = '/Users/jarad/Fake Folder/Matplotlib Stylesheets/the_pillars_stylesheet.mplstyle'
plt.style.use(style_sheet_path)

# Presumably the opacity handed to plotting calls; nothing in the visible cells reads it.
alpha = 0.8

# Named palette entries (hex RGB).
adablue01, adablue02 = '#00ffff', '#00a6e9'
adapink, adapurple = '#ff00ff', '#662d91'
adayellow, adagreen = '#ffff00', '#00ff00'

# Repeat the six-color palette ten times so a long series never runs out of colors.
colors = 10 * [adablue01, adablue02, adapink, adayellow, adapurple, adagreen]

*****
# Get orders history data
*****

In [10]:
# Pull every order-status-history row recorded on or after `fraud_date_start`
# and report how long the query took.
s = dt.datetime.now()

# The date is substituted without padding inside the quotes: the previous
# concatenation produced ' 2018-05-01 ' (leading/trailing blanks), which only
# worked because the server tolerates whitespace around date literals.
# `fraud_date_start` is a notebook constant, not user input.
osh_query = '''
SELECT
orders_status_history_id AS osh_id,
DATE(date_added) AS date_added,
orders_id,
orders_status_id,
orders_billing_id
FROM orders_status_history
WHERE LENGTH(orders_id) <= 7
AND orders_id != 0
AND DATE(date_added) >= '{start}'
ORDER BY orders_id DESC
'''.format(start = fraud_date_start)

osh_super_main = pd.read_sql(osh_query, db)

# project helper; later cells address these columns with spaces instead of
# underscores, so it presumably renames the columns in place - TODO confirm
col_fix(osh_super_main)

e = dt.datetime.now()
print(e-s)

0:00:49.269365


In [11]:
# Parse the status date into real timestamps, then derive the reporting periods from it.
added_on = pd.to_datetime(osh_super_main['date added'])
osh_super_main['date added'] = added_on

# jb_dates is a project helper; it receives the parsed dates and the period label.
for x in ('year and month', 'year and quarter'):
    osh_super_main['{} added'.format(x)] = jb_dates(osh_super_main['date added'], x)

### Make a copy

In [12]:
# Work on a copy so the queried frame stays untouched and later cells can be
# re-run without repeating the database pull.
osh_main = osh_super_main.copy()

### Get orders status and billing status names

In [13]:
# Lookup table: order status id -> lower-cased status name.
os_main = pd.read_sql(
'''
SELECT
orders_status_id AS os_id,
LOWER(orders_status_name) AS os_name
FROM orders_status
ORDER BY orders_status_id
''', db)

col_fix(os_main)

# Lookup table: billing status id -> lower-cased status name.
bs_main = pd.read_sql(
'''
SELECT
bs_status_id AS bs_id,
LOWER(bs_status_name) AS bs_name
FROM billing_status
''', db)

col_fix(bs_main)

# Translate the numeric ids on every history row into their names
# (ids missing from a lookup table become NaN).
osh_main['orders status name'] = osh_main['orders status id'].map(dict(zip(os_main['os id'], os_main['os name'])))
osh_main['billing status name'] = osh_main['orders billing id'].map(dict(zip(bs_main['bs id'], bs_main['bs name'])))

# The raw ids are redundant once the names are attached.
# `columns=` replaces the positional axis argument (`drop(ls, 1)`), which
# pandas 2.0 no longer accepts.
ls = ['orders status id','orders billing id']
osh_main = osh_main.drop(columns = ls)

*****
# Get orders data
*****

In [14]:
# Load the order header fields used as model features and time the pull.
s = dt.datetime.now()

orders_query = '''
SELECT
DATE(date_purchased) AS date_purchased,
orders_id,

customers_id,
customers_email_address,

billing_street_address,
billing_state,
billing_country,
billing_postcode,

delivery_street_address,
delivery_state,
delivery_country,
delivery_postcode,
IF(delivery_address_commercial = 0, 'residential','commercial') AS delivery_address_commercial,

ip_address,
ip_mismatch,

IF(orders_reseller = 0 AND orders_super_reseller = 0, 'non reseller','reseller/super') AS customer_type,

payment_method

FROM orders
'''

orders_super_main = pd.read_sql(orders_query, db)

col_fix(orders_super_main)

# A customer id of 0 is treated as a guest checkout.
is_guest = orders_super_main['customers id'] == 0
orders_super_main['purchase type'] = np.where(is_guest, 'guest', 'account')
orders_super_main['date purchased'] = pd.to_datetime(orders_super_main['date purchased'])

# Lower-case every object-typed column so later string comparisons ignore case.
for col in orders_super_main.columns:
    if orders_super_main[col].dtype != 'O':
        continue
    orders_super_main[col] = orders_super_main[col].str.lower()

e = dt.datetime.now()
print(e-s)

0:03:33.492204


In [15]:
# Account creation dates, keyed by customer id (id 0 is excluded by the query).
customers_info_main = pd.read_sql(
'''
SELECT
customers_info_id AS customers_id,
DATE(customers_info_date_account_created) AS date_account_created
FROM customers_info
WHERE customers_info_id != 0
''', db)

col_fix(customers_info_main)

customers_info_main['date account created'] = pd.to_datetime(customers_info_main['date account created'])

# Attach the creation date to each order; customer ids without a match get NaT.
orders_super_main['date account created'] = orders_super_main['customers id'].map(dict(zip(customers_info_main['customers id'], customers_info_main['date account created'])))

# Days between account creation and purchase. Orders without a creation date
# get 0, so they are indistinguishable from a purchase made on the day the
# account was created.
# Built as one expression and assigned back: the previous
# `df[col].fillna(0, inplace = True)` is a chained in-place call that does not
# update the frame under pandas Copy-on-Write, after which astype(int) fails on NaN.
account_age_days = (orders_super_main['date purchased'] - orders_super_main['date account created']).dt.days
orders_super_main['account duration at time of purchased'] = account_age_days.fillna(0).astype(int)

In [16]:
# Order subtotals: the 'ot_subtotal' rows of orders_total.
subtotal_query = '''
SELECT
orders_id,
value AS subtotal
FROM orders_total
WHERE class = 'ot_subtotal'
'''

ot_main = pd.read_sql(subtotal_query, db)

col_fix(ot_main)

# Build an order id -> subtotal lookup; if an id repeats, the last row wins.
subtotal_by_oid = dict(zip(ot_main['orders id'], ot_main['subtotal']))
orders_super_main['subtotal'] = orders_super_main['orders id'].map(subtotal_by_oid)

### Make a copy

In [17]:
# Work on a copy: the clean-up cells below drop rows in place, and the queried
# frame should stay intact so they can be re-run without another database pull.
orders_main = orders_super_main.copy()

### Fix up email addresses

In [18]:
# remove where customer was an adafruit employee
# A literal suffix test replaces the regex '@adafruit.com$', whose unescaped '.'
# matched any character; na = False keeps a missing email from breaking the mask.
is_staff_email = orders_main['customers email address'].str.endswith('@adafruit.com', na = False)

ls = set(orders_main.loc[is_staff_email, 'customers id'])
# an adafruit employee can order as a guest, but we want to keep all guest orders in our data
# (discard instead of remove: no error when no employee ever ordered as a guest)
ls.discard(0)
ls = list(ls)

# Drop every order placed with a staff email or by a customer id tied to one.
is_staff_order = is_staff_email | orders_main['customers id'].isin(ls)
orders_main.drop(orders_main[is_staff_order].index, inplace = True)
orders_main.reset_index(drop = True, inplace = True)

# Sanity check: no staff address may survive the drop.
if orders_main['customers email address'].str.endswith('@adafruit.com', na = False).any():
    raise ValueError('check this')

# remove where email is an empty string
# (only while the count stays small; 100 or more suggests a data problem)
empty = orders_main[orders_main['customers email address'] == '']
if len(empty) < 100:
    print('count of "empty string" emails is x%i' % len(empty))
    orders_main.drop(empty.index.tolist(), inplace = True)
    orders_main.reset_index(drop = True, inplace = True)
else:
    raise ValueError('check your "empty string" email count')

count of "empty string" emails is x71


### Assign unique labels for each person
Regardless of whether they have multiple ids or emails

In [19]:
def link_ids_and_emails(pairs):
    """Group customer ids and emails that belong to the same person.

    Two entries are considered the same person when they are connected through
    any chain of shared ids/emails (id 1 used email a, id 2 also used email a
    and email b, ...). Implemented as a union-find over the (id, email) pairs,
    so it runs in a single pass instead of re-filtering the frame per id.

    Parameters
    ----------
    pairs : iterable of (customer_id, email) tuples

    Returns
    -------
    list of lists, one per person: all of that person's ids followed by all of
    their emails, each in first-seen order.
    """
    parent = {}

    def find(node):
        # locate the representative, then point every visited node straight at it
        root = node
        while parent[root] != root:
            root = parent[root]
        while parent[node] != root:
            parent[node], node = root, parent[node]
        return root

    for id_, email in pairs:
        # tag the nodes so an id can never collide with an email value
        id_node, email_node = ('id', id_), ('email', email)
        parent.setdefault(id_node, id_node)
        parent.setdefault(email_node, email_node)
        id_root, email_root = find(id_node), find(email_node)
        if id_root != email_root:
            parent[email_root] = id_root

    groups = {}
    for node in parent:
        kind, value = node
        bucket = groups.setdefault(find(node), {'id': [], 'email': []})
        bucket[kind].append(value)

    return [g['id'] + g['email'] for g in groups.values()]


# Opt-in: nothing in the visible notebook consumes master_ls, so the cell stays off.
# The previous row-filtering loop was also incorrect: it collected ids where it
# meant emails, removed items from the list it was iterating over, and followed
# links for only two hops.
do_this = 'no'

if do_this == 'yes':

    s = dt.datetime.now()

    # guest orders (customer id 0) are excluded from the linking
    df = orders_main[orders_main['customers id'] > 0][['customers id','customers email address']].drop_duplicates()

    master_ls = link_ids_and_emails(zip(df['customers id'], df['customers email address']))

    e = dt.datetime.now()
    print(e-s)

*****
# Restrict by date
Recall that in May 2018 we started marking orders as "fraud" instead of deleting them; start the data set at this date.
*****

In [20]:
# Keep only orders purchased on or after the fraud-marking start date,
# as an independent frame with a fresh 0..n-1 index.
in_window = orders_main['date purchased'] >= fraud_date_start
orders = orders_main.loc[in_window].copy()
orders.index = pd.RangeIndex(len(orders))

### Flag fraud orders

In [21]:
# An order is labelled fraud if its status history ever shows one of these statuses.
ls = ['fraud - void','fraud - confirmed']
has_fraud_status = osh_main['orders status name'].isin(ls)
oids = osh_main.loc[has_fraud_status, 'orders id']

is_fraud_order = orders['orders id'].isin(oids)
orders['fraud'] = np.where(is_fraud_order, 'yes', 'no')

### Get part data

In [22]:
# Part price list.
parts_query = '''
SELECT
part_id,
products_price
FROM parts
'''

parts_main = pd.read_sql(parts_query, db)

col_fix(parts_main)

# "Top" parts are those priced at or above the q-th price quantile.
q = 0.95
price_cutoff = parts_main['products price'].quantile(q)
top_parts = parts_main.loc[parts_main['products price'] >= price_cutoff, 'part id'].tolist()

In [23]:
# Only order lines from the earliest in-scope order id onward are needed.
min_oid = orders['orders id'].min()

op_query = '''
SELECT
orders_id,
part_id
FROM orders_products
WHERE orders_id >= {}
'''.format(min_oid)

op_main = pd.read_sql(op_query, db)

col_fix(op_main)

# Mark each order line whose part is in the high-price list.
is_top_part = op_main['part id'].isin(top_parts)
op_main['top part'] = np.where(is_top_part, 'yes', 'no')

### Flag top parts

In [24]:
# An order "contains a top part" when at least one of its lines is a top part.
top_part_oids = op_main.loc[op_main['top part'] == 'yes', 'orders id'].tolist()
orders['contains top part'] = np.where(orders['orders id'].isin(top_part_oids), 'yes', 'no')

### Flag parts that have been involved in past fraud orders

In [25]:
# NOTE(review): the part list is built from ALL fraud orders in the data set,
# including the order being scored, so every fraud order that has a line in
# op_main is flagged 'yes' by construction. Confirm whether this should be
# limited to fraud orders placed before each order.

# order ids already labelled as fraud
ls1 = orders.loc[orders['fraud'] == 'yes', 'orders id'].tolist()

# distinct parts that appear on those orders
fraud_lines = op_main['orders id'].isin(ls1)
ls2 = list(set(op_main.loc[fraud_lines, 'part id']))

# mark every order line that carries one of those parts
op_main['fraud part'] = np.where(op_main['part id'].isin(ls2), 'yes', 'no')

# mark every order that has at least one such line
fraud_part_oids = op_main.loc[op_main['fraud part'] == 'yes', 'orders id'].tolist()
orders['contains fraud part'] = np.where(orders['orders id'].isin(fraud_part_oids), 'yes', 'no')

### Flag matching delivery/billing addresses

In [26]:
# "Matching" is decided on the postcode alone; street, state and country are not
# compared. A missing postcode on either side counts as a mismatch.
same_postcode = orders['delivery postcode'].eq(orders['billing postcode'])
orders['matching address'] = np.where(same_postcode, 'yes', 'no')

### Get email domain

In [27]:
# Domain = everything after the last '@' (the whole string when there is no '@').
# Splitting once from the right and taking the last piece per row replaces
# split(expand = True).iloc[:, -1]: expand pads short rows with None, so a single
# address containing two '@' turned every ordinary address's domain into None.
orders['email domain'] = orders['customers email address'].str.rsplit('@', n = 1).str[-1]

*****
# Structure for model
*****

In [28]:
# Identifier, address and raw date columns are left out of the modelling table;
# only the derived features and the fraud label remain.
ls = ['date purchased',
     'orders id',
     'customers id',
     'customers email address',
     'billing street address',
     'billing state',
     'billing country',
     'billing postcode',
     'delivery street address',
     'delivery state',
     'delivery country',
     'delivery postcode',
     'ip address',
     'date account created']

# drop() returns a new frame, so `orders` itself is left untouched.
# `columns=` replaces the positional axis argument (`drop(ls,1)`), which
# pandas 2.0 no longer accepts.
for_model = orders.drop(columns = ls)

In [29]:
# Preview the first rows of the modelling table.
for_model.head()

Unnamed: 0,delivery address commercial,ip mismatch,customer type,payment method,purchase type,account duration at time of purchased,subtotal,fraud,contains top part,contains fraud part,matching address,email domain
0,residential,0,non reseller,paypal,account,146,28.9,no,no,yes,yes,gmail.com
1,residential,1,non reseller,credit card,account,1455,35.0,no,no,yes,yes,rhodesstate.edu
2,residential,1,non reseller,paypal,guest,0,16.75,no,no,yes,no,dougbraun.com
3,residential,1,non reseller,paypal,account,1870,69.8,no,no,yes,yes,gmail.com
4,residential,1,non reseller,amazon payments,account,0,24.3,no,no,yes,yes,innovandistudios.com


In [30]:
# Export the modelling table to Excel when the export switch is on.
if r_write == 'yes':

    title = 'Fraud Detection Data for R.xlsx'

    # The context manager saves and closes the workbook on exit, even if
    # to_excel raises; ExcelWriter.save() was removed in pandas 2.0.
    with pd.ExcelWriter(csv_path + title, engine = 'xlsxwriter') as writer:
        for_model.to_excel(writer, index = False)

*****
# Confusion matrix
*****

### Prepare data

In [31]:
# Status history for the in-scope orders, one row per (order, status) pair.
ls = orders['orders id'].tolist()
in_scope = osh_main['orders id'].isin(ls)
for_rates = osh_main.loc[in_scope].drop_duplicates(['orders id','orders status name']).copy()

# Every in-scope order must appear in the history; otherwise the matrix totals would be off.
v1 = len(orders)
v2 = len(set(for_rates['orders id']))

if v1 != v2:
    raise ValueError('totals do not match')

# Total number of distinct orders; the confusion matrix must sum to this.
t = v2
print('count of unique OIDs: {:,.0f}'.format(t))

count of unique OIDs: 250,336


### True Positive
We say fraud and it's fraud

In [32]:
# "We said fraud" = the order's history contains 'fraud - pending'.
was_flagged = for_rates['orders status name'] == 'fraud - pending'
ls = for_rates.loc[was_flagged, 'orders id'].tolist()

# full status history of the flagged orders
a = for_rates.loc[for_rates['orders id'].isin(ls)]

# one row per order, one column per status, 1 where the order had that status
b = a.groupby(['orders id','orders status name'])[['orders id']].count()
b = b.unstack(1)
b.columns = b.columns.droplevel(0)

# flagged orders that ended up as 'fraud - void'
# NOTE(review): flagged orders that became 'fraud - confirmed' are not counted here - confirm intended
tp = int(b['fraud - void'].sum())

### False positive
We say fraud and it's not fraud

In [33]:
# "We said fraud" = the order's history contains 'fraud - pending'.
was_flagged = for_rates['orders status name'] == 'fraud - pending'
ls = for_rates.loc[was_flagged, 'orders id'].tolist()

# full status history of the flagged orders
a = for_rates.loc[for_rates['orders id'].isin(ls)]

# one row per order, one column per status, 1 where the order had that status
b = a.groupby(['orders id','orders status name'])[['orders id']].count()
b = b.unstack(1)
b.columns = b.columns.droplevel(0)

# flagged orders minus those that were voided as fraud
flagged_total = b['fraud - pending'].sum()
voided_total = b['fraud - void'].sum()
fp = int(flagged_total - voided_total)

### True Negative
We say not fraud and it's not fraud

In [None]:
# "We said not fraud" = the order's history never contains 'fraud - pending'.
was_flagged = for_rates['orders status name'] == 'fraud - pending'
ls = for_rates.loc[was_flagged, 'orders id'].tolist()

# one row per unflagged order, one column per status, 1 where the order had that status
unflagged = for_rates.loc[~for_rates['orders id'].isin(ls)]
a = unflagged.groupby(['orders id','orders status name'])[['orders id']].count().unstack(1)
a.columns = a.columns.droplevel(0)

# unflagged orders that never reached a fraud status
turned_out_fraud = (a['fraud - confirmed'] == 1) | (a['fraud - void'] == 1)
tn = int((~turned_out_fraud).sum())

### False negative
We say not fraud and it's fraud

In [None]:
# "We said not fraud" = the order's history never contains 'fraud - pending'.
was_flagged = for_rates['orders status name'] == 'fraud - pending'
ls = for_rates.loc[was_flagged, 'orders id'].tolist()

# one row per unflagged order, one column per status, 1 where the order had that status
unflagged = for_rates.loc[~for_rates['orders id'].isin(ls)]
a = unflagged.groupby(['orders id','orders status name'])[['orders id']].count().unstack(1)
a.columns = a.columns.droplevel(0)

# unflagged orders that nevertheless ended up confirmed or voided as fraud
turned_out_fraud = (a['fraud - confirmed'] == 1) | (a['fraud - void'] == 1)
fn = int(turned_out_fraud.sum())

### Confusion Matrix

In [None]:
# Rows are what we predicted, columns are what the order really was.
ix = ['prediction - no','prediction - yes']
cols = ['reference - no','reference - yes']

data = [[tn,fn],[fp,tp]]

confusion_matrix = pd.DataFrame(data, index = ix, columns = cols)

# The four cells must account for every order exactly once.
if confusion_matrix.values.sum() != t:
    raise ValueError('your confusion matrix does not sum to total')

In [None]:
# Show the raw counts (rows: prediction, columns: reference).
confusion_matrix

### Make easier to read

In [None]:
# Same counts with plain-language labels for readers.
confusion_matrix2 = confusion_matrix.copy()
confusion_matrix2.index = ['We Said Not Fraud','We Said Fraud']
confusion_matrix2.columns = ['really not fraud','really fraud']

# format_ is a project helper that takes one format code per column;
# 'n0' presumably means a whole number - TODO confirm
fmt = ['n0'] * 2
display(confusion_matrix2.format_(fmt))

### See the proportions

In [None]:
# Each cell as a share of all orders; 'p1' is presumably a one-decimal percentage - TODO confirm
fmt = ['p1'] * 2
confusion_matrix2.div(t).format_(fmt)

In [None]:
# Completion marker: printed only when every cell above ran without raising.
print('done')