### Some notes
We cannot use the following data as features:
* customers_email_address with customers_id because these are not always one-to-one
* customers.customers_info_date_account_created because a customer can check out as a guest for years and only create an account for their latest order, so the account-creation date does not reflect how long they have actually been a customer


Be careful when billing and delivery addresses don't match: the difference could be something as simple as billing_state = 'MA' and delivery_state = 'massachusetts'

### TO DO
* Show that the shipping and billing are the same even though the flag says different
* Show why we need something like "use shipping address as billing", because people make small changes even though the two may be the same
* Show that a single customer can place multiple orders and then one order suddenly gets flagged as fraud (customer id 30068)

### Libraries

In [1]:
import sys
sys.path.insert(0,'/Users/jarad/Fake Folder/Python Libraries')

from jb_libraries import *
%matplotlib inline

import re

### Script settings

In [83]:
# set to 'yes' to write the modelling data set to Excel in the "Send to excel for R" cell;
# any other value skips the export
r_write = 'no'
# folder the Excel export is written to
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere
csv_path = '/Users/jarad/Desktop/Desktop/Portfolio Scripts/Machine Learning to Detect Credit Card Fraud/'

In [3]:
# first month in which fraudulent orders were marked "fraud - void" rather than deleted
fraud_date_start = '2018-05-01'

# today's date as a 'YYYY-MM-DD' string
date_end = dt.datetime.now().date().isoformat()

# three calendar months before date_end, also as a 'YYYY-MM-DD' string
date_end_ts = pd.Timestamp(date_end)
three_months_ago = (date_end_ts - pd.DateOffset(months = 3)).date().isoformat()

### Get orders statuses

In [4]:
# Lookup table of order status ids and their names, ordered by id.
orders_status_sql = '''
SELECT
orders_status_id,
orders_status_name
FROM orders_status
ORDER BY orders_status_id
'''

# NOTE(review): the name `os` would shadow the standard-library `os` module if that is in scope
os = pd.read_sql(orders_status_sql, db)

# col_fix comes from jb_libraries; judging by how columns are referenced afterwards,
# it appears to replace underscores in column names with spaces
col_fix(os)

# show the lookup table
os

Unnamed: 0,orders status id,orders status name
0,1,Pending
1,2,Processing
2,3,Shipped
3,4,Update
4,5,Printed
5,6,Billed
6,7,Payment Received
7,8,Fraud - Pending
8,9,Fraud - Confirmed
9,10,Return


### Get the first OID that we marked as "Fraud - Void" 
Before we started using this flag, fraud orders were deleted instead of being marked

In [5]:
# Smallest order id that ever received status 15 ("Fraud - Void").
# Later queries use it as the lower bound for orders_id.
min_oid_sql = '''
SELECT
MIN(orders_id) AS min_oid
FROM orders_status_history
WHERE orders_status_id = 15 # Fraud - Void
'''

# single-row, single-column frame; later cells read the value with min_oid.values.item()
min_oid = pd.read_sql(min_oid_sql, db)

col_fix(min_oid)

### Get orders data

In [6]:
# time the whole pull-and-clean step; the elapsed time is printed at the end of the cell
s = dt.datetime.now()

# One row per order with orders_id >= min_oid: addresses, customer/account flags,
# payment method, the fraud label (order ever had status 9 or 15) and the order subtotal.
# Subscription ("adabox") orders and orders containing a "partner" product are excluded.
orders_super_main = pd.read_sql(
'''
SELECT
DATE(date_purchased) AS date_purchased,
o.orders_id,

customers_email_address AS email,

billing_street_address,
billing_suburb,
billing_city,
billing_state,
billing_country,
billing_postcode,

delivery_street_address,
delivery_suburb,
delivery_city,
delivery_state,
delivery_country,
delivery_postcode,
IF(delivery_address_commercial = 0, 'residential','commercial') AS delivery_address_commercial,

# here, we reverse the question that the db asks
# so a "yes" for the db is a "no" for us
IF(ip_mismatch = 1, 'no','yes') AS ip_and_shipping_match,

IF(orders_reseller = 0 AND orders_super_reseller = 0, 'non reseller','reseller/super') AS customer_type,
IF(customers_id = 0, 'guest','account') AS account_type,
customers_id,

payment_method,

IF(o.orders_id IN (SELECT 
                   orders_id 
                   FROM orders_status_history 
                   # 9 = Fraud - Confirmed; 15 = Fraud - Void
                   WHERE orders_status_id IN (9,15)), 'yes','no') AS fraud,

ot.value AS subtotal

FROM orders o
JOIN orders_total ot ON o.orders_id = ot.orders_id
AND class = 'ot_subtotal'

WHERE o.orders_id >= '''+ str(min_oid.values.item()) +'''

# exclude adabox
AND o.orders_id NOT IN (SELECT orders_id FROM subscriptions_history)

# exclude partnerships
AND o.orders_id NOT IN (SELECT
                        orders_id
                        FROM orders_products
                        WHERE part_id IN (SELECT part_id FROM products_description WHERE products_name LIKE '%partner%'))

''', db)

# col_fix comes from jb_libraries; the column names used below ('date purchased',
# 'payment method', ...) suggest it replaces underscores with spaces
col_fix(orders_super_main)

# change to datetime
orders_super_main['date purchased'] = pd.to_datetime(orders_super_main['date purchased'])

# lowercase and trim every text (object-dtype) column so values compare consistently
for col in orders_super_main.columns:
    if orders_super_main[col].dtype == 'O':
        orders_super_main[col] = orders_super_main[col].str.lower().str.strip()

# collapse every payment method that mentions "purchase order" into one level
orders_super_main['payment method'] = np.where(orders_super_main['payment method'].str.contains('purchase order'),
                                              'purchase order',
                                              orders_super_main['payment method'])

# lump these payment methods together as 'other'
ls = ['replacement order','gift certificate/coupon','bitpay']
orders_super_main['payment method'] = np.where(orders_super_main['payment method'].isin(ls),
                                              'other',
                                              orders_super_main['payment method'])

# fill in null suburbs with an empty string
# assign the result back: fillna(inplace = True) on a column selection can end up
# filling a temporary object and leaving the frame itself unchanged
for x in ['billing','delivery']:
    orders_super_main[x + ' suburb'] = orders_super_main[x + ' suburb'].fillna('')

e = dt.datetime.now()
print(e-s)

0:00:25.856323


### Get customers data

In [10]:
# One row per customers_info record: customer id, account-creation date and lowercased email.
cust = pd.read_sql(
'''
SELECT

c1.customers_info_id AS customers_id,
DATE(customers_info_date_account_created) AS date_created,
LOWER(c2.customers_email_address) AS email

FROM customers_info c1
LEFT JOIN customers c2 ON c1.customers_info_id = c2.customers_id
''', db)

col_fix(cust)

# structure and fix up email addresses
cust['date created'] = pd.to_datetime(cust['date created'])
cust['email'] = cust['email'].str.strip()

# unusable emails: null, empty, or starting with something other than a lowercase letter or digit
# na = False keeps the regex test strictly boolean; null emails are caught by isnull() anyway
drop = cust[(cust['email'].isnull()) |
            (cust['email'] == '') |
            (cust['email'].str.contains('^[^a-z0-9]', na = False))]

print('{:,.2f}% of emails are nulls, empty strings, or begin with a character\nremove them'.format(len(drop)/len(cust) * 100))
cust.drop(drop.index, inplace = True)
cust.reset_index(drop = True, inplace = True)

# if some email has multiple date_createds, take the middle date between the min and max
cust2 = cust.groupby('email').agg({'date created':['min','max']})
cust2.columns = cust2.columns.droplevel(0)

# midpoint = earliest date + half of the span
# (subtracting the half-span would give a date before the earliest creation date)
cust2['date created'] = cust2['min'] + (cust2['max'] - cust2['min'])/2
cust2['date created'] = pd.to_datetime(cust2['date created']).dt.date

cust2.reset_index(inplace = True)
cust2.drop(columns = ['min','max'], inplace = True)

# map date_created onto each order by email and get account age in days at purchase time
orders_super_main['date created'] = orders_super_main['email'].map(dict(zip(cust2['email'], cust2['date created'])))
orders_super_main['date created'] = pd.to_datetime(orders_super_main['date created'])

orders_super_main['account age'] = (orders_super_main['date purchased'] - orders_super_main['date created']).dt.days

# orders with no matching email, or placed before the mapped creation date, get an age of 0
orders_super_main['account age'] = np.where((orders_super_main['account age'] < 0) | (orders_super_main['account age'].isnull()),
                                           0,
                                           orders_super_main['account age'])

# helper column only; not kept in the data set
orders_super_main.drop(columns = 'date created', inplace = True)

# stop here if any row still has a null in any column
if orders_super_main[orders_super_main.isnull().any(axis = 1)].empty == False:
    raise ValueError('check ur nulls')

0.08% of emails are nulls, empty strings, or begin with a character
remove them


### Make a copy
So that you can get the original data set without querying

In [11]:
# work on a copy so orders_super_main stays as queried and this cell can be re-run
# to reset orders_main without hitting the database again
orders_main = orders_super_main.copy()

### Structure orders data

Flag matching addresses

In [12]:
# address parts compared between billing and delivery; suburb and city are not part of the comparison
address_parts = ['street address','state','country','postcode']

# build "street, state, country, postcode" strings with vectorised concatenation
# (same text as joining the four columns row by row with ', ')
a1 = (orders_main['billing street address'] + ', ' +
      orders_main['billing state'] + ', ' +
      orders_main['billing country'] + ', ' +
      orders_main['billing postcode'])
orders_main['billing address'] = a1

a2 = (orders_main['delivery street address'] + ', ' +
      orders_main['delivery state'] + ', ' +
      orders_main['delivery country'] + ', ' +
      orders_main['delivery postcode'])
orders_main['delivery address'] = a2

# exact string comparison: any difference in spelling or formatting counts as a mismatch
orders_main['billing and shipping match'] = np.where(a1 == a2, 'yes','no')

# drop these; don't need them anymore
# (columns = ... instead of a positional axis argument, which newer pandas rejects)
orders_main.drop(columns = [x + ' ' + y
                            for x in ['delivery','billing']
                            for y in address_parts],
                 inplace = True)

Extract email domain

In [13]:
# For each email: remove everything up to and including the last '@',
# then keep the text before the first '.' of what is left (e.g. 'gmail' from 'a@gmail.com').
orders_main['email domain'] = [
    re.sub('.*@','',address).split('.')[0]
    for address in orders_main['email'].tolist()
]

Get the proportion of characters in the email user name (the part before the "@") that are not letters, i.e. digits and special characters.  
The hypothesis is that fraud orders have a higher proportion of digits and special characters.

In [14]:
# get entire email address
email = orders_main['email'].tolist()

# exclude the domain
user = [re.sub('@.*','',x) for x in email]

# get length of entire user name
length1 = [len(x) for x in user]

# get count of numbers and non-letters
length2 = [len(re.sub('[a-z]','',x)) for x in user]

# get (count of numbers and non-letters) / (count of all characters)
proportion = [(x/y) for x,y in zip(length2, length1)]

orders_main['non letter proportion'] = proportion

### Get product data

In [15]:
# Average price paid per purchased unit for each order:
# revenue / quantity bought, counting only paid (non-free) units of products with a price above 0.
op_main = pd.read_sql(
'''
SELECT

A.orders_id,
A.revenue/A.qty_bought AS avg_part_price

FROM

(SELECT
orders_id,
SUM(products_quantity - products_quantity_free) AS qty_bought,
SUM((products_quantity - products_quantity_free) * products_price) AS revenue
FROM orders_products
WHERE orders_id >= '''+ str(min_oid.values.item()) +'''
AND products_price > 0
GROUP BY orders_id) A

GROUP BY A.orders_id
''', db)

col_fix(op_main)

# map avg part price
orders_main['avg part price'] = orders_main['orders id'].map(dict(zip(op_main['orders id'], op_main['avg part price'])))

# check nulls: a handful of orders without a price is tolerated and dropped,
# anything beyond this threshold is treated as a data problem
max_null_prices = 50

nulls = np.sum(orders_main['avg part price'].isnull())
if nulls < max_null_prices:
    print('%i nulls will be dropped' % nulls)
    orders_main.dropna(subset = ['avg part price'], inplace = True)
    orders_main.reset_index(drop = True, inplace = True)
else:
    # do not carry on silently with nulls left in the feature
    raise ValueError('check ur nulls: %i orders have no avg part price' % nulls)

34 nulls will be dropped


### Send to excel for R

In [81]:
# Export the modelling data set to an Excel workbook for use in R.
# Runs only when r_write is set to 'yes'.
if r_write == 'yes':

    title = 'Fraud Data for R.xlsx'

    # the context manager writes and closes the workbook when the block exits,
    # replacing the separate writer.save() call
    with pd.ExcelWriter(csv_path + title, engine = 'xlsxwriter') as writer:
        orders_main.to_excel(writer, index = False)

In [82]:
# simple progress marker: prints 'done' when execution reaches this cell
print('done')

done


# Create confusion matrix
We'll use this as a benchmark for our models

### Get historical data

In [18]:
# Status history (order id + lowercased status name) for order ids from min_oid upwards.
status_history_sql = '''
SELECT

orders_id,
LOWER(orders_status_name) AS orders_status_name

FROM orders_status_history osh
JOIN orders_status os ON osh.orders_status_id = os.orders_status_id

WHERE LENGTH(orders_id) <= 7
AND orders_id != 0
AND orders_id >= ''' + str(min_oid.values.item()) + '''
'''

osh_main = pd.read_sql(status_history_sql, db)

col_fix(osh_main)

# keep only the orders that are also in orders_main
in_orders_main = osh_main['orders id'].isin(orders_main['orders id'].tolist())
osh_main = osh_main[in_orders_main].reset_index(drop = True)

# keep one row per (order, status) pair
osh_main = osh_main.drop_duplicates(['orders id','orders status name']).reset_index(drop = True)

# both frames must now cover exactly the same set of order ids
s1 = set(osh_main['orders id'])
s2 = set(orders_main['orders id'])
s3 = s1 ^ s2

if s3:
    raise ValueError('check ur OIDs')

total_oids = len(s1)
print('there are {:,.0f} total unique OIDs'.format(total_oids))

there are 273,779 total unique OIDs


In [19]:
#=========================
# shared inputs
# "we say fraud" = the order ever had status "fraud - pending"
#=========================

ls = osh_main[osh_main['orders status name'] == 'fraud - pending']['orders id'].tolist()
flagged = osh_main['orders id'].isin(ls)

# b: flagged orders, a: all other orders
# one row per order and one column per status the order ever had; osh_main holds one row
# per (order, status) pair, so each cell is 1 if the order had that status and NaN otherwise
b = osh_main[flagged].groupby(['orders id','orders status name'])[['orders id']].count().unstack(1)
b.columns = b.columns.droplevel(0)

a = osh_main[~flagged].groupby(['orders id','orders status name'])[['orders id']].count().unstack(1)
a.columns = a.columns.droplevel(0)

#=========================
# true positive
# we say fraud, it's really fraud
#=========================

# NOTE(review): only "fraud - void" counts as real fraud here, while the false negatives
# below count "fraud - confirmed" OR "fraud - void" - confirm that this difference is intended
tp = int(b['fraud - void'].sum())

#=========================
# false positive
# we say fraud, it's not really fraud
#=========================

fp = int(b['fraud - pending'].sum() - b['fraud - void'].sum())

#=========================
# false negative
# we say not fraud, it's really fraud
#=========================

fn = len(a[(a['fraud - confirmed'] == 1) | (a['fraud - void'] == 1)])

#=========================
# true negative
# we say not fraud, it's really not fraud
#=========================

# every unflagged order that is not a false negative
tn = int(len(a) - fn)

#=========================
# confusion matrix
#=========================

cols = ['reference - no','reference - yes']
ix = ['prediction - no','prediction - yes']

data = [[tn,fn],[fp,tp]]

confusion_matrix = pd.DataFrame(columns = cols, index = ix, data = data)

# the four cells must account for every order exactly once
if confusion_matrix.sum(axis = 1).sum() != total_oids:
    raise ValueError('your confusion matrix does not sum to total')

#=========================
# make it pretty
#=========================

confusion_matrix2 = confusion_matrix.copy()
confusion_matrix2.columns = ['Really Not Fraud','Really Fraud']
confusion_matrix2.index = ['We Said Not Fraud','We Said Fraud']

# format_ is not a pandas method; presumably a formatting helper added by jb_libraries
fmt = ['n0','n0']
display(confusion_matrix2.format_(fmt))

#=========================
# some metrics
#=========================

accuracy = (tp + tn)/(tp + tn + fp + fn)
print('\naccuracy is {:,.2f}%\n'.format(accuracy * 100))

# of all the true frauds, this is the proportion you got right
sensitivity = tp/(tp+fn)
print('sensitivity (true positive rate) is {:,.2f}%'.format(sensitivity * 100))

# of all the true not-frauds, this is the proportion you got right
specificity = tn/(tn+fp)
print('specificity (true negative rate) is {:,.2f}%\n'.format(specificity * 100))

# of all your fraud predictions, this is the proportion you got correct
pos_pred = tp/(tp+fp)
print('positive predicted value (tp/(tp+fp)) is {:,.2f}%'.format(pos_pred * 100))

# of all the not-fraud predictions, this is the proportion you got correct
neg_pred = tn/(tn+fn)
print('negative predicted value (tn/(tn+fn)) is {:,.2f}%'.format(neg_pred * 100))

Unnamed: 0,Really Not Fraud,Really Fraud
We Said Not Fraud,270704,231
We Said Fraud,2554,290



accuracy is 98.98%

sensitivity (true positive rate) is 55.66%
specificity (true negative rate) is 99.07%

positive predicted value (tp/(tp+fp)) is 10.20%
negative predicted value (tn/(tn+fpn) is 99.91%


In [20]:
print('with our current method of fraud identification:')

# average subtotal of the orders labelled as fraud, used as the dollar value of one fraud order
avg_fraud_subtotal = orders_main[orders_main['fraud'] == 'yes'][['subtotal']].mean().item()

# false negatives: fraud orders the current process did not flag
lost = fn * avg_fraud_subtotal
print('lost: ${:,.0f}'.format(lost))

# true positives: fraud orders the current process did flag
saved = tp * avg_fraud_subtotal
print('saved: ${:,.0f}'.format(saved))

with our current method of fraud identification:
lost: $77,528
saved: $97,330


In [21]:
# simple progress marker: prints 'done' when execution reaches this cell
print('done')

done
