### Load Libraries

In [26]:
import sys
sys.path.insert(0,'/Users/jarad/Fake Folder/Python Libraries')

from jb_libraries import *
%matplotlib inline

import re

### Script settings

In [51]:
r_write = 'no'
csv_path = '/Users/jarad/Desktop/Desktop/Portfolio Scripts/Machine Learning/'

In [28]:
# the first month we started marking orders as "fraud - void" instead of deleting them
fraud_date_start = '2018-05-01' 
date_end = str(dt.datetime.now().date())

three_months_ago = str((pd.to_datetime(date_end) - pd.DateOffset(months = 3)).date())

### Get orders statuses

In [29]:
os = pd.read_sql(
'''
SELECT
orders_status_id,
orders_status_name
FROM orders_status
ORDER BY orders_status_id
''', db)

col_fix(os)
os

Unnamed: 0,orders status id,orders status name
0,1,Pending
1,2,Processing
2,3,Shipped
3,4,Update
4,5,Printed
5,6,Billed
6,7,Payment Received
7,8,Fraud - Pending
8,9,Fraud - Confirmed
9,10,Return


### Get the first OID that we marked as "Fraud - Void" 
We deleted OIDs before we started using this fraud flag.

In [31]:
min_oid = pd.read_sql(
'''
SELECT
MIN(orders_id) AS min_oid
FROM orders_status_history
WHERE orders_status_id = 15 # Fraud - Void
''', db)

col_fix(min_oid)

### Get orders data

In [32]:
s = dt.datetime.now()

orders_super_main = pd.read_sql(
'''
SELECT
DATE(date_purchased) AS date_purchased,
o.orders_id,

customers_email_address AS email,

billing_name,
billing_company,
billing_street_address,
billing_suburb,
billing_city,
billing_state,
billing_country,
billing_postcode,

delivery_name,
delivery_company,
delivery_street_address,
delivery_suburb,
delivery_city,
delivery_state,
delivery_country,
delivery_postcode,
IF(delivery_address_commercial = 0, 'residential','commercial') AS delivery_address_type,

# here, we reverse the question that the db asks
# so a "yes" for the db is a "no" for us
IF(ip_mismatch = 1, 'no','yes') AS ip_and_shipping_match,

IF(orders_reseller = 0 AND orders_super_reseller = 0, 'non reseller','reseller/super') AS customer_type,
IF(customers_id = 0, 'guest','account') AS account_type,
customers_id,

payment_method,

IF(o.orders_status IN (9,15), 'yes','no') AS fraud, # 9 = Fraud - Confirmed; 15 = Fraud - Void

ot.value AS subtotal,

LOWER(os.orders_status_name) AS orders_status_name

FROM orders o
JOIN orders_total ot ON o.orders_id = ot.orders_id
AND class = 'ot_subtotal'
JOIN orders_status os ON o.orders_status = os.orders_status_id

WHERE o.orders_id >= '''+ str(min_oid.values.item()) +'''

# exclude adabox
AND o.orders_id NOT IN (SELECT orders_id FROM subscriptions_history)

# exclude partnerships
AND o.orders_id NOT IN (SELECT
                        orders_id
                        FROM orders_products
                        WHERE part_id IN (SELECT part_id FROM products_description WHERE products_name LIKE '%partner%'))

''', db)

col_fix(orders_super_main)

e = dt.datetime.now()
print(e-s)

0:00:29.176441


### Make a copy

In [33]:
orders_main = orders_super_main.copy()

### Get running count of all orders per customer

In [None]:
s = dt.datetime.now()

for_count = pd.read_sql(
'''
SELECT
DISTINCT customers_email_address AS email,
orders_id,
DATE(date_purchased) AS date_purchased,

(SELECT 
COUNT(*)
FROM orders o2
WHERE o1.customers_email_address = o2.customers_email_address
AND o2.date_purchased <= o1.date_purchased) AS order_count

FROM orders o1

''', db)

col_fix(for_count)

e = dt.datetime.now()
print(e-s)

orders_main = pd.merge(orders_main, for_count, how = 'left', on = ['date purchased','orders id','email'])

### Get a count of all types of past payment methods per customer

In [140]:
s = dt.datetime.now()

for_count2 = pd.read_sql(
'''
SELECT
LOWER(customers_email_address) AS email,
LOWER(payment_method) AS payment_method,
COUNT(payment_method) AS count
FROM orders
GROUP BY LOWER(email), LOWER(payment_method)
''', db)

col_fix(for_count2)


e = dt.datetime.now()
print(e-s)

# reshape
for_count2 = for_count2.groupby(['email','payment method'])[['count']].sum().unstack(1)
for_count2.columns = for_count2.columns.droplevel(0)

# map to main data
for col in for_count2.columns:
    orders_main['past ' + col] = orders_main['email'].map(dict(zip(for_count2.index.tolist(), for_count2[col])))

# drop columns of all nulls
orders_main.dropna(how = 'all', axis = 1, inplace = True)

# fill nulls with zero
orders_main.fillna(0, inplace = True)

### Structure data

In [35]:
# change to datetime
orders_main['date purchased'] = pd.to_datetime(orders_main['date purchased'])

# make everything lowercase
for col in orders_main.columns:
    if orders_main[col].dtype == 'O':
        orders_main[col] = orders_main[col].str.lower()
        orders_main[col] = orders_main[col].str.strip()
        
# fix this payment method        
orders_main['payment method'] = np.where(orders_main['payment method'].str.contains('purchase order'),
                                               'purchase order',
                                               orders_main['payment method'])

# fix these payment methods
ls = ['replacement order','gift certificate/coupon','bitpay']
orders_main['payment method'] = np.where(orders_main['payment method'].isin(ls),
                                              'other',
                                               orders_main['payment method'])

# fill in these nulls
for x in ['billing','delivery']:
    orders_main[x + ' suburb'].fillna('',inplace = True)
    
# this was a mistake, it shouldn't be fraud, fix it here
# https://volcano.adafruit.com/volcano/Order_Status.php?oid=1949181
orders_main['fraud'] = np.where(orders_main['email'] == 'backorders@mouser.com', 
                                      'no',
                                       orders_main['fraud'])

# remove these order stauses
# keep in "returns" and "replaced defective" because these are valid orders
remove = ['voided', # not a valid order
          'pending',  # we do not yet know if this order is fraud or not
          'processing', # we do not yet know if this order is fraud or not
          'fraud - pending'] # we do not yet know if this order is fraud or not

orders_main.drop(orders_main[orders_main['orders status name'].isin(remove)].index, inplace = True)
orders_main.reset_index(drop = True, inplace = True)

### Check customer type
If no Resellers have fraud orders, drop all orders from resellers, and drop customer_type column

In [36]:
df = orders_main[(orders_main['customer type'] == 'reseller') & (orders_main['fraud'] == 'yes')]

if df.empty:
    print('no resellers have fraud orders\ndrop all reseller orders')
    orders_main.drop(orders_main[orders_main['customer type'] == 'reseller'].index, inplace = True)
    orders_main.reset_index(drop = True, inplace = True)
    orders_main.drop('customer type',1,inplace = True)
else:
    pass

no resellers have fraud orders
drop all reseller orders


### Fix Fraud flag
These are customers who have placed more than one order in their lifetime (going by the email address), and one or more orders are fraud. If a customer like this has a single fraud order among their many other orders, change it to "not fraud"; if they have more than one fraud order among their many other orders, change all of their orders to "fraud". Note that this change is retroactive as well as proactive.

In [37]:
# get some counts before the change
vc1 = pd.DataFrame(orders_main['fraud'].value_counts())
print('fraud distribution before change')
display(vc1.format_(['n0']))

# find the emails which have multiple orders and one or some are fraud
df = orders_main.groupby(['email','fraud']).size().unstack()
df = df[(df['no'].isnull() == False) & (df['yes'].isnull() == False)]
df['total'] = df.sum(1)
df.sort_values('total', ascending = False, inplace = True)

# set this up for the function below
ix1 = df[df['yes'] == 1].index.tolist()
ix2 = df[df['yes'] > 1].index.tolist()

# fix applicable flags
def flag_fix(df):
    if df['email'] in ix1: # if customer has only one fraud order our of X total orders, assume it's not fraud
        return 'no'
    elif df['email'] in ix2: # if customer has more than one fraud order our of X total orders, assume they are all fraud
        return 'yes'
    else:
        return df['fraud'] # if customer does not have a mix-and-match fraud order history, return original flag

orders_main['fraud'] = orders_main.apply(flag_fix, axis = 1)

# get some counts after the change
vc2 = pd.DataFrame(orders_main['fraud'].value_counts())
print('fraud distribution after change')
display(vc2.format_(['n0']))

# count the change
change = (vc1-vc2).loc['yes'].item()
print('\n{:,.0f} fraud flags have been changed to not-fraud'.format(change))

fraud distribution before change


Unnamed: 0,Fraud
no,287466
yes,563


fraud distribution after change


Unnamed: 0,Fraud
no,287560
yes,469



94 fraud flags have been changed to not-fraud


### Get customers data

In [38]:
cust = pd.read_sql(
'''
SELECT

c1.customers_info_id AS customers_id,
DATE(customers_info_date_account_created) AS date_created,
LOWER(c2.customers_email_address) AS email

FROM customers_info c1
LEFT JOIN customers c2 ON c1.customers_info_id = c2.customers_id
''', db)

col_fix(cust)

# structure and fix up email addresses
cust['date created'] = pd.to_datetime(cust['date created'])
cust['email'] = cust['email'].str.strip()

drop = cust[(cust['email'].isnull()) |
            (cust['email'] == '') |
            (cust['email'].str.contains('^[^a-z0-9]'))]

print('{:,.2f}% of emails are nulls, empty strings, or begin with a character\nremove them'.format(len(drop)/len(cust) * 100))
cust.drop(drop.index, inplace = True)
cust.reset_index(drop = True, inplace = True)

# if some email has multiple date_createds, take the middle date between the min and max
cust2 = cust.groupby('email').agg({'date created':['min','max']})
cust2.columns = cust2.columns.droplevel(0)

cust2['date created'] = cust2.iloc[:,0] - (cust2.iloc[:,-1] - cust2.iloc[:,0])/2
cust2['date created'] = pd.to_datetime(cust2['date created']).dt.date

cust2.reset_index(inplace = True)
cust2.drop(['min','max'],1,inplace = True)

# map date_created and get account age
orders_main['date created'] = orders_main['email'].map(dict(zip(cust2['email'], cust2['date created'])))
orders_main['date created'] = pd.to_datetime(orders_main['date created'])

orders_main['account age'] = (orders_main['date purchased'] - orders_main['date created']).dt.days
orders_main['account age'] = np.where((orders_main['account age'] < 0) | (orders_main['account age'].isnull()),
                                           0,
                                           orders_main['account age'])

orders_main.drop('date created',1,inplace = True)

if orders_main[orders_main.isnull().any(1)].empty == False:
    raise ValueError('check ur nulls')

0.09% of emails are nulls, empty strings, or begin with a character
remove them


### Get product data

In [39]:
op_main = pd.read_sql(
'''
SELECT
orders_id,
part_id,
products_quantity - products_quantity_free AS products_quantity,
products_price
FROM orders_products
WHERE orders_id >= '''+ str(min_oid.values.item()) +'''
''', db)

col_fix(op_main)

Get avg part price per order

In [40]:
# drop where price is zero
op_main.drop(op_main[op_main['products price'] == 0].index, inplace = True)
op_main.reset_index(drop = True, inplace = True)

# get revenue per line
op_main['revenue'] = op_main['products price'] * op_main['products quantity']

# get avg part price per order
df = op_main.groupby('orders id')[['revenue','products quantity']].sum()
df['avg part price'] = df['revenue']/df['products quantity']

# map avg part price
orders_main['avg part price'] = orders_main['orders id'].map(dict(zip(df.index, df['avg part price'])))

Get proportion of parts per order that are involved in fraud

In [41]:
# get all OIDs involved in fraud
oids = orders_main[orders_main['fraud'] == 'yes']['orders id'].tolist()

# get all parts involved in fraud
parts = list(set(op_main[op_main['orders id'].isin(oids)]['part id']))

# flag all parts involved in fraud
op_main['fraud part'] = np.where(op_main['part id'].isin(parts), 'yes','no')

# get the proportion of parts per order that are involved in fraud
df = op_main.groupby(['orders id','fraud part'])[['part id']].count().unstack().fillna(0)
df.columns = df.columns.droplevel(0)

df['fraud part proportion'] = df['yes']/df.sum(1)

# map it
orders_main['fraud part proportion'] = orders_main['orders id'].map(dict(zip(df.index, df['fraud part proportion'])))

# get some stats
l1 = len(set(op_main[op_main['fraud part'] == 'yes']['part id']))
l2 = len(set(op_main['part id']))

print('{:,.0f} out of {:,.0f} parts, or {:,.1f}% of all parts, have been involved in fraud orders'.format(l1, l2, l1/l2*100))

868 out of 3,254 parts, or 26.7% of all parts, have been involved in fraud orders


### Continue to structure data

Change state abbreviations to names

In [42]:
# from here: http://code.activestate.com/recipes/577305-python-dictionary-of-us-states-and-territories/
states = {
        'AK': 'Alaska',
        'AL': 'Alabama',
        'AR': 'Arkansas',
        'AS': 'American Samoa',
        'AZ': 'Arizona',
        'CA': 'California',
        'CO': 'Colorado',
        'CT': 'Connecticut',
        'DC': 'District of Columbia',
        'DE': 'Delaware',
        'FL': 'Florida',
        'GA': 'Georgia',
        'GU': 'Guam',
        'HI': 'Hawaii',
        'IA': 'Iowa',
        'ID': 'Idaho',
        'IL': 'Illinois',
        'IN': 'Indiana',
        'KS': 'Kansas',
        'KY': 'Kentucky',
        'LA': 'Louisiana',
        'MA': 'Massachusetts',
        'MD': 'Maryland',
        'ME': 'Maine',
        'MI': 'Michigan',
        'MN': 'Minnesota',
        'MO': 'Missouri',
        'MP': 'Northern Mariana Islands',
        'MS': 'Mississippi',
        'MT': 'Montana',
        'NA': 'National',
        'NC': 'North Carolina',
        'ND': 'North Dakota',
        'NE': 'Nebraska',
        'NH': 'New Hampshire',
        'NJ': 'New Jersey',
        'NM': 'New Mexico',
        'NV': 'Nevada',
        'NY': 'New York',
        'OH': 'Ohio',
        'OK': 'Oklahoma',
        'OR': 'Oregon',
        'PA': 'Pennsylvania',
        'PR': 'Puerto Rico',
        'RI': 'Rhode Island',
        'SC': 'South Carolina',
        'SD': 'South Dakota',
        'TN': 'Tennessee',
        'TX': 'Texas',
        'UT': 'Utah',
        'VA': 'Virginia',
        'VI': 'Virgin Islands',
        'VT': 'Vermont',
        'WA': 'Washington',
        'WI': 'Wisconsin',
        'WV': 'West Virginia',
        'WY': 'Wyoming'
}

# change to lowercase
states = {k.lower(): v.lower() for k,v in states.items()}

# change abbreviations to state names
def state_fix(df):
    for prefix in ['billing','delivery']:
        if df[prefix + ' country'] == 'united states' and df[prefix + ' state'] in list(states.keys()):
            old_state = df[prefix + ' state']
            new_state = states[old_state]
            df[prefix + ' state'] = new_state
        else:
            pass
    return df

orders_main = orders_main.apply(state_fix, axis = 1)

Get fraud rating for country

In [43]:
# create this column, which is a tuple of billing and delivery countries
orders_main['countries'] = list(zip(orders_main['billing country'],orders_main['delivery country']))

# find the proportion of fraud orders per country tuple
countries = orders_main.groupby(['countries','fraud']).size().unstack().fillna(0)
countries['countries fraud rating'] = countries['yes']/countries.sum(1)

# map it
orders_main['countries fraud rating'] = orders_main['countries'].map(dict(zip(countries.index.tolist(), countries['countries fraud rating'])))

Flag matching addresses like how the database does it.

In [44]:
a1 = orders_main[['billing name',
                  'billing company',
                  'billing street address',
                  'billing suburb',
                  'billing city',
                  'billing state',                  
                  'billing country',
                  'billing postcode']].apply(lambda x: ', '.join(x), axis = 1)
orders_main['billing address'] = a1

a2 = orders_main[['delivery name',
                  'delivery company',
                  'delivery street address',
                  'delivery suburb',
                  'delivery city',
                  'delivery state',                  
                  'delivery country',
                  'delivery postcode']].apply(lambda x: ', '.join(x), axis = 1)
orders_main['delivery address'] = a2

orders_main['db billing and shipping match'] = np.where(a1 == a2, 'yes','no')    

Sometimes there are arbritrary differences between the billing and delivery addresses. The database will flag these as a mismatch, even though they are technically the same address.

To fix this, I'll rate the match instead of giving it a dichotomous label like yes/no. In the R script I rename "jb_billing_and_shipping_match" to "billing_and_shipping_match".

In [45]:
a = orders_main['billing address'].str.replace(',| ','')
b = orders_main['delivery address'].str.replace(',| ','')

from difflib import SequenceMatcher

def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

ls = []
for i,j in list(zip(a.values, b.values)):
    ls.append(similar(i,j))

orders_main['jb billing and shipping match'] = ls

Drop the pieces, now that we have the full address in one column

In [46]:
address_labels = ['name','company','street address','suburb','city','state','country','postcode']

for x in ['delivery','billing']:
    for y in address_labels:
        orders_main.drop(x + ' ' + y, 1, inplace = True)

Structure email addresses  

In [47]:
# split up username and domain
orders_main[['email username','email domain']] = orders_main['email'].str.split('@', expand = True)

# get email domain fraud rating
domains = orders_main.groupby(['email domain','fraud']).size().unstack().fillna(0)
domains['domain fraud rating'] = domains['yes']/domains.sum(1)

orders_main['email domain fraud rating'] = orders_main['email domain'].map(dict(zip(domains.index, domains['domain fraud rating'])))

# get the proportion of non-letter characters in email username
# don't consider "." since plenty of valid gmail usernames have periods
# the hypothesis is that fraud emails have more of these garabage characters

# get length of entire user name
length1 = [len(x) for x in orders_main['email username']]

# get count of numbers and non-letters
length2 = [len(re.sub('[a-z.]','',x)) for x in orders_main['email username']]

# get (count of numbers and non-letters) / (count of all characters)
proportion = [(x/y) for x,y in zip(length2, length1)]

orders_main['email username non letter rating'] = proportion

### Check for nulls

In [48]:
n = orders_main[orders_main.isnull().any(1)]
if len(n)/len(orders_main) < 0.001: # if nulls account for less than 0.1% of all lines
    orders_main.dropna(inplace = True)
    orders_main.reset_index(drop = True, inplace = True)
else:
    raise ValueError('check your nulls')
    display(n)

### Check out some class stats

In [49]:
avg = orders_main.groupby('fraud').mean().T

ls = ['orders id','customers id']
avg = avg[~avg.index.isin(ls)]

fmt = ['n2','n2']
avg.format_(fmt)

Unnamed: 0,No,Yes
subtotal,144.98,269.02
order count,17.58,1.59
account age,553.27,75.24
avg part price,17.45,85.09
fraud part proportion,0.63,1.0
countries fraud rating,0.0,0.03
jb billing and shipping match,0.93,0.79
email domain fraud rating,0.0,0.23
email username non letter rating,0.06,0.11


### Send to Excel for R

In [50]:
s = dt.datetime.now()

if r_write == 'yes':

    title = 'Fraud Data for R.xlsx'
    
    writer = pd.ExcelWriter(csv_path + title, engine = 'xlsxwriter')
    orders_main.to_excel(writer, index = False)
    writer.save()
    
e = dt.datetime.now()
print(e-s)

0:03:13.997384


# Create confusion matrix
I'll use this as a benchmark for the ML model that I created in R.

In [None]:
for_cm = orders_main.copy()
for_cm['cm result'] = np.nan

### Get historical data

In [None]:
osh_main = pd.read_sql(
'''
SELECT

orders_id,
LOWER(orders_status_name) AS orders_status_name

FROM orders_status_history osh
JOIN orders_status os ON osh.orders_status_id = os.orders_status_id

WHERE LENGTH(orders_id) <= 7
AND orders_id != 0
AND orders_id >= '''+ str(min_oid.values.item()) +'''
''', db)

col_fix(osh_main)

# drop OIDs that are not in our orders_main df
osh_main.drop(osh_main[~osh_main['orders id'].isin(orders_main['orders id'].tolist())].index, inplace = True)
osh_main.reset_index(drop = True, inplace = True)

# drop duplicates
osh_main.drop_duplicates(['orders id','orders status name'], inplace = True)
osh_main.reset_index(drop = True, inplace = True)

# check it
s1 = set(osh_main['orders id'])
s2 = set(orders_main['orders id'])
s3 = s1.symmetric_difference(s2)

if len(s3) > 0:
    raise ValueError('check ur OIDs')
else:
    total_oids = len(s1)
    print('there are {:,.0f} total unique OIDs'.format(total_oids))

### True Postive
We say fraud, it's really fraud

In [None]:
ls = osh_main[osh_main['orders status name'] == 'fraud - pending']['orders id'].tolist()

a = osh_main[osh_main['orders id'].isin(ls)]
b = a.groupby(['orders id','orders status name'])[['orders id']].count().unstack(1)
b.columns = b.columns.droplevel(0)
c = b[b['fraud - void'].isnull() == False].index.tolist()

tp = int(len(c))

for_cm['cm result'] = np.where(for_cm['orders id'].isin(c), 'true positive', for_cm['cm result'])

### False Positive
We say fraud, it's not really fraud

In [None]:
ls = osh_main[osh_main['orders status name'] == 'fraud - pending']['orders id'].tolist()

a = osh_main[osh_main['orders id'].isin(ls)]
b = a.groupby(['orders id','orders status name'])[['orders id']].count().unstack(1)
b.columns = b.columns.droplevel(0)
c = b[(b['fraud - pending'].isnull() == False) & (b['fraud - void'].isnull())].index.tolist()

fp = int(len(c))

for_cm['cm result'] = np.where(for_cm['orders id'].isin(c), 'false positive', for_cm['cm result'])

### True Negative
We say not fraud, it's really not fraud

In [None]:
ls = osh_main[osh_main['orders status name'] == 'fraud - pending']['orders id'].tolist()

a = osh_main[~osh_main['orders id'].isin(ls)].groupby(['orders id','orders status name'])[['orders id']].count().unstack(1)
a.columns = a.columns.droplevel(0)
b = a[(a['fraud - confirmed'] == 1) | (a['fraud - void'] == 1)].index.tolist()

# tn = everything never labled as fraud minus everything that is fraud = everything that's not fraud and never labeled as fraud
tn = int(len(a) - len(b))

for_cm['cm result'] = np.where(for_cm['orders id'].isin(list(set(a.index) - set(b))), 'true negative', for_cm['cm result'])

### False Negative
We say not fraud, it's really fraud

In [None]:
ls = osh_main[osh_main['orders status name'] == 'fraud - pending']['orders id'].tolist()

a = osh_main[~osh_main['orders id'].isin(ls)].groupby(['orders id','orders status name'])[['orders id']].count().unstack(1)
a.columns = a.columns.droplevel(0)
b = a[(a['fraud - confirmed'] == 1) | (a['fraud - void'] == 1)].index.tolist()

fn = len(b)

for_cm['cm result'] = np.where(for_cm['orders id'].isin(b), 'false negative', for_cm['cm result'])

### Check totals

In [None]:
if total_oids != for_cm['cm result'].value_counts().sum():
    raise ValueError('your totals do not match')
    
if np.sum(for_cm['cm result'].isnull()) > 0:
    raise ValueError('you have not labeled all orders')    

### Create confusion matrix

In [None]:
cols = ['reference - no','reference - yes']
ix = ['prediction - no','prediction - yes']

data = [[tn,fn],[fp,tp]]

confusion_matrix = pd.DataFrame(columns = cols, index = ix, data = data)
   
#=========================
# make it pretty
#=========================    
    
confusion_matrix2 = confusion_matrix.copy()
confusion_matrix2.columns = ['Really Not Fraud','Really Fraud']
confusion_matrix2.index = ['We Said Not Fraud','We Said Fraud']

fmt = ['n0','n0']
display(confusion_matrix2.format_(fmt))    

### Metrics

In [None]:
accuracy = (tp + tn)/(tp + tn + fp + fn)
print('\naccuracy is {:,.2f}%\n'.format(accuracy * 100))

# of all the true frauds, this is the proportion you got right
sensitivity = tp/(tp+fn)
print('sensitivity (true positive rate) is {:,.2f}%'.format(sensitivity * 100))

# of all the true not-frauds, this is the proportion you got right
specificity = tn/(tn+fp)
print('specificity (true negative rate) is {:,.2f}%\n'.format(specificity * 100))

# of all your fraud predictions, this is the proportion you got correct
pos_pred = tp/(tp+fp)
print('positive predicted value (tp/(tp+fp)) is {:,.2f}%'.format(pos_pred * 100))

# of all the not-fraud predictions, this is the proportion you got correct
neg_pred = tn/(tn+fn)
print('negative predicted value (tn/(tn+fpn) is {:,.2f}%'.format(neg_pred * 100))

### Revenue and Costs

In [None]:
fmt = ['m0']
for_cm.groupby('cm result')[['subtotal']].sum().format_(fmt)

# Explore

In [78]:
cols = ['email domain fraud rating',
       'subtotal',
       'avg part price',
       'order count',
       'ip and shipping match',
       'payment method']

orders_main[orders_main['orders id'] == 2000000][cols]

Unnamed: 0,email domain fraud rating,subtotal,avg part price,order count,ip and shipping match,payment method
231763,0.001836,173.7,28.95,8,no,credit card


In [82]:

a.loc[['jdanh@hotmail.com']]

Unnamed: 0_level_0,payment method
email,Unnamed: 1_level_1
jdanh@hotmail.com,4


In [75]:
df = orders_main.groupby(['payment method','fraud'])[['fraud']].count().unstack()
df.columns = df.columns.droplevel(0)
df['% yes'] = df['yes']/df.sum(1)
df.sort_values('% yes', ascending = False, inplace = True)


fmt = ['n0','n0','p2']
df.format_(fmt)

Unnamed: 0_level_0,No,Yes,% Yes
payment method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
other,3617,24,0.66%
credit card,162159,317,0.20%
purchase order,3360,4,0.12%
amazon payments,28948,34,0.12%
paypal,89471,90,0.10%
