# E-commerce Fraud Detection 
* Use case: Predicts the probability that the first transaction of a new user is fraudulent.

# Part 1: Import Data

In [69]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import metrics
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import f1_score, roc_auc_score, roc_curve, precision_recall_curve, auc, make_scorer, recall_score, accuracy_score, precision_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings("ignore")
import xgboost as xgb
from xgboost import XGBClassifier
from imblearn.pipeline import Pipeline as imbpipeline

In [5]:
ipToCountry = pd.read_csv('./data/IpAddress_to_Country.csv')
fraud_data = pd.read_csv('./data/imbalancedFraudDF.csv')

fraud_data.head()


Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758400.0,0
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311400.0,0
2,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542000.0,0
3,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583100.0,0
4,159135,2015-05-21 06:03:03,2015-07-09 08:05:14,42,ALEYXFXINSXLZ,Ads,Chrome,M,18,2809315000.0,0


# Part 2: Data exploration

In [6]:
#Distribution of the label column
fraud_data['class'].value_counts()

class
0    136961
1      1415
Name: count, dtype: int64

In [7]:
fraud_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 138376 entries, 0 to 138375
Data columns (total 11 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   user_id         138376 non-null  int64  
 1   signup_time     138376 non-null  object 
 2   purchase_time   138376 non-null  object 
 3   purchase_value  138376 non-null  int64  
 4   device_id       138376 non-null  object 
 5   source          138376 non-null  object 
 6   browser         138376 non-null  object 
 7   sex             138376 non-null  object 
 8   age             138376 non-null  int64  
 9   ip_address      138376 non-null  float64
 10  class           138376 non-null  int64  
dtypes: float64(1), int64(4), object(6)
memory usage: 11.6+ MB


In [8]:
fraud_data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
user_id,138376.0,200149.0,115226.8,2.0,100894.8,200000.5,299745.2,400000.0
purchase_value,138376.0,36.93899,18.32109,9.0,22.0,35.0,49.0,154.0
age,138376.0,33.12587,8.623645,18.0,27.0,33.0,39.0,76.0
ip_address,138376.0,2154381000.0,1250563000.0,52093.496895,1085079000.0,2156471000.0,3249150000.0,4294850000.0
class,138376.0,0.01022576,0.1006045,0.0,0.0,0.0,0.0,1.0


In [9]:
fraud_data.isna().sum()

user_id           0
signup_time       0
purchase_time     0
purchase_value    0
device_id         0
source            0
browser           0
sex               0
age               0
ip_address        0
class             0
dtype: int64

### Identify country info based on ip_address


In [10]:
ipToCountry.head()

Unnamed: 0,lower_bound_ip_address,upper_bound_ip_address,country
0,16777216.0,16777471,Australia
1,16777472.0,16777727,China
2,16777728.0,16778239,China
3,16778240.0,16779263,Australia
4,16779264.0,16781311,China


In [11]:
start = time.time()

countries = []
for i in range(len(fraud_data)):
    ip_address = fraud_data.loc[i, 'ip_address']#number
    #check which interval does ip_address falls into
    #below [] is list of T/F, only when this ip_address falls into the correct internal row does the index generate a True
    #tmp is a df of shape n * 3, where n is 1 if found a match (ip_address falls in range) or 0 if no match
    tmp = ipToCountry[(ipToCountry['lower_bound_ip_address'] <= ip_address) &
                    (ipToCountry['upper_bound_ip_address'] >= ip_address)]
    if len(tmp) == 1:#found match
        countries.append(tmp['country'].values[0])
    else:#no match
        countries.append('NA')

fraud_data['country'] = countries
runtime = time.time() - start

print("Lookup took", runtime, "seconds.")



Lookup took 31.6942880153656 seconds.


In [12]:
ip_address = fraud_data.loc[6, 'ip_address']
tmp = ipToCountry[(ipToCountry['lower_bound_ip_address'] <= ip_address) &
                    (ipToCountry['upper_bound_ip_address'] >= ip_address)]
print(tmp)

       lower_bound_ip_address  upper_bound_ip_address        country
28203            1.686110e+09              1694498815  United States


In [13]:
print(tmp['country'])

28203    United States
Name: country, dtype: object


Optimization: Since each ip range is not overlap in ipToCountry.csv, the binary search algorithm could be solved to imporve the time complexity to O(logn).

In [14]:
print(fraud_data.user_id.nunique())#138376
print(len(fraud_data.index))#138376
#all of the user_id has only the first 1 transaction, no way to do time based aggregates,

138376
138376


# Part 3a: Feature Engineering




In [15]:
#time related features: can be done before split, as they has no interaction between other rows, solely based on other columns of the same row
fraud_data['interval_after_signup'] = (pd.to_datetime(fraud_data['purchase_time']) - pd.to_datetime(
        fraud_data['signup_time'])).dt.total_seconds()

fraud_data['signup_days_of_year'] = pd.DatetimeIndex(fraud_data['signup_time']).dayofyear

#bed time operation
fraud_data['signup_seconds_of_day'] = pd.DatetimeIndex(fraud_data['signup_time']).second + 60 * pd.DatetimeIndex(
    fraud_data['signup_time']).minute + 3600 * pd.DatetimeIndex(fraud_data['signup_time']).hour

fraud_data['purchase_days_of_year'] = pd.DatetimeIndex(fraud_data['purchase_time']).dayofyear
fraud_data['purchase_seconds_of_day'] = pd.DatetimeIndex(fraud_data['purchase_time']).second + 60 * pd.DatetimeIndex(
    fraud_data['purchase_time']).minute + 3600 * pd.DatetimeIndex(fraud_data['purchase_time']).hour

fraud_data = fraud_data.drop(['user_id','signup_time','purchase_time'], axis=1)

In [16]:
fraud_data.head()
#note there are NAs in country

Unnamed: 0,purchase_value,device_id,source,browser,sex,age,ip_address,class,country,interval_after_signup,signup_days_of_year,signup_seconds_of_day,purchase_days_of_year,purchase_seconds_of_day
0,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758400.0,0,Japan,4506682.0,55,82549,108,10031
1,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311400.0,0,United States,17944.0,158,74390,159,5934
2,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542000.0,0,,492085.0,118,76405,124,50090
3,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583100.0,0,United States,4361461.0,202,25792,252,67253
4,42,ALEYXFXINSXLZ,Ads,Chrome,M,18,2809315000.0,0,Canada,4240931.0,141,21783,190,29114


In [17]:
fraud_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 138376 entries, 0 to 138375
Data columns (total 14 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   purchase_value           138376 non-null  int64  
 1   device_id                138376 non-null  object 
 2   source                   138376 non-null  object 
 3   browser                  138376 non-null  object 
 4   sex                      138376 non-null  object 
 5   age                      138376 non-null  int64  
 6   ip_address               138376 non-null  float64
 7   class                    138376 non-null  int64  
 8   country                  138376 non-null  object 
 9   interval_after_signup    138376 non-null  float64
 10  signup_days_of_year      138376 non-null  int32  
 11  signup_seconds_of_day    138376 non-null  int32  
 12  purchase_days_of_year    138376 non-null  int32  
 13  purchase_seconds_of_day  138376 non-null  int32  
dtypes: f

In [18]:
print(fraud_data.source.value_counts())

source
SEO       55766
Ads       54913
Direct    27697
Name: count, dtype: int64


In [19]:
print(fraud_data.browser.value_counts())

browser
Chrome     55993
IE         33836
Safari     22670
FireFox    22500
Opera       3377
Name: count, dtype: int64


# Part 4: Data Split

In [20]:
y = fraud_data['class']
X = fraud_data.drop(['class'], axis=1)

#split into train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
print("X_train.shape:", X_train.shape)
print("y_train.shape:", y_train.shape)

X_train.shape: (110700, 13)
y_train.shape: (110700,)


In [21]:
X_train.head()

Unnamed: 0,purchase_value,device_id,source,browser,sex,age,ip_address,country,interval_after_signup,signup_days_of_year,signup_seconds_of_day,purchase_days_of_year,purchase_seconds_of_day
29343,12,OULPAZAFRFPXP,Ads,Chrome,M,42,3690922000.0,Korea Republic of,3499664.0,183,67384,224,24648
12190,10,AIIWMFEYQQIEB,Ads,Opera,M,29,1686759000.0,United States,6766039.0,5,78146,84,18585
19388,34,VUVETBUPCIWJE,Direct,Chrome,M,53,4138429000.0,,5870515.0,197,81354,265,76669
89104,48,QCFULAJOYKFUU,Ads,Chrome,M,29,96173370.0,France,2145618.0,160,30920,185,16538
82082,44,IHRWLMIJMEEEU,Ads,FireFox,M,24,1936025000.0,China,7079059.0,111,71897,193,66156


# Part 3b: Feature Engineering


Convert categorical features with high cadinality to numericals

In [22]:
X_train.device_id.value_counts(dropna=False)

device_id
QTXDJHIIXYVQN    6
LQXZGPLMKAJJV    5
VSMNAOFPSEQOL    5
HSKCGAKNSEMHZ    5
HCYSLYNRFLAXU    5
                ..
DSCGCXUUKUBRR    1
PMEXPXTLZCZUV    1
RCZGGVLPNZPXG    1
TLUDXYOENHBNL    1
AKABBOYHUDIAP    1
Name: count, Length: 107963, dtype: int64

In [23]:
X_test.country.value_counts(dropna=False)

country
United States                           10675
NA                                       4016
China                                    2234
Japan                                    1413
United Kingdom                            801
                                        ...  
Belize                                      1
Congo The Democratic Republic of The        1
Vanuatu                                     1
Albania                                     1
Tanzania United Republic of                 1
Name: count, Length: 156, dtype: int64

In [24]:
#converting needs to be done after split
X_train = pd.get_dummies(X_train, columns=['source', 'browser'])#need to drop ['source', 'browser']? no, auto dropped by get_dummies
X_train['sex'] = (X_train.sex == 'M').astype(int)

# the more a device is shared, the more suspicious
# if device_id abc occurred 100 times in X_train, then replace all abc in device_id col in X_train by 100
X_train_device_id_mapping = X_train.device_id.value_counts(dropna=False)
X_train['n_dev_shared'] = X_train.device_id.map(X_train_device_id_mapping)# number of times device_id occurred in train data

# the more a ip is shared, the more suspicious
X_train_ip_address_mapping = X_train.ip_address.value_counts(dropna=False)
X_train['n_ip_shared'] = X_train.ip_address.map(X_train_ip_address_mapping)

# the less visit from a country, the more suspicious
X_train_country_mapping = X_train.country.value_counts(dropna=False)#include counts of NaN
X_train['n_country_shared'] = X_train.country.map(X_train_country_mapping)#lots of NAs in country column, without dropna=False will produce nan in this col


X_train = X_train.drop(['device_id','ip_address','country'], axis=1)




In [25]:
X_train.head()

Unnamed: 0,purchase_value,sex,age,interval_after_signup,signup_days_of_year,signup_seconds_of_day,purchase_days_of_year,purchase_seconds_of_day,source_Ads,source_Direct,source_SEO,browser_Chrome,browser_FireFox,browser_IE,browser_Opera,browser_Safari,n_dev_shared,n_ip_shared,n_country_shared
29343,12,1,42,3499664.0,183,67384,224,24648,True,False,False,True,False,False,False,False,1,1,3075
12190,10,1,29,6766039.0,5,78146,84,18585,True,False,False,False,False,False,True,False,1,1,42348
19388,34,1,53,5870515.0,197,81354,265,76669,False,True,False,True,False,False,False,False,1,1,16275
89104,48,1,29,2145618.0,160,30920,185,16538,True,False,False,True,False,False,False,False,1,1,2322
82082,44,1,24,7079059.0,111,71897,193,66156,True,False,False,False,True,False,False,False,1,1,8876


In [26]:
X_test = pd.get_dummies(X_test, columns=['source', 'browser'])
X_test['sex'] = (X_test.sex == 'M').astype(int)
# X_test['n_dev_shared'] = X_test.device_id.map(X_train_device_id_mapping)
# X_test['n_ip_shared'] = X_test.ip_address.map(X_train_ip_address_mapping)
# X_test['n_country_shared'] = X_test.country.map(X_train_country_mapping)


# if apply train mapping, most of the levels in test does not occur in train, so most are null after converting as we can not find the keys in the train mapping, so redo mapping on test data

# the more a device is shared, the more suspicious
X_test['n_dev_shared'] = X_test.device_id.map(X_test.device_id.value_counts(dropna=False))

# the more a ip is shared, the more suspicious
X_test['n_ip_shared'] = X_test.ip_address.map(X_test.ip_address.value_counts(dropna=False))

# the less visit from a country, the more suspicious
X_test['n_country_shared'] = X_test.country.map(X_test.country.value_counts(dropna=False))

X_test = X_test.drop(['device_id','ip_address','country'], axis=1)

# if the levels/values/mapping keys of the column in train and test data are pretty much the same(lots of overlap), e.g. country,
# then we should apply the above 3 X_train_mappings (generated from X_train) on X_test(like below),
# rather than using the new mapping generated from X_test

# method2: target encoding


In [27]:
scaler = preprocessing.MinMaxScaler().fit(X_train[['n_dev_shared', 'n_ip_shared', 'n_country_shared']])
X_train[['n_dev_shared', 'n_ip_shared', 'n_country_shared']] = scaler.transform(X_train[['n_dev_shared', 'n_ip_shared', 'n_country_shared']])
X_test[['n_dev_shared', 'n_ip_shared', 'n_country_shared']] = scaler.transform(X_test[['n_dev_shared', 'n_ip_shared', 'n_country_shared']])

In [28]:
X_train.n_dev_shared.value_counts(dropna=False)

n_dev_shared
0.0    105427
0.2      4774
0.4       324
0.6       124
0.8        45
1.0         6
Name: count, dtype: int64

In [29]:
X_test.n_dev_shared.value_counts(dropna=False)

n_dev_shared
0.0    27330
0.2      334
0.4       12
Name: count, dtype: int64

# Part 5: Model Training


## Logistic Regression

In [30]:
logreg = LogisticRegression()
logreg.fit(X_train,y_train)
y_pred=logreg.predict(X_test)

In [31]:
cm = metrics.confusion_matrix(y_test, y_pred)
cmDF = pd.DataFrame(cm, columns=['pred_0', 'pred_1'], index=['true_0', 'true_1'])
print(cmDF)

        pred_0  pred_1
true_0   27389       0
true_1     287       0


As can be seen from the confusion matrix, the model predicts all data to be negative since there exists severe data imbalance in the dataset and the model cannot learn much about the positive data pattern.

## Random Forest

In [32]:
classifier_RF = RandomForestClassifier(random_state=0)

classifier_RF.fit(X_train, y_train)

# generate class probabilities
probs = classifier_RF.predict_proba(X_test)
predicts = classifier_RF.predict(X_test)

# predict class labels 0/1 for the test set
print("%s: %r" % ("roc_auc_score is: ", roc_auc_score(y_test, probs[:, 1])))
print("%s: %r" % ("f1_score is: ", f1_score(y_test, predicts)))

print ("confusion_matrix is: ")
cm = confusion_matrix(y_test, predicts)
cmDF = pd.DataFrame(cm, columns=['pred_0', 'pred_1'], index=['true_0', 'true_1'])
print(cmDF)
print('recall =',float(cm[1,1])/(cm[1,0]+cm[1,1]))
print('precision =', float(cm[1,1])/(cm[1,1] + cm[0,1]))#1.0predicted = classifier_RF.predict(X_test)



# generate evaluation metrics
print("%s: %r" % ("accuracy_score is: ", accuracy_score(y_test, predicts)))

roc_auc_score is: : 0.7801672204169557
f1_score is: : 0.6712962962962962
confusion_matrix is: 
        pred_0  pred_1
true_0   27389       0
true_1     142     145
recall = 0.5052264808362369
precision = 1.0
accuracy_score is: : 0.9948692007515537


## SMOTE sampling

In [34]:
smote = SMOTE(random_state=12)
x_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)

unique, counts = np.unique(y_train_sm, return_counts=True)

print(np.asarray((unique, counts)).T)

[[     0 109572]
 [     1 109572]]


In [35]:
#RF on smoted training data
classifier_RF_sm = RandomForestClassifier(random_state=0)

classifier_RF_sm.fit(x_train_sm, y_train_sm)

# predict class labels for the test set
predicted_sm = classifier_RF_sm.predict(X_test)

# generate class probabilities
probs_sm = classifier_RF_sm.predict_proba(X_test)


# generate evaluation metrics
print("%s: %r" % ("accuracy_score_sm is: ", accuracy_score(y_test, predicted_sm)))
print("%s: %r" % ("roc_auc_score_sm is: ", roc_auc_score(y_test, probs_sm[:, 1])))
print("%s: %r" % ("f1_score_sm is: ", f1_score(y_test, predicted_sm )))#string to int

print ("confusion_matrix_sm is: ")
cm_sm = confusion_matrix(y_test, predicted_sm)
cmDF = pd.DataFrame(cm_sm, columns=['pred_0', 'pred_1'], index=['true_0', 'true_1'])
print(cmDF)
print('recall or sens_sm =',float(cm_sm[1,1])/(cm_sm[1,0]+cm_sm[1,1]))
print('precision_sm =', float(cm_sm[1,1])/(cm_sm[1,1] + cm_sm[0,1]))



accuracy_score_sm is: : 0.9948330683624801
roc_auc_score_sm is: : 0.7666438992331798
f1_score_sm is: : 0.6697459584295612
confusion_matrix_sm is: 
        pred_0  pred_1
true_0   27388       1
true_1     142     145
recall or sens_sm = 0.5052264808362369
precision_sm = 0.9931506849315068


There's no significant improvement after SMOTE resampling.

## XGBoost

In [57]:
xg_classifier = xgb.XGBClassifier(objective ='binary:logistic', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 6, alpha = 10, n_estimators = 10)

In [58]:
xg_classifier = xg_classifier.fit(X_train, y_train)

In [59]:
probs_x = xg_classifier.predict_proba(X_test)
predicts_x = xg_classifier.predict(X_test)

In [60]:
# predict class labels 0/1 for the test set
print("%s: %r" % ("roc_auc_score is: ", roc_auc_score(y_test, probs[:, 1])))
print("%s: %r" % ("f1_score is: ", f1_score(y_test, predicts)))

print ("confusion_matrix is: ")
cm = confusion_matrix(y_test, predicts)
cmDF = pd.DataFrame(cm, columns=['pred_0', 'pred_1'], index=['true_0', 'true_1'])
print(cmDF)
print('recall =',float(cm[1,1])/(cm[1,0]+cm[1,1]))
print('precision =', float(cm[1,1])/(cm[1,1] + cm[0,1]))#1.0predicted = classifier_RF.predict(X_test)



# generate evaluation metrics
print("%s: %r" % ("accuracy_score is: ", accuracy_score(y_test, predicts)))

roc_auc_score is: : 0.7801672204169557
f1_score is: : 0.6712962962962962
confusion_matrix is: 
        pred_0  pred_1
true_0   27389       0
true_1     142     145
recall = 0.5052264808362369
precision = 1.0
accuracy_score is: : 0.9948692007515537


# Part 6: Parameter tuning by GridSearchCV

In [36]:
scorers = {
    'precision_score': make_scorer(precision_score),
    'recall_score': make_scorer(recall_score),
    'f1_score': make_scorer(f1_score, pos_label=1)

}

In [40]:
def grid_search_wrapper(model, parameters, refit_score='f1_score'):
    """
    fits a GridSearchCV classifier using refit_score for optimization(refit on the best model according to refit_score)
    prints classifier performance metrics
    """

    grid_search = GridSearchCV(model, parameters, scoring=scorers, refit=refit_score,
                           cv=3, return_train_score=True)
    grid_search.fit(X_train, y_train)

    # make the predictions
    y_pred = grid_search.predict(X_test)
    y_prob = grid_search.predict_proba(X_test)[:, 1]

    print('Best params for {}'.format(refit_score))
    print(grid_search.best_params_)

    # confusion matrix on the test data.
    print('\nConfusion matrix of Random Forest optimized for {} on the test data:'.format(refit_score))
    cm = confusion_matrix(y_test, y_pred)
    cmDF = pd.DataFrame(cm, columns=['pred_0', 'pred_1'], index=['true_0', 'true_1'])
    print(cmDF)

    print("\t%s: %r" % ("roc_auc_score is: ", roc_auc_score(y_test, y_prob)))
    print("\t%s: %r" % ("f1_score is: ", f1_score(y_test, y_pred)))#string to int

    print('recall = ', float(cm[1,1]) / (cm[1,0] + cm[1,1]))
    print('precision = ', float(cm[1,1]) / (cm[1, 1] + cm[0,1]))

    return grid_search



## Optimizing on f1_score on LR

In [41]:
# C: inverse of regularization strength, smaller values specify stronger regularization
LRGrid = {"C" : np.logspace(-2,2,5), "penalty":["l1","l2"]}# l1 lasso l2 ridge
#param_grid = {'C': [0.01, 0.1, 1, 10, 100], 'penalty': ['l1', 'l2']}
logRegModel = LogisticRegression(random_state=0)

grid_search_LR_f1 = grid_search_wrapper(logRegModel, LRGrid, refit_score='f1_score')

Best params for f1_score
{'C': 0.01, 'penalty': 'l2'}

Confusion matrix of Random Forest optimized for f1_score on the test data:
        pred_0  pred_1
true_0   27389       0
true_1     287       0
	roc_auc_score is: : 0.7505925405847842
	f1_score is: : 0.0
recall =  0.0
precision =  nan


The LR has no better performance after hyperparameter tuning, 

## Optimizing on f1_score on RF

In [42]:
parameters = {
'max_depth': [None, 5, 15],
'n_estimators' :  [10,150],
'class_weight' : [{0: 1, 1: w} for w in [0.2, 1, 100]]
}

clf = RandomForestClassifier(random_state=0)

In [43]:
grid_search_rf_f1 = grid_search_wrapper(clf, parameters, refit_score='f1_score')

Best params for f1_score
{'class_weight': {0: 1, 1: 0.2}, 'max_depth': None, 'n_estimators': 150}

Confusion matrix of Random Forest optimized for f1_score on the test data:
        pred_0  pred_1
true_0   27389       0
true_1     142     145
	roc_auc_score is: : 0.7781993788548851
	f1_score is: : 0.6712962962962962
recall =  0.5052264808362369
precision =  1.0


Random Forest has similar performance as the one before hyperparameter tuning.

In [44]:
best_rf_model_f1 = grid_search_rf_f1.best_estimator_
best_rf_model_f1

In [45]:
results_f1 = pd.DataFrame(grid_search_rf_f1.cv_results_)
results_sortf1 = results_f1.sort_values(by='mean_test_f1_score', ascending=False)
results_sortf1[['mean_test_precision_score', 'mean_test_recall_score', 'mean_test_f1_score', 'mean_train_precision_score', 'mean_train_recall_score', 'mean_train_f1_score','param_max_depth', 'param_class_weight', 'param_n_estimators']].round(3).head()




Unnamed: 0,mean_test_precision_score,mean_test_recall_score,mean_test_f1_score,mean_train_precision_score,mean_train_recall_score,mean_train_f1_score,param_max_depth,param_class_weight,param_n_estimators
9,1.0,0.527,0.69,1.0,0.527,0.69,5.0,"{0: 1, 1: 1}",150
1,1.0,0.527,0.69,1.0,1.0,1.0,,"{0: 1, 1: 0.2}",150
13,1.0,0.527,0.69,1.0,1.0,1.0,,"{0: 1, 1: 100}",150
3,1.0,0.527,0.69,1.0,0.527,0.69,5.0,"{0: 1, 1: 0.2}",150
5,1.0,0.527,0.69,1.0,0.56,0.718,15.0,"{0: 1, 1: 0.2}",150


In [47]:
pd.DataFrame(best_rf_model_f1.feature_importances_, index = X_train.columns, columns=['importance']).sort_values('importance', ascending=False)

Unnamed: 0,importance
interval_after_signup,0.408875
purchase_days_of_year,0.132442
purchase_seconds_of_day,0.079075
signup_seconds_of_day,0.077661
signup_days_of_year,0.057319
n_ip_shared,0.052617
purchase_value,0.044106
age,0.038233
n_dev_shared,0.035686
n_country_shared,0.027432


As can be seen from the table, `interval_after_signup` and other time related raw and aggregates are highly predictive of fraud

## Optimizing recall_score on RF

In [48]:
grid_search_rf_recall = grid_search_wrapper(clf, parameters, refit_score='recall_score')

Best params for recall_score
{'class_weight': {0: 1, 1: 100}, 'max_depth': 5, 'n_estimators': 150}

Confusion matrix of Random Forest optimized for recall_score on the test data:
        pred_0  pred_1
true_0   27146     243
true_1     132     155
	roc_auc_score is: : 0.7904661234456265
	f1_score is: : 0.4525547445255475
recall =  0.5400696864111498
precision =  0.38944723618090454


Recall score is improved but precision is decreased.

In [49]:
best_RF_model_recall = grid_search_rf_recall.best_estimator_
best_RF_model_recall

In [50]:
# predict class labels for the test set
predictedBest_recall = best_RF_model_recall.predict(X_test)

# generate class probabilities
probsBest_recall = best_RF_model_recall.predict_proba(X_test)

results_recall = pd.DataFrame(grid_search_rf_recall.cv_results_)# recall score is different from above, as above is metric on test data, this is performance on cv data
results_sortrecall = results_recall.sort_values(by='mean_test_recall_score', ascending=False)
results_sortrecall[['mean_test_precision_score', 'mean_test_recall_score', 'mean_test_f1_score', 'mean_train_precision_score', 'mean_train_recall_score', 'mean_train_f1_score','param_max_depth', 'param_class_weight', 'param_n_estimators']].round(3).head()
#recall is worse than default rf?? no this is on test, but train recall is better

Unnamed: 0,mean_test_precision_score,mean_test_recall_score,mean_test_f1_score,mean_train_precision_score,mean_train_recall_score,mean_train_f1_score,param_max_depth,param_class_weight,param_n_estimators
15,0.159,0.636,0.254,0.164,0.656,0.262,5.0,"{0: 1, 1: 100}",150
14,0.16,0.633,0.255,0.162,0.652,0.26,5.0,"{0: 1, 1: 100}",10
16,0.675,0.533,0.593,0.759,0.813,0.782,15.0,"{0: 1, 1: 100}",10
0,0.995,0.527,0.689,1.0,0.856,0.923,,"{0: 1, 1: 0.2}",10
1,1.0,0.527,0.69,1.0,1.0,1.0,,"{0: 1, 1: 0.2}",150


## Optimizing f1_score on XGBoost

In [61]:
xgb_model = xgb.XGBClassifier()

# Define a parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of boosting rounds
    'max_depth': [3, 4, 5],         # Maximum depth of each tree
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

In [62]:
grid_search_xgb = grid_search_wrapper(xgb_model, param_grid , refit_score='f1_score')

Best params for f1_score
{'colsample_bytree': 1.0, 'learning_rate': 0.2, 'max_depth': 4, 'n_estimators': 200, 'subsample': 1.0}

Confusion matrix of Random Forest optimized for f1_score on the test data:
        pred_0  pred_1
true_0   27386       3
true_1     147     140
	roc_auc_score is: : 0.7729333338252353
	f1_score is: : 0.6511627906976744
recall =  0.4878048780487805
precision =  0.9790209790209791


In [68]:
xgb_model_2 = xgb.XGBClassifier()
grid_search_xgb_recall = grid_search_wrapper(xgb_model_2, param_grid , refit_score='recall_score')

Best params for recall_score
{'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 300, 'subsample': 0.8}

Confusion matrix of Random Forest optimized for recall_score on the test data:
        pred_0  pred_1
true_0   27384       5
true_1     142     145
	roc_auc_score is: : 0.7943721652287224
	f1_score is: : 0.6636155606407322
recall =  0.5052264808362369
precision =  0.9666666666666667


Similar performance as random forest

## Model optimization & Smote

In [72]:
xgb_model_3 = xgb.XGBClassifier()

# Define a parameter grid for hyperparameter tuning
param_grid = {
    'classifier__n_estimators': [100, 200, 300],  # Number of boosting rounds
    'classifier__max_depth': [3, 4, 5],         # Maximum depth of each tree
    'classifier__learning_rate': [0.01, 0.1, 0.2],
    'classifier__subsample': [0.8, 0.9, 1.0],
    'classifier__colsample_bytree': [0.8, 0.9, 1.0]
}

In [73]:
pipeline = imbpipeline(steps = [['smote', SMOTE(sampling_strategy='auto', random_state=42)],
                                ['classifier', xgb_model_3]])
grid_search_xgb = GridSearchCV(pipeline, param_grid, cv=5, scoring='recall')
grid_search_xgb.fit(X_train,y_train)

In [75]:
y_pred = grid_search_xgb.predict(X_test)
y_prob = grid_search_xgb.predict_proba(X_test)[:, 1]

In [76]:
print('Best params for recall score')
print(grid_search_xgb.best_params_)

# confusion matrix on the test data.
print('\nConfusion matrix of Random Forest optimized for recall score on the test data:')
cm = confusion_matrix(y_test, y_pred)
cmDF = pd.DataFrame(cm, columns=['pred_0', 'pred_1'], index=['true_0', 'true_1'])
print(cmDF)

print("\t%s: %r" % ("roc_auc_score is: ", roc_auc_score(y_test, y_prob)))
print("\t%s: %r" % ("f1_score is: ", f1_score(y_test, y_pred)))#string to int

print('recall = ', float(cm[1,1]) / (cm[1,0] + cm[1,1]))
print('precision = ', float(cm[1,1]) / (cm[1, 1] + cm[0,1]))

Best params for recall score
{'classifier__colsample_bytree': 0.9, 'classifier__learning_rate': 0.2, 'classifier__max_depth': 3, 'classifier__n_estimators': 100, 'classifier__subsample': 0.8}

Confusion matrix of Random Forest optimized for recall score on the test data:
        pred_0  pred_1
true_0   25827    1562
true_1     133     154
	roc_auc_score is: : 0.767189656113374
	f1_score is: : 0.15376934598102845
recall =  0.5365853658536586
precision =  0.08974358974358974


In [74]:
clf_2 = RandomForestClassifier(random_state=0)

param_grid = {
    'classifier__max_depth': [None, 5, 15],
    'classifier__n_estimators' :  [10,150],
    'classifier__class_weight' : [{0: 1, 1: w} for w in [0.2, 1, 100]]
}

pipeline = imbpipeline(steps = [['smote', SMOTE(sampling_strategy='auto', random_state=42)],
                                ['classifier', clf_2]])
grid_search_rf = GridSearchCV(pipeline, param_grid, cv=5, scoring='recall')
grid_search_rf.fit(X_train,y_train)

In [77]:
y_pred = grid_search_rf.predict(X_test)
y_prob = grid_search_rf.predict_proba(X_test)[:, 1]

In [78]:
print('Best params for recall score')
print(grid_search_xgb.best_params_)

# confusion matrix on the test data.
print('\nConfusion matrix of XGBoost optimized for recall score on the test data:')
cm = confusion_matrix(y_test, y_pred)
cmDF = pd.DataFrame(cm, columns=['pred_0', 'pred_1'], index=['true_0', 'true_1'])
print(cmDF)

print("\t%s: %r" % ("roc_auc_score is: ", roc_auc_score(y_test, y_prob)))
print("\t%s: %r" % ("f1_score is: ", f1_score(y_test, y_pred)))#string to int

print('recall = ', float(cm[1,1]) / (cm[1,0] + cm[1,1]))
print('precision = ', float(cm[1,1]) / (cm[1, 1] + cm[0,1]))

Best params for recall score
{'classifier__colsample_bytree': 0.9, 'classifier__learning_rate': 0.2, 'classifier__max_depth': 3, 'classifier__n_estimators': 100, 'classifier__subsample': 0.8}

Confusion matrix of XGBoost optimized for recall score on the test data:
        pred_0  pred_1
true_0      10   27379
true_1       0     287
	roc_auc_score is: : 0.6626015200028802
	f1_score is: : 0.02053446857224627
recall =  1.0
precision =  0.010373743945637245


Predicts most data as positive

# Part 7: Fraud Characteristics

In [51]:
trainDF = pd.concat([X_train, y_train], axis=1)
pd.crosstab(trainDF["n_dev_shared"],trainDF["class"])
#the larger n_dev_shared, the higher rate of fraud

class,0,1
n_dev_shared,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,104966,461
0.2,4403,371
0.4,152,172
0.6,37,87
0.8,13,32
1.0,1,5


In [52]:
fraud_data.groupby("class")[['interval_after_signup']].mean()
#action velocity(consecutive operations/actions of user)
#interval_after_signup on frauds are significantly lower compared to legits

Unnamed: 0_level_0,interval_after_signup
class,Unnamed: 1_level_1
0,5191179.0
1,2570226.0


In [53]:
fraud_data.groupby("class")[['interval_after_signup']].median()
#more than half of fraud happened 1s after signed up

Unnamed: 0_level_0,interval_after_signup
class,Unnamed: 1_level_1
0,5194911.0
1,1.0


Median is 1: Fake account -> sign up and places an order immediately -> robot

In [54]:
fraud_data[fraud_data['class'] == 1].head(100)

Unnamed: 0,purchase_value,device_id,source,browser,sex,age,ip_address,class,country,interval_after_signup,signup_days_of_year,signup_seconds_of_day,purchase_days_of_year,purchase_seconds_of_day
136961,24,VLHGCDPFCICDA,SEO,Chrome,F,33,3.432126e+09,1,United States,3327952.0,218,80113,257,38465
136962,14,YLUQSRNYYIPXU,Ads,Chrome,M,40,3.905319e+09,1,,1.0,12,4207,12,4208
136963,63,ABUBCQDATQMQH,Ads,FireFox,F,46,5.505670e+08,1,United States,7640070.0,49,40723,137,77593
136964,34,QHEODGCAVJKIQ,SEO,Chrome,M,37,9.408096e+08,1,United States,1.0,12,77710,12,77711
136965,76,DAKVYHKIEYRBH,SEO,Chrome,F,48,6.361041e+08,1,Hungary,1.0,10,48421,10,48422
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
137056,21,AIGPGDVRDKOKT,SEO,FireFox,F,45,3.058136e+09,1,India,1.0,2,69762,2,69763
137057,52,PQGKGQACIARBV,Ads,Chrome,F,36,7.203338e+08,1,China,1.0,9,77018,9,77019
137058,54,XHZBVWFWHSGTQ,SEO,FireFox,M,50,2.249217e+09,1,United States,1.0,1,32483,1,32484
137059,10,WETYPHOQVLWMK,Ads,FireFox,M,36,2.293333e+09,1,United States,1.0,7,63914,7,63915


# Part 8: Usage of the prediction

In [79]:
t = (10 * probsBest_recall[:, 1]).astype(int)
unique, counts = np.unique(t, return_counts=True)

print(np.asarray((unique, counts)).T)

[[    1     1]
 [    2 24555]
 [    3  2623]
 [    4    99]
 [    5   177]
 [    6    76]
 [    7     1]
 [    8    20]
 [    9   124]]


Recommended:
* green: 1 - 3 pass
* *grey: 4 - 7 need manual investigation
* *red: 8 - 9 decline

In [65]:
probsBest_recall

array([[0.70205206, 0.29794794],
       [0.70917841, 0.29082159],
       [0.70522289, 0.29477711],
       ...,
       [0.7096945 , 0.2903055 ],
       [0.70465921, 0.29534079],
       [0.73310676, 0.26689324]])