In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
# Silence FutureWarning and DeprecationWarning globally for this notebook.
# NOTE(review): this also hides deprecation notices raised by the libraries
# imported further down, so upcoming API removals will not be visible.
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)
# Let pandas show wide/long frames (up to 500 rows and 500 columns, 1000 chars wide).
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Presumably meant to be passed as `n_jobs` to scikit-learn (-1 = all cores);
# it is not referenced by any cell visible in this notebook -- TODO confirm.
njobs = -1

In [2]:
# Both input files are pipe-delimited CSVs located next to the notebook.
train, test = (pd.read_csv(csv_path, sep='|') for csv_path in ('train.csv', 'test.csv'))

In [3]:
# Absolute and relative frequency of each target class in the raw training data.
fraud_counts = train.fraud.value_counts()
print(fraud_counts)
print(fraud_counts / len(train))

0    1775
1     104
Name: fraud, dtype: int64
0    0.944651
1    0.055349
Name: fraud, dtype: float64


# Feature Engineering

In [4]:
def add_engineered_features(frame):
    """Add the derived ratio features to ``frame`` in place and return it.

    New columns (all computed from columns already present in ``frame``):

    * ``scannedLineItems``: ``scannedLineItemsPerSecond * totalScanTimeInSeconds``
    * ``pricePerScannedLineItem``: ``grandTotal / scannedLineItems``
    * ``scansWithoutRegistrationPerScannedLineItem``:
      ``scansWithoutRegistration / scannedLineItems``
    * ``quantityModificationsPerScannedLineItem``:
      ``quantityModifications / scannedLineItems``
    * ``lineItemVoidsPerSecond``: ``lineItemVoids / totalScanTimeInSeconds``

    The divisions are plain pandas divisions, so a zero denominator produces
    ``inf``/``NaN`` rather than raising.

    Further candidate ratios were left disabled in the original notebook and
    are intentionally not computed: totalScanTimeInMinutes,
    scansWithoutRegistrationPerSecond, quantityModificationsPerSecond,
    secondsPerEuro, lineItemVoidsPerEuro, scansWithoutRegistrationPerEuro,
    quantityModificationsPerEuro.
    """
    frame['scannedLineItems'] = frame['scannedLineItemsPerSecond'] * frame['totalScanTimeInSeconds']
    frame['pricePerScannedLineItem'] = frame['grandTotal'] / frame['scannedLineItems']
    frame['scansWithoutRegistrationPerScannedLineItem'] = frame['scansWithoutRegistration'] / frame['scannedLineItems']
    frame['quantityModificationsPerScannedLineItem'] = frame['quantityModifications'] / frame['scannedLineItems']
    frame['lineItemVoidsPerSecond'] = frame['lineItemVoids'] / frame['totalScanTimeInSeconds']
    return frame


# Apply exactly the same transformation to both data sets so that their
# feature columns cannot drift apart.
add_engineered_features(train)
add_engineered_features(test)

# Enlarge the training set with test rows pseudo-labelled by rules found in the decision-tree analysis

In [5]:
# Pseudo-label rows of the (unlabelled) test set with hard-coded rules and
# append them to the training data.
# NOTE: re-running this cell appends the same rows to `train` again.

# Rule 1: trustLevel above 2.5 -> labelled as non-fraud.
additional_no_frauds = test[test["trustLevel"] > 2.5].assign(fraud=0)

# Rule 2: low trust, many scanned items, low value per second and a high
# share of unregistered scans -> labelled as fraud.
additional_frauds = test[
    (test["trustLevel"] < 1.5)
    & (test["scannedLineItems"] > 20.5)
    & (test["valuePerSecond"] <= 0.118)
    & (test["scansWithoutRegistrationPerScannedLineItem"] > 0.168)
].assign(fraud=1)

# Original rows first, then the non-fraud additions, then the fraud additions.
train = pd.concat([train, additional_no_frauds, additional_frauds], ignore_index=True)

In [5]:
# Absolute and relative frequency of each target class in the current `train` frame.
fraud_counts = train.fraud.value_counts()
print(fraud_counts)
print(fraud_counts / len(train))

0    1775
1     104
Name: fraud, dtype: int64
0    0.944651
1    0.055349
Name: fraud, dtype: float64


# (Optional) Normalize the feature values (important for scale-sensitive models)

In [30]:
from sklearn.preprocessing import MinMaxScaler

feature_scaler = MinMaxScaler()

# Fit the scaler once (the original fitted it twice and threw the first result
# away) and keep the scaled copy under its own name so it can be reused.
# Every column of `train` is rescaled to [0, 1], including `fraud`.
# `train` itself is left untouched, so the cells below still work on the
# unscaled values exactly as before.
train_scaled = pd.DataFrame(
    feature_scaler.fit_transform(train.values),
    columns=train.columns,
    index=train.index,
)

# Preview only; avoids rendering the whole frame.
train_scaled.head(10)

Unnamed: 0,fraud,grandTotal,lineItemVoids,lineItemVoidsPerPosition,lineItemVoidsPerSecond,pricePerScannedLineItem,quantityModifications,quantityModificationsPerScannedLineItem,scannedLineItems,scannedLineItemsPerSecond,scansWithoutRegistration,scansWithoutRegistrationPerScannedLineItem,totalScanTimeInSeconds,trustLevel,valuePerSecond
0,0.0,0.547055,0.636364,0.021944,0.000604,0.018864,0.6,0.020690,9.655172e-01,0.000899,0.0,0.000000,0.575410,0.8,5.204847e-04
1,0.0,0.273627,0.454545,0.032468,0.004209,0.019545,0.8,0.057143,4.482759e-01,0.004303,0.2,0.014286,0.058470,0.4,2.540701e-03
2,0.0,0.621662,0.272727,0.020979,0.000180,0.047820,1.0,0.076923,4.137931e-01,0.000268,1.0,0.076923,0.827869,0.4,4.112189e-04
3,0.0,0.923192,0.727273,0.025078,0.000406,0.031834,0.8,0.027586,9.655172e-01,0.000522,0.4,0.013793,0.978142,1.0,5.169094e-04
4,0.0,0.815382,0.272727,0.010101,0.000634,0.030199,0.4,0.014815,8.965517e-01,0.002075,0.7,0.025926,0.234426,0.8,1.901561e-03
5,1.0,0.110911,1.000000,0.038462,0.001299,0.004266,0.4,0.015385,8.620690e-01,0.001107,0.5,0.019231,0.420219,0.0,1.444449e-04
6,0.0,0.556356,0.181818,0.016529,0.000618,0.050578,0.2,0.018182,3.448276e-01,0.001229,0.7,0.063636,0.160109,0.4,1.897680e-03
7,0.0,0.228023,0.000000,0.000000,0.000000,0.022802,0.8,0.080000,3.103448e-01,0.000198,0.8,0.080000,0.843716,0.2,1.480020e-04
8,0.0,0.654465,0.636364,0.023569,0.000662,0.024239,0.4,0.014815,8.965517e-01,0.000917,0.0,0.000000,0.525137,1.0,6.822279e-04
9,0.0,0.410841,0.909091,0.033670,0.001254,0.015216,0.8,0.029630,8.965517e-01,0.001223,0.2,0.007407,0.395628,0.2,5.682687e-04


In [78]:
# Classifier imports, grouped by scikit-learn sub-package.
# All names are imported from the public package paths. The private module
# paths used previously (e.g. `sklearn.ensemble.forest`,
# `sklearn.svm.classes`, `sklearn.linear_model.ridge`) were deprecated and
# later removed from scikit-learn; the public paths work on both old and new
# releases.
from sklearn.calibration import CalibratedClassifierCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import RidgeClassifierCV
from sklearn.linear_model import SGDClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multiclass import OutputCodeClassifier
from sklearn.multioutput import ClassifierChain
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.semi_supervised import LabelPropagation
from sklearn.semi_supervised import LabelSpreading
from sklearn.svm import LinearSVC
from sklearn.svm import NuSVC
from sklearn.svm import OneClassSVM
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
# Mixture models were left disabled in the original notebook:
# from sklearn.mixture import DPGMM
# from sklearn.mixture import GMM
# from sklearn.mixture import GaussianMixture
# from sklearn.mixture import VBGMM



# Untuned baseline classifiers, all with library defaults unless stated.
#
# Deliberately left out (reasons as recorded by the original author):
#   * did not work out of the box: OneClassSVM, RadiusNeighborsClassifier,
#     CalibratedClassifierCV, LabelPropagation, LabelSpreading, NuSVC
#   * too slow on a larger training set: MLPClassifier, GaussianProcessClassifier
#   * meta/ensemble wrappers that need a base estimator: ClassifierChain,
#     MultiOutputClassifier, OutputCodeClassifier, OneVsOneClassifier,
#     OneVsRestClassifier, VotingClassifier
#   * caused problems: LinearSVC(max_iter = 100000)
#   * disabled without a recorded reason: SVC, the mixture models (DPGMM, GMM,
#     GaussianMixture, VBGMM) and all regressors
model_factory = [
    # single trees
    ExtraTreeClassifier(),
    DecisionTreeClassifier(),
    # neighbours
    KNeighborsClassifier(),
    # linear models, part 1
    SGDClassifier(),
    RidgeClassifierCV(),
    RidgeClassifier(),
    PassiveAggressiveClassifier(),
    # ensembles
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    BaggingClassifier(),
    ExtraTreesClassifier(),
    RandomForestClassifier(),
    # naive Bayes / discriminant analysis
    BernoulliNB(),
    GaussianNB(),  # causing some problems
    LinearDiscriminantAnalysis(),  # causing some problems
    # linear models, part 2
    LogisticRegression(max_iter=10000),
    LogisticRegressionCV(max_iter=10000),
    MultinomialNB(),
    NearestCentroid(),
    Perceptron(),
    QuadraticDiscriminantAnalysis(),
]

from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import make_scorer


def my_custom_loss_func(y_true, y_pred):
    """Return the average monetary value per instance of a set of predictions.

    Pay-off per instance: +5 for a true positive, -5 for a false negative,
    -25 for a false positive and 0 for a true negative. The total is divided
    by ``len(y_true)``.

    Parameters
    ----------
    y_true : array-like of 0/1 labels (1 = fraud)
    y_pred : array-like of 0/1 predictions, same length as ``y_true``

    Returns
    -------
    float
        Mean pay-off; larger is better.
    """
    # `labels=[0, 1]` forces a 2x2 matrix. Without it, a fold that contains a
    # single class (e.g. leave-one-out) yields a 1x1 matrix and the unpacking
    # below raises ValueError.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    score = ((-25)*fp + (-5)*fn + 5*tp) / len(y_true)
    return score

# Despite the function's name this is a gain, not a loss: higher is better.
my_custom_score = make_scorer(my_custom_loss_func, greater_is_better=True)

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import LeaveOneOut

# 10-fold stratified splitter shared by the hyper-parameter searches defined below
# (shuffling is left at the library default).
skf = StratifiedKFold(n_splits=10)



from sklearn.model_selection import GridSearchCV


# Exhaustive grid searches, all scored with the monetary-value scorer on the
# shared stratified 10-fold split.
model_tuning_factory = [
    GridSearchCV(KNeighborsClassifier(),
                 dict(n_neighbors=range(1, 4),
                      weights=['uniform', 'distance']),
                 cv=skf,
                 scoring=my_custom_score),
    # The solver is pinned to 'liblinear' because it supports both the 'l1'
    # and the 'l2' penalty. Relying on the library default breaks the 'l1'
    # candidates on releases whose default solver is 'lbfgs' (l2 only).
    GridSearchCV(LogisticRegression(solver='liblinear', max_iter=10000),
                 dict(penalty=['l1', 'l2'],
                      fit_intercept=[True, False]),
                 cv=skf,
                 scoring=my_custom_score),
    # max_depth / max_leaf_nodes were left out of the grid by the original author.
    GridSearchCV(DecisionTreeClassifier(),
                 dict(criterion=['entropy', 'gini']),
                 cv=skf,
                 scoring=my_custom_score),
]


from sklearn.model_selection import ParameterGrid, RandomizedSearchCV

# Search spaces for the randomized searches below.
knn_search_space = dict(n_neighbors=list(range(1, 4)),
                        weights=['uniform', 'distance'])
logreg_search_space = dict(fit_intercept=[True, False])

# Randomized counterparts of the grid searches.
# `n_iter` is capped at the number of distinct candidates: asking for more
# draws than the space contains is rejected (or warned about, depending on
# the scikit-learn release). `random_state` makes the sampling repeatable.
model_tuning_factory2 = [
    RandomizedSearchCV(KNeighborsClassifier(),
                       knn_search_space,
                       cv=skf,
                       scoring=my_custom_score,
                       n_iter=min(10, len(ParameterGrid(knn_search_space))),
                       random_state=42),
    RandomizedSearchCV(LogisticRegression(max_iter=10000),
                       logreg_search_space,
                       cv=skf,
                       scoring=my_custom_score,
                       n_iter=min(10, len(ParameterGrid(logreg_search_space))),
                       random_state=42),
]

In [57]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import make_scorer

def my_custom_loss_func(y_true, y_pred):
    """Return the average monetary value per instance of a set of predictions.

    Pay-off per instance: +5 for a true positive, -5 for a false negative,
    -25 for a false positive and 0 for a true negative. The total is divided
    by ``len(y_true)``.

    Parameters
    ----------
    y_true : array-like of 0/1 labels (1 = fraud)
    y_pred : array-like of 0/1 predictions, same length as ``y_true``

    Returns
    -------
    float
        Mean pay-off; larger is better.
    """
    # `labels=[0, 1]` forces a 2x2 matrix. Without it, a fold that contains a
    # single class (e.g. leave-one-out) yields a 1x1 matrix and the unpacking
    # below raises ValueError.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    score = ((-25)*fp + (-5)*fn + 5*tp) / len(y_true)
    return score

# Despite the function's name this is a gain, not a loss: higher is better.
my_custom_score = make_scorer(my_custom_loss_func, greater_is_better=True)

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import LeaveOneOut

# Cross-validation splitters.
# skf: 10-fold stratified split (shuffling left at the library default).
skf = StratifiedKFold(n_splits=10)
# loo: leave-one-out split; not referenced by any cell visible in this notebook.
loo = LeaveOneOut()

# Take a smaller sample for quicker training

In [33]:
# Work on at most 50,000 rows to keep the searches fast.
# * min(...) avoids the ValueError that `sample` raises when more rows are
#   requested than the frame holds.
# * random_state makes the draw (and every result derived from it) repeatable.
# NOTE: this overwrites `train`; re-running the cell samples from the sample.
train = train.sample(n=min(50000, len(train)), random_state=42)

In [9]:
# Absolute and relative frequency of each target class in the current `train` frame.
fraud_counts = train.fraud.value_counts()
print(fraud_counts)
print(fraud_counts / len(train))

0    1775
1     104
Name: fraud, dtype: int64
0    0.944651
1    0.055349
Name: fraud, dtype: float64


In [10]:
# Target vector and feature matrix: every column except `fraud` is a feature.
Y = train['fraud']
X = train.drop(columns='fraud')

In [79]:
import time
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_classif

# For every feature count k = 1..n_features: keep the k best features
# (ANOVA F-test), run every hyper-parameter search on them and record the
# cross-validated monetary value of the best candidate.
#
# NOTE(review): the feature selection is fitted on all of X/Y before the
# cross-validation inside each search, so the recorded CV scores are
# somewhat optimistic.

result_columns = ["Model", "Feature Count", "Features", "Optimal Parameters", "Monetary Value Per Instance - Mean", "Monetary Value Per Instance - Standard Deviation", "Raw Model"]

# Rows are collected in a list and turned into a DataFrame once at the end;
# `DataFrame.append` no longer exists in pandas 2 and was quadratic anyway.
result_rows = []

for feature_count in range(1, len(list(X)) + 1):

    # The selection depends only on feature_count, so do it once per count
    # instead of once per model.
    best_features = SelectKBest(f_classif, k=feature_count).fit(X, Y)
    best_feature_list = X.columns[best_features.get_support()]
    X_selected_features = X[best_feature_list]

    for model in model_tuning_factory:

        # Seed the wrapped estimator where it supports seeding. The search
        # clones `model.estimator` on every fit, so the seed has to be set
        # there (assigning `model.seed` had no effect).
        if "random_state" in model.estimator.get_params():
            model.estimator.set_params(random_state=42)

        model.fit(X_selected_features, Y)

        result_rows.append({
            "Model": model.best_estimator_.__class__.__name__,
            "Feature Count": feature_count,
            "Features": best_feature_list.values,
            "Optimal Parameters": model.best_params_,
            "Monetary Value Per Instance - Mean": model.cv_results_['mean_test_score'][model.best_index_],
            "Monetary Value Per Instance - Standard Deviation": model.cv_results_['std_test_score'][model.best_index_],
            "Raw Model": model.best_estimator_,
        })

    # Progress report once per feature count (previously this line sat
    # outside the loop and therefore only fired after the last iteration).
    print("Iteration " + str(feature_count) + " finished")

result_table = pd.DataFrame(result_rows, columns=result_columns)

result_table.sort_values(by="Monetary Value Per Instance - Mean", ascending=False)

Iteration 14 finished


Unnamed: 0,Model,Feature Count,Features,Optimal Parameters,Monetary Value Per Instance - Mean,Monetary Value Per Instance - Standard Deviation,Raw Model
25,LogisticRegression,9,"[trustLevel, totalScanTimeInSeconds, lineItemV...","{'fit_intercept': True, 'penalty': 'l1'}",0.133049,0.129295,"LogisticRegression(C=1.0, class_weight=None, d..."
34,LogisticRegression,12,"[trustLevel, totalScanTimeInSeconds, lineItemV...","{'fit_intercept': True, 'penalty': 'l1'}",0.133049,0.129295,"LogisticRegression(C=1.0, class_weight=None, d..."
28,LogisticRegression,10,"[trustLevel, totalScanTimeInSeconds, lineItemV...","{'fit_intercept': True, 'penalty': 'l1'}",0.133049,0.129295,"LogisticRegression(C=1.0, class_weight=None, d..."
31,LogisticRegression,11,"[trustLevel, totalScanTimeInSeconds, lineItemV...","{'fit_intercept': True, 'penalty': 'l1'}",0.133049,0.129295,"LogisticRegression(C=1.0, class_weight=None, d..."
37,LogisticRegression,13,"[trustLevel, totalScanTimeInSeconds, grandTota...","{'fit_intercept': True, 'penalty': 'l1'}",0.077169,0.118054,"LogisticRegression(C=1.0, class_weight=None, d..."
40,LogisticRegression,14,"[trustLevel, totalScanTimeInSeconds, grandTota...","{'fit_intercept': True, 'penalty': 'l1'}",0.063864,0.135192,"LogisticRegression(C=1.0, class_weight=None, d..."
22,LogisticRegression,8,"[trustLevel, totalScanTimeInSeconds, scansWith...","{'fit_intercept': True, 'penalty': 'l1'}",0.055881,0.18499,"LogisticRegression(C=1.0, class_weight=None, d..."
35,DecisionTreeClassifier,12,"[trustLevel, totalScanTimeInSeconds, lineItemV...",{'criterion': 'gini'},-0.010644,0.201804,"DecisionTreeClassifier(class_weight=None, crit..."
7,LogisticRegression,3,"[trustLevel, totalScanTimeInSeconds, scannedLi...","{'fit_intercept': True, 'penalty': 'l2'}",-0.015966,0.160756,"LogisticRegression(C=1.0, class_weight=None, d..."
16,LogisticRegression,6,"[trustLevel, totalScanTimeInSeconds, scannedLi...","{'fit_intercept': True, 'penalty': 'l1'}",-0.015966,0.18926,"LogisticRegression(C=1.0, class_weight=None, d..."


In [27]:
def get_monetary_value(cm, y_holdout):
    """Print the cells of a binary confusion matrix and its monetary value.

    Parameters
    ----------
    cm : array with a ``ravel()`` method yielding exactly four values in the
        order (tn, fp, fn, tp).
    y_holdout : sized collection; only its length is used, as the number of
        evaluated instances.

    Returns
    -------
    The total value ``5*tp - 5*fn - 25*fp``.
    """
    tn, fp, fn, tp = cm.ravel()

    cell_report = (
        ("True negative: ", tn),
        ("False positive: ", fp),
        ("False negative: ", fn),
        ("True positive: ", tp),
    )
    for caption, count in cell_report:
        print(caption, count)

    score = 5 * tp - 5 * fn - 25 * fp
    n_instances = len(y_holdout)
    print(score, 'for ', n_instances, ' instances in the test set')
    print(score / n_instances, ' per instance in the test set')
    return score
        

# Pick the row with the highest mean cross-validated monetary value.
# `idxmax()` returns the index *label*, which is what `.loc` expects;
# `argmax()` is positional and only matched by coincidence of the default
# 0..n-1 index. The row is looked up once and reused.
best_row = result_table.loc[result_table["Monetary Value Per Instance - Mean"].idxmax()]
bestModel = best_row["Raw Model"]
bestFeatureSet = best_row["Features"]

# Refit the winning estimator on the selected feature subset.
bestModel.fit(X[list(bestFeatureSet)], Y)

# NOTE(review): the model is evaluated on the same rows it was just fitted
# on, so this is an in-sample figure even though the printed text says
# "test set".
# `labels=[0, 1]` keeps the matrix 2x2 even if only one class is predicted.
cm = confusion_matrix(Y, bestModel.predict(X[list(bestFeatureSet)]), labels=[0, 1])
monetary_value = get_monetary_value(cm, Y)

True negative:  48280
False positive:  1
False negative:  0
True positive:  1719
8570 for  50000  instances in the test set
0.1714  per instance in the test set
