In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [2]:
train=pd.read_csv('train.csv',sep='|')
test=pd.read_csv('test.csv',sep='|')

In [3]:
print(train.fraud.value_counts())
print(train.fraud.value_counts() / len(train))

0    1775
1     104
Name: fraud, dtype: int64
0    0.944651
1    0.055349
Name: fraud, dtype: float64


# Feature Engineering

In [4]:
# for training set

#train['totalScanTimeInMinutes'] = train['totalScanTimeInSeconds'] / 60.0
train['scannedLineItems'] = train['scannedLineItemsPerSecond'] * train['totalScanTimeInSeconds']
train['pricePerScannedLineItem'] = train['grandTotal'] / train['scannedLineItems']
train['scansWithoutRegistrationPerScannedLineItem'] = train['scansWithoutRegistration'] / train['scannedLineItems']
train['quantityModificationsPerScannedLineItem'] = train['quantityModifications'] / train['scannedLineItems']
train['lineItemVoidsPerSecond'] = train['lineItemVoids'] / train['totalScanTimeInSeconds']
#rain['scansWithoutRegistrationPerSecond'] = train['scansWithoutRegistration'] / train['totalScanTimeInSeconds']
#train['quantityModificationsPerSecond'] = train['quantityModifications'] / train['totalScanTimeInSeconds']
#train['secondsPerEuro'] = train['totalScanTimeInSeconds'] / train['grandTotal']
#train['lineItemVoidsPerEuro'] = train['lineItemVoids'] / train['grandTotal']
#train['scansWithoutRegistrationPerEuro'] = train['scansWithoutRegistration'] / train['grandTotal']
#train['quantityModificationsPerEuro'] = train['quantityModifications'] / train['grandTotal']


# for test set

#train['totalScanTimeInMinutes'] = train['totalScanTimeInSeconds'] / 60.0
test['scannedLineItems'] = test['scannedLineItemsPerSecond'] * test['totalScanTimeInSeconds']
test['pricePerScannedLineItem'] = test['grandTotal'] / test['scannedLineItems']
test['scansWithoutRegistrationPerScannedLineItem'] = test['scansWithoutRegistration'] / test['scannedLineItems']
test['quantityModificationsPerScannedLineItem'] = test['quantityModifications'] / test['scannedLineItems']
test['lineItemVoidsPerSecond'] = test['lineItemVoids'] / test['totalScanTimeInSeconds']
#rain['scansWithoutRegistrationPerSecond'] = train['scansWithoutRegistration'] / train['totalScanTimeInSeconds']
#train['quantityModificationsPerSecond'] = train['quantityModifications'] / train['totalScanTimeInSeconds']
#train['secondsPerEuro'] = train['totalScanTimeInSeconds'] / train['grandTotal']
#train['lineItemVoidsPerEuro'] = train['lineItemVoids'] / train['grandTotal']
#train['scansWithoutRegistrationPerEuro'] = train['scansWithoutRegistration'] / train['grandTotal']
#train['quantityModificationsPerEuro'] = train['quantityModifications'] / train['grandTotal']

# Larger training set due to rules found at Decision Tree Analysis

In [5]:
additional_no_frauds = test[test["trustLevel"] > 2.5]
additional_no_frauds = additional_no_frauds.assign(fraud = 0)

additional_frauds = test[(test["trustLevel"] < 1.5) &  (test["scannedLineItems"] > 20.5) & (test["valuePerSecond"] <= 0.118) & (test["scansWithoutRegistrationPerScannedLineItem"] > 0.168)]
additional_frauds = additional_frauds.assign(fraud = 1)


train = pd.concat([train, additional_no_frauds], ignore_index=True)
train = pd.concat([train, additional_frauds], ignore_index=True)

In [6]:
print(train.fraud.value_counts())
print(train.fraud.value_counts() / len(train))

0    334270
1     12217
Name: fraud, dtype: int64
0    0.96474
1    0.03526
Name: fraud, dtype: float64


In [7]:
from sklearn.tree import ExtraTreeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm.classes import OneClassSVM
from sklearn.neural_network.multilayer_perceptron import MLPClassifier
from sklearn.neighbors.classification import RadiusNeighborsClassifier
from sklearn.neighbors.classification import KNeighborsClassifier
from sklearn.multioutput import ClassifierChain
from sklearn.multioutput import MultiOutputClassifier
from sklearn.multiclass import OutputCodeClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model.stochastic_gradient import SGDClassifier
from sklearn.linear_model.ridge import RidgeClassifierCV
from sklearn.linear_model.ridge import RidgeClassifier
from sklearn.linear_model.passive_aggressive import PassiveAggressiveClassifier
from sklearn.gaussian_process.gpc import GaussianProcessClassifier
from sklearn.ensemble.voting_classifier import VotingClassifier
from sklearn.ensemble.weight_boosting import AdaBoostClassifier
from sklearn.ensemble.gradient_boosting import GradientBoostingClassifier
from sklearn.ensemble.bagging import BaggingClassifier
from sklearn.ensemble.forest import ExtraTreesClassifier
from sklearn.ensemble.forest import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.calibration import CalibratedClassifierCV
from sklearn.naive_bayes import GaussianNB
from sklearn.semi_supervised import LabelPropagation
from sklearn.semi_supervised import LabelSpreading
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import NearestCentroid
from sklearn.svm import NuSVC
from sklearn.linear_model import Perceptron
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.svm import SVC
# from sklearn.mixture import DPGMM
#from sklearn.mixture import GMM
#from sklearn.mixture import GaussianMixture
#from sklearn.mixture import VBGMM



model_factory = [
# RandomForestRegressor(),
# XGBRegressor(nthread=1),
 #MLPRegressor(),
# Ridge(),
# BayesianRidge(),
# ExtraTreesRegressor(),
# ElasticNet(),
# KNeighborsRegressor(),
# GradientBoostingRegressor()
  ExtraTreeClassifier(),
    DecisionTreeClassifier(),
 #   OneClassSVM(),  # doesn't work instantly
    
                  # MLPClassifier(),# takes very long for a larger training set
    
 #   RadiusNeighborsClassifier(), # doesn't work instantly
    KNeighborsClassifier(),
#    ClassifierChain(),     # ensemble method
#    MultiOutputClassifier(), # ensemble method
#    OutputCodeClassifier(), # ensemble method
#    OneVsOneClassifier(), # ensemble method
#    OneVsRestClassifier(), # ensemble method
    SGDClassifier(),
    RidgeClassifierCV(),
    RidgeClassifier(),
    PassiveAggressiveClassifier(),
    
                   # GaussianProcessClassifier(),  # takes very long for a larger training set
    
#    VotingClassifier(), # ensemble method
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    BaggingClassifier(),
    ExtraTreesClassifier(),
    RandomForestClassifier(),
    BernoulliNB(),
  #  CalibratedClassifierCV(), # doesn't work instantly
    GaussianNB(), # causing some problems
 #   LabelPropagation(), # doesn't work instantly
 #   LabelSpreading(),  # doesn't work instantly
    LinearDiscriminantAnalysis(), # causing some problems
 #   LinearSVC(max_iter = 100000), # causing some problems
    LogisticRegression(max_iter = 10000),
    LogisticRegressionCV(max_iter = 10000),
    MultinomialNB(),
    NearestCentroid(),
  #  NuSVC(),   # doesn't work instantly
    Perceptron(),
    QuadraticDiscriminantAnalysis(),
    # SVC(),
#    DPGMM(),
#    GMM(),
#    GaussianMixture(),
#    VBGMM()
]

In [8]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import make_scorer

def my_custom_loss_func(y_true, y_pred):
    cm = confusion_matrix(y_true,y_pred)
    tn, fp, fn, tp = cm.ravel()
    score = ((-25)*fp + (-5)*fn + 5*tp) / len(y_true)
    return (score)

my_custom_score = make_scorer(my_custom_loss_func, greater_is_better=True)

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import LeaveOneOut

skf = StratifiedKFold(n_splits=10)
loo = LeaveOneOut()

# Take a smaller sample for quicker training

In [9]:
train = train.sample(50000)

In [10]:
print(train.fraud.value_counts())
print(train.fraud.value_counts() / len(train))

0    48281
1     1719
Name: fraud, dtype: int64
0    0.96562
1    0.03438
Name: fraud, dtype: float64


In [11]:
Y = train['fraud']
X = train.drop('fraud',axis=1)

In [14]:
import time
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_classif

result_table = pd.DataFrame(columns=["Model", "Feature Count", "Features", "Monetary Value Per Instance - Mean", "Monetary Value Per Instance - Standard Deviation", "Raw Model"])


for feature_count in range(1,len(list(X))+1):

 for model in model_factory:
 
  t = time.time()
    
  best_features = SelectKBest(f_classif, k=feature_count).fit(X,Y)
  best_feature_list = X.columns[best_features.get_support()]

  X_selected_features = X[best_feature_list]
    
#  print("Started " + model.__class__.__name__ + "with " + str(feature_count) + " features:" + best_feature_list)

  model.seed = 42

  scores = cross_val_score(model, X_selected_features, Y, cv=skf, scoring=my_custom_score)
  model_name = model.__class__.__name__
  score_mean = scores.mean()
  score_std = scores.std()

#  print("Finished " + model.__class__.__name__ + " with " + str(feature_count) + " features:" + " after " + str(time.time() - t) + " seconds")
#  print("Monetary Value Per Instance - Mean " + str(score_mean))
#  print("Monetary Value Per Instance - Standard Deviation " + str(score_std))  
#  print("")  
    
  result_table = result_table.append({
     "Model": model_name,
     "Feature Count": feature_count,
     "Features": best_feature_list.values,
     "Monetary Value Per Instance - Mean":  score_mean,
     "Monetary Value Per Instance - Standard Deviation": score_std,
     "Raw Model": model
      }, ignore_index=True)
 
 print("Iteration " + str(feature_count) + " finished") 
    
result_table.sort_values(by = "Monetary Value Per Instance - Mean", ascending = False)

Iteration: 1 finished
Iteration: 2 finished
Iteration: 3 finished
Iteration: 4 finished
Iteration: 5 finished
Iteration: 6 finished
Iteration: 7 finished
Iteration: 8 finished
Iteration: 9 finished
Iteration: 10 finished
Iteration: 11 finished
Iteration: 12 finished
Iteration: 13 finished
Iteration: 14 finished


ValueError: k should be >=0, <= n_features = 14; got 15. Use k='all' to return all features.

In [21]:
result_table.sort_values(by = "Monetary Value Per Instance - Mean", ascending = False)

Unnamed: 0,Model,Feature Count,Features,Monetary Value Per Instance - Mean,Monetary Value Per Instance - Standard Deviation,Raw Model
219,BaggingClassifier,11,"[grandTotal, lineItemVoidsPerPosition, lineIte...",0.1695,0.003822,"BaggingClassifier(base_estimator=None, bootstr..."
263,RandomForestClassifier,13,"[grandTotal, lineItemVoidsPerPosition, lineIte...",0.1695,0.003822,"RandomForestClassifier(bootstrap=True, class_w..."
240,BaggingClassifier,12,"[grandTotal, lineItemVoidsPerPosition, lineIte...",0.1693,0.003738,"BaggingClassifier(base_estimator=None, bootstr..."
200,RandomForestClassifier,10,"[grandTotal, lineItemVoidsPerPosition, pricePe...",0.1691,0.00364,"RandomForestClassifier(bootstrap=True, class_w..."
156,BaggingClassifier,8,"[lineItemVoidsPerPosition, pricePerScannedLine...",0.1691,0.00364,"BaggingClassifier(base_estimator=None, bootstr..."
209,QuadraticDiscriminantAnalysis,10,"[grandTotal, lineItemVoidsPerPosition, pricePe...",0.169,0.002315,"QuadraticDiscriminantAnalysis(priors=None, reg..."
137,RandomForestClassifier,7,"[lineItemVoidsPerPosition, pricePerScannedLine...",0.169,0.004509,"RandomForestClassifier(bootstrap=True, class_w..."
32,RandomForestClassifier,2,"[scannedLineItems, trustLevel]",0.169,0.004509,"RandomForestClassifier(bootstrap=True, class_w..."
31,ExtraTreesClassifier,2,"[scannedLineItems, trustLevel]",0.169,0.004509,"ExtraTreesClassifier(bootstrap=False, class_we..."
29,GradientBoostingClassifier,2,"[scannedLineItems, trustLevel]",0.169,0.004509,GradientBoostingClassifier(criterion='friedman...


In [27]:
def get_monetary_value(cm, y_holdout):
        tn, fp, fn, tp = cm.ravel()
        print("True negative: ", tn)
        print("False positive: ", fp)
        print("False negative: ", fn)
        print("True positive: ", tp)
        score = (-25)*fp + (-5)*fn + 5*tp
        print(score, 'for ', len(y_holdout), ' instances in the test set')
        print(score/len(y_holdout), ' per instance in the test set')
        return score
        

bestModel = result_table.loc[result_table["Monetary Value Per Instance - Mean"].argmax()]["Raw Model"]
bestFeatureSet = result_table.loc[result_table["Monetary Value Per Instance - Mean"].argmax()]["Features"]

bestModel.fit(X[list(bestFeatureSet)], Y)

cm = confusion_matrix(Y, bestModel.predict(X[list(bestFeatureSet)]))
monetary_value = get_monetary_value(cm, Y)

True negative:  48280
False positive:  1
False negative:  0
True positive:  1719
8570 for  50000  instances in the test set
0.1714  per instance in the test set


# Feature Selection

In [53]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_classif

best_features = SelectKBest(f_classif, k=1).fit(X,Y)
best_feature_list = X.columns[best_features.get_support()]

#X = X[best_feature_list]

#X.columns

best_feature_list.values

array(['trustLevel'], dtype=object)

In [17]:
import time

result_table = pd.DataFrame(columns=["Model", "Monetary Value Per Instance - Mean", "Monetary Value Per Instance - Standard Deviation"])

for model in model_factory:
 
 t = time.time()
# print("Started " + model.__class__.__name__)

 model.seed = 42

 scores = cross_val_score(model, X, Y, cv=skf, scoring=my_custom_score)
 model_name = model.__class__.__name__
 score_mean = scores.mean()
 score_std = scores.std()

# print("Finished " + model.__class__.__name__ + " after " + str(time.time() - t) + " seconds")
# print("Monetary Value Per Instance - Mean " + str(score_mean))
# print("Monetary Value Per Instance - Standard Deviation " + str(score_std))  
# print("")  
    
 result_table = result_table.append({
     "Model": model_name,
     "Monetary Value Per Instance - Mean":  score_mean,
     "Monetary Value Per Instance - Standard Deviation": score_std
      }, ignore_index=True)

       
       
result_table.sort_values(by = "Monetary Value Per Instance - Mean", ascending = False)

  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  return (-0.5 * (norm2 + u) + np.log(self.priors_))


Unnamed: 0,Model,Monetary Value Per Instance - Mean,Monetary Value Per Instance - Standard Deviation
0,ExtraTreeClassifier,0.141,0.025474
9,BaggingClassifier,0.141,0.025474
19,Perceptron,0.141,0.025474
16,LogisticRegressionCV,0.141,0.025474
15,LogisticRegression,0.141,0.025474
14,LinearDiscriminantAnalysis,0.141,0.025474
13,GaussianNB,0.141,0.025474
11,RandomForestClassifier,0.141,0.025474
1,DecisionTreeClassifier,0.141,0.025474
10,ExtraTreesClassifier,0.141,0.025474
