In [144]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
np.set_printoptions(suppress=True)

In [145]:
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter("ignore", category=DeprecationWarning) 
warnings.simplefilter("ignore", UserWarning)

In [146]:
data = pd.read_csv("train.csv", sep="|")

In [147]:
data['totalScanTimeInMinutes'] = data['totalScanTimeInSeconds'] / 60.0
data['scannedLineItems'] = data['scannedLineItemsPerSecond'] * data['totalScanTimeInSeconds']
data['pricePerScannedLineItem'] = data['grandTotal'] / data['scannedLineItems']
data['scansWithoutRegistrationPerScannedLineItem'] = data['scansWithoutRegistration'] / data['scannedLineItems']
data['quantityModificationsPerScannedLineItem'] = data['quantityModifications'] / data['scannedLineItems']
data['lineItemVoidsPerSecond'] = data['lineItemVoids'] / data['totalScanTimeInSeconds']
data['scansWithoutRegistrationPerSecond'] = data['scansWithoutRegistration'] / data['totalScanTimeInSeconds']
data['quantityModificationsPerSecond'] = data['quantityModifications'] / data['totalScanTimeInSeconds']
data['secondsPerEuro'] = data['totalScanTimeInSeconds'] / data['grandTotal']
data['lineItemVoidsPerEuro'] = data['lineItemVoids'] / data['grandTotal']
data['scansWithoutRegistrationPerEuro'] = data['scansWithoutRegistration'] / data['grandTotal']
data['quantityModificationsPerEuro'] = data['quantityModifications'] / data['grandTotal']

# PCA Analysis

In [148]:
from sklearn.decomposition import PCA

y = data['fraud']
x = data.drop('fraud',axis=1)
# feature extraction
pca = PCA(n_components=3)

fit = pca.fit(x)

print("Principal Components:")   
print(fit.components_)

print("Explained Variance:") 
print(fit.explained_variance_ratio_)

Principal Components:
[[ 0.00001698  0.0049428  -0.00056844  0.00002907  0.00001725 -0.00002021
  -0.00000058 -0.00000253  0.000002    0.00008238 -0.00003167 -0.00007928
   0.00000057 -0.00000319 -0.00000021 -0.0000002  -0.00000012  0.99995003
   0.00678576  0.0053964   0.00006977]
 [-0.00007792 -0.99984834 -0.00005382 -0.00011805 -0.00005007  0.00003224
   0.00013816  0.0004883   0.00002957 -0.01666414 -0.00016748  0.00043093
   0.00004846  0.00003547  0.0000553   0.00004999  0.00002562  0.00493638
   0.00070076  0.00046868  0.00011373]
 [ 0.00290566  0.00002185  0.98648227  0.00075986 -0.00307059 -0.0005656
  -0.00015973  0.00322995  0.00162239  0.00000036 -0.01467797  0.16291
   0.00213059  0.00090875 -0.00002001 -0.00012738  0.00000699  0.00058634
  -0.00182294 -0.00006399 -0.00753071]]
Explained Variance:
[0.97722355 0.0226862  0.00006889]


-> Consider PCA Regression!

# Extra Trees Classifier

In [149]:
from sklearn.ensemble import ExtraTreesClassifier
# load data

y = data['fraud']
x = data.drop('fraud',axis=1)

# feature extraction
model = ExtraTreesClassifier(criterion = 'entropy')
model.fit(x, y)

pd.DataFrame(model.feature_importances_, list(x), columns =['Importance']).sort_values(by='Importance', ascending = False)

Unnamed: 0,Importance
trustLevel,0.314698
scannedLineItems,0.204901
scannedLineItemsPerSecond,0.052379
totalScanTimeInSeconds,0.04481
scansWithoutRegistrationPerScannedLineItem,0.038406
totalScanTimeInMinutes,0.036784
quantityModificationsPerScannedLineItem,0.030213
scansWithoutRegistration,0.029208
lineItemVoids,0.027214
pricePerScannedLineItem,0.025388


# SelectKBest

In [150]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

y = data['fraud']
x = data.drop('fraud',axis=1)

for feature_count in range(1, len(list(x)) + 1):
    best_features = SelectKBest(f_classif, k=feature_count).fit(x,y)
    print("Feature Count: " + str(feature_count) + ":")
    print(x.columns[best_features.get_support()].values)
    print()

Feature Count: 1:
['trustLevel']

Feature Count: 2:
['trustLevel' 'scannedLineItems']

Feature Count: 3:
['trustLevel' 'totalScanTimeInSeconds' 'scannedLineItems']

Feature Count: 4:
['trustLevel' 'totalScanTimeInSeconds' 'totalScanTimeInMinutes'
 'scannedLineItems']

Feature Count: 5:
['trustLevel' 'totalScanTimeInSeconds' 'totalScanTimeInMinutes'
 'scannedLineItems' 'pricePerScannedLineItem']

Feature Count: 6:
['trustLevel' 'totalScanTimeInSeconds' 'totalScanTimeInMinutes'
 'scannedLineItems' 'pricePerScannedLineItem'
 'quantityModificationsPerScannedLineItem']

Feature Count: 7:
['trustLevel' 'totalScanTimeInSeconds' 'totalScanTimeInMinutes'
 'scannedLineItems' 'pricePerScannedLineItem'
 'scansWithoutRegistrationPerScannedLineItem'
 'quantityModificationsPerScannedLineItem']

Feature Count: 8:
['trustLevel' 'totalScanTimeInSeconds' 'lineItemVoidsPerPosition'
 'totalScanTimeInMinutes' 'scannedLineItems' 'pricePerScannedLineItem'
 'scansWithoutRegistrationPerScannedLineItem'
 'quanti

# Recursive Feature Elimination -> Model Specific

In [151]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import LeaveOneOut

skf = StratifiedKFold(n_splits=10)
loo = LeaveOneOut()

In [152]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import make_scorer

def my_custom_loss_func(y_true, y_pred):
    cm = confusion_matrix(y_true,y_pred)
    tn, fp, fn, tp = cm.ravel()
    score = ((-25)*fp + (-5)*fn + 5*tp) / len(y_true)
    return (score)

my_custom_score = make_scorer(my_custom_loss_func, greater_is_better=True)

In [153]:
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble.gradient_boosting import GradientBoostingClassifier

rfecv = RFECV(GradientBoostingClassifier(), min_features_to_select=1, cv=skf, scoring=my_custom_score)
rfecv = rfecv.fit(x,y)
print(str(rfecv.n_features_) + " features:")
print(np.array(list(x))[np.array(rfecv.support_)])

12 features:
['trustLevel' 'totalScanTimeInSeconds' 'lineItemVoids'
 'scansWithoutRegistration' 'scannedLineItemsPerSecond' 'valuePerSecond'
 'lineItemVoidsPerPosition' 'totalScanTimeInMinutes' 'scannedLineItems'
 'scansWithoutRegistrationPerScannedLineItem' 'secondsPerEuro'
 'scansWithoutRegistrationPerEuro']


In [143]:
best_model = rfecv.estimator_

from sklearn.metrics import confusion_matrix

def get_monetary_value(cm):
        tn, fp, fn, tp = cm.ravel()
        print("True negative: ", tn)
        print("False positive: ", fp)
        print("False negative: ", fn)
        print("True positive: ", tp)
        score = (-25)*fp + (-5)*fn + 5*tp
        print(str(score) + " for " + str(sum(sum(cm))) + " instances in the test set")
        print(str(score/sum(sum(cm))) + " per instance in the test set")
        return score
    
cm = confusion_matrix(y , best_model.predict(x.loc[:, rfecv.support_]))

monetary_value = get_monetary_value(cm)

True negative:  1775
False positive:  0
False negative:  0
True positive:  104
520 for 1879 instances in the test set
0.2767429483767962 per instance in the test set


# SelectFromModel -> Model Specific

In [136]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier

model = SelectFromModel(LogisticRegression())
model.fit(x,y)

print(str(sum(np.array(model.get_support()))) + " features: ")
print(np.array(list(x))[np.array(model.get_support())])

5 features: 
['trustLevel' 'valuePerSecond' 'lineItemVoidsPerPosition'
 'pricePerScannedLineItem' 'scansWithoutRegistrationPerScannedLineItem']


In [138]:
best_model = LogisticRegression().fit(x.loc[:, model.get_support()], y)

from sklearn.metrics import confusion_matrix

def get_monetary_value(cm):
        tn, fp, fn, tp = cm.ravel()
        print("True negative: ", tn)
        print("False positive: ", fp)
        print("False negative: ", fn)
        print("True positive: ", tp)
        score = (-25)*fp + (-5)*fn + 5*tp
        print(str(score) + " for " + str(sum(sum(cm))) + " instances in the test set")
        print(str(score/sum(sum(cm))) + " per instance in the test set")
        return score
    
cm = confusion_matrix(y , best_model.predict(x.loc[:, model.get_support()]))

monetary_value = get_monetary_value(cm)

True negative:  1762
False positive:  13
False negative:  86
True positive:  18
-665 for 1879 instances in the test set
-0.3539116551357105 per instance in the test set
