In [2]:
%run main.ipynb

0    1775
1     104
Name: fraud, dtype: int64
0    0.944651
1    0.055349
Name: fraud, dtype: float64


In [3]:
# Empty table (column headers only) that holds the training results.
result_table = pd.DataFrame(
    columns=[
        "Model",
        "Data Preparation",
        "Feature Count",
        "Features",
        "Optimal Parameters",
        "Monetary Value Per Instance - Mean",
        "Monetary Value Per Instance - Standard Deviation",
        "Time needed",
        "Raw Model",
    ]
)

In [7]:
from itertools import chain
def get_dict_concat(d1, d2):
    """Merge two mappings into a single new dict.

    The entries of ``d1`` are inserted first, followed by those of ``d2``;
    when a key occurs in both, the value from ``d2`` is the one kept.
    Neither input is modified.
    """
    merged = {}
    for mapping in (d1, d2):
        merged.update(mapping.items())
    return merged

# Model factory -> only BaggingClassifier (LogisticRegression base estimator) for in-depth analysis

In [8]:
from sklearn.ensemble.bagging import BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors.classification import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline

# Stratified 10-fold cross-validation splitter, passed as `cv` to the grid search below.
# NOTE(review): StratifiedKFold is not imported in this cell — presumably it is
# brought into scope by `%run main.ipynb`; confirm.
skf = StratifiedKFold(n_splits=10)

# Search space for the BaggingClassifier wrapper itself.
# Currently not searched (left at their defaults): bootstrap,
# bootstrap_features, warm_start.
param_bagging = dict(
    n_estimators=[50, 120],
    oob_score=[True, False],
)

# Search spaces for the wrapped base estimators. The "base_estimator__"
# prefix addresses a parameter of the model nested inside the bagging wrapper.
param_dt = dict(
    base_estimator__max_leaf_nodes=[5, 10, 20, 30],
)

param_knn = dict(
    base_estimator__n_neighbors=[15],
    base_estimator__weights=["distance"],
    base_estimator__p=[1],
)

param_logistics = dict(
    base_estimator__solver=["newton-cg"],
    base_estimator__C=[1.2],
    base_estimator__fit_intercept=[True],
    base_estimator__class_weight=[None, "balanced"],
    base_estimator__max_iter=[100000],
)


# Grid searches to execute. Only the bagged LogisticRegression search is active.
# Two further searches are switched off at the moment:
#   - BaggingClassifier(DecisionTreeClassifier()) with param_bagging + param_dt
#   - BaggingClassifier(KNeighborsClassifier())   with param_bagging + param_knn
# Both used the same cv / scoring as below, with n_jobs = 4.
model_tuning_factory = [
    GridSearchCV(
        BaggingClassifier(LogisticRegression(n_jobs=4)),
        param_grid=get_dict_concat(param_bagging, param_logistics),
        scoring=my_custom_score,
        cv=skf,
        n_jobs=8,
    ),
]

In [None]:
# Start the tuning run. run() is not defined in the visible cells —
# presumably it is provided by main.ipynb (loaded via %run); TODO confirm
# which globals it reads and what it writes.
run()

In [9]:
# Replace the in-memory results table with the one stored on disk.
# NOTE(review): presumably this file was written by an earlier run(); confirm.
# Unpickling can execute arbitrary code, so only load pickle files from a
# trusted source.
result_table = pd.read_pickle("result_table_BaggingClassifier.pkl")

In [11]:
# Rank the results by mean monetary value per instance (best first) and
# renumber the rows 0..n-1 so that the row label equals the rank.
result_table = (
    result_table
    .sort_values(by="Monetary Value Per Instance - Mean", ascending=False)
    .reset_index(drop=True)
)
result_table

Unnamed: 0,Model,Data Preparation,Feature Count,Features,Optimal Parameters,Monetary Value Per Instance - Mean,Monetary Value Per Instance - Standard Deviation,Time needed,Raw Model,Feature Selection Technique
0,BaggingClassifier,No Scaling,16,"[trustLevel, totalScanTimeInSeconds, lineItemV...","{'base_estimator__C': 1.2, 'base_estimator__cl...",0.167642,0.117763,,"(LogisticRegression(C=1.2, class_weight=None, ...",SelectKBest
1,BaggingClassifier,StandardScaler,15,"[trustLevel, totalScanTimeInSeconds, lineItemV...","{'base_estimator__C': 1.2, 'base_estimator__cl...",0.167642,0.118947,,"(LogisticRegression(C=1.2, class_weight=None, ...",SelectKBest
2,BaggingClassifier,LogScaler,12,"[trustLevel, totalScanTimeInSeconds, lineItemV...","{'base_estimator__C': 1.2, 'base_estimator__cl...",0.167642,0.117763,,"(LogisticRegression(C=1.2, class_weight=None, ...",SelectKBest
3,BaggingClassifier,StandardScaler,17,"[trustLevel, totalScanTimeInSeconds, lineItemV...","{'base_estimator__C': 1.2, 'base_estimator__cl...",0.16232,0.103928,,"(LogisticRegression(C=1.2, class_weight=None, ...",SelectKBest
4,BaggingClassifier,StandardScaler,9,"[trustLevel, totalScanTimeInSeconds, lineItemV...","{'base_estimator__C': 1.2, 'base_estimator__cl...",0.16232,0.121516,,"(LogisticRegression(C=1.2, class_weight=None, ...",SelectKBest
5,BaggingClassifier,StandardScaler,14,"[trustLevel, totalScanTimeInSeconds, lineItemV...","{'base_estimator__C': 1.2, 'base_estimator__cl...",0.16232,0.099857,,"(LogisticRegression(C=1.2, class_weight=None, ...",SelectKBest
6,BaggingClassifier,StandardScaler,10,"[trustLevel, totalScanTimeInSeconds, lineItemV...","{'base_estimator__C': 1.2, 'base_estimator__cl...",0.16232,0.121516,,"(LogisticRegression(C=1.2, class_weight=None, ...",SelectKBest
7,BaggingClassifier,No Scaling,9,"[trustLevel, totalScanTimeInSeconds, lineItemV...","{'base_estimator__C': 1.2, 'base_estimator__cl...",0.16232,0.121516,,"(LogisticRegression(C=1.2, class_weight=None, ...",SelectKBest
8,BaggingClassifier,StandardScaler,13,"[trustLevel, totalScanTimeInSeconds, lineItemV...","{'base_estimator__C': 1.2, 'base_estimator__cl...",0.16232,0.099857,,"(LogisticRegression(C=1.2, class_weight=None, ...",SelectKBest
9,BaggingClassifier,StandardScaler,18,"[trustLevel, totalScanTimeInSeconds, lineItemV...","{'base_estimator__C': 1.2, 'base_estimator__cl...",0.159659,0.102456,,"(LogisticRegression(C=1.2, class_weight=None, ...",SelectKBest


In [12]:
# Pick the row with the highest mean monetary value per instance.
# idxmax() returns the index *label* of the maximum, which is what the
# label-based .loc indexer expects. Series.argmax() returns an integer
# position in current pandas (and was a deprecated alias of idxmax in older
# releases), so combining it with .loc is only correct while the index is
# exactly 0..n-1. The lookup is done once and reused for all three fields.
best_row = result_table.loc[result_table["Monetary Value Per Instance - Mean"].idxmax()]

best_model = best_row["Raw Model"]
best_model_features = best_row["Features"]
best_parameters = best_row["Optimal Parameters"]

print(best_model)
print(best_model_features)
print(best_parameters)

BaggingClassifier(base_estimator=LogisticRegression(C=1.2, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100000, multi_class='ovr',
          n_jobs=4, penalty='l2', random_state=None, solver='newton-cg',
          tol=0.0001, verbose=0, warm_start=False),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=1.0, n_estimators=50, n_jobs=1, oob_score=True,
         random_state=None, verbose=0, warm_start=False)
['trustLevel' 'totalScanTimeInSeconds' 'lineItemVoids'
 'scansWithoutRegistration' 'scannedLineItemsPerSecond' 'valuePerSecond'
 'lineItemVoidsPerPosition' 'scannedLineItems' 'pricePerScannedLineItem'
 'scansWithoutRegistrationPerScannedLineItem'
 'quantityModificationsPerScannedLineItem' 'lineItemVoidsPerSecond'
 'scansWithoutRegistrationPerSecond' 'quantityModificationsPerSecond'
 'secondsPerEuro' 'quantityModificationsPerEuro']
{'base_estimator__C': 1.2, 'base_estimator__class_weight': None, 'ba

# Print performance of best 10 models

In [13]:
# Report the ten best-ranked models: range(10) yields ranks 0-9, i.e. exactly
# ten models (range(0, 11) would also visit an eleventh rank).
for rank in range(10):
    best = BestModel(rank)
    # NOTE(review): get_monetary_value is defined outside this section; it
    # presumably also prints the confusion-matrix summary — confirm in main.ipynb.
    monetary_value = get_monetary_value(best)
    print()
    best.print_best_model()
    print("-----------------------------------------------------------------------------------------------")

True negative:  1771
False positive:  4
False negative:  7
True positive:  97
350 for  1879  instances in the test set
0.18626929217668972  per instance in the test set

BaggingClassifier(base_estimator=LogisticRegression(C=1.2, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100000, multi_class='ovr',
          n_jobs=4, penalty='l2', random_state=None, solver='newton-cg',
          tol=0.0001, verbose=0, warm_start=False),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=1.0, n_estimators=50, n_jobs=1, oob_score=True,
         random_state=None, verbose=0, warm_start=False)
['trustLevel' 'totalScanTimeInSeconds' 'lineItemVoids'
 'scansWithoutRegistration' 'scannedLineItemsPerSecond' 'valuePerSecond'
 'lineItemVoidsPerPosition' 'scannedLineItems' 'pricePerScannedLineItem'
 'scansWithoutRegistrationPerScannedLineItem'
 'quantityModificationsPerScannedLineItem' 'lineItemVoidsPerSecond'
 'scansWithoutRegi

True negative:  1770
False positive:  5
False negative:  7
True positive:  97
325 for  1879  instances in the test set
0.1729643427354976  per instance in the test set

BaggingClassifier(base_estimator=LogisticRegression(C=1.2, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100000, multi_class='ovr',
          n_jobs=4, penalty='l2', random_state=None, solver='newton-cg',
          tol=0.0001, verbose=0, warm_start=False),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=1.0, n_estimators=50, n_jobs=1, oob_score=False,
         random_state=None, verbose=0, warm_start=False)
['trustLevel' 'totalScanTimeInSeconds' 'lineItemVoids'
 'scansWithoutRegistration' 'scannedLineItemsPerSecond' 'valuePerSecond'
 'lineItemVoidsPerPosition' 'scannedLineItems' 'pricePerScannedLineItem'
 'scansWithoutRegistrationPerScannedLineItem'
 'quantityModificationsPerScannedLineItem' 'lineItemVoidsPerSecond'
 'quantityModifica