<h1 style="color:royalblue; font-size:3em"> This serves as a baseline notebook to be imported by other notebooks </h1>

In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
# Silence the warning categories that would otherwise flood the cell outputs.
for _category in (FutureWarning, DeprecationWarning, UserWarning):
    warnings.simplefilter("ignore", category=_category)

# Raise pandas' display limits so wide/long frames are shown without truncation.
for _option, _value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(_option, _value)

# Parallelism setting; not used in this part of the notebook.
# NOTE(review): presumably passed as `n_jobs` to scikit-learn by the importing notebooks - confirm.
njobs = -1

In [22]:
# Load the training data; the file uses '|' as its field separator.
train = pd.read_csv('../../data/train.csv', delimiter='|')

In [23]:
# Class balance of the target: absolute counts first, then each class's share of all rows.
fraud_counts = train.fraud.value_counts()
print(fraud_counts)
print(fraud_counts / len(train))

0    1775
1     104
Name: fraud, dtype: int64
0    0.944651
1    0.055349
Name: fraud, dtype: float64


# Feature Engineering

In [25]:
# Scan rate multiplied by scan duration; the ratio features below build on this column.
train['scannedLineItems'] = train['scannedLineItemsPerSecond'] * train['totalScanTimeInSeconds']

# Every remaining engineered feature is a plain ratio of two existing columns,
# listed as (new column, numerator, denominator). The listing order determines
# the order in which the new columns are added to `train`.
_RATIO_FEATURES = (
    ('pricePerScannedLineItem', 'grandTotal', 'scannedLineItems'),
    ('scansWithoutRegistrationPerScannedLineItem', 'scansWithoutRegistration', 'scannedLineItems'),
    ('quantityModificationsPerScannedLineItem', 'quantityModifications', 'scannedLineItems'),
    ('lineItemVoidsPerSecond', 'lineItemVoids', 'totalScanTimeInSeconds'),
    ('scansWithoutRegistrationPerSecond', 'scansWithoutRegistration', 'totalScanTimeInSeconds'),
    ('quantityModificationsPerSecond', 'quantityModifications', 'totalScanTimeInSeconds'),
    ('secondsPerEuro', 'totalScanTimeInSeconds', 'grandTotal'),
    ('lineItemVoidsPerEuro', 'lineItemVoids', 'grandTotal'),
    ('scansWithoutRegistrationPerEuro', 'scansWithoutRegistration', 'grandTotal'),
    ('quantityModificationsPerEuro', 'quantityModifications', 'grandTotal'),
)
for _feature, _numerator, _denominator in _RATIO_FEATURES:
    train[_feature] = train[_numerator] / train[_denominator]

# Declare global variables

In [24]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import LeaveOneOut

# Cross-validation splitters shared by the notebooks that import this one
loo = LeaveOneOut()
skf = StratifiedKFold(n_splits=10)

# Separate the *train* frame into the target vector and the feature matrix
Y = train['fraud']
X = train.drop(columns='fraud')

# Custom scoring function (monetary value per instance; higher is better)

In [26]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import make_scorer

def my_custom_loss_func(y_true, y_pred):
    """Return the average monetary value per instance of a set of predictions.

    The confusion matrix is built with the labels fixed to ``[0, 1]`` and its
    cells are weighted as follows: +5 for each true positive, -5 for each
    false negative, -25 for each false positive; true negatives contribute
    nothing. The weighted sum is divided by the number of instances.

    Parameters
    ----------
    y_true : sequence of actual class labels (0 or 1).
    y_pred : sequence of predicted class labels (0 or 1), same length.

    Returns
    -------
    The weighted sum divided by ``len(y_true)``.
    """
    counts = confusion_matrix(y_true, y_pred, labels=[0, 1])
    # Row-major flattening of the 2x2 matrix: tn, fp, fn, tp.
    _, false_pos, false_neg, true_pos = counts.ravel()
    total_value = 5 * true_pos - 5 * false_neg - 25 * false_pos
    return total_value / len(y_true)

# Wrap the monetary-value function as a scikit-learn scorer; larger values are better.
my_custom_score = make_scorer(score_func=my_custom_loss_func, greater_is_better=True)

# Function to run loops for training and cross validation

In [27]:
import time
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_classif

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.preprocessing import MinMaxScaler 
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer

def run():
    """Fit every search object for every data preparation and feature count.

    For each of the four data preparations (no scaling, ``MinMaxScaler``,
    ``StandardScaler``, ``log1p``) and for each ``k`` from 1 to the number of
    columns of ``X``, the ``k`` best features are chosen with
    ``SelectKBest(f_classif)`` on the prepared data. Every search object in
    ``model_tuning_factory`` is then fitted on those features and one row per
    fit is added to the global ``result_table``.

    Reads the globals ``X``, ``Y``, ``model_tuning_factory`` and
    ``result_table``; rebinds ``result_table`` (rows are appended in the order
    the fits finish, also when the run is interrupted or fails part-way).

    Returns
    -------
    A copy of ``result_table`` sorted by "Monetary Value Per Instance - Mean",
    best first. The global ``result_table`` itself keeps its insertion order.
    """
    global result_table

    # Rows are collected here and added to result_table with a single concat:
    # DataFrame.append no longer exists in pandas >= 2.0 and growing a frame
    # row by row is quadratic.
    records = []
    try:
        # four types of data preparation: no preparation, MinMaxScaler, StandardScaler, log1p
        for data_preparation_step in range(1, 5):
            if data_preparation_step == 1:
                X_scaled = X
                data_preparation = "No Scaling"
            elif data_preparation_step == 2:
                feature_scaler = MinMaxScaler()
                X_scaled = pd.DataFrame(feature_scaler.fit_transform(X.values), columns=X.columns, index=X.index)
                data_preparation = "MinMaxScaler"
            elif data_preparation_step == 3:
                feature_scaler = StandardScaler()
                X_scaled = pd.DataFrame(feature_scaler.fit_transform(X.values), columns=X.columns, index=X.index)
                data_preparation = "StandardScaler"
            elif data_preparation_step == 4:
                transformer = FunctionTransformer(np.log1p, validate=True)
                X_scaled = pd.DataFrame(transformer.transform(X), columns=X.columns, index=X.index)
                data_preparation = "LogScaler"

            for feature_count in range(1, X.shape[1] + 1):
                # The selection does not depend on the model, so it is done once per
                # feature count instead of once per model.
                best_features = SelectKBest(f_classif, k=feature_count).fit(X_scaled, Y)
                best_feature_list = X.columns[best_features.get_support()]
                # NOTE(review): the models are fitted on the *unscaled* columns of X;
                # X_scaled only influences which features get selected. This looks
                # unintended, but BestModel.predict also feeds unscaled X to the stored
                # models, so switching to X_scaled here must be done together with
                # that class.
                X_selected_features = X[best_feature_list]

                for model in model_tuning_factory:   # replace with model_tuning_factory_randomized for faster results
                    start_time = time.time()

                    # NOTE(review): this only sets a plain attribute on the search object;
                    # it is probably not what seeds the underlying estimator - confirm.
                    model.seed = 42
                    model.fit(X_selected_features, Y)
                    model_name = model.best_estimator_.__class__.__name__
                    score_mean = model.cv_results_['mean_test_score'][model.best_index_]
                    score_std = model.cv_results_['std_test_score'][model.best_index_]

                    end_time = time.time()

                    print("Finished " + model_name + " with " + data_preparation + " and " + str(feature_count) + " features after " + str(end_time - start_time) + " seconds")

                    records.append({
                        "Model": model_name,
                        "Data Preparation": data_preparation,
                        "Feature Count": feature_count,
                        "Features": best_feature_list.values,
                        "Optimal Parameters": model.best_params_,
                        "Monetary Value Per Instance - Mean": score_mean,
                        "Monetary Value Per Instance - Standard Deviation": score_std,
                        "Time needed": end_time - start_time,
                        "Raw Model": model.best_estimator_,
                    })
    finally:
        # Keep whatever finished, even if a fit raised or the run was interrupted.
        if records:
            result_table = pd.concat([result_table, pd.DataFrame(records)], ignore_index=True)

    # The original computed this sorted view and threw it away; hand it back to the
    # caller instead, without reordering the global table.
    return result_table.sort_values(by="Monetary Value Per Instance - Mean", ascending=False)

# Plot number of features against monetary value

In [28]:
def plot_number_features():
    """Scatter the mean monetary value of every recorded run against its feature count.

    Reads the global ``result_table``. Also sets the ``figure.figsize`` entry
    of matplotlib's ``rcParams`` to 10x10, which stays in effect after the call.
    """
    plt.rcParams.update({'figure.figsize': (10, 10)})

    feature_counts = result_table["Feature Count"]
    mean_values = result_table["Monetary Value Per Instance - Mean"]
    plt.scatter(feature_counts, mean_values)

    plt.ylabel('Monetary Value Per Instance - Mean', fontsize=16)
    plt.xlabel('Number of features', fontsize=16)

# Class to store the best model 

In [1]:
class BestModel:
    """Handle on the ``rank``-th best run recorded in the global ``result_table``.

    Runs are ranked by "Monetary Value Per Instance - Mean" in descending
    order: ``rank`` 0 is the highest-scoring run, ``rank`` 1 the second
    highest, and so on. Ties keep their order of appearance in the table.
    """

    # Value of the "Raw Model" column of the selected run; None until set() has run.
    best_model = None
    # Value of the "Features" column of the selected run; None until set() has run.
    best_model_features = None
    # 0-based position in the descending score ranking.
    rank = 0

    def __init__(self, rank):
        """Remember which position of the ranking this object refers to."""
        self.rank = rank

    def set(self):
        """Look up the run at position ``self.rank`` of the ranking and store it.

        Raises
        ------
        IndexError
            If ``self.rank`` is negative or not smaller than the number of
            rows in ``result_table``.
        """
        scores = result_table["Monetary Value Per Instance - Mean"].astype(float).values
        if not 0 <= self.rank < len(scores):
            raise IndexError(
                "rank " + str(self.rank) + " is out of range for a result table with "
                + str(len(scores)) + " rows"
            )
        # Stable sort on the negated scores: descending order, ties in table order
        # (so rank 0 is the same row argmax would give), NaN scores last.
        order = np.argsort(-scores, kind="stable")
        position = order[self.rank]
        # Positional access, so the lookup does not depend on the table's index labels.
        self.best_model = result_table["Raw Model"].iloc[position]
        self.best_model_features = result_table["Features"].iloc[position]

    def predict(self):
        """Refresh the selection, then predict on the selected columns of the global ``X``."""
        self.set()
        return self.best_model.predict(X[self.best_model_features])

    def print_best_model(self):
        """Print the stored model and its feature names (both ``None`` before ``set()``)."""
        print(self.best_model)
        print(self.best_model_features)

SyntaxError: invalid syntax (<ipython-input-1-0b24783c38e5>, line 11)

# Calculate performance of the best model

In [30]:
def get_monetary_value(best_model):
    """Print and return the total monetary value of ``best_model``'s predictions.

    The predictions returned by ``best_model.predict()`` are compared with the
    global ``Y``. Each true positive counts +5, each false negative -5 and
    each false positive -25; true negatives count 0.

    Parameters
    ----------
    best_model : object with a no-argument ``predict()`` method (e.g. ``BestModel``).

    Returns
    -------
    The total (not per-instance) monetary value.
    """
    # Fix the label order so the matrix is always 2x2, even when only one class
    # occurs in the data or in the predictions (same call as my_custom_loss_func).
    cm = confusion_matrix(Y, best_model.predict(), labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    size = tn + fp + fn + tp
    print("True negative: ", tn)
    print("False positive: ", fp)
    print("False negative: ", fn)
    print("True positive: ", tp)
    score = (-25)*fp + (-5)*fn + 5*tp
    print(score, 'for ', size, ' instances in the test set')
    print(score/size, ' per instance in the test set')
    return score