In [7]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import Normalizer

In [8]:
WORKING_DIR = os.getcwd()
PATH_DATA = os.getcwd()+"/Data/"

In [23]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
data_train = pd.read_csv(PATH_DATA + "train.csv", sep = "|")
data_test = pd.read_csv(PATH_DATA + "test.csv", sep = "|")
column_names_train = data_train.columns.values
column_names_test = data_test.columns.values

### Normalize

First we normalize the hole dataframe. Later on we will revert the normalization of the ordinal features "trustLevel" and "fraud"

In [35]:
scaler = Normalizer()
scaler.fit(data_train.values)
data_train_normalized = scaler.transform(data_train.values) 

It is important to apply exactly the same transformation to the training set and the test set for the supervised model. 
We called fit on the training set, and then called transform on the training and test sets.

In [36]:
data_test_normalized = scaler.transform(data_test.values) 

Revert normalization of ordinal features "trustLevel" and "fraud"

In [37]:
data_train_normalized = pd.DataFrame(data = data_train_normalized, columns=column_names_train)
data_train_normalized.trustLevel = data_train.trustLevel
data_train_normalized.fraud = data_train.fraud

data_test_normalized = pd.DataFrame(data = data_test_normalized, columns=column_names_test)
data_test_normalized.trustLevel = data_test.trustLevel

data_train_normalized.head()

Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition,fraud
0,5,0.998619,0.051826,0.006632,0.0,0.002842,2.6e-05,4.9e-05,0.000229,0
1,3,0.967268,0.245041,0.044781,0.017912,0.035825,0.001161,0.002269,0.003199,0
2,3,0.999129,0.040967,0.001977,0.006591,0.003295,6e-06,2.7e-05,0.000152,0
3,6,0.998654,0.051472,0.004461,0.00223,0.00223,9e-06,2.9e-05,0.000154,0
4,5,0.982272,0.186243,0.006853,0.01599,0.004569,0.000143,0.000433,0.000254,0


### One-Hot-Encoding

In [38]:
data_train_preprocessed = pd.get_dummies(data_train_normalized, columns =["trustLevel"])
data_test_preprocessed = pd.get_dummies(data_test_normalized, columns =["trustLevel"])
data_train_preprocessed.head()

Unnamed: 0,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition,fraud,trustLevel_1,trustLevel_2,trustLevel_3,trustLevel_4,trustLevel_5,trustLevel_6
0,0.998619,0.051826,0.006632,0.0,0.002842,2.6e-05,4.9e-05,0.000229,0,0,0,0,0,1,0
1,0.967268,0.245041,0.044781,0.017912,0.035825,0.001161,0.002269,0.003199,0,0,0,1,0,0,0
2,0.999129,0.040967,0.001977,0.006591,0.003295,6e-06,2.7e-05,0.000152,0,0,0,1,0,0,0
3,0.998654,0.051472,0.004461,0.00223,0.00223,9e-06,2.9e-05,0.000154,0,0,0,0,0,0,1
4,0.982272,0.186243,0.006853,0.01599,0.004569,0.000143,0.000433,0.000254,0,0,0,0,0,1,0


Save preprocessed data as .csv

In [40]:
data_train_preprocessed.to_csv(path_or_buf = PATH_DATA + "train_preprocessed.csv", sep = "|", index = False)
data_test_preprocessed.to_csv(path_or_buf = PATH_DATA + "test_preprocessed.csv", sep = "|", index = False)