# Importing needed libraries

In [1]:
import pandas as pd
import pylab as pl
import numpy as np
import sys
import warnings
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
import matplotlib.pyplot as plt

if not sys.warnoptions:
    warnings.simplefilter("ignore")

# Reading data set and adding additional features

Then performing principal component analysis for each dataset

1) Training set

In [2]:
data_train = pd.read_csv("train.csv", sep="|")

In [3]:
data_train['totalScanTimeInMinutes'] = data_train['totalScanTimeInSeconds'] / 60.0
data_train['scannedLineItems'] = data_train['scannedLineItemsPerSecond'] * data_train['totalScanTimeInSeconds']
data_train['pricePerScannedLineItem'] = data_train['grandTotal'] / data_train['scannedLineItems']
data_train['scansWithoutRegistrationPerScannedLineItem'] = data_train['scansWithoutRegistration'] / data_train['scannedLineItems']
data_train['quantityModificationsPerScannedLineItem'] = data_train['quantityModifications'] / data_train['scannedLineItems']
data_train['lineItemVoidsPerSecond'] = data_train['lineItemVoids'] / data_train['totalScanTimeInSeconds']
data_train['scansWithoutRegistrationPerSecond'] = data_train['scansWithoutRegistration'] / data_train['totalScanTimeInSeconds']
data_train['quantityModificationsPerSecond'] = data_train['quantityModifications'] / data_train['totalScanTimeInSeconds']
data_train['secondsPerEuro'] = data_train['totalScanTimeInSeconds'] / data_train['grandTotal']
data_train['lineItemVoidsPerEuro'] = data_train['lineItemVoids'] / data_train['grandTotal']
data_train['scansWithoutRegistrationPerEuro'] = data_train['scansWithoutRegistration'] / data_train['grandTotal']
data_train['quantityModificationsPerEuro'] = data_train['quantityModifications'] / data_train['grandTotal']

In [4]:
#Perform PCA on training set
y = data_train['fraud']
x = data_train.drop('fraud',axis=1)
# feature extraction
pca = PCA(n_components=3)

fit = pca.fit(x)

print("Principal Components:")   
print(fit.components_)

print("Explained Variance:") 
print(fit.explained_variance_ratio_)

Principal Components:
[[ 1.69836765e-05  4.94279851e-03 -5.68437765e-04  2.90736664e-05
   1.72466623e-05 -2.02067364e-05 -5.83309384e-07 -2.53372105e-06
   1.99624126e-06  8.23799752e-05 -3.16663395e-05 -7.92848758e-05
   5.66477382e-07 -3.19447746e-06 -2.05142807e-07 -1.95332705e-07
  -1.20118214e-07  9.99950027e-01  6.78575640e-03  5.39639953e-03
   6.97704479e-05]
 [-7.79188519e-05 -9.99848342e-01 -5.38209628e-05 -1.18053661e-04
  -5.00672397e-05  3.22371458e-05  1.38161085e-04  4.88303250e-04
   2.95686197e-05 -1.66641390e-02 -1.67481821e-04  4.30930454e-04
   4.84600388e-05  3.54659600e-05  5.52959996e-05  4.99883583e-05
   2.56150733e-05  4.93638201e-03  7.00757346e-04  4.68676694e-04
   1.13734210e-04]
 [ 2.90566217e-03  2.18461590e-05  9.86482268e-01  7.59863929e-04
  -3.07059059e-03 -5.65596625e-04 -1.59725671e-04  3.22995273e-03
   1.62238667e-03  3.64102649e-07 -1.46779698e-02  1.62910002e-01
   2.13059450e-03  9.08753108e-04 -2.00075020e-05 -1.27381137e-04
   6.98691845e-0

2) Test set

In [5]:
data_test = pd.read_csv("test.csv", sep="|")

In [6]:
data_test['totalScanTimeInMinutes'] = data_test['totalScanTimeInSeconds'] / 60.0
data_test['scannedLineItems'] = data_test['scannedLineItemsPerSecond'] * data_test['totalScanTimeInSeconds']
data_test['pricePerScannedLineItem'] = data_test['grandTotal'] / data_test['scannedLineItems']
data_test['scansWithoutRegistrationPerScannedLineItem'] = data_test['scansWithoutRegistration'] / data_test['scannedLineItems']
data_test['quantityModificationsPerScannedLineItem'] = data_test['quantityModifications'] / data_test['scannedLineItems']
data_test['lineItemVoidsPerSecond'] = data_test['lineItemVoids'] / data_test['totalScanTimeInSeconds']
data_test['scansWithoutRegistrationPerSecond'] = data_test['scansWithoutRegistration'] / data_test['totalScanTimeInSeconds']
data_test['quantityModificationsPerSecond'] = data_test['quantityModifications'] / data_test['totalScanTimeInSeconds']
data_test['secondsPerEuro'] = data_test['totalScanTimeInSeconds'] / data_test['grandTotal']
data_test['lineItemVoidsPerEuro'] = data_test['lineItemVoids'] / data_test['grandTotal']
data_test['scansWithoutRegistrationPerEuro'] = data_test['scansWithoutRegistration'] / data_test['grandTotal']
data_test['quantityModificationsPerEuro'] = data_test['quantityModifications'] / data_test['grandTotal']

In the test data set we have NaN values and infinity values, which PCA cannot handle so we need to drop these problematic instances.

In [7]:
data_test.describe()

Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition,totalScanTimeInMinutes,...,pricePerScannedLineItem,scansWithoutRegistrationPerScannedLineItem,quantityModificationsPerScannedLineItem,lineItemVoidsPerSecond,scansWithoutRegistrationPerSecond,quantityModificationsPerSecond,secondsPerEuro,lineItemVoidsPerEuro,scansWithoutRegistrationPerEuro,quantityModificationsPerEuro
count,498121.0,498121.0,498121.0,498121.0,498121.0,498121.0,498121.0,498121.0,498121.0,498121.0,...,498121.0,498121.0,498121.0,498121.0,498121.0,498121.0,498121.0,498108.0,498115.0,498107.0
mean,3.503257,915.608772,49.98899,5.495926,5.001281,2.499015,0.068054,0.222182,0.73519,15.260146,...,6.693747,0.669799,0.336203,0.02437,0.021893,0.010991,inf,inf,inf,inf
std,1.707662,528.77288,28.873426,3.447683,3.163795,1.708182,0.521092,1.717867,1.320235,8.812881,...,11.680538,1.208735,0.627875,0.193313,0.171088,0.088685,,,,
min,1.0,1.0,0.0,0.0,0.0,0.0,0.000546,0.0,0.0,0.016667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.01002908,0.0,0.0,0.0
25%,2.0,458.0,24.93,3.0,2.0,1.0,0.008682,0.027348,0.16,7.633333,...,1.6112,0.142857,0.0625,0.002735,0.002431,0.001082,9.166667,0.05006258,0.04447409,0.01990446
50%,4.0,916.0,50.03,5.0,5.0,2.0,0.01694,0.05455,0.352941,15.266667,...,3.224706,0.333333,0.16,0.006002,0.005474,0.002729,18.33183,0.1099959,0.1002339,0.05002501
75%,5.0,1374.0,75.02,8.0,8.0,4.0,0.033929,0.109091,0.692308,22.9,...,6.2725,0.636364,0.333333,0.012,0.010959,0.005464,36.56578,0.2198608,0.2002002,0.09996002
max,6.0,1831.0,99.99,11.0,10.0,5.0,30.0,99.71,11.0,30.516667,...,99.99,10.0,5.0,11.0,10.0,5.0,inf,inf,inf,inf


In [8]:
data_test.shape

(498121, 21)

In [9]:
#Dropping all rows with NaN and infinity values
data_test = data_test.replace([np.inf, -np.inf], np.nan)
datat_test = data_test.dropna(axis=0, how='any')

In [10]:
#Testing dropping
data_test.describe()

Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition,totalScanTimeInMinutes,...,pricePerScannedLineItem,scansWithoutRegistrationPerScannedLineItem,quantityModificationsPerScannedLineItem,lineItemVoidsPerSecond,scansWithoutRegistrationPerSecond,quantityModificationsPerSecond,secondsPerEuro,lineItemVoidsPerEuro,scansWithoutRegistrationPerEuro,quantityModificationsPerEuro
count,498121.0,498121.0,498121.0,498121.0,498121.0,498121.0,498121.0,498121.0,498121.0,498121.0,...,498121.0,498121.0,498121.0,498121.0,498121.0,498121.0,498050.0,498050.0,498050.0,498050.0
mean,3.503257,915.608772,49.98899,5.495926,5.001281,2.499015,0.068054,0.222182,0.73519,15.260146,...,6.693747,0.669799,0.336203,0.02437,0.021893,0.010991,91.286762,0.556,0.495557,0.248515
std,1.707662,528.77288,28.873426,3.447683,3.163795,1.708182,0.521092,1.717867,1.320235,8.812881,...,11.680538,1.208735,0.627875,0.193313,0.171088,0.088685,1357.099052,8.677895,7.654257,3.650966
min,1.0,1.0,0.0,0.0,0.0,0.0,0.000546,0.0,0.0,0.016667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.010029,0.0,0.0,0.0
25%,2.0,458.0,24.93,3.0,2.0,1.0,0.008682,0.027348,0.16,7.633333,...,1.6112,0.142857,0.0625,0.002735,0.002431,0.001082,9.165723,0.050056,0.044471,0.019897
50%,4.0,916.0,50.03,5.0,5.0,2.0,0.01694,0.05455,0.352941,15.266667,...,3.224706,0.333333,0.16,0.006002,0.005474,0.002729,18.32963,0.109976,0.10022,0.05002
75%,5.0,1374.0,75.02,8.0,8.0,4.0,0.033929,0.109091,0.692308,22.9,...,6.2725,0.636364,0.333333,0.012,0.010959,0.005464,36.549707,0.21978,0.200133,0.09992
max,6.0,1831.0,99.99,11.0,10.0,5.0,30.0,99.71,11.0,30.516667,...,99.99,10.0,5.0,11.0,10.0,5.0,181900.0,1100.0,1000.0,500.0


In [11]:
data_test.shape

(498121, 21)

Somehow it doesn't drop the whole row but just the values in the column having the NaN Value. However the documentation states it is dropping the whole row.. Any ideas?

In [None]:
#Perform PCA on test set
# y = data_test['fraud']
# x = data_test.drop('fraud',axis=1)

#Fraud label is not relevant for test set

# feature extraction
pca = PCA(n_components=3)

fit = pca.fit(data_test)

print("Principal Components:")   
print(fit.components_)

print("Explained Variance:") 
print(fit.explained_variance_ratio_)

In [None]:
#CHOOSE ATTRIBUTES HERE
features = data.columns
print(features)

for feature in features:
    for feature2 in features:
        
        test =  {feature: 1, feature2: 2}
        s = '->'
        print(s.join(test))

        X = data[[feature]]
        Y = data[[feature2]]

        Nc = range(1, 20)
        kmeans = [KMeans(n_clusters=i) for i in Nc]
        kmeans
        score = [kmeans[i].fit(Y).score(Y) for i in range(len(kmeans))]
        score

        pl.plot(Nc,score)
        pl.xlabel('Number of Clusters')
        pl.ylabel('Score')
        pl.title('Elbow Curve')
        pl.show()

        pca = PCA(n_components=1).fit(Y)
        pca_d = pca.transform(Y)
        pca_c = pca.transform(X)

        kmeans=KMeans(n_clusters=3)
        kmeansoutput=kmeans.fit(Y)
        kmeansoutput

        pl.figure('3 Cluster K-Means')
        pl.scatter(pca_c[:, 0], pca_d[:, 0], c=kmeansoutput.labels_)
        pl.xlabel(feature)
        pl.ylabel(feature2)
        pl.title('3 Cluster K-Means')
        pl.show()
