# Kernel PCA

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing as pp
from sklearn.model_selection import train_test_split
from sklearn.decomposition import KernelPCA
from sklearn.metrics import roc_curve
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc
import matplotlib.pyplot as plt

## Load the data

In [2]:
data = pd.read_csv('./data/arp_mitm/ARP_MitM_dataset.csv', dtype=np.float32)
labels = pd.read_csv('./data/arp_mitm/ARP_MitM_labels.csv',index_col=0, dtype=np.int32)

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2504266 entries, 0 to 2504265
Columns: 115 entries, 1.000000000000000000e+00 to 0.000000000000000000e+00.54
dtypes: float32(115)
memory usage: 1.1 GB


In [4]:
labels.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2504267 entries, 1 to 2504267
Data columns (total 1 columns):
 #   Column  Dtype
---  ------  -----
 0   x       int32
dtypes: int32(1)
memory usage: 28.7 MB


In [5]:
data.head(5)

Unnamed: 0,1.000000000000000000e+00,1.294000000000000000e+03,0.000000000000000000e+00,1.000000000000000000e+00.1,1.294000000000000000e+03.1,0.000000000000000000e+00.1,1.000000000000000000e+00.2,1.294000000000000000e+03.2,0.000000000000000000e+00.2,1.000000000000000000e+00.3,...,0.000000000000000000e+00.48,0.000000000000000000e+00.49,0.000000000000000000e+00.50,1.000000000000000000e+00.24,1.294000000000000000e+03.28,0.000000000000000000e+00.51,1.294000000000000000e+03.29,0.000000000000000000e+00.52,0.000000000000000000e+00.53,0.000000000000000000e+00.54
0,1.0,1514.0,0.0,1.0,1514.0,0.0,1.0,1514.0,0.0,1.0,...,0.0,0.0,0.0,1.0,1514.0,0.0,1514.0,0.0,0.0,0.0
1,1.999505,1294.0,6.984919e-10,1.999703,1294.0,2.328306e-10,1.999901,1294.0,6.984919e-10,1.99999,...,0.0,0.0,0.0,1.999999,1294.0,0.0,1294.0,0.0,0.0,0.0
2,2.998985,1294.0,9.313226e-10,2.999391,1294.0,4.656613e-10,2.999797,1294.0,6.984919e-10,2.99998,...,6.984919e-10,0.0,0.0,2.999998,1294.0,1.5e-05,1294.0,2.328306e-10,0.0,0.0
3,3.998061,1294.0,9.313226e-10,3.998836,1294.0,2.328306e-10,3.999612,1294.0,6.984919e-10,3.999961,...,2.328306e-10,0.0,0.0,3.999996,1294.0,0.0,1294.0,0.0,0.0,0.0
4,4.996578,1294.0,6.984919e-10,4.997946,1294.0,4.656613e-10,4.999315,1294.0,0.0,4.999931,...,4.656613e-10,0.0,0.0,4.999993,1294.0,0.0,1294.0,0.0,0.0,0.0


In [6]:
labels.head(5)

Unnamed: 0,x
1,0
2,0
3,0
4,0
5,0


In [7]:
data.describe()

Unnamed: 0,1.000000000000000000e+00,1.294000000000000000e+03,0.000000000000000000e+00,1.000000000000000000e+00.1,1.294000000000000000e+03.1,0.000000000000000000e+00.1,1.000000000000000000e+00.2,1.294000000000000000e+03.2,0.000000000000000000e+00.2,1.000000000000000000e+00.3,...,0.000000000000000000e+00.48,0.000000000000000000e+00.49,0.000000000000000000e+00.50,1.000000000000000000e+00.24,1.294000000000000000e+03.28,0.000000000000000000e+00.51,1.294000000000000000e+03.29,0.000000000000000000e+00.52,0.000000000000000000e+00.53,0.000000000000000000e+00.54
count,2504266.0,2504266.0,2504266.0,2504266.0,2504266.0,2504266.0,2504266.0,2504266.0,2504266.0,2504266.0,...,2504266.0,2504266.0,2504266.0,2504266.0,2504266.0,2504266.0,2504266.0,2504266.0,2504266.0,2504266.0
mean,316.1842,1263.43,162956.4,520.6961,1263.908,162429.2,1542.138,1264.399,161828.3,15082.09,...,161879.3,-0.002279081,-7.277524e-07,24333.48,1264.685,384.815,1319.794,161735.3,-0.002131456,-5.30928e-07
std,126.131,273.1945,43871.2,201.8027,273.0517,42505.26,591.7657,272.9755,41401.91,5965.269,...,45708.86,1.8494,0.0006071146,6679.178,277.1984,105.0555,74.98083,45636.88,1.538249,0.0005666413
min,1.0,60.0,0.0,1.0,60.0,0.0,1.0,60.0,0.0,1.0,...,0.0,-1464.452,-0.4688243,1.0,60.0,0.0,60.0,0.0,-832.9919,-0.2887022
25%,189.3585,1302.018,149010.6,307.1748,1301.232,146620.9,889.0711,1297.967,143519.2,8741.032,...,179307.1,0.0,0.0,23477.41,1335.24,424.8047,1335.24,180459.0,0.0,0.0
50%,360.4484,1328.082,172712.8,617.8137,1331.693,175439.9,1921.768,1336.063,178978.0,19450.02,...,180994.0,0.0,0.0,27635.2,1340.819,425.4162,1340.819,180978.9,0.0,0.0
75%,404.1042,1342.751,188220.0,666.227,1342.305,186347.8,1974.742,1341.165,184349.3,19640.7,...,182148.5,0.0,0.0,28023.81,1341.263,425.9398,1341.263,181424.8,0.0,0.0
max,536.5876,1514.0,494229.1,807.3135,1514.0,494825.9,2124.893,1514.0,495307.5,19839.36,...,425756.2,70.38126,0.084822,30912.6,1514.0,652.5,1514.0,425756.2,333.5587,0.1560442


## Scale the data

In [8]:
# scale the data
scaler = pp.StandardScaler()
data = scaler.fit_transform(data)
data = pd.DataFrame(data)
data.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,105,106,107,108,109,110,111,112,113,114
count,2504266.0,2504266.0,2504266.0,2504266.0,2504266.0,2504266.0,2504266.0,2504266.0,2504266.0,2504266.0,...,2504266.0,2504266.0,2504266.0,2504266.0,2504266.0,2504266.0,2504266.0,2504266.0,2504266.0,2504266.0
mean,1.083966e-08,2.851579e-09,-8.530366e-09,3.229353e-10,-4.911053e-09,-1.535466e-09,-1.157693e-08,-6.473938e-09,6.385588e-09,2.48843e-08,...,-8.365851e-09,9.855619e-10,1.671799e-10,1.93225e-07,-1.136405e-08,-1.336525e-08,-1.103616e-09,-5.456387e-09,1.531467e-09,-1.561362e-11
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-2.498866,-4.405028,-3.71443,-2.575269,-4.409085,-3.821389,-2.604304,-4.412116,-3.908716,-2.528148,...,-3.541532,-791.8514,-772.2161,-3.643037,-4.345931,-3.662969,-16.80155,-3.543964,-541.5183,-509.4965
25%,-1.00551,0.1412495,-0.3178832,-1.05807,0.1366912,-0.3719123,-1.10359,0.1229701,-0.4422288,-1.062995,...,0.3812793,0.001232335,0.001198707,-0.1281708,0.2545306,0.3806518,0.2060031,0.4102741,0.001385638,0.0009369738
50%,0.3509367,0.2366555,0.2223868,0.4812503,0.248251,0.3060983,0.641522,0.2625269,0.4142249,0.7322295,...,0.4181847,0.001232335,0.001198707,0.4943288,0.2746561,0.3864726,0.2804053,0.4216664,0.001385638,0.0009369738
75%,0.6970516,0.2903494,0.5758576,0.7211547,0.287115,0.5627237,0.7310404,0.2812183,0.5439616,0.7641943,...,0.4434421,0.001232335,0.001198707,0.5525125,0.2762581,0.3914571,0.286328,0.4314354,0.001385638,0.0009369738
max,1.747416,0.9171873,7.551028,1.420286,0.9159153,7.820133,0.9847738,0.9143703,8.054685,0.7974971,...,5.772995,38.0575,139.7146,0.9850183,0.8994111,2.548033,2.590077,5.785254,216.8445,275.3854


## Train-Test Split

In [9]:
# drop the last row
labels=labels[:-1]
labels.tail(5)

Unnamed: 0,x
2504262,1
2504263,1
2504264,1
2504265,1
2504266,1


In [10]:
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.3)

In [11]:
print(f'Number anomalies in train data: {y_train.sum()}')

Number anomalies in train data: x    801180
dtype: int64


In [12]:
print(f'Number anomalies in test data: {y_test.sum()}')

Number anomalies in test data: x    344091
dtype: int64


## Train Kernel PCA

In [None]:
# create and fit pca
kernel_pca = KernelPCA(n_components=20, kernel='linear', gamma=None, fit_inverse_transform=True, n_jobs=3)
kernel_pca.fit(X_train[:2000])
train_pca = kernel_pca.transform(X_train)

In [None]:
inverse_train = kernel_pca.inverse_transform(train_pca)

In [None]:
def reconstr_error(original, reconstr):
    errors = np.sqrt(np.sum((original - reconstr)**2, axis=1))
    return errors

In [None]:
errors = reconstr_error(X_train, inverse_train)

In [None]:
# average precision score train
aps_train = average_precision_score(y_train, errors)
print(f'Average precision score for train: {aps_train:.4f}')

## Test Kernel PCA

In [None]:
test_pca = kernel_pca.transform(X_test)
inverse_test = kernel_pca.inverse_transform(test_pca)

In [None]:
test_errors = reconstr_error(X_test, inverse_test)

In [None]:
# average precision score test
aps_test = average_precision_score(y_test, test_errors)
print(f'Average precision score for test: {aps_test:.4f}')

In [None]:
# precision-recall curve for test results
precision, recall, thresholds = precision_recall_curve(y_test, test_errors)
plt.title(f'Precision-Recall Curve. Average precision: {aps_test:.2f}')
plt.xlabel('Recall')
plt.xlim([0, 1.1])
plt.ylim([0, 1.1])
plt.ylabel('Precision')
#plt.ylim([0, 1.1])
#plt.xlim([0, 1.0])
plt.plot(recall, precision)
plt.show()

In [None]:
# receiver operational characteristic
fpr, tpr, thresholds = roc_curve(y_test, test_errors)
auROC = auc(fpr, tpr)
plt.figure()
plt.title(f'ROCurve. AuROC={auROC:.2f}')
plt.plot(fpr, tpr, color='r', lw=2)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.plot([0,1], [0,1], color='k', lw=2, linestyle='--')
plt.show()