# One-Class SVM for Traffic Anomaly Detection

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing as pp
from sklearn.svm import OneClassSVM
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc
import matplotlib.pyplot as plt

## Load Data

In [2]:
#data = pd.read_csv('../Datasets/kitsune_dataset/SSDP_Flood/SSDP_Flood_dataset.csv', index_col=0, dtype=np.float32)
#labels = pd.read_csv('../Datasets/kitsune_dataset/SSDP_Flood/SSDP_Flood_labels.csv', index_col=0, dtype=np.int32)
data = pd.read_csv('../Datasets/kitsune_dataset/Active_Wiretap/Active_Wiretap_dataset.csv', index_col=0, dtype=np.float32)
labels = pd.read_csv('../Datasets/kitsune_dataset/Active_Wiretap/Active_Wiretap_labels.csv', index_col=0, dtype=np.int32)

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Float64Index: 2278688 entries, 1.9973247051239014 to 332.7803649902344
Columns: 114 entries, 2.090000000000000000e+02 to 0.000000000000000000e+00.54
dtypes: float32(114)
memory usage: 1008.3 MB


In [4]:
labels.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2278689 entries, 1 to 2278689
Data columns (total 1 columns):
 #   Column  Dtype
---  ------  -----
 0   x       int32
dtypes: int32(1)
memory usage: 26.1 MB


## Scale Data

In [5]:
scaler = pp.MinMaxScaler()
data = scaler.fit_transform(data)
data = pd.DataFrame(data)
data.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,104,105,106,107,108,109,110,111,112,113
count,2278688.0,2278688.0,2278688.0,2278688.0,2278688.0,2278688.0,2278688.0,2278688.0,2278688.0,2278688.0,...,2278688.0,2278688.0,2278688.0,2278688.0,2278688.0,2278688.0,2278688.0,2278688.0,2278688.0,2278688.0
mean,0.82224,0.3050174,0.5430585,0.8226944,0.3036735,0.6130465,0.8231573,0.3058952,0.6388294,0.9045548,...,0.3271092,0.5798556,0.4435584,0.6684283,0.8808641,0.5449103,0.9242014,0.327065,0.5824689,0.4416649
std,0.1945486,0.0860251,0.2220364,0.1944936,0.08325344,0.2463958,0.1944886,0.08199255,0.2640495,0.2136368,...,0.09835514,0.004594623,0.008572696,0.262088,0.2113756,0.1556235,0.05621867,0.09787289,0.004652583,0.01008293
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.8506727,0.2760243,0.3670852,0.8512515,0.2729489,0.4148379,0.850618,0.2727349,0.439887,0.9339065,...,0.3675718,0.5799512,0.4438865,0.4752771,0.9380996,0.6083499,0.9373477,0.3700897,0.582601,0.4425227
50%,0.8692419,0.3233634,0.5141586,0.8723988,0.3296277,0.5494377,0.8765783,0.3420107,0.5686493,0.9657908,...,0.3709228,0.5799512,0.4438865,0.7865087,0.9421977,0.6091749,0.941444,0.3710942,0.582601,0.4425227
75%,0.880791,0.3565571,0.7468726,0.8808829,0.3530799,0.8264682,0.8805224,0.3537428,0.8531925,0.9667862,...,0.3734404,0.5799512,0.4438865,0.8771517,0.9425868,0.6100653,0.9418011,0.3721799,0.582601,0.4425227
max,1.0,1.0,0.9999999,1.0,1.0,1.0,0.9999999,1.0,1.0,1.0,...,1.0,1.0,1.0,0.9999999,1.0,1.0,0.9999999,1.0,1.0,1.0


## Train-Test Split

In [6]:
# drop the last row
labels=labels[:-1]
labels.tail(5)
# train-test split
X_train, X_test, y_train, y_test = train_test_split(data, labels, shuffle=False, test_size=0.3)

In [7]:
# anomaly counts
print(f'Number anomalies in train data: {y_train.sum()}')
print(f'Number anomalies in test data: {y_test.sum()}')

Number anomalies in train data: x    279997
dtype: int64
Number anomalies in test data: x    643218
dtype: int64


## Create Model

In [None]:
model = OneClassSVM(kernel='rbf')
model.fit(X_train)

## Evaluate

In [None]:
predictions = model.predict(X_test)

In [None]:
# convert to 0-1
predictions = [0 if x>0 else 1 for x in predictions] 

In [None]:
# average precision score train
aps_test = average_precision_score(y_test, predictions)
print(f'Average precision score for test: {aps_test:.4f}')

In [None]:
# precision-recall curve for test results
precision, recall, thresholds = precision_recall_curve(y_test, predictions)
plt.title(f'Средняя точность: {aps_test*100:.1f}%')
plt.xlabel('Чувствительность')
plt.xlim([0, 1.1])
plt.ylim([0, 1.1])
plt.ylabel('Точность')
#plt.ylim([0, 1.1])
#plt.xlim([0, 1.0])
plt.grid()
plt.plot(recall, precision, color='k')
plt.savefig('OSVM_Active_Wiretap_AP.png')
plt.show()

In [None]:
# receiver operational characteristic
fpr, tpr, thresholds = roc_curve(y_test, predictions)
auROC = auc(fpr, tpr)
plt.figure()
plt.title(f'Площадь под кривой={auROC:.2f}')
plt.plot(fpr, tpr, color='k', lw=2)
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.plot([0,1], [0,1], color='k', lw=2, linestyle='--')
plt.grid()
plt.savefig('OSVM_Active_Wiretap_AUC.png')
plt.show()