In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
from sklearn import utils 
import matplotlib

# The input CSV is produced by the PCAP-to-CSV utility in /PCAP Manipulation/.
data = pd.read_csv('path', low_memory=False)

# Keep only the traffic of interest: rows whose protocol is "tcp" and whose
# MAC address matches the (placeholder) device address below.
is_tcp = data['protocol'] == "tcp"
is_device = data["mac"] == "xx:xx:xx:xx:xx"
data = data[is_tcp & is_device]

# The full dataset contains features for SMTP, NDP, ICMP, etc.; only the
# columns listed here are kept for the model.
# NOTE(review): the original comment said these are the features "for HTTP",
# while the filter earlier in this cell selects protocol == "tcp" - confirm.
relevant_features = [
    "duration",
    "src_bytes",
    "dst_bytes",
    "length",
    "label",  # ground-truth label; only available if the capture is labeled
]

# Subset the data to the relevant features (skip this step to train on every
# column, which will take longer). The explicit .copy() makes `data` an
# independent frame, so the column assignments below do not write into a slice
# of the filtered frame (which raises pandas' SettingWithCopyWarning).
data = data[relevant_features].copy()

# Log-scale duration and the byte counters; per the original author this
# performs better with the SVM. The 0.1 offset keeps zero values finite under
# the logarithm. "length" is intentionally left untransformed, as before.
for column in ("duration", "src_bytes", "dst_bytes"):
    data[column] = np.log((data[column] + 0.1).astype(float))

# Encode the label as +1 / -1: rows labeled "home" become 1, every other row
# (including missing labels) becomes -1. These are the same two values that
# OneClassSVM.predict returns, so they can be compared directly later.
is_home = data['label'] == "home"
data.loc[is_home, "nothome"] = 1
data.loc[~is_home, "nothome"] = -1

target = data['nothome']

# The -1 rows are the outliers; report how many there are and their share of
# all rows.
outliers = target[target == -1]
outlier_count, total_count = outliers.shape[0], target.shape[0]
print("outliers.shape", outliers.shape)
print("outlier fraction", outlier_count / total_count)

# Remove the raw label and its encoded form so only feature columns remain.
data.drop(columns=["label", "nothome"], inplace=True)
## Display the shape to verify that the label column is gone
data.shape

In [None]:
# Hold out 20% of the rows as unseen test data for the later evaluation cell.
# A fixed random_state makes the split (and therefore every metric reported
# below) reproducible across kernel restarts.
from sklearn.model_selection import train_test_split
train_data, test_data, train_target, test_target = train_test_split(
    data, target, train_size=0.8, random_state=42
)
train_data.shape

In [None]:
from sklearn import svm

# nu is set to the observed share of outlier rows (-1) in the full target.
nu = len(outliers) / len(target)
print("nu", nu)

# Model hyperparameters: rbf is the most common (non-linear) kernel, and gamma
# can be adjusted later to fine-tune the results.
svm_params = {"kernel": 'rbf', "gamma": 0.00005, "nu": nu}
model = svm.OneClassSVM(**svm_params)
model.fit(train_data)

In [None]:
## Evaluate the model on the data it was trained on
from sklearn import metrics
preds = model.predict(train_data)
# The assignment and the first print were fused onto one line in the original,
# which is a SyntaxError; they are separate statements here.
targs = train_target
# precision/recall/f1 use scikit-learn's default pos_label=1, i.e. the rows
# encoded as 1 in the target are treated as the positive class.
print("accuracy: ", metrics.accuracy_score(targs, preds))
print("precision: ", metrics.precision_score(targs, preds))
print("recall: ", metrics.recall_score(targs, preds))
print("f1: ", metrics.f1_score(targs, preds))
# ROC AUC is a ranking metric: feed it the continuous decision scores rather
# than the already-thresholded +1/-1 predictions, which would collapse the ROC
# curve to a single operating point.
scores = model.decision_function(train_data)
print("area under curve (auc): ", metrics.roc_auc_score(targs, scores))

In [None]:
## Evaluate on unseen test data
preds = model.predict(test_data)
# The assignment and the first print were fused onto one line in the original,
# which is a SyntaxError; they are separate statements here.
targs = test_target
# precision/recall/f1 use scikit-learn's default pos_label=1, i.e. the rows
# encoded as 1 in the target are treated as the positive class.
print("accuracy: ", metrics.accuracy_score(targs, preds))
print("precision: ", metrics.precision_score(targs, preds))
print("recall: ", metrics.recall_score(targs, preds))
print("f1: ", metrics.f1_score(targs, preds))
# ROC AUC is a ranking metric: feed it the continuous decision scores rather
# than the already-thresholded +1/-1 predictions, which would collapse the ROC
# curve to a single operating point.
scores = model.decision_function(test_data)
print("area under curve (auc): ", metrics.roc_auc_score(targs, scores))

In [None]:
## Save the fitted model for later use
outputfile = 'oneclass_v1.model'
# sklearn.externals.joblib was removed in scikit-learn 0.23; prefer the
# standalone joblib package (installed as a scikit-learn dependency) and fall
# back to the legacy import path only on old environments.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
# compress=9 is the highest compression level.
joblib.dump(model, outputfile, compress=9)