# Different approaches to anomaly detection

This Python notebook proposes three different approaches for anomaly detection in network flows. In addition, their advantages and disadvantages are detailed.

In [1]:
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from river import anomaly
from river import optim
from river import preprocessing
from river import sketch
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Flow budgets for the online-learning (OML) approach
FLOWS_TRAIN_SCALER = 1_000    # benign flows reserved for fitting the scaler
FLOWS_TRAIN_OML = 100_000     # benign flows reserved for training the online model


This dataset was created based on one of the datasets provided by Proactivanet. Subsequently, the following anomalies were introduced, with 2,500 flows of each type:

- Benign IP connecting to an anomalous IP during working hours
- Benign IP connecting to a benign IP at an anomalous time
- Benign IP connecting to an anomalous domain during working hours
- Benign IP connecting to an anomalous domain at an anomalous time
- Anomalous IP connecting to an anomalous IP during working hours

## Exact Match Detector

In [3]:
# Directory and file name of the flow dataset; they are concatenated when the CSV is read
DATASET_PATH_DATASET2 = "./dataset/"
DATASET_NAME_DATASET2 = "dataset_anonymized.csv"

In [4]:
# Load the raw flows from disk
dataset_file = DATASET_PATH_DATASET2 + DATASET_NAME_DATASET2
dataset = pd.read_csv(dataset_file, sep=',')

# Keep only the columns used by the exact match detector (plus the label)
selected_columns = [
    'FIRST_SWITCHED', 'IPV_SRC_ADDR', 'L_SRC_PORT', 'IPV_DST_ADDR',
    'DIRECTION', 'L_DST_PORT', 'IN_BYTES', 'OUT_BYTES', 'Label',
]
dataset = dataset[selected_columns]

# Replace the FIRST_SWITCHED timestamp with its hour of the day
first_switched = pd.to_datetime(dataset['FIRST_SWITCHED'])
dataset['FIRST_SWITCHED'] = first_switched.dt.hour

# PORT is L_SRC_PORT when DIRECTION is 0 and L_DST_PORT otherwise
direction_is_zero = dataset['DIRECTION'] == 0
dataset['PORT'] = np.where(direction_is_zero, dataset['L_SRC_PORT'], dataset['L_DST_PORT'])

# The raw port and direction columns are now redundant
dataset = dataset.drop(columns=['L_SRC_PORT', 'L_DST_PORT', 'DIRECTION'])

# Round the byte counters to the nearest multiple of 100
for byte_column in ('IN_BYTES', 'OUT_BYTES'):
    dataset[byte_column] = dataset[byte_column].apply(lambda value: round(value, -2))

# Feature-only view of the dataset (label removed)
dataset_sin_etiqueta = dataset.drop(columns=['Label'])


In [5]:
# Training set: the first 100000 benign flows (Label == 0), without the label column
benign_mask = dataset['Label'] == 0
X_train = dataset_sin_etiqueta.loc[benign_mask].iloc[:100000]
print("Number of training samples: ", X_train.shape[0])

Number of training samples:  100000


In [6]:
# The "model" is a lookup table keyed by the exact feature tuple of every training flow.
# Only key membership is used later; every value is 0.
model = dict.fromkeys(
    (tuple(flow.values) for _, flow in X_train.iterrows()),
    0,
)

The procedure above follows the same approach described earlier. In this case, the model was trained with 100,000 benign flows (see the number of training samples printed above). The evaluation phase uses 12,500 benign flows and 12,500 anomalous flows.

In [7]:
# Create the test set: up to 12500 anomalous flows followed by up to 12500 benign flows.
# The benign slice starts at position 100000, i.e. right after the benign flows used for training.
anomalous_test = dataset_sin_etiqueta[dataset['Label'] == 1].iloc[:12500]
benign_test = dataset_sin_etiqueta[dataset['Label'] == 0].iloc[100000:112500]
X_test = pd.concat([anomalous_test, benign_test], ignore_index=True)

# Create the test set labels (1 for anomalies, 0 for normal flows).
# The label vector is sized from the slices actually obtained, so it always stays
# aligned with X_test even if the dataset holds fewer flows than requested.
y_test = np.concatenate([np.ones(len(anomalous_test)), np.zeros(len(benign_test))])

print("Number of anomalies in the test dataset: ", y_test[y_test == 1].shape[0])
print("Number of normal samples in the test dataset: ", y_test[y_test == 0].shape[0])


Number of anomalies in the test dataset:  12500
Number of normal samples in the test dataset:  12500


In [8]:
# Evaluation of the exact match detector: a flow is flagged as anomalous
# when its feature tuple was never seen during training.
tp = 0
tn = 0
fp = 0
fn = 0

# X_test was built with ignore_index=True, so the index label `i` is also the
# position of the matching entry in y_test.
for i, row in X_test.iterrows():
    key = tuple(row.values)
    is_anomaly = key not in model
    label = y_test[i]
    if is_anomaly and label == 1:
        tp += 1
    elif not is_anomaly and label == 0:
        tn += 1
    elif is_anomaly and label == 0:
        fp += 1
    elif not is_anomaly and label == 1:
        fn += 1

# Every ratio is guarded so that an empty class (or an empty test set) reports 0
# instead of raising ZeroDivisionError.
total = tp + tn + fp + fn
accuracy = (tp + tn) / total if total > 0 else 0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0
false_positive = fp / (fp + tn) if (fp + tn) > 0 else 0
print("Accuracy: ", accuracy)
print("Recall: ", recall)
print("False Positive Rate: ", false_positive)
print("TP: ", tp, "TN: ", tn, "FP: ", fp, "FN: ", fn)

Accuracy:  0.741
Recall:  1.0
False Positive Rate:  0.518
TP:  12500 TN:  6025 FP:  6475 FN:  0


All anomalies are correctly detected (recall = 1), but the number of false positives is very high. This is due to the model's inability to generalize, as it relies on exact match searches.

## Supervised learning - DecisionTreeClassifier

In [9]:
# Shuffle the flows with a fixed seed and renumber the index
dataset = dataset.sample(frac=1, random_state=111).reset_index(drop=True)

# One LabelEncoder per IP column: each address is replaced by an integer code
le_src, le_dst = LabelEncoder(), LabelEncoder()
for ip_column, ip_encoder in (('IPV_SRC_ADDR', le_src), ('IPV_DST_ADDR', le_dst)):
    dataset[ip_column] = ip_encoder.fit_transform(dataset[ip_column])

# Feature-only view of the shuffled, encoded dataset
dataset_sin_etiqueta = dataset.drop(columns=['Label'])


In this case, the model was trained with 93,750 benign flows and 6,250 anomalous flows (100,000 flows in total).

In [10]:
# Training features: the first 93750 benign flows followed by the first 6250 anomalous flows
benign_train = dataset_sin_etiqueta[dataset['Label'] == 0].iloc[:93750]
anomalous_train = dataset_sin_etiqueta[dataset['Label'] == 1].iloc[:6250]
X_train = pd.concat([benign_train, anomalous_train])

# Training labels in the same order: 0 for the benign flows, 1 for the anomalous ones
y_train = np.concatenate([np.zeros(93750), np.ones(6250)])


In [11]:
# Report how many labelled samples the training set contains
print("Number of samples in the training dataset: ", len(y_train))

Number of samples in the training dataset:  100000


In [12]:
# Hyperparameter tuning with a grid search.
# GridSearchCV and DecisionTreeClassifier are imported in the notebook's import cell.

# Base estimator explored by the search
clf = DecisionTreeClassifier(random_state=123)

# Candidate values for each hyperparameter (every combination is evaluated)
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30, 40],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# 3-fold cross-validation scored by accuracy, using all available cores
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1)

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)
# Get the best parameters
best_params = grid_search.best_params_
print("Best parameters: ", best_params)
# Create the Decision Tree Classifier with the best parameters.
# NOTE: this final model uses random_state=987, which differs from the seed (123)
# of the estimator evaluated during the search.
clf = DecisionTreeClassifier(**best_params, random_state=987)
# Fit the model
clf.fit(X_train, y_train)

Best parameters:  {'criterion': 'entropy', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}


For evaluation, 6,250 benign flows and 6,250 anomalous flows will be used.

In [13]:
# Test features: the anomalous flows not used for training (position 6250 onwards)
# followed by the benign flows at positions 93750-99999, which were also held out
anomalous_test = dataset_sin_etiqueta[dataset['Label'] == 1].iloc[6250:]
benign_test = dataset_sin_etiqueta[dataset['Label'] == 0].iloc[93750:100000]
X_test = pd.concat([anomalous_test, benign_test])

# Test labels in the same order: 1 for every anomalous flow, then 6250 zeros for the benign ones
y_test = np.concatenate([np.ones(anomalous_test.shape[0]), np.zeros(6250)])


In [14]:
# Report how many labelled samples the test set contains
print("Number of samples in the test dataset: ", len(y_test))

Number of samples in the test dataset:  12500


In [15]:
# Predict a label for every flow of the test set
y_pred = clf.predict(X_test)

# Boolean masks for predictions and ground truth
predicted_anomaly = y_pred == 1
predicted_benign = y_pred == 0
actual_anomaly = y_test == 1
actual_benign = y_test == 0

# Confusion-matrix counts
tp = np.sum(predicted_anomaly & actual_anomaly)
tn = np.sum(predicted_benign & actual_benign)
fp = np.sum(predicted_anomaly & actual_benign)
fn = np.sum(predicted_benign & actual_anomaly)

evaluated = tp + tn + fp + fn
accuracy = (tp + tn) / evaluated
# Recall falls back to 0 when the test set contains no anomalous flows
if (tp + fn) > 0:
    recall = tp / (tp + fn)
else:
    recall = 0
false_positive = fp / (fp + tn)

print("Accuracy: ", accuracy)
print("Recall: ", recall)
print("False Positive Rate: ", false_positive)
print("TP: ", tp, "TN: ", tn, "FP: ", fp, "FN: ", fn)


Accuracy:  0.99976
Recall:  0.99952
False Positive Rate:  0.0
TP:  6247 TN:  6250 FP:  0 FN:  3


In this case, the results are better than those obtained with the exact match search. Most anomalies are successfully detected, and the number of false positives is lower. However, it is important to note that a labeled dataset is required to train the model. Furthermore, it is very difficult for the model to generalize across different networks.

## Unsupervised Online Learning Using One-Class SVM (oSVM)

In [16]:
# Columns used by the online-learning approach (the label is the last one)
FEATURES = ['IPV_SRC_ADDR', 'IPV_DST_ADDR', 'L_DST_PORT', 'L_SRC_PORT', 'DIRECTION', 'FIRST_SWITCHED', 'LAST_SWITCHED', 'PROTOCOL', 'END_TYPE', 'IN_BYTES', 'OUT_BYTES', 'Label']

# Reload the CSV from disk so this section starts from the raw flows
raw_flows = pd.read_csv(DATASET_PATH_DATASET2 + DATASET_NAME_DATASET2, sep=',', index_col=False)

# Keep the selected columns and switch to a numpy array for row-wise preprocessing
dataset = raw_flows[FEATURES].values

In [17]:
# Column positions inside each row of the numpy array
direction_col = FEATURES.index('DIRECTION')
dst_port_col = FEATURES.index('L_DST_PORT')
src_port_col = FEATURES.index('L_SRC_PORT')
first_switched_col = FEATURES.index('FIRST_SWITCHED')

# Significant port: the DIRECTION slot is overwritten with L_DST_PORT when
# DIRECTION is 1 and with L_SRC_PORT otherwise
print("ℹ️ | MODE SIGNIFICANT PORT ACTIVATED")
for flow in dataset:
    significant_col = dst_port_col if flow[direction_col] == 1 else src_port_col
    flow[direction_col] = flow[significant_col]
print('✅ | Significant port added')

# FIRST_SWITCHED is reduced to an integer: the text before the first ':' in the
# second space-separated field of the original string
for flow in dataset:
    time_field = flow[first_switched_col].split(' ')[1]
    flow[first_switched_col] = int(time_field.split(':')[0])

ℹ️ | MODE SIGNIFICANT PORT ACTIVATED
✅ | Significant port added


In [18]:
# Transform the numpy array back into a pandas DataFrame
dataset = pd.DataFrame(dataset, columns=FEATURES)

# Significant port: DIRECTION was overwritten with the significant port in the
# previous cell, so the raw port columns are dropped and DIRECTION is renamed to PORT
dataset = dataset.drop(columns=['L_SRC_PORT', 'L_DST_PORT'])
dataset = dataset.rename(columns={'DIRECTION': 'PORT'})

# Training pool: the first FLOWS_TRAIN_SCALER + FLOWS_TRAIN_OML benign flows
datasets_training = dataset.loc[dataset['Label'] == 0].iloc[:FLOWS_TRAIN_SCALER + FLOWS_TRAIN_OML]

# The pool already contains benign flows only, so it is split purely by position:
# the first FLOWS_TRAIN_SCALER flows fit the scaler, the remaining ones train the model
dataset_train_scaler = datasets_training.iloc[:FLOWS_TRAIN_SCALER]
dataset_train_oml = datasets_training.iloc[FLOWS_TRAIN_SCALER:FLOWS_TRAIN_SCALER + FLOWS_TRAIN_OML]

# Remove both training subsets from the dataset; what is left is the test pool
dataset = dataset.drop(dataset_train_scaler.index)
dataset = dataset.drop(dataset_train_oml.index)

# Number of anomalous and benign samples left in the test pool
anomalies_samples = dataset.loc[dataset['Label'] == 1].shape[0]
benign_samples = dataset.loc[dataset['Label'] == 0].shape[0]

# Balance the test pool by dropping the first surplus flows of the larger class
if benign_samples > anomalies_samples:
    dataset = dataset.drop(dataset.loc[dataset['Label'] == 0].index[:benign_samples - anomalies_samples])
else:
    dataset = dataset.drop(dataset.loc[dataset['Label'] == 1].index[:anomalies_samples - benign_samples])

# Create the test dataset
dataset_test_labeled = dataset

# Shuffle the three datasets with a fixed seed and renumber their indexes
dataset_train_scaler = dataset_train_scaler.sample(frac=1, random_state=111).reset_index(drop=True)
dataset_train_oml = dataset_train_oml.sample(frac=1, random_state=111).reset_index(drop=True)
dataset_test_labeled = dataset_test_labeled.sample(frac=1, random_state=111).reset_index(drop=True)

In [19]:
def encode_ip_columns(frame, encoder):
    """Replace the first two columns of `frame` (the IP addresses) with ordinal codes.

    Rows are processed one at a time: the encoder first learns the pair of
    addresses and then transforms it, and both codes are written back in place.

    Parameters:
        frame: DataFrame whose columns 0 and 1 hold the source and destination IPs.
            It is modified in place.
        encoder: online encoder exposing `learn_one` and `transform_one`; it is
            updated with every row, so its state carries over between calls.
    """
    for i in range(len(frame)):
        # Obtain the first two elements (IP addresses) as strings, keyed by column position
        ip_pair = {j: str(frame.iloc[i, j]) for j in range(2)}
        # Encode the IP addresses
        encoder.learn_one(ip_pair)
        encoded_pair = encoder.transform_one(ip_pair)
        frame.iloc[i, 0] = encoded_pair[0]
        frame.iloc[i, 1] = encoded_pair[1]


# A single encoder is shared by the three datasets and fed in this order:
# scaler training set, OML training set, test set
encoder = preprocessing.OrdinalEncoder()

for flows in (dataset_train_scaler, dataset_train_oml, dataset_test_labeled):
    encode_ip_columns(flows, encoder)

In [20]:
# Summary of the class balance of each dataset (Label == 1 is anomalous, Label == 0 is benign)
print('=====================================================================================================')
print('Number of anomalies in the train scaler dataset:\t', dataset_train_scaler.loc[dataset_train_scaler['Label'] == 1].shape[0])
print('Number of normal samples in the train scaler dataset:\t', dataset_train_scaler.loc[dataset_train_scaler['Label'] == 0].shape[0])

print('Number of normal samples in the trainOML dataset:\t', dataset_train_oml.loc[dataset_train_oml['Label'] == 0].shape[0])
print('Number of anomaly samples in the trainOML dataset:\t', dataset_train_oml.loc[dataset_train_oml['Label'] == 1].shape[0])

print('Number of anomalies in the test dataset:\t', dataset_test_labeled.loc[dataset_test_labeled['Label'] == 1].shape[0])
print('Number of normal samples in the test dataset:\t', dataset_test_labeled.loc[dataset_test_labeled['Label'] == 0].shape[0])
print('=====================================================================================================')

# Obtain the datasets without the label.
# np.delete(..., -1, axis=1) removes the last column, which is 'Label' (the last entry of FEATURES).
# NOTE(review): the result is presumably a plain array without column names (the next cell
# wraps it with pd.DataFrame again) - confirm for the installed numpy/pandas versions.
dataset_train_no_labels = np.delete(dataset_train_oml, -1, axis=1)
dataset_test_no_labels = np.delete(dataset_test_labeled, -1, axis=1)
# The scaler training set stays a DataFrame; only its 'Label' column is dropped
dataset_train_scaler = dataset_train_scaler.drop(columns=['Label'])

Number of anomalies in the train scaler dataset:	 0
Number of normal samples in the train scaler dataset:	 1000
Number of normal samples in the trainOML dataset:	 100000
Number of anomaly samples in the trainOML dataset:	 0
Number of anomalies in the test dataset:	 12500
Number of normal samples in the test dataset:	 12500


In [21]:
# Online scaler, fitted only on the dedicated scaler-training flows
scaler = preprocessing.MaxAbsScaler()

# Make sure the three datasets are pandas DataFrames
dataset_train_scaler = pd.DataFrame(dataset_train_scaler)
dataset_train_no_labels = pd.DataFrame(dataset_train_no_labels)
dataset_test_no_labels = pd.DataFrame(dataset_test_no_labels)

# Fit the scaler one flow at a time; features are keyed by position (0, 1, 2, ...)
for _, flow in dataset_train_scaler.iterrows():
    scaler.learn_one(dict(enumerate(flow)))

# Scale the training flows with the fitted scaler (each flow becomes a list of values)
scaler_dataset_train = [
    list(scaler.transform_one(flow.to_dict()).values())
    for _, flow in dataset_train_no_labels.iterrows()
]

# Scale the test flows with the same scaler; it is not updated here
scaler_dataset_test = [
    list(scaler.transform_one(flow.to_dict()).values())
    for _, flow in dataset_test_no_labels.iterrows()
]

print("✅ | Dataset Scaled")


✅ | Dataset Scaled


In [22]:
# One-Class SVM detector with an inverse-scaling schedule for the intercept learning rate
one_class_svm = anomaly.OneClassSVM(
    nu=0.05,
    intercept_lr=optim.schedulers.InverseScaling(learning_rate=0.25),
)

# The detector is wrapped in a quantile filter (q = 0.99), which provides classify()
model = anomaly.QuantileFilter(one_class_svm, q=0.99)

# Helpers that track the distribution of anomaly scores:
# scores are min-max scaled and then accumulated in a histogram
probability_preprocessing = preprocessing.MinMaxScaler()
probability = sketch.Histogram()


In [23]:
# Training phase of the online model
print("⏳ | Training OML")
for scaled_flow in scaler_dataset_train:
    # Turn the list of scaled values into the feature dict expected by the model
    features = {f'feature_{idx}': value for idx, value in enumerate(scaled_flow)}

    # Learn from the flow first, then score it with the updated model
    model.learn_one(features)
    score = model.score_one(features)

    # Feed the score into the score-distribution helpers
    probability_preprocessing.learn_one({0: score})
    normalized_score = probability_preprocessing.transform_one({0: score})[0]
    probability.update(normalized_score)

print("✅ | Training phase completed")

⏳ | Training OML
✅ | Training phase completed


In [24]:
# Test phase of the online model: every flow is scored and classified, and the
# model keeps learning only from the flows it classifies as normal.
fp = 0
fn = 0
tp = 0
tn = 0
# Running metrics after each processed flow, and the per-flow decisions
accuracies = []
recalls = []
false_positives = []
anomalies = []

for i, row in enumerate(scaler_dataset_test):
    # Convert the list of scaled values into the feature dict expected by the model
    if isinstance(row, list):
        row = {f'feature_{j}': value for j, value in enumerate(row)}

    score = model.score_one(row)
    anomalo = model.classify(score)
    anomalies.append(anomalo)

    # Update the score distribution; the normalized score is computed once and reused
    probability_preprocessing.learn_one({0: score})
    normalized_score = probability_preprocessing.transform_one({0: score})[0]
    probability.update(normalized_score)
    # Position of the score within the observed distribution
    # (not used further in this cell)
    rank = probability.cdf(normalized_score)

    # Keep learning only from flows considered normal
    if not anomalo:
        model.learn_one(row)

    # The label is the last column of the shuffled, labeled test set
    label = dataset_test_labeled.iloc[i, -1]

    if anomalo and label == 1:
        tp += 1
    elif not anomalo and label == 0:
        tn += 1
    elif anomalo and label == 0:
        fp += 1
    elif not anomalo and label == 1:
        fn += 1

    accuracies.append((tp + tn) / (tp + tn + fp + fn))
    recalls.append(tp / (tp + fn) if (tp + fn) > 0 else 0)
    false_positives.append(fp / (fp + tn) if (fp + tn) > 0 else 0)


# Final metrics, guarded like the running ones so that an empty class
# (or an empty test set) reports 0 instead of raising ZeroDivisionError
total = tp + tn + fp + fn
accuracy = (tp + tn) / total if total > 0 else 0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0
false_positive = fp / (fp + tn) if (fp + tn) > 0 else 0
print("✅ | Testing phase completed")
print("🟢 | Accuracy: ", accuracy)
print("🟢 | Recall: ", recall)
print("🟢 | False Positive Rate: ", false_positive)
print("TP: ", tp, "TN: ", tn, "FP: ", fp, "FN: ", fn)

✅ | Testing phase completed
🟢 | Accuracy:  0.98712
🟢 | Recall:  0.98608
🟢 | False Positive Rate:  0.01184
TP:  12326 TN:  12352 FP:  148 FN:  174


The best results were achieved by this model, which is capable of detecting anomalies without relying on a labeled dataset—in other words, by analyzing the underlying patterns in the data.