# Install and load dependencies

In [None]:
%pip install numpy pandas scikit-learn seaborn matplotlib "nfstream==6.5.3"

In [15]:
import os
from collections import Counter
from math import log2

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from nfstream import NFStreamer, NFPlugin
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Process PCAP files

## Load data

In [16]:
# Code taken from Adrián Pekár <apekar@hit.bme.hu>

class FlowSlicerWithLabels(NFPlugin):
    """Slice flows after a fixed number of packets and label every slice.

    The first slice observed for a (direction-normalized) 5-tuple gets
    ``flow.udps.label = "first"``; every later slice of the same 5-tuple
    gets ``"residual"``.

    The packet limit is read from ``self.limit``. This class never assigns
    it, so it is presumably supplied as a keyword argument when the plugin
    is instantiated (e.g. ``FlowSlicerWithLabels(limit=10)``) -- confirm
    against the NFPlugin constructor.
    """

    # Shared state across all flows: normalized 5-tuple -> number of slices seen
    tracked_flows = {}

    @staticmethod
    def normalize_flow_key(src_ip, dst_ip, src_port, dst_port, protocol):
        """Return a 5-tuple key that is identical for both flow directions.

        The endpoint whose ``(ip, port)`` pair compares lower is always
        placed first, so A->B and B->A map to the same key.
        """
        if (src_ip, src_port) < (dst_ip, dst_port):
            return (src_ip, dst_ip, src_port, dst_port, protocol)
        return (dst_ip, src_ip, dst_port, src_port, protocol)

    def _expire_if_limit_reached(self, flow):
        """Force expiration once the flow holds at least ``self.limit`` packets."""
        if flow.bidirectional_packets >= self.limit:
            flow.expiration_id = -1  # Expire the flow to create a new one

    def on_init(self, packet, flow):
        """Label the new flow slice and apply the packet limit to it."""
        flow_key = self.normalize_flow_key(
            flow.src_ip, flow.dst_ip, flow.src_port, flow.dst_port, flow.protocol
        )

        # Check if it's the first time this flow's 5-tuple is seen
        seen = FlowSlicerWithLabels.tracked_flows
        if flow_key not in seen:
            seen[flow_key] = 1
            flow.udps.label = "first"
        else:
            seen[flow_key] += 1
            flow.udps.label = "residual"

        # The initial packet already counts towards the limit. Without this
        # check a limit of 1 would only take effect when a second packet
        # arrives, producing two-packet slices.
        self._expire_if_limit_reached(flow)

    def on_update(self, packet, flow):
        """Expire the flow as soon as the packet limit is reached."""
        self._expire_if_limit_reached(flow)

    def on_expire(self, flow):
        """No specific action is needed at expiration for labeling."""

In [17]:
def sample_entropy(X):
    """Return the Shannon entropy (in bits) of the symbols in *X*.

    *X* is any sized iterable of hashable symbols (``str``, ``bytes``,
    list, ...). The entropy is ``-sum(p_i * log2(p_i))`` where ``p_i`` is
    the relative frequency of symbol ``i`` in *X*. An empty input has an
    entropy of ``0.0``.
    """
    total = len(X)
    if total == 0:
        return 0.0

    # Relative frequencies are computed once and used as-is inside the
    # logarithm; dividing by ``total`` a second time would add a constant
    # log2(total) to every result.
    probabilities = (count / total for count in Counter(X).values())
    return sum(-p * log2(p) for p in probabilities)

class FingerprintPlugin(NFPlugin):
    """Compute sliding-window entropy over the first bytes of every flow.

    For each flow the raw ``packet.ip_packet`` bytes are accumulated (up to
    ``buf_size`` bytes). When the flow expires, ``sample_entropy`` is
    evaluated on every window of ``win_size`` consecutive bytes and two
    fields are attached to the flow:

    * ``flow.udps.Hf``  -- list with the entropy of each window
    * ``flow.udps.sus`` -- ``True`` when at least one full window exists and
      every window's entropy exceeds ``u - t * sigma``

    Args:
        win_size: number of bytes per entropy window.
        buf_size: maximum number of bytes analysed per flow.
        u, t, sigma: threshold parameters; a window counts as high-entropy
            when its entropy is greater than ``u - t * sigma``.
        *args, **kwargs: forwarded unchanged to ``NFPlugin.__init__``.

    NOTE(review): the buffer holds raw bytes rather than a hex string, so
    entropy is computed per byte. Entropy over hex characters can never
    exceed 4 bits, which is below ``u - t * sigma`` for the parameters used
    in this notebook; those parameters presumably assume byte-wise entropy
    -- confirm against the referenced paper.
    """

    def __init__(self, win_size, buf_size, u, t, sigma, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.win_size = win_size
        self.buf_size = buf_size
        self.u = u
        self.t = t
        self.sigma = sigma

    def processing(self, packet, flow):
        """Append the packet's bytes to the flow's buffer, capped at ``buf_size``."""
        # note: packet.ip_packet is a bytes object
        buffer = flow.udps.payload_buffer
        room = self.buf_size - len(buffer)
        if room > 0:
            buffer.extend(packet.ip_packet[:room])

    def on_init(self, packet, flow):
        """Create the flow's buffer and feed it the first packet."""
        # Per-flow state lives on the flow itself, so nothing accumulates on
        # the plugin once a flow is gone and the state does not depend on
        # flow.id being unique while the callbacks run.
        flow.udps.payload_buffer = bytearray()
        self.processing(packet, flow)

    def on_update(self, packet, flow):
        """Feed every subsequent packet of the flow into its buffer."""
        self.processing(packet, flow)

    def on_expire(self, flow):
        """Compute the per-window entropies and the ``sus`` flag for the flow."""
        buffer = bytes(flow.udps.payload_buffer)
        # Drop the scratch buffer so it is neither kept alive nor carried
        # along with the flow's other udps fields.
        del flow.udps.payload_buffer

        # Flows shorter than one window yield no windows at all (no padding
        # is applied), and such flows are never flagged.
        n_windows = max(len(buffer) - self.win_size + 1, 0)
        threshold = self.u - self.t * self.sigma

        Hf = [
            sample_entropy(buffer[start : start + self.win_size])
            for start in range(n_windows)
        ]

        flow.udps.sus = n_windows > 0 and all(Hi > threshold for Hi in Hf)
        flow.udps.Hf = Hf

# Parameters

# Flow expiration settings, passed to NFStreamer as idle_timeout / active_timeout
# in load_streams_from_pcap (presumably seconds -- TODO confirm against the
# NFStreamer documentation).
IDLE_TIMEOUT = 10000
ACTIVE_TIMEOUT = 100000

# Entropy fingerprinting settings, passed to FingerprintPlugin in this order.
WIN_SIZE = 32  # Luo; number of consecutive buffer elements per entropy window
BUF_SIZE = 1024  # during the analysis; upper bound on the per-flow buffer length that is analysed
# A window counts as high-entropy when its entropy exceeds U - T * SIGMA.
U = 4.8817  # Luo
T = 3  # to gain 99.4% confidence
SIGMA = 0.08134

def load_streams_from_pcap(path):
    """Extract the flows of a capture file into a pandas DataFrame.

    Args:
        path: capture source handed to ``NFStreamer``. ``os.PathLike``
            objects (e.g. ``pathlib.Path``) are converted to ``str`` first;
            any other value is passed through unchanged.

    Returns:
        The result of ``NFStreamer(...).to_pandas()``, with the
        ``FingerprintPlugin`` fields attached to every flow.
    """
    # NFStreamer documents `source` as a string path or interface name, so a
    # Path object is normalised before it is handed over.
    if isinstance(path, os.PathLike):
        path = os.fspath(path)

    streamer = NFStreamer(
        source=path,
        # how to handle packets of a flow
        decode_tunnels=True,
        idle_timeout=IDLE_TIMEOUT,
        active_timeout=ACTIVE_TIMEOUT,
        n_dissections=20,
        accounting_mode=1,
        # what to look for
        statistical_analysis=True,
        splt_analysis=20,
        udps=FingerprintPlugin(WIN_SIZE, BUF_SIZE, U, T, SIGMA),
    )
    return streamer.to_pandas()

## Construct labeling based on filenames

In [None]:
from pathlib import Path

# Every regular file in the capture directory that carries a pcap extension.
file_paths = [
    p
    for p in Path("work/pcaps").iterdir()
    if p.is_file() and p.suffix in {".pcap", ".pcapng"}
]

for p in file_paths:
    # The part of the file name before the first underscore selects the label.
    vpn_type = p.stem.partition("_")[0]
    is_vpn = vpn_type == "vpn"
    traffic = "vpn" if is_vpn else "web"

    print(f"Processing: {p.name} (labeled as {traffic} traffic)")
    df = load_streams_from_pcap(p.absolute())
    # NOTE(review): only the first capture is processed, and `traffic` is
    # printed but never attached to `df`.
    break

## Plot PLS like in the whitepaper

In [None]:
# TODO: some hardcore PLS plotting

# Machine Learning 'n stuff

## Feature extraction

In [None]:
# TODO:
#  - include every NFStreamer feature except time-related information, protocol fields, etc.
#  - keep PIAT and packet-size statistics, SEF, PLS, ...

## Training the model(s)

## Plotting feature importances

In [None]:
# Bar chart of the model's feature importances, most important feature first.
# NOTE: `<MODEL_VAR>` is a template placeholder and not valid Python; replace it
# with the variable holding the fitted model before running this cell.
# Assumes `X_train` is a pandas DataFrame (its `.shape` and `.columns` are used)
# defined in a training cell that is not part of this section -- TODO confirm.
importances = <MODEL_VAR>.feature_importances_
# Feature positions sorted by descending importance.
indices = np.argsort(importances)[::-1]

plt.figure()
plt.title("Feature importances")
# One bar per feature column, drawn in descending order of importance.
plt.bar(range(X_train.shape[1]), importances[indices], color="r", align="center")
# Label each bar with its column name, rotated so long names stay readable.
plt.xticks(range(X_train.shape[1]), X_train.columns[indices], rotation=90)
# Leave one unit of margin on either side of the first and last bar.
plt.xlim([-1, X_train.shape[1]])
plt.show()

## Evaluating model performance

In [None]:
# TODO: make some nice graphs of the model performance on the testing dataset