# Install and load dependencies

In [None]:
%pip install numpy pandas scikit-learn seaborn matplotlib "nfstream==6.5.3"

In [67]:
from nfstream import NFStreamer, NFPlugin
from math import log2
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, accuracy_score
from pathlib import Path
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE

# Process PCAP files

## Load data

In [68]:
# Code taken from Adrián Pekár <apekar@hit.bme.hu>

def normalize_flow_key(src_ip, dst_ip, src_port, dst_port, protocol):
    """Return a direction-independent key for a 5-tuple.

    The two endpoints are compared as ``(ip, port)`` tuples and the smaller
    one is always placed first, so both directions of a conversation map to
    the same key.

    Returns:
        A tuple ``(ip_a, ip_b, port_a, port_b, protocol)`` where endpoint
        ``a`` is the smaller of the two endpoints.
    """
    forward = (src_ip, src_port)
    backward = (dst_ip, dst_port)

    # Equal endpoints fall into the second branch, which yields the same key.
    if forward < backward:
        first, second = forward, backward
    else:
        first, second = backward, forward

    return (first[0], second[0], first[1], second[1], protocol)

class FlowSlicerWithLabels(NFPlugin):
    """NFStream plugin that slices flows by packet count and labels each slice.

    Every new flow entry gets two user-defined attributes:

    * ``flow.udps.flow_key`` -- the direction-independent 5-tuple key.
    * ``flow.udps.label`` -- ``"first"`` the first time the key is seen,
      ``"residual"`` for every later entry with the same key.
    """

    # Shared state across all flows: normalized 5-tuple -> number of entries seen
    tracked_flows = {}

    def __init__(self, limit, *args, **kwargs):
        """Store ``limit``, the packet count at which a flow entry is expired."""
        super().__init__(*args, **kwargs)
        self.limit = limit

    def on_init(self, packet, flow):
        """Attach the normalized key and the first/residual label to a new flow."""
        key = normalize_flow_key(
            flow.src_ip, flow.dst_ip, flow.src_port, flow.dst_port, flow.protocol
        )
        flow.udps.flow_key = key

        seen = FlowSlicerWithLabels.tracked_flows
        if key in seen:
            seen[key] += 1
            flow.udps.label = "residual"
        else:
            seen[key] = 1
            flow.udps.label = "first"

    def on_update(self, packet, flow):
        """Expire the flow once it holds ``limit`` packets so a new entry starts."""
        if flow.bidirectional_packets < self.limit:
            return
        flow.expiration_id = -1  # Expire the flow to create a new one

    def on_expire(self, flow):
        """Nothing to do on expiry; labels are assigned in ``on_init``."""
        return None

In [69]:
def sample_entropy(X):
    """Return the Shannon entropy (in bits) of the symbols in ``X``.

    Computes ``-sum(p_i * log2(p_i))`` where ``p_i`` is the relative
    frequency of symbol ``i`` in ``X``.

    Args:
        X: A sized iterable of hashable symbols (e.g. a string or bytes).

    Returns:
        The entropy as a float; ``0.0`` for an empty input.
    """
    total = len(X)
    if total == 0:
        return 0.0

    # calculate frequencies
    counts = {}
    for xi in X:
        counts[xi] = counts.get(xi, 0) + 1

    # The relative frequency is formed exactly once here. The previous
    # version divided by len(X) a second time inside log2, which shifted
    # every result by log2(len(X)).
    return -sum((c / total) * log2(c / total) for c in counts.values())

class FingerprintPlugin(NFPlugin):
    """NFStream plugin that computes a sliding-window entropy fingerprint.

    For every flow whose ``flow.udps.label`` is not ``"residual"`` the plugin
    buffers the hex-encoded packets, and on expiry stores:

    * ``flow.udps.sus`` -- True when every window's entropy exceeds
      ``u - t * sigma`` (False when the buffer is too short for one window).
    * ``flow.udps.Hf`` -- the list of per-window entropies.

    ``flow.udps.label`` must already be set when these callbacks run, so this
    plugin has to be registered after the plugin that assigns the label.

    NOTE(review): the buffer holds hex characters, so ``win_size`` and
    ``buf_size`` count hex digits (two per byte) and the entropy is taken
    over a 16-symbol alphabet. Confirm that ``u`` and ``sigma`` were
    calibrated for that representation.
    """

    def __init__(self, win_size, buf_size, u, t, sigma, *args, **kwargs):
        """Store the window length, buffer cap and threshold parameters."""
        super().__init__(*args, **kwargs)

        # flow.id -> hex string of the packets buffered so far
        self.payload_buffers = {}

        self.win_size = win_size
        self.buf_size = buf_size
        self.u = u
        self.t = t
        self.sigma = sigma

    def processing(self, packet, flow):
        """Append the packet's hex encoding while the buffer is below the cap."""
        # note: packet.ip_packet is a bytes object
        # The last appended packet may overshoot buf_size; on_expire truncates.
        if len(self.payload_buffers[flow.id]) < self.buf_size:
            self.payload_buffers[flow.id] += packet.ip_packet.hex()

    def on_init(self, packet, flow):
        """Start a buffer for a non-residual flow and record its first packet."""
        if flow.udps.label == "residual":
            return
        self.payload_buffers[flow.id] = ''
        self.processing(packet, flow)

    def on_update(self, packet, flow):
        """Record a further packet of a non-residual flow."""
        if flow.udps.label == "residual":
            return
        self.processing(packet, flow)

    def on_expire(self, flow):
        """Compute the fingerprint and release the flow's buffer."""
        if flow.udps.label == "residual":
            return

        # pop() frees the buffer; previously every buffer stayed in the dict
        # for the whole capture.
        payload = self.payload_buffers.pop(flow.id)[: self.buf_size]

        n_windows = max(0, len(payload) - self.win_size + 1)
        threshold = self.u - self.t * self.sigma

        Hf = [
            sample_entropy(payload[i : i + self.win_size])
            for i in range(n_windows)
        ]

        # Require at least one window: with zero windows the old comparison
        # "count == number of windows" was vacuously true.
        flow.udps.sus = n_windows > 0 and all(Hi > threshold for Hi in Hf)
        flow.udps.Hf = Hf

# parameters
# Packet limit given to FlowSlicerWithLabels; also passed to NFStreamer as
# n_dissections and splt_analysis.
SLICING_LIMIT = 30
# Passed to NFStreamer as idle_timeout / active_timeout
# (presumably seconds -- confirm against the NFStream documentation).
IDLE_TIMEOUT = 10000
ACTIVE_TIMEOUT = 100000
# Window length and buffer cap given to FingerprintPlugin. Both are counted in
# characters of the hex-encoded buffer, i.e. two characters per captured byte.
WIN_SIZE = 32  # Luo
BUF_SIZE = 1024  # during the analysis
# FingerprintPlugin counts a window when its entropy exceeds U - T * SIGMA.
U = 4.8817  # Luo
T = 3  # to gain 99.4% confidence
SIGMA = 0.08134

def load_streams_from_pcap(path):
    """Run NFStream over a capture and return its flows as a DataFrame.

    Flows are sliced every ``SLICING_LIMIT`` packets by
    ``FlowSlicerWithLabels`` and fingerprinted by ``FingerprintPlugin``.

    Args:
        path: Capture source. ``os.PathLike`` objects (e.g. ``pathlib.Path``)
            are converted to ``str``; any other value is passed through
            unchanged.

    Returns:
        Whatever ``NFStreamer.to_pandas()`` returns for the capture.
    """
    # NFStream documents `source` as a string (file path or interface name),
    # so hand it a plain str even when the caller has a Path object.
    source = os.fspath(path) if isinstance(path, os.PathLike) else path

    streamer = NFStreamer(
        source=source,
        # how to handle packets of a flow
        decode_tunnels=True,
        idle_timeout=IDLE_TIMEOUT,
        active_timeout=ACTIVE_TIMEOUT,
        n_dissections=SLICING_LIMIT,
        accounting_mode=1,
        # what to look for
        statistical_analysis=True,
        splt_analysis=SLICING_LIMIT,
        # Order matters: FingerprintPlugin reads flow.udps.label, which
        # FlowSlicerWithLabels sets (assumes plugins run in list order).
        udps=[
            FlowSlicerWithLabels(SLICING_LIMIT),
            FingerprintPlugin(WIN_SIZE, BUF_SIZE, U, T, SIGMA),
        ],
    )
    return streamer.to_pandas()

## Construct labeling based on filenames

In [None]:
# Set to True to re-process every capture even when a cached CSV exists.
CLEAN_RUN = False

if CLEAN_RUN or not os.path.exists("work/flows.csv"):
    # Sorted so the row order of the dataset is reproducible between runs.
    file_paths = sorted(
        p for p in Path("work/pcaps").iterdir()
        if p.is_file() and p.suffix in {".pcap", ".pcapng"}
    )

    frames = []

    for p in file_paths:
        print(f"Processing: {p.name}")

        # File names are expected to look like "<vpn_type>_<l7>[_...]".
        vpn_type, l7, *_ = p.stem.split("_")
        category = f"{vpn_type}_{l7}"

        pcap_df = load_streams_from_pcap(str(p.absolute()))
        if pcap_df is None or pcap_df.empty:
            print(f"Skipping {p.name}: no flows extracted")
            continue

        # Keep only the first slice of the most frequent 5-tuple.
        flow_keys = pcap_df["udps.flow_key"]
        max_flow_key = flow_keys.mode()[0]
        pcap_df = pcap_df[(flow_keys == max_flow_key) & (pcap_df["udps.label"] == "first")]
        pcap_df = pcap_df.drop(["udps.label", "udps.flow_key"], axis=1)
        frames.append(pcap_df.assign(category=category))

    if not frames:
        raise ValueError("No flows were extracted from the captures in work/pcaps")

    # Concatenate once, with a fresh 0..n-1 index, and do not persist the
    # index: the per-capture row numbers are meaningless and contain
    # duplicates, which breaks index-aligned operations later on.
    pd.concat(frames, axis=0, ignore_index=True).to_csv("work/flows.csv", index=False)

# Always load from the CSV so a clean run and a cached run produce the same
# frame (same index, same dtypes).
df = pd.read_csv("work/flows.csv")

df

## Plot the Packet Length Sequence (PLS) as in the whitepaper

In [None]:
# Sanity check: print the number of entries in every packet-length sequence.
for row in df["splt_ps"]:
    sequence = eval(str(row))  # cells hold the textual form of a list
    print(len(sequence), row)

In [None]:
ps_df = df[["category", "splt_ps"]]

packet_positions = list(range(1, SLICING_LIMIT + 1))
categories = sorted(ps_df["category"].unique())
# One distinct colour per category. The previous hand-written list repeated
# several names (and used aliases such as cyan/aqua), and zip() silently
# dropped every category beyond the length of the list.
category_colors = sns.color_palette("husl", n_colors=len(categories))

# Generate data for each category: category -> (list of positions, list of lengths)
data = {}

for category, splt in zip(ps_df["category"], ps_df["splt_ps"]):
    positions, lengths = data.setdefault(category, ([], []))
    positions.append(packet_positions)
    # NOTE(review): eval() runs whatever text is stored in the CSV; only use
    # it on files produced by this notebook.
    lengths.append(eval(splt))

# Plot setup
plt.figure(figsize=(8, 6), dpi=300)

# Plot data points for each category
for category, color in zip(categories, category_colors):
    positions, lengths = data[category]
    plt.scatter(lengths, positions, label=category, color=color)

# Add labels, grid, and legend
plt.title('PLS')
plt.xlabel('Payload Length')
plt.ylabel('Packet Position')
plt.yticks(packet_positions)
plt.grid(axis='y', linestyle='--', linewidth=0.5)
plt.legend()

# Show the plot
plt.tight_layout()
plt.show()

# Machine learning

## Feature extraction

In [73]:
def extract_features(df):
    """Select the model input columns and the ``category`` label from ``df``.

    Returns:
        A DataFrame holding only the selected columns, in the order listed
        below. Indexing raises ``KeyError`` if a column is missing.
    """
    # Packet count columns are intentionally not selected:
    # "src2dst_packets", "dst2src_packets", "src2dst_bytes", "dst2src_bytes"

    # Packet Size
    packet_size = [
        "bidirectional_mean_ps", "bidirectional_stddev_ps",
        "src2dst_max_ps", "src2dst_min_ps", "src2dst_mean_ps", "src2dst_stddev_ps",
        "dst2src_max_ps", "dst2src_min_ps", "dst2src_mean_ps", "dst2src_stddev_ps",
    ]

    # Packet Interarrival Time (PIAT)
    interarrival = [
        "bidirectional_mean_piat_ms", "bidirectional_stddev_piat_ms",
        "bidirectional_max_piat_ms", "bidirectional_min_piat_ms",
        "src2dst_mean_piat_ms", "src2dst_stddev_piat_ms",
        "src2dst_max_piat_ms", "src2dst_min_piat_ms",
        "dst2src_mean_piat_ms", "dst2src_stddev_piat_ms",
        "dst2src_max_piat_ms", "dst2src_min_piat_ms",
    ]

    # Sample Entropy Fingerprint (SEF)
    entropy_fingerprint = ["udps.Hf", "udps.sus"]

    # Packet Length Sequence (PLS)
    length_sequence = ["splt_ps"]

    # Label
    label = ["category"]

    selected = packet_size + interarrival + entropy_fingerprint + length_sequence + label
    return df[selected]

# Keep only the feature columns and the label.
df_dropped = extract_features(df)

## Training the model(s)

In [None]:
def flatten(df, n_packets=None):
    """Turn the list-valued columns of ``df`` into scalar feature columns.

    ``udps.Hf`` is replaced by its mean (``udps.Hf_avg``) and median
    (``udps.Hf_med``); ``splt_ps`` is expanded into ``splt_ps_1`` ..
    ``splt_ps_<n_packets>``. The input frame is left untouched.

    Args:
        df: Frame with ``udps.Hf`` and ``splt_ps`` columns whose cells are
            lists or the textual form of lists (as read back from CSV).
        n_packets: Number of entries in every ``splt_ps`` list. Defaults to
            ``SLICING_LIMIT``.

    Returns:
        A new DataFrame with the same rows and index as ``df``.

    Raises:
        ValueError: Raised by pandas when a ``splt_ps`` list does not have
            ``n_packets`` entries.
    """
    if n_packets is None:
        n_packets = SLICING_LIMIT

    def _parse(cell):
        # NOTE(review): eval() executes the cell text; only feed it data
        # produced by this notebook.
        return eval(cell) if isinstance(cell, str) else cell

    # Work on a copy so the caller's frame (often a slice) is not modified.
    out = df.copy()

    entropies = out["udps.Hf"].apply(_parse)  # parsed once, used twice
    out["udps.Hf_avg"] = entropies.apply(np.mean)
    out["udps.Hf_med"] = entropies.apply(np.median)
    out = out.drop(["udps.Hf", "splt_ps"], axis=1)

    # Reuse the row index of the input: concat(axis=1) aligns on the index,
    # so a fresh 0..n-1 index would pair the sequences with the wrong rows
    # (or add all-NaN rows) whenever df does not have a default index.
    splt_df = pd.DataFrame(
        [_parse(cell) for cell in df["splt_ps"]],
        columns=[f"splt_ps_{i}" for i in range(1, n_packets + 1)],
        index=out.index,
    )
    return pd.concat([out, splt_df], axis=1)

# Expand the list-valued columns (udps.Hf, splt_ps) into scalar features.
df_flattened = flatten(df_dropped)

In [75]:
def encode(df):
    """Add an integer ``category_encoded`` column derived from ``category``.

    A new ``LabelEncoder`` is fitted on the ``category`` column of ``df``
    itself. The column is added to ``df`` in place.

    Returns:
        A ``(df, encoder)`` tuple: the same frame that was passed in and the
        fitted encoder.
    """
    encoder = LabelEncoder()
    encoder.fit(df["category"])
    df["category_encoded"] = encoder.transform(df["category"])
    return df, encoder


df_encoded, label_encoder = encode(df_flattened)

In [76]:
# Feature matrix (everything except the label columns) and integer target.
X = df_encoded.drop(["category", "category_encoded"], axis=1)
y = df_encoded["category_encoded"]

# Balance the classes: SMOTE over-sampling combined with ENN cleaning.
smote = SMOTE(k_neighbors=2, random_state=42)  # Adjust k_neighbors to smallest class size - 1
smote_enn = SMOTEENN(random_state=42, smote=smote)
X_comb, y_comb = smote_enn.fit_resample(X, y)

# NOTE(review): the split is taken from the resampled data, so the test set
# contains synthetic samples, and samples interpolated from a test row's
# neighbours can end up in the training set. Accuracy measured on X_test is
# therefore likely optimistic. Consider splitting first and resampling only
# the training part once there are enough captures per class to stratify.
X_train, X_test, y_train, y_test = train_test_split(X_comb, y_comb, test_size=0.2, random_state=42, stratify=y_comb)

In [None]:
def plot_label_frequencies(ds_names, ds, classes):
    """Draw one label histogram per dataset, side by side.

    Args:
        ds_names: Display name of each dataset (used in the subplot title).
        ds: Label collections, one per dataset, holding the integer class
            ids ``0 .. len(classes) - 1``.
        classes: Class names used as x tick labels, indexed by class id.
    """
    panels = list(zip(ds_names, ds))
    if not panels:
        return

    # One subplot per dataset instead of a fixed pair. squeeze=False keeps
    # `axes` two-dimensional even for a single panel.
    _, axes = plt.subplots(
        1, len(panels), figsize=(5 * len(panels), 4), dpi=300, squeeze=False
    )

    # Bin edges at -0.5, 0.5, ... so every integer label is centred in its bar.
    bin_edges = np.arange(len(classes) + 1) - 0.5
    tick_positions = np.arange(len(classes))

    for ax, (ds_type, labels) in zip(axes[0], panels):
        ax.hist(labels, bins=bin_edges, density=False, rwidth=0.6)
        ax.set_title(f'{ds_type} Labels')
        ax.set_xticks(tick_positions)
        ax.set_xticklabels(classes)
        ax.tick_params(axis='x', rotation=90)
        ax.set_xlabel('Class Labels')
        ax.set_ylabel('Frequency')

    plt.tight_layout()
    plt.show()

plot_label_frequencies(
    ['Training', 'Test'],   # Names of the datasets
    [y_train, y_test],      # Label data for training and testing
    label_encoder.classes_  # Class names or labels
)


In [None]:
# Random forest with default hyper-parameters and a fixed seed, fitted on the training split.
rf_clf = RandomForestClassifier(random_state=42)
rf_clf.fit(X_train, y_train)

In [None]:
test_predictions = rf_clf.predict(X_test)
test_accuracy = accuracy_score(y_test, test_predictions)
print(f'Test Accuracy: {test_accuracy}')

# Pass every encoded class id explicitly. Without `labels`, a class that is
# absent from both y_test and the predictions is left out of the matrix,
# which then no longer lines up with the tick labels below.
class_ids = np.arange(len(label_encoder.classes_))
cm = confusion_matrix(y_test, test_predictions, labels=class_ids, normalize='true')

plt.figure(figsize=(10, 10), dpi=300)
sns.heatmap(cm, annot=True, fmt='.2f', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_, cmap='Blues')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Normalized Confusion Matrix')
plt.show()

## Plotting feature importances

In [None]:
# Feature importances of the fitted forest, most important first.
importances = rf_clf.feature_importances_
indices = np.argsort(importances)[::-1]

n_features = X_train.shape[1]
bar_positions = range(n_features)

plt.figure(figsize=(10, 3), dpi=300)
plt.title("Feature importances")
plt.bar(bar_positions, importances[indices], color="r", align="center")
plt.xticks(bar_positions, X_train.columns[indices], rotation=90)
plt.xlim([-1, n_features])
plt.show()

# Names of the ten most important features (shown as the cell output).
X_train.columns[indices[:10]]

## Evaluating model performance

In [None]:
def load_custom_flow(name, path):
    """Load one capture and return the first slice of its dominant flow.

    The capture is processed with ``load_streams_from_pcap``. Only rows of
    the most frequent ``udps.flow_key`` that are labelled ``"first"`` are
    kept, which is the same selection applied to the training captures.

    Args:
        name: Category label written to the ``category`` column.
        path: Capture file to process.

    Returns:
        A DataFrame with a fresh 0..n-1 index, without the ``udps.label``
        and ``udps.flow_key`` columns and with ``category`` as last column.

    Raises:
        ValueError: If no flows could be extracted from the capture.
    """
    pcap_df = load_streams_from_pcap(path)
    if pcap_df is None or pcap_df.empty:
        raise ValueError(f"No flows could be extracted from {path!r}")

    # mode() replaces the manual iterrows() counting and matches how the
    # training captures pick their dominant flow.
    flow_keys = pcap_df["udps.flow_key"]
    max_flow = flow_keys.mode()[0]

    selected = pcap_df[(flow_keys == max_flow) & (pcap_df["udps.label"] == "first")]
    selected = selected.drop(["udps.label", "udps.flow_key"], axis=1)

    # assign() returns a new frame (no chained-assignment warning). The index
    # is reset because later steps concatenate column-wise on the row index.
    return selected.assign(category=name).reset_index(drop=True)

# Run the custom capture through the same steps as the training data.
# encode() fits a new LabelEncoder on this frame only, so the values in
# "category_encoded" are not comparable with the training encoding; the
# fitted encoder is discarded here.
custom_df, _ = encode(flatten(extract_features(load_custom_flow("nonvpn_sftp", "work/pcaps/sftptest.pcapng"))))
# Alternative capture to try:
# custom_df, _ = encode(flatten(extract_features(load_custom_flow("vpn_youtube", "work/pcaps/urbanvpn_youtube.pcapng"))))
custom_df

In [82]:
# Separate the label columns from the features of the custom capture.
label_columns = ["category", "category_encoded"]
X_custom_test = custom_df.drop(columns=label_columns)
y_custom_test = custom_df["category_encoded"]

In [None]:
custom_test_predictions = rf_clf.predict(X_custom_test)
custom_test_predictions = label_encoder.inverse_transform(custom_test_predictions)

# Report the most frequent predicted label (majority vote). The previous
# .max() picked the alphabetically last label, which only matches the
# prediction when there is a single row.
predicted_category = pd.Series(custom_test_predictions).mode()[0]
actual_category = custom_df["category"].iloc[0]
print(f"Predicted: {predicted_category}, actual: {actual_category}")