# Packet Anomaly detection

We used the dataset published by the Canadian Institute for Cybersecurity (the download page is only intermittently available): https://www.unb.ca/cic/datasets/iotdataset-2022.html

This dataset is simulated data from the following setup:

<img src="https://raw.githubusercontent.com/b-yond-infinite-network/sharkfest-europe-2023/main/assets/iot-setup.jpg">


To create chunks from a large PCAP file:


```
editcap -c [chunk size] [inputfile.pcap]  [outputdir]/[prefix].pcap
```

**In the following, we fix the chunk size to 10000 Frames.**

To extract pcap into a csv file:


```
tshark -r [file.pcap]  -T fields -e frame.number -e frame.interface_id -e frame.len -e frame.protocols -e frame.time_delta -e ip.hdr_len -e ip.len -e ip.proto -e ip.ttl -e ip.version -E aggregator="$" -E separator=";" -E header=y
```



## Verify runtime environment

In [None]:
import pandas as pd
import numpy as np
try:
    import google.colab
    IN_COLAB = True
    # Load the autoreload extension for IPython
    %load_ext autoreload
    # Set the autoreload extension to reload modules every time they are imported, so that changes made to code in the src folder are reflected in the running code
    %autoreload 2
    %pip install scikit-learn==1.3.1
except:
    IN_COLAB = False


## Load Data
The first step is to read the data from disk. The data has been extracted using `tshark` and stored in two CSV files, `extract_active.csv` and `extract_idle.csv`.

In [None]:
%%time
# Read the zipped CSV exports for the "active" and "idle" captures directly
# from their GitHub URLs; the first CSV column is used as the DataFrame index.
df_active = pd.read_csv("https://github.com/b-yond-infinite-network/sharkfest-europe-2023-data/raw/main/packet-anomaly-detection/packet-anomaly-detection-data-active.zip", compression='zip' ,index_col=0)
df_idle = pd.read_csv("https://github.com/b-yond-infinite-network/sharkfest-europe-2023-data/raw/main/packet-anomaly-detection/packet-anomaly-detection-data-idle.zip", compression='zip' ,index_col=0)


In [None]:
# Print a summary (columns, dtypes, non-null counts) of each loaded frame.
df_active.info()
df_idle.info()

In [None]:

def keep_columns_with_data(df):
    """Return ``df`` restricted to the columns holding at least one non-null value.

    Columns in which every entry is missing are dropped; row order and the
    relative order of the remaining columns are unchanged.
    """
    has_any_value = df.notna().any(axis=0)
    return df.loc[:, has_any_value]

def encode_protocols(df, colname):
    """One-hot encode a ':'-separated protocol-stack column.

    Each distinct token found in ``df[colname]`` becomes its own 0/1 indicator
    column appended to the right of the frame, and ``colname`` itself is
    removed. A new frame is returned; ``df`` is not modified.
    """
    indicator_cols = df[colname].str.get_dummies(sep=':')
    widened = pd.concat([df, indicator_cols], axis=1)
    return widened.drop(columns=colname)

def create_index(df):
    """Index the frame by the string form of its ``file`` column, in place.

    The ``file`` and ``frame.number`` columns are dropped afterwards, so the
    only trace of the source file left is the (non-unique) index.

    The conversion is a vectorised cast of the ``file`` column rather than a
    row-wise ``apply``: it avoids calling a Python function per row and avoids
    the dtype upcasting that happens when a whole row is materialised as one
    Series.

    Args:
        df: frame containing at least ``file`` and ``frame.number`` columns.
            It is modified in place.

    Returns:
        The same ``df`` object, re-indexed and with the two columns removed.
    """
    # .to_numpy() discards the Series name so the index stays unnamed.
    df.index = df['file'].astype(str).to_numpy()
    df.drop(['file', 'frame.number'], axis=1, inplace=True)
    return df

def clean_nested(df):
    """Reduce '$'-aggregated IP fields to their first value and make them numeric.

    For each of the five ``ip.*`` columns, every entry is stringified, cut at
    the first '$', and converted with ``pd.to_numeric``; anything that does
    not parse becomes NaN. ``df`` is modified in place and returned.
    """
    nested_cols = ['ip.hdr_len', 'ip.len', 'ip.proto', 'ip.ttl', 'ip.version']

    def _first_value(raw):
        # Text before the first '$' (the whole text when there is none).
        return str(raw).partition('$')[0]

    for name in nested_cols:
        df[name] = df[name].map(_first_value)
    for name in nested_cols:
        df[name] = pd.to_numeric(df[name], errors='coerce')
    return df

def fill_missing_values(df):
    """Replace every missing value in ``df`` with -1, in place, and return ``df``."""
    df.fillna(value=-1, inplace=True)
    return df

def preprocess(df):
    """Run the full cleaning pipeline on a raw per-packet frame.

    Steps, in order: one-hot encode ``frame.protocols``, index by ``file``,
    make the ``ip.*`` fields numeric, drop all-null columns, fill the
    remaining gaps with -1.
    """
    frame = encode_protocols(df, 'frame.protocols')
    remaining_steps = (
        create_index,
        clean_nested,
        keep_columns_with_data,
        fill_missing_values,
    )
    for step in remaining_steps:
        frame = step(frame)
    return frame

def create_features(df):
    """Collapse rows sharing an index value into one row of column sums."""
    rows_by_index_value = df.groupby(level=0)
    return rows_by_index_value.sum()

In [None]:
%%time
# Clean the per-packet rows of the active capture, then sum them into one
# feature row per `file` value (the index set during preprocessing).
df_active = create_features(preprocess(df_active))

In [None]:
%%time
# Same pipeline for the idle capture: clean the per-packet rows, then sum
# them into one feature row per `file` value.
df_idle = create_features(preprocess(df_idle))

In [None]:
# Keep only the idle features whose column mean exceeds 10. `.loc` aligns the
# boolean mask with the columns by label, so no sorting of the means is needed.
idle_column_means = df_idle.mean()
df_idle = df_idle.loc[:, idle_column_means > 10]

# Features present in both frames, in df_idle's column order. A list
# comprehension is used instead of a set intersection so the column order is
# the same on every run.
columns = [col for col in df_idle.columns if col in df_active.columns]


In [None]:
# Restrict both frames to the shared feature columns, in the same order.
df_active = df_active[columns]
df_idle = df_idle[columns]

In [None]:
# (rows, columns) of the active feature frame after column selection.
df_active.shape

In [None]:
%%time
from sklearn.model_selection import train_test_split
# Hold out 1% of the idle rows and 99% of the active rows for testing.
# A fixed random_state (the same seed the IsolationForest cell uses) makes the
# split, and therefore the reported metrics, reproducible between runs.
pos_train, pos_test = train_test_split(df_idle, test_size=0.01, random_state=42)
neg_train, neg_test = train_test_split(df_active, test_size=0.99, random_state=42)

In [None]:
# The training set contains idle rows only; the test set is the held-out idle
# rows followed by the held-out active rows (this order is relied on when the
# labels are assigned).
train = pd.concat([pos_train])
test = pd.concat([pos_test,neg_test])

In [None]:
from sklearn.decomposition import PCA
# Fit a 2-component PCA on the training rows only and project both splits.
pca = PCA(n_components=2)
pca_train = pd.DataFrame(pca.fit_transform(train))
pca_test =  pd.DataFrame(pca.transform(test))
# Ground-truth labels are added only after the projection, so they are not
# part of the PCA input: 1 = idle, -1 = active. The test labels rely on `test`
# being pos_test rows followed by neg_test rows.
train['label'] = len(train)*[1]
test['label'] = len(pos_test)*[1]+len(neg_test)*[-1]

In [None]:
%%time
from sklearn.ensemble import IsolationForest
# Fit the isolation forest on the projected idle training rows, then score
# both splits; predictions are 1 (inlier) or -1 (outlier).
iforest = IsolationForest(contamination=0.5, random_state=42).fit(pca_train)
preds_train, preds_test = (
    iforest.predict(projected) for projected in (pca_train, pca_test)
)
train['preds'], test['preds'] = preds_train, preds_test

In [None]:
from sklearn.metrics import classification_report,confusion_matrix
# Per-class precision/recall on the test split, followed by the confusion
# matrices of the test split and then the training split.
print(classification_report(test['label'], test['preds']))
for split in (test, train):
    print(confusion_matrix(split['label'], split['preds']))

In [None]:
%%time
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


# Plot the training rows in the plane of their first two principal components:
# colour encodes the true label, marker shape encodes the model's prediction.

# Only the training split is plotted; swap in the commented line to add test.
# combined_df = pd.concat([train, test])
combined_df = pd.concat([train])

# Set the label/prediction columns aside so PCA only sees the features.
labels = combined_df['label'].to_list()
preds = combined_df['preds'].to_list()
combined_df.drop(['label', 'preds'], axis=1, inplace=True)

# Apply PCA
pca = PCA(n_components=2)
pca.fit(combined_df)
principal_components = pca.transform(combined_df)

# Create a DataFrame with the principal components
pc_df = pd.DataFrame(data=principal_components, columns=['PC1', 'PC2'])
pc_df['label'] = labels
pc_df['preds'] = preds

# Plot the 2D PCA results
plt.figure(figsize=(10, 7))
# Labels and predictions share one convention: 1 = idle (the class the model
# was trained on, i.e. inlier), -1 = active (outlier).
class_names = {1: 'Idle', -1: 'Active'}
colors = {1: 'red', -1: 'blue'}
markers = {-1: 'x', 1: '^'}

for actual_value, group in pc_df.groupby('label'):
    for pred_value, sub_group in group.groupby('preds'):
        plt.scatter(
            sub_group['PC1'],
            sub_group['PC2'],
            c=colors[actual_value],
            s=50,
            marker=markers[pred_value],
            alpha=1,
        )

plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('2D PCA of Idle vs. Active PCAP Summaries')

# Build the legend from the same dicts used for plotting so the text cannot
# drift from the colours/markers actually drawn. linestyle='None' shows the
# bare marker instead of a marker on a line segment.
legend_handles = [
    plt.Line2D(
        [0], [0],
        linestyle='None',
        marker=markers[pred_value],
        color=colors[actual_value],
        markerfacecolor=colors[actual_value],
        markersize=10,
        label=f'{class_names[actual_value]}, predicted {class_names[pred_value]}',
    )
    for actual_value in (1, -1)
    for pred_value in (-1, 1)
]
plt.legend(handles=legend_handles)
plt.grid(True)
plt.show()


# Compute the explained variance by the two principal components
explained_variance = pca.explained_variance_ratio_

print(explained_variance)

In [None]:
%%time
from sklearn.inspection import DecisionBoundaryDisplay
from matplotlib.pyplot import figure


# Project the test rows with the already-fitted `pca`, stack them under the
# projected training rows, and draw the IsolationForest decision function
# over that plane.
pc_df_test = pd.DataFrame(pca.transform(test.drop(['preds','label'],axis=1)),columns=['PC1','PC2'])
# Concatenate the two label lists (a bare comma would build a 2-tuple of
# lists) so there is exactly one colour value per row of X.
labels = train['label'].to_list() + test['label'].to_list()
X = pd.concat([pc_df.drop(['label','preds'],axis=1),pc_df_test])

disp = DecisionBoundaryDisplay.from_estimator(
    iforest,
    X,
    response_method="decision_function",
    alpha=0.5,

)

# All points, coloured by true label (1 = idle, -1 = active) ...
scatter = disp.ax_.scatter(X['PC1'], X['PC2'], c=labels, s=20, edgecolor="k",alpha=0.3)
# ... with the test points drawn again on top in the default colour.
disp.ax_.scatter(pc_df_test['PC1'],pc_df_test['PC2'])
disp.ax_.set_title("Path length decision boundary \nof IsolationForest")
# plt.axis("square")
handles, _ = scatter.legend_elements()
# disp.ax_.set_ylim([-1,1])
# plt.legend(handles=handles, labels=["outliers", "inliers"], title="true class")
# NOTE(review): which artist collections[1] refers to depends on how the
# installed matplotlib registers contour sets -- confirm it is the intended one.
plt.colorbar(disp.ax_.collections[1])
# Render the figure drawn above; display(figure()) created and showed a new,
# empty figure instead.
plt.show()

In [None]:
%%time
# Show the 2-D PCA projection of the test rows.
pc_df_test

In [None]:
# Let pandas print (practically) every row and up to 400 characters per cell.
pd.set_option('display.max_rows', 99999)
pd.set_option('max_colwidth', 400)

In [None]:
# Number of idle feature columns whose mean is greater than 3.
int((df_idle.mean() > 3).sum())

In [None]:
# Side-by-side 20-bin histograms of the summed `frame.time_delta` feature:
# first series is idle, second is active.
fig, ax1 = plt.subplots()
ax1.hist([df_idle['frame.time_delta'],df_active['frame.time_delta']],bins=20)


In [None]:
# Histogram of the summed `frame.time_delta` feature for the idle rows only.
df_idle['frame.time_delta'].hist()