In [1]:
################################################################################
# Load dataset and split it into training and test set
################################################################################

import pandas as pd
import os
from tabulate import tabulate

sample_size = 100000

# Read the sampled dataset from the data/ folder below the current working directory
df = pd.read_csv(os.getcwd() + f'/data/sample-{sample_size}-2.csv')

# Separate the rows by their 'Target' value (0 -> normal, 1 -> attack) and
# remove the 'Target' and 'Traffic' columns from both partitions
normal_df = df.loc[df['Target'] == 0].drop(columns=['Target', 'Traffic'])
attack_df = df.loc[df['Target'] == 1].drop(columns=['Target', 'Traffic'])

# 80/20 split per class: draw the training rows with a fixed seed and use
# every remaining row of that class as the test set
normal_df_train = normal_df.sample(frac=0.8, random_state=42)
attack_df_train = attack_df.sample(frac=0.8, random_state=42)
normal_df_test = normal_df.drop(index=normal_df_train.index)
attack_df_test = attack_df.drop(index=attack_df_train.index)

# Summarise the number of rows per class and per split in a grid table
data = [
    [name, len(full), len(train), len(test)]
    for name, full, train, test in (
        ("Normal", normal_df, normal_df_train, normal_df_test),
        ("Attack", attack_df, attack_df_train, attack_df_test),
    )
]
print(tabulate(data, headers=["Atack type", "Total", "Train", "Test"], tablefmt="grid"))

+--------------+---------+---------+--------+
| Atack type   |   Total |   Train |   Test |
+==============+=========+=========+========+
| Normal       |   50000 |   40000 |  10000 |
+--------------+---------+---------+--------+
| Attack       |   50000 |   40000 |  10000 |
+--------------+---------+---------+--------+


In [None]:
################################################################################
# Predict from Vector Store - Search Type Similarity (default)
################################################################################

from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from statistics import mode
from sklearn.metrics import classification_report, confusion_matrix
from tqdm import tqdm

train_set_size = 10000

# Create the output folder before the long prediction loop starts, so that a
# missing directory cannot make the final write fail and lose the results.
result_path = f"results/vs/result-vs-df-{sample_size}-2-{train_set_size}.txt"
os.makedirs(os.path.dirname(result_path), exist_ok=True)

embeddings = HuggingFaceEmbeddings()

# Open the persisted Chroma collection for the chosen training-set size
vector_store = Chroma(
    collection_name="wustl-iiot",
    embedding_function=embeddings,
    persist_directory=f"./vector-stores/chroma-db-{train_set_size}-2")

# Plain similarity search returning the 10 closest stored documents per query
retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 10})

# Classify every test row by majority vote over the labels of its neighbours
datasets = {"normal": normal_df_test, "attack": attack_df_test}
y_pred = []
y_true = []
for attack_type, dataset in datasets.items():
    test_set_size = dataset.shape[0]
    for i in tqdm(range(test_set_size), ncols=100, desc=f"Predicting {attack_type} entries..."):
        # The query text is the string form of the row's list of values
        query_document = str(dataset.iloc[i].to_list())
        # NOTE(review): the metadata filter is passed as an invoke() kwarg; whether it
        # reaches the vector store depends on the installed langchain-core version.
        # Confirm, or move it into search_kwargs.
        similar_documents = retriever.invoke(query_document, filter={"source": "wustl-iiot"})
        if not similar_documents:
            # mode([]) would raise an opaque StatisticsError; report the real cause instead
            raise ValueError(
                f"No documents retrieved for {attack_type} entry {i}; check that "
                f"./vector-stores/chroma-db-{train_set_size}-2 exists and is populated")
        y_true.append(attack_type)
        # k is even, so a tie between labels is possible: on Python 3.8+ mode() then
        # returns the first tied label in the returned order (older versions raise).
        y_pred.append(mode([doc.metadata["label"] for doc in similar_documents]))

c_report = classification_report(y_true, y_pred, digits=4)
# Rows are true labels and columns are predicted labels, in sorted label order
c_matrix = confusion_matrix(y_true, y_pred)

with open(result_path, "w") as f:
    f.write(f"Classication Report\n{c_report}\n\nConfusion Matrix\n{c_matrix}")

print(c_report)
print(c_matrix)

In [None]:
################################################################################
# Predict from Vector Store - Search Type MMR
################################################################################

from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from statistics import mode
from sklearn.metrics import classification_report, confusion_matrix
from tqdm import tqdm

train_set_size = sample_size

# Create the output folder before the long prediction loop starts, so that a
# missing directory cannot make the final write fail and lose the results.
result_path = f"results/vs/result-vs-mr-{sample_size}-2-{train_set_size}.txt"
os.makedirs(os.path.dirname(result_path), exist_ok=True)

embeddings = HuggingFaceEmbeddings()

# Open the persisted Chroma collection for the chosen training-set size
vector_store = Chroma(
    collection_name="wustl-iiot",
    embedding_function=embeddings,
    persist_directory=f"./vector-stores/chroma-db-{train_set_size}-2")

# MMR picks k documents out of a candidate pool of fetch_k nearest documents.
# The pool must be larger than k: with fetch_k == k every candidate is selected,
# MMR can only reorder them, and the vote matches plain similarity search
# except on ties.
mmr_k = 10
mmr_fetch_k = 50
retriever = vector_store.as_retriever(
    search_type="mmr",
    search_kwargs={"k": mmr_k, "fetch_k": mmr_fetch_k})

# Classify every test row by majority vote over the labels of the selected documents
datasets = {"normal": normal_df_test, "attack": attack_df_test}
y_pred = []
y_true = []
for attack_type, dataset in datasets.items():
    test_set_size = dataset.shape[0]
    for i in tqdm(range(test_set_size), ncols=100, desc=f"Predicting {attack_type} entries..."):
        # The query text is the string form of the row's list of values
        query_document = str(dataset.iloc[i].to_list())
        # NOTE(review): the metadata filter is passed as an invoke() kwarg; whether it
        # reaches the vector store depends on the installed langchain-core version.
        # Confirm, or move it into search_kwargs.
        similar_documents = retriever.invoke(query_document, filter={"source": "wustl-iiot"})
        if not similar_documents:
            # mode([]) would raise an opaque StatisticsError; report the real cause instead
            raise ValueError(
                f"No documents retrieved for {attack_type} entry {i}; check that "
                f"./vector-stores/chroma-db-{train_set_size}-2 exists and is populated")
        y_true.append(attack_type)
        # k is even, so a tie between labels is possible: on Python 3.8+ mode() then
        # returns the first tied label in the returned order (older versions raise).
        y_pred.append(mode([doc.metadata["label"] for doc in similar_documents]))

c_report = classification_report(y_true, y_pred, digits=4)
# Rows are true labels and columns are predicted labels, in sorted label order
c_matrix = confusion_matrix(y_true, y_pred)

with open(result_path, "w") as f:
    f.write(f"Classication Report\n{c_report}\n\nConfusion Matrix\n{c_matrix}")

print(c_report)
print(c_matrix)

In [2]:
################################################################################
# Predict from Vector Store - Mean Vector
################################################################################

from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from statistics import mode
from sklearn.metrics import classification_report, confusion_matrix
from tqdm import tqdm
import numpy as np
from sentence_transformers import util

train_set_size = 100000

# Create the output folder before the long prediction loop starts, so that a
# missing directory cannot make the final write fail and lose the results.
result_path = f"results/vs/result-vs-mv-{sample_size}-2-{train_set_size}.txt"
os.makedirs(os.path.dirname(result_path), exist_ok=True)

embeddings = HuggingFaceEmbeddings()

# Open the persisted Chroma collection for the chosen training-set size
vector_store = Chroma(
    collection_name="wustl-iiot",
    embedding_function=embeddings,
    persist_directory=f"./vector-stores/chroma-db-{train_set_size}-2")

# Fetch the stored embeddings of each class
normal_vectors = vector_store._collection.get(include=['embeddings'], where={'label': 'normal'})['embeddings']
attack_vectors = vector_store._collection.get(include=['embeddings'], where={'label': 'attack'})['embeddings']

# Fail fast on an empty selection: np.mean of an empty selection yields NaN
# (with only a warning), which would otherwise surface much later as a
# confusing error or a meaningless comparison.
for label, vectors in (("normal", normal_vectors), ("attack", attack_vectors)):
    if vectors is None or len(vectors) == 0:
        raise ValueError(
            f"No '{label}' embeddings found in ./vector-stores/chroma-db-{train_set_size}-2; "
            "check that the vector store exists and is populated")

# One centroid (element-wise mean embedding) per class
normal_mean_vector = np.mean(normal_vectors, axis=0).tolist()
attack_mean_vector = np.mean(attack_vectors, axis=0).tolist()

# Embed every test row and assign it to the class with the more similar centroid
datasets = {"normal": normal_df_test, "attack": attack_df_test}
y_pred = []
y_true = []
for attack_type, dataset in datasets.items():
    test_set_size = dataset.shape[0]
    for i in tqdm(range(test_set_size), ncols=100, desc=f"Predicting {attack_type} entries..."):
        # The query text is the string form of the row's list of values
        query_content = str(dataset.iloc[i].to_list())
        query_embedding = embeddings.embed_query(query_content)
        # Cosine similarity to each centroid; .item() unwraps the 1x1 result to a float
        normal_similarity = util.cos_sim(query_embedding, normal_mean_vector).item()
        attack_similarity = util.cos_sim(query_embedding, attack_mean_vector).item()
        y_true.append(attack_type)
        # Strict comparison: an exact tie is classified as "attack"
        y_pred.append("normal" if normal_similarity > attack_similarity else "attack")

c_report = classification_report(y_true, y_pred, digits=4)
# Rows are true labels and columns are predicted labels, in sorted label order
c_matrix = confusion_matrix(y_true, y_pred)

with open(result_path, "w") as f:
    f.write(f"Classication Report\n{c_report}\n\nConfusion Matrix\n{c_matrix}")

print(c_report)
print(c_matrix)

  from tqdm.autonotebook import tqdm, trange
Predicting normal entries...: 100%|█████████████████████████| 10000/10000 [1:30:30<00:00,  1.84it/s]
Predicting attack entries...: 100%|█████████████████████████| 10000/10000 [1:47:07<00:00,  1.56it/s]


              precision    recall  f1-score   support

      attack     0.9824    0.8759    0.9261     10000
      normal     0.8880    0.9843    0.9337     10000

    accuracy                         0.9301     20000
   macro avg     0.9352    0.9301    0.9299     20000
weighted avg     0.9352    0.9301    0.9299     20000

[[8759 1241]
 [ 157 9843]]


In [9]:
################################################################################
# Predict from Vector Store - Mean Vector (test embeddings read from the test vector store)
################################################################################

from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from statistics import mode
from sklearn.metrics import classification_report, confusion_matrix
from tqdm import tqdm
import numpy as np
from sentence_transformers import util

train_set_size = 100000
test_set_size = 100000
dataset_name = "wustl-iiot"

# Create the output folder up front so the final write cannot fail on a
# missing directory.
result_path = f"results/vs/result-vs-mv-{train_set_size}-{test_set_size}-new.txt"
os.makedirs(os.path.dirname(result_path), exist_ok=True)

embeddings = HuggingFaceEmbeddings()

# Training store: source of the per-class mean vectors
vector_store = Chroma(
    collection_name=dataset_name,
    embedding_function=embeddings,
    persist_directory=f"./vector-stores/chroma-db-{train_set_size}-2")

normal_vectors = vector_store._collection.get(include=['embeddings'], where={'label': 'normal'})['embeddings']
attack_vectors = vector_store._collection.get(include=['embeddings'], where={'label': 'attack'})['embeddings']

# Test store: holds the already-embedded test entries, so no embedding is
# computed inside the prediction loop
vector_store_test = Chroma(
    collection_name=dataset_name+"-test",
    embedding_function=embeddings,
    persist_directory=f"./vector-stores/chroma-db-{test_set_size}-2-test")

normal_vectors_test = vector_store_test._collection.get(include=['embeddings'], where={'label': 'normal'})['embeddings']
attack_vectors_test = vector_store_test._collection.get(include=['embeddings'], where={'label': 'attack'})['embeddings']
# Backward-compatible alias for the previously misspelled variable name
attack_vectors_teset = attack_vectors_test

# Fail fast on an empty selection: np.mean of an empty selection yields NaN
# (with only a warning), and an empty test selection would silently drop a
# whole class from the evaluation.
for description, vectors in (
        ("train 'normal'", normal_vectors),
        ("train 'attack'", attack_vectors),
        ("test 'normal'", normal_vectors_test),
        ("test 'attack'", attack_vectors_test)):
    if vectors is None or len(vectors) == 0:
        raise ValueError(
            f"No {description} embeddings found; check that the vector stores under "
            "./vector-stores exist and are populated")

# One centroid (element-wise mean embedding) per class
normal_mean_vector = np.mean(normal_vectors, axis=0).tolist()
attack_mean_vector = np.mean(attack_vectors, axis=0).tolist()

# Assign each test embedding to the class with the more similar centroid
vectors_test = {"normal": normal_vectors_test, "attack": attack_vectors_test}
y_pred = []
y_true = []
for attack_type, vectors in vectors_test.items():
    for query_embedding in tqdm(vectors, ncols=100, desc=f"Predicting {attack_type} entries..."):
        # Cosine similarity to each centroid; .item() unwraps the 1x1 result to a float
        normal_similarity = util.cos_sim(query_embedding, normal_mean_vector).item()
        attack_similarity = util.cos_sim(query_embedding, attack_mean_vector).item()
        y_true.append(attack_type)
        # Strict comparison: an exact tie is classified as "attack"
        y_pred.append("normal" if normal_similarity > attack_similarity else "attack")

c_report = classification_report(y_true, y_pred, digits=4)
# Rows are true labels and columns are predicted labels, in sorted label order
c_matrix = confusion_matrix(y_true, y_pred)

with open(result_path, "w") as f:
    f.write(f"Classication Report\n{c_report}\n\nConfusion Matrix\n{c_matrix}")

print(c_report)
print(c_matrix)

Predicting normal entries...: 100%|██████████████████████████| 10000/10000 [00:11<00:00, 881.30it/s]
Predicting attack entries...: 100%|██████████████████████████| 10000/10000 [00:24<00:00, 415.42it/s]


              precision    recall  f1-score   support

      attack     0.9824    0.8759    0.9261     10000
      normal     0.8880    0.9843    0.9337     10000

    accuracy                         0.9301     20000
   macro avg     0.9352    0.9301    0.9299     20000
weighted avg     0.9352    0.9301    0.9299     20000

[[8759 1241]
 [ 157 9843]]
