# Evaluating Vector Store

In [1]:
################################################################################
# Load the pre-split training and test sets and separate normal from attack entries
################################################################################

import pandas as pd
import os
from tabulate import tabulate

# Number of rows in the sampled dataset; it is part of the CSV file names
# below and is reused by the evaluation cell.
sample_size = 10000

# Columns removed from every split. `attack` is the flag the split is based on.
label_columns = ['attack', 'category', 'subcategory']


def split_by_attack(df):
    """Split `df` on its `attack` flag and drop the label columns.

    Returns a `(normal, attack)` tuple: rows with `attack == 0` and rows with
    `attack == 1`, each without the columns listed in `label_columns`.
    """
    normal = df[df['attack'] == 0].drop(columns=label_columns)
    attack = df[df['attack'] == 1].drop(columns=label_columns)
    return normal, attack


# Load dataset (paths are resolved against the current working directory)
data_dir = os.path.join(os.getcwd(), 'data')
df_train = pd.read_csv(os.path.join(data_dir, f'sample-{sample_size}-2_train.csv'))
df_test = pd.read_csv(os.path.join(data_dir, f'sample-{sample_size}-2_test.csv'))

# Split dataset according to attack type and drop columns
normal_df_train, attack_df_train = split_by_attack(df_train)
normal_df_test, attack_df_test = split_by_attack(df_test)

# Print dataset sizes in a table: one row per attack type with total/train/test counts
data = [
    [name, train.shape[0] + test.shape[0], train.shape[0], test.shape[0]]
    for name, train, test in (
        ("Normal", normal_df_train, normal_df_test),
        ("Attack", attack_df_train, attack_df_test),
    )
]
print(tabulate(data, headers=["Attack type", "Total", "Train", "Test"], tablefmt="grid"))

+--------------+---------+---------+--------+
| Atack type   |   Total |   Train |   Test |
+==============+=========+=========+========+
| Normal       |    5000 |    4000 |   1000 |
+--------------+---------+---------+--------+
| Attack       |    5000 |    4000 |   1000 |
+--------------+---------+---------+--------+


In [2]:
################################################################################
# Predict from Vector Store
################################################################################

from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from statistics import mode
from sklearn.metrics import classification_report, confusion_matrix
from tqdm import tqdm

# Size tag of the persisted vector store to evaluate against; here it is the
# same as the dataset sample size defined in the loading cell.
train_set_size = sample_size

# Create the output directory up front so that a missing folder is reported
# before the long prediction loop runs rather than after it.
result_path = f"results/result-{sample_size}-2-{train_set_size}.txt"
os.makedirs(os.path.dirname(result_path), exist_ok=True)

# No model name is given, so the library default is used.
# NOTE(review): presumably the same embedding model the store was built with - confirm.
embeddings = HuggingFaceEmbeddings()

vector_store = Chroma(
    collection_name="bot-iot",
    embedding_function=embeddings,
    persist_directory=f"./vector-stores/chroma-db-{train_set_size}-2")

# NOTE(review): with fetch_k equal to k, MMR presumably can only reorder the
# same 5 candidates a plain similarity search would return - confirm whether
# a larger fetch_k (or search_type="similarity") was intended.
retriever = vector_store.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 5, "fetch_k": 5})

# Extra keyword passed with every query; presumably applied by the vector
# store as a metadata filter on `source` - verify against the store's schema.
metadata_filter = {"source": "bot-iot"}

# The ground-truth label of a row is the name of the test frame it comes from.
datasets = {"normal": normal_df_test, "attack": attack_df_test}
y_pred = []
y_true = []
for attack_type, dataset in datasets.items():
    test_set_size = dataset.shape[0]
    for i in tqdm(range(test_set_size), ncols=100, desc=f"Predicting {attack_type} entries..."):
        # The query text is the string form of the row's values as a list.
        query_document = str(dataset.iloc[i].to_list())
        similar_documents = retriever.invoke(query_document, filter=metadata_filter)
        if not similar_documents:
            # Without this check, mode() fails below with an unhelpful
            # "no mode for empty data" error.
            raise ValueError(
                f"No documents retrieved for {attack_type} entry {i}; "
                "check that the vector store exists and is populated.")
        y_true.append(attack_type)
        # Predicted label = most common `label` metadata value among the retrieved documents.
        y_pred.append(mode([doc.metadata["label"] for doc in similar_documents]))

c_report = classification_report(y_true, y_pred)
c_matrix = confusion_matrix(y_true, y_pred)

# Persist the metrics next to the notebook, then echo them in the cell output.
with open(result_path, "w", encoding="utf-8") as f:
    f.write(f"Classification Report\n{c_report}\n\nConfusion Matrix\n{c_matrix}")

print(c_report)
print(c_matrix)

  from tqdm.autonotebook import tqdm, trange
Predicting normal entries...: 100%|█████████████████████████████| 1000/1000 [21:33<00:00,  1.29s/it]
Predicting attack entries...: 100%|█████████████████████████████| 1000/1000 [22:51<00:00,  1.37s/it]

              precision    recall  f1-score   support

      attack       0.87      0.99      0.93      1000
      normal       0.99      0.85      0.92      1000

    accuracy                           0.92      2000
   macro avg       0.93      0.92      0.92      2000
weighted avg       0.93      0.92      0.92      2000

[[992   8]
 [146 854]]



