In [39]:
################################################################################
# Predict from Vector Store
################################################################################

from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from statistics import mode
from sklearn.metrics import classification_report, confusion_matrix
from tqdm import tqdm
import numpy as np

# Size of the training sample; it selects which persisted Chroma directory
# (./vector-stores/chroma-db-<size>-2) is opened below.
train_set_size = 100

# No model name is passed, so the library's default embedding model is used.
embeddings = HuggingFaceEmbeddings()

# Open the "cic-iot" collection from the on-disk store for this sample size.
vector_store = Chroma(
    collection_name="cic-iot",
    embedding_function=embeddings,
    persist_directory=f"./vector-stores/chroma-db-{train_set_size}-2")

# NOTE(review): with fetch_k == k, MMR receives exactly k candidates and must
# return k of them, so it can only reorder the plain similarity hits.
# Presumably fetch_k should exceed k if diversity re-ranking is intended --
# confirm before changing, since it alters which documents are retrieved.
retriever = vector_store.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 5, "fetch_k": 5})

# https://stackoverflow.com/questions/76482987/chroma-database-embeddings-none-when-using-get
# print(len(vector_store._collection.get(include=['embeddings'])["embeddings"]))

# Embeddings must be requested explicitly through `include` (see link above).
# `_collection` is a private attribute of the LangChain wrapper, used here to
# reach the underlying Chroma collection directly.
vectors = vector_store._collection.get(include=['embeddings'], where={'label': 'attack'})['embeddings']

# Fail fast with a clear message: averaging an empty result would only emit a
# RuntimeWarning and produce NaN, which then breaks the query below in a much
# less obvious way.
if vectors is None or len(vectors) == 0:
    raise ValueError(
        "No embeddings with label 'attack' found in collection 'cic-iot' "
        f"(train_set_size={train_set_size}); cannot compute a mean vector."
    )

# Element-wise mean over all matching embeddings, converted to a plain list so
# it can be passed as a query embedding.
mean_vector = np.mean(vectors, axis=0).tolist()

# get the most similar documents using the mean vector
similar_documents = vector_store._collection.query(query_embeddings=[mean_vector], n_results=5)
print(similar_documents['documents'])


# for attack_type, dataset in datasets.items():
#     test_set_size = dataset.shape[0]
#     for i in tqdm(range(test_set_size), ncols=100, desc=f"Predicting {attack_type} entries..."):
#         query_document = str(dataset.iloc[i].to_list())
#         similar_documents = retriever.invoke(query_document, filter={"source": "cic-iot"})
#         y_true.append(attack_type)
#         y_pred.append(mode([doc.metadata["label"] for doc in similar_documents]))

# c_report = classification_report(y_true, y_pred)
# c_matrix = confusion_matrix(y_true, y_pred)

# with open(f"results/result-{sample_size}-2-{train_set_size}.txt", "w") as f:
#     f.write(f"Classification Report\n{c_report}\n\nConfusion Matrix\n{c_matrix}")

# print(c_report)
# print(c_matrix)

[['[0.5280917382240295, 87.48, 6.0, 64.0, 2.349469286178324, 2.349469286178324, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.62, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 567.0, 54.0, 54.0, 54.0, 0.0, 54.0, 83361180.06697729, 9.5, 10.392304845413264, 0.0, 0.0, 0.0, 141.55]', '[4.034660081863404, 108.0, 6.0, 64.0, 0.4957047676197929, 0.4957047676197929, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 567.0, 54.0, 54.0, 54.0, 0.0, 54.0, 82980885.10488626, 9.5, 10.392304845413264, 0.0, 0.0, 0.0, 141.55]', '[7.582636857032776, 108.0, 6.0, 64.0, 0.2637605508842739, 0.2637605508842739, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 567.0, 54.0, 54.0, 54.0, 0.0, 54.0, 82947503.09824194, 9.5, 10.392304845413264, 0.0, 0.0, 0.0, 141.55]', '[0.0, 54.0, 6.0, 64.0, 1.4805266960844423, 1.4805266960844423, 0.0, 0.0, 0.0, 0.0, 0.

In [40]:
# Display the name of the embedding model held by `embeddings`.
embeddings.model_name
Intel(R) Xeon(R) Silver 4309Y CPU @ 2.80GHz

'sentence-transformers/all-mpnet-base-v2'