### Load and save dataset as json

In [2]:
import pandas as pd
import os
import json

# Read the Edge-IIoTset CSV and keep only the rows whose Attack_type is a DDoS variant.
attack_types = ['DDoS_HTTP', 'DDoS_UDP', 'DDoS_ICMP', 'DDoS_TCP']
csv_path = f"{os.getcwd()}/../data/edge-iiot/Edge-IIoTset dataset/Selected dataset for ML and DL/ML-EdgeIIoT-dataset.csv"
df = pd.read_csv(csv_path, low_memory=False)
is_ddos = df['Attack_type'].isin(attack_types)
# The label columns are dropped so the saved records contain features only.
ddos_df = df.loc[is_ddos].drop(columns=['Attack_label', 'Attack_type'])

# 80/20 split: sample the training rows, then keep every remaining row for testing.
ddos_df_train = ddos_df.sample(frac=0.8, random_state=42)
ddos_df_test = ddos_df.loc[~ddos_df.index.isin(ddos_df_train.index)]

print("Training set size: ", ddos_df_train.shape)
print("Test set size: ", ddos_df_test.shape)

# Write each split to its own JSON file as a list of per-row records.
for out_path, split_df in (
    ('edge-iiotset-ddos-train.json', ddos_df_train),
    ('edge-iiotset-ddos-test.json', ddos_df_test),
):
    with open(out_path, 'w') as out_file:
        json.dump(split_df.to_dict(orient='records'), out_file, indent=4)

Training set size:  (39517, 61)
Test set size:  (9879, 61)


In [3]:
import pandas as pd
import os
import json

# Load dataset and keep only the rows labelled as normal traffic.
attack_types = ['Normal']
df = pd.read_csv(os.getcwd() + '/../data/edge-iiot/Edge-IIoTset dataset/Selected dataset for ML and DL/ML-EdgeIIoT-dataset.csv', low_memory=False)
normal_df = df[df['Attack_type'].isin(attack_types)]
# The label columns are dropped so the saved records contain features only.
normal_df = normal_df.drop(columns=['Attack_label', 'Attack_type'])
# NOTE(review): 0.79 differs from the 0.8 fraction used for the DDoS split — confirm this is intentional.
normal_df_train = normal_df.sample(frac=0.79, random_state=42)
normal_df_test = normal_df.drop(normal_df_train.index)

print("Training set size: ", normal_df_train.shape)
print("Test set size: ", normal_df_test.shape)

# Save dataset to json.
# These files are read back by later cells ('edge-iiotset-normal-train.json' and
# 'edge-iiotset-normal-test.json'), so they must be written here for the notebook
# to run from a fresh kernel / clean working directory.
with open('edge-iiotset-normal-train.json', 'w') as f:
    json.dump(normal_df_train.to_dict(orient='records'), f, indent=4)

with open('edge-iiotset-normal-test.json', 'w') as f:
    json.dump(normal_df_test.to_dict(orient='records'), f, indent=4)

Training set size:  (19198, 61)
Test set size:  (5103, 61)


### Save dataset (training set) as a vector store

Using the following tools:
- [Chroma db](https://docs.trychroma.com/getting-started)
- [Langchain chroma integration](https://python.langchain.com/v0.2/docs/integrations/vectorstores/chroma/)

Saving only a small subset of records (64 in the cell below: 32 DDoS and 32 normal), as it takes about 0.1 min per record to save.

In [11]:
import json

# Read the two training splits back from their JSON files (one list of records each).
with open('edge-iiotset-ddos-train.json', 'r') as ddos_train_file:
    ddos_json_train = json.loads(ddos_train_file.read())

with open('edge-iiotset-normal-train.json', 'r') as normal_train_file:
    normal_json_train = json.loads(normal_train_file.read())

In [12]:
# pip install langchain
# pip install langchain-huggingface
# pip install sentence-transformers
# https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
# https://python.langchain.com/v0.2/docs/integrations/text_embedding/huggingfacehub/

from langchain_openai import OpenAIEmbeddings
from langchain_community.embeddings.ollama import OllamaEmbeddings
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
from tqdm import tqdm
import os
import dotenv

# Initialize embeddings
# NOTE: whichever embedding model is active here must also be the one used when the
# persisted collection is queried later; vectors from different models are not comparable.

# 1. OpenAIEmbeddings
# dotenv.load_dotenv()
# API_KEY = os.getenv("MY_OPENAI_API_KEY")
# embeddings = OpenAIEmbeddings(model="text-embedding-3-large", openai_api_key=API_KEY)

# 2. OllamaEmbeddings
# embeddings = OllamaEmbeddings(model="all-minilm")

# 3. HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings()

# Initialize vector store (persisted on disk under ./chroma_langchain_db)
vector_store = Chroma(
    collection_name="edge-iiotset",
    embedding_function=embeddings, 
    persist_directory="./chroma_langchain_db")

# Add batched documents to vector store
batch_size = 4
total_ddos_documents = 32
total_normal_documents = 32


def _add_labelled_records(records, label, limit):
    """Embed the first `limit` records and store them in `vector_store` under `label`.

    Args:
        records: list of JSON-style records; each one is stringified to form the
            document text.
        label: value stored in each document's "label" metadata field.
        limit: maximum number of records to add; clamped to ``len(records)`` so an
            oversized limit cannot index past the end of the list.

    Returns:
        The number of records actually added.
    """
    limit = min(limit, len(records))
    for start in tqdm(range(0, limit, batch_size), ncols=100, desc=f"Adding {label} documents to vector store"):
        chunk = records[start:min(start + batch_size, limit)]
        documents = [
            Document(page_content=str(record), metadata={"source": "edge-iiotset", "label": label})
            for record in chunk
        ]
        # Deterministic ids: re-running this cell overwrites the same entries instead of
        # appending duplicate documents to the persisted collection.
        ids = [f"edge-iiotset-{label}-{start + offset}" for offset in range(len(chunk))]
        vector_store.add_documents(documents, ids=ids)
    return limit


added_ddos_documents = _add_labelled_records(ddos_json_train, "ddos", total_ddos_documents)
added_normal_documents = _add_labelled_records(normal_json_train, "normal", total_normal_documents)

vector_store.persist()
print(f"Total number of documents added: {added_ddos_documents+added_normal_documents}")

Adding ddos documents to vector store: 100%|██████████████████████████| 8/8 [00:10<00:00,  1.25s/it]
Adding normal documents to vector store: 100%|████████████████████████| 8/8 [00:11<00:00,  1.44s/it]

Total number of documents added: 64





### Retrieve similar documents from vector store

In [1]:
import json

# Read the two held-out test splits back from their JSON files (one list of records each).
with open('edge-iiotset-ddos-test.json', 'r') as ddos_test_file:
    ddos_json_test = json.loads(ddos_test_file.read())

with open('edge-iiotset-normal-test.json', 'r') as normal_test_file:
    normal_json_test = json.loads(normal_test_file.read())

In [14]:
from langchain_community.embeddings.ollama import OllamaEmbeddings
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

# Queries must be embedded with the same model that was used to populate the collection.
# The store-building cell currently uses HuggingFaceEmbeddings(), so the same is used here.
# If that cell is switched to OllamaEmbeddings (or another model), switch this line with it.
embeddings = HuggingFaceEmbeddings()
vector_store = Chroma(
    collection_name="edge-iiotset",
    embedding_function=embeddings, 
    persist_directory="./chroma_langchain_db")
# The metadata filter is part of search_kwargs so it is applied to every search made
# through the retriever.
# NOTE(review): fetch_k equals k, so MMR can only re-order the 5 nearest hits — confirm intended.
retriever = vector_store.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 5, "fetch_k": 5, "filter": {"source": "edge-iiotset"}})

# Run one DDoS and one normal test record through the retriever and show the labels
# of the stored documents it returns.
for query_document in (ddos_json_test[0], normal_json_test[0]):
    # Stored documents were built from str(record), so the query is stringified the same
    # way; the retriever expects a text query rather than a dict.
    similar_documents = retriever.invoke(str(query_document))
    print("Query document:", query_document)
    print("Label of result documents:", [doc.metadata["label"] for doc in similar_documents])

Query document: {'frame.time': ' 2021 11:35:30.015530000 ', 'ip.src_host': '192.168.0.170', 'ip.dst_host': '192.168.0.128', 'arp.dst.proto_ipv4': '0', 'arp.opcode': 0.0, 'arp.hw.size': 0.0, 'arp.src.proto_ipv4': '0', 'icmp.checksum': 0.0, 'icmp.seq_le': 0.0, 'icmp.transmit_timestamp': 0.0, 'icmp.unused': 0.0, 'http.file_data': '0', 'http.content_length': 0.0, 'http.request.uri.query': '0.0', 'http.request.method': '0', 'http.referer': '0', 'http.request.full_uri': '0', 'http.request.version': '0', 'http.response': 0.0, 'http.tls_port': 0.0, 'tcp.ack': 0.0, 'tcp.ack_raw': 0.0, 'tcp.checksum': 10089.0, 'tcp.connection.fin': 0.0, 'tcp.connection.rst': 0.0, 'tcp.connection.syn': 1.0, 'tcp.connection.synack': 0.0, 'tcp.dstport': 80.0, 'tcp.flags': 2.0, 'tcp.flags.ack': 0.0, 'tcp.len': 0.0, 'tcp.options': '020405b40402080a0aa4805a0000000001030307', 'tcp.payload': '0', 'tcp.seq': 0.0, 'tcp.srcport': '56070.0', 'udp.port': 0.0, 'udp.stream': 0.0, 'udp.time_delta': 0.0, 'dns.qry.name': 0.0, 'dn