In [1]:
import pandas as pd
import os
import json

# Load the Edge-IIoTset ML dataset. The path is built from the current working
# directory, so the notebook must be started from its own folder for the
# '../../data/...' hop to resolve.
# low_memory=False asks pandas to parse the file in one pass instead of in
# chunks, which avoids per-chunk (mixed) dtype inference.
df = pd.read_csv(
    filepath_or_buffer=os.getcwd() + '/../../data/edge-iiot/Edge-IIoTset dataset/Selected dataset for ML and DL/ML-EdgeIIoT-dataset.csv',
    low_memory=False,
)
def _split_by_label(frame, label, train_frac=0.8, seed=42):
    """Select the rows of one class and split them into train/test parts.

    Args:
        frame: DataFrame that contains the 'Attack_label' and 'Attack_type'
            columns.
        label: Value of 'Attack_label' to keep (below, 1 is used for the
            attack frames and 0 for the normal frames).
        train_frac: Fraction of the selected rows sampled into the training
            part.
        seed: Passed as ``random_state`` to ``DataFrame.sample`` so the split
            is repeatable across runs.

    Returns:
        A ``(features, train, test)`` tuple: the selected rows with both label
        columns dropped, the sampled training rows, and the remaining rows.
    """
    features = frame[frame['Attack_label'] == label].drop(columns=['Attack_label', 'Attack_type'])
    train = features.sample(frac=train_frac, random_state=seed)
    # Whatever was not sampled into the training part becomes the test part.
    test = features.drop(train.index)
    return features, train, test


# Both classes use the same 80/20 split and the same seed.
attack_df, attack_df_train, attack_df_test = _split_by_label(df, 1)
normal_df, normal_df_train, normal_df_test = _split_by_label(df, 0)

print("Attack Training set size: ", attack_df_train.shape)
print("Attack Test set size: ", attack_df_test.shape)

print("Normal Training set size: ", normal_df_train.shape)
print("Normal Test set size: ", normal_df_test.shape)

Attack Training set size:  (106799, 61)
Attack Test set size:  (26700, 61)
Normal Training set size:  (19441, 61)
Normal Test set size:  (4860, 61)


In [2]:
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma

# Embedding model: constructed with no arguments, so whichever default model
# the installed langchain_huggingface release ships with is used.
embeddings = HuggingFaceEmbeddings()

# Chroma collection "edge-iiotset", backed by the ./chroma_db_binary directory.
vector_store = Chroma(
    persist_directory="./chroma_db_binary",
    collection_name="edge-iiotset",
    embedding_function=embeddings,
)

# MMR retriever that returns k=5 documents chosen from fetch_k=5 candidates.
# NOTE(review): with fetch_k equal to k, MMR presumably has no surplus
# candidates to diversify over and can only reorder the top-5 hits -- confirm
# this is intended, or raise fetch_k.
retriever = vector_store.as_retriever(
    search_kwargs=dict(k=5, fetch_k=5),
    search_type="mmr",
)

  from tqdm.autonotebook import tqdm, trange


In [12]:
from statistics import mode
from sklearn.metrics import classification_report
from langchain_openai import ChatOpenAI
import os
import dotenv
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

# Pull variables from a local .env file into the environment before the API
# key is read below.
dotenv.load_dotenv()

# Prompt: the system message asks for similarities between INPUT and SIMILAR;
# the user message carries the column names plus both packet payloads.
prompt = ChatPromptTemplate.from_messages([
    ("system", "Given a network packet data in `INPUT` and similar network packet data in `SIMILAR`. Output the similarities between the INPUT and SIMILAR."),
    ("user", "HEADERS:\n{headers}\n\nINPUT:\n{input}\n\nSIMILAR:\n{similar}"),
])

# No model name or temperature is passed, so the library defaults apply.
llm = ChatOpenAI(openai_api_key=os.getenv("OPENAI_API_KEY"))

# prompt -> chat model -> plain string.
chain = prompt | llm | StrOutputParser()

# Number of test rows sent through the chain.
# NOTE(review): this was annotated with attack_df_test.shape[0] as the
# full-run value, but the loop below indexes normal_df_test -- confirm which
# frame a full run should cover before raising this.
sample_size = 3

y_pred, y_true = [], []

# Quoted, comma-separated column names handed to the prompt as HEADERS.
# Adjacent literals are concatenated into one single-line string; the pieces
# are grouped by protocol prefix purely for readability.
headers = (
    "'frame.time', 'ip.src_host', 'ip.dst_host', "
    "'arp.dst.proto_ipv4', 'arp.opcode', 'arp.hw.size', 'arp.src.proto_ipv4', "
    "'icmp.checksum', 'icmp.seq_le', 'icmp.transmit_timestamp', 'icmp.unused', "
    "'http.file_data', 'http.content_length', 'http.request.uri.query', "
    "'http.request.method', 'http.referer', 'http.request.full_uri', "
    "'http.request.version', 'http.response', 'http.tls_port', "
    "'tcp.ack', 'tcp.ack_raw', 'tcp.checksum', 'tcp.connection.fin', "
    "'tcp.connection.rst', 'tcp.connection.syn', 'tcp.connection.synack', "
    "'tcp.dstport', 'tcp.flags', 'tcp.flags.ack', 'tcp.len', 'tcp.options', "
    "'tcp.payload', 'tcp.seq', 'tcp.srcport', "
    "'udp.port', 'udp.stream', 'udp.time_delta', "
    "'dns.qry.name', 'dns.qry.name.len', 'dns.qry.qu', 'dns.qry.type', "
    "'dns.retransmission', 'dns.retransmit_request', 'dns.retransmit_request_in', "
    "'mqtt.conack.flags', 'mqtt.conflag.cleansess', 'mqtt.conflags', "
    "'mqtt.hdrflags', 'mqtt.len', 'mqtt.msg_decoded_as', 'mqtt.msg', "
    "'mqtt.msgtype', 'mqtt.proto_len', 'mqtt.protoname', 'mqtt.topic', "
    "'mqtt.topic_len', 'mqtt.ver', "
    "'mbtcp.len', 'mbtcp.trans_id', 'mbtcp.unit_id'"
)

for row_idx in range(sample_size):
    # The row is serialised as the string form of its value list; that string
    # is both the retrieval query and the INPUT shown to the model.
    query_document = str(normal_df_test.iloc[row_idx].to_list())
    # NOTE(review): `filter` is forwarded as an extra keyword to
    # retriever.invoke; whether it reaches the vector-store search may depend
    # on the installed langchain-core version -- verify it is applied.
    similar_documents = retriever.invoke(
        query_document,
        filter={"source": "edge-iiotset"},
    )
    response = chain.invoke(dict(
        headers=headers,
        input=query_document,
        similar="\n".join(doc.page_content for doc in similar_documents),
    ))
    print(response)

The similarities between the INPUT packet and the SIMILAR packets are as follows:

1. 'ip.src_host', 'ip.dst_host': Both the INPUT and SIMILAR packets have the same source and destination IP addresses.
2. 'tcp.dstport': The TCP destination port is the same in both the INPUT and SIMILAR packets.
3. 'mqtt.msg': The MQTT message field is the same in both the INPUT and SIMILAR packets.
4. 'tcp.flags': The TCP flags field is the same in both the INPUT and SIMILAR packets.
5. 'tcp.seq': The TCP sequence number field is the same in both the INPUT and SIMILAR packets.

These similarities indicate that the packets have similar network characteristics related to IP addresses, ports, protocol messages, and TCP flags and sequence numbers.
The similarities between the INPUT packet and the SIMILAR packets are as follows:

1. Both the INPUT and SIMILAR packets have the same source and destination IP addresses: '192.168.0.101' and '192.168.0.128' respectively.
2. The protocol being used in both packet