# Load Dataset

In [45]:
################################################################################
# Load dataset and split it into training and test set
################################################################################

import pandas as pd
import os
from tabulate import tabulate

dataset_name = "ton-iot"
sample_size = 100000

# Load dateset
df = pd.read_csv(os.getcwd() + f'/data/sample-{sample_size}-2.csv')

# Split dataset according to attack type
normal_df = df[df['label'] == 0]
attack_df = df[df['label'] == 1]

# Drop columns
normal_df = normal_df.drop(columns=['label', 'type'])
attack_df = attack_df.drop(columns=['label', 'type'])

# Split dataset into training and test set
normal_df_train = normal_df.sample(frac=0.8, random_state=42)
normal_df_test = normal_df.drop(normal_df_train.index)
attack_df_train = attack_df.sample(frac=0.8, random_state=42)
attack_df_test = attack_df.drop(attack_df_train.index)

# Print dataset sizes in a table
data = [
    ["Normal", normal_df.shape[0], normal_df_train.shape[0], normal_df_test.shape[0]],
    ["Attack", attack_df.shape[0], attack_df_train.shape[0], attack_df_test.shape[0]]
]
print(tabulate(data, headers=["Atack type", "Total", "Train", "Test"], tablefmt="grid"))

+--------------+---------+---------+--------+
| Atack type   |   Total |   Train |   Test |
| Normal       |   42040 |   33632 |   8408 |
+--------------+---------+---------+--------+
| Attack       |   57960 |   46368 |  11592 |
+--------------+---------+---------+--------+


# Feature Importance

In [None]:
################################################################################
# Generate Feature Importance
################################################################################

import os
import dotenv
import time
import numpy as np
import json
from langchain_core.prompts import PromptTemplate
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_chroma import Chroma
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

dotenv.load_dotenv(os.getcwd() + '/../.env')

template = """
You are provided with network data entries categorized as either normal or attack, along with their corresponding feature names.
Carefully analyze the differences between normal and attack entries by comparing corresponding fields.
Output top 10 important features that can be used to filter an entry as either normal or attack.
Output only in the Python list structure.

Normal Entries:
```{normal_entries}```

Attack Entries:
```{attack_entries}```

Example output:
['feature1', 'feature2', 'feature3', ..., 'feature10']
"""

prompt = PromptTemplate(template=template, input_variables=["normal_entries", "attack_entries"])
llm = ChatOpenAI(model="gpt-4o", temperature=0.0)
model_name = "gpt-4o"
# llm = ChatGoogleGenerativeAI(model="gemini-1.5-pro", temperature=0.0)
# model_name = "gemini-1.5-pro"
# llm = ChatAnthropic(model='claude-3-opus-20240229')
# model_name = "claude-3-opus-20240229"
chain = prompt | llm
train_set_size = sample_size
embeddings = HuggingFaceEmbeddings()
vector_store = Chroma(
    collection_name=dataset_name,
    embedding_function=embeddings, 
    persist_directory=f"./vector-stores/chroma-db-{train_set_size}-2")

normal_vectors = vector_store._collection.get(include=['embeddings'], where={'label': 'normal'})['embeddings']
normal_mean_vector = np.mean(normal_vectors, axis=0).tolist()
normal_documents = vector_store._collection.query(query_embeddings=[normal_mean_vector], n_results=10)['documents'][0]

attack_vectors = vector_store._collection.get(include=['embeddings'], where={'label': 'attack'})['embeddings']
attack_mean_vector = np.mean(attack_vectors, axis=0).tolist()
attack_documents = vector_store._collection.query(query_embeddings=[attack_mean_vector], n_results=10)['documents'][0]

normal_entries = {}
for i, feature_name in enumerate(normal_df_train.columns.to_list()):
    normal_entries[feature_name] = [json.loads(doc)[i] for doc in normal_documents]

attack_entries = {}
for i, feature_name in enumerate(attack_df_train.columns.to_list()):
    attack_entries[feature_name] = [json.loads(doc)[i] for doc in attack_documents]

completions = []
for i in range(10):
    completion = chain.invoke({
        "normal_entries": json.dumps(normal_entries),
        "attack_entries": json.dumps(attack_entries)
    })
    completions.append(completion.content)
    print(completion.content)
    time.sleep(10)

with open(f"results/feature-importance-{sample_size}-llm-{model_name}.txt", "a") as f:
    f.write("\n".join(completions))

# Prediction

In [7]:
################################################################################
# Generate Rules with transposed data
################################################################################

import os
import dotenv
import json
from langchain_core.prompts import PromptTemplate
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_anthropic import ChatAnthropic
from langchain_chroma import Chroma
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
import numpy as np
import uuid
import tiktoken     # https://github.com/openai/tiktoken

dotenv.load_dotenv(os.getcwd() + '/../.env')

template = """
You are provided with network data entries categorized as either normal or attack, along with their corresponding feature names.
Carefully analyze the differences between normal and attack entries by comparing corresponding fields.
Generate 5 simple and deterministic rules for top 5 important features to filter an entry as either normal or attack. 
Output only in the JSON format with the structure: 
{{'feature1': 'rule', 'feature2': 'rule', ..., 'feature5': 'rule'}}.

Normal Entries:
```{normal_entries}```

Attack Entries:
```{attack_entries}```
"""
prompt = PromptTemplate(template=template, input_variables=["normal_entries", "attack_entries"])
llm = ChatOpenAI(model="gpt-4o", temperature=0.0)
model_name = "gpt-4o"
# llm = ChatGoogleGenerativeAI(model="gemini-1.5-pro", temperature=0.0)
# model_name = "gemini-1.5-pro"
# llm = ChatAnthropic(model='claude-3-opus-20240229')
# model_name = "claude-3-opus-20240229"
chain = prompt | llm
train_set_size = sample_size
embeddings = HuggingFaceEmbeddings()
vector_store = Chroma(
    collection_name=dataset_name,
    embedding_function=embeddings, 
    persist_directory=f"./vector-stores/chroma-db-{train_set_size}-2")

normal_vectors = vector_store._collection.get(include=['embeddings'], where={'label': 'normal'})['embeddings']
normal_mean_vector = np.mean(normal_vectors, axis=0).tolist()
normal_documents = vector_store._collection.query(query_embeddings=[normal_mean_vector], n_results=10)['documents'][0]

attack_vectors = vector_store._collection.get(include=['embeddings'], where={'label': 'attack'})['embeddings']
attack_mean_vector = np.mean(attack_vectors, axis=0).tolist()
attack_documents = vector_store._collection.query(query_embeddings=[attack_mean_vector], n_results=10)['documents'][0]

normal_entries = {}
for i, feature_name in enumerate(normal_df_train.columns.to_list()):
    normal_entries[feature_name] = [json.loads(doc.replace("'", "\""))[i] for doc in normal_documents]

attack_entries = {}
for i, feature_name in enumerate(attack_df_train.columns.to_list()):
    attack_entries[feature_name] = [json.loads(doc.replace("'", "\""))[i] for doc in attack_documents]

# prompt_text = prompt.invoke({
#     "normal_entries": json.dumps(normal_entries),
#     "attack_entries": json.dumps(attack_entries)
# }).text

# print(prompt_text)

completion = chain.invoke({
    "normal_entries": json.dumps(normal_entries),
    "attack_entries": json.dumps(attack_entries)
})

print(completion.content)

id = str(uuid.uuid4())
with open(f"results/llm/generated-rules-{sample_size}-llm-{model_name}.txt", "a") as f:
    f.write(f"{id}\n")
    f.write(f"{completion.content}\n")

encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
num_tokens_prompt = len(encoding.encode(prompt.invoke({"normal_entries": json.dumps(normal_entries),"attack_entries": json.dumps(attack_entries)}).text))
num_tokens_completion = len(encoding.encode(str(completion.content)))

print(f"Prompt tokens: {num_tokens_prompt}")
print(f"Completion tokens: {num_tokens_completion}")
print(f"Total tokens: {num_tokens_prompt + num_tokens_completion}")
print(f"Percentage of tokens used: {(num_tokens_prompt + num_tokens_completion) / 128000}")



```json
{
    "proto": "If proto is 'udp', then entry is normal; if proto is 'tcp', then entry is attack.",
    "service": "If service is 'dns', then entry is normal; if service is '-', then entry is attack.",
    "conn_state": "If conn_state is 'S0', then entry is normal; if conn_state is 'SF' or 'REJ', then entry is attack.",
    "dns_query": "If dns_query is 'desktop-7q9apbo', then entry is normal; if dns_query is '-', then entry is attack.",
    "dst_port": "If dst_port is 5355, then entry is normal; if dst_port is 80, then entry is attack."
}
```
Prompt tokens: 2761
Completion tokens: 159
Total tokens: 2920
Percentage of tokens used: 0.0228125


In [9]:
################################################################################
# Evaluate generated rules
################################################################################

from statistics import mode
from sklearn.metrics import classification_report, confusion_matrix
from tqdm import tqdm

datasets = {"normal": normal_df_test, "attack": attack_df_test}
y_pred = []
y_true = []
for attack_type, dataset in datasets.items():
    test_set_size = dataset.shape[0]
    for i in tqdm(range(test_set_size), ncols=100, desc=f"Predicting {attack_type} entries..."):
        predicted_attack_types = []
        predicted_attack_types.append("normal" if dataset.iloc[i]['proto'] == "udp" else "attack")
        predicted_attack_types.append("normal" if dataset.iloc[i]['service'] == "dns" else "attack")
        predicted_attack_types.append("normal" if dataset.iloc[i]['conn_state'] == "S0" else "attack")
        predicted_attack_types.append("normal" if dataset.iloc[i]['dns_query'] == 'desktop-7q9apbo' else "attack")
        predicted_attack_types.append("normal" if dataset.iloc[i]['dst_port'] == 5355 else "attack")
        predicted_attack_types.append("attack" if dataset.iloc[i]['proto'] == "tcp" else "normal")
        predicted_attack_types.append("attack" if dataset.iloc[i]['service'] == "-" else "normal")
        predicted_attack_types.append("attack" if dataset.iloc[i]['conn_state'] in ["SF", "REJ"] else "normal")
        predicted_attack_types.append("attack" if dataset.iloc[i]['dns_query'] == '-' else "normal")
        predicted_attack_types.append("attack" if dataset.iloc[i]['dst_port'] == 80 else "normal")
        # predicted_attack_types.append("normal" if dataset.iloc[i]['duration'] < 0.001 else "attack")
        # predicted_attack_types.append("normal" if dataset.iloc[i]['src_bytes'] == 66 else "attack")
        # predicted_attack_types.append("normal" if dataset.iloc[i]['dst_bytes'] == 0 else "attack")
        # predicted_attack_types.append("normal" if dataset.iloc[i]['missed_bytes'] == 0 else "attack")
        # predicted_attack_types.append("normal" if dataset.iloc[i]['src_pkts'] == 2 else "attack")
        # predicted_attack_types.append("normal" if dataset.iloc[i]['src_ip'] == "192.168.1.195" else "attack")
        y_true.append(attack_type)
        y_pred.append(mode(predicted_attack_types))

c_report = classification_report(y_true, y_pred, digits=4)
c_matrix = confusion_matrix(y_true, y_pred)

with open(f"results/llm/result-llm-{sample_size}-2.txt", "a") as f:
    f.write(f"{id}\n")
    f.write(f"Classication Report\n{c_report}\n\nConfusion Matrix\n{c_matrix}\n")

print(c_report)
print(c_matrix)

Predicting normal entries...: 100%|███████████████████████████| 1000/1000 [00:00<00:00, 1377.62it/s]
Predicting attack entries...: 100%|███████████████████████████| 1000/1000 [00:00<00:00, 1556.92it/s]

              precision    recall  f1-score   support

      attack     0.7530    0.9300    0.8322      1000
      normal     0.9085    0.6950    0.7875      1000

    accuracy                         0.8125      2000
   macro avg     0.8308    0.8125    0.8099      2000
weighted avg     0.8308    0.8125    0.8099      2000

[[930  70]
 [305 695]]





# Feedback Loop

In [31]:
################################################################################
# Prompt Template
################################################################################
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts import MessagesPlaceholder

system_message = ("system",
"""
You are a good data analyst.
You are provided with network data entries categorized as either normal or attack, along with their corresponding feature names.
Carefully analyze the differences between normal and attack entries by comparing corresponding fields.
Your task is to generate {k} simple and deterministic rules for top {k} important features to filter attack entries.
Supported operators are '==', '!=', '>', '<', '>=', '<='.
Generate exactly {k} rules to filter attack entries and make a tool call for each rule.
"""
)
human_message = ("user",
"""
Analyze the following network data and generate rules for the top 5 important features to filter attack entries.

Normal Entries:
```{normal_entries}```

Attack Entries:
```{attack_entries}```
"""
)

prompt = ChatPromptTemplate.from_messages([
    system_message,
    human_message,
    MessagesPlaceholder("msgs")
])

# Invoke prompt
# prompt.invoke({"k": 5, "normal_entries": normal_entries, "attack_entries": attack_entries, "msgs": []})

In [32]:
################################################################################
# Tool
################################################################################

from sklearn.metrics import classification_report, confusion_matrix
from tqdm import tqdm
import operator
from typing import Annotated
from langchain_core.tools import tool

show_progress = True
operations = {'<': operator.lt, '>': operator.gt, '==': operator.eq, '<=': operator.le, '>=': operator.ge, '!=': operator.ne}

@tool
def evaluate_rule(
    feature_name: Annotated[str, "Feature name"],
    value: Annotated[str, "Value"], 
    op: Annotated[str, "Operator"]
) -> bool:
    """Evaluate the rule and return the macro f1-score."""
    try:
        value = float(value)
    except ValueError:
        value
    datasets = {"normal": normal_df_train, "attack": attack_df_train}
    y_pred = []
    y_true = []
    if op in operations:
        for attack_type, dataset in datasets.items():
            test_set_size = dataset.shape[0]
            for i in tqdm(range(test_set_size), ncols=100, desc=f"Predicting {attack_type} entries...", disable=not show_progress):
                y_true.append(attack_type)
                y_pred.append("attack" if operations[op](dataset.iloc[i][feature_name], value) else "normal")
        c_report = classification_report(y_true, y_pred, digits=4, output_dict=True)
        return c_report['macro avg']['f1-score']
    else:
        raise ValueError(f"Unsupported operator: {op}")

# Invoke tool
# print(evaluate_rule.invoke({"feature_name": "flow_duration", "value": "1", "op": "<"}))

In [33]:
################################################################################
# LLM
################################################################################

import os
import dotenv
from langchain_openai import ChatOpenAI
# from langchain_google_genai import ChatGoogleGenerativeAI
# from langchain_anthropic import ChatAnthropic

dotenv.load_dotenv(os.getcwd() + '/../.env')

model_name = "gpt-4o"
llm = ChatOpenAI(model=model_name, temperature=0.1)
# model_name = "gemini-1.5-pro"
# llm = ChatGoogleGenerativeAI(model=model_name, temperature=0.0)
# model_name = "claude-3-opus-20240229"
# llm = ChatAnthropic(model=model_name, temperature=0.0)

llm_with_tool = llm.bind_tools([evaluate_rule])

In [34]:
################################################################################
# Vector Store
################################################################################

import json
import numpy as np
from langchain_chroma import Chroma
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_openai import OpenAIEmbeddings

train_set_size = sample_size
n_results = 10
embeddings = HuggingFaceEmbeddings()
vector_store = Chroma(
    collection_name=dataset_name,
    embedding_function=embeddings, 
    persist_directory=f"./vector-stores/chroma-db-{train_set_size}-2")

normal_vectors = vector_store._collection.get(include=['embeddings'], where={'label': 'normal'})['embeddings']
normal_mean_vector = np.mean(normal_vectors, axis=0).tolist()
normal_documents = vector_store._collection.query(query_embeddings=[normal_mean_vector], n_results=n_results)['documents'][0]

attack_vectors = vector_store._collection.get(include=['embeddings'], where={'label': 'attack'})['embeddings']
attack_mean_vector = np.mean(attack_vectors, axis=0).tolist()
attack_documents = vector_store._collection.query(query_embeddings=[attack_mean_vector], n_results=n_results)['documents'][0]

normal_entries_dict = {}
for i, feature_name in enumerate(normal_df_train.columns.to_list()):
    normal_entries_dict[feature_name] = [json.loads(doc.replace("'", "\""))[i] for doc in normal_documents]

attack_entries_dict = {}
for i, feature_name in enumerate(attack_df_train.columns.to_list()):
    attack_entries_dict[feature_name] = [json.loads(doc.replace("'", "\""))[i] for doc in attack_documents]

In [39]:
################################################################################
# Chain
################################################################################

from langchain_core.messages import HumanMessage

chain = prompt | llm_with_tool

n_repetitions = 5
context_window = 128000
show_progress = False

def get_initial_state():
  n = 0
  k = 5
  mean_f1s = 0
  max_f1s = 0
  n_max = 0
  token_usage = {}
  normal_entries = json.dumps(normal_entries_dict)
  attack_entries = json.dumps(attack_entries_dict)
  msgs = []
  return locals()

state = get_initial_state()
train_f1_scores = []
while state["n"] < n_repetitions:
    ai_msg = chain.invoke(state)
    tool_msgs = []
    for tool_call in ai_msg.tool_calls:
        tool_msg = evaluate_rule.invoke(tool_call)
        tool_msgs.append(tool_msg)
    state["mean_f1s"] = sum(float(msg.content) for msg in tool_msgs) / len(tool_msgs)
    human_msg = HumanMessage(f"The current mean f1-score for the generated rules is {state['mean_f1s']}. "
                             "If this mean f1-score is greater than the previous rounds, keep the better performing "
                             "rules and revise or replace only the underperforming ones (those with a score less than mean). "
                             "Otherwise, revise or replace any rules that have a score less than mean. "
                             f"Based on the feedback, generate exactly {state['k']} rules to filter attack entries and "
                             "make a tool call for each rule, ensuring that a tool call is made for every entry every time.")
    state["n"] += 1
    state["msgs"].extend([ai_msg, *tool_msgs, human_msg])
    train_f1_scores.append(state["mean_f1s"])
    state["max_f1s"] = state["mean_f1s"] if state["mean_f1s"] > state["max_f1s"] else state["max_f1s"]
    state["n_max"] = state["n"] if state["mean_f1s"] > state["max_f1s"] else state["n_max"]
    state["token_usage"] = {key: ai_msg.response_metadata["token_usage"][key] for key in ["completion_tokens", "prompt_tokens", "total_tokens"]} 
    print("Round:", state["n"], "Current mean f1-score:", state["mean_f1s"], "Token usage:", state["token_usage"])

print(train_f1_scores)

Round: 1 Current mean f1-score: 0.630859345128614 Token usage: {'completion_tokens': 355, 'prompt_tokens': 2814, 'total_tokens': 3169}
Round: 2 Current mean f1-score: 0.6839445189143156 Token usage: {'completion_tokens': 405, 'prompt_tokens': 3343, 'total_tokens': 3748}
Round: 3 Current mean f1-score: 0.5795140335456447 Token usage: {'completion_tokens': 445, 'prompt_tokens': 3923, 'total_tokens': 4368}
Round: 4 Current mean f1-score: 0.631331024544524 Token usage: {'completion_tokens': 438, 'prompt_tokens': 4543, 'total_tokens': 4981}
Round: 5 Current mean f1-score: 0.6425807218772863 Token usage: {'completion_tokens': 396, 'prompt_tokens': 5155, 'total_tokens': 5551}
[0.630859345128614, 0.6839445189143156, 0.5795140335456447, 0.631331024544524, 0.6425807218772863]


In [46]:
################################################################################
# Evaluate generated rules
################################################################################

from sklearn.metrics import classification_report, confusion_matrix
from tqdm import tqdm
import operator
from statistics import mode

operations = {'<': operator.lt, '>': operator.gt, '==': operator.eq, '<=': operator.le, '>=': operator.ge, '!=': operator.ne}

def evaluate_rules(tool_calls):
    datasets = {"normal": normal_df_test, "attack": attack_df_test}
    y_pred = []
    y_true = []
    for attack_type, dataset in datasets.items():
        test_set_size = dataset.shape[0]
        for i in tqdm(range(test_set_size), ncols=100, desc=f"Predicting {attack_type} entries...", disable=not show_progress):
            predicted_attack_types = []
            for tool_call in tool_calls:
                args = json.loads(tool_call["function"]["arguments"])
                op = args["op"]
                feature_name = args["feature_name"]
                value = args["value"]
                try:
                    value = float(value)
                except ValueError:
                    value
                predicted_attack_types.append("attack" if operations[op](dataset.iloc[i][feature_name], value) else "normal")
            y_true.append(attack_type)
            y_pred.append(mode(predicted_attack_types))
    c_report = classification_report(y_true, y_pred, digits=4, output_dict=True)
    c_matrix = confusion_matrix(y_true, y_pred)
    # print(c_report)
    # print(c_matrix)
    return c_report

# tool_calls = state["msgs"][-7].additional_kwargs["tool_calls"]
# for tool_call in tool_calls:
#     rule = json.loads(tool_call["function"]["arguments"])
#     print("attack if", rule["feature_name"], rule["op"], rule["value"], "else normal")

# evaluate_rules(tool_calls)

# test_f1_scores = []
# for i in range(20, 0, -1):
#     index = -7 * i
#     tool_calls = state["msgs"][index].additional_kwargs["tool_calls"]
#     for tool_call in tool_calls:
#         rule = json.loads(tool_call["function"]["arguments"])
#     test_f1_scores.append(evaluate_rules(tool_calls)['macro avg']['f1-score'])

# print(test_f1_scores)

for i in range(len(state["msgs"])):
    if state["msgs"][i].type != "ai":
        continue
    tool_calls = state["msgs"][i].additional_kwargs["tool_calls"]
    for tool_call in tool_calls:
        rule = json.loads(tool_call["function"]["arguments"])
        print("attack if", rule["feature_name"], rule["op"], rule["value"], "else normal")
    c_report = evaluate_rules(tool_calls)
    print(c_report["macro avg"]["f1-score"])
    print(c_report["attack"]["precision"])

attack if src_ip != 192.168.1.17 else normal
attack if proto == tcp else normal
attack if service == - else normal
attack if conn_state == SF else normal
attack if dns_query == - else normal
0.8076550589224163
0.7954343971631206
attack if src_ip == 192.168.1.31 else normal
attack if proto == tcp else normal
attack if service == - else normal
attack if dst_port == 80 else normal
attack if dns_query == - else normal
0.8388243850646617
0.8342749529190208
attack if src_bytes == 0 else normal
attack if proto == tcp else normal
attack if duration > 0 else normal
attack if missed_bytes > 0 else normal
attack if dns_query == - else normal
0.7833477985868386
0.7726978649989217
attack if dst_bytes > 0 else normal
attack if proto == tcp else normal
attack if src_pkts == 4 else normal
attack if dst_ip_bytes > 0 else normal
attack if dns_query == - else normal
0.7687828251201123
0.8748254002363812
attack if conn_state == SF else normal
attack if proto == tcp else normal
attack if src_port != 5355 e

In [43]:
################################################################################
# Evaluate generated rules for efficiency
################################################################################

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from tabulate import tabulate
from statistics import mode
import time
import warnings
import pandas as pd
import os

warnings.filterwarnings("ignore")

sample_size = 100000

# Load dateset
df = pd.read_csv(os.getcwd() + f'/data/sample-{sample_size}-2.csv')

# Encode categorical columns
label_encoder = LabelEncoder()
categorical_columns = df.select_dtypes(include=['object']).columns
for column in categorical_columns:
    df[column] = label_encoder.fit_transform(df[column])

# Split dataset according to attack type
normal_df = df[df['label'] == 0]
attack_df = df[df['label'] == 1]

# Split dataset into training and test set
normal_df_train = normal_df.sample(frac=0.8, random_state=42)
normal_df_test = normal_df.drop(normal_df_train.index)
attack_df_train = attack_df.sample(frac=0.8, random_state=42)
attack_df_test = attack_df.drop(attack_df_train.index)

X_train = pd.concat([normal_df_train, attack_df_train]).drop(columns=['label', 'type'])
y_train = pd.concat([normal_df_train, attack_df_train])['label']
X_test = pd.concat([normal_df_test, attack_df_test]).drop(columns=['label', 'type'])
y_test = pd.concat([normal_df_test, attack_df_test])['label']

# Create instances of ML models
model_dt = DecisionTreeClassifier()
model_rf = RandomForestClassifier()

# Fit the models to the training data
model_dt.fit(X_train, y_train)
model_rf.fit(X_train, y_train)

# Predict the labels for the test data
y_true = y_test

elapsed_times_dt = []
elapsed_times_rf = []
elapsed_times_llm = []
y_pred_dt = []
y_pred_rf = []
y_pred_llm = []
for i in range(len(X_test)):
    # Predict using DT
    start = time.time()
    y_pred_dt.append(model_dt.predict([X_test.iloc[i]]))
    end = time.time()
    elapsed_times_dt.append(end - start)

    # Predict using RF
    start = time.time()
    y_pred_rf.append(model_rf.predict([X_test.iloc[i]]))
    end = time.time()
    elapsed_times_rf.append(end - start)

    # Predict using LLM
    start = time.time()
    row = X_test.iloc[i]
    # conditions = [
    #     row['proto'] == "tcp",
    #     row['service'] == "-",
    #     row['src_ip'] == "192.168.1.31",
    #     row['dst_ip'] != "224.0.0.252",
    #     row['dst_port'] == 80
    # ]
    conditions = [
        row['proto'] == "tcp",
        row['service'] == "-",
        row['src_ip'] == "192.168.1.31",
        row['dst_port'] == "80",
        row['dns_query'] == "-"
    ]
    predicted_attack_types = ["attack" if condition else "normal" for condition in conditions]
    y_pred_llm.append(mode(predicted_attack_types))
    end = time.time()
    elapsed_times_llm.append(end - start)

print(f"DT time taken: {sum(elapsed_times_dt)/len(X_test)}")
print(classification_report(y_true, y_pred_dt, digits=4, output_dict=False))
print(confusion_matrix(y_true, y_pred_dt))
print("\n")

print(f"RF time taken: {sum(elapsed_times_rf)/len(X_test)}")
print(classification_report(y_true, y_pred_rf, digits=4, output_dict=False))
print(confusion_matrix(y_true, y_pred_rf))
print("\n")

print(f"LLM time taken: {sum(elapsed_times_llm)/len(X_test)}\n")
print(classification_report(["attack" if y else "normal" for y in y_true], y_pred_llm, digits=4, output_dict=False))
print(confusion_matrix(["attack" if y else "normal" for y in y_true], y_pred_llm))

DT time taken: 0.00033612308502197266
              precision    recall  f1-score   support

           0     0.9996    0.9999    0.9998      8408
           1     0.9999    0.9997    0.9998     11592

    accuracy                         0.9998     20000
   macro avg     0.9998    0.9998    0.9998     20000
weighted avg     0.9998    0.9998    0.9998     20000

[[ 8407     1]
 [    3 11589]]


RF time taken: 0.0036594917893409728
              precision    recall  f1-score   support

           0     0.9999    1.0000    0.9999      8408
           1     1.0000    0.9999    1.0000     11592

    accuracy                         1.0000     20000
   macro avg     0.9999    1.0000    0.9999     20000
weighted avg     1.0000    1.0000    1.0000     20000

[[ 8408     0]
 [    1 11591]]


LLM time taken: 0.0002504175901412964

              precision    recall  f1-score   support

      attack     0.0000    0.0000    0.0000     11592
      normal     0.4204    1.0000    0.5919      8408

  

In [42]:
y_pred_llm

['normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',
 'normal',

# Other

In [97]:
################################################################################
# Generate Rules
################################################################################

import os
import dotenv
from langchain_core.prompts import PromptTemplate
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_anthropic import ChatAnthropic
from langchain_chroma import Chroma
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from statistics import mode
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

dotenv.load_dotenv(os.getcwd() + '/../.env')

template = """
You are provided with network data entries categorized as either normal or attack, along with their corresponding feature names.
Carefully analyze the differences between normal and attack entries by comparing corresponding fields.
Generate 5 simple and deterministic rules for top 5 important features to filter an entry as either normal or attack. 
Output only in the JSON format with the structure: 
{{'feature1': 'rule', 'feature2': 'rule', ..., 'feature5': 'rule'}}.

Feature Names:
```{feature_names}```

Normal Entries:
```{normal_entries}```

Attack Entries:
```{attack_entries}```
"""
prompt = PromptTemplate(template=template, input_variables=["feature_names", "normal_entries", "attack_entries"])
llm = ChatOpenAI(model="gpt-4o", temperature=0.0)
model_name = "gpt-4o"
# llm = ChatGoogleGenerativeAI(model="gemini-1.5-pro", temperature=0.0)
# model_name = "gemini-1.5-pro"
# llm = ChatAnthropic(model='claude-3-opus-20240229')
# model_name = "claude-3-opus-20240229"
chain = prompt | llm
train_set_size = sample_size
embeddings = HuggingFaceEmbeddings()
vector_store = Chroma(
    collection_name="ton-iot",
    embedding_function=embeddings, 
    persist_directory=f"./vector-stores/chroma-db-{train_set_size}-2")

normal_vectors = vector_store._collection.get(include=['embeddings'], where={'label': 'normal'})['embeddings']
normal_mean_vector = np.mean(normal_vectors, axis=0).tolist()
normal_documents = vector_store._collection.query(query_embeddings=[normal_mean_vector], n_results=10)['documents'][0]

attack_vectors = vector_store._collection.get(include=['embeddings'], where={'label': 'attack'})['embeddings']
attack_mean_vector = np.mean(attack_vectors, axis=0).tolist()
attack_documents = vector_store._collection.query(query_embeddings=[attack_mean_vector], n_results=10)['documents'][0]

completion = chain.invoke({
    "feature_names": normal_df_train.columns.to_list(),
    "normal_entries": ",\n".join([f"{doc} --> normal" for doc in normal_documents]),
    "attack_entries": ",\n".join([f"{doc} --> attack" for doc in attack_documents])
    })

print(completion.content)

with open(f"results/generated-rules-{sample_size}-llm-{model_name}.txt", "a") as f:
    f.write(completion.content)



```json
{
  "proto": "if proto == 'udp' then normal else attack",
  "service": "if service == 'dns' then normal else attack",
  "conn_state": "if conn_state == 'S0' then normal else attack",
  "src_bytes": "if src_bytes == 66 then normal else attack",
  "dst_bytes": "if dst_bytes == 0 then normal else attack"
}
```


In [107]:
################################################################################
# Evaluate generated rules
################################################################################

from statistics import mode
from sklearn.metrics import classification_report, confusion_matrix
from tqdm import tqdm

datasets = {"normal": normal_df_test, "attack": attack_df_test}
y_pred = []
y_true = []
for attack_type, dataset in datasets.items():
    test_set_size = dataset.shape[0]
    
    for i in tqdm(range(test_set_size), ncols=100, desc=f"Predicting {attack_type} entries..."):
        predicted_attack_types = []
        # print((dataset.iloc[i]['conn_state']))
        # predicted_attack_types.append("normal" if dataset.iloc[i]['src_ip'] == "192.168.1.195" else "attack")
        # predicted_attack_types.append("normal" if dataset.iloc[i]['src_port'] in range(52333, 60743) else "attack")
        # predicted_attack_types.append("normal" if dataset.iloc[i]['dst_ip'] == "224.0.0.252" else "attack")
        # predicted_attack_types.append("normal" if dataset.iloc[i]['dst_port'] == 5355 else "attack")
        predicted_attack_types.append("normal" if dataset.iloc[i]['proto'] == "udp" else "attack")
        predicted_attack_types.append("normal" if dataset.iloc[i]['service'] == "dns" else "attack")
        # predicted_attack_types.append("normal" if dataset.iloc[i]['duration'] >= 0.001 else "attack")
        predicted_attack_types.append("normal" if dataset.iloc[i]['src_bytes'] == 66 else "attack")
        predicted_attack_types.append("normal" if dataset.iloc[i]['dst_bytes'] == 0 else "attack")
        predicted_attack_types.append("normal" if dataset.iloc[i]['conn_state'] in ["S0"] else "attack")
        # predicted_attack_types.append("normal" if dataset.iloc[i]['dst_ip_bytes'] == 0 else "attack")
        # predicted_attack_types.append("normal" if dataset.iloc[i]['src_ip_bytes'] == 122 else "attack")
        # predicted_attack_types.append("normal" if dataset.iloc[i]['src_pkts'] == 2 else "attack")
        # predicted_attack_types.append("normal" if dataset.iloc[i]['dst_pkts'] == 0 else "attack")
        y_true.append(attack_type)
        y_pred.append(mode(predicted_attack_types))
        # y_pred.append("normal" if predicted_attack_types.count("normal") > 0 else "attack")
        # y_pred.append("attack" if predicted_attack_types.count("attack") > 0 else "normal")
        # y_pred.append("normal" if predicted_attack_types.count("normal") == 5 else "attack")
        # y_pred.append("normal" if predicted_attack_types.count("normal") == 5 else "attack")

c_report = classification_report(y_true, y_pred)
c_matrix = confusion_matrix(y_true, y_pred)

with open(f"results/result-llm-{sample_size}-2.txt", "a") as f:
    f.write(f"Classication Report\n{c_report}\n\nConfusion Matrix\n{c_matrix}")

print(c_report)
print(c_matrix)

Predicting normal entries...: 100%|███████████████████████████| 1000/1000 [00:00<00:00, 3019.36it/s]
Predicting attack entries...: 100%|███████████████████████████| 1000/1000 [00:00<00:00, 3651.30it/s]

              precision    recall  f1-score   support

      attack       0.65      1.00      0.79      1000
      normal       1.00      0.47      0.64      1000

    accuracy                           0.73      2000
   macro avg       0.82      0.73      0.72      2000
weighted avg       0.82      0.73      0.72      2000

[[998   2]
 [528 472]]





In [None]:
################################################################################
# Get a Summary
################################################################################

import dotenv
from langchain_core.prompts import PromptTemplate
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_chroma import Chroma
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from statistics import mode
from sklearn.metrics import classification_report, confusion_matrix
import tiktoken     # https://github.com/openai/tiktoken

dotenv.load_dotenv(os.getcwd() + '/../.env')

template = """
Given normal and attack network data entries, output human understandable small summary on 
how attack and normal entries can be simply separated.

Feature Names:
```{feature_names}```

Normal Entries:
```{normal_entries}```

Attack Entries:
```{attack_entries}```
"""
prompt = PromptTemplate(template=template, input_variables=["feature_names", "normal_entries", "attack_entries"])
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.0)
# llm = ChatGoogleGenerativeAI(model="gemini-1.0-pro")
chain = prompt | llm
train_set_size = sample_size
embeddings = HuggingFaceEmbeddings()
vector_store = Chroma(
    collection_name="ton-iot",
    embedding_function=embeddings, 
    persist_directory=f"./vector-stores/chroma-db-{train_set_size}-2")
retriever = vector_store.as_retriever(
    search_type="mmr", 
    search_kwargs={"k": 5, "fetch_k": 5})
normal_documents = retriever.invoke(str(normal_df_test.iloc[0].to_list()), filter={"source": "ton-iot", "label": "normal"})
attack_documents = retriever.invoke(str(attack_df_test.iloc[0].to_list()), filter={"source": "ton-iot", "label": "attack"})
completion = chain.invoke({
    "feature_names": normal_df_train.columns.to_list(),
    "normal_entries": ",\n".join([f"{doc.page_content} --> {doc.metadata['label']}" for doc in normal_documents]),
    "attack_entries": ",\n".join([f"{doc.page_content} --> {doc.metadata['label']}" for doc in attack_documents])
    })
print(completion)



content='Based on the provided data entries, we can see that normal entries have higher values for features such as flow duration, header length, rate, duration, and total size compared to attack entries. Additionally, normal entries have more occurrences of protocols like HTTP, HTTPS, DNS, SSH, TCP, UDP, DHCP, ARP, ICMP, and IPv.\n\nOn the other hand, attack entries have lower values for the mentioned features and do not have as many occurrences of the mentioned protocols. Attack entries also tend to have higher values for features like magnitude, radius, covariance, variance, and weight compared to normal entries.\n\nIn summary, normal entries exhibit higher network activity and a wider range of protocols, while attack entries show lower network activity and fewer protocol occurrences.' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 149, 'prompt_tokens': 2787, 'total_tokens': 2936}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint':

In [None]:
################################################################################
# 
################################################################################

import dotenv
from langchain_core.prompts import PromptTemplate
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_chroma import Chroma
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from statistics import mode
from sklearn.metrics import classification_report, confusion_matrix

dotenv.load_dotenv()

template = """
Your task is to identify whether the query is attack or normal. Then 
generate a policy to filter the given query based on the values. 
You will be given feature names of the entries and similar entries along 
with the input query to make a decision.

Feature Names:
```{feature_names}```

Similar Entries:
```{similar_entries}```

Input Query: 
```{query}```

Policy:
"""
prompt = PromptTemplate(template=template, input_variables=["feature_names", "similar_entries", "query"])
# llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.0)
llm = ChatGoogleGenerativeAI(model="gemini-1.0-pro")
chain = prompt | llm
train_set_size = sample_size
embeddings = HuggingFaceEmbeddings()
vector_store = Chroma(
    collection_name="ton-iot",
    embedding_function=embeddings, 
    persist_directory=f"./vector-stores/chroma-db-{train_set_size}-2")
retriever = vector_store.as_retriever(
    search_type="mmr", 
    search_kwargs={"k": 5, "fetch_k": 5})
query_document = str(normal_df_test.iloc[5].to_list())
similar_documents = retriever.invoke(query_document, filter={"source": "ton-iot"})
chain.invoke({
    "feature_names": normal_df_train.columns.to_list(),
    "similar_entries": ",\n".join([f"{doc.page_content} --> {doc.metadata['label']}" for doc in similar_documents]),
    "query": query_document
    })

In [None]:
# print(completion.text)
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
num_tokens = len(encoding.encode(str(completion.text)))
print("Num tokens:", num_tokens)

Num tokens: 2780
