In [3]:
################################################################################
# Load dataset and split it into training and test set
################################################################################

import pandas as pd
import os
from tabulate import tabulate

sample_size = 10000

# Load dateset
df = pd.read_csv(os.getcwd() + f'/data/sample-{sample_size}-2.csv')

# Split dataset according to attack type
normal_df = df[df['label'] == 0]
attack_df = df[df['label'] == 1]

# Drop columns
normal_df = normal_df.drop(columns=['label', 'type'])
attack_df = attack_df.drop(columns=['label', 'type'])

# Split dataset into training and test set
normal_df_train = normal_df.sample(frac=0.8, random_state=42)
normal_df_test = normal_df.drop(normal_df_train.index)
attack_df_train = attack_df.sample(frac=0.8, random_state=42)
attack_df_test = attack_df.drop(attack_df_train.index)

# Print dataset sizes in a table
data = [
    ["Normal", normal_df.shape[0], normal_df_train.shape[0], normal_df_test.shape[0]],
    ["Attack", attack_df.shape[0], attack_df_train.shape[0], attack_df_test.shape[0]]
]
print(tabulate(data, headers=["Atack type", "Total", "Train", "Test"], tablefmt="grid"))

+--------------+---------+---------+--------+
| Atack type   |   Total |   Train |   Test |
| Normal       |    5000 |    4000 |   1000 |
+--------------+---------+---------+--------+
| Attack       |    5000 |    4000 |   1000 |
+--------------+---------+---------+--------+


In [None]:
################################################################################
# Generate Feature Importance
################################################################################

import os
import dotenv
import time
import numpy as np
from langchain_core.prompts import PromptTemplate
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_chroma import Chroma
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

dotenv.load_dotenv(os.getcwd() + '/../.env')

template = """
You are provided with network data entries categorized as either normal or attack, along with their corresponding feature names.
Carefully analyze the differences between normal and attack entries by comparing corresponding fields.
Output top 5 important features that can be used to filter an entry as either normal or attack.
Output only in the Python list structure.

Feature Names:
```{feature_names}```

Normal Entries:
```{normal_entries}```

Attack Entries:
```{attack_entries}```

Example output:
['feature1', 'feature2', 'feature3', 'feature4', 'feature5']
"""
prompt = PromptTemplate(template=template, input_variables=["feature_names", "normal_entries", "attack_entries"])
llm = ChatOpenAI(model="gpt-4o", temperature=0.0)
model_name = "gpt-4o"
# llm = ChatGoogleGenerativeAI(model="gemini-1.5-pro", temperature=0.0)
# model_name = "gemini-1.5-pro"
chain = prompt | llm
train_set_size = sample_size
embeddings = HuggingFaceEmbeddings()
vector_store = Chroma(
    collection_name="ton-iot",
    embedding_function=embeddings, 
    persist_directory=f"./vector-stores/chroma-db-{train_set_size}-2")

normal_vectors = vector_store._collection.get(include=['embeddings'], where={'label': 'normal'})['embeddings']
normal_mean_vector = np.mean(normal_vectors, axis=0).tolist()
normal_documents = vector_store._collection.query(query_embeddings=[normal_mean_vector], n_results=10)['documents'][0]

attack_vectors = vector_store._collection.get(include=['embeddings'], where={'label': 'attack'})['embeddings']
attack_mean_vector = np.mean(attack_vectors, axis=0).tolist()
attack_documents = vector_store._collection.query(query_embeddings=[attack_mean_vector], n_results=10)['documents'][0]

completions = []
for i in range(10):
    completion = chain.invoke({
        "feature_names": normal_df_train.columns.to_list(),
        "normal_entries": ",\n".join([f"{doc} --> normal" for doc in normal_documents]),
        "attack_entries": ",\n".join([f"{doc} --> attack" for doc in attack_documents])
        })
    completions.append(completion.content)
    print(completion.content)
    time.sleep(10)

with open(f"results/feature-importance-{sample_size}-llm-{model_name}.txt", "a") as f:
    f.write("\n".join(completions))

In [4]:
################################################################################
# Generate Rules
################################################################################

import os
import dotenv
from langchain_core.prompts import PromptTemplate
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_anthropic import ChatAnthropic
from langchain_chroma import Chroma
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from statistics import mode
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

dotenv.load_dotenv(os.getcwd() + '/../.env')

template = """
You are provided with network data entries categorized as either normal or attack, along with their corresponding feature names.
Carefully analyze the differences between normal and attack entries by comparing corresponding fields.
Generate 5 simple and deterministic rules for top 5 important features to filter an entry as either normal or attack. 
Output only in the JSON format with the structure: 
{{'feature1': 'rule', 'feature2': 'rule', 'feature3': 'rule', 'feature4': 'rule', 'feature5': 'rule'}}.

Feature Names:
```{feature_names}```

Normal Entries:
```{normal_entries}```

Attack Entries:
```{attack_entries}```
"""
prompt = PromptTemplate(template=template, input_variables=["feature_names", "normal_entries", "attack_entries"])
llm = ChatOpenAI(model="gpt-4o", temperature=0.0)
model_name = "gpt-4o"
# llm = ChatGoogleGenerativeAI(model="gemini-1.5-pro", temperature=0.0)
# model_name = "gemini-1.5-pro"
# llm = ChatAnthropic(model='claude-3-opus-20240229')
# model_name = "claude-3-opus-20240229"
chain = prompt | llm
train_set_size = sample_size
embeddings = HuggingFaceEmbeddings()
vector_store = Chroma(
    collection_name="ton-iot",
    embedding_function=embeddings, 
    persist_directory=f"./vector-stores/chroma-db-{train_set_size}-2")

normal_vectors = vector_store._collection.get(include=['embeddings'], where={'label': 'normal'})['embeddings']
normal_mean_vector = np.mean(normal_vectors, axis=0).tolist()
normal_documents = vector_store._collection.query(query_embeddings=[normal_mean_vector], n_results=10)['documents']

attack_vectors = vector_store._collection.get(include=['embeddings'], where={'label': 'attack'})['embeddings']
attack_mean_vector = np.mean(attack_vectors, axis=0).tolist()
attack_documents = vector_store._collection.query(query_embeddings=[attack_mean_vector], n_results=10)['documents']

completion = chain.invoke({
    "feature_names": normal_df_train.columns.to_list(),
    "normal_entries": ",\n".join([f"{doc} --> normal" for doc in normal_documents]),
    "attack_entries": ",\n".join([f"{doc} --> attack" for doc in attack_documents])
    })

print(completion.content)

with open(f"results/generated-rules-{sample_size}-llm-{model_name}.txt", "a") as f:
    f.write(completion.content)



```json
{
  "proto": "if proto == 'udp' then normal else attack",
  "service": "if service == 'dns' then normal else attack",
  "conn_state": "if conn_state == 'S0' then normal else attack",
  "src_bytes": "if src_bytes == 66 then normal else attack",
  "dst_bytes": "if dst_bytes == 0 then normal else attack"
}
```


In [5]:
################################################################################
# Evaluate generated rule
################################################################################

from statistics import mode
from sklearn.metrics import classification_report, confusion_matrix
from tqdm import tqdm

datasets = {"normal": normal_df_test, "attack": attack_df_test}
y_pred = []
y_true = []
for attack_type, dataset in datasets.items():
    test_set_size = dataset.shape[0]
    predicted_attack_types = []
    for i in tqdm(range(test_set_size), ncols=100, desc=f"Predicting {attack_type} entries..."):
        predicted_attack_types.append("normal" if dataset.iloc[i]['proto'] == "udp" else "attack")
        predicted_attack_types.append("normal" if dataset.iloc[i]['service'] == "dns" else "attack")
        predicted_attack_types.append("normal" if dataset.iloc[i]['conn_state'] in ["S0"] else "attack")
        predicted_attack_types.append("normal" if dataset.iloc[i]['src_bytes'] == 66 else "attack")
        predicted_attack_types.append("normal" if dataset.iloc[i]['dst_bytes'] == 0 else "attack")
        # predicted_attack_types.append("attack" if dataset.iloc[i]['dst_ip_bytes'] == 40 else "normal")
        # predicted_attack_types.append("attack" if dataset.iloc[i]['src_pkts'] == 1 else "normal")
        # predicted_attack_types.append("attack" if dataset.iloc[i]['dst_port'] == 8080 else "normal")
        y_true.append(attack_type)
        y_pred.append(mode(predicted_attack_types))

c_report = classification_report(y_true, y_pred)
c_matrix = confusion_matrix(y_true, y_pred)

with open(f"results/result-llm-{sample_size}-2.txt", "w") as f:
    f.write(f"Classication Report\n{c_report}\n\nConfusion Matrix\n{c_matrix}")

print(c_report)
print(c_matrix)

Predicting normal entries...:   0%|                                        | 0/1000 [00:00<?, ?it/s]

Predicting normal entries...: 100%|███████████████████████████| 1000/1000 [00:00<00:00, 2138.15it/s]
Predicting attack entries...: 100%|███████████████████████████| 1000/1000 [00:00<00:00, 2384.59it/s]

              precision    recall  f1-score   support

      attack       1.00      1.00      1.00      1000
      normal       1.00      1.00      1.00      1000

    accuracy                           1.00      2000
   macro avg       1.00      1.00      1.00      2000
weighted avg       1.00      1.00      1.00      2000

[[1000    0]
 [   0 1000]]





In [None]:
################################################################################
# Get a Summary
################################################################################

import dotenv
from langchain_core.prompts import PromptTemplate
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_chroma import Chroma
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from statistics import mode
from sklearn.metrics import classification_report, confusion_matrix
import tiktoken     # https://github.com/openai/tiktoken

dotenv.load_dotenv(os.getcwd() + '/../.env')

template = """
Given normal and attack network data entries, output human understandable small summary on 
how attack and normal entries can be simply separated.

Feature Names:
```{feature_names}```

Normal Entries:
```{normal_entries}```

Attack Entries:
```{attack_entries}```
"""
prompt = PromptTemplate(template=template, input_variables=["feature_names", "normal_entries", "attack_entries"])
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.0)
# llm = ChatGoogleGenerativeAI(model="gemini-1.0-pro")
chain = prompt | llm
train_set_size = sample_size
embeddings = HuggingFaceEmbeddings()
vector_store = Chroma(
    collection_name="ton-iot",
    embedding_function=embeddings, 
    persist_directory=f"./vector-stores/chroma-db-{train_set_size}-2")
retriever = vector_store.as_retriever(
    search_type="mmr", 
    search_kwargs={"k": 5, "fetch_k": 5})
normal_documents = retriever.invoke(str(normal_df_test.iloc[0].to_list()), filter={"source": "ton-iot", "label": "normal"})
attack_documents = retriever.invoke(str(attack_df_test.iloc[0].to_list()), filter={"source": "ton-iot", "label": "attack"})
completion = chain.invoke({
    "feature_names": normal_df_train.columns.to_list(),
    "normal_entries": ",\n".join([f"{doc.page_content} --> {doc.metadata['label']}" for doc in normal_documents]),
    "attack_entries": ",\n".join([f"{doc.page_content} --> {doc.metadata['label']}" for doc in attack_documents])
    })
print(completion)



content='Based on the provided data entries, we can see that normal entries have higher values for features such as flow duration, header length, rate, duration, and total size compared to attack entries. Additionally, normal entries have more occurrences of protocols like HTTP, HTTPS, DNS, SSH, TCP, UDP, DHCP, ARP, ICMP, and IPv.\n\nOn the other hand, attack entries have lower values for the mentioned features and do not have as many occurrences of the mentioned protocols. Attack entries also tend to have higher values for features like magnitude, radius, covariance, variance, and weight compared to normal entries.\n\nIn summary, normal entries exhibit higher network activity and a wider range of protocols, while attack entries show lower network activity and fewer protocol occurrences.' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 149, 'prompt_tokens': 2787, 'total_tokens': 2936}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint':

In [None]:
################################################################################
# 
################################################################################

import dotenv
from langchain_core.prompts import PromptTemplate
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_chroma import Chroma
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from statistics import mode
from sklearn.metrics import classification_report, confusion_matrix

dotenv.load_dotenv()

template = """
Your task is to identify whether the query is attack or normal. Then 
generate a policy to filter the given query based on the values. 
You will be given feature names of the entries and similar entries along 
with the input query to make a decision.

Feature Names:
```{feature_names}```

Similar Entries:
```{similar_entries}```

Input Query: 
```{query}```

Policy:
"""
prompt = PromptTemplate(template=template, input_variables=["feature_names", "similar_entries", "query"])
# llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.0)
llm = ChatGoogleGenerativeAI(model="gemini-1.0-pro")
chain = prompt | llm
train_set_size = sample_size
embeddings = HuggingFaceEmbeddings()
vector_store = Chroma(
    collection_name="ton-iot",
    embedding_function=embeddings, 
    persist_directory=f"./vector-stores/chroma-db-{train_set_size}-2")
retriever = vector_store.as_retriever(
    search_type="mmr", 
    search_kwargs={"k": 5, "fetch_k": 5})
query_document = str(normal_df_test.iloc[5].to_list())
similar_documents = retriever.invoke(query_document, filter={"source": "ton-iot"})
chain.invoke({
    "feature_names": normal_df_train.columns.to_list(),
    "similar_entries": ",\n".join([f"{doc.page_content} --> {doc.metadata['label']}" for doc in similar_documents]),
    "query": query_document
    })

In [None]:
# print(completion.text)
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
num_tokens = len(encoding.encode(str(completion.text)))
print("Num tokens:", num_tokens)

Num tokens: 2780
