In [1]:
################################################################################
# Load dataset and split it into training and test set
################################################################################

import pandas as pd
import os
from tabulate import tabulate

sample_size = 100

# Load the pre-split train/test samples from ./data, resolved against the
# notebook's current working directory.
data_dir = os.path.join(os.getcwd(), 'data')
df_train = pd.read_csv(os.path.join(data_dir, f'sample-{sample_size}-2_train.csv'))
df_test = pd.read_csv(os.path.join(data_dir, f'sample-{sample_size}-2_test.csv'))


def _select_by_label(df, label):
    """Return the rows of `df` whose 'label' equals `label`, minus the target columns.

    Both 'attack_cat' and 'label' are dropped so only feature columns remain.
    """
    return df[df['label'] == label].drop(columns=['attack_cat', 'label'])


# Split each set by class: label 0 is treated as normal traffic, label 1 as attack.
normal_df_train = _select_by_label(df_train, 0)
normal_df_test = _select_by_label(df_test, 0)
attack_df_train = _select_by_label(df_train, 1)
attack_df_test = _select_by_label(df_test, 1)

# Print dataset sizes in a table (one row per class: total, train, test).
data = [
    [class_name, len(train_part) + len(test_part), len(train_part), len(test_part)]
    for class_name, train_part, test_part in (
        ("Normal", normal_df_train, normal_df_test),
        ("Attack", attack_df_train, attack_df_test),
    )
]
print(tabulate(data, headers=["Attack type", "Total", "Train", "Test"], tablefmt="grid"))

+--------------+---------+---------+--------+
| Atack type   |   Total |   Train |   Test |
| Normal       |      50 |      40 |     10 |
+--------------+---------+---------+--------+
| Attack       |      50 |      40 |     10 |
+--------------+---------+---------+--------+


In [2]:
################################################################################
# Generate a Rule
################################################################################

import dotenv
from langchain_core.prompts import PromptTemplate
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_chroma import Chroma
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from statistics import mode
from sklearn.metrics import classification_report, confusion_matrix

# Load API keys from the .env file one level above the working directory.
dotenv.load_dotenv(os.getcwd() + '/../.env')

# Prompt asking the model for a three-feature rule, answered as JSON.
template = """
Given normal and attack network data entries, output simple rule to verify an entry is
normal or attack by evaluating top 3 features. Organize your answer into JSON format 
{{'header1': 'rule', 'header2': 'rule', 'header3': 'rule'}}.

Headers:
```{headers}```

Normal Entries:
```{normal_entries}```

Attack Entries:
```{attack_entries}```
"""
prompt = PromptTemplate(template=template, input_variables=["headers", "normal_entries", "attack_entries"])
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.0)
# llm = ChatGoogleGenerativeAI(model="gemini-1.0-pro")
chain = prompt | llm
train_set_size = sample_size
embeddings = HuggingFaceEmbeddings()
vector_store = Chroma(
    collection_name="unsw-nb15",
    embedding_function=embeddings,
    persist_directory=f"./vector-stores/chroma-db-{train_set_size}-2")

# Unfiltered retriever, kept so the name `retriever` stays defined as before.
retriever = vector_store.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 5, "fetch_k": 5})

# The metadata filter must live in `search_kwargs`: passing `filter=` to
# `retriever.invoke()` is not reliably forwarded to the vector store (some
# langchain-core releases drop extra invoke kwargs), which would let documents
# of either class end up in both example lists. Chroma also needs `$and` to
# combine more than one metadata condition.
# NOTE(review): assumes documents were ingested with metadata
# source="unsw-nb15" and label in {"normal", "attack"}, as the original filter
# implied -- confirm against the ingestion notebook.
label_retrievers = {
    label: vector_store.as_retriever(
        search_type="mmr",
        search_kwargs={
            "k": 5,
            "fetch_k": 5,
            "filter": {"$and": [{"source": "unsw-nb15"}, {"label": label}]},
        })
    for label in ("normal", "attack")
}

# Use the first test entry of each class as the similarity query.
normal_documents = label_retrievers["normal"].invoke(str(normal_df_test.iloc[0].to_list()))
attack_documents = label_retrievers["attack"].invoke(str(attack_df_test.iloc[0].to_list()))
if not normal_documents or not attack_documents:
    # Fail before spending an LLM call on a prompt with an empty example list.
    raise ValueError(
        "Vector store returned no documents for at least one class "
        f"(normal={len(normal_documents)}, attack={len(attack_documents)}); "
        "check the collection contents and the metadata filter.")

completion = chain.invoke({
    "headers": normal_df_train.columns.to_list(),
    "normal_entries": ",\n".join([f"{doc.page_content} --> {doc.metadata['label']}" for doc in normal_documents]),
    "attack_entries": ",\n".join([f"{doc.page_content} --> {doc.metadata['label']}" for doc in attack_documents])
    })
print(completion)

  from .autonotebook import tqdm as notebook_tqdm


content='{\n    "dur": "If dur <= 0.351856, then normal else attack",\n    "spkts": "If spkts <= 10, then normal else attack",\n    "sbytes": "If sbytes <= 804, then normal else attack"\n}' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 59, 'prompt_tokens': 2044, 'total_tokens': 2103}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None} id='run-1bc064f8-9539-4065-85f3-e81c684da432-0' usage_metadata={'input_tokens': 2044, 'output_tokens': 59, 'total_tokens': 2103}


In [3]:
################################################################################
# Evaluate generated rule
################################################################################

from statistics import mode
from sklearn.metrics import classification_report, confusion_matrix
from tqdm import tqdm

# Thresholds copied by hand from the rule printed by the rule-generation cell:
# a value strictly above its threshold votes "attack", otherwise "normal".
rule_thresholds = {"dur": 0.351856, "spkts": 10, "sbytes": 804}

datasets = {"normal": normal_df_test, "attack": attack_df_test}
y_pred = []
y_true = []
for attack_type, dataset in datasets.items():
    test_set_size = dataset.shape[0]
    for i in tqdm(range(test_set_size), ncols=100, desc=f"Predicting {attack_type} entries..."):
        entry = dataset.iloc[i]  # look the row up once instead of once per feature
        votes = [
            "attack" if entry[feature] > threshold else "normal"
            for feature, threshold in rule_thresholds.items()
        ]
        y_true.append(attack_type)
        # Three two-way votes always have a strict majority, so mode() is unambiguous.
        y_pred.append(mode(votes))

c_report = classification_report(y_true, y_pred)
c_matrix = confusion_matrix(y_true, y_pred)

# Create the output directory on first use so a fresh checkout does not fail
# with FileNotFoundError when the report is written.
results_dir = "results"
os.makedirs(results_dir, exist_ok=True)
with open(os.path.join(results_dir, f"result-llm-{sample_size}-2-{train_set_size}.txt"), "w") as f:
    f.write(f"Classification Report\n{c_report}\n\nConfusion Matrix\n{c_matrix}")

print(c_report)
print(c_matrix)

Predicting normal entries...: 100%|███████████████████████████████| 10/10 [00:00<00:00, 1067.20it/s]
Predicting attack entries...: 100%|███████████████████████████████| 10/10 [00:00<00:00, 1389.99it/s]

              precision    recall  f1-score   support

      attack       0.25      0.20      0.22        10
      normal       0.33      0.40      0.36        10

    accuracy                           0.30        20
   macro avg       0.29      0.30      0.29        20
weighted avg       0.29      0.30      0.29        20

[[2 8]
 [6 4]]





In [None]:
################################################################################
# Get a Summary
################################################################################

import os
import dotenv
from langchain_core.prompts import PromptTemplate
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_chroma import Chroma
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from statistics import mode
from sklearn.metrics import classification_report, confusion_matrix

# Load API keys from the .env file one level above the working directory.
dotenv.load_dotenv(os.getcwd() + '/../.env')

# Prompt asking the model for a short, human-readable separation summary.
template = """
Given normal and attack network data entries, output human understandable small summary on 
how attack and normal entries can be simply separated.

Headers:
```{headers}```

Normal Entries:
```{normal_entries}```

Attack Entries:
```{attack_entries}```
"""
prompt = PromptTemplate(template=template, input_variables=["headers", "normal_entries", "attack_entries"])
llm = ChatOpenAI(model="gpt-4o", temperature=0.0)
# llm = ChatGoogleGenerativeAI(model="gemini-1.0-pro")
chain = prompt | llm
train_set_size = sample_size
embeddings = HuggingFaceEmbeddings()
vector_store = Chroma(
    collection_name="unsw-nb15",
    embedding_function=embeddings,
    persist_directory=f"./vector-stores/chroma-db-{train_set_size}-2")

# Unfiltered retriever, kept so the name `retriever` stays defined as before.
retriever = vector_store.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 5, "fetch_k": 5})

# The metadata filter must live in `search_kwargs`: passing `filter=` to
# `retriever.invoke()` is not reliably forwarded to the vector store (some
# langchain-core releases drop extra invoke kwargs), which would let documents
# of either class end up in both example lists. Chroma also needs `$and` to
# combine more than one metadata condition.
# NOTE(review): assumes documents were ingested with metadata
# source="unsw-nb15" and label in {"normal", "attack"}, as the original filter
# implied -- confirm against the ingestion notebook.
label_retrievers = {
    label: vector_store.as_retriever(
        search_type="mmr",
        search_kwargs={
            "k": 5,
            "fetch_k": 5,
            "filter": {"$and": [{"source": "unsw-nb15"}, {"label": label}]},
        })
    for label in ("normal", "attack")
}

# Use the first test entry of each class as the similarity query.
normal_documents = label_retrievers["normal"].invoke(str(normal_df_test.iloc[0].to_list()))
attack_documents = label_retrievers["attack"].invoke(str(attack_df_test.iloc[0].to_list()))
if not normal_documents or not attack_documents:
    # Fail before spending an LLM call on a prompt with an empty example list.
    raise ValueError(
        "Vector store returned no documents for at least one class "
        f"(normal={len(normal_documents)}, attack={len(attack_documents)}); "
        "check the collection contents and the metadata filter.")

completion = chain.invoke({
    "headers": normal_df_train.columns.to_list(),
    "normal_entries": ",\n".join([f"{doc.page_content} --> {doc.metadata['label']}" for doc in normal_documents]),
    "attack_entries": ",\n".join([f"{doc.page_content} --> {doc.metadata['label']}" for doc in attack_documents])
    })
print(completion)



content='To separate attack and normal network data entries, we can look for distinguishing patterns or characteristics in the provided data. Here are some key observations that can help in differentiating between attack and normal entries:\n\n1. **Packet Counts (`spkts` and `dpkts`)**:\n   - Attack entries tend to have a lower number of destination packets (`dpkts`). For example, `dpkts` values for attack entries are often 6 or 8, whereas normal entries have values like 8, 10, or 18.\n\n2. **Byte Counts (`sbytes` and `dbytes`)**:\n   - Attack entries generally have lower destination byte counts (`dbytes`). For instance, attack entries have `dbytes` values like 268 and 1032, while normal entries have higher values like 1120, 1330, and 10168.\n\n3. **Load (`sload` and `dload`)**:\n   - Attack entries show significantly lower destination load (`dload`). For example, attack entries have `dload` values like 5092.992676 and 30557.85742, whereas normal entries have higher values like 17406.6

In [8]:
# print(completion.text)
import tiktoken     # https://github.com/openai/tiktoken
# NOTE(review): the encoding is chosen for gpt-3.5-turbo, while the summary
# cell above calls gpt-4o -- confirm which model's tokenizer is intended.
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

# Read the generated text from `.content`. Depending on the langchain-core
# release, `.text` on a message may be missing, a method, or a property, so
# `str(completion.text)` can end up tokenizing a bound-method repr instead of
# the model's answer.
completion_text = completion.content
if not isinstance(completion_text, str):
    # Some chat models return a list of content blocks; fall back to its string form.
    completion_text = str(completion_text)

num_tokens = len(encoding.encode(completion_text))
print("Num tokens:", num_tokens)

Num tokens: 2780


In [None]:
################################################################################
# Classify a query and generate a filtering policy
################################################################################

import dotenv
from langchain_core.prompts import PromptTemplate
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_chroma import Chroma
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from statistics import mode
from sklearn.metrics import classification_report, confusion_matrix

# Load API keys from a .env file discovered by python-dotenv's default search.
dotenv.load_dotenv()

# Prompt asking the model to classify one query and propose a filtering policy.
template = """
Your task is to identify whether the query is attack or normal. Then 
generate a policy to filter the given query based on the values. 
You will be given headers of the entries and similar entries along 
with the input query to make a decision.

Headers:
```{headers}```

Similar Entries:
```{similar_entries}```

Input Query: 
```{query}```

Policy:
"""
prompt = PromptTemplate(template=template, input_variables=["headers", "similar_entries", "query"])
# llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.0)
llm = ChatGoogleGenerativeAI(model="gemini-1.0-pro")
chain = prompt | llm
train_set_size = sample_size
embeddings = HuggingFaceEmbeddings()
# NOTE(review): this cell targets the "cic-iot" collection/source while the
# query and headers come from the UNSW-NB15 frames loaded in the first cell and
# the persist directory is the one the "unsw-nb15" cells use -- confirm that
# this collection exists there and that the mismatch is intended.
vector_store = Chroma(
    collection_name="cic-iot",
    embedding_function=embeddings,
    persist_directory=f"./vector-stores/chroma-db-{train_set_size}-2")
# The metadata filter belongs in `search_kwargs`: passing `filter=` to
# `retriever.invoke()` is not reliably forwarded to the vector store (some
# langchain-core releases drop extra invoke kwargs).
retriever = vector_store.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 5, "fetch_k": 5, "filter": {"source": "cic-iot"}})
query_document = str(normal_df_test.iloc[5].to_list())
similar_documents = retriever.invoke(query_document)
if not similar_documents:
    # Fail before spending an LLM call on a prompt with no reference entries.
    raise ValueError(
        "Vector store returned no similar documents; "
        "check the collection name and the metadata filter.")
# Last expression of the cell: the model's answer is shown as the cell output.
chain.invoke({
    "headers": normal_df_train.columns.to_list(),
    "similar_entries": ",\n".join([f"{doc.page_content} --> {doc.metadata['label']}" for doc in similar_documents]),
    "query": query_document
    })