In [18]:
################################################################################
# Load dataset and split it into training and test set
################################################################################

import pandas as pd
import os
from tabulate import tabulate

sample_size = 100

# Fraction of each class sampled into the training set, and the seed that keeps
# the sampling reproducible between runs.
train_fraction = 0.8
random_state = 42

# Load dataset (path is resolved relative to the current working directory)
df = pd.read_csv(os.path.join(os.getcwd(), 'data', f'sample-{sample_size}-2.csv'))

# Split dataset according to attack type: rows labelled 'BenignTraffic' are
# normal traffic, every other label is treated as an attack
is_benign = df['label'] == 'BenignTraffic'
normal_df = df[is_benign]
attack_df = df[~is_benign]

# Drop the label column; from here on the class is implied by which frame a row is in
normal_df = normal_df.drop(columns=['label'])
attack_df = attack_df.drop(columns=['label'])

# Split each class into training and test set: sample the training rows, then
# use whatever rows were not sampled as the test set
normal_df_train = normal_df.sample(frac=train_fraction, random_state=random_state)
normal_df_test = normal_df.drop(normal_df_train.index)
attack_df_train = attack_df.sample(frac=train_fraction, random_state=random_state)
attack_df_test = attack_df.drop(attack_df_train.index)

# Print dataset sizes in a table
data = [
    ["Normal", normal_df.shape[0], normal_df_train.shape[0], normal_df_test.shape[0]],
    ["Attack", attack_df.shape[0], attack_df_train.shape[0], attack_df_test.shape[0]]
]
print(tabulate(data, headers=["Attack type", "Total", "Train", "Test"], tablefmt="grid"))

+--------------+---------+---------+--------+
| Atack type   |   Total |   Train |   Test |
+==============+=========+=========+========+
| Normal       |      50 |      40 |     10 |
+--------------+---------+---------+--------+
| Attack       |      50 |      40 |     10 |
+--------------+---------+---------+--------+


In [19]:
################################################################################
# Generate a Rule
################################################################################

import os
import dotenv
from langchain_core.prompts import PromptTemplate
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_anthropic import ChatAnthropic
from langchain_chroma import Chroma
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from statistics import mode
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# Load API keys from the .env file one directory above the working directory
dotenv.load_dotenv(os.getcwd() + '/../.env')

template = """
You are provided with network data entries categorized as either normal or attack, along with their corresponding headers.
Carefully analyze the differences between normal and attack entries by comparing corresponding fields.
Generate 5 deterministic and simple rules that can be used to filter an entry as either normal or attack. 
Output only in the JSON format with the structure: 
{{'header1': 'rule', 'header2': 'rule', 'header3': 'rule', 'header4': 'rule', 'header5': 'rule'}}.

Headers:
```{headers}```

Normal Entries:
```{normal_entries}```

Attack Entries:
```{attack_entries}```
"""
prompt = PromptTemplate(template=template, input_variables=["headers", "normal_entries", "attack_entries"])
# llm = ChatOpenAI(model="gpt-4o", temperature=0.0)
# llm = ChatGoogleGenerativeAI(model="gemini-1.0-pro")
llm = ChatAnthropic(model='claude-3-opus-20240229')
chain = prompt | llm
train_set_size = sample_size
embeddings = HuggingFaceEmbeddings()
vector_store = Chroma(
    collection_name="cic-iot",
    embedding_function=embeddings,
    persist_directory=f"./vector-stores/chroma-db-{train_set_size}-2")

# Pick representative entries per class: average the stored embeddings of the
# class, then fetch the 10 stored entries closest to that mean vector.
# The query is restricted to the same label so that neighbours from the other
# class cannot end up in the prompt under the wrong heading.
# `query` returns one result list per query embedding; a single embedding is
# sent, so the entries of interest are the first (and only) inner list.
normal_vectors = vector_store._collection.get(include=['embeddings'], where={'label': 'normal'})['embeddings']
normal_mean_vector = np.mean(normal_vectors, axis=0).tolist()
normal_documents = vector_store._collection.query(
    query_embeddings=[normal_mean_vector],
    n_results=10,
    where={'label': 'normal'},
)['documents'][0]

attack_vectors = vector_store._collection.get(include=['embeddings'], where={'label': 'attack'})['embeddings']
attack_mean_vector = np.mean(attack_vectors, axis=0).tolist()
attack_documents = vector_store._collection.query(
    query_embeddings=[attack_mean_vector],
    n_results=10,
    where={'label': 'attack'},
)['documents'][0]

# One "<entry> --> <label>" line per retrieved entry
completion = chain.invoke({
    "headers": normal_df_train.columns.to_list(),
    "normal_entries": ",\n".join([f"{doc} --> normal" for doc in normal_documents]),
    "attack_entries": ",\n".join([f"{doc} --> attack" for doc in attack_documents])
    })
print(completion.content)



BadRequestError: Error code: 400 - {'type': 'error', 'error': {'type': 'invalid_request_error', 'message': 'Your credit balance is too low to access the Claude API. Please go to Plans & Billing to upgrade or purchase credits.'}}

In [16]:
################################################################################
# Evaluate generated rule
################################################################################

from statistics import mode
from sklearn.metrics import classification_report, confusion_matrix
from tqdm import tqdm

# train_set_size = 100

# Evaluate the hand-transcribed rules on the held-out entries: every rule casts
# one "attack"/"normal" vote per entry and the majority vote is the prediction.
datasets = {"normal": normal_df_test, "attack": attack_df_test}
y_pred = []
y_true = []
for attack_type, dataset in datasets.items():
    test_set_size = dataset.shape[0]
    for i in tqdm(range(test_set_size), ncols=100, desc=f"Predicting {attack_type} entries..."):
        entry = dataset.iloc[i]
        # Start a fresh ballot for every entry. Keeping one list for the whole
        # dataset would make each prediction the majority over all entries seen
        # so far instead of the vote for this entry alone.
        predicted_attack_types = []
        # predicted_attack_types.append("attack" if entry['flow_duration'] < 1 else "normal")
        # predicted_attack_types.append("attack" if entry['Header_Length'] < 100 else "normal")
        predicted_attack_types.append("attack" if entry['Protocol Type'] == 6 else "normal")
        predicted_attack_types.append("attack" if entry['Rate'] < 50 else "normal")
        predicted_attack_types.append("attack" if entry['ack_flag_number'] == 0 else "normal")
        predicted_attack_types.append("attack" if entry['syn_flag_number'] == 1 else "normal")
        predicted_attack_types.append("attack" if entry['Duration'] == 64 else "normal")
        # predicted_attack_types.append("attack" if entry['Weight'] < 200 else "normal")
        # predicted_attack_types.append("attack" if entry['HTTP'] == 0 else "normal")
        # predicted_attack_types.append("attack" if entry['Tot sum'] == 525 else "normal")
        # predicted_attack_types.append("attack" if entry['Max'] == 50 else "normal")
        y_true.append(attack_type)
        # Five active rules, so the two-way vote cannot tie; keep the number of
        # active rules odd when toggling the commented-out ones.
        y_pred.append(mode(predicted_attack_types))

c_report = classification_report(y_true, y_pred)
c_matrix = confusion_matrix(y_true, y_pred)

# with open(f"results/result-llm-{sample_size}-2-{train_set_size}.txt", "w") as f:
#     f.write(f"Classication Report\n{c_report}\n\nConfusion Matrix\n{c_matrix}")

print(c_report)
print(c_matrix)

Predicting normal entries...: 100%|███████████████████████████████| 10/10 [00:00<00:00, 1434.69it/s]


Predicting attack entries...: 100%|███████████████████████████████| 10/10 [00:00<00:00, 1854.41it/s]

              precision    recall  f1-score   support

      attack       1.00      1.00      1.00        10
      normal       1.00      1.00      1.00        10

    accuracy                           1.00        20
   macro avg       1.00      1.00      1.00        20
weighted avg       1.00      1.00      1.00        20

[[10  0]
 [ 0 10]]





In [6]:
################################################################################
# Generate Feature Importance
################################################################################

import os
import dotenv
from langchain_core.prompts import PromptTemplate
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_chroma import Chroma
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from statistics import mode
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# Load API keys from the .env file one directory above the working directory
dotenv.load_dotenv(os.getcwd() + '/../.env')

template = """
You are provided with network data entries categorized as either normal or attack, along with their corresponding headers.
Carefully analyze the differences between normal and attack entries by comparing corresponding fields.
Generate top 5 features that can be used to filter an entry as either normal or attack.
Output only in the Python list structure.

Headers:
```{headers}```

Normal Entries:
```{normal_entries}```

Attack Entries:
```{attack_entries}```

Example output:
['feature', 'feature2', 'feature3', 'feature4', 'feature5']
"""
prompt = PromptTemplate(template=template, input_variables=["headers", "normal_entries", "attack_entries"])
llm = ChatOpenAI(model="gpt-4o", temperature=0.0)
# llm = ChatGoogleGenerativeAI(model="gemini-1.0-pro")
chain = prompt | llm
train_set_size = sample_size
embeddings = HuggingFaceEmbeddings()
vector_store = Chroma(
    collection_name="cic-iot",
    embedding_function=embeddings,
    persist_directory=f"./vector-stores/chroma-db-{train_set_size}-2")
# Not used by the centroid-based selection below; kept so the name stays defined.
retriever = vector_store.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 10, "fetch_k": 10})

# Pick representative entries per class: average the stored embeddings of the
# class, then fetch the 10 stored entries closest to that mean vector.
# The query is restricted to the same label so that neighbours from the other
# class cannot end up in the prompt under the wrong heading.
# `query` returns one result list per query embedding; a single embedding is
# sent, so the entries of interest are the first (and only) inner list.
normal_vectors = vector_store._collection.get(include=['embeddings'], where={'label': 'normal'})['embeddings']
normal_mean_vector = np.mean(normal_vectors, axis=0).tolist()
normal_documents = vector_store._collection.query(
    query_embeddings=[normal_mean_vector],
    n_results=10,
    where={'label': 'normal'},
)['documents'][0]

attack_vectors = vector_store._collection.get(include=['embeddings'], where={'label': 'attack'})['embeddings']
attack_mean_vector = np.mean(attack_vectors, axis=0).tolist()
attack_documents = vector_store._collection.query(
    query_embeddings=[attack_mean_vector],
    n_results=10,
    where={'label': 'attack'},
)['documents'][0]

# One "<entry> --> <label>" line per retrieved entry
completion = chain.invoke({
    "headers": normal_df_train.columns.to_list(),
    "normal_entries": ",\n".join([f"{doc} --> normal" for doc in normal_documents]),
    "attack_entries": ",\n".join([f"{doc} --> attack" for doc in attack_documents])
    })
print(completion.content)



```python
['flow_duration', 'Header_Length', 'Rate', 'syn_flag_number', 'ack_flag_number']
```


In [None]:
################################################################################
# Get a Summary
################################################################################

import dotenv
from langchain_core.prompts import PromptTemplate
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_chroma import Chroma
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from statistics import mode
from sklearn.metrics import classification_report, confusion_matrix
import tiktoken     # https://github.com/openai/tiktoken

# Load API keys from the .env file one directory above the working directory
dotenv.load_dotenv(os.getcwd() + '/../.env')

template = """
Given normal and attack network data entries, output human understandable small summary on 
how attack and normal entries can be simply separated.

Headers:
```{headers}```

Normal Entries:
```{normal_entries}```

Attack Entries:
```{attack_entries}```
"""
prompt = PromptTemplate(template=template, input_variables=["headers", "normal_entries", "attack_entries"])
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.0)
# llm = ChatGoogleGenerativeAI(model="gemini-1.0-pro")
chain = prompt | llm
train_set_size = sample_size
embeddings = HuggingFaceEmbeddings()
vector_store = Chroma(
    collection_name="cic-iot",
    embedding_function=embeddings,
    persist_directory=f"./vector-stores/chroma-db-{train_set_size}-2")

# Unfiltered retriever, kept so the name stays defined for any later use.
mmr_search_kwargs = {"k": 5, "fetch_k": 5}
retriever = vector_store.as_retriever(
    search_type="mmr",
    search_kwargs=mmr_search_kwargs)

# The metadata filter has to be part of the retriever's search_kwargs: passed
# as an extra keyword to `invoke` it is not reliably forwarded to the vector
# store. Chroma also needs several conditions combined under `$and`.
# NOTE(review): assumes the stored metadata carries both `source` and `label`
# keys, as the original filter implied - verify against the ingestion code.
normal_filter = {"$and": [{"source": "cic-iot"}, {"label": "normal"}]}
attack_filter = {"$and": [{"source": "cic-iot"}, {"label": "attack"}]}
normal_retriever = vector_store.as_retriever(
    search_type="mmr",
    search_kwargs={**mmr_search_kwargs, "filter": normal_filter})
attack_retriever = vector_store.as_retriever(
    search_type="mmr",
    search_kwargs={**mmr_search_kwargs, "filter": attack_filter})

# Use the first test entry of each class as the query text
normal_documents = normal_retriever.invoke(str(normal_df_test.iloc[0].to_list()))
attack_documents = attack_retriever.invoke(str(attack_df_test.iloc[0].to_list()))

# One "<entry> --> <label>" line per retrieved document
completion = chain.invoke({
    "headers": normal_df_train.columns.to_list(),
    "normal_entries": ",\n".join([f"{doc.page_content} --> {doc.metadata['label']}" for doc in normal_documents]),
    "attack_entries": ",\n".join([f"{doc.page_content} --> {doc.metadata['label']}" for doc in attack_documents])
    })
print(completion)



content='Based on the provided data entries, we can see that normal entries have higher values for features such as flow duration, header length, rate, duration, and total size compared to attack entries. Additionally, normal entries have more occurrences of protocols like HTTP, HTTPS, DNS, SSH, TCP, UDP, DHCP, ARP, ICMP, and IPv.\n\nOn the other hand, attack entries have lower values for the mentioned features and do not have as many occurrences of the mentioned protocols. Attack entries also tend to have higher values for features like magnitude, radius, covariance, variance, and weight compared to normal entries.\n\nIn summary, normal entries exhibit higher network activity and a wider range of protocols, while attack entries show lower network activity and fewer protocol occurrences.' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 149, 'prompt_tokens': 2787, 'total_tokens': 2936}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint':

In [None]:
# Count the tokens of the text held by `completion` under the gpt-3.5-turbo
# tokenizer and report the total.
# NOTE(review): relies on `completion` exposing `.text`; confirm which object
# (model reply or rendered prompt) is meant to be measured here.
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
token_ids = encoding.encode(str(completion.text))
num_tokens = len(token_ids)
print("Num tokens:", num_tokens)

Num tokens: 2780
