In [3]:
import pandas as pd
import os 

# Location of the Edge-IIoTset ML/DL CSV, resolved against the current working
# directory. os.path.join is used instead of gluing segments together with '/'.
DATASET_PATH = os.path.join(
    os.getcwd(),
    'data',
    'edge-iiot',
    'Edge-IIoTset dataset',
    'Selected dataset for ML and DL',
    'ML-EdgeIIoT-dataset.csv',
)

# low_memory=False: parse the file in one pass so column dtypes are inferred
# from the whole file rather than chunk by chunk.
df = pd.read_csv(DATASET_PATH, low_memory=False)

In [4]:
import numpy as np
import warnings
warnings.filterwarnings('ignore')

ddos_dfs = {}
attack_types = ['DDoS_HTTP', 'DDoS_UDP', 'DDoS_ICMP', 'DDoS_TCP']

# For each attack type, collect the columns to discard: the two label columns
# plus every numeric column whose values are all zero for that attack type.
columns_to_drop = {}
for attack_type in attack_types:
    # .copy() yields an independent frame, so the column drop further down
    # never operates on a slice of df.
    ddos_dfs[attack_type] = df[df['Attack_type'] == attack_type].copy()
    columns_numeric = ddos_dfs[attack_type].select_dtypes(include=[np.number]).columns
    columns_to_drop[attack_type] = ['Attack_label', 'Attack_type']
    for c in columns_numeric:
        # Inspect the values themselves: a zero mean alone would also match a
        # column whose positive and negative entries cancel out. Missing
        # values are ignored, and a column with no observed values is kept
        # (the same outcome mean() == 0 gave for those cases).
        observed = ddos_dfs[attack_type][c].dropna()
        if len(observed) > 0 and (observed == 0).all():
            columns_to_drop[attack_type].append(c)

# Drop the selected columns and record the column counts before/after,
# split into numeric and non-numeric columns.
columns_count_data = {"Attack Type": [], "#Columns Before": [], 
                      "#Columns After": [], "#Columns After (Numeric)": [], 
                      "#Columns After (Non-numeric)": []}
for attack_type in attack_types:
    columns_count_data["Attack Type"].append(attack_type)
    columns_count_data["#Columns Before"].append(len(ddos_dfs[attack_type].columns))
    # Rebind to the reduced frame instead of mutating with inplace=True.
    ddos_dfs[attack_type] = ddos_dfs[attack_type].drop(columns=columns_to_drop[attack_type])
    columns_count_data["#Columns After"].append(len(ddos_dfs[attack_type].columns))
    columns_all = ddos_dfs[attack_type].columns
    columns_numeric = ddos_dfs[attack_type].select_dtypes(include=[np.number]).columns
    columns_non_numeric = [i for i in columns_all if i not in columns_numeric]
    columns_count_data["#Columns After (Numeric)"].append(len(columns_numeric))
    columns_count_data["#Columns After (Non-numeric)"].append(len(columns_non_numeric))

# One summary row per attack type; the bare expression renders it as the cell output.
columns_count_df = pd.DataFrame(columns_count_data)
columns_count_df

Unnamed: 0,Attack Type,#Columns Before,#Columns After,#Columns After (Numeric),#Columns After (Non-numeric)
0,DDoS_HTTP,63,33,14,19
1,DDoS_UDP,63,22,3,19
2,DDoS_ICMP,63,21,2,19
3,DDoS_TCP,63,29,10,19


In [5]:
# Draw a fixed-size, seeded random sample of attack rows and of 'Normal' rows
# for each attack type, with both samples restricted to the same columns.
sample_size = 100
random_samples_ddos_dfs = {}
random_samples_normal_dfs = {}

# The 'Normal' rows do not depend on the attack type, so filter the full
# table once instead of once per loop iteration.
normal_df = df[df['Attack_type'] == 'Normal']

for attack_type in attack_types:
    ddos_sample = ddos_dfs[attack_type].sample(n=sample_size, random_state=1)
    random_samples_ddos_dfs[attack_type] = ddos_sample

    # Same seed on every iteration, so each attack type is paired with the
    # same benign rows; only the retained columns differ.
    normal_sample = normal_df.sample(n=sample_size, random_state=1)
    # Local name on purpose: this must not overwrite the per-attack-type
    # columns_to_drop mapping built when the attack frames were pruned.
    extra_columns = [c for c in normal_sample.columns if c not in ddos_sample.columns]
    random_samples_normal_dfs[attack_type] = normal_sample.drop(columns=extra_columns)

### Round 1

In [13]:
from langchain_community.chat_models import ChatOllama
from langchain_community.llms import Ollama
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
import tiktoken
import json
from sklearn.metrics import classification_report

# Round 1 settings: number of few-shot rows per class placed in the prompt,
# number of rows per class that are then classified, and the Ollama model tag.
train_set_size, test_set_size = 5, 5
model = "gemma:7b"

# Per-attack-type stores for expected labels and model responses
# (two separate dicts).
y_true, y_pred = {}, {}

def _escape_braces(text):
    """Double every curly brace so a prompt template treats ``text`` literally."""
    return text.replace("{", "{{").replace("}", "}}")


def _extract_label(response):
    """Map a raw model response to 'ATTACK', 'BENIGN' or 'UNKNOWN'.

    The match is case-insensitive. A response that mentions both labels, or
    neither (for example an echo of the question), is reported as 'UNKNOWN'.
    """
    upper = response.upper()
    mentions_attack = "ATTACK" in upper
    mentions_benign = "BENIGN" in upper
    if mentions_attack and not mentions_benign:
        return "ATTACK"
    if mentions_benign and not mentions_attack:
        return "BENIGN"
    return "UNKNOWN"


def predict(attack_type):
    """Few-shot classify held-out benign and attack rows for one attack type.

    The prompt is built from the system prompt stored under key "0" in
    prompts.json, the column names of the attack sample, and the first
    ``train_set_size`` rows of each class as labelled examples. The following
    ``test_set_size`` rows of each class are then sent to the model one by one.

    Side effects:
        - ``y_true[attack_type]`` receives the expected labels.
        - ``y_pred[attack_type]`` receives the raw model responses.
        - Prints the prompt token count, each response, and a classification
          report computed on labels extracted from the responses.

    The token count uses the gpt-3.5-turbo tiktoken encoding while the model
    is whatever ``model`` names, so treat it as an estimate.

    Args:
        attack_type: key into random_samples_ddos_dfs / random_samples_normal_dfs.

    Raises:
        IndexError: if either sample has fewer than
            train_set_size + test_set_size rows.
    """
    benign_df = random_samples_normal_dfs[attack_type]
    attack_df = random_samples_ddos_dfs[attack_type]

    # Fail before any model call instead of part-way through the run.
    rows_needed = train_set_size + test_set_size
    rows_available = min(len(benign_df), len(attack_df))
    if rows_needed > rows_available:
        raise IndexError(
            f"train_set_size + test_set_size = {rows_needed} exceeds the "
            f"{rows_available} sampled rows available for {attack_type}"
        )

    llm = Ollama(model=model, temperature=0)
    encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

    with open('prompts.json', encoding='utf-8') as f:
        # Left unescaped: the stored prompt may itself be written as a template.
        system_prompt1 = ("system", json.load(f)["0"])
    # Data-derived text is escaped because ChatPromptTemplate would otherwise
    # read any {...} inside a column name or sample value as a variable.
    system_prompt2 = ("system", _escape_braces(
        f"The samples has following fields respectively {attack_df.columns.tolist()}."))

    benign_samples = "".join(
        str(benign_df.iloc[i].values.tolist()) + ">>> BENIGN\n"
        for i in range(train_set_size)
    )
    attack_samples = "".join(
        str(attack_df.iloc[i].values.tolist()) + ">>> ATTACK\n"
        for i in range(train_set_size)
    )
    user_prompt = ("user", "Sample logs are given below ```"
                   + _escape_braces(benign_samples + attack_samples) + "```")
    # The row under test is supplied through the {input} variable at invoke
    # time, so it needs no escaping.
    messages = [system_prompt1, system_prompt2, user_prompt, ("user", "{input}")]

    num_tokens = len(encoding.encode(str(messages)))
    print("Num tokens:", num_tokens)

    prompt = ChatPromptTemplate.from_messages(messages)
    chain = prompt | llm | StrOutputParser()

    y_true[attack_type] = []
    y_pred[attack_type] = []

    def _classify_rows(samples_df, true_label):
        """Query the model for each held-out row of ``samples_df``."""
        for i in range(train_set_size, train_set_size + test_set_size):
            y_true[attack_type].append(true_label)
            y_pred[attack_type].append(chain.invoke({"input": "Predict attack or benign: " +
                                        str(samples_df.iloc[i].values.tolist())}))
            print(i-train_set_size+1, end=" ")
            print(y_pred[attack_type][-1])

    print("Predicting benign samples...")
    _classify_rows(benign_df, "BENIGN")

    print("\nPredicting attack samples...")
    _classify_rows(attack_df, "ATTACK")

    # Score extracted labels: raw free-text responses never equal the
    # 'BENIGN'/'ATTACK' ground truth, which would make the report meaningless.
    predicted_labels = [_extract_label(response) for response in y_pred[attack_type]]
    print(classification_report(y_true[attack_type], predicted_labels))

In [14]:
# Trial run: the [:1] slice restricts the loop to the first entry of
# attack_types, so predict() is called for a single attack type only.
for attack_type in attack_types[:1]:
    print(f"Predicting for attack type: {attack_type}")
    predict(attack_type)

Predicting for attack type: DDoS_HTTP
Num tokens: 2359
Predicting benign samples...
1 Human: Predict attack or benign: [' 2021 20:55:46.721770000 ', '192.168.0.101', '192.168.0.128', '0', '0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0']
2 Human: Predict attack or benign: [' 2021 16:23:11.253690000 ', '192.168.0.128', '192.168.0.101', '0', '0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '0.0'

In [16]:
# Write the collected model responses (keyed by attack type) to y_pred.json.
with open('y_pred.json', 'w') as f:
    f.write(json.dumps(y_pred))

### Round 2

In [7]:
from langchain_community.chat_models import ChatOllama
from langchain_community.llms import Ollama
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
import tiktoken
import json
from sklearn.metrics import classification_report

# Round 2 settings: number of few-shot rows per class placed in the prompt,
# number of rows per class reserved for testing, and the Ollama model tag.
train_set_size, test_set_size = 25, 25
model = "gemma:7b"

# Start this round with empty label/response stores (two separate dicts).
y_true, y_pred = {}, {}

def predict(attack_type):
    """Assemble the Round 2 few-shot chat prompt for ``attack_type`` and print it.

    Unlike a single block of labelled samples, each example is written as its
    own user message (the row) followed by an ai message (its label),
    alternating BENIGN and ATTACK rows for the first ``train_set_size`` rows.
    The message list ends with a user message holding the ``{input}`` slot.

    This version only prints the message list and its token count (measured
    with the gpt-3.5-turbo tiktoken encoding). It does not build a chain,
    does not query the model, and leaves ``y_true`` / ``y_pred`` untouched.

    Args:
        attack_type: key into random_samples_ddos_dfs / random_samples_normal_dfs.
    """
    # Instantiated as before, although nothing below sends a request to it.
    llm = Ollama(model=model, temperature=0)
    encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

    with open('prompts.json') as f:
        instructions = json.load(f)["0"]
    field_names = random_samples_ddos_dfs[attack_type].columns.tolist()

    messages = [
        ("system", instructions),
        ("system", f"The samples has following fields respectively {field_names}."),
    ]
    for row in range(train_set_size):
        messages.extend([
            ("user", str(random_samples_normal_dfs[attack_type].iloc[row].values.tolist())),
            ("ai", "BENIGN"),
            ("user", str(random_samples_ddos_dfs[attack_type].iloc[row].values.tolist())),
            ("ai", "ATTACK"),
        ])
    messages.append(("user", "{input}"))

    print("Messages:", messages)
    print("Num tokens:", len(encoding.encode(str(messages))))

In [8]:
# Trial run: the [:1] slice restricts the loop to the first entry of
# attack_types. predict resolves to whichever definition was executed most
# recently in the kernel.
for attack_type in attack_types[:1]:
    print(f"Predicting for attack type: {attack_type}")
    predict(attack_type)

Predicting for attack type: DDoS_HTTP
Messages: [('system', "You are intelligent network log analyzer. User gives you samples of benign and attack iot network data. Analyze the samples and check whether the user given data is benign or not. Output the label 'ATTACK' or 'BENIGN', nothing else."), ('system', "The samples has following fields respectively ['frame.time', 'ip.src_host', 'ip.dst_host', 'arp.dst.proto_ipv4', 'arp.src.proto_ipv4', 'http.file_data', 'http.content_length', 'http.request.uri.query', 'http.request.method', 'http.referer', 'http.request.full_uri', 'http.request.version', 'http.response', 'tcp.ack', 'tcp.ack_raw', 'tcp.checksum', 'tcp.connection.fin', 'tcp.connection.rst', 'tcp.connection.syn', 'tcp.connection.synack', 'tcp.dstport', 'tcp.flags', 'tcp.flags.ack', 'tcp.len', 'tcp.options', 'tcp.payload', 'tcp.seq', 'tcp.srcport', 'dns.qry.name.len', 'mqtt.conack.flags', 'mqtt.msg', 'mqtt.protoname', 'mqtt.topic']."), ('user', "[' 2021 12:18:05.921747000 ', '192.168.0

In [10]:
# Show the DDoS_HTTP sample rows at positions 25..49, each as a plain Python list.
for i in range(25, 50):
    print(random_samples_ddos_dfs["DDoS_HTTP"].iloc[i].values.tolist())

[' 2021 11:35:50.114362000 ', '192.168.0.170', '192.168.0.128', '0', '0', '0', 0.0, '0.0', '0', '0', '0', '0', 0.0, 1.0, 2933067421.0, 56366.0, 0.0, 0.0, 0.0, 0.0, 80.0, 24.0, 1.0, 32.0, '0101080a0aa4ce831dd5240c', '582d444f6b6f335070764c69574347516a4566323a2034774653775764710d0a', 382.0, '57056.0', '0.0', '0.0', '0.0', '0.0', '0.0']
[' 2021 11:36:44.022287000 ', '192.168.0.128', '192.168.0.170', '0', '0', '0', 0.0, '0.0', '0', '0', '0', '0', 0.0, 500.0, 2008827281.0, 25635.0, 0.0, 0.0, 0.0, 0.0, 32876.0, 16.0, 1.0, 0.0, '0101080a1dd602430aa5a15b', '0', 1.0, '80.0', '0.0', '0.0', '0.0', '0.0', '0.0']
[' 2021 11:37:20.450187000 ', '192.168.0.170', '192.168.0.128', '0', '0', '0', 0.0, '0.0', '0', '0', '0', '0', 0.0, 0.0, 0.0, 39477.0, 0.0, 0.0, 1.0, 0.0, 80.0, 2.0, 0.0, 0.0, '020405b40402080a0aa62ff70000000001030307', '0', 0.0, '38922.0', '0.0', '0.0', '0.0', '0.0', '0.0']
[' 2021 11:37:52.054638000 ', '192.168.0.170', '192.168.0.128', '0', '0', '0', 0.0, '0.0', '0', '0', '0', '0', 0.0, 