<a href="https://colab.research.google.com/github/Yxin/NTU-Msc-ID-project/blob/main/idproject_llm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Setting up tools to process pcap

In [None]:
!pip install scapy

Extract relevant data from pcap into csv (testing and analysing)

In [None]:
from scapy.all import *
from datetime import datetime
import csv
import pandas as pd

pcap_file = './sample5.pcap'
packets = rdpcap(pcap_file)

# Column order shared by the CSV file and by every entry appended to `row`.
headers = [
    'timestamp', 'ip_src', 'destination_ip',
    'source_port',
    'destination_port', 'packet_size',
    'dns_query_name', 'dns_query_type',
    'dns_transaction_id', 'dns_respond_code'
]

# One list per kept packet, in `headers` order. The name `row` is kept because
# later cells build the DataFrame from it.
row = []
for packet in packets:
    # Only UDP datagrams are of interest. packet[IP] below raises on frames
    # without an IPv4 layer (e.g. UDP carried over IPv6), so skip those as well.
    if UDP not in packet or IP not in packet:
        continue

    timestamp = datetime.fromtimestamp(int(packet.time))
    source_ip = packet[IP].src
    destination_ip = packet[IP].dst
    source_port = packet[UDP].sport
    destination_port = packet[UDP].dport
    packet_size = len(packet[UDP].payload)

    # Reset the DNS fields for every packet. Previously they were assigned only
    # inside the DNS branch, so a non-DNS UDP packet either raised NameError
    # (when it came first) or silently inherited the previous packet's values.
    # '' leaves the CSV cell blank and, unlike None, does not make pandas
    # coerce the integer DNS columns to float.
    query_name = query_type = transaction_id = respond_code = ''
    if packet.haslayer(DNS):
        dns_layer = packet[DNS]
        transaction_id = dns_layer.id
        respond_code = dns_layer.rcode

        # The question section can be missing. NOTE(review): some scapy
        # releases appear to expose `qd` as a list of questions instead of a
        # single record; both shapes are handled here - confirm against the
        # installed version.
        question = dns_layer.qd
        if isinstance(question, (list, tuple)):
            question = question[0] if question else None
        if question is not None:
            query_name = question.qname
            query_type = question.qtype

    row.append([
        timestamp, source_ip, destination_ip,
        source_port,
        destination_port, packet_size,
        query_name, query_type, transaction_id, respond_code
    ])

# write to CSV for verification
with open('raw_network_data.csv', mode='w', newline='') as file:
    writer = csv.writer(file, delimiter=',')
    writer.writerow(headers)
    writer.writerows(row)

# Number of packets extracted
print(len(row))

Install the pandas library for analytics



In [None]:
!pip install pandas

Feed the extracted data into dataframe

In [None]:
import pandas as pd

# Load the extracted packet rows into a DataFrame, labelling the columns with `headers`.
df = pd.DataFrame(data=row, columns=headers)

# Sanity check: number of rows loaded into the DataFrame.
print(df.shape[0])

Convert dataframe into readable format

In [None]:
# Function to convert a dataframe row into an LLM readable log line
def convert_to_concise_format(row):
    """Render one packet record as a single compact log line.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing the keys
            'timestamp', 'ip_src', 'source_port', 'destination_ip',
            'destination_port', 'dns_query_name', 'dns_query_type' and
            'dns_respond_code'.

    Returns:
        A string of the form
        "<timestamp> Src: <ip>:<port>, Dst: <ip>:<port>, Query: <name>, Type: <type>, Code: <code>".
    """
    when = row['timestamp']
    source = f"{row['ip_src']}:{row['source_port']}"
    destination = f"{row['destination_ip']}:{row['destination_port']}"
    dns_part = (
        f"Query: {row['dns_query_name']}, "
        f"Type: {row['dns_query_type']}, "
        f"Code: {row['dns_respond_code']}"
    )
    return f"{when} Src: {source}, Dst: {destination}, {dns_part}"

# Turn every DataFrame row into one log line, keeping the original row order.
formatted_rows = []
for _index, record in df.iterrows():
    formatted_rows.append(convert_to_concise_format(record))

# Show the first formatted entry to verify the output format.
print(formatted_rows[0])

Configure HF_TOKEN with secret token

https://huggingface.co/docs/hub/en/security-tokens

In [None]:
import os

from google.colab import userdata

# Export the Colab secret as the HF_TOKEN environment variable so Hugging Face
# downloads can authenticate. The value is assigned instead of being left as
# the cell's last expression, which would render the secret token in the
# notebook output.
os.environ['HF_TOKEN'] = userdata.get('HF_TOKEN')

Install LLM packages

In [None]:
!pip3 install transformers accelerate optimum
!pip3 install autoawq

In [None]:
!pip show transformers
!pip show accelerate
!pip show optimum
!pip show autoawq

Run processed data through LLM with designed prompts

In [None]:
# import tokenizer and automodel
from transformers import AutoTokenizer, AutoModelForCausalLM

# Target device for the model and for the tokenised prompts in later cells.
device = "cuda"

# Hugging Face checkpoint to load. Other checkpoints tried previously:
#   "mistralai/Mistral-7B-v0.1"
#   "mistralai/Mistral-7B-Instruct-v0.2"
#   "mistralai/Mixtral-8x7B-v0.1"
# Kept under its own name so that `model` always refers to the loaded model
# object and never to the checkpoint string.
model_name = "TheBloke/Mistral-7B-OpenOrca-AWQ"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForCausalLM.from_pretrained(model_name, device_map=device) # device map is for parallelism across multiple gpu
model = AutoModelForCausalLM.from_pretrained(model_name)

# Move the weights to the target device (the cell output shows the model structure).
model.to(device)

Test prompting

In [None]:
# Test prompting
initial_prompt = "You are a network admin reviewing logs to identify hacking activity in the log"

# Dynamic row of logs retrieval: first 75 formatted entries, each followed by a space.
# Built with a generator so the notebook-level `row` list (the extracted packet
# rows) is not overwritten by a loop variable of the same name.
log_text = "".join(entry + " " for entry in formatted_rows[:75])

# This test uses a single hard-coded log entry instead of log_text.
#initial_prompt += log_text
initial_prompt += " 2020-10-23 08:37:05 Src: 10.0.2.4, Dst: 10.0.2.1, Dst_Port: 53, Type: 12, Query: b'15.3.0.10.in-addr.arpa.', Code: 0"

initial_prompt += "\nQuestion:Analyse this log entries, What activity is happening? Explain your decision."

# Tokenise the prompt, sample up to 1000 new tokens and decode the first sequence.
initial_model_inputs = tokenizer([initial_prompt], return_tensors="pt").to(device)
initial_model_outputs = model.generate(**initial_model_inputs, max_new_tokens=1000, do_sample=True)
initial_response = tokenizer.batch_decode(initial_model_outputs)[0]
# "\n" replaces the invalid escape "\A", which printed a literal backslash.
print("\nAnswer: ", initial_response)

Prompt Scenario No. 1 - Zero shot prompting

In [None]:
# Setup zero shot prompting
initial_prompt = "You are a network admin reviewing logs to identify hacking activity in the log"

# Dynamic row of logs retrieval (set to 75): each entry is followed by a space.
# Built with a generator so the notebook-level `row` list (the extracted packet
# rows) is not overwritten by a loop variable of the same name.
log_text = "".join(entry + " " for entry in formatted_rows[:75])

initial_prompt += "\nAnalyse this log entries. " + log_text
initial_prompt += "\nQuestion: What is likely be the hacking activity? Explain your answer."

# Tokenise the prompt, sample up to 1000 new tokens and decode the first sequence.
initial_model_inputs = tokenizer([initial_prompt], return_tensors="pt").to(device)
initial_model_outputs = model.generate(**initial_model_inputs, max_new_tokens=1000, do_sample=True)
initial_response = tokenizer.batch_decode(initial_model_outputs)[0]
# "\n" replaces the invalid escape "\A", which printed a literal backslash.
print("\nAnswer: ", initial_response)

Prompt Scenario No. 2 - Few shot prompting

In [None]:
# Setup few shot prompting
initial_prompt = "You are a network admin reviewing logs to identify hacking activity in the log"

# Dynamic row of logs retrieval (set to 75): each entry is followed by a space.
# Built with a generator so the notebook-level `row` list (the extracted packet
# rows) is not overwritten by a loop variable of the same name.
log_text = "".join(entry + " " for entry in formatted_rows[:75])

initial_prompt += "\nAnalyse this log entries. " + log_text
# Guiding questions appended after the log entries.
initial_prompt += "\nData Analysis: What does this specific DNS query suggest about network activity? Could it indicate a pattern of reconnaissance?"
initial_prompt += "\nScenario Evaluation: Imagine a sequence of DNS queries where each query targets a different IP address within the same subnet, all directed to port 53, occurring within minutes of each other. Assess the likelihood that this pattern represents network scanning or reconnaissance activity."
initial_prompt += "\nImplications: Given multiple sequential DNS reverse lookup queries targeting each IP in a subnet without any operational justification, what security risks might this indicate?"

# Tokenise the prompt, sample up to 1000 new tokens and decode the first sequence.
initial_model_inputs = tokenizer([initial_prompt], return_tensors="pt").to(device)
initial_model_outputs = model.generate(**initial_model_inputs, max_new_tokens=1000, do_sample=True)
initial_response = tokenizer.batch_decode(initial_model_outputs)[0]
# "\n" replaces the invalid escape "\A", which printed a literal backslash.
print("\nAnswer: ", initial_response)

Prompt Scenario No. 3 - Chain of Thought Prompting

In [None]:
# Setup chain of thought prompting
# Each step's decoded response is used as the start of the next step's prompt.

# Step 1: explain the reverse-DNS notation.
initial_prompt = "You are a network admin reviewing logs to identify hacking activity in the log"
# Appended with += : the original used "=", which discarded the role line above.
initial_prompt += "\nA log entry contains a reverse DNS lookup query noted as 'b'0.2.0.10.in-addr.arpa.'. Explain how to interpret this notation to understand the original IP address being queried."

initial_model_inputs = tokenizer([initial_prompt], return_tensors="pt").to(device)
initial_model_outputs = model.generate(**initial_model_inputs, max_new_tokens=1000, do_sample=True)
initial_response = tokenizer.batch_decode(initial_model_outputs)[0]
print("\nInitial answer: ", initial_response)

# Step 2: apply that understanding to a single log entry.
second_prompt = initial_response + "\nGiven the previous understanding, Analyse the following entry. '2020-10-23 08:37:05 Src: 10.0.2.4, Dst: 10.0.2.1, Dst_Port: 53, Type: 12, Query: b'15.3.0.10.in-addr.arpa.', Code: 0'"

second_model_inputs = tokenizer([second_prompt], return_tensors="pt").to(device)
second_model_outputs = model.generate(**second_model_inputs, max_new_tokens=1000, do_sample=True)
second_response = tokenizer.batch_decode(second_model_outputs)[0]
print("\nSecond answer: ", second_response)

# Step 3: reason about a sequence of such queries.
third_prompt = second_response + "\nScenario Evaluation: Imagine a sequence of DNS queries where each query targets a different IP address within the same subnet, all directed to port 53, occurring within minutes of each other. Assess the likelihood that this pattern represents network scanning or reconnaissance activity."
third_prompt += "\nImplications: Given multiple sequential DNS reverse lookup queries targeting each IP in a subnet without any operational justification, what security risks might this indicate?"

third_model_inputs = tokenizer([third_prompt], return_tensors="pt").to(device)
third_model_outputs = model.generate(**third_model_inputs, max_new_tokens=1000, do_sample=True)
third_response = tokenizer.batch_decode(third_model_outputs)[0]
# "\n" replaces the invalid escape "\T", which printed a literal backslash.
print("\nThird answer: ", third_response)

# Step 4: analyse the first 75 formatted log entries, each followed by a space.
# Built with a generator so the notebook-level `row` list (the extracted packet
# rows) is not overwritten by a loop variable of the same name.
log_text = "".join(entry + " " for entry in formatted_rows[:75])

fourth_prompt = third_response + "\nAnalyse this log entries. " + log_text
fourth_prompt += "\nCould it be network scanning activity is happening? It is likely to be port scanning, host scanning or OS scanning? Explain your decision."

fourth_model_inputs = tokenizer([fourth_prompt], return_tensors="pt").to(device)
fourth_model_outputs = model.generate(**fourth_model_inputs, max_new_tokens=1000, do_sample=True)
fourth_response = tokenizer.batch_decode(fourth_model_outputs)[0]
# "\n" replaces the invalid escape "\F", which printed a literal backslash.
print("\nFourth answer: ", fourth_response)
