# Introduction & Imports
In this project, we will attempt to generate synthetic network traffic using large language models and evaluate their performance.

In [2]:
# ===============================
# General Libraries
# ===============================
from classes import *
import datetime

# ===============================
# Data Acquisition and Cleaning
# ===============================
import pandas as pd  # Data manipulation
import openai  # OpenAI API
import os  # Operating system interface
import json  # JSON file handling

# ===============================
# Synthetic Data Generation
# ===============================

# Langchain - General
from langchain.chat_models import ChatOpenAI
from langchain.llms import LlamaCpp, VertexAI, Cohere  # LLMs
from langchain.schema import AIMessage, HumanMessage, SystemMessage  # Schema
from langchain.prompts import PromptTemplate  # General Prompt Template
from langchain.chains import LLMChain  # LLM Chains

# Langchain - Prompts
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)

# Langchain - Callbacks
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

# Data Acquisition and Cleaning
In this section, we will import example network traffic data and store it in our model. We will also use `pandas` to display the data in a table.

In [3]:
# Import from CSV with columns "No.","Time","Source","Destination","Protocol","Length","Info","sport","dport"
in_data = pd.read_csv("data/4_1.csv",
                      encoding='unicode_escape',
                      usecols=["No.","Time","Source","Destination","Protocol","Length","Info","sport","dport"],
                      dtype={"No.": int, "Time": float, "Source": str, "Destination": str, "Protocol": str, "Length": 'Int64', "Info": str, "sport": 'Int64', "dport": 'Int64'})

# Drop data with missing fields
in_data = in_data.dropna(subset=["Source", "Destination", "sport", "dport", "Protocol", "Length", "Info"])

# Convert to list of NetworkPacket
packets = []

def addPacketToList(row):
    packets.append(NetworkPacket(
    time=row["Time"],
    src_ip=row["Source"],
    dst_ip=row["Destination"],
    src_port=row["sport"],
    dst_port=row["dport"],
    protocol=row["Protocol"],
    length=row["Length"],
    payload=row["Info"]
))
    
in_data.apply(addPacketToList, axis=1)


0          None
1          None
2          None
3          None
4          None
           ... 
999996     None
999997     None
999998     None
999999     None
1000000    None
Length: 999620, dtype: object

In [5]:
# Print the first 30 packets.
pd.DataFrame.from_dict([packet.dict() for packet in packets[:30]])

Unnamed: 0,time,src_ip,dst_ip,src_port,dst_port,protocol,length,payload,conversationID
0,0.0,59.166.0.8,149.171.126.2,24272,80,TCP,68,24272 > 80 [ACK] Seq=1 Ack=1 Win=34752 Len=0...,
1,9e-06,59.166.0.8,149.171.126.2,24272,80,TCP,68,[TCP Dup ACK 1#1] 24272 > 80 [ACK] Seq=1 Ack...,
2,0.000174,149.171.126.2,59.166.0.8,80,24272,TCP,1516,"80 > 24272 [PSH, ACK] Seq=1 Ack=1 Win=7240 L...",
3,0.000177,149.171.126.2,59.166.0.8,80,24272,TCP,1516,"[TCP Retransmission] 80 > 24272 [PSH, ACK] S...",
4,0.002172,149.171.126.1,59.166.0.5,80,41355,TCP,1516,"80 > 41355 [PSH, ACK] Seq=1 Ack=1 Win=7240 L...",
5,0.002182,149.171.126.1,59.166.0.5,80,41355,TCP,1516,"[TCP Retransmission] 80 > 41355 [PSH, ACK] S...",
6,0.002394,59.166.0.5,149.171.126.1,41355,80,TCP,68,41355 > 80 [ACK] Seq=1 Ack=1449 Win=65160 Le...,
7,0.002399,59.166.0.5,149.171.126.1,41355,80,TCP,68,[TCP Dup ACK 7#1] 41355 > 80 [ACK] Seq=1 Ack...,
8,0.002973,59.166.0.9,149.171.126.8,42238,5190,TCP,104,"42238 > 5190 [PSH, ACK] Seq=1 Ack=1 Win=6516...",
9,0.002979,59.166.0.9,149.171.126.8,42238,5190,TCP,104,"[TCP Retransmission] 42238 > 5190 [PSH, ACK]...",


# Synthetic Data Generation

Now that we have example data, we will pass it to the LLMs and prompt them to generate synthetic data.

## Defining LLMs

Here is a list of the models we will use.

| Model | Model Type | Creator | Runs on | 
| --- | --- | --- | --- |
| GPT-3.5 | Chat | OpenAI | Cloud |
| GPT-4 | Chat | OpenAI | Cloud |
| LLaMA 2 13B | Completion | Facebook | Locally |
| PaLM 2 | Both (but we will use Completion) | Google | Cloud |

Originally, Cohere was going to be used, but it had a maximum context window of 512 tokens, making it impractical for our use case. PaLM 2 is not in use right now because we are having trouble sourcing API keys.

Next, let's make a list with each model and its type (chat or completion).

# Add UUIDs to prompts
We will give each prompt a UUID so that it can be uniquely identified.

In [5]:
# # Add UUIDs to prompts that don't have them
# import uuid
# import json
# with open("data/prompts.json", "r") as f:
#     prompts_json = json.load(f)
#     for prompt in prompts_json:
#         # only if it doesn't have a UUID
#         if "id" not in prompt:
#             prompt["id"] = str(uuid.uuid4())
#     with open("data/prompts.json", "w") as f:
#         json.dump(prompts_json, f, indent=4)



# Add jobs to Redis Queue
We will add 10 jobs for each prompt and model to the Redis queue. Right now, we are only prompting on GPT-3.5, GPT-4, and LLaMA.

In [6]:
# # Add jobs to Redis queue

# from redis import Redis
# from rq import Queue
# from worker import run_model

# gpt_queue = Queue('gpt', connection=Redis())
# llama_queue = Queue('llama', connection=Redis())

# # Run the models ten times for each prompt
# for i in range(0, 10):
#     with open("data/prompts.json", "r") as f:
#         prompts_json = json.load(f)
#         for prompt in prompts_json:
#             gpt_queue.enqueue(
#                 run_model,
#                 in_packets = packets[:10],
#                 prompt_obj = prompt,
#                 model = "gpt-35",
#                 result_ttl=-1,
#             )
#             gpt_queue.enqueue(
#                 run_model,
#                 in_packets = packets[:10],
#                 prompt_obj = prompt,
#                 model = "gpt-4",
#                 result_ttl=-1,
#             )
#             llama_queue.enqueue(
#                 run_model,
#                 in_packets = packets[:10],
#                 prompt_obj = prompt,
#                 model = "llama2-13b",
#                 result_ttl=-1,
#             )

# Try preprocessing steps
We will try preprocessing the data to see if it improves the quality of the synthetic data.

In [18]:
# Define a function that enumerates IP addresses and ports instead of storing their actual data
ip_dict = {}
def convertIPsToTokens(packets):
    # Create a dictionary of IP addresses and ports
    for packet in packets:
        if packet.src_ip not in ip_dict:
            ip_dict[packet.src_ip] = len(ip_dict)
        if packet.dst_ip not in ip_dict:
            ip_dict[packet.dst_ip] = len(ip_dict)
    # Replace IP addresses and ports with their enumeration
    for packet in packets:
        packet.src_ip = ip_dict[packet.src_ip]
        packet.dst_ip = ip_dict[packet.dst_ip]
    return packets

# Label conversations
def labelConversations(packets):
    # Create a dictionary of conversations
    conversation_dict = {}
    # If we force a specific sorting of IP addresses and ports, we can ensure that the same conversation is always labeled the same way.
    for packet in packets:
        if packet.src_ip < packet.dst_ip:
            conversation = (packet.src_ip, packet.dst_ip, packet.src_port, packet.dst_port)
        else:
            conversation = (packet.dst_ip, packet.src_ip, packet.dst_port, packet.src_port)
        if conversation not in conversation_dict:
            conversation_dict[conversation] = len(conversation_dict)
        packet.conversationID = conversation_dict[conversation]
    return packets

def removePortPairsFromPayload(packets):
    for packet in packets:
        packet.payload = packet.payload.replace(f"{packet.src_port}  >  {packet.dst_port}", "")
    return packets

# Create a separate dataframe to try this on
packets2 = packets[:1000]

# Convert IP addresses and ports to tokens
packets2 = convertIPsToTokens(packets2)

# Remove port pairs from payload
packets2 = removePortPairsFromPayload(packets2)

# Add conversation IDs
packets2 = labelConversations(packets2)

# Print the first 30 packets.
pd.DataFrame.from_dict([packet.dict() for packet in packets2[:30]])

Unnamed: 0,time,src_ip,dst_ip,src_port,dst_port,protocol,length,payload,conversationID
0,0.0,0,1,24272,80,TCP,68,[ACK] Seq=1 Ack=1 Win=34752 Len=0 TSval=40001...,0
1,9e-06,0,1,24272,80,TCP,68,[TCP Dup ACK 1#1] [ACK] Seq=1 Ack=1 Win=34752...,0
2,0.000174,1,0,80,24272,TCP,1516,"[PSH, ACK] Seq=1 Ack=1 Win=7240 Len=1448 TSva...",0
3,0.000177,1,0,80,24272,TCP,1516,"[TCP Retransmission] [PSH, ACK] Seq=1 Ack=1 W...",0
4,0.002172,2,3,80,41355,TCP,1516,"[PSH, ACK] Seq=1 Ack=1 Win=7240 Len=1448 TSva...",1
5,0.002182,2,3,80,41355,TCP,1516,"[TCP Retransmission] [PSH, ACK] Seq=1 Ack=1 W...",1
6,0.002394,3,2,41355,80,TCP,68,[ACK] Seq=1 Ack=1449 Win=65160 Len=0 TSval=40...,1
7,0.002399,3,2,41355,80,TCP,68,[TCP Dup ACK 7#1] [ACK] Seq=1 Ack=1449 Win=65...,1
8,0.002973,4,5,42238,5190,TCP,104,"[PSH, ACK] Seq=1 Ack=1 Win=65160 Len=36 TSval...",2
9,0.002979,4,5,42238,5190,TCP,104,"[TCP Retransmission] [PSH, ACK] Seq=1 Ack=1 W...",2


# Get job results
Once the jobs are done, we can run this cell to get the results and export them to CSV.

In [12]:
""" # Get results
from redis import Redis
from rq import Queue

gpt_queue = Queue('gpt', connection=Redis())
llama_queue = Queue('llama', connection=Redis())

results = []

gpt_job_ids = gpt_queue.finished_job_registry.get_job_ids()
llama_job_ids = llama_queue.finished_job_registry.get_job_ids()

# Get the results
for job_id in gpt_job_ids:
    job = gpt_queue.fetch_job(job_id)
    results.append({
        "Prompt Name": job.kwargs["prompt_obj"]["name"],
        "Prompt": job.kwargs["prompt_obj"]["prompt"],
        "Model": job.kwargs["model"],
        "Input": job.kwargs["in_packets"],
        "Output": job.result
    })

for job_id in llama_job_ids:
    job = llama_queue.fetch_job(job_id)
    results.append({
        "Prompt Name": job.kwargs["prompt_obj"]["name"],
        "Prompt": job.kwargs["prompt_obj"]["prompt"],
        "Model": job.kwargs["model"],
        "Input": job.kwargs["in_packets"],
        "Output": job.result
    })

# Output to CSV
df = pd.DataFrame(results)
df.to_csv("data/output-new.csv", index=False)
df.to_json("data/output-new.json", orient="records", indent=4) """

In [80]:
# load data with pandas
df = pd.read_json("data/output-new.json")
import ipywidgets as widgets
from ipywidgets import interact, interact_manual
from itables import show
import itables.options as opt
import mercury as mr


In [82]:
# try to load NetworkPackets from the output (should have string repr)
# NetworkPacket(time=0.003000, src_ip=1, dst_ip=5, src_port=24272, dst_port=80, protocol='TCP', length=68, payload='[ACK] Seq=2 Ack=2 Win=34752 Len=0 TSval=4000192677 TSecr=4000176626')

# Define a function that converts the string representation of a NetworkPacket to a NetworkPacket object
import re

def convertStringToNetworkPacket(kv_pair):
    # Split the string into a list
    kv_pairs = kv_pair.split(", ")
    # Convert the string list to a dictionary
    obj = {}
    for kv_pair in kv_pairs:
        # Find the stuff before the = and the stuff after (may be many after, because of the payload)
        kv_pair = kv_pair.split("=")
        
        # Join all the stuff after the = back together
        tmp_result = "=".join(kv_pair[1:])
        
        # If there is a tmp result...
        if tmp_result:
            # Check for single quotes and remove if present
            if tmp_result[0] == "'":
                tmp_result = tmp_result[1:-1]
            
            # Check if parseable as a float
            try:
                tmp_result = float(tmp_result)
            except:
                pass

        # Set the dictionary value
        obj[kv_pair[0]] = tmp_result

    # Convert the dictionary to a NetworkPacket object
    return NetworkPacket(**obj)

def convertOutputToNetworkPacketList(output):
    # Accepts a free-form text string and returns a list of NetworkPacket objects, if any
    # Search for NetworkPacket() in the string
    results = []
    if "NetworkPacket(" in output:
        # Find text matching NetworkPacket(something) with regex
        regex = r"NetworkPacket\((.*?)\)"
        matches = re.findall(regex, output)
        # Convert each match to a NetworkPacket object
        for i in range(len(matches)):
            try:
                convertedPacket = convertStringToNetworkPacket(matches[i])
                results.append(convertedPacket)
            except:
                pass
        return results
    else:
        return []  
    
# for each output... convert the string representation of a NetworkPacket to a NetworkPacket object
df["Parsed"] = df["Output"].apply(convertOutputToNetworkPacketList)

In [15]:
@interact
def show_prompt(prompt_name=widgets.Select(
    options=df["Prompt Name"].unique().tolist(),
    description="Prompt Name",
    disabled=False,
)):
    # print the prompt
    print("PROMPT: " + df[df["Prompt Name"] == prompt_name]["Prompt"].iloc[0])
    print("=========================================")
    print("=========================================")
    
    # for each result, print the input and output in a block
    for index, row in df[df["Prompt Name"] == prompt_name].iterrows():
        widgets.HTML(f"<h1>{row['Model']}</h1>")
        print("MODEL: " + row["Model"])
        print("OUTPUT:")
        print(row["Output"])

        # print spacer
        print("=========================================")
        print("=========================================")

interactive(children=(Select(description='Prompt Name', options=('Dialogue', 'Anonymized Retransmission', 'Noi…

In [46]:
model_select = widgets.SelectMultiple(
    options=df["Model"].unique().tolist(),
    description="Model",
    disabled=False,
)
@interact
def show_results_for_models(models=model_select):
    return show(df[df["Model"].isin(models)])

interactive(children=(SelectMultiple(description='Model', options=('gpt-4', 'gpt-35', 'llama2-13b'), value=())…

In [55]:
# Count average generated packets for different models
df["Parsed Packets"] = df["Parsed"].apply(len)

# Show average generated packets for different models
show(df[["Parsed Packets", "Model"]].groupby(["Model"]).mean().reset_index())

# Show average generated packets for different prompts
show(df[["Parsed Packets", "Prompt Name"]].groupby(["Prompt Name"]).mean().reset_index())

# Show average generated packets for different prompts and models
show(df[["Parsed Packets", "Prompt Name", "Model"]].groupby(["Prompt Name", "Model"]).mean().reset_index())

Model,Parsed Packets
Loading... (need help?),


Prompt Name,Parsed Packets
Loading... (need help?),


Prompt Name,Model,Parsed Packets
Loading... (need help?),,


In [83]:
# Convert Input back to NetworkPacket objects
df["Input"] = df["Input"].apply(lambda x: [NetworkPacket(**packet) for packet in x])

In [84]:
def getPacketCount(packets):
    return len(packets)

def getAverageConversationLength(packets):
    # Create a dictionary of conversations
    conversation_dict = {}
    for packet in packets:
        if packet.src_ip < packet.dst_ip:
            conversation = (packet.src_ip, packet.dst_ip, packet.src_port, packet.dst_port)
        else:
            conversation = (packet.dst_ip, packet.src_ip, packet.dst_port, packet.src_port)
        if conversation not in conversation_dict:
            conversation_dict[conversation] = 0
        conversation_dict[conversation] += 1
    # Return the average conversation length
    if len(conversation_dict) == 0:
        return 0
    return sum(conversation_dict.values()) / len(conversation_dict)
    
# Create a function that takes a list of NetworkPacket objects and returns the time delta between the start and end of the dump.
def getTimeDelta(packets):
    if(len(packets) == 0):
        return 0
    return packets[-1].time - packets[0].time

# Create a function that takes a list of NetworkPacket objects and counts the number of unique IP addresses.
def getUniqueIPs(packets):
    ip_set = set()
    for packet in packets:
        ip_set.add(packet.src_ip)
        ip_set.add(packet.dst_ip)
    return len(ip_set)

# Create a function that takes a list of NetworkPacket objects and counts the number of unique ports.
def getUniquePorts(packets):
    port_set = set()
    for packet in packets:
        port_set.add(packet.src_port)
        port_set.add(packet.dst_port)
    return len(port_set)

# Run the functions on input packets and on the parsed packets. Find the difference between the two; that's the difference score (lower is better).



# Apply the function to the input packets and the parsed packets
df["Parsed Packets"] = df["Parsed"].apply(getPacketCount)

# Represents the difference in the average conversation length between the input and the parsed packets
# So, a positive number means that the parsed packets' conversations were, on average, longer than the input packets' conversations
# A negative number means that the parsed packets' conversations were, on average, shorter than the input packets' conversations
df["Average Conversation Length (Input)"] = df["Input"].apply(getAverageConversationLength)

# Because we may have parsed less packets than we input, let's multiply the average conversation length by the ratio of parsed packets to input packets
df["Average Conversation Length (Parsed)"] = df["Parsed"].apply(getAverageConversationLength) * (df["Parsed Packets"] / df["Input"].apply(getPacketCount))

# Now, we can find the difference between the two
df["Average Conversation Length (Delta)"] = df["Average Conversation Length (Parsed)"] - df["Average Conversation Length (Input)"]

# Represents the difference in the time delta between the input and the parsed packets
# So, a positive number means that the parsed packets represented a larger timespan than the input packets
# A negative number means that the parsed packets represented a smaller timespan than the input packets
df["Time Delta (Input)"] = df["Input"].apply(getTimeDelta)

# Because we may have parsed less packets than we input, let's multiply the time delta by the ratio of parsed packets to input packets
df["Time Delta (Parsed)"] = df["Parsed"].apply(getTimeDelta) * (df["Parsed Packets"] / df["Input"].apply(getPacketCount))

df["Time Delta (Delta)"] = df["Time Delta (Parsed)"] - df["Time Delta (Input)"]

# Represents the difference in the number of unique IPs between the input and the parsed packets
# So, a positive number means that the parsed packets represented more unique IPs than the input packets
# A negative number means that the parsed packets represented fewer unique IPs than the input packets
df["Unique IPs (Input)"] = df["Input"].apply(getUniqueIPs)
# Because we may have parsed less packets than we input, let's multiply the number of unique IPs by the ratio of parsed packets to input packets
df["Unique IPs (Parsed)"] = df["Parsed"].apply(getUniqueIPs) * (df["Parsed Packets"] / df["Input"].apply(getPacketCount))
df["Unique IPs (Delta)"] = df["Unique IPs (Parsed)"] - df["Unique IPs (Input)"]

# Represents the difference in the number of unique ports between the input and the parsed packets
# So, a positive number means that the parsed packets represented more unique ports than the input packets
# A negative number means that the parsed packets represented fewer unique ports than the input packets
df["Unique Ports (Input)"] = df["Input"].apply(getUniquePorts)
# Because we may have parsed less packets than we input, let's multiply the number of unique ports by the ratio of parsed packets to input packets
df["Unique Ports (Parsed)"] = df["Parsed"].apply(getUniquePorts) * (df["Parsed Packets"] / df["Input"].apply(getPacketCount))
df["Unique Ports (Delta)"] = df["Unique Ports (Parsed)"] - df["Unique Ports (Input)"]

# Create a summary score where we add up the absolute errors
df["Summary Error"] = df[["Average Conversation Length (Delta)", "Time Delta (Delta)", "Unique IPs (Delta)", "Unique Ports (Delta)"]].abs().sum(axis=1)

# Show deltas and parsed packets for different models
show(df[["Parsed Packets", "Average Conversation Length (Delta)", "Time Delta (Delta)", "Unique IPs (Delta)", "Unique Ports (Delta)", "Summary Error", "Model"]].groupby(["Model"]).mean().reset_index())

# Show deltas and parsed packets for different prompts and models
# show(df[["Parsed Packets", "Average Conversation Length (Delta)", "Time Delta (Delta)", "Unique IPs (Delta)", "Unique Ports (Delta)", "Summary Score", "Prompt Name", "Model"]].groupby(["Prompt Name", "Model"]).mean().reset_index())

KeyError: "['Summary Score'] not in index"