# Introduction & Imports
In this project, we will attempt to generate synthetic network traffic using large language models and evaluate their performance.

In [7]:
# ===============================
# General Libraries
# ===============================
from classes import *
import datetime

# ===============================
# Data Acquisition and Cleaning
# ===============================
import pandas as pd  # Data manipulation
import openai  # OpenAI API
import os  # Operating system interface
import json  # JSON file handling

# ===============================
# Synthetic Data Generation
# ===============================

# Trulens
from trulens_eval import TruChain, Feedback, Huggingface, Tru
tru = Tru()

# Langchain - General
from langchain.chat_models import ChatOpenAI
from langchain.llms import LlamaCpp, VertexAI, Cohere  # LLMs
from langchain.schema import AIMessage, HumanMessage, SystemMessage  # Schema
from langchain.prompts import PromptTemplate  # General Prompt Template
from langchain.chains import LLMChain  # LLM Chains

# Langchain - Prompts
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)

# Langchain - Callbacks
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler



🦑 Tru initialized with db url sqlite:///default.sqlite .
🛑 Secret keys may be written to the database. See the `database_redact_keys` option of `Tru` to prevent this.


# Data Acquisition and Cleaning
In this section, we will import example network traffic data and store it in our model. We will also use `pandas` to display the data in a table.

In [8]:
# Import from CSV with columns "No.","Time","Source","Destination","Protocol","Length","Info","sport","dport"
in_data = pd.read_csv("data/4_1.csv",
                      encoding='unicode_escape',
                      usecols=["No.","Time","Source","Destination","Protocol","Length","Info","sport","dport"],
                      dtype={"No.": int, "Time": float, "Source": str, "Destination": str, "Protocol": str, "Length": 'Int64', "Info": str, "sport": 'Int64', "dport": 'Int64'})

# Drop data with missing fields
in_data = in_data.dropna(subset=["Source", "Destination", "sport", "dport", "Protocol", "Length", "Info"])

# Convert to list of NetworkPacket
packets = []

def addPacketToList(row):
    packets.append(NetworkPacket(
    time=row["Time"],
    src_ip=row["Source"],
    dst_ip=row["Destination"],
    src_port=row["sport"],
    dst_port=row["dport"],
    protocol=row["Protocol"],
    length=row["Length"],
    payload=row["Info"]
))
    
in_data.apply(addPacketToList, axis=1)


0          None
1          None
2          None
3          None
4          None
           ... 
999996     None
999997     None
999998     None
999999     None
1000000    None
Length: 999620, dtype: object

In [74]:
# Print the first 30 packets.
pd.DataFrame.from_dict([packet.dict() for packet in packets[:30]])

Unnamed: 0,time,src_ip,dst_ip,src_port,dst_port,protocol,length,payload
0,0.0,59.166.0.8,149.171.126.2,24272,80,TCP,68,24272 > 80 [ACK] Seq=1 Ack=1 Win=34752 Len=0...
1,9e-06,59.166.0.8,149.171.126.2,24272,80,TCP,68,[TCP Dup ACK 1#1] 24272 > 80 [ACK] Seq=1 Ack...
2,0.000174,149.171.126.2,59.166.0.8,80,24272,TCP,1516,"80 > 24272 [PSH, ACK] Seq=1 Ack=1 Win=7240 L..."
3,0.000177,149.171.126.2,59.166.0.8,80,24272,TCP,1516,"[TCP Retransmission] 80 > 24272 [PSH, ACK] S..."
4,0.002172,149.171.126.1,59.166.0.5,80,41355,TCP,1516,"80 > 41355 [PSH, ACK] Seq=1 Ack=1 Win=7240 L..."
5,0.002182,149.171.126.1,59.166.0.5,80,41355,TCP,1516,"[TCP Retransmission] 80 > 41355 [PSH, ACK] S..."
6,0.002394,59.166.0.5,149.171.126.1,41355,80,TCP,68,41355 > 80 [ACK] Seq=1 Ack=1449 Win=65160 Le...
7,0.002399,59.166.0.5,149.171.126.1,41355,80,TCP,68,[TCP Dup ACK 7#1] 41355 > 80 [ACK] Seq=1 Ack...
8,0.002973,59.166.0.9,149.171.126.8,42238,5190,TCP,104,"42238 > 5190 [PSH, ACK] Seq=1 Ack=1 Win=6516..."
9,0.002979,59.166.0.9,149.171.126.8,42238,5190,TCP,104,"[TCP Retransmission] 42238 > 5190 [PSH, ACK]..."


# Synthetic Data Generation

Now that we have example data, we will pass it to the LLMs and prompt them to generate synthetic data.

## Defining LLMs

Here is a list of the models we will use.

| Model | Model Type | Creator | Runs on | 
| --- | --- | --- | --- |
| GPT-3.5 | Chat | OpenAI | Cloud |
| GPT-4 | Chat | OpenAI | Cloud |
| LLaMA 2 13B | Completion | Facebook | Locally |
| PaLM 2 | Both (but we will use Completion) | Google | Cloud |

Originally, Cohere was going to be used, but it had a maximum context window of 512 tokens, making it impractical for our use case. PaLM 2 is not in use right now because we are having trouble sourcing API keys.

Next, let's make a list with each model and its type (chat or completion).

# Add UUIDs to prompts
We will give each prompt a UUID so that it can be uniquely identified.

In [60]:
# Add UUIDs to prompts that don't have them
import uuid
import json
with open("data/prompts.json", "r") as f:
    prompts_json = json.load(f)
    for prompt in prompts_json:
        # only if it doesn't have a UUID
        if "id" not in prompt:
            prompt["id"] = str(uuid.uuid4())
    with open("data/prompts.json", "w") as f:
        json.dump(prompts_json, f, indent=4)



# Add jobs to Redis Queue
We will add 10 jobs for each prompt and model to the Redis queue. Right now, we are only prompting on GPT-3.5, GPT-4, and LLaMA.

In [55]:
# Add jobs to Redis queue

from redis import Redis
from rq import Queue
from worker import run_model

gpt_queue = Queue('gpt', connection=Redis())
llama_queue = Queue('llama', connection=Redis())

# Run the models ten times for each prompt
for i in range(0, 10):
    with open("data/prompts.json", "r") as f:
        prompts_json = json.load(f)
        for prompt in prompts_json:
            gpt_queue.enqueue(
                run_model,
                in_packets = packets[:10],
                prompt_obj = prompt,
                model = "gpt-35",
                result_ttl=-1,
            )
            gpt_queue.enqueue(
                run_model,
                in_packets = packets[:10],
                prompt_obj = prompt,
                model = "gpt-4",
                result_ttl=-1,
            )
            llama_queue.enqueue(
                run_model,
                in_packets = packets[:10],
                prompt_obj = prompt,
                model = "llama2-13b",
                result_ttl=-1,
            )

# Get job results
Once the jobs are done, we can run this cell to get the results and export them to CSV.

In [59]:
# Get results
from redis import Redis
from rq import Queue

gpt_queue = Queue('gpt', connection=Redis())
llama_queue = Queue('llama', connection=Redis())

results = []

gpt_job_ids = gpt_queue.finished_job_registry.get_job_ids()
llama_job_ids = llama_queue.finished_job_registry.get_job_ids()

# Get the results
for job_id in gpt_job_ids:
    job = gpt_queue.fetch_job(job_id)
    results.append({
        "Prompt Name": job.kwargs["prompt_obj"]["name"],
        "Prompt": job.kwargs["prompt_obj"]["prompt"],
        "Model": job.kwargs["model"],
        "Input": job.kwargs["in_packets"],
        "Output": job.result
    })

for job_id in llama_job_ids:
    job = llama_queue.fetch_job(job_id)
    results.append({
        "Prompt Name": job.kwargs["prompt_obj"]["name"],
        "Prompt": job.kwargs["prompt_obj"]["prompt"],
        "Model": job.kwargs["model"],
        "Input": job.kwargs["in_packets"],
        "Output": job.result
    })

# Output to CSV
df = pd.DataFrame(results)
df.to_csv("data/output-new.csv", index=False)