In [3]:
from models import *
import pandas as pd
import openai
import os
import json

In [4]:
# Import from CSV with columns "No.","Time","Source","Destination","Protocol","Length","Info","sport","dport"
in_data = pd.read_csv("data/4_1.csv",
                      encoding='unicode_escape',
                      usecols=["No.","Time","Source","Destination","Protocol","Length","Info","sport","dport"],
                      dtype={"No.": int, "Time": float, "Source": str, "Destination": str, "Protocol": str, "Length": 'Int64', "Info": str, "sport": 'Int64', "dport": 'Int64'})

# Drop data with missing fields
in_data = in_data.dropna(subset=["Source", "Destination", "sport", "dport", "Protocol", "Length", "Info"])

# Convert to list of NetworkPacket
packets = []

def addPacketToList(row):
    packets.append(NetworkPacket(
    time=row["Time"],
    src_ip=row["Source"],
    dst_ip=row["Destination"],
    src_port=row["sport"],
    dst_port=row["dport"],
    protocol=row["Protocol"],
    length=row["Length"],
    payload=row["Info"]
))
    
in_data.apply(addPacketToList, axis=1)


0          None
1          None
2          None
3          None
4          None
           ... 
999996     None
999997     None
999998     None
999999     None
1000000    None
Length: 999620, dtype: object

In [5]:
# print the input packets
pd.DataFrame.from_dict([packet.dict() for packet in packets[:30]])

/var/folders/7t/07brym5161v0lqdmpmct76t40000gn/T/ipykernel_81742/3545713065.py:2: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.3/migration/
  pd.DataFrame.from_dict([packet.dict() for packet in packets[:15]])


Unnamed: 0,time,src_ip,dst_ip,src_port,dst_port,protocol,length,payload
0,0.0,59.166.0.8,149.171.126.2,24272,80,TCP,68,24272 > 80 [ACK] Seq=1 Ack=1 Win=34752 Len=0...
1,9e-06,59.166.0.8,149.171.126.2,24272,80,TCP,68,[TCP Dup ACK 1#1] 24272 > 80 [ACK] Seq=1 Ack...
2,0.000174,149.171.126.2,59.166.0.8,80,24272,TCP,1516,"80 > 24272 [PSH, ACK] Seq=1 Ack=1 Win=7240 L..."
3,0.000177,149.171.126.2,59.166.0.8,80,24272,TCP,1516,"[TCP Retransmission] 80 > 24272 [PSH, ACK] S..."
4,0.002172,149.171.126.1,59.166.0.5,80,41355,TCP,1516,"80 > 41355 [PSH, ACK] Seq=1 Ack=1 Win=7240 L..."
5,0.002182,149.171.126.1,59.166.0.5,80,41355,TCP,1516,"[TCP Retransmission] 80 > 41355 [PSH, ACK] S..."
6,0.002394,59.166.0.5,149.171.126.1,41355,80,TCP,68,41355 > 80 [ACK] Seq=1 Ack=1449 Win=65160 Le...
7,0.002399,59.166.0.5,149.171.126.1,41355,80,TCP,68,[TCP Dup ACK 7#1] 41355 > 80 [ACK] Seq=1 Ack...
8,0.002973,59.166.0.9,149.171.126.8,42238,5190,TCP,104,"42238 > 5190 [PSH, ACK] Seq=1 Ack=1 Win=6516..."
9,0.002979,59.166.0.9,149.171.126.8,42238,5190,TCP,104,"[TCP Retransmission] 42238 > 5190 [PSH, ACK]..."


# Using Redis and Redis Queue
We will use Redis Queue, a job scheduling system, to call OpenAI in bulk.
Note: On macOS, you must start RQ workers with `export OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES`. The cause of this is unknown.

In [6]:
from redis import Redis
from rq import Queue
from worker import get_summary, get_synthetic_packets

packet_summary_queue = Queue("packet_summary", connection=Redis())
packet_synthesis_queue = Queue("packet_synthesis", connection=Redis())

In [22]:
print("Enqueuing jobs...")

for i in range(10):
    job = packet_summary_queue.enqueue(
        get_summary,
        in_packets=packets[:15],
        result_ttl=-1, # Results should be stored forever.
    )

Enqueuing jobs...


In [23]:
# Get all jobs that succeeded
results = []
job_ids = packet_summary_queue.finished_job_registry.get_job_ids()

# Get the results
for job_id in job_ids:
    job = packet_summary_queue.fetch_job(job_id)
    results.append({
        "id": job.id,
        "packets": job.kwargs["in_packets"],
        "summary": job.result,
    })

df = pd.DataFrame(results)
df

df['summary']

0     The network packets in the given data consist ...
1     Based on the provided network packets, the ove...
2     Based on the network packets provided, the net...
3     Based on the provided network packets, the ove...
4     The network packets can be summarized as follo...
5     Based on the provided network packets, we see ...
6     Based on the provided network packets, there a...
7     Based on the provided network packets, the ove...
8     Based on the provided network packets, the fol...
9     The network packets provided contain informati...
10    The network traffic consists of multiple packe...
11    Based on the provided network packets, we can ...
12    Based on the provided network packets, the tra...
13    Based on the provided network packets, the net...
14    The network packets in the given data include ...
15    Based on the provided network packets, there a...
16    Based on the provided network packets, the ove...
17    Based on the provided network packets, the

In [24]:
# For each summary, enqueue a new job to generate packets with that summary
for i, row in df.iterrows():
    job = packet_synthesis_queue.enqueue(
        get_synthetic_packets,
        in_packets=row["packets"],
        summary=row["summary"],
        result_ttl=-1, # Results should be stored forever.
    )


In [25]:

job_ids = packet_synthesis_queue.finished_job_registry.get_job_ids()

# Get the results
for job_id in job_ids:
    job = packet_synthesis_queue.fetch_job(job_id)
    print("Synthetic packets generated by job", job.id, ":")
    print(json.dumps(job.result.message.function_call.arguments, indent=2))

Synthetic packets generated by job 04179be8-8375-4f7f-ae1d-b2138a0cabd4 :
"{\n  \"packets\": [\n    {\n      \"time\": 0.0,\n      \"src_ip\": \"192.168.0.1\",\n      \"dst_ip\": \"10.0.0.1\",\n      \"src_port\": 1234,\n      \"dst_port\": 80,\n      \"protocol\": \"TCP\",\n      \"length\": 256,\n      \"payload\": \"[SYN] Seq=1 Win=65535 Len=0\"\n    },\n    {\n      \"time\": 0.0001,\n      \"src_ip\": \"10.0.0.1\",\n      \"dst_ip\": \"192.168.0.1\",\n      \"src_port\": 80,\n      \"dst_port\": 1234,\n      \"protocol\": \"TCP\",\n      \"length\": 256,\n      \"payload\": \"[SYN, ACK] Seq=1 Ack=2 Win=65535 Len=0\"\n    },\n    {\n      \"time\": 0.001,\n      \"src_ip\": \"192.168.0.2\",\n      \"dst_ip\": \"10.0.0.2\",\n      \"src_port\": 4321,\n      \"dst_port\": 443,\n      \"protocol\": \"TCP\",\n      \"length\": 128,\n      \"payload\": \"[SYN] Seq=1 Win=65535 Len=0\"\n    },\n    {\n      \"time\": 0.0011,\n      \"src_ip\": \"10.0.0.2\",\n      \"dst_ip\": \"192.168.0.