Input: CamFlow logs parsed by the Unicorn parser

Output: Vectorized graphs

In [7]:
import os 
from os import path as osp
import torch
from tqdm import tqdm, trange
from torch_geometric.data import *
import psycopg2
from psycopg2 import extras as ex

In [8]:
# Open the PostgreSQL connection used by the rest of the notebook.
# Every setting can be overridden through an environment variable so that
# credentials do not have to live in the notebook; the fallbacks are the
# values this notebook has always used, so it still runs unchanged.
connect = psycopg2.connect(
    database=os.environ.get('CAMFLOW_DB_NAME', 'camflow'),
    host=os.environ.get('CAMFLOW_DB_HOST', '/var/run/postgresql/'),
    user=os.environ.get('CAMFLOW_DB_USER', 'postgres'),
    password=os.environ.get('CAMFLOW_DB_PASSWORD', 'postgres'),
    port=os.environ.get('CAMFLOW_DB_PORT', '5432'),
)

# Shared cursor: the later cells both insert and query through `cur`.
cur = connect.cursor()
# NOTE(review): presumably here to start from a clean transaction state;
# on a freshly opened connection there is nothing to undo.
connect.rollback()

In [9]:
# Ingestion switch: when True, the loading cell bulk-inserts the parsed edges
# into the `raw_data` table. Keep it False if the data is already in the database.
process_raw_data = False

# Distinct node-type and edge-type labels collected while scanning the logs.
node_types, edge_types = set(), set()

In [10]:
# Location of the parsed logs. These never change between iterations, so they
# are defined once instead of being re-assigned for every file.
ds_path = '/home/aabouelk/ds/camflow/'
benign_date = '10-05-2024'


def _insert_rows(rows):
    """Bulk-insert `rows` into the `raw_data` table and commit the transaction."""
    sql = '''insert into raw_data
                values %s
                '''
    ex.execute_values(cur, sql, rows, page_size=10000)
    connect.commit()


# Scan every base graph file: collect the node/edge type vocabularies and,
# when `process_raw_data` is set, load the edges into the database.
for base_idx in trange(150):
    path = osp.join(ds_path, benign_date, f'base/base-{base_idx}.txt')  # The paths to the dataset.
    datalist = []
    with open(path) as f:
        for line in tqdm(f):
            # Each line is "<src_id> <dest_id> <src_type>:<dest_type>:<edge_type>:<ts>".
            src_id, dest_id, types = line.strip('\n').split(' ')
            src_type, dest_type, edge_type, ts = types.split(":")
            datalist.append([
                src_id,
                src_type,
                dest_id,
                dest_type,
                edge_type,
                ts,
                base_idx,  # the file index is stored with each row
            ])
            node_types.add(src_type)
            node_types.add(dest_type)
            edge_types.add(edge_type)
            # Insert in batches of 10000 rows to keep memory use bounded.
            if process_raw_data and len(datalist) >= 10000:
                _insert_rows(datalist)
                datalist = []
    # Flush the final partial batch. Without this, the last (<10000) rows of
    # every file were never written to the database.
    if process_raw_data and datalist:
        _insert_rows(datalist)
        datalist = []

351859it [00:01, 214912.80it/s], ?it/s]
351907it [00:01, 246263.30it/s]4:04,  1.64s/it]
350634it [00:01, 240411.92it/s]3:52,  1.57s/it]
349533it [00:01, 336489.95it/s]3:48,  1.56s/it]
352332it [00:00, 365638.48it/s]3:22,  1.39s/it]
356273it [00:00, 370676.00it/s]3:01,  1.26s/it]
355014it [00:00, 368296.84it/s]2:49,  1.17s/it]
356007it [00:00, 360726.46it/s]2:40,  1.12s/it]
354442it [00:01, 329563.12it/s]2:36,  1.10s/it]
353031it [00:00, 388440.11it/s]2:36,  1.11s/it]
352729it [00:01, 336760.84it/s]02:29,  1.07s/it]
352349it [00:00, 440129.73it/s]02:29,  1.08s/it]
352903it [00:00, 358437.87it/s]02:19,  1.01s/it]
352660it [00:00, 369487.74it/s]02:19,  1.02s/it]
353498it [00:00, 397468.11it/s]02:18,  1.02s/it]
353270it [00:01, 329624.06it/s]02:14,  1.00it/s]
353404it [00:00, 443785.32it/s]02:18,  1.04s/it]
353581it [00:01, 349017.06it/s]02:10,  1.02it/s]
352697it [00:00, 383821.54it/s]02:13,  1.01s/it]
352366it [00:00, 377517.33it/s]02:10,  1.00it/s]
352669it [00:01, 300145.91it/s]02:09, 

In [11]:
def _one_hot_lookup(labels):
    """Return `(codes, lookup)` for a collection of string labels.

    `codes` is the identity-like one-hot matrix with one row per label and
    `lookup` maps each label to its row. Labels are sorted first so that the
    assignment is deterministic: iterating a `set` of strings directly yields
    a different order on every kernel restart (string hash randomization),
    which would give the same type a different vector from run to run.
    """
    ordered = sorted(labels)
    codes = torch.nn.functional.one_hot(torch.arange(0, len(ordered)), num_classes=len(ordered))
    lookup = {label: codes[row] for row, label in enumerate(ordered)}
    return codes, lookup


# One-hot encodings for the node and edge type vocabularies collected above.
nodevec, node2onehot = _one_hot_lookup(node_types)
edgevec, edge2onehot = _one_hot_lookup(edge_types)


In [12]:
# Imported once here rather than on every loop iteration.
from torch_geometric.data import TemporalData

# Create the output directory (no error if it already exists).
os.makedirs("../data/camflow", exist_ok=True)

# Parameterized query. The id is bound as a string so the SQL sent to the
# server is the same quoted literal the formatted query used to produce.
sql = "select * from raw_data where graph_id=%s ORDER BY _id;"

# (src_type, edge_type, dest_type) -> message vector. The same few
# combinations repeat across all rows, so each one is concatenated only once.
msg_cache = {}

for graph_id in tqdm(range(150)):
    cur.execute(sql, (str(graph_id),))
    rows = cur.fetchall()
    if not rows:
        # torch.vstack([]) would fail with an opaque message; say what is wrong.
        raise RuntimeError(
            f"raw_data has no rows for graph_id={graph_id}; "
            "load the logs first (process_raw_data = True)"
        )

    src = []
    dst = []
    msg = []
    t = []
    for i in rows:
        # Columns used: 0 = source id, 1 = source type, 2 = destination id,
        # 3 = destination type, 4 = edge type, last = ordering value.
        src.append(int(i[0]))
        dst.append(int(i[2]))
        key = (i[1], i[4], i[3])
        msg_t = msg_cache.get(key)
        if msg_t is None:
            # Message = [source one-hot | edge one-hot | destination one-hot].
            msg_t = torch.cat([node2onehot[i[1]], edge2onehot[i[4]], node2onehot[i[3]]], dim=0)
            msg_cache[key] = msg_t
        msg.append(msg_t)
        t.append(int(i[-1]))    # Use logical order of the event to represent the time

    dataset = TemporalData()
    dataset.src = torch.tensor(src, dtype=torch.long)
    dataset.dst = torch.tensor(dst, dtype=torch.long)
    dataset.t = torch.tensor(t, dtype=torch.long)
    # vstack copies the rows, so sharing cached vectors between events is safe.
    dataset.msg = torch.vstack(msg).to(torch.float)
    torch.save(dataset, "../data/camflow/graph_" + str(graph_id) + ".TemporalData")

print("end")

100%|██████████| 150/150 [14:18<00:00,  5.72s/it]

end



