Input: CamFlow logs parsed by the Unicorn parser

Output: Vectorized graphs (one PyTorch Geometric TemporalData file per graph)

In [1]:
import os 
from os import path as osp
import torch
from tqdm import tqdm, trange
from torch_geometric.data import *
import psycopg2
from psycopg2 import extras as ex

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Open the PostgreSQL connection used by every later cell.
# Each setting can be overridden with a CAMFLOW_DB_* environment variable so
# that credentials do not have to live in the notebook; the defaults are the
# values that were previously hard-coded here.
connect = psycopg2.connect(
    database=os.environ.get('CAMFLOW_DB_NAME', 'camflow'),
    host=os.environ.get('CAMFLOW_DB_HOST', '/var/run/postgresql/'),
    user=os.environ.get('CAMFLOW_DB_USER', 'postgres'),
    password=os.environ.get('CAMFLOW_DB_PASSWORD', 'postgres'),
    port=os.environ.get('CAMFLOW_DB_PORT', '5432'),
)

# Shared cursor for the insert/delete/select statements below.
cur = connect.cursor()
# Begin from a clean transaction state.
connect.rollback()

In [3]:
# Run switches.
#   process_raw_data: when True, the parsing cells insert the parsed rows into
#                     the `raw_data` table; set it to False if the data is
#                     already in the database.
#   clean_database:   when True, the next cell deletes previously loaded rows
#                     from `raw_data` before anything is inserted.
process_raw_data, clean_database = True, False

# Vocabularies of node / edge type names, filled in while the logs are parsed.
node_types, edge_types = set(), set()

In [4]:
# Optional reset: remove the rows of graph ids 0..219 from `raw_data`,
# committing after each graph so every delete is its own transaction.
if clean_database:
    graph_ids_to_purge = trange(220)
    for graph_id in graph_ids_to_purge:
        cur.execute(f"delete from raw_data where graph_id='{graph_id}';")
        connect.commit()

In [5]:

# Parse the benign capture and load it into the `raw_data` table.
#
# Each preprocessed-<idx>.txt file holds one edge per line in the form
#   <src_id>\t<dest_id>\t<src_type>:<dest_type>:<edge_type>:<timestamp>
# and is cut into `graphs_per_file` graphs by line count.
ds_path = '/home/aabouelk/ds/camflow/'
benign_date = '30-04-2024'
n_splits = 33          # number of preprocessed-<idx>.txt files to read
graphs_per_file = 3    # graphs carved out of each file
batch_size = 10000     # rows buffered before each bulk insert

node_size = []

# Same statement text as before (whitespace included).
insert_sql = (
    "insert into raw_data\n"
    "                values %s\n"
    "                "
)

for idx in trange(n_splits):
    path = osp.join(ds_path, benign_date, f'preprocessed/preprocessed-{idx}.txt')  # The paths to the dataset.

    # First pass: count the lines so the file can be split evenly.
    with open(path, 'rb') as f:
        lines_in_file = sum(1 for _ in f)
    lines_per_graph = lines_in_file / graphs_per_file
    print(lines_per_graph)

    datalist = []
    line_count = 0
    graph_idx = 0
    node_map = {}  # original node id -> dense id, restarted for every graph
    with open(path) as f:
        for line in tqdm(f):
            src_id, dest_id, types = line.strip('\n').split('\t')
            src_type, dest_type, edge_type, ts = types.split(":")

            # Re-number nodes densely in order of first appearance.
            src_id = node_map.setdefault(src_id, len(node_map))
            dest_id = node_map.setdefault(dest_id, len(node_map))

            datalist.append([
                src_id,
                src_type,
                dest_id,
                dest_type,
                edge_type,
                ts,
                (idx * graphs_per_file) + graph_idx,
            ])
            node_types.add(src_type)
            node_types.add(dest_type)
            edge_types.add(edge_type)

            line_count += 1
            # NOTE(review): graph_idx starts at 0, so this condition is already
            # true after the first line. The first row of every file is tagged
            # idx * graphs_per_file + 0 on its own and all later rows are
            # shifted one bucket up, reaching idx * graphs_per_file +
            # graphs_per_file -- the id the next file uses for its first row.
            # Left unchanged because later cells hard-code graph-id ranges
            # (attack ids start at 100, the export loops over range(199));
            # confirm the intended numbering before changing it.
            if line_count > graph_idx * lines_per_graph:
                graph_idx += 1
                node_map = {}

            if len(datalist) >= batch_size:
                if process_raw_data:
                    ex.execute_values(cur, insert_sql, datalist, page_size=batch_size)
                    connect.commit()
                # Drop the batch either way so the buffer stays bounded.
                datalist = []

    # Flush the final partial batch; without this the last (< batch_size)
    # rows of every file never reach the database.
    if datalist and process_raw_data:
        ex.execute_values(cur, insert_sql, datalist, page_size=batch_size)
        connect.commit()
    datalist = []

  0%|          | 0/33 [00:00<?, ?it/s]

291283.3333333333


873850it [00:24, 35096.95it/s]
  3%|▎         | 1/33 [00:25<13:21, 25.04s/it]

295746.0


887238it [00:24, 36327.20it/s]
  6%|▌         | 2/33 [00:49<12:47, 24.76s/it]

298261.3333333333


894784it [00:25, 35611.49it/s]
  9%|▉         | 3/33 [01:14<12:29, 24.98s/it]

297742.0


893226it [00:26, 34335.38it/s]
 12%|█▏        | 4/33 [01:41<12:17, 25.45s/it]

297895.6666666667


893687it [00:25, 34730.18it/s]
 15%|█▌        | 5/33 [02:06<11:56, 25.59s/it]

297788.6666666667


893366it [00:25, 34389.31it/s]
 18%|█▊        | 6/33 [02:32<11:35, 25.77s/it]

297281.3333333333


891844it [00:26, 34265.54it/s]
 21%|██        | 7/33 [02:59<11:13, 25.91s/it]

297516.3333333333


892549it [00:26, 33924.49it/s]
 24%|██▍       | 8/33 [03:25<10:52, 26.09s/it]

297593.6666666667


892781it [00:25, 34932.59it/s]
 27%|██▋       | 9/33 [03:51<10:23, 25.97s/it]

297263.0


891789it [00:26, 34270.01it/s]
 30%|███       | 10/33 [04:17<09:58, 26.04s/it]

296651.3333333333


889954it [00:25, 34592.98it/s]
 33%|███▎      | 11/33 [04:43<09:31, 25.98s/it]

297445.6666666667


892337it [00:26, 34129.72it/s]
 36%|███▋      | 12/33 [05:09<09:07, 26.08s/it]

297746.0


893238it [00:26, 34303.53it/s]
 39%|███▉      | 13/33 [05:35<08:42, 26.11s/it]

297059.0


891177it [00:25, 34527.13it/s]
 42%|████▏     | 14/33 [06:01<08:15, 26.07s/it]

297389.6666666667


892169it [00:25, 35037.70it/s]
 45%|████▌     | 15/33 [06:27<07:46, 25.93s/it]

297449.0


892347it [00:25, 34651.99it/s]
 48%|████▊     | 16/33 [06:53<07:20, 25.93s/it]

297232.3333333333


891697it [00:25, 34998.87it/s]
 52%|█████▏    | 17/33 [07:18<06:53, 25.83s/it]

297567.6666666667


892703it [00:25, 34720.90it/s]
 55%|█████▍    | 18/33 [07:44<06:27, 25.84s/it]

297925.3333333333


893776it [00:25, 34541.48it/s]
 58%|█████▊    | 19/33 [08:10<06:02, 25.89s/it]

297102.6666666667


891308it [00:25, 34421.91it/s]
 61%|██████    | 20/33 [08:36<05:37, 25.93s/it]

297420.0


892260it [00:25, 34705.51it/s]
 64%|██████▎   | 21/33 [09:02<05:10, 25.91s/it]

296804.0


890412it [00:25, 34460.14it/s]
 67%|██████▋   | 22/33 [09:28<04:45, 25.94s/it]

297274.6666666667


891824it [00:26, 34112.49it/s]
 70%|██████▉   | 23/33 [09:55<04:20, 26.04s/it]

297141.0


891423it [00:25, 34728.51it/s]
 73%|███████▎  | 24/33 [10:20<03:53, 25.97s/it]

297402.3333333333


892207it [00:25, 35035.75it/s]
 76%|███████▌  | 25/33 [10:46<03:26, 25.87s/it]

296741.3333333333


890224it [00:25, 34665.73it/s]
 79%|███████▉  | 26/33 [11:12<03:00, 25.85s/it]

296825.0


890475it [00:25, 34516.47it/s]
 82%|████████▏ | 27/33 [11:38<02:35, 25.88s/it]

296895.0


890685it [00:25, 34994.90it/s]
 85%|████████▍ | 28/33 [12:03<02:08, 25.80s/it]

297781.6666666667


893345it [00:26, 34225.01it/s]
 88%|████████▊ | 29/33 [12:30<01:43, 25.94s/it]

297077.6666666667


891233it [00:25, 34695.64it/s]
 91%|█████████ | 30/33 [12:55<01:17, 25.91s/it]

297139.6666666667


891419it [00:25, 35375.39it/s]
 94%|█████████▍| 31/33 [13:21<00:51, 25.73s/it]

296628.6666666667


889886it [00:22, 38722.45it/s]
 97%|█████████▋| 32/33 [13:44<00:24, 24.95s/it]

296836.0


890508it [00:24, 36915.16it/s]
100%|██████████| 33/33 [14:08<00:00, 25.72s/it]


In [6]:
# Report how many distinct edge types have been collected so far.
edge_type_count = len(edge_types)
print(edge_type_count)

54


In [50]:
# Parse the attack capture and load it into the `raw_data` table.
#
# Same line format as the benign files:
#   <src_id>\t<dest_id>\t<src_type>:<dest_type>:<edge_type>:<timestamp>
ds_path = '/home/aabouelk/ds/camflow/'
attack_date = '26-05-2024'
a_splits = 1

first_split, last_split = 10, 14  # preprocessed-<idx>.txt files 10..13 are read
graphs_per_file = 25              # NOTE: rebinds the value used by the benign cell
attack_start_idx = 99             # with graph_idx starting at 1, attack ids begin at 100
batch_size = 10000                # rows buffered before each bulk insert

# Same statement text as before (whitespace included).
insert_sql = (
    "insert into raw_data\n"
    "                values %s\n"
    "                "
)

for idx in trange(first_split, last_split):
    path = osp.join(ds_path, attack_date, f'preprocessed/preprocessed-{idx}.txt')  # The paths to the dataset.

    # First pass: count the lines so the file can be split evenly.
    with open(path, 'rb') as f:
        lines_in_file = sum(1 for _ in f)
    lines_per_graph = lines_in_file / graphs_per_file
    print(lines_per_graph)

    datalist = []
    line_count = 0
    graph_idx = 1
    with open(path) as f:
        # NOTE(review): unlike the benign cell, node ids are numbered once per
        # file here and are not restarted at each graph boundary -- confirm
        # which of the two behaviours is intended.
        node_map = {}
        for line in tqdm(f):
            src_id, dest_id, types = line.strip('\n').split('\t')
            src_type, dest_type, edge_type, ts = types.split(":")

            # Re-number nodes densely in order of first appearance.
            src_id = node_map.setdefault(src_id, len(node_map))
            dest_id = node_map.setdefault(dest_id, len(node_map))

            spl = [
                src_id,
                src_type,
                dest_id,
                dest_type,
                edge_type,
                ts,
                attack_start_idx + ((idx - first_split) * graphs_per_file) + graph_idx,
            ]

            # Move to the next graph once this one has its share of lines;
            # the row built above still carries the previous graph id.
            line_count += 1
            if line_count > graph_idx * lines_per_graph:
                graph_idx += 1

            datalist.append(spl)
            node_types.add(src_type)
            node_types.add(dest_type)
            edge_types.add(edge_type)

            if len(datalist) >= batch_size:
                if process_raw_data:
                    ex.execute_values(cur, insert_sql, datalist, page_size=batch_size)
                    connect.commit()
                # Drop the batch either way so the buffer stays bounded.
                datalist = []

    # Flush the final partial batch; without this the last (< batch_size)
    # rows of every file never reach the database.
    if datalist and process_raw_data:
        ex.execute_values(cur, insert_sql, datalist, page_size=batch_size)
        connect.commit()
    datalist = []

  0%|          | 0/4 [00:00<?, ?it/s]

262225.88


6555647it [02:48, 39012.49it/s]
 25%|██▌       | 1/4 [02:48<08:26, 168.76s/it]

262146.28


6553657it [02:48, 38921.90it/s]
 50%|█████     | 2/4 [05:38<05:38, 169.05s/it]

261996.96


6549924it [02:45, 39472.86it/s]
 75%|███████▌  | 3/4 [08:24<02:48, 168.02s/it]

261927.36


6548184it [02:45, 39518.27it/s]
100%|██████████| 4/4 [11:11<00:00, 167.83s/it]


In [51]:
# Build one-hot lookup tables for node and edge type names.
#
# Row k of `nodevec` / `edgevec` is the one-hot vector for class k.
nodevec = torch.nn.functional.one_hot(torch.arange(0, len(node_types)), num_classes=len(node_types))
edgevec = torch.nn.functional.one_hot(torch.arange(0, len(edge_types)), num_classes=len(edge_types))

# Enumerate the type names in sorted order. Iterating the sets directly yields
# an order that depends on string hashing, which may differ between kernel
# sessions, so the same type could get a different vector on each run.
node2onehot = {type_name: nodevec[k] for k, type_name in enumerate(sorted(node_types))}
edge2onehot = {type_name: edgevec[k] for k, type_name in enumerate(sorted(edge_types))}


In [52]:
# Export every graph stored in `raw_data` as a PyTorch Geometric TemporalData
# file: <data_dir>/graph_<graph_id>.TemporalData
from torch_geometric.data import TemporalData

data_dir = "../data/camflow250"
os.makedirs(data_dir, exist_ok=True)

# NOTE(review): range(199) stops at graph id 198, while the attack ingestion
# cell assigns ids up to 99 + 3 * 25 + 25 = 199 -- confirm whether the last
# graph is meant to be exported.
for graph_id in trange(199):
    sql = "select * from raw_data where graph_id='{graph_id}' ORDER BY _id;".format(graph_id=graph_id)
    cur.execute(sql)
    rows = cur.fetchall()
    if not rows:
        # torch.vstack() rejects an empty list, so an absent graph would
        # otherwise abort the whole export with an unrelated-looking error.
        tqdm.write(f"graph {graph_id}: no rows in raw_data, skipping")
        continue

    src = []
    dst = []
    msg = []
    t = []
    for row in rows:
        src.append(int(row[0]))
        dst.append(int(row[2]))
        # Edge message = [source-type one-hot | edge-type one-hot | dest-type one-hot]
        msg.append(torch.cat([node2onehot[row[1]], edge2onehot[row[4]], node2onehot[row[3]]], dim=0))
        # Use logical order of the event to represent the time: the last
        # column of the row (presumably the `_id` key used in ORDER BY --
        # TODO confirm against the raw_data schema).
        t.append(int(row[-1]))

    dataset = TemporalData(
        src=torch.tensor(src, dtype=torch.long),
        dst=torch.tensor(dst, dtype=torch.long),
        t=torch.tensor(t, dtype=torch.long),
        msg=torch.vstack(msg).to(torch.float),
    )
    torch.save(dataset, f"{data_dir}/graph_{graph_id}.TemporalData")

print("end")

100%|██████████| 199/199 [19:40<00:00,  5.93s/it]

end



