# Flash Evaluation on Streamspot Dataset:

This notebook evaluates Flash on the Streamspot dataset. Because the dataset is graph-level in nature, Flash is run in graph-level detection mode. The results are printed once all cells of the notebook have been executed.

## Dataset Access: 
- The Streamspot dataset can be accessed at the following link: [Streamspot Dataset](https://github.com/sbustreamspot/sbustreamspot-data).
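- Please download the required data files from the provided link and place the extracted `all.tsv` file in the notebook's working directory, where the parser below expects it.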

## Data Parsing and Execution:
- Utilize the parser included in this notebook to process the downloaded files.
- To obtain the evaluation results, execute all cells within this notebook.

## Model Training and Execution Flexibility:
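- The notebook can run from pre-trained model weights: set `Train_Gnn` and `Train_Word2vec` to `False` in the flags cell. Note that this copy of the notebook currently sets both flags to `True`, so both models are retrained when all cells are run.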
- Additionally, this notebook offers the flexibility to set parameters for training Graph Neural Networks (GNNs) and word2vec models from scratch.
- You can then utilize these freshly trained models to conduct the evaluation. 

Follow these guidelines for a thorough and efficient analysis of the Streamspot dataset using Flash.

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import torch
from torch_geometric.data import Data
import os
import torch.nn.functional as F
import json
import warnings
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
warnings.filterwarnings('ignore')
from torch_geometric.loader import NeighborLoader
import multiprocessing

# Place tensors and the model on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
%matplotlib inline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Training switches read by the later cells guarded with `if Train_Word2vec:` and
# `if Train_Gnn:`. True retrains the corresponding model in this run; `infer` also
# uses Train_Word2vec to choose between the in-memory and the on-disk Word2Vec model.
# Alternative setting (skip both training cells):
# Train_Gnn = False
# Train_Word2vec = False

Train_Gnn = True
Train_Word2vec = True

In [3]:
from pprint import pprint
import gzip
from sklearn.manifold import TSNE
import json
import copy
import os
import time

In [4]:
import os.path as osp
import csv
def show(str):
    """Print ``str`` followed by a space and the current local time.

    The timestamp is formatted as ``YYYY-MM-DD HH:MM:SS``. ``str`` must be a
    string because it is concatenated with ``+``.
    """
    timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    print(str + ' ' + timestamp)

def parse_data():
    """Split ``all.tsv`` into one tab-separated file per graph under ``streamspot/``.

    Every non-empty row of ``all.tsv`` must have at least six tab-separated
    fields; field 5 (zero-based) is parsed as the integer graph id. The first
    six fields of each row are written, in input order, to
    ``streamspot/<graph id>.txt``.

    A new output file is opened in write mode (truncating any existing file)
    each time the graph id differs from that of the previous row, so the rows
    of one graph are assumed to be contiguous in the input.
    NOTE(review): confirm contiguity if a different dump is used.

    Progress is reported through ``show``.
    """
    # os.system('tar -zxvf all.tar.gz')

    show('Start processing.')
    # The output directory is not part of the download; create it on demand.
    os.makedirs('streamspot', exist_ok=True)

    gId = -1
    ff = None
    try:
        with open('all.tsv') as f:
            tsvreader = csv.reader(f, delimiter='\t')
            for row in tsvreader:
                if not row:
                    # csv.reader yields [] for blank lines (e.g. a trailing newline).
                    continue
                if int(row[5]) != gId:
                    # Graph id changed: finish the previous file before starting the next.
                    if ff is not None:
                        ff.close()
                    gId = int(row[5])
                    show('Graph ' + str(gId))
                    ff = open('streamspot/' + str(gId) + '.txt', 'w')
                ff.write('\t'.join(str(row[k]) for k in range(6)) + '\n')
    finally:
        # Guarantees the last graph file is flushed and closed, even on error.
        if ff is not None:
            ff.close()
    # os.system('rm all.tsv')
    show('Done.')

In [5]:
# Split all.tsv into one streamspot/<graph id>.txt file per graph.
parse_data()

Start processing. 2024-11-05 18:24:20
Graph 0 2024-11-05 18:24:20
Graph 1 2024-11-05 18:24:21
Graph 2 2024-11-05 18:24:21
Graph 3 2024-11-05 18:24:22
Graph 4 2024-11-05 18:24:22
Graph 5 2024-11-05 18:24:23
Graph 6 2024-11-05 18:24:23
Graph 7 2024-11-05 18:24:24
Graph 8 2024-11-05 18:24:24
Graph 9 2024-11-05 18:24:24
Graph 10 2024-11-05 18:24:25
Graph 11 2024-11-05 18:24:25
Graph 12 2024-11-05 18:24:26
Graph 13 2024-11-05 18:24:26
Graph 14 2024-11-05 18:24:27
Graph 15 2024-11-05 18:24:27
Graph 16 2024-11-05 18:24:28
Graph 17 2024-11-05 18:24:28
Graph 18 2024-11-05 18:24:29
Graph 19 2024-11-05 18:24:30
Graph 20 2024-11-05 18:24:30
Graph 21 2024-11-05 18:24:30
Graph 22 2024-11-05 18:24:30
Graph 23 2024-11-05 18:24:31
Graph 24 2024-11-05 18:24:31
Graph 25 2024-11-05 18:24:31
Graph 26 2024-11-05 18:24:31
Graph 27 2024-11-05 18:24:32
Graph 28 2024-11-05 18:24:32
Graph 29 2024-11-05 18:24:33
Graph 30 2024-11-05 18:24:33
Graph 31 2024-11-05 18:24:33
Graph 32 2024-11-05 18:24:34
Graph 33 2024-1

In [6]:
def prepare_graph(df):
    """Turn an event table into node documents, node labels and an edge list.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per event with the columns ``actorID``, ``actor_type``,
        ``objectID``, ``object`` (the object's type) and ``action``. The two
        type columns must hold one of the letters ``'a'``..``'h'``.

    Returns
    -------
    features : list[list]
        For each node, the actions of every event it takes part in, in row
        order. Nodes are ordered by first appearance.
    feat_labels : list[int]
        Integer type code (0-7) of each node, aligned with ``features``.
    edge_index : list[list[int]]
        ``[sources, targets]`` as positions into ``features``, one entry per
        event (actor -> object).
    mapping : list
        Original node id for each position in ``features``.

    Raises
    ------
    KeyError
        If a required column is missing or a type letter is not in ``'a'``..``'h'``.
    """
    dummies = {'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4, 'f': 5, 'g': 6, 'h': 7}
    nodes, labels, edges = {}, {}, []

    # Iterating the columns in lock-step avoids building a Series per row,
    # which is what makes DataFrame.iterrows slow on large graphs.
    events = zip(df["actorID"], df["actor_type"], df["objectID"], df["object"], df["action"])
    for actor_id, actor_type, object_id, object_type, action in events:
        nodes.setdefault(actor_id, []).append(action)
        labels[actor_id] = dummies[actor_type]

        # The action is recorded for the object side as well, so a self-loop
        # contributes the action twice to the same node.
        nodes.setdefault(object_id, []).append(action)
        if object_id != actor_id:
            labels[object_id] = dummies[object_type]
        # For a self-loop the node keeps the actor type assigned above.

        edges.append((actor_id, object_id))

    # Dicts preserve insertion order, so position == order of first appearance.
    mapping = list(nodes)
    features = [nodes[key] for key in mapping]
    feat_labels = [labels[key] for key in mapping]
    index_map = {key: position for position, key in enumerate(mapping)}

    edge_index = [
        [index_map[source] for source, _ in edges],
        [index_map[target] for _, target in edges],
    ]

    return features, feat_labels, edge_index, mapping

In [7]:
from torch_geometric.nn import GCNConv
from torch_geometric.nn import SAGEConv, GATConv
import torch.nn as nn


class GCN(torch.nn.Module):
    """Two-layer node classifier built from ``SAGEConv`` layers.

    Despite the class name, both layers are ``SAGEConv`` with
    ``normalize=True``. The hidden width is fixed at 32.

    Args:
        in_channel: size of each input node feature vector.
        out_channel: number of output scores per node.
    """

    def __init__(self,in_channel,out_channel):
        super().__init__()
        hidden_channel = 32
        # Attribute names are kept as conv1/conv2 because they are the keys
        # of the saved state_dict.
        self.conv1 = SAGEConv(in_channel, hidden_channel, normalize=True)
        self.conv2 = SAGEConv(hidden_channel, out_channel, normalize=True)

    def forward(self, x, edge_index):
        """Return one row of ``out_channel`` scores per node."""
        hidden = F.relu(self.conv1(x, edge_index))
        # Dropout is only active while the module is in training mode.
        hidden = F.dropout(hidden, p=0.5, training=self.training)
        return self.conv2(hidden, edge_index)

In [8]:
def visualize(h, color):
    """Scatter-plot a 2-D t-SNE projection of ``h``.

    ``h`` is detached, moved to the CPU and converted to a NumPy array before
    being passed to t-SNE. ``color`` is handed to ``plt.scatter`` as ``c`` and
    mapped through the ``Set2`` colormap.
    """
    points = h.detach().cpu().numpy()
    z = TSNE(n_components=2).fit_transform(points)

    plt.figure(figsize=(10,10))
    # The projected axes carry no meaning, so tick marks are hidden.
    plt.xticks([])
    plt.yticks([])

    xs, ys = z[:, 0], z[:, 1]
    plt.scatter(xs, ys, s=70, c=color, cmap="Set2")
    plt.show()

In [9]:
from gensim.models.callbacks import CallbackAny2Vec
import gensim
from gensim.models import Word2Vec
from multiprocessing import Pool
from itertools import compress
from tqdm import tqdm
import time

class EpochSaver(CallbackAny2Vec):
    '''Callback to save model after each epoch.

    Args:
        path: file the model is written to at the end of every epoch. Each
            epoch overwrites the previous save. Defaults to the location the
            rest of the notebook loads the Word2Vec model from.
    '''

    def __init__(self, path='trained_weights/streamspot/streamspot.model'):
        # Number of epochs completed so far.
        self.epoch = 0
        self.path = path

    def on_epoch_end(self, model):
        # Create the target directory on demand so a fresh checkout does not
        # fail on the first save.
        target_dir = os.path.dirname(self.path)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        model.save(self.path)
        self.epoch += 1

In [10]:
class EpochLogger(CallbackAny2Vec):
    '''Callback that prints a line at the start and at the end of every epoch.'''

    def __init__(self):
        # Zero-based number of the epoch currently being reported.
        self.epoch = 0

    def on_epoch_begin(self, model):
        message = "Epoch #{} start".format(self.epoch)
        print(message)

    def on_epoch_end(self, model):
        message = "Epoch #{} end".format(self.epoch)
        print(message)
        # Advance the counter only after the end-of-epoch line was printed.
        self.epoch = self.epoch + 1

In [11]:
# Word2Vec training callbacks: `logger` prints epoch boundaries, `saver` writes
# the model to disk at the end of each epoch.
logger = EpochLogger()
saver = EpochSaver()

In [12]:
if Train_Word2vec:
    # Corpus for Word2Vec: one "sentence" per node, consisting of the actions
    # of all events the node takes part in, collected from graphs 0-49.
    phrases = []
    for i in range(50):
        print(i)
        # The context manager closes each graph file as soon as it is read.
        with open(f"streamspot/{i}.txt") as f:
            data = f.read().split('\n')

        data = [line.split('\t') for line in data]
        df = pd.DataFrame(data, columns=['actorID', 'actor_type', 'objectID', 'object', 'action', 'timestamp'])
        # Short rows (e.g. the empty string after the final newline) are padded
        # with missing values by the constructor and dropped here.
        df = df.dropna()
        docs, labels, edges, mapp = prepare_graph(df)
        # extend() appends in place; `phrases + docs` copied the whole corpus
        # on every iteration.
        phrases.extend(docs)

    word2vec = Word2Vec(sentences=phrases, vector_size=30, window=10, min_count=1, workers=8,epochs=100,callbacks=[saver,logger])

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
Epoch #0 start
Epoch #0 end
Epoch #1 start
Epoch #1 end
Epoch #2 start
Epoch #2 end
Epoch #3 start
Epoch #3 end
Epoch #4 start
Epoch #4 end
Epoch #5 start
Epoch #5 end
Epoch #6 start
Epoch #6 end
Epoch #7 start
Epoch #7 end
Epoch #8 start
Epoch #8 end
Epoch #9 start
Epoch #9 end
Epoch #10 start
Epoch #10 end
Epoch #11 start
Epoch #11 end
Epoch #12 start
Epoch #12 end
Epoch #13 start
Epoch #13 end
Epoch #14 start
Epoch #14 end
Epoch #15 start
Epoch #15 end
Epoch #16 start
Epoch #16 end
Epoch #17 start
Epoch #17 end
Epoch #18 start
Epoch #18 end
Epoch #19 start
Epoch #19 end
Epoch #20 start
Epoch #20 end
Epoch #21 start
Epoch #21 end
Epoch #22 start
Epoch #22 end
Epoch #23 start
Epoch #23 end
Epoch #24 start
Epoch #24 end
Epoch #25 start
Epoch #25 end
Epoch #26 start
Epoch #26 end
Epoch #27 start
Epoch #27 end
Epoch #28 start
Epoch #28 end
Epoch #29 

In [13]:
from sklearn.utils import class_weight
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss

# 30 input channels match the Word2Vec vector_size used above; 8 outputs match
# the eight node-type codes ('a'..'h') produced by prepare_graph.
model = GCN(30,8).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

In [14]:
import math
import torch
import numpy as np
from gensim.models import Word2Vec

class PositionalEncoder:
    """Sinusoidal positional encoding table.

    Builds a ``(max_len, d_model)`` tensor ``pe`` whose even columns hold
    ``sin(position * div_term)`` and whose odd columns hold
    ``cos(position * div_term)``, with
    ``div_term = exp(arange(0, d_model, 2) * (-ln(10000) / d_model))``.

    Args:
        d_model: width of the embeddings to be encoded. Odd widths are
            supported; the last column is then a sine column.
        max_len: number of positions precomputed.
    """

    def __init__(self, d_model, max_len=100000):
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        self.pe = torch.zeros(max_len, d_model)
        self.pe[:, 0::2] = torch.sin(position * div_term)
        # For odd d_model there is one cosine column fewer than sine columns,
        # so div_term is trimmed to match the odd-column slice.
        self.pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])

    def embed(self, x):
        """Add the encoding of positions ``0..x.size(0)-1`` to ``x``.

        ``x`` must have ``d_model`` columns and at most ``max_len`` rows.
        """
        return x + self.pe[:x.size(0)]

def infer(document):
    """Embed one node document as a single vector.

    Each word of ``document`` that is in the Word2Vec vocabulary is looked up,
    the positional encoding is added to the resulting sequence, and the
    sequence is averaged over its rows.

    The model trained in this session (``word2vec``) is used when
    ``Train_Word2vec`` is true, otherwise the model loaded from disk
    (``w2vmodel``).

    Returns:
        A 1-D NumPy array with one entry per Word2Vec dimension. If no word of
        ``document`` is in the vocabulary, a zero vector of that same length is
        returned so the results for all nodes can be stacked into one array.
    """
    vectors = word2vec.wv if Train_Word2vec else w2vmodel.wv
    word_embeddings = [vectors[word] for word in document if word in vectors]

    if not word_embeddings:
        # Must match the embedding width; a different length would make the
        # stacked node-feature array ragged.
        return np.zeros(vectors.vector_size, dtype=np.float32)

    # Stacking first avoids the slow list-of-arrays path of torch.tensor.
    output_embedding = torch.tensor(np.array(word_embeddings), dtype=torch.float)
    # Only documents shorter than the precomputed table get positional encoding.
    if len(document) < encoder.pe.size(0):
        output_embedding = encoder.embed(output_embedding)

    output_embedding = output_embedding.detach().cpu().numpy()
    return np.mean(output_embedding, axis=0)

# Positional encoder of width 30, the same as the Word2Vec vector_size used for training.
encoder = PositionalEncoder(30)
# Word2Vec model read back from disk; infer() uses it when Train_Word2vec is False.
w2vmodel = Word2Vec.load("trained_weights/streamspot/streamspot.model")

In [15]:
from torch_geometric import utils

if Train_Gnn:
    # The loss object holds no per-graph state, so it is created once.
    criterion = CrossEntropyLoss()
    # Make sure the checkpoint directory exists before the first save.
    os.makedirs('trained_weights/streamspot', exist_ok=True)

    # One optimisation step per graph over graphs 0-299.
    for i in range(300):                        ## trained till 300.
    # for i in range(400):
        # The context manager closes each graph file as soon as it is read.
        with open(f"streamspot/{i}.txt") as f:
            data = f.read().split('\n')

        data = [line.split('\t') for line in data]
        df = pd.DataFrame(data, columns=['actorID', 'actor_type', 'objectID', 'object', 'action', 'timestamp'])
        df = df.dropna()
        phrases, labels, edges, mapp = prepare_graph(df)

        # One embedding per node, stacked into a (num_nodes, dim) array.
        nodes = np.array([infer(x) for x in phrases])

        graph = Data(x=torch.tensor(nodes, dtype=torch.float).to(device),
                     y=torch.tensor(labels, dtype=torch.long).to(device),
                     edge_index=torch.tensor(edges, dtype=torch.long).to(device))

        model.train()
        optimizer.zero_grad()
        out = model(graph.x, graph.edge_index)
        loss = criterion(out, graph.y)
        loss.backward()
        optimizer.step()

        # Training accuracy on this graph: share of nodes whose top-scoring
        # class equals the node type.
        pred = out.argmax(dim=1)
        cond = pred == graph.y

        print(cond.sum() / len(graph.y))

        # Checkpoint after every graph so an interrupted run keeps its progress.
        torch.save(model.state_dict(), 'trained_weights/streamspot/lstreamspot.pth')

tensor(0.2143, device='cuda:0')
tensor(0.7181, device='cuda:0')
tensor(0.8576, device='cuda:0')
tensor(0.9327, device='cuda:0')
tensor(0.9367, device='cuda:0')


tensor(0.8932, device='cuda:0')
tensor(0.9618, device='cuda:0')
tensor(0.9449, device='cuda:0')
tensor(0.9622, device='cuda:0')
tensor(0.9710, device='cuda:0')
tensor(0.9760, device='cuda:0')
tensor(0.9669, device='cuda:0')
tensor(0.9722, device='cuda:0')
tensor(0.9726, device='cuda:0')
tensor(0.9771, device='cuda:0')
tensor(0.9789, device='cuda:0')
tensor(0.9738, device='cuda:0')
tensor(0.9780, device='cuda:0')
tensor(0.9778, device='cuda:0')
tensor(0.9762, device='cuda:0')
tensor(0.9688, device='cuda:0')
tensor(0.9788, device='cuda:0')
tensor(0.9824, device='cuda:0')
tensor(0.9804, device='cuda:0')
tensor(0.9810, device='cuda:0')
tensor(0.9811, device='cuda:0')
tensor(0.9819, device='cuda:0')
tensor(0.9816, device='cuda:0')
tensor(0.9795, device='cuda:0')
tensor(0.9800, device='cuda:0')
tensor(0.9830, device='cuda:0')
tensor(0.9806, device='cuda:0')
tensor(0.9821, device='cuda:0')
tensor(0.9832, device='cuda:0')
tensor(0.9839, device='cuda:0')
tensor(0.9805, device='cuda:0')
tensor(0

### Validation

In [16]:
model.load_state_dict(torch.load('trained_weights/streamspot/lstreamspot.pth', map_location=torch.device('cpu')))
model.eval()

# Validation uses graphs 400-449. Graphs 300-399 are skipped here because the
# testing section scores them as the attack set, so they are used neither for
# training (0-299) nor for validation.
# Gradients are not needed for scoring; no_grad avoids building the autograd graph.
with torch.no_grad():
    for i in range(400, 450):
        # The context manager closes each graph file as soon as it is read.
        with open(f"streamspot/{i}.txt") as f:
            data = f.read().split('\n')

        data = [line.split('\t') for line in data]
        df = pd.DataFrame(data, columns=['actorID', 'actor_type', 'objectID', 'object', 'action', 'timestamp'])
        df = df.dropna()

        phrases, labels, edges, mapp = prepare_graph(df)

        nodes = np.array([infer(x) for x in phrases])

        graph = Data(x=torch.tensor(nodes, dtype=torch.float).to(device),
                     y=torch.tensor(labels, dtype=torch.long).to(device),
                     edge_index=torch.tensor(edges, dtype=torch.long).to(device))

        out = model(graph.x, graph.edge_index)

        # Nodes whose predicted type differs from their actual type.
        pred = out.argmax(dim=1)
        cond = ~(pred == graph.y)

        # Misclassified node count and its percentage of all nodes in the graph.
        print(cond.sum().item(), (cond.sum().item() / len(cond))*100)

97 1.1112384007331884
97 1.1002722323049001
78 0.8956252152945229
97 1.097285067873303
92 1.046048891415577
84 0.9573740597219056
86 0.9921550530687587
97 1.1167395809348377
90 1.0359116022099446
98 1.121666475907062
84 0.9610983981693364
89 1.0350040702407257
97 1.115840331301047
90 1.0251737099897482
87 1.0069444444444444
100 1.1328877308258751
91 1.0476629058254663
97 1.101396616327921
98 1.104349785891368
93 1.0556186152099887
94 1.0670904756499036
83 0.9574345368554621
84 0.9535702122828924
83 0.9492223238792316
97 1.0875658706132976
90 1.0184451737014826
87 0.9953094611600504
91 1.0280162675101672
87 0.9879627526686351
86 0.9708737864077669
97 1.0919734323989643
77 0.8752983971808572
84 0.9496890898812888
84 0.9484023935869933
74 0.8413871517907903
90 1.0182147301730966
89 0.9997753313862054
79 0.9002849002849003
84 0.9630818619582664
105 1.1651131824234355
80 0.9092975676290065
83 0.9400838147015518
91 1.0218978102189782
89 1.0037216645990752
109 1.2308039747064137
92 1.02850754

### Testing

In [17]:
# Decision threshold on the number of misclassified nodes per graph: the test
# cells count a graph as benign when the count is <= thresh and as an attack
# when it is > thresh.
thresh = 200
# Running totals of correctly classified benign / attack graphs.
correct_benign = 0
correct_attack = 0

In [18]:
# Load the saved GNN weights again and switch to evaluation mode (dropout in
# GCN.forward is disabled) before scoring the test graphs.
model.load_state_dict(torch.load(f'trained_weights/streamspot/lstreamspot.pth',map_location=torch.device('cpu')))
model.eval()

GCN(
  (conv1): SAGEConv(30, 32, aggr=mean)
  (conv2): SAGEConv(32, 8, aggr=mean)
)

In [19]:
# Start from zero so that re-running this cell does not add to an earlier count.
correct_benign = 0

# Graphs 450-599 are scored as the benign test set: a graph counts as correctly
# classified when at most `thresh` of its nodes are misclassified.
# Gradients are not needed for scoring; no_grad avoids building the autograd graph.
with torch.no_grad():
    for i in range(450, 600):
        # The context manager closes each graph file as soon as it is read.
        with open(f"streamspot/{i}.txt") as f:
            data = f.read().split('\n')

        data = [line.split('\t') for line in data]
        df = pd.DataFrame(data, columns=['actorID', 'actor_type', 'objectID', 'object', 'action', 'timestamp'])
        df = df.dropna()

        phrases, labels, edges, mapp = prepare_graph(df)

        nodes = np.array([infer(x) for x in phrases])

        graph = Data(x=torch.tensor(nodes, dtype=torch.float).to(device),
                     y=torch.tensor(labels, dtype=torch.long).to(device),
                     edge_index=torch.tensor(edges, dtype=torch.long).to(device))

        out = model(graph.x, graph.edge_index)

        # Nodes whose predicted type differs from their actual type.
        pred = out.argmax(dim=1)
        cond = ~(pred == graph.y)

        if cond.sum() <= thresh:
            correct_benign = correct_benign + 1

        # Misclassified node count and its percentage of all nodes in the graph.
        print(cond.sum().item(), (cond.sum().item() / len(cond))*100)

97 1.0762232331077333
74 0.840718018632129
91 1.023737203284959
89 1.0041746586934446
88 0.9994321408290744
91 1.0306943028655566
94 1.0576057605760576
83 0.9344742175185768
93 1.0476512335248396
89 0.9951917700995192
89 1.0098717803245207
82 0.9257168661097314
79 0.8914466260437826
98 1.1009998876530727
92 1.0254123941150244
75 0.853825136612022
82 0.9272871197557391
81 0.9177430319510537
100 1.1189437171310284
87 0.9889735136978516
80 0.9012053621719048
97 1.0923423423423424
78 0.8843537414965987
84 0.9509792822370655
89 0.9998876530726885
91 1.0171007041466413
77 0.8740068104426788
84 0.9572649572649573
87 0.9752269924896312
90 1.0102143899427545
94 1.0532212885154062
75 0.8461191335740073
88 0.9915492957746479
88 0.9916610322289835
100 1.1273957158962795
80 0.9136592051164915
83 0.9380650994575045
78 0.8845543207076435
82 0.929810636126545
82 0.9288627095604893
94 1.0544026920919798
92 1.0349870626617166
96 1.079865016872891
78 0.88515660463005
82 0.9306548632391329
99 1.1136107986

In [20]:
# Start from zero so that re-running this cell does not add to an earlier count.
correct_attack = 0

# Graphs 300-399 are scored as the attack test set: a graph counts as detected
# when more than `thresh` of its nodes are misclassified.
# Gradients are not needed for scoring; no_grad avoids building the autograd graph.
with torch.no_grad():
    for i in range(300, 400):
        # The context manager closes each graph file as soon as it is read.
        with open(f"streamspot/{i}.txt") as f:
            data = f.read().split('\n')

        data = [line.split('\t') for line in data]
        df = pd.DataFrame(data, columns=['actorID', 'actor_type', 'objectID', 'object', 'action', 'timestamp'])
        df = df.dropna()

        phrases, labels, edges, mapp = prepare_graph(df)

        nodes = np.array([infer(x) for x in phrases])

        graph = Data(x=torch.tensor(nodes, dtype=torch.float).to(device),
                     y=torch.tensor(labels, dtype=torch.long).to(device),
                     edge_index=torch.tensor(edges, dtype=torch.long).to(device))

        out = model(graph.x, graph.edge_index)

        # Nodes whose predicted type differs from their actual type.
        pred = out.argmax(dim=1)
        cond = ~(pred == graph.y)

        if cond.sum() > thresh:
            correct_attack = correct_attack + 1

        # Misclassified node count and its percentage of all nodes in the graph.
        print(cond.sum().item(), (cond.sum().item() / len(cond))*100)

1072 11.995076647644623
1076 12.031756681203177
1070 11.963327370304116
1076 12.026377556722924
1073 12.000894754501735
1075 12.017887087758524
1073 11.999552672780139
1073 11.995528228060369
1072 11.987028961198703
1073 12.008953553441522
1074 12.013422818791947
1075 12.023263617045073
1075 12.025953686094642
1076 12.02503352704515
1074 12.004023695093327
1071 11.98791134989926
1071 11.973169368362214
1074 12.006707657909446
1071 11.975847031197585
1073 11.995528228060369
1072 11.984348798211292
1075 12.023263617045073
1076 12.031756681203177
1072 11.985688729874775
1074 12.001340931947704
1075 12.025953686094642
1070 11.96600313129054
1076 12.033102214269737
1073 12.008953553441522
1078 12.054120541205412
1073 12.002237136465324
48 0.6111535523300229
1073 12.000894754501735
48 0.6066734074823054
47 0.5941094678296044
1072 11.984348798211292
1076 12.034448048316742
1071 11.973169368362214
1073 11.998210891199822
1076 12.02503352704515
1073 12.008953553441522
1071 11.978525891958395
10

In [21]:
# Sizes of the two test sets; they equal the lengths of the ranges scored in
# the testing cells (range(300, 400) and range(450, 600)).
TOTAL_ATTACKS = 100
TOTAL_BENIGN = 150

def calculate_metrics(correct_attack, correct_benign, total_attacks=None, total_benign=None):
    """Print the confusion matrix, precision, recall and F-score.

    Attack graphs are the positive class.

    Args:
        correct_attack: number of attack graphs flagged as attacks (TP).
        correct_benign: number of benign graphs classified as benign (TN).
        total_attacks: number of attack graphs evaluated; defaults to
            ``TOTAL_ATTACKS``.
        total_benign: number of benign graphs evaluated; defaults to
            ``TOTAL_BENIGN``.

    Returns:
        None. All results are printed. A ratio whose denominator is zero is
        reported as 0.
    """
    if total_attacks is None:
        total_attacks = TOTAL_ATTACKS
    if total_benign is None:
        total_benign = TOTAL_BENIGN

    TP = correct_attack
    FP = total_benign - correct_benign
    TN = correct_benign
    FN = total_attacks - correct_attack

    print(f"Number of True Positives: {TP}")
    print(f"Number of False Positives: {FP}")
    print(f"Number of False Negatives: {FN}")
    print(f"Number of True Negatives: {TN}\n")

    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")

    fscore = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    print(f"Fscore: {fscore}\n")

# Report graph-level detection metrics from the counts gathered in the two test cells.
calculate_metrics(correct_attack, correct_benign)

Number of True Positives: 95
Number of False Positives: 0
Number of False Negatives: 5
Number of True Negatives: 150

Precision: 1.0
Recall: 0.95
Fscore: 0.9743589743589743

