<a href="https://colab.research.google.com/github/aSafarpoor/OSN_FAD/blob/main/CLS/Twibot20_test_with_simple_GCN_and_GAT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install torch_geometric

Collecting torch_geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m61.4/63.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch_geometric
Successfully installed torch_geometric-2.6.1


In [2]:
import networkx as nx
import numpy as np
from tqdm import tqdm
from matplotlib import pyplot as plt
import random


import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, GATConv, GINConv
from sklearn.metrics import roc_curve,accuracy_score, roc_auc_score, confusion_matrix
import torch.nn as nn


SEED = 10
random.seed(SEED)

#read files

In [6]:
import tarfile

file_name = "Twibot20.tar.xz"

with tarfile.open(file_name, "r:xz") as tar:
    tar.extractall()
    tar.list()

?r-xr-xr-x 0/0          0 2024-11-19 03:33:21 Twibot20/ 
?rw-rw-rw- 0/0      67324 2024-11-19 03:33:06 Twibot20/benigns.txt 
?rw-rw-rw- 0/0    6649660 2024-11-18 00:45:41 Twibot20/edges.txt 
?rw-rw-rw- 0/0      95339 2024-11-19 03:33:11 Twibot20/sybils.txt 
?rw-rw-rw- 0/0    2410572 2024-11-18 09:16:36 Twibot20/testNodes.txt 
?rw-rw-rw- 0/0     522736 2024-11-18 09:16:41 Twibot20/trainNodes.txt 


In [13]:
file_name = "Twibot20/benigns.txt"
with open(file_name, "r") as file:
    lines = file.readlines()
benigns = [line.strip() for line in lines]

file_name = "Twibot20/sybils.txt"
with open(file_name, "r") as file:
    lines = file.readlines()
sybils = [line.strip() for line in lines]

file_name = "Twibot20/trainNodes.txt"
with open(file_name, "r") as file:
    lines = file.readlines()
trainNodes = [line.strip() for line in lines]

file_name = "Twibot20/testNodes.txt"
with open(file_name, "r") as file:
    lines = file.readlines()
testNodes = [line.strip() for line in lines]

edges = []
file_name = "Twibot20/edges.txt"

with open(file_name, "r") as file:
    edges = [tuple( line.strip().split()) for line in file]

In [77]:
def renumber_nodes(trainNodes, testNodes, benigns, sybils, edges):
    train_mapping = {node: idx for idx, node in enumerate(trainNodes)}
    test_start_idx = len(trainNodes)
    test_mapping = {node: idx for idx, node in enumerate(testNodes, start=test_start_idx)}
    node_mapping = {**train_mapping, **test_mapping}
    updated_trainNodes = [node_mapping[node] for node in trainNodes]
    updated_testNodes = [node_mapping[node] for node in testNodes]
    updated_benigns = [node_mapping[node] for node in benigns if node in node_mapping]
    updated_sybils = [node_mapping[node] for node in sybils if node in node_mapping]
    updated_edges = [(node_mapping[src], node_mapping[dst]) for src, dst in edges if src in node_mapping and dst in node_mapping]
    return updated_trainNodes, updated_testNodes, updated_benigns, updated_sybils, updated_edges

trainNodes, testNodes, benigns, sybils, edges = renumber_nodes(trainNodes, testNodes, benigns, sybils, edges)

# Models

In [173]:
class GCN(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers, dropout=True):
        super(GCN, self).__init__()
        self.dropout = dropout
        self.convs = torch.nn.ModuleList()
        self.num_classes = output_dim
        for i in range(num_layers):
            if i == 0:  # First layer
                self.convs.append(GCNConv(input_dim, hidden_dim, bias=True))
            elif i == num_layers - 1:  # Last layer
                self.convs.append(GCNConv(hidden_dim, output_dim, bias=True))
            else:  # Middle layers
                self.convs.append(GCNConv(hidden_dim, hidden_dim, bias=True))

    def forward(self, x, edge_index):
        h = x
        i = 0
        for conv in self.convs:
            if self.dropout:
                h = F.dropout(h, p=0.5, training=self.training)
            h = conv(h, edge_index)
            if i < len(self.convs) - 1:
                h = F.tanh(h)
            i += 1

        if self.num_classes == 1:
            return F.sigmoid(h)
        else:
            return F.softmax(h, dim=1)

In [174]:
class GAT(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers, heads=1, dropout=True):
        super(GAT, self).__init__()
        self.dropout = dropout
        self.convs = torch.nn.ModuleList()
        self.num_classes = output_dim
        self.heads = heads

        for i in range(num_layers):
            if i == 0:  # First layer
                self.convs.append(GATConv(input_dim, hidden_dim, heads=heads, concat=True, bias=True))
            elif i == num_layers - 1:  # Last layer
                self.convs.append(GATConv(hidden_dim * heads, output_dim, heads=1, concat=False, bias=True))
            else:  # Middle layers
                self.convs.append(GATConv(hidden_dim * heads, hidden_dim, heads=heads, concat=True, bias=True))

    def forward(self, x, edge_index):
        h = x
        for i, conv in enumerate(self.convs):
            if self.dropout:
                h = F.dropout(h, p=0.5, training=self.training)
            h = conv(h, edge_index)
            if i < len(self.convs) - 1:  # Apply activation only for hidden layers
                h = F.elu(h)  # ELU is often preferred for GAT
        if self.num_classes == 1:
            return F.sigmoid(h)
        else:
            return F.log_softmax(h, dim=1)

In [175]:
def create_datasets(all_nodes, train_nodes, test_nodes, benign_nodes, sybil_nodes, ratio_of_known_trains, ratio_of_known_tests,SEED):

    random.seed(SEED)

    labels = {node: 0.5 for node in all_nodes}  # Unknown nodes default to 0.5
    for node in benign_nodes:
        labels[node] = 0
    for node in sybil_nodes:
        labels[node] = 1

    train_x = [labels[node] for node in train_nodes]
    for i in range(len(train_x)):
        if train_x[i] != 0.5:
            if random.random()>ratio_of_known_trains:
                train_x[i] = 0.5
    train_y = [labels[node] for node in train_nodes]


    test_x = [labels[node] for node in test_nodes]
    for i in range(len(test_x)):
        if test_x[i] != 0.5:
            if random.random()>ratio_of_known_tests:
                test_x[i] = 0.5
    test_y = [labels[node] for node in test_nodes]

    mask_test = [i != 0.5 for i in test_y]
    mask_test_sybils = [i == 1 for i in test_y]


    train_x = torch.tensor(train_x, dtype=torch.float).reshape(-1,1)
    train_y = torch.tensor(train_y, dtype=torch.float).reshape(-1,1)
    test_x = torch.tensor(test_x, dtype=torch.float).reshape(-1,1)
    test_y = torch.tensor(test_y, dtype=torch.float).reshape(-1,1)

    return train_x, test_x, train_y, test_y, mask_test, mask_test_sybils

In [176]:
train_x, test_x, train_y, test_y, mask_test, mask_test_sybils = create_datasets(all_nodes = trainNodes + testNodes,
                                                                                train_nodes = trainNodes,
                                                                                test_nodes = testNodes,
                                                                                benign_nodes = benigns,
                                                                                sybil_nodes = sybils,
                                                                                ratio_of_known_trains = 0.5,
                                                                                ratio_of_known_tests = 0.5,
                                                                                SEED = 1)

In [177]:
g = nx.Graph()
g.add_edges_from(edges)


temp = list(g.subgraph(trainNodes).edges())
trainEdges = [[i[0] for i in temp],[i[1] for i in temp[:]]]
temp = list(g.subgraph(testNodes).edges())
testEdges = [[i[0] for i in temp],[i[1] for i in temp[:]]]

trainEdges = torch.tensor(trainEdges, dtype=torch.long)
testEdges = torch.tensor(testEdges, dtype=torch.long)


In [178]:
def train_model(model, train_x, train_y, train_edge_index, epochs=100, lr=0.01):

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    # loss_fn = torch.nn.CrossEntropyLoss()
    loss_fn = torch.nn.MSELoss()


    model.train()
    for epoch in tqdm(range(epochs)):
        optimizer.zero_grad()
        output = model(train_x, train_edge_index)
        loss = loss_fn(output, train_y)
        loss.backward()
        optimizer.step()

        if (epoch + 1) % 20 == 0 or epoch == 0:
            print(f'Epoch {epoch + 1}/{epochs}, Loss: {loss.item():.4f}')

    return model



In [203]:
def test_model(model, test_x, test_y, test_edge_index, mask=None, threshold=None):
    model.eval()
    with torch.no_grad():
        output = model(test_x, test_edge_index)

        # Apply mask if provided
        if mask is not None:
            output = output[mask]
            test_y = test_y[mask]

        # If output has shape [batch_size, 1], use the single column
        output = output.squeeze(1)  # Converts shape [batch_size, 1] -> [batch_size]

        # Compute AUC and find optimal threshold
        try:
            auc_score = roc_auc_score(test_y.cpu(), output.cpu())  # Adjust for single-class output
            # Compute ROC curve
            fpr, tpr, thresholds = roc_curve(test_y.cpu(), output.cpu())
            # Find optimal threshold using Youden's J statistic
            j_scores = tpr - fpr
            optimal_idx = j_scores.argmax()
            if threshold is None:
                threshold = thresholds[optimal_idx]

        except Exception as e:
            auc_score = None
            if threshold is None:
                threshold = 0.5  # Default threshold if AUC computation fails

        # Apply the threshold to classify predictions
        threshold_predictions = (output >= threshold).int().cpu()

        # Compute accuracy using the threshold
        accuracy_with_threshold = accuracy_score(test_y.cpu(), threshold_predictions)

        # Compute confusion matrix
        conf_matrix = confusion_matrix(test_y.cpu(), threshold_predictions)
        normalized_cm = conf_matrix.astype('float') / len(test_y)

        # Prepare metrics
        metrics = {
            "accuracy": accuracy_with_threshold,
            "auc": auc_score,
            "optimal_threshold": threshold,
            "confusion_matrix": normalized_cm
        }

        # Print metrics
        try:
            print(f"AUC: {auc_score:.3f}")
        except:
            pass
        # print(f"Optimal Threshold: {threshold}")
        print(f"ACC: {accuracy_with_threshold:.3f}")
        print("Confusion Matrix (Normalized):")
        print(f"                      Predicted Negative                       Predicted Positive")
        print(f"Actual Negative       {normalized_cm[0, 0]:.3f}                {normalized_cm[0, 1]:.3f}")
        print(f"Actual Positive       {normalized_cm[1, 0]:.3f}                {normalized_cm[1, 1]:.3f}")

        return metrics, threshold


In [204]:
input_dim = 1
hidden_dim = 16
output_dim = 1
model = GCN(input_dim, hidden_dim, output_dim, num_layers=3)

model = train_model(model, train_x, train_y, trainEdges, epochs=100, lr=0.01)



print("based on auc th")
metric,th = test_model(model, test_x, test_y, testEdges-len(train_x), mask = mask_test)
metric,th = test_model(model, test_x, test_y, testEdges-len(train_x), mask = mask_test_sybils, threshold = th)
print("\n\n")
print("based on 0.5")
metric,th = test_model(model, test_x, test_y, testEdges-len(train_x), mask = mask_test, threshold = 0.5)
metric,th = test_model(model, test_x, test_y, testEdges-len(train_x), mask = mask_test_sybils, threshold = 0.5)

  4%|▍         | 4/100 [00:00<00:05, 16.41it/s]

Epoch 1/100, Loss: 0.0221


 22%|██▏       | 22/100 [00:01<00:04, 16.24it/s]

Epoch 20/100, Loss: 0.0172


 42%|████▏     | 42/100 [00:02<00:03, 16.94it/s]

Epoch 40/100, Loss: 0.0170


 62%|██████▏   | 62/100 [00:03<00:02, 17.12it/s]

Epoch 60/100, Loss: 0.0169


 82%|████████▏ | 82/100 [00:04<00:01, 16.59it/s]

Epoch 80/100, Loss: 0.0169


100%|██████████| 100/100 [00:06<00:00, 16.26it/s]


Epoch 100/100, Loss: 0.0169
based on auc th
AUC: 0.661
ACC: 0.613
Confusion Matrix (Normalized):
                      Predicted Negative                       Predicted Positive
Actual Negative       0.276                0.143
Actual Positive       0.244                0.337
ACC: 0.580
Confusion Matrix (Normalized):
                      Predicted Negative                       Predicted Positive
Actual Negative       0.000                0.000
Actual Positive       0.420                0.580



based on 0.5
AUC: 0.661
ACC: 0.463
Confusion Matrix (Normalized):
                      Predicted Negative                       Predicted Positive
Actual Negative       0.374                0.045
Actual Positive       0.492                0.089
ACC: 0.154
Confusion Matrix (Normalized):
                      Predicted Negative                       Predicted Positive
Actual Negative       0.000                0.000
Actual Positive       0.846                0.154


In [206]:
input_dim = 1
hidden_dim = 16
output_dim = 1
model = GAT(input_dim, hidden_dim, output_dim, num_layers=3, heads = 4)

model = train_model(model, train_x, train_y, trainEdges, epochs=100, lr=0.01)


print("based on auc th")

metric,th = test_model(model, test_x, test_y, testEdges-len(train_x), mask = mask_test)
metric,th = test_model(model, test_x, test_y, testEdges-len(train_x), mask = mask_test_sybils, threshold = th)
print("\n\n")
print("based on 0.5")
metric,th = test_model(model, test_x, test_y, testEdges-len(train_x), mask = mask_test, threshold = 0.5)
metric,th = test_model(model, test_x, test_y, testEdges-len(train_x), mask = mask_test_sybils, threshold = 0.5)

  1%|          | 1/100 [00:00<00:38,  2.59it/s]

Epoch 1/100, Loss: 0.0199


 20%|██        | 20/100 [00:06<00:26,  3.02it/s]

Epoch 20/100, Loss: 0.0177


 40%|████      | 40/100 [00:13<00:17,  3.43it/s]

Epoch 40/100, Loss: 0.0170


 60%|██████    | 60/100 [00:18<00:11,  3.48it/s]

Epoch 60/100, Loss: 0.0170


 80%|████████  | 80/100 [00:25<00:06,  2.93it/s]

Epoch 80/100, Loss: 0.0169


100%|██████████| 100/100 [00:31<00:00,  3.15it/s]

Epoch 100/100, Loss: 0.0169
based on auc th





AUC: 0.745
ACC: 0.660
Confusion Matrix (Normalized):
                      Predicted Negative                       Predicted Positive
Actual Negative       0.321                0.097
Actual Positive       0.243                0.339
ACC: 0.582
Confusion Matrix (Normalized):
                      Predicted Negative                       Predicted Positive
Actual Negative       0.000                0.000
Actual Positive       0.418                0.582



based on 0.5
AUC: 0.745
ACC: 0.496
Confusion Matrix (Normalized):
                      Predicted Negative                       Predicted Positive
Actual Negative       0.412                0.006
Actual Positive       0.498                0.084
ACC: 0.144
Confusion Matrix (Normalized):
                      Predicted Negative                       Predicted Positive
Actual Negative       0.000                0.000
Actual Positive       0.856                0.144


In [207]:
input_dim = 1
hidden_dim = 8
output_dim = 1
model = GAT(input_dim, hidden_dim, output_dim, num_layers=4, heads = 4)

model = train_model(model, train_x, train_y, trainEdges, epochs=100, lr=0.01)


print("based on auc th")

metric,th = test_model(model, test_x, test_y, testEdges-len(train_x), mask = mask_test)
metric,th = test_model(model, test_x, test_y, testEdges-len(train_x), mask = mask_test_sybils, threshold = th)
print("\n\n")
print("based on 0.5")
metric,th = test_model(model, test_x, test_y, testEdges-len(train_x), mask = mask_test, threshold = 0.5)
metric,th = test_model(model, test_x, test_y, testEdges-len(train_x), mask = mask_test_sybils, threshold = 0.5)

  1%|          | 1/100 [00:00<00:53,  1.84it/s]

Epoch 1/100, Loss: 0.0232


 20%|██        | 20/100 [00:10<00:46,  1.72it/s]

Epoch 20/100, Loss: 0.0172


 40%|████      | 40/100 [00:17<00:18,  3.18it/s]

Epoch 40/100, Loss: 0.0170


 60%|██████    | 60/100 [00:24<00:13,  2.88it/s]

Epoch 60/100, Loss: 0.0170


 80%|████████  | 80/100 [00:32<00:07,  2.82it/s]

Epoch 80/100, Loss: 0.0169


100%|██████████| 100/100 [00:44<00:00,  2.25it/s]

Epoch 100/100, Loss: 0.0169
based on auc th





AUC: 0.720
ACC: 0.647
Confusion Matrix (Normalized):
                      Predicted Negative                       Predicted Positive
Actual Negative       0.308                0.110
Actual Positive       0.243                0.339
ACC: 0.582
Confusion Matrix (Normalized):
                      Predicted Negative                       Predicted Positive
Actual Negative       0.000                0.000
Actual Positive       0.418                0.582



based on 0.5
AUC: 0.720
ACC: 0.418
Confusion Matrix (Normalized):
                      Predicted Negative                       Predicted Positive
Actual Negative       0.418                0.000
Actual Positive       0.582                0.000
ACC: 0.000
Confusion Matrix (Normalized):
                      Predicted Negative                       Predicted Positive
Actual Negative       0.000                0.000
Actual Positive       1.000                0.000


In [209]:
input_dim = 1
hidden_dim = 8
output_dim = 1
model = GAT(input_dim, hidden_dim, output_dim, num_layers=4, heads = 4)

model = train_model(model, train_x, train_y, trainEdges, epochs=500, lr=0.01)


print("based on auc th")

metric,th = test_model(model, test_x, test_y, testEdges-len(train_x), mask = mask_test)
metric,th = test_model(model, test_x, test_y, testEdges-len(train_x), mask = mask_test_sybils, threshold = th)
print("\n\n")
print("based on 0.5")
metric,th = test_model(model, test_x, test_y, testEdges-len(train_x), mask = mask_test, threshold = 0.5)
metric,th = test_model(model, test_x, test_y, testEdges-len(train_x), mask = mask_test_sybils, threshold = 0.5)

  0%|          | 1/500 [00:00<06:19,  1.32it/s]

Epoch 1/500, Loss: 0.0293


  4%|▍         | 20/500 [00:08<02:35,  3.09it/s]

Epoch 20/500, Loss: 0.0177


  8%|▊         | 40/500 [00:15<03:16,  2.34it/s]

Epoch 40/500, Loss: 0.0171


 12%|█▏        | 60/500 [00:22<02:21,  3.12it/s]

Epoch 60/500, Loss: 0.0170


 16%|█▌        | 80/500 [00:29<03:01,  2.31it/s]

Epoch 80/500, Loss: 0.0170


 20%|██        | 100/500 [00:35<02:06,  3.16it/s]

Epoch 100/500, Loss: 0.0170


 24%|██▍       | 120/500 [00:42<02:49,  2.24it/s]

Epoch 120/500, Loss: 0.0169


 28%|██▊       | 140/500 [00:49<01:54,  3.14it/s]

Epoch 140/500, Loss: 0.0169


 32%|███▏      | 160/500 [00:56<02:29,  2.27it/s]

Epoch 160/500, Loss: 0.0169


 36%|███▌      | 180/500 [01:03<01:42,  3.13it/s]

Epoch 180/500, Loss: 0.0169


 40%|████      | 200/500 [01:10<02:10,  2.31it/s]

Epoch 200/500, Loss: 0.0169


 44%|████▍     | 220/500 [01:17<01:27,  3.20it/s]

Epoch 220/500, Loss: 0.0169


 48%|████▊     | 240/500 [01:24<01:59,  2.18it/s]

Epoch 240/500, Loss: 0.0169


 52%|█████▏    | 260/500 [01:31<01:17,  3.10it/s]

Epoch 260/500, Loss: 0.0169


 56%|█████▌    | 280/500 [01:38<01:37,  2.27it/s]

Epoch 280/500, Loss: 0.0169


 60%|██████    | 300/500 [01:44<01:01,  3.24it/s]

Epoch 300/500, Loss: 0.0169


 64%|██████▍   | 320/500 [01:51<01:18,  2.30it/s]

Epoch 320/500, Loss: 0.0169


 68%|██████▊   | 340/500 [01:58<00:49,  3.26it/s]

Epoch 340/500, Loss: 0.0169


 72%|███████▏  | 360/500 [02:05<01:00,  2.31it/s]

Epoch 360/500, Loss: 0.0169


 76%|███████▌  | 380/500 [02:11<00:37,  3.18it/s]

Epoch 380/500, Loss: 0.0169


 80%|████████  | 400/500 [02:18<00:43,  2.27it/s]

Epoch 400/500, Loss: 0.0169


 84%|████████▍ | 420/500 [02:25<00:24,  3.26it/s]

Epoch 420/500, Loss: 0.0169


 88%|████████▊ | 440/500 [02:32<00:26,  2.26it/s]

Epoch 440/500, Loss: 0.0169


 92%|█████████▏| 460/500 [02:39<00:12,  3.11it/s]

Epoch 460/500, Loss: 0.0169


 96%|█████████▌| 480/500 [02:46<00:08,  2.30it/s]

Epoch 480/500, Loss: 0.0169


100%|██████████| 500/500 [02:52<00:00,  2.90it/s]

Epoch 500/500, Loss: 0.0169
based on auc th





AUC: 0.720
ACC: 0.644
Confusion Matrix (Normalized):
                      Predicted Negative                       Predicted Positive
Actual Negative       0.312                0.106
Actual Positive       0.249                0.332
ACC: 0.571
Confusion Matrix (Normalized):
                      Predicted Negative                       Predicted Positive
Actual Negative       0.000                0.000
Actual Positive       0.429                0.571



based on 0.5
AUC: 0.720
ACC: 0.496
Confusion Matrix (Normalized):
                      Predicted Negative                       Predicted Positive
Actual Negative       0.418                0.000
Actual Positive       0.504                0.078
ACC: 0.134
Confusion Matrix (Normalized):
                      Predicted Negative                       Predicted Positive
Actual Negative       0.000                0.000
Actual Positive       0.866                0.134
