<a href="https://colab.research.google.com/github/aSafarpoor/OSN_FAD/blob/main/Twitter_(heeb)_test_with_simple_GCN_and_GAT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install torch_geometric

Collecting torch_geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m61.4/63.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m970.5 kB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch_geometric
Successfully installed torch_geometric-2.6.1


In [2]:
import networkx as nx
import numpy as np
from tqdm import tqdm
from matplotlib import pyplot as plt
import random


import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, GATConv, GINConv
from sklearn.metrics import roc_curve,accuracy_score, roc_auc_score, confusion_matrix
import torch.nn as nn


SEED = 10
random.seed(SEED)

#read files

In [4]:
# import tarfile

# file_name = "Cresci.tar.xz"

# with tarfile.open(file_name, "r:xz") as tar:
#     tar.extractall()
#     tar.list()

In [38]:
with open("test.txt", "r") as file:
    lines = file.readlines()
lines = [line.split() for line in lines]
btest = lines[0][:]
stest = lines[1][:]

with open("train.txt", "r") as file:
    lines = file.readlines()
lines = [line.split() for line in lines]
btrain = lines[0][:]
strain = lines[1][:]

benigns = btest[:]+btrain[:]
sybils = stest[:]+strain[:]

trainNodes = btrain[:]+strain[:]
testNodes = btest[:]+stest[:]


edges = []
file_name = "graph.txt"
with open(file_name, "r") as file:
    edges = [tuple( line.strip().split()) for line in file]



In [39]:
len(btest),len(stest)

(158377, 81263)

In [41]:
def renumber_nodes(trainNodes, testNodes, benigns, sybils, edges):
    train_mapping = {node: idx for idx, node in enumerate(trainNodes)}
    test_start_idx = len(trainNodes)
    test_mapping = {node: idx for idx, node in enumerate(testNodes, start=test_start_idx)}
    node_mapping = {**train_mapping, **test_mapping}
    updated_trainNodes = [node_mapping[node] for node in trainNodes]
    updated_testNodes = [node_mapping[node] for node in testNodes]
    updated_benigns = [node_mapping[node] for node in benigns if node in node_mapping]
    updated_sybils = [node_mapping[node] for node in sybils if node in node_mapping]
    updated_edges = [(node_mapping[src], node_mapping[dst]) for src, dst in edges if src in node_mapping and dst in node_mapping]
    return updated_trainNodes, updated_testNodes, updated_benigns, updated_sybils, updated_edges

trainNodes, testNodes, benigns, sybils, edges = renumber_nodes(trainNodes, testNodes, benigns, sybils, edges)

# Models

In [42]:
min(trainNodes),max(trainNodes)

(0, 29999)

In [43]:
class GCN(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers, dropout=True):
        super(GCN, self).__init__()
        self.dropout = dropout
        self.convs = torch.nn.ModuleList()
        self.num_classes = output_dim
        for i in range(num_layers):
            if i == 0:  # First layer
                self.convs.append(GCNConv(input_dim, hidden_dim, bias=True))
            elif i == num_layers - 1:  # Last layer
                self.convs.append(GCNConv(hidden_dim, output_dim, bias=True))
            else:  # Middle layers
                self.convs.append(GCNConv(hidden_dim, hidden_dim, bias=True))

    def forward(self, x, edge_index):
        h = x
        i = 0
        for conv in self.convs:
            if self.dropout:
                h = F.dropout(h, p=0.5, training=self.training)
            h = conv(h, edge_index)
            if i < len(self.convs) - 1:
                h = F.tanh(h)
            i += 1

        if self.num_classes == 1:
            return F.sigmoid(h)
        else:
            return F.softmax(h, dim=1)

In [44]:
class GAT(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers, heads=1, dropout=True):
        super(GAT, self).__init__()
        self.dropout = dropout
        self.convs = torch.nn.ModuleList()
        self.num_classes = output_dim
        self.heads = heads

        for i in range(num_layers):
            if i == 0:  # First layer
                self.convs.append(GATConv(input_dim, hidden_dim, heads=heads, concat=True, bias=True))
            elif i == num_layers - 1:  # Last layer
                self.convs.append(GATConv(hidden_dim * heads, output_dim, heads=1, concat=False, bias=True))
            else:  # Middle layers
                self.convs.append(GATConv(hidden_dim * heads, hidden_dim, heads=heads, concat=True, bias=True))

    def forward(self, x, edge_index):
        h = x
        for i, conv in enumerate(self.convs):
            if self.dropout:
                h = F.dropout(h, p=0.5, training=self.training)
            h = conv(h, edge_index)
            if i < len(self.convs) - 1:  # Apply activation only for hidden layers
                h = F.elu(h)  # ELU is often preferred for GAT
        if self.num_classes == 1:
            return F.sigmoid(h)
        else:
            return F.log_softmax(h, dim=1)

In [45]:
def create_datasets(all_nodes, train_nodes, test_nodes, benign_nodes, sybil_nodes, ratio_of_known_trains, ratio_of_known_tests,SEED):

    random.seed(SEED)

    labels = {node: 0.5 for node in all_nodes}  # Unknown nodes default to 0.5
    for node in benign_nodes:
        labels[node] = 0
    for node in sybil_nodes:
        labels[node] = 1

    train_x = [labels[node] for node in train_nodes]
    for i in range(len(train_x)):
        if train_x[i] != 0.5:
            if random.random()>ratio_of_known_trains:
                train_x[i] = 0.5
    train_y = [labels[node] for node in train_nodes]


    test_x = [labels[node] for node in test_nodes]
    for i in range(len(test_x)):
        if test_x[i] != 0.5:
            if random.random()>ratio_of_known_tests:
                test_x[i] = 0.5
    test_y = [labels[node] for node in test_nodes]

    mask_test = [i != 0.5 for i in test_y]
    mask_test_sybils = [i == 1 for i in test_y]


    train_x = torch.tensor(train_x, dtype=torch.float).reshape(-1,1)
    train_y = torch.tensor(train_y, dtype=torch.float).reshape(-1,1)
    test_x = torch.tensor(test_x, dtype=torch.float).reshape(-1,1)
    test_y = torch.tensor(test_y, dtype=torch.float).reshape(-1,1)

    return train_x, test_x, train_y, test_y, mask_test, mask_test_sybils

In [46]:
train_x, test_x, train_y, test_y, mask_test, mask_test_sybils = create_datasets(all_nodes = trainNodes + testNodes,
                                                                                train_nodes = trainNodes,
                                                                                test_nodes = testNodes,
                                                                                benign_nodes = benigns,
                                                                                sybil_nodes = sybils,
                                                                                ratio_of_known_trains = 0.5,
                                                                                ratio_of_known_tests = 0.5,
                                                                                SEED = 1)

In [47]:
g = nx.Graph()
g.add_edges_from(edges)


temp = list(g.subgraph(trainNodes).edges())
trainEdges = [[i[0] for i in temp],[i[1] for i in temp[:]]]
temp = list(g.subgraph(testNodes).edges())
testEdges = [[i[0] for i in temp],[i[1] for i in temp[:]]]

trainEdges = torch.tensor(trainEdges, dtype=torch.long)
testEdges = torch.tensor(testEdges, dtype=torch.long)


In [48]:
def train_model(model, train_x, train_y, train_edge_index, epochs=100, lr=0.01):

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    # loss_fn = torch.nn.CrossEntropyLoss()
    loss_fn = torch.nn.MSELoss()


    model.train()
    for epoch in tqdm(range(epochs)):
        optimizer.zero_grad()
        output = model(train_x, train_edge_index)
        loss = loss_fn(output, train_y)
        loss.backward()
        optimizer.step()

        if (epoch + 1) % 20 == 0 or epoch == 0:
            print(f'Epoch {epoch + 1}/{epochs}, Loss: {loss.item():.4f}')

    return model



In [49]:
def test_model(model, test_x, test_y, test_edge_index, mask=None, threshold=None):
    model.eval()
    with torch.no_grad():
        output = model(test_x, test_edge_index)

        # Apply mask if provided
        if mask is not None:
            output = output[mask]
            test_y = test_y[mask]

        # If output has shape [batch_size, 1], use the single column
        output = output.squeeze(1)  # Converts shape [batch_size, 1] -> [batch_size]

        # Compute AUC and find optimal threshold
        try:
            auc_score = roc_auc_score(test_y.cpu(), output.cpu())  # Adjust for single-class output
            # Compute ROC curve
            fpr, tpr, thresholds = roc_curve(test_y.cpu(), output.cpu())
            # Find optimal threshold using Youden's J statistic
            j_scores = tpr - fpr
            optimal_idx = j_scores.argmax()
            if threshold is None:
                threshold = thresholds[optimal_idx]

        except Exception as e:
            auc_score = None
            if threshold is None:
                threshold = 0.5  # Default threshold if AUC computation fails

        # Apply the threshold to classify predictions
        threshold_predictions = (output >= threshold).int().cpu()

        # Compute accuracy using the threshold
        accuracy_with_threshold = accuracy_score(test_y.cpu(), threshold_predictions)

        # Compute confusion matrix
        conf_matrix = confusion_matrix(test_y.cpu(), threshold_predictions)
        normalized_cm = conf_matrix.astype('float') / len(test_y)

        # Prepare metrics
        metrics = {
            "accuracy": accuracy_with_threshold,
            "auc": auc_score,
            "optimal_threshold": threshold,
            "confusion_matrix": normalized_cm
        }

        # Print metrics
        try:
            print(f"AUC: {auc_score:.3f}")
        except:
            pass
        # print(f"Optimal Threshold: {threshold}")
        print(f"ACC: {accuracy_with_threshold:.3f}")
        print("Confusion Matrix (Normalized):")
        try:
            print(f"                      Predicted Negative                       Predicted Positive")
            print(f"Actual Negative       {normalized_cm[0, 0]:.3f}                {normalized_cm[0, 1]:.3f}")
            print(f"Actual Positive       {normalized_cm[1, 0]:.3f}                {normalized_cm[1, 1]:.3f}")
        except:
            pass
        return metrics, threshold


In [50]:
len(trainEdges[0])

272308

In [52]:
trainEdges

tensor([[    0,     0,     0,  ..., 29955, 29955, 29964],
        [    1,     2,     3,  ..., 29961, 29988, 29976]])

In [53]:
input_dim = 1
hidden_dim = 16
output_dim = 1
model = GCN(input_dim, hidden_dim, output_dim, num_layers=3)

model = train_model(model, train_x, train_y, trainEdges, epochs=100, lr=0.01)



print("based on auc th")
metric,th = test_model(model, test_x, test_y, testEdges-len(train_x), mask = mask_test)
metric,th = test_model(model, test_x, test_y, testEdges-len(train_x), mask = mask_test_sybils, threshold = th)
print("\n\n")
print("based on 0.5")
metric,th = test_model(model, test_x, test_y, testEdges-len(train_x), mask = mask_test, threshold = 0.5)
metric,th = test_model(model, test_x, test_y, testEdges-len(train_x), mask = mask_test_sybils, threshold = 0.5)

  1%|          | 1/100 [00:00<00:30,  3.20it/s]

Epoch 1/100, Loss: 0.2531


 21%|██        | 21/100 [00:03<00:13,  5.84it/s]

Epoch 20/100, Loss: 0.2089


 41%|████      | 41/100 [00:07<00:10,  5.80it/s]

Epoch 40/100, Loss: 0.2064


 61%|██████    | 61/100 [00:12<00:08,  4.53it/s]

Epoch 60/100, Loss: 0.2028


 81%|████████  | 81/100 [00:15<00:03,  5.77it/s]

Epoch 80/100, Loss: 0.2038


100%|██████████| 100/100 [00:19<00:00,  5.24it/s]

Epoch 100/100, Loss: 0.2049
based on auc th





AUC: 0.673
ACC: 0.710
Confusion Matrix (Normalized):
                      Predicted Negative                       Predicted Positive
Actual Negative       0.581                0.080
Actual Positive       0.210                0.129
ACC: 0.381
Confusion Matrix (Normalized):
                      Predicted Negative                       Predicted Positive
Actual Negative       0.000                0.000
Actual Positive       0.619                0.381



based on 0.5
AUC: 0.673
ACC: 0.664
Confusion Matrix (Normalized):
                      Predicted Negative                       Predicted Positive
Actual Negative       0.661                0.000
Actual Positive       0.335                0.004
ACC: 0.011
Confusion Matrix (Normalized):
                      Predicted Negative                       Predicted Positive
Actual Negative       0.000                0.000
Actual Positive       0.989                0.011


In [54]:
input_dim = 1
hidden_dim = 16
output_dim = 1
model = GAT(input_dim, hidden_dim, output_dim, num_layers=3, heads = 4)

model = train_model(model, train_x, train_y, trainEdges, epochs=100, lr=0.01)


print("based on auc th")

metric,th = test_model(model, test_x, test_y, testEdges-len(train_x), mask = mask_test)
metric,th = test_model(model, test_x, test_y, testEdges-len(train_x), mask = mask_test_sybils, threshold = th)
print("\n\n")
print("based on 0.5")
metric,th = test_model(model, test_x, test_y, testEdges-len(train_x), mask = mask_test, threshold = 0.5)
metric,th = test_model(model, test_x, test_y, testEdges-len(train_x), mask = mask_test_sybils, threshold = 0.5)

  1%|          | 1/100 [00:01<01:50,  1.11s/it]

Epoch 1/100, Loss: 0.2590


 20%|██        | 20/100 [00:24<01:30,  1.14s/it]

Epoch 20/100, Loss: 0.2029


 40%|████      | 40/100 [00:47<01:08,  1.14s/it]

Epoch 40/100, Loss: 0.1547


 60%|██████    | 60/100 [01:19<00:55,  1.38s/it]

Epoch 60/100, Loss: 0.1404


 80%|████████  | 80/100 [01:42<00:21,  1.09s/it]

Epoch 80/100, Loss: 0.1322


100%|██████████| 100/100 [02:06<00:00,  1.26s/it]

Epoch 100/100, Loss: 0.1270
based on auc th





AUC: 0.745
ACC: 0.734
Confusion Matrix (Normalized):
                      Predicted Negative                       Predicted Positive
Actual Negative       0.496                0.164
Actual Positive       0.102                0.237
ACC: 0.700
Confusion Matrix (Normalized):
                      Predicted Negative                       Predicted Positive
Actual Negative       0.000                0.000
Actual Positive       0.300                0.700



based on 0.5
AUC: 0.745
ACC: 0.661
Confusion Matrix (Normalized):
                      Predicted Negative                       Predicted Positive
Actual Negative       0.661                0.000
Actual Positive       0.339                0.000
ACC: 0.000
Confusion Matrix (Normalized):
                      Predicted Negative                       Predicted Positive
Actual Negative       0.000                0.000
Actual Positive       1.000                0.000


In [55]:
input_dim = 1
hidden_dim = 8
output_dim = 1
model = GAT(input_dim, hidden_dim, output_dim, num_layers=4, heads = 4)

model = train_model(model, train_x, train_y, trainEdges, epochs=100, lr=0.01)


print("based on auc th")

metric,th = test_model(model, test_x, test_y, testEdges-len(train_x), mask = mask_test)
metric,th = test_model(model, test_x, test_y, testEdges-len(train_x), mask = mask_test_sybils, threshold = th)
print("\n\n")
print("based on 0.5")
metric,th = test_model(model, test_x, test_y, testEdges-len(train_x), mask = mask_test, threshold = 0.5)
metric,th = test_model(model, test_x, test_y, testEdges-len(train_x), mask = mask_test_sybils, threshold = 0.5)

  1%|          | 1/100 [00:01<02:29,  1.51s/it]

Epoch 1/100, Loss: 0.2550


 20%|██        | 20/100 [00:20<01:18,  1.02it/s]

Epoch 20/100, Loss: 0.2124


 40%|████      | 40/100 [00:41<01:08,  1.14s/it]

Epoch 40/100, Loss: 0.1856


 60%|██████    | 60/100 [01:02<00:39,  1.01it/s]

Epoch 60/100, Loss: 0.1493


 80%|████████  | 80/100 [01:22<00:21,  1.06s/it]

Epoch 80/100, Loss: 0.1355


100%|██████████| 100/100 [01:43<00:00,  1.03s/it]

Epoch 100/100, Loss: 0.1365
based on auc th





AUC: 0.118
ACC: 0.664
Confusion Matrix (Normalized):
                      Predicted Negative                       Predicted Positive
Actual Negative       0.661                0.000
Actual Positive       0.336                0.004
ACC: 0.011
Confusion Matrix (Normalized):
                      Predicted Negative                       Predicted Positive
Actual Negative       0.000                0.000
Actual Positive       0.989                0.011



based on 0.5
AUC: 0.118
ACC: 0.661
Confusion Matrix (Normalized):
                      Predicted Negative                       Predicted Positive
Actual Negative       0.661                0.000
Actual Positive       0.339                0.000
ACC: 0.000
Confusion Matrix (Normalized):
                      Predicted Negative                       Predicted Positive
Actual Negative       0.000                0.000
Actual Positive       1.000                0.000


In [56]:
input_dim = 1
hidden_dim = 8
output_dim = 1
model = GAT(input_dim, hidden_dim, output_dim, num_layers=4, heads = 4)

model = train_model(model, train_x, train_y, trainEdges, epochs=500, lr=0.01)


print("based on auc th")

metric,th = test_model(model, test_x, test_y, testEdges-len(train_x), mask = mask_test)
metric,th = test_model(model, test_x, test_y, testEdges-len(train_x), mask = mask_test_sybils, threshold = th)
print("\n\n")
print("based on 0.5")
metric,th = test_model(model, test_x, test_y, testEdges-len(train_x), mask = mask_test, threshold = 0.5)
metric,th = test_model(model, test_x, test_y, testEdges-len(train_x), mask = mask_test_sybils, threshold = 0.5)

  0%|          | 1/500 [00:01<10:06,  1.22s/it]

Epoch 1/500, Loss: 0.2515


  4%|▍         | 20/500 [00:20<07:47,  1.03it/s]

Epoch 20/500, Loss: 0.2109


  8%|▊         | 40/500 [00:41<08:31,  1.11s/it]

Epoch 40/500, Loss: 0.1779


 12%|█▏        | 60/500 [01:01<07:14,  1.01it/s]

Epoch 60/500, Loss: 0.1495


 16%|█▌        | 80/500 [01:21<07:38,  1.09s/it]

Epoch 80/500, Loss: 0.1355


 20%|██        | 100/500 [01:42<06:35,  1.01it/s]

Epoch 100/500, Loss: 0.1337


 24%|██▍       | 120/500 [02:03<06:48,  1.07s/it]

Epoch 120/500, Loss: 0.1267


 28%|██▊       | 140/500 [02:24<06:12,  1.03s/it]

Epoch 140/500, Loss: 0.1264


 32%|███▏      | 160/500 [02:47<07:04,  1.25s/it]

Epoch 160/500, Loss: 0.1279


 36%|███▌      | 180/500 [03:07<05:10,  1.03it/s]

Epoch 180/500, Loss: 0.1276


 40%|████      | 200/500 [03:28<06:06,  1.22s/it]

Epoch 200/500, Loss: 0.1262


 44%|████▍     | 220/500 [03:49<04:24,  1.06it/s]

Epoch 220/500, Loss: 0.1274


 48%|████▊     | 240/500 [04:11<05:04,  1.17s/it]

Epoch 240/500, Loss: 0.1251


 52%|█████▏    | 260/500 [04:32<03:53,  1.03it/s]

Epoch 260/500, Loss: 0.1247


 56%|█████▌    | 280/500 [04:52<03:58,  1.08s/it]

Epoch 280/500, Loss: 0.1253


 60%|██████    | 300/500 [05:12<03:07,  1.07it/s]

Epoch 300/500, Loss: 0.1260


 64%|██████▍   | 320/500 [05:34<03:33,  1.19s/it]

Epoch 320/500, Loss: 0.1248


 68%|██████▊   | 340/500 [05:54<02:43,  1.02s/it]

Epoch 340/500, Loss: 0.1220


 72%|███████▏  | 360/500 [06:21<02:26,  1.05s/it]

Epoch 360/500, Loss: 0.1271


 76%|███████▌  | 380/500 [06:41<01:52,  1.07it/s]

Epoch 380/500, Loss: 0.1271


 80%|████████  | 400/500 [07:02<01:41,  1.01s/it]

Epoch 400/500, Loss: 0.1270


 84%|████████▍ | 420/500 [07:23<01:22,  1.04s/it]

Epoch 420/500, Loss: 0.1200


 88%|████████▊ | 440/500 [07:44<00:57,  1.04it/s]

Epoch 440/500, Loss: 0.1258


 92%|█████████▏| 460/500 [08:05<00:44,  1.11s/it]

Epoch 460/500, Loss: 0.1178


 96%|█████████▌| 480/500 [08:26<00:18,  1.06it/s]

Epoch 480/500, Loss: 0.1229


100%|██████████| 500/500 [08:47<00:00,  1.05s/it]

Epoch 500/500, Loss: 0.1214
based on auc th





AUC: 0.188
ACC: 0.665
Confusion Matrix (Normalized):
                      Predicted Negative                       Predicted Positive
Actual Negative       0.661                0.000
Actual Positive       0.334                0.005
ACC: 0.014
Confusion Matrix (Normalized):
                      Predicted Negative                       Predicted Positive
Actual Negative       0.000                0.000
Actual Positive       0.986                0.014



based on 0.5
AUC: 0.188
ACC: 0.661
Confusion Matrix (Normalized):
                      Predicted Negative                       Predicted Positive
Actual Negative       0.661                0.000
Actual Positive       0.339                0.000
ACC: 0.000
Confusion Matrix (Normalized):
                      Predicted Negative                       Predicted Positive
Actual Negative       0.000                0.000
Actual Positive       1.000                0.000
