<a href="https://colab.research.google.com/github/aSafarpoor/OSN_FAD/blob/main/CLS/FAD%2BNN%2BNode2Vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install node2vec

Collecting node2vec
  Downloading node2vec-0.5.0-py3-none-any.whl.metadata (849 bytes)
Downloading node2vec-0.5.0-py3-none-any.whl (7.2 kB)
Installing collected packages: node2vec
Successfully installed node2vec-0.5.0


In [2]:
!pip install torch_geometric

Collecting torch_geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m61.4/63.1 kB[0m [31m4.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch_geometric
Successfully installed torch_geometric-2.6.1


In [3]:
import networkx as nx
import numpy as np
from tqdm import tqdm
from matplotlib import pyplot as plt
from node2vec import Node2Vec
import random
SEED = 10

In [4]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, GATConv
from sklearn.metrics import roc_curve,accuracy_score, roc_auc_score, confusion_matrix



# read data

In [6]:
edges = []
with open('graph.txt', 'r') as file:
    for line in file:
        # Split each line into two numbers and convert them to integers
        pair = list(map(int, line.strip().split()))
        edges.append(pair)


with open('train.txt', 'r') as file:
    lines = file.readlines()
    btrain = list(map(float, lines[0].strip().split()))
    strain = list(map(float, lines[1].strip().split()))

with open('test.txt', 'r') as file:
    lines = file.readlines()
    btest = list(map(float, lines[0].strip().split()))
    stest = list(map(float, lines[1].strip().split()))

In [7]:
print("Length of edges:", len(edges),
      "| Length of btrain:", len(btrain),
      "| Length of btest:", len(btest),
      "| Length of strain:", len(strain),
      "| Length of stest:", len(stest))

Length of edges: 6818501 | Length of btrain: 20000 | Length of btest: 158377 | Length of strain: 10000 | Length of stest: 81263


# data preparation

In [8]:
bigG = nx.Graph()
bigG.add_edges_from(edges)

In [9]:
trainG = bigG.subgraph(btrain+strain).copy()
testG = bigG.subgraph(btest+stest).copy()

In [10]:
print(len(bigG.nodes()),len(trainG.nodes()),len(testG.nodes()))

269640 30000 239640


In [11]:
def reindex_graph_and_dictionaries(G, train, test):
    new_index_mapping = {old_index: new_index for new_index, old_index in enumerate(G.nodes())}
    G_reindexed = nx.relabel_nodes(G, new_index_mapping)

    train = [new_index_mapping[int(node)] for node in train]
    test =  [new_index_mapping[int(node)] for node in test]

    return G_reindexed, train, test

In [12]:
trainG,btrain,strain =reindex_graph_and_dictionaries(trainG, btrain,strain)
testG,btest,stest =reindex_graph_and_dictionaries(testG, btest,stest)

In [13]:
trainDictionaryX = {}
trainDictionaryY = {}
testDictionaryX = {}
testDictionaryY = {}

random.seed(SEED)

c=0
for node in btrain:
    if random.random() < 0.20:
        c+=1
        trainDictionaryX[node] = [0]
    else:
        trainDictionaryX[node] = [0.5]
    trainDictionaryY[node] = 0
print(c)

c=0
for node in strain:
    if random.random() < 0.20:
        c+=1
        trainDictionaryX[node] = [1]
    else:
        trainDictionaryX[node] = [0.5]
    trainDictionaryY[node] = 1
print(c)

c=0
for node in btest:
    if random.random() < 0.1:
        c+=1
        testDictionaryX[node] = [0]
    else:
        testDictionaryX[node] = [0.5]
    testDictionaryY[node] = 0
print(c)

c=0
for node in stest:
    if random.random() < 0.1:
        c+=1
        testDictionaryX[node] = [1]
    else:
        testDictionaryX[node] = [0.5]
    testDictionaryY[node] = 1
print(c)

4038
1986
15631
8053


In [14]:
# print(len(btrain))
# print(len(strain))
# print(len(btest))
# print(len(stest))


#model

In [15]:
# Prepare data (assuming PyTorch Geometric format for graphs)
def prepare_data(graph, dictionaryX, dictionaryY):
    x = torch.tensor([dictionaryX[node] for node in graph.nodes()], dtype=torch.float)
    y = torch.tensor([dictionaryY[node] for node in graph.nodes()], dtype=torch.long)
    # edge_index = torch.tensor(list(graph.edges())).t().contiguous()
    edge_index = torch.tensor(list(graph.edges()), dtype=torch.long).t().contiguous()
    return x, y, edge_index

In [16]:
def pure_sybil_test(test_x, test_y):
    print(len(test_x))
    mask = (test_y == 1)
    x = test_x[mask]
    y = test_y[mask]
    print(len(x))
    return x, y, mask

In [17]:
train_x, train_y, train_edge_index = prepare_data(trainG, trainDictionaryX, trainDictionaryY)
test_x, test_y, test_edge_index = prepare_data(testG, testDictionaryX, testDictionaryY)
test_x2, test_y2, mask2 = pure_sybil_test(test_x, test_y)


239640
81263


In [18]:
class GCN(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers, dropout=True):
        super(GCN, self).__init__()
        self.dropout = dropout
        self.convs = torch.nn.ModuleList()
        self.num_classes = output_dim
        for i in range(num_layers):
            if i == 0:  # First layer
                self.convs.append(GCNConv(input_dim, hidden_dim, bias=True))
            elif i == num_layers - 1:  # Last layer
                self.convs.append(GCNConv(hidden_dim, output_dim, bias=True))
            else:  # Middle layers
                self.convs.append(GCNConv(hidden_dim, hidden_dim, bias=True))

    def forward(self, x, edge_index):
        h = x
        i = 0
        for conv in self.convs:
            if self.dropout:
                h = F.dropout(h, p=0.5, training=self.training)
            h = conv(h, edge_index)
            if i < len(self.convs) - 1:
                h = F.tanh(h)
            i += 1

        if self.num_classes == 1:
            return F.sigmoid(h), h
        else:
            return F.softmax(h, dim=1)




class GAT(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, heads=1):
        super(GAT, self).__init__()
        self.conv1 = GATConv(input_dim, hidden_dim, heads=heads)
        self.conv2 = GATConv(hidden_dim * heads, output_dim, heads=1)  # Single head in the final layer

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.elu(x)  # ELU is commonly used in GAT models instead of ReLU
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

In [19]:
def run_model(model,epoch_number):
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    loss_fn = torch.nn.CrossEntropyLoss()

    model.train()
    for epoch in tqdm(range(epoch_number)):
        optimizer.zero_grad()
        out = model(train_x, train_edge_index)
        loss = loss_fn(out, train_y)
        loss.backward()
        optimizer.step()
        if (epoch+1)%20 == 0 or epoch==0:
            print(f'Epoch {epoch + 1}, Loss: {loss.item()}')

    model.eval()
    with torch.no_grad():
        test_out = model(test_x, test_edge_index)
        pred = test_out.argmax(dim=1).cpu()

        # Compute metrics
        acc = accuracy_score(test_y, pred)
        auc = roc_auc_score(test_y, test_out[:, 1], multi_class='ovo')  # Adjust depending on binary or multi-class
        cm = confusion_matrix(test_y, pred)
        cm = cm.astype('float') / len(pred)


        print(f'Accuracy: {acc:.3f}')
        print(f'AUC: {auc:.3f}')
        print('Confusion Matrix:')
        print(f"                  Predicted Negative    Predicted Positive")
        print(f"Actual Negative       {cm[0, 0]:.3f}           {cm[0, 1]:.3f}")
        print(f"Actual Positive       {cm[1, 0]:.3f}           {cm[1, 1]:.3f}")

        print("\n--------------------------just sybils-----------------------------\n")

        test_out = model(test_x, test_edge_index)
        test_out = test_out[mask2]
        pred = test_out.argmax(dim=1).cpu()
        acc = accuracy_score(test_y2, pred)
        print(f'Accuracy: {acc:.3f}')


    with torch.no_grad():
        test_out = model(test_x, test_edge_index)
        pred = test_out.argmax(dim=1).cpu()

        # Compute AUC score as usual
        auc = roc_auc_score(test_y, test_out[:, 1], multi_class='ovo')
        fpr, tpr, thresholds = roc_curve(test_y, test_out[:, 1])
        j_scores = tpr - fpr
        optimal_threshold = thresholds[j_scores.argmax()]
        pred = (test_out[:, 1] >= optimal_threshold)
        acc = accuracy_score(test_y, pred)

        # just sybil
        test_out = test_out[mask2]
        pred = (test_out[:, 1] >= optimal_threshold)
        acc_sybils = accuracy_score(test_y2, pred)
        print("Accuracy with optimal threshold:", acc)
        print("Accuracy with optimal threshold just for sybils:", acc_sybils)

In [25]:
# Initialize model, optimizer, and loss function
input_dim = 1
hidden_dim = 16
output_dim = len(set(trainDictionaryY.values()))  # Number of classes

model = GCN(input_dim, hidden_dim, output_dim, num_layers=3)
run_model(model,epoch_number=300)

  0%|          | 1/300 [00:00<00:50,  5.98it/s]

Epoch 1, Loss: 0.6947438716888428


  7%|▋         | 21/300 [00:03<00:50,  5.53it/s]

Epoch 20, Loss: 0.632820188999176


 14%|█▎        | 41/300 [00:07<00:45,  5.74it/s]

Epoch 40, Loss: 0.6359561085700989


 20%|██        | 60/300 [00:11<01:02,  3.87it/s]

Epoch 60, Loss: 0.6329337358474731


 27%|██▋       | 81/300 [00:15<00:39,  5.54it/s]

Epoch 80, Loss: 0.6336530447006226


 34%|███▎      | 101/300 [00:19<00:35,  5.68it/s]

Epoch 100, Loss: 0.6335083246231079


 40%|████      | 120/300 [00:22<00:39,  4.51it/s]

Epoch 120, Loss: 0.6328360438346863


 47%|████▋     | 141/300 [00:27<00:28,  5.57it/s]

Epoch 140, Loss: 0.6314420104026794


 54%|█████▎    | 161/300 [00:30<00:24,  5.65it/s]

Epoch 160, Loss: 0.6339887380599976


 60%|██████    | 181/300 [00:34<00:20,  5.78it/s]

Epoch 180, Loss: 0.633427619934082


 67%|██████▋   | 200/300 [00:38<00:25,  3.95it/s]

Epoch 200, Loss: 0.6300966143608093


 74%|███████▎  | 221/300 [00:42<00:13,  5.73it/s]

Epoch 220, Loss: 0.6294932961463928


 80%|████████  | 241/300 [00:45<00:10,  5.72it/s]

Epoch 240, Loss: 0.6286304593086243


 87%|████████▋ | 261/300 [00:49<00:06,  5.66it/s]

Epoch 260, Loss: 0.6280556917190552


 94%|█████████▎| 281/300 [00:54<00:03,  5.19it/s]

Epoch 280, Loss: 0.62641841173172


100%|██████████| 300/300 [00:57<00:00,  5.21it/s]


Epoch 300, Loss: 0.6229791641235352
Accuracy: 0.661
AUC: 0.392
Confusion Matrix:
                  Predicted Negative    Predicted Positive
Actual Negative       0.661           0.000
Actual Positive       0.339           0.000

--------------------------just sybils-----------------------------

Accuracy: 0.000
Accuracy with optimal threshold: 0.3394132865965615
Accuracy with optimal threshold just for sybils: 0.9997046626385933


In [26]:
input_dim = 1
hidden_dim = 4
output_dim = len(set(trainDictionaryY.values()))  # Number of classes

model = GCN(input_dim, hidden_dim, output_dim, num_layers=4)
run_model(model,epoch_number=300)

  0%|          | 1/300 [00:00<00:44,  6.67it/s]

Epoch 1, Loss: 0.6918877363204956


  7%|▋         | 21/300 [00:02<00:39,  7.15it/s]

Epoch 20, Loss: 0.6674107909202576


 14%|█▎        | 41/300 [00:05<00:35,  7.30it/s]

Epoch 40, Loss: 0.6276900768280029


 20%|██        | 60/300 [00:08<00:44,  5.37it/s]

Epoch 60, Loss: 0.6300861835479736


 27%|██▋       | 81/300 [00:13<00:35,  6.18it/s]

Epoch 80, Loss: 0.6221268773078918


 34%|███▎      | 101/300 [00:15<00:28,  7.09it/s]

Epoch 100, Loss: 0.6249790787696838


 40%|████      | 121/300 [00:18<00:25,  7.10it/s]

Epoch 120, Loss: 0.6227192282676697


 47%|████▋     | 141/300 [00:21<00:22,  7.00it/s]

Epoch 140, Loss: 0.6238486170768738


 53%|█████▎    | 160/300 [00:25<00:32,  4.30it/s]

Epoch 160, Loss: 0.6247026920318604


 60%|██████    | 181/300 [00:28<00:16,  7.26it/s]

Epoch 180, Loss: 0.6215833425521851


 67%|██████▋   | 201/300 [00:31<00:14,  7.03it/s]

Epoch 200, Loss: 0.6178641319274902


 74%|███████▎  | 221/300 [00:34<00:11,  7.12it/s]

Epoch 220, Loss: 0.6166728138923645


 80%|████████  | 240/300 [00:37<00:12,  4.84it/s]

Epoch 240, Loss: 0.6196548342704773


 87%|████████▋ | 261/300 [00:41<00:05,  6.73it/s]

Epoch 260, Loss: 0.618475079536438


 94%|█████████▎| 281/300 [00:44<00:02,  7.06it/s]

Epoch 280, Loss: 0.619096577167511


100%|██████████| 300/300 [00:47<00:00,  6.38it/s]


Epoch 300, Loss: 0.6183600425720215
Accuracy: 0.661
AUC: 0.406
Confusion Matrix:
                  Predicted Negative    Predicted Positive
Actual Negative       0.661           0.000
Actual Positive       0.339           0.000

--------------------------just sybils-----------------------------

Accuracy: 0.000
Accuracy with optimal threshold: 0.3422091470539142
Accuracy with optimal threshold just for sybils: 0.9971696836198516


In [23]:
input_dim = 1
hidden_dim = 16
output_dim = len(set(trainDictionaryY.values()))  # Number of classes

model = GAT(input_dim, hidden_dim, output_dim, heads=4)
run_model(model,epoch_number=300)

  0%|          | 1/300 [00:01<08:37,  1.73s/it]

Epoch 1, Loss: 0.7306354641914368


  7%|▋         | 20/300 [00:33<10:30,  2.25s/it]

Epoch 20, Loss: 0.6242342591285706


 13%|█▎        | 40/300 [00:56<02:55,  1.48it/s]

Epoch 40, Loss: 0.5630982518196106


 20%|██        | 60/300 [01:09<02:11,  1.82it/s]

Epoch 60, Loss: 0.4854258596897125


 27%|██▋       | 80/300 [01:21<02:01,  1.81it/s]

Epoch 80, Loss: 0.4491177499294281


 33%|███▎      | 100/300 [01:33<01:54,  1.75it/s]

Epoch 100, Loss: 0.43628302216529846


 40%|████      | 120/300 [01:45<01:51,  1.62it/s]

Epoch 120, Loss: 0.42377105355262756


 47%|████▋     | 140/300 [01:57<01:59,  1.34it/s]

Epoch 140, Loss: 0.4041675329208374


 53%|█████▎    | 160/300 [02:09<01:37,  1.44it/s]

Epoch 160, Loss: 0.40137889981269836


 60%|██████    | 180/300 [02:27<02:47,  1.40s/it]

Epoch 180, Loss: 0.400587260723114


 67%|██████▋   | 200/300 [02:40<01:14,  1.34it/s]

Epoch 200, Loss: 0.40021809935569763


 73%|███████▎  | 220/300 [02:52<00:57,  1.40it/s]

Epoch 220, Loss: 0.39993348717689514


 80%|████████  | 240/300 [03:04<00:36,  1.63it/s]

Epoch 240, Loss: 0.3996928632259369


 87%|████████▋ | 260/300 [03:17<00:22,  1.79it/s]

Epoch 260, Loss: 0.399457186460495


 93%|█████████▎| 280/300 [03:29<00:11,  1.76it/s]

Epoch 280, Loss: 0.3992132842540741


100%|██████████| 300/300 [03:43<00:00,  1.34it/s]

Epoch 300, Loss: 0.39895614981651306





Accuracy: 0.803
AUC: 0.903
Confusion Matrix:
                  Predicted Negative    Predicted Positive
Actual Negative       0.531           0.130
Actual Positive       0.067           0.272

--------------------------just sybils-----------------------------

Accuracy: 0.802
Accuracy with optimal threshold: 0.8589467534635287
Accuracy with optimal threshold just for sybils: 0.7359683989023295


In [24]:
input_dim = 1
hidden_dim = 4
output_dim = len(set(trainDictionaryY.values()))  # Number of classes

model = GAT(input_dim, hidden_dim, output_dim, heads=4)
run_model(model,epoch_number=300)

  0%|          | 1/300 [00:00<01:13,  4.05it/s]

Epoch 1, Loss: 0.744617760181427


  7%|▋         | 20/300 [00:05<01:40,  2.78it/s]

Epoch 20, Loss: 0.6390350461006165


 13%|█▎        | 40/300 [00:11<01:06,  3.90it/s]

Epoch 40, Loss: 0.616067111492157


 20%|██        | 60/300 [00:16<00:59,  4.04it/s]

Epoch 60, Loss: 0.583609938621521


 27%|██▋       | 80/300 [00:22<00:54,  4.03it/s]

Epoch 80, Loss: 0.530661940574646


 33%|███▎      | 100/300 [00:27<00:49,  4.05it/s]

Epoch 100, Loss: 0.46302366256713867


 40%|████      | 120/300 [00:33<01:05,  2.75it/s]

Epoch 120, Loss: 0.43701252341270447


 47%|████▋     | 140/300 [00:38<00:40,  3.97it/s]

Epoch 140, Loss: 0.4301377236843109


 53%|█████▎    | 160/300 [00:43<00:35,  3.92it/s]

Epoch 160, Loss: 0.41961246728897095


 60%|██████    | 180/300 [00:50<00:30,  3.94it/s]

Epoch 180, Loss: 0.406279057264328


 67%|██████▋   | 200/300 [00:55<00:25,  3.91it/s]

Epoch 200, Loss: 0.4030618369579315


 73%|███████▎  | 220/300 [01:01<00:29,  2.75it/s]

Epoch 220, Loss: 0.4017576575279236


 80%|████████  | 240/300 [01:06<00:15,  3.85it/s]

Epoch 240, Loss: 0.40109580755233765


 87%|████████▋ | 260/300 [01:11<00:09,  4.13it/s]

Epoch 260, Loss: 0.4006361663341522


 93%|█████████▎| 280/300 [01:18<00:05,  3.81it/s]

Epoch 280, Loss: 0.400273859500885


100%|██████████| 300/300 [01:23<00:00,  3.61it/s]

Epoch 300, Loss: 0.39999303221702576





Accuracy: 0.859
AUC: 0.902
Confusion Matrix:
                  Predicted Negative    Predicted Positive
Actual Negative       0.611           0.050
Actual Positive       0.091           0.248

--------------------------just sybils-----------------------------

Accuracy: 0.731
Accuracy with optimal threshold: 0.8592805875479886
Accuracy with optimal threshold just for sybils: 0.7314891155876598
