<a href="https://colab.research.google.com/github/aSafarpoor/OSN_FAD/blob/main/Data/Cresci_test_with_simple_GCN_and_GAT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
!pip install torch_geometric



In [17]:
import networkx as nx
import numpy as np
from tqdm import tqdm
from matplotlib import pyplot as plt
import random


import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, GATConv, GINConv
from sklearn.metrics import roc_curve,accuracy_score, roc_auc_score, confusion_matrix
import torch.nn as nn


SEED = 10
random.seed(SEED)

#read files

In [18]:
import tarfile

file_name = "Cresci.tar.xz"

with tarfile.open(file_name, "r:xz") as tar:
    tar.extractall()
    tar.list()

?r-xr-xr-x 0/0          0 2024-11-19 03:53:22 Cresci/ 
?rw-rw-rw- 0/0      19098 2024-11-19 03:26:23 Cresci/benigns.txt 
?rw-rw-rw- 0/0   35918725 2024-11-19 03:26:04 Cresci/follower_edges.txt 
?rw-rw-rw- 0/0   47439805 2024-11-19 03:26:09 Cresci/friend_edges.txt 
?rw-rw-rw- 0/0      34621 2024-11-19 03:26:17 Cresci/sybils.txt 
?rw-rw-rw- 0/0   10466507 2024-11-19 03:15:13 Cresci/testNodes.txt 
?rw-rw-rw- 0/0    2473061 2024-11-19 03:15:01 Cresci/trainNodes.txt 


In [19]:
file_name = "Cresci/benigns.txt"
with open(file_name, "r") as file:
    lines = file.readlines()
benigns = [line.strip() for line in lines]

file_name = "Cresci/sybils.txt"
with open(file_name, "r") as file:
    lines = file.readlines()
sybils = [line.strip() for line in lines]

file_name = "Cresci/trainNodes.txt"
with open(file_name, "r") as file:
    lines = file.readlines()
trainNodes = [line.strip() for line in lines]

file_name = "Cresci/testNodes.txt"
with open(file_name, "r") as file:
    lines = file.readlines()
testNodes = [line.strip() for line in lines]

edges1 = []
file_name = "Cresci/friend_edges.txt"
with open(file_name, "r") as file:
    edges1 = [tuple( line.strip().split()) for line in file]

edges2 = []
file_name = "Cresci/follower_edges.txt"
with open(file_name, "r") as file:
    edges2 = [tuple( line.strip().split()) for line in file]

edges = edges1[:]+edges2[:]
edges1 = []
edges2 = []

In [20]:
def renumber_nodes(trainNodes, testNodes, benigns, sybils, edges):
    train_mapping = {node: idx for idx, node in enumerate(trainNodes)}
    test_start_idx = len(trainNodes)
    test_mapping = {node: idx for idx, node in enumerate(testNodes, start=test_start_idx)}
    node_mapping = {**train_mapping, **test_mapping}
    updated_trainNodes = [node_mapping[node] for node in trainNodes]
    updated_testNodes = [node_mapping[node] for node in testNodes]
    updated_benigns = [node_mapping[node] for node in benigns if node in node_mapping]
    updated_sybils = [node_mapping[node] for node in sybils if node in node_mapping]
    updated_edges = [(node_mapping[src], node_mapping[dst]) for src, dst in edges if src in node_mapping and dst in node_mapping]
    return updated_trainNodes, updated_testNodes, updated_benigns, updated_sybils, updated_edges

trainNodes, testNodes, benigns, sybils, edges = renumber_nodes(trainNodes, testNodes, benigns, sybils, edges)

# Models

In [21]:
class GCN(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers, dropout=True):
        super(GCN, self).__init__()
        self.dropout = dropout
        self.convs = torch.nn.ModuleList()
        self.num_classes = output_dim
        for i in range(num_layers):
            if i == 0:  # First layer
                self.convs.append(GCNConv(input_dim, hidden_dim, bias=True))
            elif i == num_layers - 1:  # Last layer
                self.convs.append(GCNConv(hidden_dim, output_dim, bias=True))
            else:  # Middle layers
                self.convs.append(GCNConv(hidden_dim, hidden_dim, bias=True))

    def forward(self, x, edge_index):
        h = x
        i = 0
        for conv in self.convs:
            if self.dropout:
                h = F.dropout(h, p=0.5, training=self.training)
            h = conv(h, edge_index)
            if i < len(self.convs) - 1:
                h = F.tanh(h)
            i += 1

        if self.num_classes == 1:
            return F.sigmoid(h)
        else:
            return F.softmax(h, dim=1)

In [22]:
class GAT(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers, heads=1, dropout=True):
        super(GAT, self).__init__()
        self.dropout = dropout
        self.convs = torch.nn.ModuleList()
        self.num_classes = output_dim
        self.heads = heads

        for i in range(num_layers):
            if i == 0:  # First layer
                self.convs.append(GATConv(input_dim, hidden_dim, heads=heads, concat=True, bias=True))
            elif i == num_layers - 1:  # Last layer
                self.convs.append(GATConv(hidden_dim * heads, output_dim, heads=1, concat=False, bias=True))
            else:  # Middle layers
                self.convs.append(GATConv(hidden_dim * heads, hidden_dim, heads=heads, concat=True, bias=True))

    def forward(self, x, edge_index):
        h = x
        for i, conv in enumerate(self.convs):
            if self.dropout:
                h = F.dropout(h, p=0.5, training=self.training)
            h = conv(h, edge_index)
            if i < len(self.convs) - 1:  # Apply activation only for hidden layers
                h = F.elu(h)  # ELU is often preferred for GAT
        if self.num_classes == 1:
            return F.sigmoid(h)
        else:
            return F.log_softmax(h, dim=1)

In [23]:
def create_datasets(all_nodes, train_nodes, test_nodes, benign_nodes, sybil_nodes, ratio_of_known_trains, ratio_of_known_tests,SEED):

    random.seed(SEED)

    labels = {node: 0.5 for node in all_nodes}  # Unknown nodes default to 0.5
    for node in benign_nodes:
        labels[node] = 0
    for node in sybil_nodes:
        labels[node] = 1

    train_x = [labels[node] for node in train_nodes]
    for i in range(len(train_x)):
        if train_x[i] != 0.5:
            if random.random()>ratio_of_known_trains:
                train_x[i] = 0.5
    train_y = [labels[node] for node in train_nodes]


    test_x = [labels[node] for node in test_nodes]
    for i in range(len(test_x)):
        if test_x[i] != 0.5:
            if random.random()>ratio_of_known_tests:
                test_x[i] = 0.5
    test_y = [labels[node] for node in test_nodes]

    mask_test = [i != 0.5 for i in test_y]
    mask_test_sybils = [i == 1 for i in test_y]


    train_x = torch.tensor(train_x, dtype=torch.float).reshape(-1,1)
    train_y = torch.tensor(train_y, dtype=torch.float).reshape(-1,1)
    test_x = torch.tensor(test_x, dtype=torch.float).reshape(-1,1)
    test_y = torch.tensor(test_y, dtype=torch.float).reshape(-1,1)

    return train_x, test_x, train_y, test_y, mask_test, mask_test_sybils

In [24]:
train_x, test_x, train_y, test_y, mask_test, mask_test_sybils = create_datasets(all_nodes = trainNodes + testNodes,
                                                                                train_nodes = trainNodes,
                                                                                test_nodes = testNodes,
                                                                                benign_nodes = benigns,
                                                                                sybil_nodes = sybils,
                                                                                ratio_of_known_trains = 0.5,
                                                                                ratio_of_known_tests = 0.5,
                                                                                SEED = 1)

In [25]:
g = nx.Graph()
g.add_edges_from(edges)


temp = list(g.subgraph(trainNodes).edges())
trainEdges = [[i[0] for i in temp],[i[1] for i in temp[:]]]
temp = list(g.subgraph(testNodes).edges())
testEdges = [[i[0] for i in temp],[i[1] for i in temp[:]]]

trainEdges = torch.tensor(trainEdges, dtype=torch.long)
testEdges = torch.tensor(testEdges, dtype=torch.long)


In [26]:
def train_model(model, train_x, train_y, train_edge_index, epochs=100, lr=0.01):

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    # loss_fn = torch.nn.CrossEntropyLoss()
    loss_fn = torch.nn.MSELoss()


    model.train()
    for epoch in tqdm(range(epochs)):
        optimizer.zero_grad()
        output = model(train_x, train_edge_index)
        loss = loss_fn(output, train_y)
        loss.backward()
        optimizer.step()

        if (epoch + 1) % 20 == 0 or epoch == 0:
            print(f'Epoch {epoch + 1}/{epochs}, Loss: {loss.item():.4f}')

    return model



In [31]:
def test_model(model, test_x, test_y, test_edge_index, mask=None, threshold=None):
    model.eval()
    with torch.no_grad():
        output = model(test_x, test_edge_index)

        # Apply mask if provided
        if mask is not None:
            output = output[mask]
            test_y = test_y[mask]

        # If output has shape [batch_size, 1], use the single column
        output = output.squeeze(1)  # Converts shape [batch_size, 1] -> [batch_size]

        # Compute AUC and find optimal threshold
        try:
            auc_score = roc_auc_score(test_y.cpu(), output.cpu())  # Adjust for single-class output
            # Compute ROC curve
            fpr, tpr, thresholds = roc_curve(test_y.cpu(), output.cpu())
            # Find optimal threshold using Youden's J statistic
            j_scores = tpr - fpr
            optimal_idx = j_scores.argmax()
            if threshold is None:
                threshold = thresholds[optimal_idx]

        except Exception as e:
            auc_score = None
            if threshold is None:
                threshold = 0.5  # Default threshold if AUC computation fails

        # Apply the threshold to classify predictions
        threshold_predictions = (output >= threshold).int().cpu()

        # Compute accuracy using the threshold
        accuracy_with_threshold = accuracy_score(test_y.cpu(), threshold_predictions)

        # Compute confusion matrix
        conf_matrix = confusion_matrix(test_y.cpu(), threshold_predictions)
        normalized_cm = conf_matrix.astype('float') / len(test_y)

        # Prepare metrics
        metrics = {
            "accuracy": accuracy_with_threshold,
            "auc": auc_score,
            "optimal_threshold": threshold,
            "confusion_matrix": normalized_cm
        }

        # Print metrics
        try:
            print(f"AUC: {auc_score:.3f}")
        except:
            pass
        # print(f"Optimal Threshold: {threshold}")
        print(f"ACC: {accuracy_with_threshold:.3f}")
        print("Confusion Matrix (Normalized):")
        try:
            print(f"                      Predicted Negative                       Predicted Positive")
            print(f"Actual Negative       {normalized_cm[0, 0]:.3f}                {normalized_cm[0, 1]:.3f}")
            print(f"Actual Positive       {normalized_cm[1, 0]:.3f}                {normalized_cm[1, 1]:.3f}")
        except:
            pass
        return metrics, threshold


In [28]:
input_dim = 1
hidden_dim = 16
output_dim = 1
model = GCN(input_dim, hidden_dim, output_dim, num_layers=3)

model = train_model(model, train_x, train_y, trainEdges, epochs=100, lr=0.01)



print("based on auc th")
metric,th = test_model(model, test_x, test_y, testEdges-len(train_x), mask = mask_test)
metric,th = test_model(model, test_x, test_y, testEdges-len(train_x), mask = mask_test_sybils, threshold = th)
print("\n\n")
print("based on 0.5")
metric,th = test_model(model, test_x, test_y, testEdges-len(train_x), mask = mask_test, threshold = 0.5)
metric,th = test_model(model, test_x, test_y, testEdges-len(train_x), mask = mask_test_sybils, threshold = 0.5)

  1%|          | 1/100 [00:01<01:43,  1.05s/it]

Epoch 1/100, Loss: 0.0081


 20%|██        | 20/100 [00:18<01:14,  1.08it/s]

Epoch 20/100, Loss: 0.0015


 40%|████      | 40/100 [00:36<00:57,  1.04it/s]

Epoch 40/100, Loss: 0.0012


 60%|██████    | 60/100 [00:53<00:31,  1.27it/s]

Epoch 60/100, Loss: 0.0011


 80%|████████  | 80/100 [01:13<00:18,  1.06it/s]

Epoch 80/100, Loss: 0.0011


100%|██████████| 100/100 [01:32<00:00,  1.08it/s]

Epoch 100/100, Loss: 0.0011
based on auc th





AUC: 0.778
ACC: 0.707
Confusion Matrix (Normalized):
                      Predicted Negative                       Predicted Positive
Actual Negative       0.279                0.081
Actual Positive       0.213                0.427
ACC: 0.668
Confusion Matrix (Normalized):
                      Predicted Negative                       Predicted Positive
Actual Negative       0.000                0.000
Actual Positive       0.332                0.668



based on 0.5
AUC: 0.778
ACC: 0.551
Confusion Matrix (Normalized):
                      Predicted Negative                       Predicted Positive
Actual Negative       0.351                0.009
Actual Positive       0.440                0.200
ACC: 0.313
Confusion Matrix (Normalized):
                      Predicted Negative                       Predicted Positive
Actual Negative       0.000                0.000
Actual Positive       0.687                0.313


In [32]:
input_dim = 1
hidden_dim = 16
output_dim = 1
model = GAT(input_dim, hidden_dim, output_dim, num_layers=3, heads = 4)

model = train_model(model, train_x, train_y, trainEdges, epochs=100, lr=0.01)


print("based on auc th")

metric,th = test_model(model, test_x, test_y, testEdges-len(train_x), mask = mask_test)
metric,th = test_model(model, test_x, test_y, testEdges-len(train_x), mask = mask_test_sybils, threshold = th)
print("\n\n")
print("based on 0.5")
metric,th = test_model(model, test_x, test_y, testEdges-len(train_x), mask = mask_test, threshold = 0.5)
metric,th = test_model(model, test_x, test_y, testEdges-len(train_x), mask = mask_test_sybils, threshold = 0.5)

based on auc th
AUC: 0.990
ACC: 0.965
Confusion Matrix (Normalized):
                      Predicted Negative                       Predicted Positive
Actual Negative       0.355                0.005
Actual Positive       0.030                0.610
ACC: 0.953
Confusion Matrix (Normalized):
                      Predicted Negative                       Predicted Positive
Actual Negative       0.000                0.000
Actual Positive       0.047                0.953



based on 0.5
AUC: 0.990
ACC: 0.640
Confusion Matrix (Normalized):
                      Predicted Negative                       Predicted Positive
Actual Negative       0.000                0.360
Actual Positive       0.000                0.640
ACC: 1.000
Confusion Matrix (Normalized):
                      Predicted Negative                       Predicted Positive




In [33]:
input_dim = 1
hidden_dim = 8
output_dim = 1
model = GAT(input_dim, hidden_dim, output_dim, num_layers=4, heads = 4)

model = train_model(model, train_x, train_y, trainEdges, epochs=100, lr=0.01)


print("based on auc th")

metric,th = test_model(model, test_x, test_y, testEdges-len(train_x), mask = mask_test)
metric,th = test_model(model, test_x, test_y, testEdges-len(train_x), mask = mask_test_sybils, threshold = th)
print("\n\n")
print("based on 0.5")
metric,th = test_model(model, test_x, test_y, testEdges-len(train_x), mask = mask_test, threshold = 0.5)
metric,th = test_model(model, test_x, test_y, testEdges-len(train_x), mask = mask_test_sybils, threshold = 0.5)

  1%|          | 1/100 [00:04<08:05,  4.90s/it]

Epoch 1/100, Loss: 0.0088


 20%|██        | 20/100 [01:10<04:42,  3.53s/it]

Epoch 20/100, Loss: 0.0016


 40%|████      | 40/100 [02:25<03:50,  3.84s/it]

Epoch 40/100, Loss: 0.0012


 60%|██████    | 60/100 [03:37<02:29,  3.74s/it]

Epoch 60/100, Loss: 0.0012


 80%|████████  | 80/100 [04:56<01:09,  3.49s/it]

Epoch 80/100, Loss: 0.0011


100%|██████████| 100/100 [06:06<00:00,  3.66s/it]

Epoch 100/100, Loss: 0.0011
based on auc th





AUC: 0.993
ACC: 0.977
Confusion Matrix (Normalized):
                      Predicted Negative                       Predicted Positive
Actual Negative       0.354                0.006
Actual Positive       0.017                0.623
ACC: 0.973
Confusion Matrix (Normalized):
                      Predicted Negative                       Predicted Positive
Actual Negative       0.000                0.000
Actual Positive       0.027                0.973



based on 0.5
AUC: 0.993
ACC: 0.640
Confusion Matrix (Normalized):
                      Predicted Negative                       Predicted Positive
Actual Negative       0.000                0.360
Actual Positive       0.000                0.640
ACC: 1.000
Confusion Matrix (Normalized):
                      Predicted Negative                       Predicted Positive




In [34]:
input_dim = 1
hidden_dim = 8
output_dim = 1
model = GAT(input_dim, hidden_dim, output_dim, num_layers=4, heads = 4)

model = train_model(model, train_x, train_y, trainEdges, epochs=500, lr=0.01)


print("based on auc th")

metric,th = test_model(model, test_x, test_y, testEdges-len(train_x), mask = mask_test)
metric,th = test_model(model, test_x, test_y, testEdges-len(train_x), mask = mask_test_sybils, threshold = th)
print("\n\n")
print("based on 0.5")
metric,th = test_model(model, test_x, test_y, testEdges-len(train_x), mask = mask_test, threshold = 0.5)
metric,th = test_model(model, test_x, test_y, testEdges-len(train_x), mask = mask_test_sybils, threshold = 0.5)

  0%|          | 1/500 [00:02<21:09,  2.54s/it]

Epoch 1/500, Loss: 0.0065


  4%|▍         | 20/500 [01:07<33:43,  4.21s/it]

Epoch 20/500, Loss: 0.0013


  8%|▊         | 40/500 [02:07<21:38,  2.82s/it]

Epoch 40/500, Loss: 0.0011


 12%|█▏        | 60/500 [03:27<27:13,  3.71s/it]

Epoch 60/500, Loss: 0.0011


 16%|█▌        | 80/500 [04:30<25:43,  3.67s/it]

Epoch 80/500, Loss: 0.0011


 20%|██        | 100/500 [05:52<27:01,  4.05s/it]

Epoch 100/500, Loss: 0.0011


 24%|██▍       | 120/500 [07:15<25:30,  4.03s/it]

Epoch 120/500, Loss: 0.0011


 28%|██▊       | 140/500 [08:45<26:08,  4.36s/it]

Epoch 140/500, Loss: 0.0011


 32%|███▏      | 160/500 [10:09<24:30,  4.32s/it]

Epoch 160/500, Loss: 0.0011


 36%|███▌      | 180/500 [11:31<22:47,  4.27s/it]

Epoch 180/500, Loss: 0.0011


 40%|████      | 200/500 [12:53<21:14,  4.25s/it]

Epoch 200/500, Loss: 0.0011


 44%|████▍     | 220/500 [14:15<20:08,  4.31s/it]

Epoch 220/500, Loss: 0.0011


 48%|████▊     | 240/500 [15:38<18:52,  4.36s/it]

Epoch 240/500, Loss: 0.0011


 52%|█████▏    | 260/500 [17:01<17:32,  4.39s/it]

Epoch 260/500, Loss: 0.0011


 56%|█████▌    | 280/500 [18:19<12:40,  3.46s/it]

Epoch 280/500, Loss: 0.0011


 60%|██████    | 300/500 [19:40<14:00,  4.20s/it]

Epoch 300/500, Loss: 0.0011


 64%|██████▍   | 320/500 [21:03<12:20,  4.11s/it]

Epoch 320/500, Loss: 0.0011


 68%|██████▊   | 340/500 [22:15<07:30,  2.81s/it]

Epoch 340/500, Loss: 0.0011


 72%|███████▏  | 360/500 [23:36<09:51,  4.22s/it]

Epoch 360/500, Loss: 0.0011


 76%|███████▌  | 380/500 [24:42<07:00,  3.50s/it]

Epoch 380/500, Loss: 0.0011


 80%|████████  | 400/500 [26:06<06:59,  4.19s/it]

Epoch 400/500, Loss: 0.0011


 84%|████████▍ | 420/500 [27:30<05:31,  4.14s/it]

Epoch 420/500, Loss: 0.0011


 88%|████████▊ | 440/500 [28:42<03:05,  3.09s/it]

Epoch 440/500, Loss: 0.0011


 92%|█████████▏| 460/500 [29:47<02:07,  3.19s/it]

Epoch 460/500, Loss: 0.0011


 96%|█████████▌| 480/500 [31:10<01:22,  4.13s/it]

Epoch 480/500, Loss: 0.0011


100%|██████████| 500/500 [32:18<00:00,  3.88s/it]

Epoch 500/500, Loss: 0.0011
based on auc th





AUC: 0.991
ACC: 0.967
Confusion Matrix (Normalized):
                      Predicted Negative                       Predicted Positive
Actual Negative       0.354                0.006
Actual Positive       0.027                0.613
ACC: 0.957
Confusion Matrix (Normalized):
                      Predicted Negative                       Predicted Positive
Actual Negative       0.000                0.000
Actual Positive       0.043                0.957



based on 0.5
AUC: 0.991
ACC: 0.640
Confusion Matrix (Normalized):
                      Predicted Negative                       Predicted Positive
Actual Negative       0.000                0.360
Actual Positive       0.000                0.640
ACC: 1.000
Confusion Matrix (Normalized):
                      Predicted Negative                       Predicted Positive


