In [None]:
from google.colab import drive
import os

# Mount Google Drive into the Colab filesystem (only available inside Colab).
drive.mount('/content/drive')

# Switch the working directory to the project folder on Drive so that the
# relative paths used later (e.g. the dataset root './elliptic') resolve there.
path = "/content/drive/MyDrive/elli"
os.chdir(path)

# Echo the working directory as a sanity check that the chdir succeeded.
print(os.getcwd())

Mounted at /content/drive
/content/drive/MyDrive/elli


## Setup: install dependencies, import libraries, and load the dataset

1. **Install dependency**
   - Installs the `torch-geometric` library (quiet mode).

2. **Import required libraries**
   - `torch` and `torch.nn.functional`: Core PyTorch operations.
   - `accuracy_score` from `sklearn.metrics`: Model evaluation.
   - `EllipticBitcoinDataset` from `torch_geometric.datasets`: Loads the Elliptic Bitcoin transaction dataset.
   - `SAGEConv` and `DeepGraphInfomax` from `torch_geometric.nn`: GNN layers and self-supervised learning method.

3. **Set device**
   - Uses GPU (`'cuda'`) if available; otherwise defaults to CPU.

4. **Load dataset**
   - Downloads/loads the Elliptic Bitcoin dataset to `./elliptic`.
   - Selects the first (and only) graph object and moves it to the chosen device.


In [None]:

!pip install torch-geometric -q

import torch
import torch.nn.functional as F
from sklearn.metrics import accuracy_score
from torch_geometric.datasets import EllipticBitcoinDataset
from torch_geometric.nn import SAGEConv, DeepGraphInfomax


# Seed PyTorch's random number generators before any model is created so that a
# fresh "Restart & Run All" reproduces the same weight initialisation and the
# same DGI corruption permutations. Without a seed, every run of this notebook
# reports different accuracies.
# NOTE(review): some GPU kernels may still be non-deterministic, so small
# run-to-run differences can remain on CUDA.
SEED = 42
torch.manual_seed(SEED)

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the Elliptic Bitcoin dataset (stored under ./elliptic) and move its
# first graph object to the selected device.
dataset = EllipticBitcoinDataset(root='./elliptic')
data = dataset[0].to(device)


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.1/1.1 MB[0m [31m45.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m31.1 MB/s[0m eta [36m0:00:00[0m
[?25h

Downloading https://data.pyg.org/datasets/elliptic/elliptic_txs_features.csv.zip
Downloading https://data.pyg.org/datasets/elliptic/elliptic_txs_edgelist.csv.zip
Downloading https://data.pyg.org/datasets/elliptic/elliptic_txs_classes.csv.zip
Processing...
Done!


## Inspect the dataset: structure, label distribution, and train/test split

1. **Print data object structure**
   - Displays the overall `data` object from the Elliptic Bitcoin dataset, which contains graph structure, features, and labels.

2. **Print graph statistics**
   - Number of nodes (`data.num_nodes`)
   - Number of edges (`data.num_edges`)
   - Feature dimension (`data.num_node_features`)

3. **Print label distribution**
   - Uses `torch.unique()` to get all unique labels and their counts.
   - Iterates through labels to print the number of nodes per class.

4. **Print train/test split information**
   - Counts and prints the number of nodes in the training set (`data.train_mask`).
   - Counts and prints the number of nodes in the testing set (`data.test_mask`).


In [None]:
# Overview of the loaded graph object and its basic dimensions.
print("Data object structure:", data)
print(f"Number of nodes: {data.num_nodes}, Number of edges: {data.num_edges}, Feature dimension: {data.num_node_features}")

# Class balance: one output line per distinct value found in data.y.
unique, counts = torch.unique(data.y, return_counts=True)
print("Label distribution:")
for idx in range(len(unique)):
    label = unique[idx].item()
    count = counts[idx].item()
    print(f"  Label {label}: {count} nodes")

# Sizes of the predefined train/test node masks.
print(f"Number of training nodes: {data.train_mask.sum().item()}, Number of testing nodes: {data.test_mask.sum().item()}")


Data object structure: Data(x=[203769, 165], edge_index=[2, 234355], y=[203769], train_mask=[203769], test_mask=[203769])
Number of nodes: 203769, Number of edges: 234355, Feature dimension: 165
Label distribution:
  Label 0: 42019 nodes
  Label 1: 4545 nodes
  Label 2: 157205 nodes
Number of training nodes: 29894, Number of testing nodes: 16670


## Supervised baseline: train GraphSAGE directly on the labels

1. **Define `GraphSAGE` model**
   - Two-layer GraphSAGE architecture:
     - `SAGEConv(in_channels → hidden_channels)`
     - `ReLU` activation
     - `SAGEConv(hidden_channels → out_channels)`

2. **Initialize model and optimizer**
   - Creates `model1` with:
     - Input features = `data.num_features`
     - Hidden dimension = 64
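     - Output dimension = 2 (binary classification over labels 0 = licit and 1 = illicit; the train/test masks cover only these labelled nodes, so the "unknown" label 2 is never used for training or evaluation)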
   - Uses Adam optimizer with learning rate 0.01.

3. **`train(model, optimizer)`**
   - Sets model to training mode.
   - Performs forward pass on all nodes.
   - Computes cross-entropy loss on training nodes (`data.train_mask`).
   - Backpropagates and updates parameters.
   - Returns the loss value.

4. **`evaluate(model)`**
   - Sets model to evaluation mode.
   - Performs forward pass to get predictions.
   - Selects predicted class labels (`argmax`).
   - Computes accuracy on test nodes (`data.test_mask`).

5. **Train loop**
   - Runs for 50 epochs.
   - Every 10 epochs:
     - Evaluates the model on the test set.
     - Prints epoch, loss, and test accuracy.


In [None]:
# Define GraphSAGE model
class GraphSAGE(torch.nn.Module):
    """Two-layer GraphSAGE network producing one output vector per node.

    Args:
        in_channels: Size of each input node feature vector.
        hidden_channels: Width of the intermediate representation.
        out_channels: Size of the output produced by the second layer.
    """

    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        # Created in this order so parameter initialisation consumes the RNG
        # in the same sequence every time.
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        """Apply conv1, a ReLU, then conv2, and return conv2's raw output."""
        hidden = self.conv1(x, edge_index)
        hidden = F.relu(hidden)
        out = self.conv2(hidden, edge_index)
        return out

# Supervised baseline: 64 hidden units, 2 output logits, optimised with Adam (lr=0.01).
model1 = GraphSAGE(
    in_channels=data.num_features,
    hidden_channels=64,
    out_channels=2,
)
model1 = model1.to(device)
opt1 = torch.optim.Adam(params=model1.parameters(), lr=0.01)

# Training function
def train(model, optimizer):
    """Run one full-graph optimisation step and return the training loss.

    The model is evaluated on every node of the notebook-level ``data`` object,
    but the cross-entropy loss is computed only on nodes selected by
    ``data.train_mask``.

    Args:
        model: Module called as ``model(x, edge_index)`` that returns per-node logits.
        optimizer: Optimizer holding ``model``'s parameters.

    Returns:
        The loss of this step as a Python float.
    """
    model.train()
    optimizer.zero_grad()

    logits = model(data.x, data.edge_index)
    train_idx = data.train_mask
    step_loss = F.cross_entropy(logits[train_idx], data.y[train_idx])

    step_loss.backward()
    optimizer.step()
    return step_loss.item()

# Evaluation function
@torch.no_grad()
def evaluate(model):
    """Return the accuracy of ``model`` on the nodes selected by ``data.test_mask``.

    The model is switched to eval mode and run on the whole notebook-level
    ``data`` graph without gradient tracking; the arg-max over dimension 1 is
    taken as the predicted class.

    Args:
        model: Module called as ``model(x, edge_index)`` that returns per-node logits.

    Returns:
        Whatever ``accuracy_score`` returns for the test-node labels and predictions.
    """
    model.eval()
    logits = model(data.x, data.edge_index)
    pred = logits.argmax(dim=1)

    test_idx = data.test_mask
    # sklearn works on host memory, so both tensors are moved to the CPU first.
    y_true = data.y[test_idx].cpu()
    y_pred = pred[test_idx].cpu()
    return accuracy_score(y_true, y_pred)

# Train GraphSAGE
# 50 full-graph training steps; test accuracy is logged on every 10th epoch.
for epoch in range(1, 51):
    loss = train(model1, opt1)
    if epoch % 10 != 0:
        continue
    acc1 = evaluate(model1)
    print(f'[GraphSAGE] Epoch {epoch} | Loss: {loss:.4f} | Test Acc: {acc1:.4f}')


[GraphSAGE] Epoch 10 | Loss: 0.1742 | Test Acc: 0.9196
[GraphSAGE] Epoch 20 | Loss: 0.1184 | Test Acc: 0.9259
[GraphSAGE] Epoch 30 | Loss: 0.0942 | Test Acc: 0.9314
[GraphSAGE] Epoch 40 | Loss: 0.0785 | Test Acc: 0.9365
[GraphSAGE] Epoch 50 | Loss: 0.0669 | Test Acc: 0.9376


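## Supervised baseline (repeat run): re-define and re-train GraphSAGE

> **Note:** This cell repeats the definitions and training loop of the previous cell. Running it re-initialises `model1` and trains it again from scratch, so all later cells use the model from this second run.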

1. **Define `GraphSAGE` model**
   - A two-layer GraphSAGE architecture:
     - First layer: `SAGEConv(in_channels → hidden_channels)` followed by ReLU activation.
     - Second layer: `SAGEConv(hidden_channels → out_channels)` to produce class logits.

2. **Initialize model and optimizer**
   - Creates `model1` with:
     - Input features = `data.num_features`
     - Hidden dimension = 64
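     - Output dimension = 2 (binary classification over labels 0 = licit and 1 = illicit; the train/test masks cover only these labelled nodes, so the "unknown" label 2 is never used for training or evaluation)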
   - Uses Adam optimizer with learning rate 0.01.

3. **`train(model, optimizer)`**
   - Sets the model to training mode.
   - Computes forward pass for all nodes.
   - Calculates cross-entropy loss on **training nodes** (`data.train_mask`).
   - Performs backpropagation and updates model parameters.
   - Returns the loss value.

4. **`evaluate(model)`**
   - Sets the model to evaluation mode.
   - Runs forward pass to get predicted logits.
   - Selects class predictions with `argmax`.
   - Calculates accuracy on **test nodes** (`data.test_mask`).

5. **Training loop**
   - Trains for 50 epochs.
   - Every 10 epochs:
     - Evaluates the model on the test set.
     - Prints epoch number, training loss, and test accuracy.


In [None]:
# Define GraphSAGE model
class GraphSAGE(torch.nn.Module):
    """GraphSAGE model with two SAGEConv layers separated by a ReLU.

    NOTE(review): this repeats the GraphSAGE definition from the preceding
    code cell; the redefinition shadows the earlier class without changing it.

    Args:
        in_channels: Size of each input node feature vector.
        hidden_channels: Output size of the first layer.
        out_channels: Output size of the second layer.
    """

    def __init__(self, in_channels, hidden_channels, out_channels):
        super(GraphSAGE, self).__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        """Return the second layer's output for node features ``x`` on graph ``edge_index``."""
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        return x

# Fresh GraphSAGE instance and optimizer. This replaces the `model1`/`opt1`
# trained in the preceding cell, so later cells see the model trained here.
model1 = GraphSAGE(data.num_features, hidden_channels=64, out_channels=2)
model1 = model1.to(device)
opt1 = torch.optim.Adam(model1.parameters(), lr=0.01)

# Training function
def train(model, optimizer):
    """Perform a single gradient step on the training nodes and return the loss.

    NOTE(review): this repeats the ``train`` function from the preceding code cell.

    Args:
        model: Module called as ``model(x, edge_index)`` on the notebook-level ``data``.
        optimizer: Optimizer that updates ``model``'s parameters.

    Returns:
        The cross-entropy loss over ``data.train_mask`` nodes, as a Python float.
    """
    model.train()
    optimizer.zero_grad()
    node_logits = model(data.x, data.edge_index)
    # Only nodes inside the training mask contribute to the loss.
    batch_loss = F.cross_entropy(
        node_logits[data.train_mask],
        data.y[data.train_mask],
    )
    batch_loss.backward()
    optimizer.step()
    return batch_loss.item()

# Evaluation function
@torch.no_grad()
def evaluate(model):
    """Compute test-set accuracy for ``model`` on the notebook-level ``data`` graph.

    NOTE(review): this repeats the ``evaluate`` function from the preceding code cell.

    Args:
        model: Module called as ``model(x, edge_index)`` that returns per-node logits.

    Returns:
        The value of ``accuracy_score`` over the nodes in ``data.test_mask``.
    """
    model.eval()
    pred = model(data.x, data.edge_index).argmax(dim=1)
    # Restrict labels and predictions to the test nodes and hand CPU tensors to sklearn.
    labels_test = data.y[data.test_mask].cpu()
    pred_test = pred[data.test_mask].cpu()
    acc = accuracy_score(labels_test, pred_test)
    return acc

# Train GraphSAGE
# Train the re-initialised model for 50 epochs, reporting test accuracy every 10th epoch.
for epoch in range(1, 51):
    loss = train(model1, opt1)
    is_report_epoch = (epoch % 10 == 0)
    if is_report_epoch:
        acc1 = evaluate(model1)
        print(f'[GraphSAGE] Epoch {epoch} | Loss: {loss:.4f} | Test Acc: {acc1:.4f}')


[GraphSAGE] Epoch 10 | Loss: 0.1789 | Test Acc: 0.9379
[GraphSAGE] Epoch 20 | Loss: 0.1239 | Test Acc: 0.9304
[GraphSAGE] Epoch 30 | Loss: 0.0958 | Test Acc: 0.9374
[GraphSAGE] Epoch 40 | Loss: 0.0790 | Test Acc: 0.9418
[GraphSAGE] Epoch 50 | Loss: 0.0669 | Test Acc: 0.9461


## Self-supervised alternative: Deep Graph Infomax (DGI) embeddings + linear classifier

1. **Define DGI Encoder (`Encoder` class)**
   - Uses a single `SAGEConv` layer to produce node embeddings.
   - `forward()`: Generates embeddings from input features and graph structure.
   - `summary()`: Computes a global summary vector using the mean of embeddings with sigmoid activation.

2. **Initialize Deep Graph Infomax (DGI)**
   - `hidden_channels=64`: Embedding size.
   - `encoder`: The custom SAGE-based encoder.
   - `summary`: The encoder’s summary function.
   - `corruption`: Node feature corruption by randomly shuffling rows (negative samples).
   - Moves DGI model to the selected device (CPU/GPU).

3. **Train DGI**
   - Runs for 100 epochs.
   - In each epoch:
     - Gets positive (`pos_z`) and negative (`neg_z`) embeddings and summary vector.
     - Computes DGI loss to maximize mutual information between graph patches and summary.
     - Updates parameters with Adam optimizer.
   - Prints loss every 20 epochs.

4. **Generate embeddings**
   - After training, uses the encoder to compute node embeddings `z` without gradient tracking.

5. **Define simple classifier (`Classifier` class)**
   - A single linear layer mapping embeddings to 2 output classes.

6. **Train classifier on embeddings**
   - Runs for 50 epochs.
   - Uses cross-entropy loss on training nodes.
   - Every 10 epochs:
     - Evaluates on test nodes and prints loss + accuracy.


In [None]:
# Define DGI encoder (SAGE)
class Encoder(torch.nn.Module):
    """Single-layer SAGEConv encoder used as the DGI backbone.

    Args:
        in_channels: Size of each input node feature vector.
        hidden_channels: Size of the embedding produced for each node.
    """

    def __init__(self, in_channels, hidden_channels):
        super().__init__()
        self.conv = SAGEConv(in_channels, hidden_channels)

    def forward(self, x, edge_index):
        """Return the SAGEConv output for ``x`` on ``edge_index`` (no activation applied)."""
        embeddings = self.conv(x, edge_index)
        return embeddings

    def summary(self, z, *args, **kwargs):
        """Collapse embeddings ``z`` into one vector: sigmoid of the mean over dimension 0.

        Extra positional and keyword arguments are accepted and ignored.
        """
        pooled = z.mean(dim=0)
        return torch.sigmoid(pooled)

def _shuffle_node_features(x, edge_index):
    """DGI corruption: randomly permute the rows of ``x`` and keep the graph unchanged.

    The permutation is created on ``x``'s own device so indexing a GPU tensor
    does not need an index tensor copied over from the CPU on every step.
    """
    perm = torch.randperm(x.size(0), device=x.device)
    return x[perm], edge_index


# Build the DGI model around the SAGE encoder; the encoder's own `summary`
# method provides the graph-level summary vector.
encoder = Encoder(data.num_features, 64).to(device)
dgi = DeepGraphInfomax(
    hidden_channels=64,
    encoder=encoder,
    summary=encoder.summary,
    corruption=_shuffle_node_features,
).to(device)

opt2 = torch.optim.Adam(dgi.parameters(), lr=0.001)

# Train DGI: 100 full-graph steps, logging the loss every 20 epochs.
dgi.train()
for epoch in range(1, 101):
    opt2.zero_grad()
    pos_z, neg_z, summary = dgi(data.x, data.edge_index)
    loss = dgi.loss(pos_z, neg_z, summary)
    loss.backward()
    opt2.step()
    if epoch % 20 == 0:
        print(f'[DGI] Epoch {epoch} | Loss: {loss.item():.4f}')

# Use the trained encoder to get z. The model is put in eval mode first so the
# embeddings are produced in inference mode; under no_grad the result carries
# no autograd history, so no extra detach is needed.
dgi.eval()
with torch.no_grad():
    z = encoder(data.x, data.edge_index)

# Define a simple classifier using z
class Classifier(torch.nn.Module):
    """Linear read-out head: one fully connected layer from embeddings to class logits.

    Args:
        in_dim: Size of each input embedding.
        out_dim: Number of output logits per embedding.
    """

    def __init__(self, in_dim, out_dim):
        super().__init__()
        self.lin = torch.nn.Linear(in_features=in_dim, out_features=out_dim)

    def forward(self, z):
        """Return the linear layer's output for the embeddings ``z``."""
        logits = self.lin(z)
        return logits

# Linear classifier trained on the frozen DGI embeddings `z`.
clf = Classifier(64, 2).to(device)
opt3 = torch.optim.Adam(clf.parameters(), lr=0.01)

# Train classifier: 50 steps, logging test accuracy on every 10th epoch.
for epoch in range(1, 51):
    clf.train()
    opt3.zero_grad()
    out = clf(z)
    loss = F.cross_entropy(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    opt3.step()
    if epoch % 10 == 0:
        # `out` was computed before opt3.step(), so it reflects the previous
        # weights. Re-score in eval mode without gradients so the reported
        # accuracy belongs to the classifier as it stands after this epoch.
        clf.eval()
        with torch.no_grad():
            pred = clf(z).argmax(dim=1)
        acc2 = accuracy_score(data.y[data.test_mask].cpu(), pred[data.test_mask].cpu())
        print(f'[DGI+Classifier] Epoch {epoch} | Loss: {loss.item():.4f} | Test Acc: {acc2:.4f}')


[DGI] Epoch 20 | Loss: 1.3332
[DGI] Epoch 40 | Loss: 1.3168
[DGI] Epoch 60 | Loss: 1.3127
[DGI] Epoch 80 | Loss: 1.3079
[DGI] Epoch 100 | Loss: 1.3077
[DGI+Classifier] Epoch 10 | Loss: 0.3885 | Test Acc: 0.4195
[DGI+Classifier] Epoch 20 | Loss: 0.3062 | Test Acc: 0.5254
[DGI+Classifier] Epoch 30 | Loss: 0.2758 | Test Acc: 0.5852
[DGI+Classifier] Epoch 40 | Loss: 0.2596 | Test Acc: 0.6278
[DGI+Classifier] Epoch 50 | Loss: 0.2494 | Test Acc: 0.6611


## Compare both models on the test set

1. **Prepare predictions for GraphSAGE (Model 1)**
   - Sets `model1` to evaluation mode.
   - Performs forward pass to get logits (`out1`) and predicted labels (`pred1`).
   - Retrieves true labels (`true1`) and applies the test mask (`data.test_mask`) to select test samples.

2. **Prepare predictions for DGI + Classifier (Model 2)**
   - Sets `clf` to evaluation mode.
   - Performs forward pass on precomputed embeddings `z` to get logits (`out2`) and predicted labels (`pred2`).
   - Retrieves true labels (`true2`) and applies the same test mask.

3. **Move tensors to CPU for sklearn**
   - Converts masked true and predicted labels for both models to CPU tensors so they can be used by `classification_report`.

4. **Print evaluation metrics**
   - For **GraphSAGE trained directly**.
   - For **DGI + GraphSAGE Classifier** (a linear classifier trained on the frozen DGI embeddings produced by the SAGE encoder).
   - Uses `classification_report()` to display Precision, Recall, F1-score, and Support for each class with 4 decimal precision.


In [None]:
from sklearn.metrics import classification_report

# Both models are scored on the same held-out test nodes.
mask = data.test_mask

# Model 1: GraphSAGE
# Inference only: disable gradient tracking so the full-graph forward pass
# does not build an autograd graph that is never used.
model1.eval()
with torch.no_grad():
    out1 = model1(data.x, data.edge_index)
pred1 = out1.argmax(dim=1)
true1 = data.y

# Model 2: DGI + Classifier
clf.eval()
with torch.no_grad():
    out2 = clf(z)
pred2 = out2.argmax(dim=1)
true2 = data.y

# Move only the final tensors used in sklearn to CPU
true1_cpu = true1[mask].cpu()
pred1_cpu = pred1[mask].cpu()

true2_cpu = true2[mask].cpu()
pred2_cpu = pred2[mask].cpu()

# Print evaluation metrics for comparison
print("====== GraphSAGE Trained Directly ======")
print(classification_report(true1_cpu, pred1_cpu, digits=4))

print("====== DGI + GraphSAGE Classifier ======")
print(classification_report(true2_cpu, pred2_cpu, digits=4))


              precision    recall  f1-score   support

           0     0.9659    0.9768    0.9713     15587
           1     0.6013    0.5042    0.5485      1083

    accuracy                         0.9461     16670
   macro avg     0.7836    0.7405    0.7599     16670
weighted avg     0.9422    0.9461    0.9439     16670

              precision    recall  f1-score   support

           0     0.9743    0.6577    0.7853     15587
           1     0.1322    0.7507    0.2248      1083

    accuracy                         0.6637     16670
   macro avg     0.5533    0.7042    0.5051     16670
weighted avg     0.9196    0.6637    0.7489     16670

