<a href="https://colab.research.google.com/github/arumishra/Assignment-Codes/blob/main/GAT_GIN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import zipfile
import os

zip_path = "/content/webkb.zip"  # Path to the ZIP file
extract_path = "/content"  # Extract directly to /content

# Extract the dataset
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

# Set the correct data directory path
data_directory = "/content/webkb"  # Now this should directly contain the files

# List extracted files to verify
print("Extracted files:", os.listdir(data_directory))


Extracted files: ['README', 'wisconsin.cites', 'washington.content', 'wisconsin.content', 'texas.cites', 'cornell.content', 'washington.cites', 'texas.content', 'cornell.cites']


In [3]:
# Install torch-scatter, torch-sparse, torch-cluster, and torch-spline-conv (PyG dependencies)
# Select appropriate versions based on your torch version (below assumes torch 2.0.0+ and CUDA 11.7)
!pip install torch-scatter -f https://data.pyg.org/whl/torch-2.0.0+cpu.html
!pip install torch-sparse -f https://data.pyg.org/whl/torch-2.0.0+cpu.html
!pip install torch-cluster -f https://data.pyg.org/whl/torch-2.0.0+cpu.html
!pip install torch-spline-conv -f https://data.pyg.org/whl/torch-2.0.0+cpu.html

# Now install PyTorch Geometric
!pip install torch-geometric


Looking in links: https://data.pyg.org/whl/torch-2.0.0+cpu.html
Collecting torch-scatter
  Downloading https://data.pyg.org/whl/torch-2.0.0%2Bcpu/torch_scatter-2.1.2%2Bpt20cpu-cp311-cp311-linux_x86_64.whl (494 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m494.0/494.0 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch-scatter
Successfully installed torch-scatter-2.1.2+pt20cpu
Looking in links: https://data.pyg.org/whl/torch-2.0.0+cpu.html
Collecting torch-sparse
  Downloading https://data.pyg.org/whl/torch-2.0.0%2Bcpu/torch_sparse-0.6.18%2Bpt20cpu-cp311-cp311-linux_x86_64.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m26.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch-sparse
Successfully installed torch-sparse-0.6.18+pt20cpu
Looking in links: https://data.pyg.org/whl/torch-2.0.0+cpu.html
Collecting torch-cluster
  Downloading https://data.pyg.org/whl/torch-

In [4]:
import os

# Define paths
data_directory = "/content/webkb"
output_cites = "/content/webkb_combined/combined.cites"

# Ensure output directory exists
os.makedirs("/content/webkb_combined", exist_ok=True)

# Merge .cites files
with open(output_cites, "w") as outfile:
    for file in os.listdir(data_directory):
        if file.endswith(".cites"):
            file_path = os.path.join(data_directory, file)
            print(f"🔄 Merging {file_path}")
            with open(file_path, "r") as infile:
                lines = infile.readlines()
                if lines:
                    print(f"✅ {file} has {len(lines)} edges")
                    outfile.writelines(lines)
                else:
                    print(f"⚠️ {file} is empty!")

print("✅ Merging .cites files complete!")


🔄 Merging /content/webkb/wisconsin.cites
✅ wisconsin.cites has 530 edges
🔄 Merging /content/webkb/texas.cites
✅ texas.cites has 328 edges
🔄 Merging /content/webkb/washington.cites
✅ washington.cites has 446 edges
🔄 Merging /content/webkb/cornell.cites
✅ cornell.cites has 304 edges
✅ Merging .cites files complete!


In [5]:
# Define output file path
output_content = "/content/webkb_combined/combined.content"

# Merge .content files
with open(output_content, "w") as outfile:
    for file in os.listdir(data_directory):
        if file.endswith(".content"):
            file_path = os.path.join(data_directory, file)
            print(f"🔄 Merging {file_path}")
            with open(file_path, "r") as infile:
                lines = infile.readlines()
                if lines:
                    print(f"✅ {file} has {len(lines)} nodes")
                    outfile.writelines(lines)
                else:
                    print(f"⚠️ {file} is empty!")

print("✅ Merging .content files complete!")


🔄 Merging /content/webkb/washington.content
✅ washington.content has 230 nodes
🔄 Merging /content/webkb/wisconsin.content
✅ wisconsin.content has 265 nodes
🔄 Merging /content/webkb/cornell.content
✅ cornell.content has 195 nodes
🔄 Merging /content/webkb/texas.content
✅ texas.content has 187 nodes
✅ Merging .content files complete!


In [6]:
print("✅ Size of combined.cites:", os.path.getsize(output_cites), "bytes")
print("✅ Size of combined.content:", os.path.getsize(output_content), "bytes")


✅ Size of combined.cites: 137522 bytes
✅ Size of combined.content: 3034502 bytes


In [7]:
import torch
from torch_geometric.data import Data
from torch_geometric.nn import GINConv
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np

# Load WebKB Dataset
def load_webkb_dataset(content_path, cites_path):
    paper_ids = []
    features = []
    labels = []

    with open(content_path, 'r') as f:
        for line in f:
            parts = line.strip().split()
            paper_ids.append(parts[0])
            features.append([int(x) for x in parts[1:-1]])
            labels.append(parts[-1])

    id_to_idx = {paper_id: idx for idx, paper_id in enumerate(paper_ids)}
    x = torch.tensor(features, dtype=torch.float)
    label_set = sorted(set(labels))
    label_to_idx = {label: i for i, label in enumerate(label_set)}
    y = torch.tensor([label_to_idx[label] for label in labels], dtype=torch.long)

    edge_index = []
    with open(cites_path, 'r') as f:
        for line in f:
            src, dst = line.strip().split()
            if src in id_to_idx and dst in id_to_idx:
                src_idx = id_to_idx[src]
                dst_idx = id_to_idx[dst]
                edge_index.append([src_idx, dst_idx])
                edge_index.append([dst_idx, src_idx])

    edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
    data = Data(x=x, edge_index=edge_index, y=y)
    return data, label_to_idx

# Create Train/Test Masks
def split_train_test(data, train_ratio=0.7, seed=42):
    torch.manual_seed(seed)
    np.random.seed(seed)
    indices = np.arange(data.num_nodes)
    labels = data.y.numpy()
    train_idx, test_idx = train_test_split(indices, train_size=train_ratio, stratify=labels, random_state=seed)

    train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
    test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
    train_mask[train_idx] = True
    test_mask[test_idx] = True
    data.train_mask = train_mask
    data.test_mask = test_mask
    return data

# GIN Model
class GIN(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, dropout=0.5):
        super(GIN, self).__init__()
        nn1 = nn.Sequential(nn.Linear(input_dim, hidden_dim), nn.ReLU(), nn.Linear(hidden_dim, hidden_dim))
        self.conv1 = GINConv(nn1)
        nn2 = nn.Sequential(nn.Linear(hidden_dim, hidden_dim), nn.ReLU(), nn.Linear(hidden_dim, hidden_dim))
        self.conv2 = GINConv(nn2)
        self.classifier = nn.Linear(hidden_dim, output_dim)
        self.dropout = dropout

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        out = self.classifier(x)
        return out

# Train and Evaluate
def train(model, data, optimizer, criterion):
    model.train()
    optimizer.zero_grad()
    out = model(data)
    loss = criterion(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    return loss.item()

def test(model, data):
    model.eval()
    with torch.no_grad():
        out = model(data)
        pred = out.argmax(dim=1)
        y_true = data.y[data.test_mask].cpu()
        y_pred = pred[data.test_mask].cpu()
        acc = accuracy_score(y_true, y_pred)
        prec, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='macro', zero_division=0)
    return acc, prec, recall, f1

# Main Execution
if __name__ == "__main__":
    content_path = "webkb_combined/combined.content"
    cites_path = "webkb_combined/combined.cites"

    data, label_map = load_webkb_dataset(content_path, cites_path)
    data = split_train_test(data)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = GIN(input_dim=data.num_node_features, hidden_dim=16, output_dim=len(label_map)).to(device)
    data = data.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    criterion = nn.CrossEntropyLoss()

    print("Training GIN model...\n")
    for epoch in range(1, 201):
        loss = train(model, data, optimizer, criterion)
        if epoch % 20 == 0:
            acc, prec, rec, f1 = test(model, data)
            print(f"Epoch {epoch:03d} | Loss: {loss:.4f} | Acc: {acc:.4f} | F1: {f1:.4f}")

    # Final Evaluation
    acc, prec, rec, f1 = test(model, data)
    print("\n--- Final Test Metrics ---")
    print(f"Accuracy:  {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall:    {rec:.4f}")
    print(f"F1 Score:  {f1:.4f}")




Training GIN model...

Epoch 020 | Loss: 1.3846 | Acc: 0.4735 | F1: 0.1285
Epoch 040 | Loss: 1.2998 | Acc: 0.4735 | F1: 0.1285
Epoch 060 | Loss: 1.2783 | Acc: 0.4735 | F1: 0.1285
Epoch 080 | Loss: 1.2648 | Acc: 0.4735 | F1: 0.1285
Epoch 100 | Loss: 1.2395 | Acc: 0.4583 | F1: 0.2149
Epoch 120 | Loss: 1.2611 | Acc: 0.5076 | F1: 0.2355
Epoch 140 | Loss: 1.1851 | Acc: 0.5114 | F1: 0.2374
Epoch 160 | Loss: 1.1275 | Acc: 0.6023 | F1: 0.2794
Epoch 180 | Loss: 1.0647 | Acc: 0.5985 | F1: 0.2782
Epoch 200 | Loss: 0.9714 | Acc: 0.6023 | F1: 0.2820

--- Final Test Metrics ---
Accuracy:  0.6023
Precision: 0.2514
Recall:    0.3253
F1 Score:  0.2820


In [8]:
#GAT
import torch
import torch.nn.functional as F
import torch.nn as nn
from torch_geometric.nn import GATConv
from torch_geometric.data import Data
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np

# 1. Load Dataset
def load_webkb_dataset(content_path, cites_path):
    paper_ids = []
    features = []
    labels = []

    with open(content_path, 'r') as f:
        for line in f:
            parts = line.strip().split()
            paper_ids.append(parts[0])
            features.append([int(x) for x in parts[1:-1]])
            labels.append(parts[-1])

    id_to_idx = {paper_id: idx for idx, paper_id in enumerate(paper_ids)}
    x = torch.tensor(features, dtype=torch.float)
    label_set = sorted(set(labels))
    label_to_idx = {label: i for i, label in enumerate(label_set)}
    y = torch.tensor([label_to_idx[label] for label in labels], dtype=torch.long)

    edge_index = []
    with open(cites_path, 'r') as f:
        for line in f:
            src, dst = line.strip().split()
            if src in id_to_idx and dst in id_to_idx:
                src_idx = id_to_idx[src]
                dst_idx = id_to_idx[dst]
                edge_index.append([src_idx, dst_idx])
                edge_index.append([dst_idx, src_idx])

    edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
    data = Data(x=x, edge_index=edge_index, y=y)
    return data, label_to_idx

# 2. Create train/test split
def split_train_test(data, train_ratio=0.7, seed=42):
    torch.manual_seed(seed)
    np.random.seed(seed)
    indices = np.arange(data.num_nodes)
    labels = data.y.numpy()
    train_idx, test_idx = train_test_split(indices, train_size=train_ratio, stratify=labels, random_state=seed)

    train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
    test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
    train_mask[train_idx] = True
    test_mask[test_idx] = True
    data.train_mask = train_mask
    data.test_mask = test_mask
    return data

# 3. Define GAT model
class GAT(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, heads=8, dropout=0.5):
        super(GAT, self).__init__()
        self.gat1 = GATConv(input_dim, hidden_dim, heads=heads, dropout=dropout)
        self.gat2 = GATConv(hidden_dim * heads, output_dim, heads=1, concat=False, dropout=dropout)
        self.dropout = dropout

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.gat1(x, edge_index)
        x = F.elu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.gat2(x, edge_index)
        return x

# 4. Train and Evaluate
def train(model, data, optimizer, criterion):
    model.train()
    optimizer.zero_grad()
    out = model(data)
    loss = criterion(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    return loss.item()

def test(model, data):
    model.eval()
    with torch.no_grad():
        out = model(data)
        pred = out.argmax(dim=1)
        y_true = data.y[data.test_mask].cpu()
        y_pred = pred[data.test_mask].cpu()
        acc = accuracy_score(y_true, y_pred)
        prec, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='macro', zero_division=0)
    return acc, prec, recall, f1

# 5. Run everything
if __name__ == "__main__":
    content_path = "webkb_combined/combined.content"
    cites_path = "webkb_combined/combined.cites"

    data, label_map = load_webkb_dataset(content_path, cites_path)
    data = split_train_test(data)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = GAT(input_dim=data.num_node_features, hidden_dim=16, output_dim=len(label_map)).to(device)
    data = data.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    criterion = nn.CrossEntropyLoss()

    print("Training GAT model...\n")
    for epoch in range(1, 201):
        loss = train(model, data, optimizer, criterion)
        if epoch % 20 == 0:
            acc, prec, rec, f1 = test(model, data)
            print(f"Epoch {epoch:03d} | Loss: {loss:.4f} | Acc: {acc:.4f} | F1: {f1:.4f}")

    # Final evaluation
    acc, prec, rec, f1 = test(model, data)
    print("\n--- Final Test Metrics ---")
    print(f"Accuracy:  {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall:    {rec:.4f}")
    print(f"F1 Score:  {f1:.4f}")


Training GAT model...

Epoch 020 | Loss: 1.3026 | Acc: 0.5455 | F1: 0.2861
Epoch 040 | Loss: 1.1143 | Acc: 0.5795 | F1: 0.3372
Epoch 060 | Loss: 0.9934 | Acc: 0.5833 | F1: 0.3595
Epoch 080 | Loss: 0.9040 | Acc: 0.5682 | F1: 0.3847
Epoch 100 | Loss: 0.8444 | Acc: 0.5947 | F1: 0.4083
Epoch 120 | Loss: 0.7996 | Acc: 0.5644 | F1: 0.3625
Epoch 140 | Loss: 0.7808 | Acc: 0.5758 | F1: 0.3711
Epoch 160 | Loss: 0.8485 | Acc: 0.5758 | F1: 0.3630
Epoch 180 | Loss: 0.7597 | Acc: 0.5568 | F1: 0.3544
Epoch 200 | Loss: 0.7111 | Acc: 0.5644 | F1: 0.3642

--- Final Test Metrics ---
Accuracy:  0.5644
Precision: 0.3923
Recall:    0.3628
F1 Score:  0.3642
