<a href="https://colab.research.google.com/github/arumishra/Assignment-Codes/blob/main/complexnetworks2_2GCN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import zipfile
import os

zip_path = "/content/webkb.zip"  # Path to the ZIP file
extract_path = "/content"  # Extract directly to /content

# Extract the dataset
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

# Set the correct data directory path
data_directory = "/content/webkb"  # Now this should directly contain the files

# List extracted files to verify
print("Extracted files:", os.listdir(data_directory))


Extracted files: ['README', 'wisconsin.cites', 'washington.content', 'wisconsin.content', 'texas.cites', 'cornell.content', 'washington.cites', 'texas.content', 'cornell.cites']


In [4]:
import os

# Define paths
data_directory = "/content/webkb"
output_cites = "/content/webkb_combined/combined.cites"

# Ensure output directory exists
os.makedirs("/content/webkb_combined", exist_ok=True)

# Merge .cites files
with open(output_cites, "w") as outfile:
    for file in os.listdir(data_directory):
        if file.endswith(".cites"):
            file_path = os.path.join(data_directory, file)
            print(f"🔄 Merging {file_path}")
            with open(file_path, "r") as infile:
                lines = infile.readlines()
                if lines:
                    print(f"✅ {file} has {len(lines)} edges")
                    outfile.writelines(lines)
                else:
                    print(f"⚠️ {file} is empty!")

print("✅ Merging .cites files complete!")


🔄 Merging /content/webkb/wisconsin.cites
✅ wisconsin.cites has 530 edges
🔄 Merging /content/webkb/texas.cites
✅ texas.cites has 328 edges
🔄 Merging /content/webkb/washington.cites
✅ washington.cites has 446 edges
🔄 Merging /content/webkb/cornell.cites
✅ cornell.cites has 304 edges
✅ Merging .cites files complete!


In [5]:
# Define output file path
output_content = "/content/webkb_combined/combined.content"

# Merge .content files
with open(output_content, "w") as outfile:
    for file in os.listdir(data_directory):
        if file.endswith(".content"):
            file_path = os.path.join(data_directory, file)
            print(f"🔄 Merging {file_path}")
            with open(file_path, "r") as infile:
                lines = infile.readlines()
                if lines:
                    print(f"✅ {file} has {len(lines)} nodes")
                    outfile.writelines(lines)
                else:
                    print(f"⚠️ {file} is empty!")

print("✅ Merging .content files complete!")


🔄 Merging /content/webkb/washington.content
✅ washington.content has 230 nodes
🔄 Merging /content/webkb/wisconsin.content
✅ wisconsin.content has 265 nodes
🔄 Merging /content/webkb/cornell.content
✅ cornell.content has 195 nodes
🔄 Merging /content/webkb/texas.content
✅ texas.content has 187 nodes
✅ Merging .content files complete!


In [6]:
print("✅ Size of combined.cites:", os.path.getsize(output_cites), "bytes")
print("✅ Size of combined.content:", os.path.getsize(output_content), "bytes")


✅ Size of combined.cites: 137522 bytes
✅ Size of combined.content: 3034502 bytes


In [9]:
import numpy as np
import random
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# ------------------- Activation and Utility -------------------
def relu(x):
    return np.maximum(0, x)

def softmax(x):
    exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
    return exp_x / np.sum(exp_x, axis=1, keepdims=True)

# ------------------- Adam Optimizer -------------------
class AdamOptimizer:
    def __init__(self, shape, lr=0.01, beta1=0.9, beta2=0.999, eps=1e-8):
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        self.eps = eps
        self.m = np.zeros(shape)
        self.v = np.zeros(shape)
        self.t = 0

    def update(self, param, grad):
        self.t += 1
        self.m = self.beta1 * self.m + (1 - self.beta1) * grad
        self.v = self.beta2 * self.v + (1 - self.beta2) * (grad ** 2)
        m_hat = self.m / (1 - self.beta1 ** self.t)
        v_hat = self.v / (1 - self.beta2 ** self.t)
        param -= self.lr * m_hat / (np.sqrt(v_hat) + self.eps)
        return param

# ------------------- GCN Layer -------------------
class GCNLayer:
    def __init__(self, in_dim, out_dim, lr=0.01):
        self.W = np.random.randn(in_dim, out_dim) * 0.01
        self.optimizer = AdamOptimizer(self.W.shape, lr)

    def forward(self, X, A_hat):
        self.X = X
        self.Z = A_hat @ X @ self.W
        return relu(self.Z)

    def backward(self, grad, A_hat):
        grad_Z = grad * (self.Z > 0).astype(float)
        dW = self.X.T @ A_hat.T @ grad_Z
        self.W = self.optimizer.update(self.W, dW)
        return grad_Z @ self.W.T

# ------------------- GCN Model -------------------
class GCN:
    def __init__(self, input_dim, hidden_dim, output_dim, lr=0.01):
        self.gcn1 = GCNLayer(input_dim, hidden_dim, lr)
        self.gcn2 = GCNLayer(hidden_dim, output_dim, lr)

    def forward(self, X, A_hat, training=True):
        self.h1 = self.gcn1.forward(X, A_hat)
        if training:
            dropout_mask = (np.random.rand(*self.h1.shape) > 0.5)
            self.h1 *= dropout_mask  # Dropout with rate = 0.5
        self.out = A_hat @ self.h1 @ self.gcn2.W
        return softmax(self.out)

    def backward(self, X, A_hat, y_true, idx_train):
        preds = softmax(self.out)
        y_onehot = np.eye(preds.shape[1])[y_true]
        loss_grad = (preds - y_onehot) / len(idx_train)
        grad_out = np.zeros_like(loss_grad)
        grad_out[idx_train] = loss_grad[idx_train]

        grad_h1 = grad_out @ self.gcn2.W.T
        self.gcn2.W = self.gcn2.optimizer.update(self.gcn2.W, self.h1.T @ A_hat.T @ grad_out)
        self.gcn1.backward(grad_h1, A_hat)

    def predict(self):
        return np.argmax(self.out, axis=1)

# ------------------- Data Processing -------------------
def load_content_file(path):
    paper_ids, features, labels = [], [], []

    with open(path, 'r') as f:
        for line in f:
            parts = line.strip().split()
            paper_ids.append(parts[0])
            features.append(list(map(int, parts[1:-1])))
            labels.append(parts[-1])

    label_set = sorted(set(labels))
    label_to_index = {label: i for i, label in enumerate(label_set)}
    encoded_labels = np.array([label_to_index[label] for label in labels])

    id_to_index = {pid: i for i, pid in enumerate(paper_ids)}
    features = np.array(features)
    return features, encoded_labels, id_to_index, label_to_index

def load_cites_file(path, id_to_index, num_nodes):
    adj = np.zeros((num_nodes, num_nodes))
    with open(path, 'r') as f:
        for line in f:
            src, dst = line.strip().split()
            if src in id_to_index and dst in id_to_index:
                i, j = id_to_index[src], id_to_index[dst]
                adj[i][j] = 1
                adj[j][i] = 1
    return adj

def normalize_adjacency(adj):
    adj = adj + np.eye(adj.shape[0])
    deg = np.sum(adj, axis=1)
    d_inv_sqrt = np.power(deg, -0.5)
    d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.
    D_inv_sqrt = np.diag(d_inv_sqrt)
    return D_inv_sqrt @ adj @ D_inv_sqrt

def split_nodes(num_nodes, train_ratio=0.7):
    indices = np.arange(num_nodes)
    np.random.shuffle(indices)
    train_end = int(train_ratio * num_nodes)
    return indices[:train_end], indices[train_end:]

def cross_entropy(preds, labels, idx):
    logp = -np.log(preds[np.arange(len(preds)), labels])
    return np.mean(logp[idx])

# ------------------- Training and Evaluation -------------------
def train_gcn(X, y, A_hat, idx_train, idx_test, num_classes, epochs=200, lr=0.01):
    model = GCN(X.shape[1], 16, num_classes, lr)

    for epoch in range(epochs):
        out = model.forward(X, A_hat)
        loss = cross_entropy(out, y, idx_train)
        model.backward(X, A_hat, y, idx_train)

        if epoch % 10 == 0:
            pred = model.predict()
            acc = accuracy_score(y[idx_train], pred[idx_train])
            print(f"Epoch {epoch} | Loss: {loss:.4f} | Train Acc: {acc:.4f}")

    return model

def evaluate(model, y_true, idx_test):
    pred = model.predict()
    y_pred = pred[idx_test]
    y_true = y_true[idx_test]
    print("\n--- Test Set Evaluation ---")
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Precision:", precision_score(y_true, y_pred, average='macro'))
    print("Recall:", recall_score(y_true, y_pred, average='macro'))
    print("F1 Score:", f1_score(y_true, y_pred, average='macro'))
    import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd

def evaluate_detailed(model, y_true, idx_test, label_map, paper_ids):
    pred = model.predict()
    y_pred = pred[idx_test]
    y_true = y_true[idx_test]

    # --- Metric Summary ---
    print("\n--- Per-Class Metrics ---")
    print(classification_report(y_true, y_pred, target_names=list(label_map.keys())))

    # --- Confusion Matrix ---
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=list(label_map.keys()),
                yticklabels=list(label_map.keys()))
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.title("Confusion Matrix")
    plt.tight_layout()
    plt.show()

# ------------------- Main -------------------
if __name__ == "__main__":
    # Load data
    X, y, id_to_idx, label_map = load_content_file("webkb_combined/combined.content")
    A = load_cites_file("webkb_combined/combined.cites", id_to_idx, X.shape[0])
    A_hat = normalize_adjacency(A)

    # Train-test split
    idx_train, idx_test = split_nodes(len(y), 0.7)

    # Train
    model = train_gcn(X, y, A_hat, idx_train, idx_test, num_classes=len(label_map))

    # Evaluate
    evaluate(model, y, idx_test)


Epoch 0 | Loss: 1.6096 | Train Acc: 0.1077
Epoch 10 | Loss: 1.3552 | Train Acc: 0.4796
Epoch 20 | Loss: 1.1826 | Train Acc: 0.5530
Epoch 30 | Loss: 1.0459 | Train Acc: 0.6085
Epoch 40 | Loss: 0.9824 | Train Acc: 0.6281
Epoch 50 | Loss: 0.9508 | Train Acc: 0.6248
Epoch 60 | Loss: 0.9983 | Train Acc: 0.5824
Epoch 70 | Loss: 0.9558 | Train Acc: 0.6264
Epoch 80 | Loss: 0.9574 | Train Acc: 0.6020
Epoch 90 | Loss: 0.9768 | Train Acc: 0.6003
Epoch 100 | Loss: 0.9751 | Train Acc: 0.5938
Epoch 110 | Loss: 0.9449 | Train Acc: 0.6362
Epoch 120 | Loss: 0.9403 | Train Acc: 0.5938
Epoch 130 | Loss: 0.9474 | Train Acc: 0.6378
Epoch 140 | Loss: 0.9821 | Train Acc: 0.5840
Epoch 150 | Loss: 0.9307 | Train Acc: 0.6297
Epoch 160 | Loss: 0.9384 | Train Acc: 0.6232
Epoch 170 | Loss: 0.9916 | Train Acc: 0.5987
Epoch 180 | Loss: 0.9651 | Train Acc: 0.5971
Epoch 190 | Loss: 0.9164 | Train Acc: 0.6052

--- Test Set Evaluation ---
Accuracy: 0.4128787878787879
Precision: 0.2829938532348518
Recall: 0.2739035375399

In [8]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd

def evaluate_detailed(model, y_true, idx_test, label_map, paper_ids):
    pred = model.predict()
    y_pred = pred[idx_test]
    y_true = y_true[idx_test]

    # --- Metric Summary ---
    print("\n--- Per-Class Metrics ---")
    print(classification_report(y_true, y_pred, target_names=list(label_map.keys())))

    # --- Confusion Matrix ---
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=list(label_map.keys()),
                yticklabels=list(label_map.keys()))
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.title("Confusion Matrix")
    plt.tight_layout()
    plt.show()