<a href="https://colab.research.google.com/github/arumishra/Assignment-Codes/blob/main/complexnetworkstask1_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [38]:
import zipfile
import os

zip_path = "/content/webkb.zip"  # Path to the ZIP file
extract_path = "/content"  # Extract directly to /content

# Extract the dataset
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

# Set the correct data directory path
data_directory = "/content/webkb"  # Now this should directly contain the files

# List extracted files to verify
print("Extracted files:", os.listdir(data_directory))


Extracted files: ['wisconsin.cites', 'cornell.cites', 'texas.cites', 'washington.content', 'wisconsin.content', 'texas.content', 'cornell.content', 'README', 'washington.cites', 'webkb']


In [39]:
import os

# Define paths
data_directory = "/content/webkb"
output_cites = "/content/webkb_combined/combined.cites"

# Ensure output directory exists
os.makedirs("/content/webkb_combined", exist_ok=True)

# Merge .cites files
with open(output_cites, "w") as outfile:
    for file in os.listdir(data_directory):
        if file.endswith(".cites"):
            file_path = os.path.join(data_directory, file)
            print(f"🔄 Merging {file_path}")
            with open(file_path, "r") as infile:
                lines = infile.readlines()
                if lines:
                    print(f"✅ {file} has {len(lines)} edges")
                    outfile.writelines(lines)
                else:
                    print(f"⚠️ {file} is empty!")

print("✅ Merging .cites files complete!")


🔄 Merging /content/webkb/wisconsin.cites
✅ wisconsin.cites has 530 edges
🔄 Merging /content/webkb/cornell.cites
✅ cornell.cites has 304 edges
🔄 Merging /content/webkb/texas.cites
✅ texas.cites has 328 edges
🔄 Merging /content/webkb/washington.cites
✅ washington.cites has 446 edges
✅ Merging .cites files complete!


In [40]:
# Define output file path
output_content = "/content/webkb_combined/combined.content"

# Merge .content files
with open(output_content, "w") as outfile:
    for file in os.listdir(data_directory):
        if file.endswith(".content"):
            file_path = os.path.join(data_directory, file)
            print(f"🔄 Merging {file_path}")
            with open(file_path, "r") as infile:
                lines = infile.readlines()
                if lines:
                    print(f"✅ {file} has {len(lines)} nodes")
                    outfile.writelines(lines)
                else:
                    print(f"⚠️ {file} is empty!")

print("✅ Merging .content files complete!")


🔄 Merging /content/webkb/washington.content
✅ washington.content has 230 nodes
🔄 Merging /content/webkb/wisconsin.content
✅ wisconsin.content has 265 nodes
🔄 Merging /content/webkb/texas.content
✅ texas.content has 187 nodes
🔄 Merging /content/webkb/cornell.content
✅ cornell.content has 195 nodes
✅ Merging .content files complete!


In [41]:
print("✅ Size of combined.cites:", os.path.getsize(output_cites), "bytes")
print("✅ Size of combined.content:", os.path.getsize(output_content), "bytes")


✅ Size of combined.cites: 137522 bytes
✅ Size of combined.content: 3034502 bytes


In [42]:
import random

def split_cites_file(input_file, train_file, test_file, split_ratio=0.7):
    with open(input_file, "r") as f:
        lines = f.readlines()

    random.shuffle(lines)
    split_idx = int(len(lines) * split_ratio)

    with open(train_file, "w") as f:
        f.writelines(lines[:split_idx])

    with open(test_file, "w") as f:
        f.writelines(lines[split_idx:])

# Run the function
split_cites_file("webkb_combined/combined.cites",
                 "webkb_combined/combined_train.cites",
                 "webkb_combined/combined_test.cites")


In [51]:
from collections import defaultdict

def load_webkb_data(data_dir, train_cites_file):
    graph = defaultdict(list)
    labels = {}

    # Load node labels from combined.content
    with open(os.path.join(data_dir, "combined.content"), "r") as f:
        for line in f:
            parts = line.strip().split()
            webpage_id, class_label = parts[0], parts[-1]
            labels[webpage_id] = class_label
            graph[webpage_id] = []  # Ensure every node appears in the graph

    # Load edges from combined_train.cites
    with open(train_cites_file, "r") as f:
        for line in f:
            cited, citing = line.strip().split()
            graph[citing].append(cited)  # Directed edge: citing → cited

    return graph, labels

# Load train graph
graph, labels = load_webkb_data("webkb_combined", "webkb_combined/combined_train.cites")
# Load test graph
test_graph, test_labels = load_webkb_data("webkb_combined", "webkb_combined/combined_test.cites")


print("✅ Data Loaded: Sample Graph:", dict(list(graph.items())[:5]))
print("✅ Labels Loaded: Sample Labels:", dict(list(labels.items())[:5]))


✅ Data Loaded: Sample Graph: {'http://metacrawler.cs.washington.edu:8080': ['http://metacrawler.cs.washington.edu:8080', 'http://www.cs.washington.edu/homes/etzioni'], 'http://www.cs.washington.edu': [], 'http://www.cs.washington.edu/education/courses/135': ['http://www.cs.washington.edu/education/courses/135'], 'http://www.cs.washington.edu/education/courses/142/95a': ['http://www.cs.washington.edu/education/courses/142/95a'], 'http://www.cs.washington.edu/education/courses/142/currentqtr': ['http://www.cs.washington.edu/education/courses/142/95a', 'http://www.cs.washington.edu/homes/dickey', 'http://www.cs.washington.edu/education/courses/142/currentqtr']}
✅ Labels Loaded: Sample Labels: {'http://metacrawler.cs.washington.edu:8080': 'project', 'http://www.cs.washington.edu': 'course', 'http://www.cs.washington.edu/education/courses/135': 'course', 'http://www.cs.washington.edu/education/courses/142/95a': 'course', 'http://www.cs.washington.edu/education/courses/142/currentqtr': 'cour

In [50]:
def check_split_ratio():
    original_file = "webkb_combined/combined.cites"
    train_file = "webkb_combined/combined_train.cites"
    test_file = "webkb_combined/combined_test.cites"

    # Count edges in each file
    total_edges = sum(1 for _ in open(original_file, "r"))
    train_edges = sum(1 for _ in open(train_file, "r"))
    test_edges = sum(1 for _ in open(test_file, "r"))

    # Compute the split ratio
    train_ratio = train_edges / total_edges
    test_ratio = test_edges / total_edges

    # Print verification
    print(f"✅ Total Edges: {total_edges}")
    print(f"✅ Training Edges: {train_edges} ({train_ratio:.2%})")
    print(f"✅ Testing Edges: {test_edges} ({test_ratio:.2%})")

check_split_ratio()


✅ Total Edges: 1608
✅ Training Edges: 1125 (69.96%)
✅ Testing Edges: 483 (30.04%)


✅ Random Walks Generated! Sample: [['http://www.cs.utexas.edu/users/novak/cs375.html'], ['http://simon.cs.cornell.edu/info/courses/current/cs401'], ['http://www.cs.utexas.edu/users/rdb/cs372', 'http://www.cs.utexas.edu/users/rdb', 'http://www.cs.utexas.edu/users/less', 'http://www.cs.utexas.edu/users/rdb', 'http://www.cs.utexas.edu/users/rdb', 'http://www.cs.utexas.edu/users/rdb', 'http://www.cs.utexas.edu/users/rdb', 'http://www.cs.utexas.edu/users/less', 'http://www.cs.utexas.edu/users/rdb', 'http://www.cs.utexas.edu/users/rdb']]


In [45]:
import random

class Node2VecRandomWalker:
    def __init__(self, graph, p=1.0, q=1.0, walk_length=10, num_walks=5):
        """
        Initializes the Node2Vec random walker.

        Args:
            graph (dict): Adjacency list representation of the graph.
            p (float): Return parameter (higher means staying closer to the start node).
            q (float): In-out parameter (higher means exploring more).
            walk_length (int): Number of steps per walk.
            num_walks (int): Number of random walks per node.
        """
        self.graph = graph
        self.p = p
        self.q = q
        self.walk_length = walk_length
        self.num_walks = num_walks

    def random_walk(self, start_node):
        """
        Performs a biased random walk starting from the given node.

        Args:
            start_node (str): The node where the walk starts.

        Returns:
            list: A list of nodes representing the random walk.
        """
        walk = [start_node]

        # If the node has no neighbors, return the walk
        if len(self.graph[start_node]) == 0:
            return walk

        # Start with a random first step
        current_node = start_node
        next_node = random.choice(self.graph[current_node])
        walk.append(next_node)

        for _ in range(self.walk_length - 1):
            current_node = walk[-1]
            prev_node = walk[-2] if len(walk) > 1 else None

            neighbors = self.graph[current_node]
            if len(neighbors) == 0:
                break  # Stop if no outgoing edges

            # Compute transition probabilities
            probabilities = self.get_transition_probabilities(prev_node, current_node, neighbors)

            # Choose next node based on transition probabilities
            next_node = random.choices(neighbors, weights=probabilities)[0]
            walk.append(next_node)

        return walk

    def get_transition_probabilities(self, prev_node, current_node, neighbors):
        """
        Computes the transition probabilities for the next step.

        Args:
            prev_node (str): Previous node in the walk.
            current_node (str): Current node in the walk.
            neighbors (list): List of neighbor nodes.

        Returns:
            list: Transition probabilities for each neighbor.
        """
        probabilities = []
        for neighbor in neighbors:
            if neighbor == prev_node:  # Returning to the previous node
                probabilities.append(1 / self.p)
            elif neighbor in self.graph[current_node]:  # Normal neighbor
                probabilities.append(1)
            else:  # Distant node
                probabilities.append(1 / self.q)

        return probabilities

    def generate_walks(self):
        """
        Generates multiple random walks for each node.

        Returns:
            list: A list of all generated walks.
        """
        walks = []
        nodes = list(self.graph.keys())
        for _ in range(self.num_walks):
            random.shuffle(nodes)  # Shuffle nodes to reduce bias
            for node in nodes:
                walk = self.random_walk(node)
                walks.append(walk)
        return walks

# Example Usage
walker = Node2VecRandomWalker(graph, p=1.0, q=1.0, walk_length=10, num_walks=5)
walks = walker.generate_walks()

# Print a few random walks
print("Sample Random Walks:", walks[:3])


Sample Random Walks: [['http://www.cs.cornell.edu/info/people/mishaal/home.html', 'http://www.cs.cornell.edu', 'http://cs-tr.cs.cornell.edu'], ['http://www.cs.utexas.edu/users/ejp', 'http://www.cs.utexas.edu/users/dmcl', 'http://www.cs.utexas.edu'], ['http://www.cs.wisc.edu/~cchin/cchin.html', 'http://www.cs.wisc.edu']]


In [46]:
import torch
import random

class SkipGramDataset(torch.utils.data.Dataset):
    def __init__(self, walks, window_size=2):
        """
        Converts random walks into Skip-Gram training pairs.

        Args:
            walks (list of lists): List of random walks (sequences of nodes).
            window_size (int): Context window size.
        """
        self.window_size = window_size
        self.word2idx = {}  # Node to index mapping
        self.idx2word = []  # Index to node mapping
        self.pairs = []  # Skip-Gram (center, context) pairs

        # Assign unique indices to each node
        self.build_vocab(walks)

        # Extract Skip-Gram training pairs
        self.generate_skipgram_pairs(walks)

    def build_vocab(self, walks):
        """Creates node-to-index mappings."""
        unique_nodes = set(node for walk in walks for node in walk)
        self.word2idx = {node: i for i, node in enumerate(unique_nodes)}
        self.idx2word = list(unique_nodes)

    def generate_skipgram_pairs(self, walks):
        """Generates (center, context) training pairs."""
        for walk in walks:
            indexed_walk = [self.word2idx[node] for node in walk]
            for i, center in enumerate(indexed_walk):
                for j in range(-self.window_size, self.window_size + 1):
                    context_idx = i + j
                    if j != 0 and 0 <= context_idx < len(indexed_walk):
                        self.pairs.append((center, indexed_walk[context_idx]))

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        return torch.tensor(self.pairs[idx], dtype=torch.long)

# Example Usage
window_size = 2
dataset = SkipGramDataset(walks, window_size=window_size)

# Print a few training pairs
print("Sample Training Pairs (Node Indices):", dataset.pairs[:5])
print("Total training samples:", len(dataset))


Sample Training Pairs (Node Indices): [(14, 831), (14, 7), (831, 14), (831, 7), (7, 14)]
Total training samples: 27052


In [47]:
import torch
import torch.nn as nn
import torch.optim as optim

class SkipGramModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        """
        Initializes the Skip-Gram model with two embedding layers.

        Args:
            vocab_size (int): Total number of unique nodes.
            embedding_dim (int): Size of the embedding vectors.
        """
        super(SkipGramModel, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)  # Center node embeddings
        self.context_embeddings = nn.Embedding(vocab_size, embedding_dim)  # Context node embeddings

        nn.init.xavier_uniform_(self.embeddings.weight)
        nn.init.xavier_uniform_(self.context_embeddings.weight)

    def forward(self, center_nodes, context_nodes, negative_samples):
        """
        Computes Skip-Gram loss with negative sampling.

        Args:
            center_nodes (Tensor): Indices of center nodes.
            context_nodes (Tensor): Indices of positive context nodes.
            negative_samples (Tensor): Indices of negative samples.

        Returns:
            loss (Tensor): Computed loss value.
        """
        center_emb = self.embeddings(center_nodes)  # Shape: (batch_size, embedding_dim)
        context_emb = self.context_embeddings(context_nodes)  # Shape: (batch_size, embedding_dim)
        negative_emb = self.context_embeddings(negative_samples)  # Shape: (batch_size, num_neg_samples, embedding_dim)

        # Compute positive similarity (dot product)
        positive_score = torch.mul(center_emb, context_emb).sum(dim=1)  # (batch_size)
        positive_loss = torch.log(torch.sigmoid(positive_score))

        # Compute negative similarity (dot product with negative samples)
        negative_score = torch.bmm(negative_emb, center_emb.unsqueeze(2)).squeeze(2)  # (batch_size, num_neg_samples)
        negative_loss = torch.log(torch.sigmoid(-negative_score)).sum(dim=1)  # Sum over negative samples

        # Compute total loss (maximize positive, minimize negative)
        loss = -torch.mean(positive_loss + negative_loss)
        return loss

# Example Usage
vocab_size = len(dataset.word2idx)
embedding_dim = 256
model = SkipGramModel(vocab_size, embedding_dim)

# Print model summary
print(model)


SkipGramModel(
  (embeddings): Embedding(877, 256)
  (context_embeddings): Embedding(877, 256)
)


In [48]:
import torch
import torch.optim as optim
import numpy as np

class Node2VecTrainer:
    def __init__(self, model, dataset, num_neg_samples=5, batch_size=1024, lr=0.001, epochs=30):
        """
        Initializes the trainer for Skip-Gram with negative sampling.

        Args:
            model (SkipGramModel): The Skip-Gram neural network.
            dataset (SkipGramDataset): Dataset containing training pairs.
            num_neg_samples (int): Number of negative samples per positive pair.
            batch_size (int): Number of samples per batch.
            lr (float): Learning rate.
            epochs (int): Number of training epochs.
        """
        self.model = model
        self.dataset = dataset
        self.num_neg_samples = num_neg_samples
        self.batch_size = batch_size
        self.lr = lr
        self.epochs = epochs
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

        # Create DataLoader for batching
        self.dataloader = torch.utils.data.DataLoader(self.dataset, batch_size=self.batch_size, shuffle=True)
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)

    def get_negative_samples(self, batch_size):
        """Samples negative nodes randomly."""
        vocab_size = len(self.dataset.word2idx)
        return torch.randint(0, vocab_size, (batch_size, self.num_neg_samples), device=self.device)

    def train(self):
        """Trains the model using Skip-Gram with negative sampling."""
        self.model.train()

        for epoch in range(self.epochs):
            total_loss = 0
            for batch in self.dataloader:
                center_nodes, context_nodes = batch[:, 0].to(self.device), batch[:, 1].to(self.device)
                negative_samples = self.get_negative_samples(center_nodes.shape[0])

                self.optimizer.zero_grad()
                loss = self.model(center_nodes, context_nodes, negative_samples)
                loss.backward()
                self.optimizer.step()

                total_loss += loss.item()

            print(f"Epoch {epoch+1}/{self.epochs}, Loss: {total_loss:.4f}")

    def get_embedding(self, node):
        """Retrieves the learned embedding for a node."""
        node_idx = self.dataset.word2idx.get(node)
        if node_idx is None:
            return None
        return self.model.embeddings.weight[node_idx].detach().cpu().numpy()

# Initialize Trainer and Train Model
trainer = Node2VecTrainer(model, dataset, epochs=40, batch_size=1024)
trainer.train()

# Save embeddings
embeddings = {node: trainer.get_embedding(node) for node in dataset.word2idx}
np.save("webkb_node2vec_embeddings.npy", embeddings)
print("Node2Vec embeddings saved!")


Epoch 1/40, Loss: 110.8408
Epoch 2/40, Loss: 106.3209
Epoch 3/40, Loss: 94.9075
Epoch 4/40, Loss: 72.8846
Epoch 5/40, Loss: 51.9389
Epoch 6/40, Loss: 40.8168
Epoch 7/40, Loss: 34.5666
Epoch 8/40, Loss: 30.2112
Epoch 9/40, Loss: 26.6790
Epoch 10/40, Loss: 23.5658
Epoch 11/40, Loss: 20.9984
Epoch 12/40, Loss: 18.8694
Epoch 13/40, Loss: 17.0663
Epoch 14/40, Loss: 15.3359
Epoch 15/40, Loss: 14.1066
Epoch 16/40, Loss: 12.9002
Epoch 17/40, Loss: 12.0654
Epoch 18/40, Loss: 11.3709
Epoch 19/40, Loss: 10.6102
Epoch 20/40, Loss: 10.0513
Epoch 21/40, Loss: 9.5741
Epoch 22/40, Loss: 9.2907
Epoch 23/40, Loss: 8.9640
Epoch 24/40, Loss: 8.3541
Epoch 25/40, Loss: 8.1977
Epoch 26/40, Loss: 7.9861
Epoch 27/40, Loss: 7.7645
Epoch 28/40, Loss: 7.5848
Epoch 29/40, Loss: 7.2982
Epoch 30/40, Loss: 7.3052
Epoch 31/40, Loss: 6.7735
Epoch 32/40, Loss: 6.8254
Epoch 33/40, Loss: 6.6645
Epoch 34/40, Loss: 6.6863
Epoch 35/40, Loss: 6.5669
Epoch 36/40, Loss: 6.3147
Epoch 37/40, Loss: 6.3835
Epoch 38/40, Loss: 6.3368

In [52]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Extract train and test nodes from the cites files
def get_nodes_from_cites(file_path):
    nodes = set()
    with open(file_path, "r") as f:
        for line in f:
            cited, citing = line.strip().split()
            nodes.add(cited)
            nodes.add(citing)
    return nodes

# Load train and test nodes
train_nodes = get_nodes_from_cites("webkb_combined/combined_train.cites")
test_nodes = get_nodes_from_cites("webkb_combined/combined_test.cites")

# Filter nodes that exist in embeddings
train_nodes = [node for node in train_nodes if node in embeddings]
test_nodes = [node for node in test_nodes if node in embeddings]

# Prepare feature matrices
X_train = np.array([embeddings[node] for node in train_nodes])
y_train = np.array([labels[node] for node in train_nodes])

X_test = np.array([embeddings[node] for node in test_nodes])
y_test = np.array([labels[node] for node in test_nodes])

# Train logistic regression
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Evaluate on test set
y_pred = clf.predict(X_test)

print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("✅ Classification Report:\n", classification_report(y_test, y_pred))


✅ Accuracy: 0.7114624505928854
✅ Classification Report:
               precision    recall  f1-score   support

      course       0.76      0.67      0.71       124
     faculty       0.92      0.44      0.60        77
     project       0.85      0.49      0.62        57
       staff       1.00      0.25      0.40        20
     student       0.65      0.92      0.76       228

    accuracy                           0.71       506
   macro avg       0.84      0.55      0.62       506
weighted avg       0.76      0.71      0.70       506

