<a href="https://colab.research.google.com/github/avisinghal6/Node-Classification-using-Graph-Convolutional-Neural-network/blob/main/MLG_Q1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch

def format_pytorch_version(version):
  return version.split('+')[0]

TORCH_version = torch.__version__
TORCH = format_pytorch_version(TORCH_version)

def format_cuda_version(version):
  return 'cu' + version.replace('.', '')

CUDA_version = torch.version.cuda
CUDA = format_cuda_version(CUDA_version)

!pip install torch-scatter     -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
!pip install torch-sparse      -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
!pip install torch-cluster     -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
!pip install torch-spline-conv -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
!pip install torch-geometric 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://pytorch-geometric.com/whl/torch-2.0.0+cu118.html
Collecting torch-scatter
  Downloading https://data.pyg.org/whl/torch-2.0.0%2Bcu118/torch_scatter-2.1.1%2Bpt20cu118-cp39-cp39-linux_x86_64.whl (10.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch-scatter
Successfully installed torch-scatter-2.1.1+pt20cu118
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://pytorch-geometric.com/whl/torch-2.0.0+cu118.html
Collecting torch-sparse
  Downloading https://data.pyg.org/whl/torch-2.0.0%2Bcu118/torch_sparse-0.6.17%2Bpt20cu118-cp39-cp39-linux_x86_64.whl (4.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.8/4.8 MB[0m [31m67.3 MB/s[0m eta [36m0:00:00[0m
Inst

In [2]:
from torch_geometric.datasets import Planetoid
from torch_geometric.transforms import NormalizeFeatures

dataset = Planetoid(root='data/Planetoid', name='Cora', transform=NormalizeFeatures())

Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.x
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.tx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.allx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.y
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ty
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ally
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.graph
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.test.index
Processing...
Done!


In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [4]:
# Get some basic info about the dataset
print(f'Number of graphs: {len(dataset)}')
print(f'Number of features: {dataset.num_features}')
print(f'Number of classes: {dataset.num_classes}')
print(50*'=')

# There is only one graph in the dataset, use it as new data object
data = dataset[0]  

# Gather some statistics about the graph.
print(data)
print(f'Number of nodes: {data.num_nodes}')
print(f'Number of edges: {data.num_edges}')
print(f'Number of training nodes: {data.train_mask.sum()}')
print(f'Training node label rate: {int(data.train_mask.sum()) / data.num_nodes:.2f}')
print(f'Is undirected: {data.is_undirected()}')

Number of graphs: 1
Number of features: 1433
Number of classes: 7
Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708])
Number of nodes: 2708
Number of edges: 10556
Number of training nodes: 140
Training node label rate: 0.05
Is undirected: True


In [5]:
class NN(torch.nn.Module):
    def __init__(self, hidden_channels,num_features):
        super(NN, self).__init__()
        torch.manual_seed(42)

        # Initialize the layers
        self.fc1 = nn.Linear(num_features, hidden_channels)
        self.fc2 = nn.Linear(hidden_channels, hidden_channels)
        self.out = nn.Linear(hidden_channels, dataset.num_classes)

    def forward(self, x):
        # First Message Passing Layer (Transformation)
        x = self.fc1(x)
        x = x.relu()
        x = F.dropout(x, p=0.5, training=self.training)

        # Second Message Passing Layer
        x = self.fc2(x)
        x = x.relu()
        x = F.dropout(x, p=0.5, training=self.training)

        # Output layer 
        x = F.softmax(self.out(x), dim=1)
        return x

In [27]:
# Initialize model
model = NN(hidden_channels=64,num_features=data.num_features)

# Use GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = model.to(device)
data = data.to(device)

# Initialize Optimizer
learning_rate = 0.001
decay = 5e-4
optimizer = torch.optim.Adam(model.parameters(), 
                             lr=learning_rate, 
                             weight_decay=decay)
# Define loss function (CrossEntropyLoss for Classification Problems with 
# probability distributions)
criterion = torch.nn.CrossEntropyLoss()

def train():
      model.train()
      optimizer.zero_grad() 
      out = model(data.x)  
      # Only use nodes with labels available for loss calculation --> mask
      loss = criterion(out[data.train_mask], data.y[data.train_mask])  
      loss.backward() 
      optimizer.step()
      return loss

def test():
      model.eval()
      out = model(data.x)
      # Use the class with highest probability.
      pred = out.argmax(dim=1)  
      # Check against ground-truth labels.
      test_correct = pred[data.test_mask] == data.y[data.test_mask]  
      # Derive ratio of correct predictions.
      test_acc = int(test_correct.sum()) / int(data.test_mask.sum())  
      return test_acc

losses = []
for epoch in range(0, 5001):
    loss = train()
    losses.append(loss)
    acc=test()
    if epoch % 100 == 0:
      print(f'Epoch: {epoch:03d}, Loss: {loss:.4f},Accuracy: {acc:.4f}')

Epoch: 000, Loss: 1.9464,Accuracy: 0.0640
Epoch: 100, Loss: 1.9445,Accuracy: 0.0660
Epoch: 200, Loss: 1.7952,Accuracy: 0.1930
Epoch: 300, Loss: 1.6001,Accuracy: 0.3200
Epoch: 400, Loss: 1.3975,Accuracy: 0.3660
Epoch: 500, Loss: 1.2928,Accuracy: 0.4970
Epoch: 600, Loss: 1.2421,Accuracy: 0.5050
Epoch: 700, Loss: 1.2174,Accuracy: 0.5120
Epoch: 800, Loss: 1.2203,Accuracy: 0.5270
Epoch: 900, Loss: 1.2283,Accuracy: 0.5220
Epoch: 1000, Loss: 1.2107,Accuracy: 0.5240
Epoch: 1100, Loss: 1.2042,Accuracy: 0.5230
Epoch: 1200, Loss: 1.2027,Accuracy: 0.5180
Epoch: 1300, Loss: 1.2127,Accuracy: 0.5130
Epoch: 1400, Loss: 1.2085,Accuracy: 0.5120
Epoch: 1500, Loss: 1.1939,Accuracy: 0.5260
Epoch: 1600, Loss: 1.2191,Accuracy: 0.5170
Epoch: 1700, Loss: 1.2264,Accuracy: 0.5130
Epoch: 1800, Loss: 1.2020,Accuracy: 0.5280
Epoch: 1900, Loss: 1.2045,Accuracy: 0.5190
Epoch: 2000, Loss: 1.1980,Accuracy: 0.5160
Epoch: 2100, Loss: 1.2026,Accuracy: 0.5120
Epoch: 2200, Loss: 1.2101,Accuracy: 0.5230
Epoch: 2300, Loss: 1.

GCN with features

In [10]:
import torch
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv #GATConv

class GCN(torch.nn.Module):
    def __init__(self, hidden_channels,num_features):
        super(GCN, self).__init__()
        torch.manual_seed(42)

        # Initialize the layers
        self.conv1 = GCNConv(num_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.out = Linear(hidden_channels, dataset.num_classes)

    def forward(self, x, edge_index):
        # First Message Passing Layer (Transformation)
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = F.dropout(x, p=0.5, training=self.training)

        # Second Message Passing Layer
        x = self.conv2(x, edge_index)
        x = x.relu()
        x = F.dropout(x, p=0.5, training=self.training)

        # Output layer 
        x = F.softmax(self.out(x), dim=1)
        return x


In [28]:
# Initialize model
model = GCN(hidden_channels=32,num_features=data.num_features)

# Use GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = model.to(device)
data = data.to(device)

# Initialize Optimizer
learning_rate = 0.01
decay = 5e-4
optimizer = torch.optim.Adam(model.parameters(), 
                             lr=learning_rate, 
                             weight_decay=decay)
# Define loss function (CrossEntropyLoss for Classification Problems with 
# probability distributions)
criterion = torch.nn.CrossEntropyLoss()

def train():
      model.train()
      optimizer.zero_grad() 
      # Use all data as input, because all nodes have node features
      out = model(data.x, data.edge_index)  
      # Only use nodes with labels available for loss calculation --> mask
      loss = criterion(out[data.train_mask], data.y[data.train_mask])  
      loss.backward() 
      optimizer.step()
      return loss

def test():
      model.eval()
      out = model(data.x, data.edge_index)
      # Use the class with highest probability.
      pred = out.argmax(dim=1)  
      # Check against ground-truth labels.
      test_correct = pred[data.test_mask] == data.y[data.test_mask]  
      # Derive ratio of correct predictions.
      test_acc = int(test_correct.sum()) / int(data.test_mask.sum())  
      return test_acc

losses = []
for epoch in range(0, 5001):
    loss = train()
    losses.append(loss)
    acc=test()
    if epoch % 100 == 0:
      print(f'Epoch: {epoch:03d}, Loss: {loss:.4f},Accuracy: {acc:.4f}')

Epoch: 000, Loss: 1.9460,Accuracy: 0.1490
Epoch: 100, Loss: 1.7192,Accuracy: 0.3970
Epoch: 200, Loss: 1.4347,Accuracy: 0.6350
Epoch: 300, Loss: 1.3069,Accuracy: 0.6930
Epoch: 400, Loss: 1.2518,Accuracy: 0.7580
Epoch: 500, Loss: 1.2946,Accuracy: 0.7550
Epoch: 600, Loss: 1.2637,Accuracy: 0.7690
Epoch: 700, Loss: 1.2503,Accuracy: 0.7610
Epoch: 800, Loss: 1.2386,Accuracy: 0.7590
Epoch: 900, Loss: 1.2716,Accuracy: 0.7650
Epoch: 1000, Loss: 1.2532,Accuracy: 0.7690
Epoch: 1100, Loss: 1.2580,Accuracy: 0.7650
Epoch: 1200, Loss: 1.2553,Accuracy: 0.7780
Epoch: 1300, Loss: 1.2620,Accuracy: 0.7640
Epoch: 1400, Loss: 1.2506,Accuracy: 0.7850
Epoch: 1500, Loss: 1.2389,Accuracy: 0.7520
Epoch: 1600, Loss: 1.2598,Accuracy: 0.7730
Epoch: 1700, Loss: 1.2521,Accuracy: 0.7540
Epoch: 1800, Loss: 1.2574,Accuracy: 0.7570
Epoch: 1900, Loss: 1.2466,Accuracy: 0.7620
Epoch: 2000, Loss: 1.2634,Accuracy: 0.7500
Epoch: 2100, Loss: 1.2534,Accuracy: 0.7460
Epoch: 2200, Loss: 1.2490,Accuracy: 0.7620
Epoch: 2300, Loss: 1.

GCN with Identity


In [13]:
#CREATING A NXN MATRIX
import numpy as np
d=torch.tensor(np.identity(data.num_nodes, dtype = float))

In [29]:
# Initialize model
model = GCN(hidden_channels=32,num_features=data.num_nodes)

# Use GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = model.to(device)
d = d.to(device)

# Initialize Optimizer
learning_rate = 0.01
decay = 5e-4
optimizer = torch.optim.Adam(model.parameters(), 
                             lr=learning_rate, 
                             weight_decay=decay)
# Define loss function (CrossEntropyLoss for Classification Problems with 
# probability distributions)
criterion = torch.nn.CrossEntropyLoss()

def train():
      model.train()
      optimizer.zero_grad() 
      # Use all data as input, because all nodes have node features
      out = model(d.float(), data.edge_index)  
      # Only use nodes with labels available for loss calculation --> mask
      loss = criterion(out[data.train_mask], data.y[data.train_mask])  
      loss.backward() 
      optimizer.step()
      return loss

def test():
      model.eval()
      out = model(d.float(), data.edge_index)
      # Use the class with highest probability.
      pred = out.argmax(dim=1)  
      # Check against ground-truth labels.
      test_correct = pred[data.test_mask] == data.y[data.test_mask]  
      # Derive ratio of correct predictions.
      test_acc = int(test_correct.sum()) / int(data.test_mask.sum())  
      return test_acc

losses = []
for epoch in range(0, 5001):
    loss = train()
    losses.append(loss)
    acc=test()
    if epoch % 100 == 0:
      print(f'Epoch: {epoch:03d}, Loss: {loss:.4f},Accuracy: {acc:.4f}')

Epoch: 000, Loss: 1.9460,Accuracy: 0.1030
Epoch: 100, Loss: 1.2622,Accuracy: 0.4880
Epoch: 200, Loss: 1.2253,Accuracy: 0.5930
Epoch: 300, Loss: 1.2274,Accuracy: 0.6540
Epoch: 400, Loss: 1.2233,Accuracy: 0.6280
Epoch: 500, Loss: 1.2023,Accuracy: 0.6310
Epoch: 600, Loss: 1.2065,Accuracy: 0.6320
Epoch: 700, Loss: 1.2134,Accuracy: 0.6400
Epoch: 800, Loss: 1.1934,Accuracy: 0.5930
Epoch: 900, Loss: 1.2145,Accuracy: 0.6030
Epoch: 1000, Loss: 1.2007,Accuracy: 0.5390
Epoch: 1100, Loss: 1.2093,Accuracy: 0.5930
Epoch: 1200, Loss: 1.2073,Accuracy: 0.6000
Epoch: 1300, Loss: 1.2098,Accuracy: 0.5910
Epoch: 1400, Loss: 1.2237,Accuracy: 0.6500
Epoch: 1500, Loss: 1.2012,Accuracy: 0.6070
Epoch: 1600, Loss: 1.1985,Accuracy: 0.5910
Epoch: 1700, Loss: 1.2109,Accuracy: 0.6120
Epoch: 1800, Loss: 1.2119,Accuracy: 0.5950
Epoch: 1900, Loss: 1.2089,Accuracy: 0.6540
Epoch: 2000, Loss: 1.2088,Accuracy: 0.5910
Epoch: 2100, Loss: 1.2169,Accuracy: 0.6210
Epoch: 2200, Loss: 1.2064,Accuracy: 0.5550
Epoch: 2300, Loss: 1.

GCN with Node2Vec

In [17]:
!pip install node2vec --quiet

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.0 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m65.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [18]:
start=data.edge_index[0]
end=data.edge_index[1]

In [19]:
data.edge_index
start=data.edge_index[0]
end=data.edge_index[1]
with open("out.txt", "wt") as fout:
  for i in range(len(start)):
    fout.write(f'{start[i]} {end[i]} \n')
   

In [20]:
import networkx as nx
from node2vec import Node2Vec

# Load the Cora dataset
# G = nx.read_edgelist('cora.cites')
G = nx.read_edgelist('out.txt')
G=nx.convert_node_labels_to_integers(G,first_label=0)
# # Create a Node2Vec object with the appropriate parameters
node2vec = Node2Vec(G, dimensions=64, walk_length=20, num_walks=50, workers=4)

# Fit the Node2Vec model to the graph
model = node2vec.fit(window=10, min_count=1, batch_words=4)

# Get the node embeddings for all nodes in the graph
node_embeddings = model.wv.vectors

# Print the shape of the node embeddings
print('Node embeddings shape:', node_embeddings.shape)

Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

Node embeddings shape: (2708, 64)


In [21]:
node_embeddings=torch.tensor(node_embeddings)

In [31]:
# Initialize model
model = GCN(hidden_channels=32,num_features=64)

# Use GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = model.to(device)
node_embeddings = node_embeddings.to(device)

# Initialize Optimizer
learning_rate = 0.01
decay = 5e-4
optimizer = torch.optim.Adam(model.parameters(), 
                             lr=learning_rate, 
                             weight_decay=decay)
# Define loss function (CrossEntropyLoss for Classification Problems with 
# probability distributions)
criterion = torch.nn.CrossEntropyLoss()

def train():
      model.train()
      optimizer.zero_grad() 
      # Use all data as input, because all nodes have node features
      out = model(node_embeddings, data.edge_index)  
      # Only use nodes with labels available for loss calculation --> mask
      loss = criterion(out[data.train_mask], data.y[data.train_mask])  
      loss.backward() 
      optimizer.step()
      return loss

def test():
      model.eval()
      out = model(node_embeddings, data.edge_index)
      # Use the class with highest probability.
      pred = out.argmax(dim=1)  
      # Check against ground-truth labels.
      test_correct = pred[data.test_mask] == data.y[data.test_mask]  
      # Derive ratio of correct predictions.
      test_acc = int(test_correct.sum()) / int(data.test_mask.sum())  
      return test_acc

losses = []
for epoch in range(0, 5001):
    loss = train()
    losses.append(loss)
    acc=test()
    if epoch % 100 == 0:
      print(f'Epoch: {epoch:03d}, Loss: {loss:.4f},Accuracy: {acc:.4f}')

Epoch: 000, Loss: 1.9469,Accuracy: 0.1430
Epoch: 100, Loss: 1.5909,Accuracy: 0.2250
Epoch: 200, Loss: 1.4421,Accuracy: 0.2900
Epoch: 300, Loss: 1.3813,Accuracy: 0.2930
Epoch: 400, Loss: 1.3799,Accuracy: 0.3040
Epoch: 500, Loss: 1.3911,Accuracy: 0.3100
Epoch: 600, Loss: 1.3371,Accuracy: 0.3160
Epoch: 700, Loss: 1.3159,Accuracy: 0.3080
Epoch: 800, Loss: 1.2981,Accuracy: 0.3210
Epoch: 900, Loss: 1.3437,Accuracy: 0.3260
Epoch: 1000, Loss: 1.2963,Accuracy: 0.3130
Epoch: 1100, Loss: 1.3348,Accuracy: 0.3140
Epoch: 1200, Loss: 1.3300,Accuracy: 0.3280
Epoch: 1300, Loss: 1.3382,Accuracy: 0.3280
Epoch: 1400, Loss: 1.3216,Accuracy: 0.3390
Epoch: 1500, Loss: 1.3105,Accuracy: 0.3350
Epoch: 1600, Loss: 1.2969,Accuracy: 0.3130
Epoch: 1700, Loss: 1.3101,Accuracy: 0.3230
Epoch: 1800, Loss: 1.3324,Accuracy: 0.3380
Epoch: 1900, Loss: 1.3091,Accuracy: 0.3300
Epoch: 2000, Loss: 1.2780,Accuracy: 0.3220
Epoch: 2100, Loss: 1.3017,Accuracy: 0.3190
Epoch: 2200, Loss: 1.2882,Accuracy: 0.3240
Epoch: 2300, Loss: 1.