In [None]:
!pip install torch
import torch
print(torch.__version__)

Change torch.__version__ of following installation command.




In [None]:
!pip install torch-geometric torch-scatter torch-sparse -f https://data.pyg.org/whl/torch-{torch.__version__}.html

Dataset

In [3]:
from torch_geometric.datasets import TUDataset
from torch_geometric.loader import DataLoader
#dataset = TUDataset(root='/tmp/imdb', name='IMDB-BINARY')

### add code for using the node degree as input features.
import os.path as osp
import torch
import torch_geometric.transforms as T
from torch_geometric.datasets import TUDataset
from torch_geometric.utils import degree


class NormalizedDegree(object):
    def __init__(self, mean, std):
        self.mean = mean
        self.std = std

    def __call__(self, data):
        deg = degree(data.edge_index[0], dtype=torch.float)
        deg = (deg - self.mean) / self.std
        data.x = deg.view(-1, 1)
        return data


def get_dataset(name, sparse=True, cleaned=False):
    # path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', name)
    # dataset = TUDataset(path, name, cleaned=cleaned)
    dataset = TUDataset(root='/tmp/imdb', name=name)
    dataset.data.edge_attr = None

    if dataset.data.x is None:
        max_degree = 0
        degs = []
        for data in dataset:
            degs += [degree(data.edge_index[0], dtype=torch.long)]
            max_degree = max(max_degree, degs[-1].max().item())

        if max_degree < 1000:
            dataset.transform = T.OneHotDegree(max_degree)
        else:
            deg = torch.cat(degs, dim=0).to(torch.float)
            mean, std = deg.mean().item(), deg.std().item()
            dataset.transform = NormalizedDegree(mean, std)

    if not sparse:
        num_nodes = max_num_nodes = 0
        for data in dataset:
            num_nodes += data.num_nodes
            max_num_nodes = max(data.num_nodes, max_num_nodes)

        # Filter out a few really large graphs in order to apply DiffPool.
        if name == 'REDDIT-BINARY':
            num_nodes = min(int(num_nodes / len(dataset) * 1.5), max_num_nodes)
        else:
            num_nodes = min(int(num_nodes / len(dataset) * 5), max_num_nodes)

        indices = []
        for i, data in enumerate(dataset):
            if data.num_nodes <= num_nodes:
                indices.append(i)
        dataset = dataset.copy(torch.tensor(indices))

        if dataset.transform is None:
            dataset.transform = T.ToDense(num_nodes)
        else:
            dataset.transform = T.Compose(
                [dataset.transform, T.ToDense(num_nodes)])

    return dataset

dataset = get_dataset('IMDB-BINARY')
### add train, val, test loader.
train_dataset = dataset[:800]
val_dataset = dataset[800:900]
test_dataset = dataset[900:]

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True)

Downloading https://www.chrsmrrs.com/graphkerneldatasets/IMDB-BINARY.zip
Extracting /tmp/imdb/IMDB-BINARY/IMDB-BINARY.zip
Processing...
Done!


In [4]:
dataset

IMDB-BINARY(1000)

In [5]:
train_loader

<torch_geometric.loader.dataloader.DataLoader at 0x7f0b77ba5790>

In [6]:
len(dataset)

1000

In [7]:
dataset.num_classes

2

In [8]:
dataset.num_node_features

136

Model

In [9]:
import torch
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, global_add_pool, global_mean_pool

class GCN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = GCNConv(dataset.num_node_features, 32)
        self.conv2 = GCNConv(32, 8)
        self.lin1 = Linear(8, dataset.num_classes)

    def forward(self, data):
        x, edge_index, batch = data.x, data.edge_index, data.batch

        ### You can add more layers or alter the model structure. See geometric documents which layer or model you can use.
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        
        ### aggregate node embeddings into one representation
        x = global_mean_pool(x, batch)


        ### Pass aggregated representation to linear layer to make final prediction
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.lin1(x)
        return F.log_softmax(x, dim=1)

Train function

In [10]:
def train():
    model.train()
    loss_all = 0
    for data in train_loader:
        data = data.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, data.y.view(-1))
        loss.backward()
        loss_all += loss.item() * data.num_graphs
        optimizer.step()
    return loss_all / len(train_dataset)

Validation function

In [24]:
def val(loader):
    model.eval()
    loss_all = 0
    for data in loader:
        data = data.to(device)
        output = model(data)
        loss = F.nll_loss(output, data.y.view(-1))
        loss_all += loss.item() * data.num_graphs
    return loss_all / len(val_dataset)

Test function

In [12]:
def test(loader):
    model.eval()
    correct = 0
    for data in loader:
        data = data.to(device)
        output = model(data)
        pred = output.max(dim=1)[1]
        correct += pred.eq(data.y).sum().item()
    return correct / len(loader.dataset)


Main code

In [44]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GCN().to(device)
#data = dataset[0].to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

model.train()

number_of_epochs = 200 # You can change.

tmp = 100
best_model = model
best_epoch = 0
for epoch in range(number_of_epochs):
    train_loss = train()
    val_loss = val(val_loader)

    # Choose the lowest validation loss checkpoint (you can implement early stopping as well)
    if val_loss < tmp:
      tmp = val_loss
      best_model = model 
      best_epoch = epoch
    #print(val_loss)   
print(f'Best_epoch: {best_epoch}, min_val_score: {tmp}')

# Load the lowest validation loss checkpoint and check the performance.
model = best_model
test_acc = test(test_loader)
print('Performance: ', test_acc)

Best_epoch: 73, min_val_score: 0.48040027141571046
Performance:  0.73
