In [None]:
from molToTensor import MoleculeDataset
import torch
torch.manual_seed(0)
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch.nn import Linear, Sequential, BatchNorm1d, ReLU, Dropout
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GINConv
from torch_geometric.nn import global_add_pool
from torch.optim.lr_scheduler import ReduceLROnPlateau
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import mean_squared_error as MSE


In [None]:
# Report whether PyTorch can see a CUDA device.
print('GPU available' if torch.cuda.is_available() else 'GPU not available')

In [9]:
# Convert molecules to graphs.
dataset = MoleculeDataset(root="../data/", filename="train_data.csv")

# First 90% of the training file is used for fitting, the remaining 10% for validation.
split_idx = int(len(dataset) * 0.9)
train_dataset, val_dataset = dataset[:split_idx], dataset[split_idx:]

test_dataset = MoleculeDataset(root="../data/", filename="test_data.csv", test=True)


In [None]:
# Show how many graphs ended up in each split.
print(
    f'Training set   = {len(train_dataset)} graphs',
    f'Validation set = {len(val_dataset)} graphs',
    sep='\n',
)

In [15]:
# Loading dataset
BATCH_SIZE = 64

# Only the training batches are shuffled. Validation and test loaders keep the
# dataset order so that evaluation batches are the same on every pass and
# test predictions come out in the same order as the input rows.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader  = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [17]:
def metrics(model, loader, criterion=None):
    '''
    Evaluate `model` on every batch of `loader` and return (loss, r2).

    Parameters
    ----------
    model : callable with an ``eval()`` method
        Invoked as ``model(data.x, data.edge_index, data.batch)``.
    loader : sized iterable of batches
        Each batch must expose ``x``, ``edge_index``, ``batch`` and ``y``.
    criterion : callable, optional
        Loss applied as ``criterion(prediction, target)``.
        Defaults to ``torch.nn.MSELoss()``, the loss used by the training loop.

    Returns
    -------
    (loss, r2)
        The per-batch loss and the per-batch R^2, each averaged over the
        number of batches. Both are 0 when the loader is empty.
    '''
    if criterion is None:
        criterion = torch.nn.MSELoss()

    model.eval()
    loss = 0
    r2 = 0
    n_batches = len(loader)

    # Evaluation only: no autograd graph is needed, and sklearn cannot
    # consume tensors that still require grad.
    with torch.no_grad():
        for data in loader:
            out = model(data.x, data.edge_index, data.batch)
            loss += criterion(out, data.y) / n_batches
            # sklearn's signature is r2_score(y_true, y_pred); R^2 is not
            # symmetric, so the target must come first.
            r2 += r2_score(data.y.cpu().numpy(), out.cpu().numpy()) / n_batches
    return loss, r2

In [18]:
# Defining the GIN architecture.
# GIN = Graph Isomorphism Network.
class GIN(torch.nn.Module):
    """Graph-level regressor built from five stacked GINConv layers.

    The node embeddings produced by every GINConv layer are sum-pooled per
    graph, the five pooled vectors are concatenated, and a five-layer MLP
    with dropout maps the result to a single value per graph.

    Parameters
    ----------
    dim_h : int
        Width of every GINConv MLP (and therefore of every pooled embedding).
    num_node_features : int, optional
        Size of the input node feature vectors. When omitted it is read from
        the notebook-level ``train_dataset.num_node_features``.
    """

    def __init__(self, dim_h, num_node_features=None):
        super().__init__()
        if num_node_features is None:
            # Backward-compatible default: take the size from the notebook's dataset.
            num_node_features = train_dataset.num_node_features

        self.conv1 = GINConv(self._mlp(num_node_features, dim_h))
        self.conv2 = GINConv(self._mlp(dim_h, dim_h))
        self.conv3 = GINConv(self._mlp(dim_h, dim_h))
        self.conv4 = GINConv(self._mlp(dim_h, dim_h))
        self.conv5 = GINConv(self._mlp(dim_h, dim_h))

        # The readout concatenates the pooled output of all five conv layers,
        # hence the dim_h*5 input width.
        self.lin1 = Linear(dim_h * 5, dim_h * 3)
        self.lin2 = Linear(dim_h * 3, 1024)
        self.lin3 = Linear(1024, 512)
        self.lin4 = Linear(512, 128)
        self.lin5 = Linear(128, 1)

    @staticmethod
    def _mlp(in_dim, dim_h):
        """Build the Linear -> BatchNorm1d -> ReLU -> Linear -> ReLU block wrapped by each GINConv."""
        return Sequential(
            Linear(in_dim, dim_h),
            BatchNorm1d(dim_h),
            ReLU(),
            Linear(dim_h, dim_h),
            ReLU(),
        )

    def forward(self, x, edge_index, batch):
        """Return a 1-D tensor of predictions (the output of ``lin5`` flattened with ``view(-1)``)."""
        # Node embeddings
        h1 = self.conv1(x, edge_index)
        h2 = self.conv2(h1, edge_index)
        h3 = self.conv3(h2, edge_index)
        h4 = self.conv4(h3, edge_index)
        h5 = self.conv5(h4, edge_index)

        # Graph-level readout: sum-pool every layer's embeddings, then concatenate.
        # All five layers take part; previously h4/h5 were computed but dropped,
        # so conv4/conv5 never received a gradient.
        h = torch.cat(
            [global_add_pool(layer_out, batch) for layer_out in (h1, h2, h3, h4, h5)],
            dim=1,
        )

        # Regressor: four hidden Linear + ReLU + dropout stages, then the output layer.
        for lin in (self.lin1, self.lin2, self.lin3, self.lin4):
            h = lin(h).relu()
            h = F.dropout(h, p=0.5, training=self.training)
        return self.lin5(h).view(-1)


In [19]:
@torch.no_grad()
def test(model, loader):
    criterion = torch.nn.MSELoss()
    model.eval()
    loss = 0
    r2 = 0
    for data in loader:
        out = model(data.x, data.edge_index, data.batch)
        loss += criterion(out, data.y)
        r2 += r2_score(out, data.y) 
    return loss, r2

In [20]:
def lr_lambda(epoch):
    """Return ``0.1 / (5 + epoch)``; the value shrinks as `epoch` grows."""
    numerator, offset = 0.1, 5
    return numerator / (offset + epoch)

In [23]:
# Mini-batch training loop with validation after every epoch.
def train(model, loader, epochs=20, lr=1e-1):
    """Fit `model` on `loader` with AdamW + MSE loss and return the same model.

    After every epoch the model is evaluated with the notebook-level
    ``test(model, val_loader)`` and the validation loss drives a
    ``ReduceLROnPlateau`` scheduler.

    Parameters
    ----------
    model : torch.nn.Module
        Invoked as ``model(data.x, data.edge_index, data.batch)``.
    loader : sized iterable of batches
        Training batches exposing ``x``, ``edge_index``, ``batch`` and ``y``.
    epochs : int, optional
        The loop runs ``epochs + 1`` passes (epoch indices ``0..epochs``).
    lr : float, optional
        Initial learning rate handed to AdamW.

    Returns
    -------
    The `model` that was passed in, trained in place.
    """
    criterion = torch.nn.MSELoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    scheduler = ReduceLROnPlateau(optimizer, 'min', min_lr=1e-6)
    n_batches = len(loader)

    for epoch in range(epochs + 1):
        # test() leaves the model in eval mode, so training mode has to be
        # restored at the start of every epoch (dropout / batch norm).
        model.train()
        total_loss = 0.0
        r2 = 0.0

        # Train on batches
        for data in loader:
            optimizer.zero_grad()
            out = model(data.x, data.edge_index, data.batch)
            loss = criterion(out, data.y)
            loss.backward()
            optimizer.step()

            # .item() keeps the running mean a plain float, so the autograd
            # graph of each batch is not kept alive through the epoch.
            total_loss += loss.item() / n_batches
            # sklearn's signature is r2_score(y_true, y_pred); order matters.
            r2 += r2_score(data.y.detach().cpu().numpy(),
                           out.detach().cpu().numpy()) / n_batches

        # Validation
        val_loss, val_r2 = test(model, val_loader)
        # The plateau scheduler must watch the validation loss, not the loss
        # of whichever training batch happened to come last.
        scheduler.step(val_loss)

        print(f"lr after update {optimizer.param_groups[0]['lr']}")
        print(f'Epoch {epoch:>3} | Train Loss: {total_loss:.2f} | '
              f'Train r2: {r2:>5.2f} | Val Loss: {val_loss:.2f} | '
              f'Val r2: {val_r2:.2f}')
    return model

In [None]:
# Instantiate the GIN regressor with dim_h=128.
gin = GIN(dim_h=128)
# Fit on the training batches; train() returns the model it was given,
# so `gin` is rebound to the trained model.
gin = train(gin, train_loader)