In [1]:
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--device', type=str, default='cuda:0')
# gpustat -cuFi 1
parser.add_argument('--seed', type=int, default=42)

# learning params
parser.add_argument('--lr', type=float, default=1e-4)
parser.add_argument('--hdim', type=float, default=64)
parser.add_argument('--batchsize', type=float, default=128)

args = parser.parse_args([])

In [2]:
model_name = f'ECFP_MLP_h{args.hdim}b{args.batchsize}_lr{args.lr}'

In [3]:
import pandas as pd
import numpy as np
import sys

import torch
import torch.nn as nn

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
sys.path.append('../')
from utils_dm import EarlyStopper, set_seed

In [5]:
whole_df = pd.read_csv('../../../2023-2/processed_data/ECFP/BBBP_ECFP_R2B1024.csv')

In [6]:
whole_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1015,1016,1017,1018,1019,1020,1021,1022,1023,bbbp
0,0,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,1
3,0,0,1,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2034,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2035,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,1
2036,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2037,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [7]:
from torch.utils.data import Dataset, DataLoader

In [8]:
class MyDataset(Dataset):
    def __init__(self, dataset, labels):
        self.dataset = torch.tensor(dataset).float()
        self.labels = torch.tensor(labels, dtype=torch.float32)
    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        return self.dataset[idx], self.labels[idx]

In [9]:
whole_data = whole_df.values[:, :-1]
labels = whole_df.values[:, -1]
labels.shape, (labels==1).sum() # check number of positives

((2039,), 1560)

In [10]:
dataset = MyDataset(whole_data, labels)

In [11]:
# split dataset
test_ratio = 0.1
valid_ratio = 0.1

test_len = int(len(dataset)*test_ratio)
valid_len = int(len(dataset)*valid_ratio)
train_len = len(dataset) - valid_len - test_len
print(train_len, valid_len, test_len)

trainset,validset,testset = torch.utils.data.random_split(dataset, [train_len,valid_len,test_len],
                                      torch.Generator().manual_seed(42))
print(len(trainset), len(validset), len(testset))

1633 203 203
1633 203 203


In [12]:
# build dataloader
trainloader = DataLoader(trainset, batch_size=args.batch_size, shuffle=True, num_workers=0, drop_last=False,
                        generator=torch.Generator().manual_seed(42))
validloader = DataLoader(validset, batch_size=args.batch_size, shuffle=False, num_workers=0, drop_last=False)
testloader = DataLoader(testset, batch_size=args.batch_size, shuffle=False, num_workers=0, drop_last=False)

# Build model

In [13]:
import torch
import torch.nn as nn

class NeuralNetwork(nn.Module):
    def __init__(self, in_dim, hdim, out_dim=1, dropout=0.1):
        super(NeuralNetwork, self).__init__()
        self.layer = nn.Sequential(
            nn.Linear(in_dim,hdim),
            nn.LayerNorm(hdim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hdim,hdim),
            nn.LayerNorm(hdim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hdim,out_dim)
        )
        
    def forward(self, x):
        x = self.layer(x)
        return x

In [14]:
in_dim = dataset[0][0].shape[0]
model = NeuralNetwork(in_dim, args.hdim).to(args.device)

## Train

In [15]:
optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
criterion = nn.BCEWithLogitsLoss()

# binary cross entropy nn.BCEWithLogitsLoss()
# Mean squared error  nn.MSELoss()

early_stopper = EarlyStopper(patience=20,printfunc=print,verbose=True,path=f'ckpts/{model_name}.pt')

In [16]:
def train(model, trainloader, args, optimizer=optimizer, criterion=criterion):
    model.train()
    train_loss = 0
    for batch, label in trainloader:
        batch = batch.to(args.device)
        label = label.to(args.device)

        optimizer.zero_grad()
        pred = model(batch).squeeze()
        
        loss = criterion(pred, label)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    return train_loss/len(trainloader)

In [17]:
def eval(model, loader, args, return_output=False, criterion=criterion):
    model.eval()
    preds = []
    labels = []
    with torch.no_grad():
        for batch, label in loader:
            batch = batch.to(args.device)
            label = label.to(args.device)
            pred = model(batch)
            preds.append(pred)
            labels.append(label)
    preds = torch.cat(preds, dim=0)
    labels = torch.cat(labels, dim=0)
    
    loss = criterion(preds.squeeze(), labels.squeeze())

    if return_output:
        return loss.item(), preds, labels
    else:
        return loss.item()

In [18]:
epoch = 0
while True:
    epoch+=1
    train_loss = train(model,trainloader,args)
    valid_loss = eval(model,validloader,args)
    print(f'[Epoch{epoch}] train_loss: {train_loss:.4f}, valid_loss: {valid_loss:.4f}')
    early_stopper(valid_loss,model)
    if early_stopper.early_stop:
        print('early stopping')
        break

[Epoch1] train_loss: 0.6504, valid_loss: 0.5881
[Epoch2] train_loss: 0.5482, valid_loss: 0.5402
[Epoch3] train_loss: 0.5060, valid_loss: 0.5149
[Epoch4] train_loss: 0.4696, valid_loss: 0.4908
[Epoch5] train_loss: 0.4444, valid_loss: 0.4663
[Epoch6] train_loss: 0.4153, valid_loss: 0.4450
[Epoch7] train_loss: 0.3855, valid_loss: 0.4284
[Epoch8] train_loss: 0.3641, valid_loss: 0.4124
[Epoch9] train_loss: 0.3423, valid_loss: 0.4010
[Epoch10] train_loss: 0.3224, valid_loss: 0.3894
[Epoch11] train_loss: 0.3067, valid_loss: 0.3803
[Epoch12] train_loss: 0.2859, valid_loss: 0.3720
[Epoch13] train_loss: 0.2744, valid_loss: 0.3637
[Epoch14] train_loss: 0.2552, valid_loss: 0.3573
[Epoch15] train_loss: 0.2430, valid_loss: 0.3520
[Epoch16] train_loss: 0.2284, valid_loss: 0.3487
[Epoch17] train_loss: 0.2133, valid_loss: 0.3423
[Epoch18] train_loss: 0.2020, valid_loss: 0.3377
[Epoch19] train_loss: 0.1939, valid_loss: 0.3338
[Epoch20] train_loss: 0.1829, valid_loss: 0.3328
[Epoch21] train_loss: 0.1707,

### Validate

In [19]:
model.load_state_dict(torch.load(early_stopper.path, map_location=args.device))
model.eval()
print(f'loaded best model "{early_stopper.path}", valid loss: {early_stopper.val_loss_min:.4f}')

loaded best model "ckpts/ECFP_MLP_h64b32_lr0.0001.pt", valid loss: 0.3176


In [20]:
test_loss = eval(model,testloader,args)**0.5
print(f'Final test loss: {test_loss:.4f}')

Final test loss: 0.5844
