In [1]:
import sys
import os
import requests
import subprocess
import shutil
from logging import getLogger, StreamHandler, INFO


logger = getLogger(__name__)
logger.addHandler(StreamHandler())
logger.setLevel(INFO)


def install(
        chunk_size=4096,
        file_name="Miniconda3-latest-Linux-x86_64.sh",
        url_base="https://repo.continuum.io/miniconda/",
        conda_path=os.path.expanduser(os.path.join("~", "miniconda")),
        rdkit_version=None,
        add_python_path=True,
        force=False):
    """install rdkit from miniconda
    ```
    import rdkit_installer
    rdkit_installer.install()
    ```
    """

    python_path = os.path.join(
        conda_path,
        "lib",
        "python{0}.{1}".format(*sys.version_info),
        "site-packages",
    )

    if add_python_path and python_path not in sys.path:
        logger.info("add {} to PYTHONPATH".format(python_path))
        sys.path.append(python_path)

    if os.path.isdir(os.path.join(python_path, "rdkit")):
        logger.info("rdkit is already installed")
        if not force:
            return

        logger.info("force re-install")

    url = url_base + file_name
    python_version = "{0}.{1}.{2}".format(*sys.version_info)

    logger.info("python version: {}".format(python_version))

    if os.path.isdir(conda_path):
        logger.warning("remove current miniconda")
        shutil.rmtree(conda_path)
    elif os.path.isfile(conda_path):
        logger.warning("remove {}".format(conda_path))
        os.remove(conda_path)

    logger.info('fetching installer from {}'.format(url))
    res = requests.get(url, stream=True)
    res.raise_for_status()
    with open(file_name, 'wb') as f:
        for chunk in res.iter_content(chunk_size):
            f.write(chunk)
    logger.info('done')

    logger.info('installing miniconda to {}'.format(conda_path))
    subprocess.check_call(["bash", file_name, "-b", "-p", conda_path])
    logger.info('done')

    logger.info("installing rdkit")
    subprocess.check_call([
        os.path.join(conda_path, "bin", "conda"),
        "install",
        "--yes",
        "-c", "rdkit",
        "python=={}".format(python_version),
        "rdkit" if rdkit_version is None else "rdkit=={}".format(rdkit_version)])
    logger.info("done")

    import rdkit
    logger.info("rdkit-{} installation finished!".format(rdkit.__version__))


if __name__ == "__main__":
    install()

add /root/miniconda/lib/python3.7/site-packages to PYTHONPATH
rdkit is already installed


In [11]:
import argparse
import math
import os

import numpy as np
import pandas as pd
import torch
from torch import nn
from torch import optim
from torch.autograd import Variable
from torch.nn import functional as F
from torch.utils.data import DataLoader
from tqdm import tqdm

from build_vocab import WordVocab
from dataset import Seq2seqDataset

PAD = 0
UNK = 1
EOS = 2
SOS = 3
MASK = 4

class PositionalEncoding(nn.Module):
    "Implement the PE function. No batch support?"
    def __init__(self, d_model, dropout, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        
        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model) # (T,H)
        position = torch.arange(0., max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0., d_model, 2) * -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)
        
    def forward(self, x):
        x = x + Variable(self.pe[:, :x.size(1)], 
                         requires_grad=False)
        return self.dropout(x)

class TrfmSeq2seq(nn.Module):
    def __init__(self, in_size, hidden_size, out_size, n_layers, dropout=0.1):
        super(TrfmSeq2seq, self).__init__()
        self.in_size = in_size
        self.hidden_size = hidden_size
        self.embed = nn.Embedding(in_size, hidden_size)
        self.pe = PositionalEncoding(hidden_size, dropout)
        self.trfm = nn.Transformer(d_model=hidden_size, nhead=4, 
        num_encoder_layers=n_layers, num_decoder_layers=n_layers, dim_feedforward=hidden_size)
        self.out = nn.Linear(hidden_size, out_size)

    def forward(self, src):
        # src: (T,B)
        embedded = self.embed(src)  # (T,B,H)
        embedded = self.pe(embedded) # (T,B,H)
        hidden = self.trfm(embedded, embedded) # (T,B,H)
        out = self.out(hidden) # (T,B,V)
        out = F.log_softmax(out, dim=2) # (T,B,V)
        return out # (T,B,V)

    def _encode(self, src):
        # src: (T,B)
        embedded = self.embed(src)  # (T,B,H)
        embedded = self.pe(embedded) # (T,B,H)
        output = embedded
        for i in range(self.trfm.encoder.num_layers - 1):
            output = self.trfm.encoder.layers[i](output, None)  # (T,B,H)
        penul = output.detach().numpy()
        output = self.trfm.encoder.layers[-1](output, None)  # (T,B,H)
        if self.trfm.encoder.norm:
            output = self.trfm.encoder.norm(output) # (T,B,H)
        output = output.detach().numpy()
        # mean, max, first*2
        return np.hstack([np.mean(output, axis=0), np.max(output, axis=0), output[0,:,:], penul[0,:,:] ]) # (B,4H)
    
    def encode(self, src):
        # src: (T,B)
        batch_size = src.shape[1]
        if batch_size<=100:
            return self._encode(src)
        else: # Batch is too large to load
            print('There are {:d} molecules. It will take a little time.'.format(batch_size))
            st,ed = 0,100
            out = self._encode(src[:,st:ed]) # (B,4H)
            while ed<batch_size:
                st += 100
                ed += 100
                out = np.concatenate([out, self._encode(src[:,st:ed])], axis=0)
            return out

#     return parser.parse_args()



def evaluate(model, test_loader, vocab):
    model.eval()
    total_loss = 0
    total_acc = 0
    for b, (sm, sm_true) in enumerate(test_loader):
        sm = torch.t(sm.cuda()) # (T,B)
        sm_true = torch.t(sm_true.cuda()) # (T,B)
        with torch.no_grad():
            output = model(sm) # (T,B,V)
        # print(sm)
        # print(torch.max(output, dim=2).indices)

        acc = torch.eq(torch.sum(torch.where(torch.eq(sm,torch.as_tensor(PAD)), torch.as_tensor(True).cuda(), \
                                             torch.eq(torch.max(output, dim=2).indices, sm_true)), dim=0), sm.shape[0]).sum()
        
        loss = F.nll_loss(output.view(-1, len(vocab)),
                               sm_true.contiguous().view(-1),
                               ignore_index=PAD)
        total_loss += loss.item()
        total_acc += acc.item()
    return total_loss / len(test_loader), total_acc / len(test_loader) / sm.shape[1]

In [3]:
!gdown --id 1kJ4Ofhjw6FH-gEvsWyvydR1R5XG3Gv6F


Downloading...
From: https://drive.google.com/uc?id=1kJ4Ofhjw6FH-gEvsWyvydR1R5XG3Gv6F
To: /content/trfm_new_1_120000.pkl
0.00B [00:00, ?B/s]2.43MB [00:00, 76.9MB/s]


In [4]:
!gdown --id 1jYVXKGxya3bhiLdeaFNb20wpPXey_n1N

Downloading...
From: https://drive.google.com/uc?id=1jYVXKGxya3bhiLdeaFNb20wpPXey_n1N
To: /content/my_train_smiles.csv
101MB [00:00, 164MB/s] 


In [16]:
!gdown --id 1tlwmS8sePg9TgMBpqeSPkMw7tOroS9FZ

Downloading...
From: https://drive.google.com/uc?id=1tlwmS8sePg9TgMBpqeSPkMw7tOroS9FZ
To: /content/trfm_12_23000.pkl
22.1MB [00:00, 70.4MB/s]


In [1]:
from collections import namedtuple
Args = namedtuple('Args', ['n_epoch', 'vocab', 'data', 'outdir', 'name', 'seq_len', 'batch_size', 'n_worker', 'hidden',\
                         'n_layer', 'n_head', 'lr', 'hpu'])

args = Args(5, 'vocab.pkl', 'my_train_smiles.csv','result','ST',250,128,2,64,4,4,1e-4,0)

In [5]:
import torch 
assert torch.cuda.is_available()

import argparse
import math
import os

import numpy as np
import pandas as pd
import torch
from torch import nn
from torch import optim
from torch.autograd import Variable
from torch.nn import functional as F
from torch.utils.data import DataLoader
from tqdm import tqdm

from build_vocab import WordVocab
from dataset import Seq2seqDataset

print('Loading dataset...')
vocab = WordVocab.load_vocab(args.vocab)

data = pd.read_csv(args.data)['first'].values

# dataset = Seq2seqDataset(, vocab)

Loading dataset...


In [8]:
dlist = list(data)

In [14]:
new_data = []
lengths = []
for item in dlist:
    lengths.append(len(item))
    if len(item) <= 100:
        new_data.append(item)
new_data = np.array(new_data)
max(lengths)

100

In [13]:
new_data.shape

(1722298,)

In [7]:
data = np.array([d])

numpy.ndarray

In [None]:
assert torch.cuda.is_available()

print('Loading dataset...')
vocab = WordVocab.load_vocab(args.vocab)
dataset = Seq2seqDataset(pd.read_csv(args.data)['first'].values, vocab)
test_size = 100000
train, test = torch.utils.data.random_split(dataset, [len(dataset)-test_size, test_size])
train_loader = DataLoader(train, batch_size=args.batch_size, shuffle=True, num_workers=args.n_worker)
test_loader = DataLoader(test, batch_size=args.batch_size, shuffle=False, num_workers=args.n_worker)
print('Train size:', len(train))
print('Test size:', len(test))
del dataset, train, test

model = TrfmSeq2seq(len(vocab), args.hidden, len(vocab), args.n_layer).cuda()

# model.load_state_dict(torch.load(os.path.normpath('trfm_new_1_120000.pkl')))

# model.load_state_dict(torch.load(os.path.normpath('trfm_12_23000.pkl')))


optimizer = optim.Adam(model.parameters(), lr=args.lr)
print(model)
print('Total parameters:', sum(p.numel() for p in model.parameters()))

# best_loss = None
for e in range(1, args.n_epoch):
    for b, (sm, sm_true) in enumerate(train_loader):
        sm = torch.t(sm.cuda()) # (T,B)
        sm_true = torch.t(sm_true.cuda()) # (T,B)
        optimizer.zero_grad()
        output = model(sm) # (T,B,V)
        # print(sm_true, sm)

        loss = F.nll_loss(output.view(-1, len(vocab)),
                sm_true.contiguous().view(-1), ignore_index=PAD)
        # assert False
        loss.backward()
        optimizer.step()
        if b%1000==0:
            print('Train {:3d}: iter {:5d} | loss {:.3f} | ppl {:.3f}'.format(e, b, loss.item(), math.exp(loss.item())))
        if (b+1)%10000==0:
            loss,acc = evaluate(model, test_loader, vocab)
            print('Val {:3d}: iter {:5d} | loss {:.3f} | ppl {:.3f}'.format(e, b, loss, math.exp(loss)))
            print('acc: ', acc)
            # Save the model if the validation loss is the best we've seen so far.
            if not best_loss or loss < best_loss:
                print("[!] saving model...")
                if not os.path.isdir("save"):
                    os.makedirs("save")
                torch.save(model.state_dict(), './drive/MyDrive/FinalProject/trfm_new_%d_%d.pkl' % (e,b))
                best_loss = loss

Total parameters: 274861
Train   1: iter     0 | loss 2.270 | ppl 9.683
Train   1: iter  1000 | loss 2.206 | ppl 9.078
Train   1: iter  2000 | loss 2.220 | ppl 9.204
Train   1: iter  3000 | loss 2.166 | ppl 8.723
Train   1: iter  4000 | loss 2.273 | ppl 9.704
Train   1: iter  5000 | loss 2.191 | ppl 8.945
Train   1: iter  6000 | loss 2.216 | ppl 9.167
Train   1: iter  7000 | loss 2.326 | ppl 10.236
Train   1: iter  8000 | loss 2.279 | ppl 9.766
Train   1: iter  9000 | loss 2.206 | ppl 9.081
Val   1: iter  9999 | loss 2.230 | ppl 9.297
acc:  5e-05
[!] saving model...
Train   1: iter 10000 | loss 2.132 | ppl 8.432
Train   1: iter 11000 | loss 2.321 | ppl 10.185
Train   1: iter 12000 | loss 2.277 | ppl 9.750
Train   1: iter 13000 | loss 2.260 | ppl 9.582
Train   1: iter 14000 | loss 2.203 | ppl 9.055
Train   1: iter 15000 | loss 2.165 | ppl 8.717
Train   1: iter 16000 | loss 2.289 | ppl 9.863
Train   1: iter 17000 | loss 2.181 | ppl 8.854
Train   1: iter 18000 | loss 2.196 | ppl 8.989
Tra

In [23]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [24]:
!cp save/trfm_new_1_99999.pkl  drive/MyDrive/FinalProject

In [None]:
import torch

output = torch.zeros(2,3,5) + torch.ge(torch.randn(2,3,5), torch.as_tensor(0.5))
print(output)
sm = torch.ones(2,3)
print(sm)
torch.eq(torch.sum(torch.eq(torch.max(output, dim=2).values, sm), dim=0), sm.shape[0]).sum()

tensor([[[0., 1., 0., 1., 0.],
         [0., 0., 0., 0., 0.],
         [0., 1., 0., 0., 1.]],

        [[0., 1., 1., 1., 1.],
         [1., 1., 0., 0., 0.],
         [1., 0., 0., 1., 1.]]])
tensor([[1., 1., 1.],
        [1., 1., 1.]])


tensor(2)