In [2]:
import random

import numpy as np
import pandas as pd
import torch
import torch.functional as F
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data_utils
from tqdm import tqdm_notebook
from tqdm.auto import tqdm

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


# Data prep

In [3]:
# Mount Google Drive so the parquet files stored there can be read from Colab.
from google.colab import drive

drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [5]:
!ls '/content/drive/My Drive/GRU_data'

combined_klek.parquet  combined_selfies.parquet


In [6]:
!pip install fastparquet
!pip install selfies
!pip install wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fastparquet
  Downloading fastparquet-2023.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m33.3 MB/s[0m eta [36m0:00:00[0m
Collecting cramjam>=2.3 (from fastparquet)
  Downloading cramjam-2.6.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m96.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: cramjam, fastparquet
Successfully installed cramjam-2.6.2 fastparquet-2023.4.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting selfies
  Downloading selfies-2.1.1-py3-none-any.whl (35 kB)
Installing collected packages: selfies
Successfully installed selfies-2.1.1


In [None]:
# Log in to Weights & Biases so that the training runs below can be recorded.
import wandb
wandb.login()

In [7]:
import selfies as sf

# Read the SELFIES table and collect every symbol that occurs in it.
data = pd.read_parquet('/content/drive/My Drive/GRU_data/combined_selfies.parquet')
alphabet = sf.get_alphabet_from_selfies(data.selfies)

# Special tokens: sequence start, sequence end and the padding symbol.
alphabet.update(("[start]", "[end]", "[nop]"))
alphabet = sorted(alphabet)

# Longest molecule (in SELFIES symbols) plus a margin of 10 positions.
pad_to_len = 10 + max(map(sf.len_selfies, data.selfies))
print("Pad to len:", pad_to_len)

# Lookup tables between symbols and their positions in the sorted alphabet.
symbol_to_idx = dict(zip(alphabet, range(len(alphabet))))
idx2char = dict(enumerate(alphabet))

Pad to len: 119


In [8]:
import re
class SELFIESVectorizer:
    """Convert single SELFIES strings to one-hot / index arrays and back.

    Parameters
    ----------
    alphabet : sequence of str
        Ordered SELFIES symbols; the position of a symbol is its integer index.
        It must contain '[start]', '[end]' and '[nop]' whenever special tokens are used.
    pad_to_len : int or None
        Total sequence length including '[start]' and '[end]'. Shorter sequences are
        right-padded with '[nop]'; longer ones are left unpadded (not truncated).
        None disables padding.
    """
    def __init__(self, alphabet, pad_to_len=None):
        self.alphabet = alphabet
        self.char2idx = {s: i for i, s in enumerate(alphabet)}
        self.idx2char = {i: s for i, s in enumerate(alphabet)}
        self.pad_to_len = pad_to_len
    def _tokenize(self, selfie, no_special):
        """Split `selfie` into symbols and, unless `no_special`, add start/end/padding tokens."""
        tokens = self.split_selfi(selfie)
        if no_special:
            return tokens
        tokens = ['[start]'] + tokens + ['[end]']
        if self.pad_to_len is not None:
            # A negative count yields no padding, so over-long sequences pass through unchanged.
            tokens += ['[nop]'] * (self.pad_to_len - len(tokens))
        return tokens
    def vectorize(self, selfie, no_special=False):
        ''' One-hot encode a single SELFIES string to a numpy array of shape (n_tokens, len(alphabet)).

        With no_special=True only the symbols of `selfie` are encoded (no start/end/padding).
        Raises KeyError when a symbol is missing from the alphabet.
        '''
        tokens = self._tokenize(selfie, no_special)
        X = np.zeros((len(tokens), len(self.alphabet)))
        for i, token in enumerate(tokens):
            X[i, self.char2idx[token]] = 1
        return X
    def devectorize(self, ohe, remove_special=False):
        ''' Decode an array of shape (n_tokens, len(alphabet)) back to a SELFIES string.

        The symbol of each row is its argmax. With remove_special=True the '[start]' and
        '[end]' tokens are dropped; '[nop]' padding is kept.
        '''
        symbols = []
        for j in range(ohe.shape[0]):
            symbol = self.idx2char[int(np.argmax(ohe[j, :]))]
            if remove_special and symbol in ('[start]', '[end]'):
                continue
            symbols.append(symbol)
        return ''.join(symbols)
    def idxize(self, selfie, no_special=False):
        ''' Encode a single SELFIES string as a 1-D integer array of alphabet indices.

        Applies the same start/end/padding rules as `vectorize`, including pad_to_len=None
        (no padding), which previously raised a TypeError here.
        '''
        tokens = self._tokenize(selfie, no_special)
        return np.array([self.char2idx[s] for s in tokens], dtype=np.int64)
    def deidxize(self, idx):
        ''' Decode an iterable of alphabet indices back to a SELFIES string.'''
        return "".join(self.idx2char[int(i)] for i in idx)
    def split_selfi(self, selfie):
        ''' Split a SELFIES string into its bracketed symbols, e.g. '[C][O]' -> ['[C]', '[O]'].'''
        pattern = r'(\[[^\[\]]*\])'
        return re.findall(pattern, selfie)

In [9]:
vectorizer = SELFIESVectorizer(alphabet, pad_to_len=pad_to_len)

In [23]:
from torch.utils.data import Dataset, DataLoader
class GRUDatasetv2(Dataset):
    """Dataset yielding (fingerprint vector, index-encoded SELFIES) pairs.

    Parameters
    ----------
    smiles_fp : str
        Path to a parquet file with an `fps` column. Each entry is a string that evaluates
        to a list of integer positions, which are set to 1 in the reconstructed fingerprint.
    selfies : str
        Path to a parquet file whose first column holds the SELFIES strings.
        NOTE(review): rows are paired with `smiles_fp` purely by position — confirm that
        both files have the same length and ordering.
    vectorizer : object
        Must provide `idxize(selfie) -> np.ndarray` of integer indices.
    """
    def __init__(self, smiles_fp, selfies, vectorizer):
        self.smiles_fp = pd.read_parquet(smiles_fp)
        self.selfies = pd.read_parquet(selfies)
        self.selfies = self.prepare_y(self.selfies)
        self.vectorizer = vectorizer
    def __len__(self):
        return len(self.smiles_fp)
    def __getitem__(self, idx):
        """Return (float fingerprint tensor, long tensor of SELFIES indices) for row `idx`."""
        raw_selfie = self.selfies[idx][0]
        vectorized_selfie = self.vectorizer.idxize(raw_selfie)
        # Positional lookup: plain `fps[idx]` is label-based and fails (or silently picks
        # another row) whenever the frame's index is not 0..n-1.
        raw_X = self.smiles_fp.fps.iloc[idx]
        # NOTE(review): eval() executes whatever the parquet file contains. Only use trusted
        # files, or switch to ast.literal_eval if the column holds plain literals.
        X = np.array(eval(raw_X), dtype=int)
        X_reconstructed = self.reconstruct_fp(X)

        return torch.from_numpy(X_reconstructed).float(), torch.from_numpy(vectorized_selfie).long()

    @staticmethod
    def prepare_X(smiles_fp):
        """Evaluate every entry of the `fps` column into an integer numpy array."""
        fps = smiles_fp.fps.apply(eval).apply(lambda x: np.array(x, dtype=int))
        return fps
    @staticmethod
    def prepare_y(selfies):
        """Return the SELFIES frame as a 2-D numpy array (rows x columns)."""
        return selfies.values
    @staticmethod
    def reconstruct_fp(fp, length=4860):
        """Expand an array of active positions into a dense 0/1 vector of size `length`."""
        fp_rec = np.zeros(length)
        fp_rec[fp] = 1
        return fp_rec

In [24]:
dataset = GRUDatasetv2('/content/drive/My Drive/GRU_data/combined_klek.parquet', '/content/drive/My Drive/GRU_data/combined_selfies.parquet', vectorizer)
train_size = int(0.9 * len(dataset))
test_size = len(dataset) - train_size
# Seed the split so the train/validation partition is the same after every kernel restart;
# with an unseeded split, val_loss is not comparable between sweep runs from different sessions.
SPLIT_SEED = 42
train_dataset, test_dataset = torch.utils.data.random_split(
    dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)
print("Dataset size:", len(dataset))
print("Train size:", len(train_dataset))
print("Test size:", len(test_dataset))

Dataset size: 402863
Train size: 362576
Test size: 40287


In [25]:
# Inspect one sample: (fingerprint tensor, tensor of SELFIES symbol indices padded to pad_to_len).
dataset[0]

(tensor([0., 0., 0.,  ..., 0., 0., 0.]),
 tensor([41, 23, 23,  9, 23,  9, 18, 33, 11, 23,  7, 23, 12, 29, 23,  9, 23,  9,
         17,  9, 35,  7, 23, 12,  7, 23, 12, 29,  9, 17, 23, 29, 29, 23,  9, 33,
         11, 23,  9, 34, 33,  7, 39, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
         40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
         40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
         40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
         40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40]))

# NN architecture

In [26]:
class EncoderNet(nn.Module):
    """Fully connected encoder: fp_size -> 2048 -> 1024 -> 512 -> 256 -> encoding_size.

    Every linear layer, including the last one, is followed by a ReLU.
    """
    def __init__(self, fp_size, encoding_size):
        super().__init__()
        self.relu = nn.ReLU()
        widths = (fp_size, 2048, 1024, 512, 256, encoding_size)
        # Register fc1..fc5 in order, each mapping one width to the next.
        for number, (n_in, n_out) in enumerate(zip(widths, widths[1:]), start=1):
            setattr(self, f'fc{number}', nn.Linear(n_in, n_out))

    def forward(self, x):
        """Map a batch of fingerprints to a batch of non-negative encodings."""
        out = x
        for layer in (self.fc1, self.fc2, self.fc3, self.fc4, self.fc5):
            out = self.relu(layer(out))
        return out


class DecoderNet(nn.Module):
    """GRU decoder that maps token indices and a hidden state to per-token vocabulary logits.

    Parameters
    ----------
    dictionary_size : int
        Number of symbols in the vocabulary (embedding rows and output logits).
    emb_size : int
        Size of the token embeddings.
    hidden_size : int
        Size of the GRU hidden state.
    num_layers : int
        Number of stacked GRU layers.
    drop_prob : float
        Dropout between GRU layers; only passed to the GRU when num_layers > 1.
    max_len : int or None
        Maximum sequence length stored on the module. When None it is read from the
        notebook-level `vectorizer.pad_to_len`, as before.
    """
    def __init__(self, dictionary_size, emb_size, hidden_size, num_layers, drop_prob, max_len=None):
        super(DecoderNet, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.drop_prob = drop_prob
        # embedding layer [batch_size, seq_len] -> [batch_size, seq_len, emb_size]
        self.embedding = nn.Embedding(dictionary_size, emb_size)
        # nn.GRU applies dropout only between stacked layers and warns when it is non-zero
        # for a single layer, so pass 0 in that case (the computation is unchanged).
        gru_dropout = drop_prob if num_layers > 1 else 0.0
        # gru layer [batch_size, seq_len, emb_size] -> [batch_size, seq_len, hidden_size]
        self.gru = nn.GRU(emb_size, hidden_size, num_layers, dropout=gru_dropout, batch_first=True)
        # fully connected layer [batch_size, seq_len, hidden_size] -> [batch_size, seq_len, dictionary_size]
        self.fc = nn.Linear(hidden_size, dictionary_size)
        self.max_len = vectorizer.pad_to_len if max_len is None else max_len

    def forward(self, input, hidden):
        """Return (logits [batch, seq_len, dictionary_size], new hidden state)."""
        embedded = self.embedding(input)
        prediction, hidden = self.gru(embedded, hidden)
        prediction = self.fc(prediction)
        return prediction, hidden


    def init_hidden(self, encoded):
        """Replicate an encoding [batch, hidden_size] to [num_layers, batch, hidden_size].

        The result stays on the device of `encoded` instead of being moved to a global device.
        """
        return encoded.unsqueeze(0).repeat(self.num_layers, 1, 1)
    
class Autoencoder(nn.Module):
    """Fingerprint -> SELFIES sequence model: an EncoderNet followed by a DecoderNet.

    The encoder output is used as the initial hidden state of every GRU layer, so
    `encoding_size` must equal `hidden_size`.

    Parameters
    ----------
    input_size : int
        Length of the input fingerprint vector.
    encoding_size : int
        Size of the encoder output.
    dictionary_size : int
        Vocabulary size; defaults to the notebook-level alphabet at class definition time.
    emb_size, hidden_size, num_layers, drop_prob
        Forwarded to DecoderNet.
    teacher_forcing_ratio : float
        Default probability, per decoding step, of feeding the ground-truth token.

    Raises
    ------
    ValueError
        If `encoding_size` differs from `hidden_size`.
    """
    def __init__(self, input_size=4860, 
                 encoding_size=256, 
                 dictionary_size=len(alphabet), 
                 emb_size=256, 
                 hidden_size=256, 
                 num_layers=2, 
                 teacher_forcing_ratio=0.5,
                 drop_prob=0.2):
        super(Autoencoder, self).__init__()
        if encoding_size != hidden_size:
            # Fail early with a clear message instead of a shape error inside the GRU.
            raise ValueError(
                f"encoding_size ({encoding_size}) must equal hidden_size ({hidden_size}): "
                "the encoding is used as the GRU's initial hidden state"
            )
        self.encoder = EncoderNet(input_size, encoding_size)
        self.decoder = DecoderNet(dictionary_size, emb_size, hidden_size, num_layers, drop_prob)
        self.teacher_forcing_ratio = teacher_forcing_ratio

    def forward(self, src,trg, teacher_forcing_ratio=None):
        """Decode `trg.shape[1] - 1` steps and return logits [batch, trg_len - 1, dictionary_size].

        `src` is a batch of fingerprints and `trg` a batch of token-index sequences whose first
        column seeds the decoder. Output position t - 1 holds the prediction for `trg[:, t]`.
        `teacher_forcing_ratio` overrides the module default for this call; it is ignored
        (treated as 0) when the module is in evaluation mode.
        """
        teacher_forcing_ratio = self.teacher_forcing_ratio if teacher_forcing_ratio is None else teacher_forcing_ratio
        # if in evaluation mode we don't use teacher forcing
        if not self.training:
            teacher_forcing_ratio = 0
        trg_len = trg.shape[1]
        outputs = []
        # The same encoding initialises the hidden state of every GRU layer.
        hidden = self.encoder(src)
        hidden = hidden.unsqueeze(0).repeat(self.decoder.num_layers, 1, 1)
        step_input = trg[:, 0].unsqueeze(1)
        for t in range(1, trg_len):
            output, hidden = self.decoder(step_input, hidden)
            outputs.append(output)
            # One random draw per step decides for the whole batch.
            if random.random() < teacher_forcing_ratio:
                step_input = trg[:, t].unsqueeze(1)
            else:
                step_input = torch.argmax(output, dim=2)
        outputs = torch.cat(outputs, dim=1)
        return outputs

# Training

In [14]:
!pip install wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting wandb
  Downloading wandb-0.15.3-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m35.4 MB/s[0m eta [36m0:00:00[0m
Collecting GitPython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.31-py3-none-any.whl (184 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m184.3/184.3 kB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-1.25.0-py2.py3-none-any.whl (206 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m206.5/206.5 kB[0m [31m27.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting pathtools (from wandb)
  Downloading pathtools-0.1.2.tar.gz (11 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting

## Sweep config

In [29]:
import pprint

# Objective the Bayesian search tries to minimise.
metric = {
    'name': 'val_loss',
    'goal': 'minimize',
}

# Search space: discrete choices for the architecture, a flat distribution
# between 0 and 0.01 for the learning rate, fixed epochs and batch size.
parameters_dict = {
    'encoding_size': {'values': [2 ** k for k in range(5, 11)]},
    'emb_size': {'values': [2 ** k for k in range(5, 11)]},
    'num_layers': {'values': list(range(1, 6))},
    'teacher_forcing_ratio': {'values': [i / 10 for i in range(11)]},
    'drop_prob': {'values': [i / 10 for i in range(6)]},
    'epochs': {'value': 20},
    'learning_rate': {
        'distribution': 'uniform',
        'min': 0.0,
        'max': 0.01,
    },
    'batch_size': {'value': 256},
}

sweep_config = {
    'method': 'bayes',
    'metric': metric,
    'parameters': parameters_dict,
}
pprint.pprint(sweep_config)

{'method': 'bayes',
 'metric': {'goal': 'minimize', 'name': 'val_loss'},
 'parameters': {'batch_size': {'value': 256},
                'drop_prob': {'values': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]},
                'emb_size': {'values': [32, 64, 128, 256, 512, 1024]},
                'encoding_size': {'values': [32, 64, 128, 256, 512, 1024]},
                'epochs': {'value': 20},
                'learning_rate': {'distribution': 'uniform',
                                  'max': 0.01,
                                  'min': 0.0},
                'num_layers': {'values': [1, 2, 3, 4, 5]},
                'teacher_forcing_ratio': {'values': [0.0,
                                                     0.1,
                                                     0.2,
                                                     0.3,
                                                     0.4,
                                                     0.5,
                                                     0.6,


In [30]:
# The sweep is created only once; afterwards its id string is reused to run more trials.
# NOTE(review): the id below is hard-coded — uncomment the wandb.sweep(...) line to
# create a new sweep from sweep_config.
#sweep_id = wandb.sweep(sweep_config, project="selfie-autoencoder_v2")
sweep_id = 'jjgazi3d'

In [31]:
def train_hyper(config=None):
    """Run one sweep trial: build the model from the wandb config, train and validate it.

    Reads encoding_size, emb_size, num_layers, teacher_forcing_ratio, drop_prob, batch_size,
    epochs and learning_rate from `wandb.config` and logs loss, val_loss and a table of
    decoded examples once per epoch.
    """
    # Initialize a new wandb run
    with wandb.init(config=config):
        # Config is a variable that holds and saves hyperparameters and inputs
        config = wandb.config
        # The encoder output seeds the GRU hidden state, hence hidden_size == encoding_size.
        autoencoder = Autoencoder(encoding_size=config.encoding_size,
                                  emb_size=config.emb_size,
                                  hidden_size=config.encoding_size,
                                  num_layers=config.num_layers,
                                  teacher_forcing_ratio=config.teacher_forcing_ratio,
                                  drop_prob=config.drop_prob).to(device)
        # Create the optimizer once so Adam keeps its moment estimates across epochs.
        optimizer = optim.Adam(autoencoder.parameters(), lr=config.learning_rate)
        dataloader = DataLoader(train_dataset, shuffle=True, batch_size=config.batch_size, drop_last=True)
        val_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=config.batch_size, drop_last=True)
        for i in tqdm(range(config.epochs)):
            avg_loss = train_epoch(autoencoder, dataloader, config.learning_rate, optimizer=optimizer)
            val_loss, table = evaluate(autoencoder, val_dataloader)
            # A single call keeps all metrics of one epoch on the same wandb step.
            wandb.log({'epoch': i, 'loss': avg_loss, 'val_loss': val_loss, 'table': table})


def train_epoch(autoencoder, dataloader, learning_rate, optimizer=None):
    """Train `autoencoder` for one pass over `dataloader` and return the mean batch loss.

    Parameters
    ----------
    autoencoder : nn.Module
        Model called as `autoencoder(src, trg)`; its output is compared against `trg[:, 1:]`.
    dataloader : iterable
        Yields (src, trg) batches.
    learning_rate : float
        Used only when `optimizer` is None, to build a fresh Adam optimizer.
    optimizer : torch.optim.Optimizer or None
        Optimizer to reuse between epochs. When None a new Adam optimizer is created for
        this call, which discards the optimizer state of earlier epochs.
    """
    autoencoder.train()
    # Padding positions ('[nop]') do not contribute to the loss.
    criterion = nn.CrossEntropyLoss(ignore_index=vectorizer.char2idx['[nop]'])
    if optimizer is None:
        optimizer = optim.Adam(autoencoder.parameters(), lr=learning_rate)
    epoch_loss = 0
    for src, trg in tqdm(dataloader):
        src = src.to(device)
        trg = trg.to(device)
        optimizer.zero_grad()
        output = autoencoder(src, trg)
        # The model predicts tokens 1..T-1, so drop the first target column.
        trg = trg[:, 1:]
        # CrossEntropyLoss expects the class dimension second: [batch, classes, seq].
        output = output.permute(0, 2, 1)
        loss = criterion(output, trg)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        wandb.log({'batch_loss': loss.item()})
    avg_loss = epoch_loss / len(dataloader)

    return avg_loss

def evaluate(autoencoder, dataloader):
    """Compute the mean validation loss and a wandb table of decoded examples.

    The model is put in eval mode (so it decodes without teacher forcing) and everything
    runs under `torch.no_grad()`.

    Returns
    -------
    (avg_loss, table)
        `avg_loss` is the mean per-batch cross-entropy with '[nop]' positions ignored;
        `table` is a wandb.Table with "Target" / "Output" strings for the first five
        samples of the first batch.
    """
    autoencoder.eval()
    criterion = nn.CrossEntropyLoss(ignore_index=vectorizer.char2idx['[nop]'])
    epoch_loss = 0
    # No gradients are needed for validation; this avoids building autograd graphs.
    with torch.no_grad():
        for src, trg in tqdm(dataloader):
            src = src.to(device)
            trg = trg.to(device)
            output = autoencoder(src, trg)
            # The model predicts tokens 1..T-1, so drop the first target column.
            trg = trg[:, 1:]
            # CrossEntropyLoss expects the class dimension second: [batch, classes, seq].
            output = output.permute(0, 2, 1)
            loss = criterion(output, trg)
            epoch_loss += loss.item()
        avg_loss = epoch_loss / len(dataloader)

        visual_examples_src, visual_examples_trg = next(iter(dataloader))
        visual_examples_src = visual_examples_src.to(device)[:5]
        visual_examples_trg = visual_examples_trg.to(device)[:5]
        # In eval mode the model ignores teacher forcing, so no ratio is passed
        # (the previous call referenced an undefined `teacher_forcing_ratio`).
        visual_examples_outputs = autoencoder(visual_examples_src, visual_examples_trg)
        # Most likely symbol index at every position: [batch, seq, classes] -> [batch, seq].
        predicted_idx = torch.argmax(visual_examples_outputs, dim=2).cpu().numpy()
        predicted_strings = [vectorizer.deidxize(row) for row in predicted_idx]
        target_strings = [vectorizer.deidxize(row) for row in visual_examples_trg.cpu().numpy()]
        #log visual examples as table to wandb
        rows = [[target, predicted] for target, predicted in zip(target_strings, predicted_strings)]
        table = wandb.Table(data=rows, columns = ["Target", "Output"])

    return avg_loss, table


In [None]:
# Start a sweep agent that calls train_hyper for a single trial (count=1).
wandb.agent(sweep_id,project="selfie-autoencoder_v2", function=train_hyper, count=1)

[34m[1mwandb[0m: Agent Starting Run: fezzp3fb with config:
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	drop_prob: 0.3
[34m[1mwandb[0m: 	emb_size: 256
[34m[1mwandb[0m: 	encoding_size: 1024
[34m[1mwandb[0m: 	epochs: 20
[34m[1mwandb[0m: 	learning_rate: 0.0005666226399107366
[34m[1mwandb[0m: 	num_layers: 1
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.8




  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/1416 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]