# 드라이브 마운트 및 공유 디렉토리 설정

In [None]:
# Mount Google Drive in the Colab runtime under /content/drive

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Lecture 7 shared folder: path of the pre-processed HDF5 dataset

data_path = '/content/drive/MyDrive/For_studuent_sharing/7강/data/processed.h5'

# 데이터 준비 (Data preparation)

### 데이터 살펴보기 

---

**Raw data source**  
Drug-like commercially available molecules from ZINC database.  
https://zinc.docking.org/




In [None]:
# Open the pre-processed dataset read-only; the following cells read from this handle.
# NOTE(review): the handle is not closed in any of the visible cells.
import h5py
h5f = h5py.File(data_path, 'r')

In [None]:
# Pre-processed (one-hot encoded) molecule data: training split

data_train = h5f['data_train'][:]
data_train.shape

(40000, 120, 33)


- Sample size: 40000  
- Max sequence length: 120   
- Number of characters: 33


In [None]:
# Characters used for the one-hot encoding

h5f['charset'][:]

array([b' ', b'#', b')', b'(', b'+', b'-', b'/', b'1', b'3', b'2', b'5',
       b'4', b'7', b'6', b'=', b'@', b'C', b'B', b'F', b'I', b'H', b'O',
       b'N', b'S', b'[', b']', b'\\', b'c', b'l', b'o', b'n', b's', b'r'],
      dtype='|S1')

In [None]:
# Example slice: the first 8 training samples
x_example = data_train[:8]
x_example.shape

(8, 120, 33)

In [None]:
# Pre-processed (one-hot encoded) molecule data: test split
data_test = h5f['data_test'][:]
data_test.shape

(10000, 120, 33)

- Sample size: 10000  
- Max sequence length: 120   
- Number of characters: 33

# 모델 학습 (Training)

### 학습에 필요한 함수 정의

In [None]:
import numpy as np

In [None]:

def from_one_hot_array(vec):
    """Return the index of the first entry equal to 1 in a one-hot vector.

    Args:
        vec: NumPy array to scan; its entries are compared against 1.

    Returns:
        The position (along the first axis) of the first matching entry as
        a built-in ``int``, or ``None`` when no entry equals 1.
    """
    hits = np.nonzero(vec == 1)[0]
    return int(hits[0]) if hits.size else None

def decode_smiles_from_indexes(vec, charset):
    """Turn a sequence of character indices into a string.

    Args:
        vec: Iterable of indices into ``charset``.
        charset: Indexable collection mapping an index to its character.

    Returns:
        The looked-up characters concatenated in order, with leading and
        trailing whitespace stripped.
    """
    characters = [charset[index] for index in vec]
    return "".join(characters).strip()

def load_dataset(filename, split = True):
    """Load the pre-processed arrays and the character set from an HDF5 file.

    Args:
        filename (str): Path of the pre-processed h5py file.
        split (bool): If True, return both the train and the test data;
            otherwise return only the test data.

    Returns:
        tuple: ``(data_train, data_test, charset)`` when ``split`` is True,
        ``(data_test, charset)`` otherwise.
    """
    # The context manager closes the file even when a dataset is missing or
    # a read fails. The arrays are read out with ``[:]`` before the file is
    # closed, exactly as before, so they are returned after the close.
    with h5py.File(filename, 'r') as h5f:
        # The training split is only read when it is going to be returned
        data_train = h5f['data_train'][:] if split else None
        data_test = h5f['data_test'][:]

        # Characters of the molecule encoding
        charset = h5f['charset'][:]

    if split:
        return (data_train, data_test, charset)
    return (data_test, charset)


### Data loading 

In [None]:
# Load the train/test arrays and the charset from the HDF5 file
data_train, data_test, charset = load_dataset(data_path)
charset

array([b' ', b'#', b')', b'(', b'+', b'-', b'/', b'1', b'3', b'2', b'5',
       b'4', b'7', b'6', b'=', b'@', b'C', b'B', b'F', b'I', b'H', b'O',
       b'N', b'S', b'[', b']', b'\\', b'c', b'l', b'o', b'n', b's', b'r'],
      dtype='|S1')

In [None]:
# Decode the byte-string charset entries into str
charset = [char.decode('utf-8') for char in charset]
print(charset)

[' ', '#', ')', '(', '+', '-', '/', '1', '3', '2', '5', '4', '7', '6', '=', '@', 'C', 'B', 'F', 'I', 'H', 'O', 'N', 'S', '[', ']', '\\', 'c', 'l', 'o', 'n', 's', 'r']


### Torch Dataset 및 DataLoader

In [None]:
# torch is imported here because this cell runs before the library-import
# cell further down; without it the cell fails with NameError when the
# notebook is executed top to bottom.
import torch
import torch.utils.data

# Torch Dataset (wraps the NumPy training array as a tensor dataset)
data_train = torch.utils.data.TensorDataset(torch.from_numpy(data_train))

# Torch DataLoader
train_loader = torch.utils.data.DataLoader(data_train, batch_size=250, shuffle=True)

In [None]:
# Take the first batch from the loader as an example and print its shape
for batch in train_loader:
    x_example = batch[0]
    print(x_example.shape)
    break

torch.Size([250, 120, 33])


### Variational AutoEncoder(VAE) 모델 만들기

---

![](https://drive.google.com/uc?export=view&id=1VImMz5Zo0dv1mu5NZ7Q_a-DBmGUTGOHz)


In [None]:
# 필요한 라이브러리 호출

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data
import torch.optim as optim

In [None]:
class MolecularVAE(nn.Module):
    """Variational autoencoder (VAE) for molecule generation.

    The encoder is a stack of three 1D convolutions followed by a linear
    layer; two linear heads produce the mean and the log-variance of the
    latent distribution. The decoder repeats the latent vector along the
    sequence axis, runs a 3-layer GRU and maps every time step to a
    probability distribution over the character set.

    Args:
        max_len (int): Sequence length of the one-hot input (default 120).
        charset_size (int): Number of characters in the one-hot encoding
            (default 33). Must be at least 27 so that the three un-padded
            convolutions leave a non-empty output.
        latent_dim (int): Size of the latent vector (default 292).
        epsilon_std (float): Standard deviation of the noise drawn in
            ``sampling`` (default 1e-2).

    Raises:
        ValueError: If ``charset_size`` is too small for the convolutions.
    """

    def __init__(self, max_len=120, charset_size=33, latent_dim=292, epsilon_std=1e-2):
        super().__init__()

        # The three convolutions use no padding, so together they shorten
        # the character axis by 8 + 8 + 10 = 26 positions.
        conv_out_len = charset_size - 26
        if conv_out_len < 1:
            raise ValueError(
                f"charset_size must be at least 27, got {charset_size}"
            )

        self.max_len = max_len
        self.charset_size = charset_size
        self.latent_dim = latent_dim
        self.epsilon_std = epsilon_std

        # Encoder: 1D-convolution layers and a linear layer
        self.conv_1 = nn.Conv1d(max_len, 9, kernel_size=9)
        self.conv_2 = nn.Conv1d(9, 9, kernel_size=9)
        self.conv_3 = nn.Conv1d(9, 10, kernel_size=11)
        self.linear_0 = nn.Linear(10 * conv_out_len, 435)

        # Layers that produce the latent variables
        self.linear_1 = nn.Linear(435, latent_dim)  # mean
        self.linear_2 = nn.Linear(435, latent_dim)  # log-variance

        # Decoder: GRU layer and linear layers
        self.linear_3 = nn.Linear(latent_dim, latent_dim)
        self.gru = nn.GRU(latent_dim, 501, 3, batch_first=True)
        self.linear_4 = nn.Linear(501, charset_size)

        self.relu = nn.ReLU()
        # Not used by forward(); kept so the attribute stays available.
        # The explicit dim avoids the deprecated implicit-dim behaviour.
        self.softmax = nn.Softmax(dim=-1)

    def encode(self, x):
        """Encode the input and estimate the latent distribution.

        Args:
            x: One-hot input of shape (batch_size, max_len, charset_size).

        Returns:
            tuple: ``(z_mean, z_logvar)``, the mean and the log-transformed
            variance, each of shape (batch_size, latent_dim).
        """
        # 1D convolutions: the sequence axis is the channel axis and the
        # kernels slide over the character axis.
        # Shape comments assume the default sizes.
        x = self.relu(self.conv_1(x))                   # (batch_size, 9, 25)
        x = self.relu(self.conv_2(x))                   # (batch_size, 9, 17)
        x = self.relu(self.conv_3(x))                   # (batch_size, 10, 7)

        # Flatten everything except the batch axis
        x = x.flatten(start_dim=1)                      # (batch_size, 70)

        # Linear embedding
        x = F.selu(self.linear_0(x))                    # (batch_size, 435)

        # Linear heads for the latent variables
        z_mean = self.linear_1(x)                       # (batch_size, 292)
        # The log of the variance is estimated for training stability
        z_logvar = self.linear_2(x)                     # (batch_size, 292)
        return z_mean, z_logvar

    def sampling(self, z_mean, z_logvar):
        """Draw a latent sample with the reparametrization trick.

        Computes ``z = z_mean + sigma * epsilon`` where ``epsilon`` is
        standard-normal noise scaled by ``self.epsilon_std`` (1e-2 by
        default, so the noise is NOT unit-variance).

        sigma is always positive, so for training stability the network
        estimates log(var) and sigma is recovered as
        ``sigma = exp(0.5 * log(var))``.

        https://stats.stackexchange.com/questions/486158/reparameterization-trick-in-vaes-how-should-we-do-this

        Args:
            z_mean: Latent mean, shape (batch_size, latent_dim).
            z_logvar: Latent log-variance, same shape as ``z_mean``.

        Returns:
            Sampled latent tensor with the same shape as ``z_mean``.
        """
        # Scaled random noise
        epsilon = self.epsilon_std * torch.randn_like(z_logvar)
        # log(var) -> sigma
        sigma = torch.exp(0.5 * z_logvar)
        # Randomly sampled z
        return sigma * epsilon + z_mean

    def decode(self, z):
        """Generate molecule character probabilities from a latent vector.

        Args:
            z: Latent tensor of shape (batch_size, latent_dim).

        Returns:
            Tensor of shape (batch_size, max_len, charset_size) whose last
            axis is a softmax distribution over the characters.
        """
        # Linear embedding of the sampled z
        z = F.selu(self.linear_3(z))                        # (batch_size, 292)
        # Feed the same latent vector at every sequence position
        z = z.unsqueeze(1).repeat(1, self.max_len, 1)       # (batch_size, max_len, 292)

        # 3-layer GRU; the final hidden state, of shape
        # (3, batch_size, 501), is not needed
        out, _ = self.gru(z)                                # (batch_size, max_len, 501)

        # Project to the character dimension. nn.Linear and softmax act on
        # the last axis, so no flatten/reshape round-trip is required.
        logits = self.linear_4(out)                         # (batch_size, max_len, charset_size)

        # Probabilities over the characters at every position
        x_decoded = F.softmax(logits, dim=-1)
        return x_decoded

    def forward(self, x):
        """Run encode -> sampling -> decode.

        Returns:
            tuple: ``(x_decoded, z_mean, z_logvar)``.
        """
        # Encoding
        z_mean, z_logvar = self.encode(x)
        # Random sampling (reparametrization trick)
        z = self.sampling(z_mean, z_logvar)
        # Decoding
        x_decoded = self.decode(z)
        return x_decoded, z_mean, z_logvar

### VAE toy example

In [None]:
# Toy example: take the example batch and build a freshly initialised model
x = x_example
vae = MolecularVAE()

In [None]:
# Encoding: encode() returns the latent mean and the log-variance
# (the second value is stored here under the name `var`)
mean, var = vae.encode(x)
print(mean.shape)
print(var.shape)

torch.Size([250, 292])
torch.Size([250, 292])


In [None]:
# Random sampling of z (reparametrization trick)
z = vae.sampling(mean, var)
print(z.shape)

torch.Size([250, 292])


In [None]:
# Decoding: map the sampled z back to per-position character probabilities
x_decoded = vae.decode(z)
print(x_decoded.shape)

torch.Size([250, 120, 33])


### 학습을 위한 설정

In [None]:
# Fix the random seed
torch.manual_seed(42)

# Number of training epochs
epochs = 50

# Device selection: GPU when available, otherwise CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'현재 device: {device}')

# Instantiate the model on the selected device
model = MolecularVAE().to(device)

# Optimizer (Adam with its default hyperparameters)
optimizer = optim.Adam(model.parameters())

현재 device: cuda


### 학습 진행

VAE 학습을 위한 loss 정의

---


![](https://drive.google.com/uc?export=view&id=19ANgkmP6I-fqasKaVV4RYkMvX1l3YHLJ)


In [None]:
def vae_loss(x_decoded_mean, x, z_mean, z_logvar):
    """Compute the VAE loss: reconstruction term plus KL regularization.

    Args:
        x_decoded_mean: Decoder output; probabilities with the same shape
            as ``x``.
        x: Target tensor the reconstruction is compared against.
        z_mean: Latent mean produced by the encoder.
        z_logvar: Latent log-variance produced by the encoder.

    Returns:
        Scalar tensor holding the summed binary cross-entropy plus the
        summed KL term (no averaging over batch or elements).
    """
    # Reconstruction loss, summed over every element. ``reduction='sum'``
    # replaces the deprecated ``size_average=False`` and yields the same value.
    xent_loss = F.binary_cross_entropy(x_decoded_mean, x, reduction='sum')

    # Regularization: KL divergence between N(z_mean, exp(z_logvar)) and N(0, I)
    kl_loss = -0.5 * torch.sum(1 + z_logvar - z_mean.pow(2) - z_logvar.exp())

    return xent_loss + kl_loss

In [None]:


def train(epoch):
    """Train the model for one epoch.

    Relies on the module-level ``model``, ``train_loader``, ``optimizer``,
    ``device`` and ``charset``.

    Args:
        epoch (int): Current epoch number; only used in the progress output.

    Returns:
        float: Sum of the batch losses divided by the number of samples in
        the training dataset.
    """
    model.train()
    train_loss = 0.0
    for batch_idx, data in enumerate(train_loader):
        data = data[0].to(device)

        # forward pass
        output, mean, logvar = model(data)

        # Show one example from the first batch of the epoch
        if batch_idx == 0:
            inp = data.cpu().numpy()
            outp = output.detach().cpu().numpy()
            # The reconstruction target is the input itself
            lab = inp

            print("Input:")
            print(inp)
            print(decode_smiles_from_indexes(map(from_one_hot_array, inp[0]), charset))

            print("Label:")
            print(decode_smiles_from_indexes(map(from_one_hot_array, lab[0]), charset))

            # Most probable character index at every sequence position
            sampled = outp[0].argmax(axis=-1)

            print("Output:")
            print(decode_smiles_from_indexes(sampled, charset))

        loss = vae_loss(output, data, mean, logvar)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Accumulate a plain Python float: adding the loss tensor itself
        # would keep autograd history alive across the whole epoch and
        # make the returned value a tensor carrying a grad_fn.
        batch_loss = loss.item()
        train_loss += batch_loss

        if batch_idx % 100 == 0:
            print(f'{epoch} / {batch_idx}\t{batch_loss:.4f}')

    epoch_loss = train_loss / len(train_loader.dataset)
    print('train', epoch_loss)
    return epoch_loss


In [None]:
# Run training for the configured number of epochs
for epoch in range(1, epochs + 1):
    train_loss = train(epoch)

Input:
[[[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]]

 ...

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]]]
c1c



1 / 0	133759.0156
1 / 100	31155.8184
train tensor(149.7125, device='cuda:0', grad_fn=<DivBackward0>)
Input:
[[[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]]

 ...

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ..

KeyboardInterrupt: ignored

# Molecule generation 

In [None]:
# Random sampling from a standard Gaussian, moved to the device the model
# lives on (a hard-coded .cuda() fails on CPU-only runtimes)
z = torch.randn(64, 292).to(device)

# Molecule generation from the sampled z; generation needs no gradients
with torch.no_grad():
    sample = model.decode(z)

In [None]:
# Shape of the generated batch
sample.shape

torch.Size([64, 120, 33])

In [None]:
# Convert the generated batch to a NumPy array and inspect the sample at index 10
outp = sample.cpu().detach().numpy()
outp[10]

array([[9.1399340e-04, 5.0477032e-04, 3.1022017e-03, ..., 4.4673006e-03,
        8.0889853e-04, 2.3443282e-04],
       [4.8638755e-04, 3.2591890e-04, 4.7977935e-03, ..., 9.1953501e-03,
        9.9231524e-04, 1.3034025e-04],
       [4.3968740e-04, 3.2560879e-04, 9.5880041e-03, ..., 1.1825797e-02,
        1.5764314e-03, 1.1669738e-04],
       ...,
       [9.9987209e-01, 2.4397045e-06, 1.0256973e-05, ..., 1.0649267e-06,
        1.0975990e-06, 4.5960915e-06],
       [9.9987221e-01, 2.4394603e-06, 1.0253043e-05, ..., 1.0645033e-06,
        1.0970655e-06, 4.5938923e-06],
       [9.9987233e-01, 2.4392186e-06, 1.0249153e-05, ..., 1.0640874e-06,
        1.0965373e-06, 4.5917072e-06]], dtype=float32)

In [None]:
# Character index with the highest value at each position of the generated sample at index 10
sampled = outp[10].reshape(1, 120, len(charset)).argmax(axis=2)[0]
sampled

array([16,  7, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
       27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 16, 16,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0])

In [None]:
# Convert the character indices to a string
decode_smiles_from_indexes(sampled, charset)