<a href="https://colab.research.google.com/github/cgjeong23/Deep-Learning-Models/blob/main/RNN_Practice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install tokenizers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tokenizers
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 5.2 MB/s 
[?25hInstalling collected packages: tokenizers
Successfully installed tokenizers-0.12.1


## Dataloader

In [None]:
from torch.utils.data import Dataset, DataLoader

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
import torch

class SequenceDataset(Dataset):
  """Map-style dataset that tokenizes raw sequence strings on access.

  Every item is the list of token ids produced by a pre-trained tokenizer
  (loaded from `tokenizer_file`), returned as a 1-D LongTensor. Items are
  encoded one at a time, so their lengths follow the tokenization of each
  string and are not aligned to a common length here.
  """

  def __init__(self, sequence, tokenizer_file='monkeypox_tokenizer.json'):
    """Store the sequences and load the tokenizer.

    Args:
      sequence: list of str, e.g. ["ACTG......", "GTCA....."].
      tokenizer_file: path to a serialized tokenizer JSON file.
    """
    self.sequence = sequence

    # `from_file` restores the full tokenizer from the JSON file, so there is
    # no need to build a blank BPE tokenizer first only to throw it away.
    self.tokenizer = Tokenizer.from_file(tokenizer_file)
    # Padding is enabled so that `tokenizer.padding['pad_id']` is available
    # to the cells that configure the model and the loss.
    self.tokenizer.enable_padding()

  def __len__(self):
    """Number of sequences held by the dataset."""
    return len(self.sequence)

  def __getitem__(self, idx):
    """Tokenize sequence `idx` and return its token ids as a LongTensor."""
    encoding = self.tokenizer.encode(self.sequence[idx])
    return torch.tensor(encoding.ids, dtype=torch.long)

## Model

In [None]:
# Smoke-test fixture: two short nucleotide strings wrapped in the dataset
# (using the default tokenizer file) and served one sequence per batch,
# in shuffled order.
dataset = SequenceDataset(
    ['ACTGACTACTGACGATCGACTGG', 'ACTGACCACTGACTGATCGGTG'],
)
train_loader = DataLoader(dataset=dataset, shuffle=True, batch_size=1)

In [None]:
# Inspect the first item: the token ids of sequence 0, returned as a LongTensor.
dataset[0]

tensor([2716, 1470,   69,   64,   15])

In [None]:
from torch import nn

class RNNModel(nn.Module):
  """Embedding -> LSTM -> linear head that emits vocabulary logits per position.

  Args:
    vocab_size: number of token ids; also the size of the output logits.
    embedding_dim: size of each token embedding.
    pad_id: id of the padding token; its embedding row is kept at zero and
      receives no gradient.
    hidden_dim: LSTM hidden size.
    num_layers: number of stacked LSTM layers.
  """

  def __init__(self, vocab_size, embedding_dim, pad_id, hidden_dim, num_layers):
    super().__init__()
    # The keyword is `padding_idx`; the previous spelling raised a TypeError.
    self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_id)
    self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
    self.out_layer = nn.Linear(hidden_dim, vocab_size)

  def forward(self, ids):
    """Map token ids [B, L] to logits [B, L, V].

    B is the batch size, L the sequence length, E the embedding size,
    H the hidden size and V the vocabulary size.
    """
    embedded_ids = self.embedding(ids) # [B, L, E]
    rnn_out, _ = self.rnn(embedded_ids) # [B, L, H]; final LSTM state is not used
    return self.out_layer(rnn_out) # [B, L, V]

## Loss

In [None]:
# Cross-entropy criterion with default settings.
# NOTE(review): no later cell shown in this notebook reads `loss`; the training
# cells build and use a separate `loss_function` — confirm whether this is still needed.
loss = nn.CrossEntropyLoss()

## Training

In [None]:
from tqdm import tqdm

def train(model, dataloader, loss_function, lr, epoch, device=None):
  """Train `model` on next-token prediction and return the mean loss per epoch.

  Each batch of ids [B, L] is split into inputs `batch[:, :-1]` and targets
  `batch[:, 1:]`, i.e. the model learns to predict the following token.

  Args:
    model: module mapping ids [B, L] to logits [B, L, V].
    dataloader: iterable yielding id tensors of shape [B, L].
    loss_function: callable taking (logits [B, V, L], targets [B, L]).
    lr: learning rate for the Adam optimizer.
    epoch: number of passes over `dataloader`.
    device: optional target device. When None, the model stays where it is
      and every batch is moved to the device of the model's parameters.

  Returns:
    list of float with one mean loss per epoch (nan for an epoch in which no
    batch could be used).
  """
  if device is None:
    # Follow the model instead of assuming CUDA, so CPU-only runs work and
    # inputs can never end up on a different device than the weights.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')
  else:
    model.to(device)

  optimizer = torch.optim.Adam(model.parameters(), lr=lr)
  model.train()
  history = []
  for _ in range(epoch):

    epoch_losses = []
    for batch_sequence in tqdm(dataloader):

      batch_sequence = batch_sequence.to(device)
      if batch_sequence.size(1) < 2:
        # A single-token sequence has no next token to predict.
        continue
      x = batch_sequence[:, :-1]
      y = batch_sequence[:, 1:] # [B, L]
      logits = model(x).permute(0, 2, 1) # [B, L, V] -> [B, V, L]
      j = loss_function(logits, y)

      # do gradient descent
      optimizer.zero_grad()
      j.backward()
      optimizer.step()

      epoch_losses.append(j.item())

    # Plain-Python mean: numpy is not imported in this notebook.
    if epoch_losses:
      history.append(sum(epoch_losses) / len(epoch_losses))
    else:
      history.append(float('nan'))

  return history

  

## Evaluate

In [None]:
def accuracy(model, testX, testy, device):
  """Return the fraction of samples whose arg-max prediction matches the label.

  Args:
    model: module mapping a float tensor of inputs to per-class scores on the
      last axis.
    testX: inputs convertible to a float32 tensor (nested list, array, tensor).
    testy: labels comparable element-wise with the numpy prediction array.
    device: device on which the forward pass is run.

  Returns:
    Number of matching predictions divided by the size of the first
    prediction axis.
  """
  testX = torch.as_tensor(testX, dtype=torch.float32).to(device)

  # Evaluate deterministically (no dropout / batch-norm updates) and without
  # building an autograd graph, then put the model back in its previous mode.
  was_training = model.training
  model.eval()
  try:
    with torch.no_grad():
      out = model(testX)
  finally:
    model.train(was_training)

  pred = out.argmax(-1).cpu().numpy() # one predicted class index per sample

  acc = (pred == testy).sum() / pred.shape[0] # boolean matches -> ratio

  return acc



## Do Training

In [None]:
# Toy input sequences and the tokenizer JSON file used for the training run below.
sequence = ['ACTG','GCTA']
tokenizer_file = 'monkeypox_tokenizer.json'

In [None]:
# Optimisation settings.
lr = 1e-4
# The value was missing here (a syntax error). One sequence per batch matches
# the demo loader above and avoids stacking token tensors of different lengths.
batch_size = 1
num_epochs = 5

# Sizes derived from the tokenizer of the `dataset` created in an earlier cell.
vocab_size = dataset.tokenizer.get_vocab_size()
pad_id = dataset.tokenizer.padding['pad_id']

# Model dimensions.
embedding_dim = 256
hidden_dim = 512
num_layers = 1

In [None]:
def pad_collate(batch):
  """Right-pad a list of 1-D id tensors with `pad_id` and stack them into [B, L].

  The default collate function can only stack tensors of equal length, which
  tokenized sequences are not guaranteed to be once batch_size > 1.
  """
  return nn.utils.rnn.pad_sequence(batch, batch_first=True, padding_value=pad_id)

model = RNNModel(vocab_size, embedding_dim, pad_id, hidden_dim, num_layers)
dataset = SequenceDataset(sequence, tokenizer_file)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=pad_collate)
# Targets equal to pad_id are excluded from the loss.
loss_function = nn.CrossEntropyLoss(ignore_index=pad_id)


In [None]:
# `num_epochs` is the name defined in the hyperparameter cell; `epoch` was undefined here.
loss_history = train(model, dataloader, loss_function, lr, num_epochs)