In [1]:
# Enable IPython's autoreload extension in mode 2 so edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Data

In [2]:
# Language pair in "<source>-<target>" form; also used as the WMT14 config name.
lang = 'cs-en'
# Split on the hyphen instead of slicing fixed offsets, so language codes of any
# length (not just two letters) are handled.
l1, l2 = lang.split('-')
l1,l2

('cs', 'en')

In [3]:
from datasets import load_dataset

# Load the WMT14 corpus for the selected language pair, caching files under ../src/data.
# NOTE: trust_remote_code=True allows the dataset's own loading script to run locally.
dataset = load_dataset(
    "wmt14",
    lang,
    cache_dir='../src/data',
    trust_remote_code=True,
)

In [4]:
# Show the available splits with their features and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 953621
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 3000
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 3003
    })
})

In [92]:
# Collect every distinct character that appears in any sentence -- all splits,
# every language side of each translation pair -- as a sorted list.
chars = sorted({
    ch
    for split in dataset
    for row in dataset[split]
    for sentence in row['translation'].values()
    for ch in sentence
})

In [93]:
# `vocab` was never defined in this notebook (NameError on a fresh kernel).
# Rebuild it so it matches the id table displayed for `itos`:
# ids 0-3 are the reserved tokens, followed by the sorted corpus characters.
SPECIAL_TOKENS = ['<s>', '<pad>', '</s>', '<unk>']
vocab = SPECIAL_TOKENS + chars

# token -> id and id -> token lookup tables
stoi = {c: i for i, c in enumerate(vocab)}
itos = dict(enumerate(vocab))


def encode(x):
    """Return the list of ids for the characters of ``x``.

    Raises KeyError for a character that is not in ``vocab``.
    """
    return [stoi[c] for c in x]


def decode(x):
    """Return the string formed by joining the tokens for the ids in ``x``."""
    return ''.join(itos[i] for i in x)

In [94]:
# Inspect the id -> token table (the reserved tokens occupy the first ids).
itos

{0: '<s>',
 1: '<pad>',
 2: '</s>',
 3: '<unk>',
 4: '\t',
 5: '\r',
 6: ' ',
 7: '!',
 8: '"',
 9: '#',
 10: '$',
 11: '%',
 12: '&',
 13: "'",
 14: '(',
 15: ')',
 16: '*',
 17: '+',
 18: ',',
 19: '-',
 20: '.',
 21: '/',
 22: '0',
 23: '1',
 24: '2',
 25: '3',
 26: '4',
 27: '5',
 28: '6',
 29: '7',
 30: '8',
 31: '9',
 32: ':',
 33: ';',
 34: '<',
 35: '=',
 36: '>',
 37: '?',
 38: '@',
 39: 'A',
 40: 'B',
 41: 'C',
 42: 'D',
 43: 'E',
 44: 'F',
 45: 'G',
 46: 'H',
 47: 'I',
 48: 'J',
 49: 'K',
 50: 'L',
 51: 'M',
 52: 'N',
 53: 'O',
 54: 'P',
 55: 'Q',
 56: 'R',
 57: 'S',
 58: 'T',
 59: 'U',
 60: 'V',
 61: 'W',
 62: 'X',
 63: 'Y',
 64: 'Z',
 65: '[',
 66: '\\',
 67: ']',
 68: '^',
 69: '_',
 70: '`',
 71: 'a',
 72: 'b',
 73: 'c',
 74: 'd',
 75: 'e',
 76: 'f',
 77: 'g',
 78: 'h',
 79: 'i',
 80: 'j',
 81: 'k',
 82: 'l',
 83: 'm',
 84: 'n',
 85: 'o',
 86: 'p',
 87: 'q',
 88: 'r',
 89: 's',
 90: 't',
 91: 'u',
 92: 'v',
 93: 'w',
 94: 'x',
 95: 'y',
 96: 'z',
 97: '{',
 98: '|',
 99:

# Model

In [5]:
import sys

# Make the project sources importable. Guard the append so that re-running this
# cell does not keep adding duplicate entries to sys.path.
if '../src' not in sys.path:
    sys.path.append('../src')

In [6]:
from utils import *
from hydra import compose, initialize
from omegaconf import OmegaConf
import hydra
import torch
from tqdm import tqdm

from utils import init_wandb, set_deterministic, get_dataloaders, get_dataset, get_device
from omegaconf import OmegaConf
from accelerate import Accelerator

In [7]:
from hydra.core.global_hydra import GlobalHydra

# Hydra keeps a single global instance per process. Clear it first so that
# re-running this cell does not fail with "GlobalHydra is already initialized".
GlobalHydra.instance().clear()
initialize(version_base=None, config_path="../src/conf", job_name="test_app")

hydra.initialize()

In [8]:
# Compose the "main" config, pointing the data directory at ../src/data,
# and echo the resolved configuration as YAML.
cfg = compose(
    config_name="main",
    overrides=['data.dir=../src/data'],
)
print(f"Hydra configuration:\n{OmegaConf.to_yaml(cfg)}")

Hydra configuration:
data:
  name: wmt14
  lang: cs-en
  dir: ../src/data
wandb:
  entity: crutch
  project: Data Augmentation for Neural Machine Translation
emb_size: 512
src_vocab_size: null
tgt_vocab_size: null
batch_size: 32
max_length: 512
lr: 0.001
epochs: 100
dropout: 0.1
name: null
group: null
seed: 42
device: null
tokenizer:
  _target_: tokenizer.character.CharacterTokenizer
model:
  _target_: model.seq2seq.Seq2Seq
  transformer:
    _target_: torch.nn.Transformer
    d_model: ${emb_size}
    dim_feedforward: 515
    nhead: 8
    num_encoder_layers: 3
    num_decoder_layers: 3
    dropout: ${dropout}
  src_tok_emb:
    _target_: model.utils.embedding.TokenEmbedding
    vocab_size: ${src_vocab_size}
    emb_size: ${emb_size}
  tgt_tok_emb:
    _target_: model.utils.embedding.TokenEmbedding
    vocab_size: ${tgt_vocab_size}
    emb_size: ${emb_size}
  positional_encoding:
    _target_: model.utils.positional.PositionalEncoding
    emb_size: ${emb_size}
    dropout: ${dropout}
  

In [9]:
# Runs first with the configured seed; presumably seeds the RNGs so the setup
# below is reproducible -- verify in utils.set_deterministic.
set_deterministic(cfg.seed)
# Accelerator settings: no mixed precision, no gradient accumulation,
# experiment tracking through Weights & Biases.
accelerator = Accelerator(
    mixed_precision="no",
    gradient_accumulation_steps=1,
    log_with="wandb",
    # logging_dir="logs" # unexpected argument?
    # NOTE(review): newer accelerate releases appear to expect `project_dir`
    # instead of `logging_dir` -- confirm against the installed version.
)
device = get_device(cfg)
# NOTE: rebinds `dataset`, replacing the object loaded with load_dataset earlier
# in this notebook.
dataset = get_dataset(cfg)

In [10]:
# Build the tokenizer named in the config, handing it the dataset.
tokenizer = hydra.utils.instantiate(cfg.tokenizer, dataset=dataset)
# Source and target sides share the tokenizer's vocabulary size.
cfg.src_vocab_size = cfg.tgt_vocab_size = tokenizer.vocab_size
print(f"Tokenizer:\n{tokenizer}")

# Create the train/validation/test loaders and let the accelerator wrap them.
train_loader, val_loader, test_loader = accelerator.prepare(
    *get_dataloaders(cfg, tokenizer, dataset)
)

Tokenizer:
<tokenizer.character.CharacterTokenizer object at 0x294dc6880>
DataLoaders are set up with the following configurations:
Train: samples=953621  batches=29801  
Valid: samples=3000    batches=94     
Test:  samples=3003    batches=94     


In [11]:
# Pull a single batch to check what the training loader yields.
next(iter(train_loader))

(tensor([[   0, 1689, 1471,  ...,    1,    1,    1],
         [   0, 1455,  470,  ...,    1,    1,    1],
         [   0, 1455, 1755,  ...,    1,    1,    1],
         ...,
         [   0, 1809,  499,  ...,    1,    1,    1],
         [   0, 1514, 1368,  ...,    1,    1,    1],
         [   0, 1711,  499,  ...,    1,    1,    1]], device='mps:0'),
 tensor([[   0, 1472, 1084,  ...,    1,    1,    1],
         [   0, 1514, 1368,  ...,    1,    1,    1],
         [   0, 1518,  782,  ...,    1,    1,    1],
         ...,
         [   0, 1809,  499,  ...,    1,    1,    1],
         [   0, 1472,  609,  ...,    1,    1,    1],
         [   0, 1711,  499,  ...,    1,    1,    1]], device='mps:0'))

## Pretrained model (XLM-RoBERTa)

In [17]:
import torch
from transformers import XLMRobertaTokenizer, XLMRobertaModel

In [18]:
# Load the pretrained XLM-RoBERTa tokenizer and model from the same checkpoint.
# NOTE: this rebinds `tokenizer`, replacing the one instantiated from
# cfg.tokenizer earlier in the notebook.
checkpoint = "FacebookAI/xlm-roberta-base"
tokenizer = XLMRobertaTokenizer.from_pretrained(checkpoint)
model = XLMRobertaModel.from_pretrained(checkpoint)

# Encode a lone "<mask>" and wrap it in an outer list to get a batch of size 1.
mask_ids = tokenizer.encode("<mask>")
input_ids = torch.tensor([mask_ids])
outputs = model(input_ids)

In [19]:
# Decoding check: with skip_special_tokens=True the ids 0, 1 and 2 contribute no
# text, so only the two occurrences of id 200 are rendered.
tokenizer.decode([0,1,1,1,1,200,200,2],skip_special_tokens=True)

'ki ki'

In [None]:
# Look up the ids the tokenizer produces for the literal "<pad>" string.
# (A stray `k` after the string literal made this cell a SyntaxError.)
tokenizer.encode("<pad>")

In [None]:
# Rebuild the loaders with the tokenizer currently bound to `tokenizer`.
# `dataset` is passed explicitly to match the three-argument call that was
# executed successfully earlier in this notebook.
train_loader, val_loader, test_loader = get_dataloaders(cfg, tokenizer, dataset)

In [None]:
# Print only the first batch of the training loader (nothing if it is empty).
_missing = object()
first_batch = next(iter(train_loader), _missing)
if first_batch is not _missing:
    print(first_batch)

In [None]:
# One pass over the training set, accumulating the summed per-batch loss.
# '<pad>' has id 1 in the vocabulary table shown earlier, and the sample batch
# is right-padded with 1.
PAD_ID = 1

epoch_loss = 0
# NOTE(review): the configuration printed earlier has no `optimizer` section.
# Add one (e.g. `_target_: torch.optim.Adam`, `lr: ${lr}`) or this call fails.
optimizer = hydra.utils.instantiate(cfg.optimizer,params=model.parameters())
# Token-level cross entropy that skips padding positions.
# (Was an incomplete `criterion = ` assignment, i.e. a SyntaxError.)
criterion = torch.nn.CrossEntropyLoss(ignore_index=PAD_ID)
model.train()
# Each batch is a (source, target) pair; the names avoid shadowing builtin `input`.
for src, tgt in train_loader:
    optimizer.zero_grad()
    logits = model(src)

    # Targets come from the unpacked pair (a tuple batch has no `.label`).
    # assumes `model` returns (batch, seq, vocab) logits aligned with `tgt` -- TODO confirm
    loss = criterion(logits.reshape(-1, logits.size(-1)), tgt.reshape(-1))
    # if regularizer is not None:
    #     loss += regularizer(model)
    # Gradients must be computed, otherwise optimizer.step() updates nothing.
    loss.backward()
    # if grad_clip_threshold is not None:
    #     torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip_threshold)
    optimizer.step()
    epoch_loss += loss.item()