In [1]:
import torch
import torch.nn as nn

In [53]:
# Toy integer tensor with 2 batches, each holding 3 rows of 4 values.
x = torch.tensor([
    [[0, 5, 1, 3], [7, 9, 9, 4], [9, 6, 3, 4]],
    [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]],
])
x

tensor([[[ 0,  5,  1,  3],
         [ 7,  9,  9,  4],
         [ 9,  6,  3,  4]],

        [[ 1,  2,  3,  4],
         [ 5,  6,  7,  8],
         [ 9, 10, 11, 12]]])

In [62]:
# Confirm the dimensions of x: 2 batches x 3 rows x 4 columns.
x.shape

torch.Size([2, 3, 4])

In [61]:
# Swap dimensions 1 and 2: each 3x4 batch is displayed as its 4x3 transpose.
x.transpose(1,2)

tensor([[[ 0,  7,  9],
         [ 5,  9,  6],
         [ 1,  9,  3],
         [ 3,  4,  4]],

        [[ 1,  5,  9],
         [ 2,  6, 10],
         [ 3,  7, 11],
         [ 4,  8, 12]]])

In [57]:
# Mean over the last dimension; the integer tensor is cast to float first and
# keepdim=True leaves the reduced axis in place with size 1.
m = torch.mean(x.float(), dim=-1, keepdim=True)
m

tensor([[[ 2.2500],
         [ 7.2500],
         [ 5.5000]],

        [[ 2.5000],
         [ 6.5000],
         [10.5000]]])

In [23]:
# 1-D tensor; indexing with None at position 1 inserts a trailing axis,
# turning the 4 values into a 4x1 column.
y = torch.tensor([0, 5, 1, 3])
y[:, None]

tensor([[0],
        [5],
        [1],
        [3]])

In [64]:
from datasets import load_dataset
# Fetch the English-French configuration of opus_books (train split only).
ds_raw = load_dataset(path='opus_books', name='en-fr', split='train')

README.md:   0%|          | 0.00/28.1k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/127085 [00:00<?, ? examples/s]

In [65]:
# Display the dataset summary (feature names and row count).
ds_raw

Dataset({
    features: ['id', 'translation'],
    num_rows: 127085
})

In [71]:
# Peek at the first record to see the row layout: an 'id' plus a
# 'translation' dict keyed by language code.
ds_raw[0]

{'id': '0', 'translation': {'en': 'The Wanderer', 'fr': 'Le grand Meaulnes'}}

In [72]:
# Rebind ds_raw to its "torch"-formatted variant.
# NOTE(review): presumably makes rows come back as torch tensors where
# possible — confirm against the datasets documentation.
ds_raw = ds_raw.with_format("torch")

In [75]:
# Re-display the summary after the format change.
ds_raw

Dataset({
    features: ['id', 'translation'],
    num_rows: 127085
})

In [3]:
# bool is an int subclass in Python, so two equal flags differ by 0.
left, right = True, True

print(abs(left - right))

0


In [6]:
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers, processors
from tokenizers.normalizers import Lowercase, StripAccents
from tokenizers import normalizers
from pathlib import Path

def _iter_sentences(ds, lang):
    """Yield the ``lang`` side of every translation pair in ``ds``.

    Each row is expected to look like ``{'translation': {lang: text, ...}}``,
    which is the record layout shown for the opus_books rows in this notebook.
    """
    for item in ds:
        yield item['translation'][lang]


def get_build_tokenizer(config, ds, lang):
    """Load the BPE tokenizer for ``lang`` from disk, training it first if needed.

    Args:
        config: Mapping with a ``'tokenizer_file'`` entry: a path pattern whose
            ``{0}`` placeholder is filled with ``lang``.
        ds: Iterable of rows shaped like ``{'translation': {lang: text}}``.
            Only read when the tokenizer file does not exist yet.
        lang: Language code selecting both the file name and the text column.

    Returns:
        The ``Tokenizer`` read from the existing file, or a newly trained one
        that has also been saved to that file.
    """
    tokenizer_path = Path(config['tokenizer_file'].format(lang))

    # Reuse a previously trained tokenizer so repeated runs skip training.
    if tokenizer_path.exists():
        return Tokenizer.from_file(str(tokenizer_path))

    tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
    trainer = trainers.BpeTrainer(special_tokens=["[UNK]", "[PAD]", "[SOS]", "[EOS]"])
    tokenizer.train_from_iterator(_iter_sentences(ds, lang), trainer=trainer)

    # Make sure the target folder exists when the pattern includes one.
    tokenizer_path.parent.mkdir(parents=True, exist_ok=True)
    tokenizer.save(str(tokenizer_path))
    return tokenizer



In [7]:
def get_config():
    """Return a fresh dict of the settings used throughout this notebook.

    The entries are grouped below by purpose and merged in a fixed order;
    a new dict is created on every call.
    """
    training = {
        "batch_size": 8,
        "num_epochs": 20,
        "lr": 10**-4,
        "seq_len": 350,
        "d_model": 512,
    }
    data = {
        "datasource": 'opus_books',
        "lang_src": "en",
        "lang_tgt": "fr",
    }
    artifacts = {
        "model_folder": "weights",
        "model_basename": "tmodel_",
        "preload": "latest",
        # "{0}" is a str.format placeholder in the tokenizer file name.
        "tokenizer_file": "tokenizer_{0}.json",
        "experiment_name": "runs/tmodel",
    }
    return {**training, **data, **artifacts}

In [8]:
# Materialize the settings dict once; later cells read from this object.
config = get_config()

In [9]:
from datasets import load_dataset
# Load the language pair named in the config, then obtain one tokenizer per
# side (source first, then target).
ds_raw = load_dataset(
    'opus_books',
    '{}-{}'.format(config["lang_src"], config["lang_tgt"]),
    split='train',
)
tokenizer_src, tokenizer_tgt = (
    get_build_tokenizer(config, ds_raw, config[lang_key])
    for lang_key in ('lang_src', 'lang_tgt')
)

In [15]:
from dataset import BilingualDataset
# Wrap the raw rows and both tokenizers in the project's dataset class.
ds = BilingualDataset(
    ds_raw,
    tokenizer_src,
    tokenizer_tgt,
    config['lang_src'],
    config['lang_tgt'],
    config['seq_len'],
)

In [21]:
# Show the source-tokenizer ids for the 'src_text' field of the first five items.
for i in range(5):
    print(tokenizer_src.encode(ds[i]['src_text']).ids)

[226, 56, 9708, 1877]
[34, 9281, 16, 6020, 3553]
[4750, 12113]
[42]
[2290, 10334, 3229, 12379]
