In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import math
import copy
import pandas as pd
import sentencepiece as spm
import os
import random
from torch.optim.lr_scheduler import LambdaLR

In [2]:
from google.colab import drive
drive.mount('/content/drive')

ModuleNotFoundError: No module named 'google.colab'

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

Using device: cuda


In [3]:
# filepath='./drive/MyDrive/Colab Notebooks/data.csv'
filepath='./data/data.csv'
data=pd.read_csv(filepath)

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Unnamed: 0           50000 non-null  int64  
 1   idx                  50000 non-null  int64  
 2   original             50000 non-null  object 
 3   en                   50000 non-null  object 
 4   ru                   50000 non-null  object 
 5   chrf_sim             50000 non-null  float64
 6   labse_sim            50000 non-null  float64
 7   forward_entailment   50000 non-null  float64
 8   backward_entailment  50000 non-null  float64
 9   p_good               50000 non-null  float64
dtypes: float64(5), int64(2), object(3)
memory usage: 3.8+ MB


In [5]:
data['en'].to_csv('en.txt', header=False, index=False)

In [6]:
data['ru'].to_csv('ru.txt', header=False, index=False)

In [7]:
UNK_IDX, BOS_IDX, EOS_IDX, PAD_IDX = 0, 1, 2, 3

In [8]:
vocab_src = 20000
vocab_tgt = 20000
options1 = dict(
  # input spec
  input="en.txt",
  input_format="text",
  # output spec
  model_prefix="en", # output filename prefix
  # algorithm spec
  # BPE alg
  model_type="bpe",
  vocab_size=vocab_src,
  # normalization
  normalization_rule_name="identity", # ew, turn off normalization
  remove_extra_whitespaces=False,
  input_sentence_size=200000000, # max number of training sentences
  max_sentence_length=4192, # max number of bytes per sentence
  seed_sentencepiece_size=1000000,
  shuffle_input_sentence=True,
  # rare word treatment
  character_coverage=0.99995,
  byte_fallback=True,
  # merge rules
  split_digits=True,
  split_by_unicode_script=True,
  split_by_whitespace=True,
  split_by_number=True,
  max_sentencepiece_length=16,
  add_dummy_prefix=True,
  allow_whitespace_only_pieces=True,
  # special tokens
  unk_id=UNK_IDX, # the UNK token MUST exist
  bos_id=BOS_IDX, # the others are optional, set to -1 to turn off
  eos_id=EOS_IDX,
  pad_id=PAD_IDX,
  # systems
  num_threads=os.cpu_count(), # use ~all system resources
)

options2 = dict(
  # input spec
  input="ru.txt",
  input_format="text",
  # output spec
  model_prefix="ru", # output filename prefix
  # algorithm spec
  # BPE alg
  model_type="bpe",
  vocab_size=vocab_tgt,
  # normalization
  normalization_rule_name="identity", # ew, turn off normalization
  remove_extra_whitespaces=False,
  input_sentence_size=200000000, # max number of training sentences
  max_sentence_length=4192, # max number of bytes per sentence
  seed_sentencepiece_size=1000000,
  shuffle_input_sentence=True,
  # rare word treatment
  character_coverage=0.99995,
  byte_fallback=True,
  # merge rules
  split_digits=True,
  split_by_unicode_script=True,
  split_by_whitespace=True,
  split_by_number=True,
  max_sentencepiece_length=16,
  add_dummy_prefix=True,
  allow_whitespace_only_pieces=True,
  # special tokens
  unk_id=UNK_IDX, # the UNK token MUST exist
  bos_id=BOS_IDX, # the others are optional, set to -1 to turn off
  eos_id=EOS_IDX,
  pad_id=PAD_IDX,
  # systems
  num_threads=os.cpu_count(), # use ~all system resources
)

spm.SentencePieceTrainer.train(**options1)
spm.SentencePieceTrainer.train(**options2)

In [9]:
sp_src = spm.SentencePieceProcessor()
sp_tgt = spm.SentencePieceProcessor()
sp_src.load('en.model')
sp_tgt.load('ru.model')

True

In [10]:
def encode(text, sp):
    return torch.tensor(sp.encode(text)).to(device)
def process_text(text, sp):
    return torch.cat((torch.tensor([BOS_IDX]),
                      torch.tensor(sp.encode(text)),
                      torch.tensor([EOS_IDX]))).to(device)
def to_device(t):
    return t.to(device)

In [11]:
data_ = data[:50000]

In [12]:

src = data_['en'].apply(process_text, sp=sp_src)
tgt = data_['ru'].apply(process_text, sp=sp_tgt)

In [13]:
tgt_src = pd.Series([t[:-1].cpu() for t in tgt])
tgt_src = tgt_src.apply(to_device)

In [14]:
tgt_tgt = pd.Series([t[1:].cpu() for t in tgt])
tgt_tgt = tgt_tgt.apply(to_device)

In [15]:
st_ = {
    'src':src,
    'tgt_src':tgt_src,
    'tgt_tgt':tgt_tgt
}

src_tgt_ = pd.concat(st_, axis=1)
print(src_tgt_.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   src      50000 non-null  object
 1   tgt_src  50000 non-null  object
 2   tgt_tgt  50000 non-null  object
dtypes: object(3)
memory usage: 1.1+ MB
None


In [16]:
max_seq_len = 100

In [17]:
src_tgt = src_tgt_[(src_tgt_['src'].apply(lambda x: x.numel() <= max_seq_len))
    & (src_tgt_['tgt_src'].apply(lambda x: x.numel() <= max_seq_len))
    & (src_tgt_['tgt_tgt'].apply(lambda x: x.numel() <= max_seq_len))]

In [18]:
src_tgt.info()

<class 'pandas.core.frame.DataFrame'>
Index: 49988 entries, 0 to 49999
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   src      49988 non-null  object
 1   tgt_src  49988 non-null  object
 2   tgt_tgt  49988 non-null  object
dtypes: object(3)
memory usage: 1.5+ MB


In [19]:
def pad_seq(seq):
    seq_padded = F.pad(seq, (0, max_seq_len - len(seq)), 'constant', PAD_IDX)
    return seq_padded

In [20]:
def apply_pad_to_df(series):
    series = series.apply(pad_seq)
    return series

In [21]:
data_prepared = src_tgt.apply(apply_pad_to_df)

In [22]:
data_length = len(data_prepared)

In [23]:
train_data = data_prepared[:int(0.9*data_length)]
val_data = data_prepared[int(0.9*data_length):]

In [24]:
class Head(nn.Module):
    #one head of self-attention
    def __init__(self, d_model, head_size):
        super().__init__()
        self.key_linear = nn.Linear(d_model, head_size, bias=False)
        self.query_linear = nn.Linear(d_model, head_size, bias=False)
        self.value_linear = nn.Linear(d_model, head_size, bias=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, query, key, value, mask=None):

        query = self.query_linear(query)
        key = self.key_linear(key)
        value = self.value_linear(value)

        #compute attention scores
        wei = query @ key.transpose(-2, -1) * key.shape[-1] **-0.5 #(B,T,hs) @ (B,hs,T) -> (B,T,T)
        if mask is not None:
            wei = wei.masked_fill(mask == 0, float('-1e9')) # (B,T,T)
        wei = F.softmax(wei, dim=-1) #(B,T,T)
        wei = self.dropout(wei)
        #perform the weighted agregation of the values
        out = wei @ value
        return out


class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(d_model, head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)
    def forward(self, query, key, value, mask=None):
        out = torch.cat([h(query, key, value, mask) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedForward(nn.Module):
    def __init__(self, d_model):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(d_model, 4*d_model),
            nn.ReLU(),
            nn.Linear(4*d_model, d_model),
            nn.Dropout(dropout)
        )
    def forward(self, x):
        return self.net(x)

class EncoderLayer(nn.Module):
    def __init__(self, d_model, n_head):
        super().__init__()
        head_size = d_model // n_head
        # communication
        self.sa = MultiHeadAttention(d_model, n_head, head_size)
        #computation
        self.ffwd = FeedForward(d_model)

        self.ln1 = nn.LayerNorm(d_model)
        self.ln2 = nn.LayerNorm(d_model)
    def forward(self, src, mask=None):
        x = self.ln1(src)
        src = src + self.sa(x, x, x, mask)
        x = self.ln2(src)
        out = src + self.ffwd(x)
        return out

class DecoderLayer(nn.Module):
    def __init__(self, d_model, n_head):
        super().__init__()
        head_size = d_model // n_head
        # communication
        self.sa1 = MultiHeadAttention(d_model, n_head, head_size)

        self.sa2 = MultiHeadAttention(d_model, n_head, head_size)
        #computation
        self.ffwd = FeedForward(d_model)

        self.ln1 = nn.LayerNorm(d_model)
        self.ln2 = nn.LayerNorm(d_model)
        self.ln3 = nn.LayerNorm(d_model)
    def forward(self, tgt, enc_out, tgt_mask=None):

        x = self.ln1(tgt)
        tgt = tgt + self.sa1(x, x, x, tgt_mask)

        x = tgt + self.sa2(self.ln2(tgt), enc_out, enc_out)

        out = x + self.ffwd(self.ln3(x))
        return out
        
def sin_pos_enc(seq_length, d_model):
    position = torch.arange(seq_length).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000) / d_model))
    pos_enc = torch.zeros(seq_length, d_model)
    pos_enc[:, 0::2] = torch.sin(position * div_term)
    pos_enc[:, 1::2] = torch.cos(position * div_term)
    return pos_enc
    
class Encoder(nn.Module):
    def __init__(self, vocab_size = 2000, d_model= 64, n_head = 8, n_layer=6):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, d_model)
        # self.position_embedding = nn.Embedding(block_size, d_model)

        position = torch.arange(block_size).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000) / d_model))
        pos_enc = torch.zeros(block_size, d_model)
        pos_enc[:, 0::2] = torch.sin(position * div_term)
        pos_enc[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe',pos_enc)
        
        # self.layers = nn.Sequential(*[EncoderLayer(d_model, n_head) for _ in range(n_layer)])
        self.layers = nn.ModuleList([
            EncoderLayer(d_model, n_head)
            for _ in range(n_layer)
        ])


        self.ln_f = nn.LayerNorm(d_model)

    def make_src_mask(self, src):
        src_mask = (src != PAD_IDX).unsqueeze(1)
        return src_mask.to(device)

    def forward(self, src):
        B, T = src.shape
        src_mask = self.make_src_mask(src)

        tok_emb = self.token_embedding(src)
        # pos_emb = self.position_embedding(torch.arange(T, device=device))
        # x = tok_emb + pos_emb
        x = tok_emb + self.pe[:T]
        # x = self.layers(x, src_mask)
        for layer in self.layers:
            x = layer(x, src_mask)
        output = self.ln_f(x)
        return output.to(device)

class Decoder(nn.Module):
    def __init__(self, vocab_size = 2000, d_model= 64, n_head = 8, n_layer=6):
        super().__init__()

        self.token_embedding = nn.Embedding(vocab_size, d_model)
        # self.position_embedding = nn.Embedding(block_size, d_model)
        position = torch.arange(block_size).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000) / d_model))
        pos_enc = torch.zeros(block_size, d_model)
        pos_enc[:, 0::2] = torch.sin(position * div_term)
        pos_enc[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe',pos_enc)

        # self.layers = nn.Sequential(*[DecoderLayer(d_model, n_head) for _ in range(n_layer)])
        self.layers = nn.ModuleList([
            DecoderLayer(d_model, n_head)
            for _ in range(n_layer)
        ])
        self.ln_f = nn.LayerNorm(d_model)

        self.linear = nn.Linear(d_model, vocab_size)

    def make_tgt_mask(self, tgt):
        N, tgt_len = tgt.shape
        tgt_mask = torch.tril(torch.ones((tgt_len, tgt_len), device=device))
        tgt_padding_mask = (tgt != PAD_IDX).unsqueeze(1)
        tgt_mask = tgt_mask.to(torch.int32)
        tgt_mask = tgt_mask & tgt_padding_mask
        return tgt_mask

    def forward(self, tgt, enc_out):
        B, T = tgt.shape
        tgt_mask = self.make_tgt_mask(tgt)
        tok_emb = self.token_embedding(tgt)
        # pos_emb = self.position_embedding(torch.arange(T, device=device))
        # x = tok_emb + pos_emb
        x = tok_emb + self.pe[:T]
        # x = self.layers(x, enc_out, tgt_mask)
        for layer in self.layers:
            x = layer(x, enc_out, tgt_mask)

        x = self.ln_f(x)
        logits = self.linear(x)

        return logits
class Transformer(nn.Module):
    def __init__(self, enc_vocab_size, dec_vocab_size, d_model = 64, n_head=8, n_layer=6):
        super().__init__()
        self.encoder = Encoder(enc_vocab_size, d_model, n_head, n_layer)
        self.decoder = Decoder(dec_vocab_size, d_model, n_head, n_layer)

    def forward(self, src, tgt):
        enc_out = self.encoder(src)
        out = self.decoder(tgt, enc_out)
        return out



In [50]:
# batch_size = 64
# block_size = max_seq_len
# dropout= 0.1
# d_model = 512
# n_head = 8
# n_layer = 6
# learning_rate = 3e-4
# max_iters = 20000
batch_size = 8
block_size = max_seq_len
dropout= 0.1
d_model = 16
n_head = 4
n_layer = 3
learning_rate = 1e-3
max_iters = 1000
warmup_steps = int(max_iters * 0.08)
model = Transformer(vocab_src, vocab_tgt, d_model, n_head, n_layer).to(device)

In [25]:
def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = [random.randint(0, len(data)-1) for _ in range(batch_size)]
    x = torch.stack([data['src'].iloc[i] for i in ix]).to(device)
    y_src = torch.stack([data['tgt_src'].iloc[i] for i in ix]).to(device)
    y_tgt = torch.stack([data['tgt_tgt'].iloc[i] for i in ix]).to(device)
    return x, y_src, y_tgt

In [26]:
def lr_lambda(current_step):
    if current_step < warmup_steps:
        return float(current_step) / float(max(1, warmup_steps))  # Linear warm-up
    return max(0.0, (max_iters - current_step) / (max_iters - warmup_steps))

In [27]:
def train(model, learning_rate = 1e-3, max_iters=3000):
    model.train()
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

    scheduler = LambdaLR(optimizer, lr_lambda)
    for iter in range(max_iters):
        xb, yb_src, yb_tgt = get_batch('train')
        logits = model(xb, yb_src)
        # print(logits)
        B,T,C = logits.shape
        logits = logits.view(B*T,C)
        yb_tgt = yb_tgt.view(B*T)
        loss = criterion(logits, yb_tgt)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()
        if iter%100 == 0:
            current_lr = optimizer.param_groups[0]['lr']
            print(f"iter: {iter}, loss: {loss.item():.4f}, lr: {current_lr}")
    return model

In [54]:
model = train(model, learning_rate=learning_rate, max_iters=max_iters)

iter: 0, loss: 8.0498, lr: 1.25e-05
iter: 100, loss: 4.7974, lr: 0.0009771739130434783
iter: 200, loss: 3.2522, lr: 0.0008684782608695653
iter: 300, loss: 2.5951, lr: 0.0007597826086956522
iter: 400, loss: 1.7517, lr: 0.0006510869565217391
iter: 500, loss: 2.0149, lr: 0.0005423913043478262
iter: 600, loss: 2.2431, lr: 0.0004336956521739131
iter: 700, loss: 2.2962, lr: 0.00032500000000000004
iter: 800, loss: 2.0313, lr: 0.00021630434782608695
iter: 900, loss: 2.6324, lr: 0.00010760869565217392


In [28]:
def generate(model, src, max_tokens = 50):
    model.eval()
    tgt = torch.tensor([BOS_IDX]).to(device).unsqueeze(0).long()
    for _ in range(max_tokens):
        logits = model(src, tgt)
        #focus only on the last time step
        logits = logits[:, -1, :] #becomes (B,C)
        #apply softmax to get probabilities
        probs = F.softmax(logits, dim=-1) #(B,C)
        #sample from the distribution
        token_next = torch.multinomial(probs, num_samples=1)
        # token_next = torch.argmax(probs)
        #append sampled index to the running sequence
        tgt = torch.cat((tgt, token_next), dim = 1) # (B, T+1)
        if token_next == EOS_IDX:
            break

    return tgt

In [29]:
def gen(text, model, sp_src, sp_tgt, max_tokens = 50):
    src = pad_seq(process_text(text, sp_src)).view(1, -1)
    tgt = generate(model, src, max_tokens)
    return sp_tgt.decode(tgt.tolist())

In [48]:
text='What is love?'
# text='hello world'
res = gen(text, model, sp_src, sp_tgt, 50)
print(res)

NameError: name 'model' is not defined

In [None]:
# save model
path = f"./transformer_50k_v1_b{batch_size}_t{block_size}_dm{d_model}_nh{n_head}_nl{n_layer}_lr{learning_rate}_mi{max_iters}.pt"
torch.save(model.state_dict(), path)

In [None]:
path_test= f"./drive/MyDrive/Colab Notebooks/transformer_test.pt"
torch.save(model_test_save.state_dict(), path_test)

In [39]:
#load model
batch_size = 64
block_size = max_seq_len
dropout= 0.1
d_model = 512
n_head = 4
n_layer = 4
learning_rate = 3e-4
max_iters = 20000
# path_load= f"./drive/MyDrive/Colab Notebooks/transformer_50k_v1_b{batch_size}_t{block_size}_dm{d_model}_nh{n_head}_nl{n_layer}_lr{learning_rate}_mi{max_iters}.pt"
# path_load = './models/transformer_50k_v1_3_3_b64_t100_dm512_nh4_nl4_lr0.0003_wms1600_mi20000.pt'
path_load = './models/transformer_50k_v1_2_b64_t100_dm512_nh4_nl4_lr0.0003_wms1600_mi20000.pt'
model_loaded = Transformer(vocab_src, vocab_tgt, d_model, n_head, n_layer).to(device)
model_loaded.load_state_dict(torch.load(path_load, weights_only=True))

<All keys matched successfully>

In [42]:
text='What is war?'
res = gen(text, model_loaded, sp_src, sp_tgt, 50)
print(res)

['Что же война?']


In [43]:
def generate2(model, src, max_tokens = 50, batch_size=8):
    model.eval()
    # tgt = torch.tensor([BOS_IDX]).unsqueeze(0).long().to(device)
    tgt = torch.full((batch_size,1), BOS_IDX).to(device)
    for _ in range(max_tokens):
        logits = model(src, tgt)
        #focus only on the last time step
        logits = logits[:, -1, :] #becomes (B,C)
        #apply softmax to get probabilities
        probs = F.softmax(logits, dim=-1) #(B,C)
        #sample from the distribution
        token_next = torch.multinomial(probs, num_samples=1)
        # token_next = torch.argmax(probs)
        #append sampled index to the running sequence
        tgt = torch.cat((tgt, token_next), dim = 1) # (B, T+1)
    return tgt
def gen2(text, model, sp_src, sp_tgt, max_tokens = 50, batch_size = 8):
    src = pad_seq(process_text(text, sp_src))
    src_batches = torch.stack([src for _ in range(batch_size)]).to(device)
    tgt = generate2(model, src_batches, max_tokens, batch_size)
    for row in tgt:
        idx = torch.where(row == EOS_IDX)[0]
        if len(idx) > 0:
            row[idx[0] + 1:] = PAD_IDX
    return sp_tgt.decode(tgt.tolist())

In [44]:
k = 20
text = data['en'].iloc[k]
print('На тренировачных данных: ')
print('Английский: ', text)
print('Действительный перевод: ', data['ru'].iloc[k])
print('Перевод модели:')
res = gen2(text, model_loaded, sp_src, sp_tgt, 50, batch_size=16)
for i in range(len(res)):
    print(f'{i+1})',res[i])

На тренировачных данных: 
Английский:  We have an average of 6% of such errors.
Действительный перевод:  У нас средний показатель таких ошибок составляет 6%.
Перевод модели:
1) У нас средний показатель таких ошибок составляетмарПредо. послать.
2) У нас средний показатель таких ошибок составляет 6%.
3) У нас средний показатель такого ошибок составляет 6%.
4) У нас средний показатель таких ошибок составляет 6% этой ошибки.
5) У нас средний показатель таких ошибок составляет 6%.
6) У нас средний показатель таких ошибок составляет 6 исповеИдея.
7) У нас средний показатель таких ошибок составляет 6% этой статьи.
8) У нас средний показатель таких ошибок составляет 6 процентов.
9) У нас средний показатель таких ошибок составляет 6%.
10) У нас средний показатель таких ошибок известна.
11) У нас средний показатель таких ошибок составляет 6%.
12) У нас средний показатель таких ошибок периода ждут подобных ошибок.
13) У нас средний показатель таких ошибок составляет 6%центра.
14) У нас средний по

In [45]:
k = 48000
text = data['en'].iloc[k]
print('На данных проверки: ')
print('Английский: ', text)
print('Действительный перевод: ', data['ru'].iloc[k])
print('Перевод модели:')
res = gen2(text, model_loaded, sp_src, sp_tgt, 50, batch_size=16)
for i in range(len(res)):
    print(f'{i+1})',res[i])

На данных проверки: 
Английский:  And in winter - because of shoes, a lot of complaints about the quality and reagents that spoil footwear - we can't help here, unfortunately.
Действительный перевод:  А в зимний период - из ‑ за обуви и множества жалоб на ее качество и вредные вещества, которые портят обувь, - мы здесь, увы, ничем помочь не можем.
Перевод модели:
1) департамент- знаний, потому что обувиaster + много вечера - это много отображает своих трассах гибкого отметил - нельзя удобнее нельзя, увы произойти.
2) А за " экскурсии" - потому что с водители много плохих и их ребенок, которые пор заметил тудавешивает в том, что ониторах - не могут помочьтельный.
3) А у Зимой - ведь подразделений, много�ходные жалобы и жидкостей, которые пор произведения, гиподите должныор покраститься - мы не можем понять, увы.
4) серьёзно называют выставок, потому что обуви многония жалоб обозначающие ваших пространств и обработки ее невозможно, которые порвать - мы можем сержусь.
5) А передний период

In [46]:
def evaluate(model, max_iters=50):
    model.eval()
    val_loss = 0
    criterion = nn.CrossEntropyLoss()
    for iter in range(max_iters):
      xb, yb_src, yb_tgt = get_batch('val')
      logits = model(xb, yb_src)
      B,T,C = logits.shape
      logits = logits.view(B*T,C)
      yb_tgt = yb_tgt.view(B*T)
      loss = criterion(logits, yb_tgt)
      print(f'iter: {iter}, loss:{loss.item():.4f}')
      val_loss += loss.item()
    return val_loss/max_iters

In [51]:
print(evaluate(model_loaded))

iter: 0, loss:1.0090
iter: 1, loss:1.1734
iter: 2, loss:1.0213
iter: 3, loss:1.1277
iter: 4, loss:0.9835
iter: 5, loss:1.0779
iter: 6, loss:1.1955
iter: 7, loss:1.1396
iter: 8, loss:0.9618
iter: 9, loss:1.0927
iter: 10, loss:1.0368
iter: 11, loss:1.1884
iter: 12, loss:1.2688
iter: 13, loss:1.1463
iter: 14, loss:1.2642
iter: 15, loss:1.0025
iter: 16, loss:1.1225
iter: 17, loss:1.1225
iter: 18, loss:1.0859
iter: 19, loss:1.2552
iter: 20, loss:1.0194
iter: 21, loss:1.1958
iter: 22, loss:1.2199
iter: 23, loss:1.2517
iter: 24, loss:1.1477
iter: 25, loss:0.9847
iter: 26, loss:1.0515
iter: 27, loss:1.0184
iter: 28, loss:1.0359
iter: 29, loss:1.0765
iter: 30, loss:0.9663
iter: 31, loss:1.1760
iter: 32, loss:0.9498
iter: 33, loss:1.0033
iter: 34, loss:1.1084
iter: 35, loss:1.2245
iter: 36, loss:1.2702
iter: 37, loss:1.1871
iter: 38, loss:1.0310
iter: 39, loss:1.2082
iter: 40, loss:0.9992
iter: 41, loss:1.1739
iter: 42, loss:1.0968
iter: 43, loss:1.0094
iter: 44, loss:0.9874
iter: 45, loss:0.999

In [52]:
def evaluate_training_data(model, max_iters=50):
    model.eval()
    val_loss = 0
    criterion = nn.CrossEntropyLoss(label_smoothing = 0.05)
    for iter in range(max_iters):
      xb, yb_src, yb_tgt = get_batch('train')
      logits = model(xb, yb_src)
      B,T,C = logits.shape
      logits = logits.view(B*T,C)
      yb_tgt = yb_tgt.view(B*T)
      loss = criterion(logits, yb_tgt)
      print(f'iter: {iter}, loss:{loss.item():.4f}')
      val_loss += loss.item()
    return val_loss/max_iters

In [53]:
evaluate_training_data(model_loaded)

iter: 0, loss:0.7476
iter: 1, loss:0.7484
iter: 2, loss:0.7487
iter: 3, loss:0.7448
iter: 4, loss:0.7451
iter: 5, loss:0.7376
iter: 6, loss:0.7423
iter: 7, loss:0.7484
iter: 8, loss:0.7444
iter: 9, loss:0.7425
iter: 10, loss:0.7469
iter: 11, loss:0.7392
iter: 12, loss:0.7434
iter: 13, loss:0.7438
iter: 14, loss:0.7458
iter: 15, loss:0.7446
iter: 16, loss:0.7428
iter: 17, loss:0.7456
iter: 18, loss:0.7437
iter: 19, loss:0.7465
iter: 20, loss:0.7426
iter: 21, loss:0.7399
iter: 22, loss:0.7382
iter: 23, loss:0.7443
iter: 24, loss:0.7433
iter: 25, loss:0.7462
iter: 26, loss:0.7427
iter: 27, loss:0.7408
iter: 28, loss:0.7488
iter: 29, loss:0.7399
iter: 30, loss:0.7407
iter: 31, loss:0.7465
iter: 32, loss:0.7417
iter: 33, loss:0.7452
iter: 34, loss:0.7412
iter: 35, loss:0.7400
iter: 36, loss:0.7436
iter: 37, loss:0.7464
iter: 38, loss:0.7475
iter: 39, loss:0.7517
iter: 40, loss:0.7455
iter: 41, loss:0.7422
iter: 42, loss:0.7442
iter: 43, loss:0.7412
iter: 44, loss:0.7401
iter: 45, loss:0.742

0.7440951931476593

In [53]:
# bleu score
import sacrebleu

In [47]:
def get_random_corpus(size = 20):
    xs = []
    ys = []
    for i in range(size):
        k = random.randint(0, int(0.9*len(data_prepared)))
        xs.append(data.iloc[k]['en'])
        ys.append(data.iloc[k]['ru'])
    return xs, ys

In [71]:
# Reference translations (should be a list of strings, not tokenized)
references = [["The cat is on the mat." , "There is a cat on the mat."]]

# Candidate translation (Model's output)
candidate = ["The cat is on the mat dermo hui"]

# Compute BLEU score
bleu = sacrebleu.corpus_bleu(candidate, references)
print(f"BLEU Score: {bleu.score:.4f}")

BLEU Score: 100.0000


In [81]:
def get_bleu_score(model):
    x, y = get_random_corpus()
    candidates = []
    bleus = []
    for i in range(len(x)):
        res = gen(x[i], model, sp_src, sp_tgt, 50)
        bleu = sacrebleu.corpus_bleu(res, [[y[i]]])
        bleus.append(bleu.score)
    print(bleus)
    return bleaus
bleaus = get_bleu_score(model)

[0.0, 2.247346032110286, 2.6757454896417534, 2.3629035975324633, 1.5843969409738214, 0.0, 2.0705706652424007, 2.8316557261689033, 0.9584157964125064, 2.159701133933343, 1.8815557141800423, 1.3862040232457764, 2.0540268312306345, 0.0, 3.0890553181566975, 4.767707020457095, 1.387819277861591, 4.935157841536379, 4.196114906296549, 1.7392453207295933]
