In [4]:
from nltk.tokenize import WordPunctTokenizer
from subword_nmt.learn_bpe import learn_bpe
from subword_nmt.apply_bpe import BPE
import ast
import numpy as np

tokenizer = WordPunctTokenizer()

def tokenize(x):
    return ' '.join(tokenizer.tokenize(x.lower()))


train = []
with open('/kaggle/input/data-transl/train', 'r') as f, open('train.alien', 'w') as f_src, open('train.en', 'w') as f_dst:
    for line in f:
        train.append(ast.literal_eval(line.strip()))
        f_src.write(tokenize(train[-1]['src']) + '\n')
        f_dst.write(tokenize(train[-1]['dst']) + '\n')


bpe = {}
for lang in ['alien', 'en']:
    learn_bpe(open(f'./train.{lang}'), open(f'bpe_rules.{lang}', 'w'), num_symbols=8000)
    bpe[lang] = BPE(open(f'./bpe_rules.{lang}'))

    
    with open(f'train.bpe.{lang}', 'w') as f_out:
        for line in open(f'./train.{lang}'):
            f_out.write(bpe[lang].process_line(line.strip()) + '\n')


val = []
with open('/kaggle/input/data-transl/val', 'r') as v, open('val.alien', 'w') as v_src, open('val.en', 'w') as v_dst:
    for line in v:
        val.append(ast.literal_eval(line.strip()))
        v_src.write(tokenize(val[-1]['src']) + '\n')
        v_dst.write(tokenize(val[-1]['dst']) + '\n')

for lang in ['alien', 'en']:
    with open(f'val.bpe.{lang}', 'w') as v_out:
        for line in open(f'./val.{lang}'):
            v_out.write(bpe[lang].process_line(line.strip()) + '\n')

data_inp = np.array(open('./train.bpe.alien').read().split('\n'))
data_out = np.array(open('./train.bpe.en').read().split('\n'))
train_out = data_out
train_inp = data_inp

datav_inp = np.array(open('./val.bpe.alien').read().split('\n'))
datav_out = np.array(open('./val.bpe.en').read().split('\n'))
dev_out = datav_out
dev_inp = datav_inp

for i in range(2):
    print('inp:', train_inp[i])
    print('out:', train_out[i], end='\n\n')

for i in range(2):
    print('inp:', dev_inp[i])
    print('out:', dev_out[i], end='\n\n')

100%|██████████| 8000/8000 [00:56<00:00, 140.98it/s]
100%|██████████| 8000/8000 [00:09<00:00, 862.61it/s] 


inp: ◄▴◓@@ ◠▨ ◨@@ ▽◠▦@@ ◈◬◓▪@@ ▼◬▵
out: - intri@@ gu@@ ing .

inp: ▽◪@@ ◎◗▦@@ ◫▦◫ ▫▴▨◓◠◓ ▴▫◎◪@@ ▱◫ ◚▴ ◞◧▦@@ ◞▾▢@@ ▱◨▨ ◒◠◓◠@@ ◀@@ ▪▦◈◠▦ ◫◉@@ ◎▴@@ ▱◫▵
out: he would need to repeat his vo@@ ws in the land of the living and drink from the wine of ages .

inp: ◘@@ ◚ ◞◠▷◫@@ ◀◗ ▫◠▨◬@@ ◎ ▨◪▦◈@@ ◫▦◫ ▫◧▻▱◠@@ ◈▪ ◚◪ ◝◂@@ ▾▼@@ ▷◠◓@@ ◈@@ '◬▦ 2@@ 7 : 3@@ 7 '@@ ◈▴▨◗ ◕@@ ◂▱@@ ◭ ◀◗◓ ▨▴▢ ◈◠▷◠ ◞@@ ▨◂◓@@ ◨ ▴@@ ◒◗@@ ▫@@ ▱◪@@ ◈◗▵
out: the ho@@ sts re@@ grou@@ ped , and bou@@ chard ev@@ ened the score again , scor@@ ing a goal with a 27 - 37 man advantage .

inp: ◤@@ ◪▦◫ ▨◠▦@@ ◞▴◓ ◠◒▪@@ ◞▪■ ◀◠◐▪@@ ◒◬@@ ▨@@ ▱▪▨ ◞@@ ◫◞▫@@ ◪◎@@ ◫▦▴ ▨◣▫◭ ▦@@ ◫◳@@ ◪▫@@ ▱◗ ▷@@ ▩▼@@ ◓◪@@ ▱▴◓◫ "@@ ◕◣◓@@ ◎▴▽@@ ◫@@ " ◇◐◓@@ ◪▫▴@@ ◀◫▱◗◓
out: a new cancer vac@@ cine may teach the immune system to " see " ab@@ normal cells



In [5]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [6]:
import sys
import numpy as np
import torch
import torch.nn.functional as F

class Vocab:
    def __init__(self, tokens, bos="_BOS_", eos="_EOS_", unk='_UNK_', pad='_PAD_'):
        assert all(tok in tokens for tok in (bos, eos, unk, pad))
        self.tokens = tokens
        self.token_to_ix = {t: i for i, t in enumerate(tokens)}
        self.bos, self.eos, self.unk, self.pad = bos, eos, unk, pad
        self.bos_ix = self.token_to_ix[bos]
        self.eos_ix = self.token_to_ix[eos]
        self.unk_ix = self.token_to_ix[unk]
        self.pad_ix = self.token_to_ix[pad]

    def __len__(self):
        return len(self.tokens)

    @staticmethod
    def from_lines(lines, bos="_BOS_", eos="_EOS_", unk='_UNK_', pad='_PAD_'):
        flat_lines = '\n'.join(list(lines)).split()
        tokens = sorted(set(flat_lines))
        tokens = [t for t in tokens if t not in (bos, eos, unk, pad) and len(t)]
        tokens = [bos, eos, unk, pad] + tokens
        return Vocab(tokens, bos, eos, unk, pad)

    def tokenize(self, string):
        if not string.strip():
            return [self.bos, self.eos]
        tokens = [tok if tok in self.token_to_ix else self.unk
                  for tok in string.split()]
        return [self.bos] + tokens + [self.eos]

    def to_matrix(self, lines, dtype=torch.int64, max_len=None):
        lines = list(map(self.tokenize, lines))
        if not lines:
            return torch.empty((0, 0), dtype=dtype)
        max_len = max_len or max(map(len, lines))

        matrix = torch.full((len(lines), max_len), self.pad_ix, dtype=dtype)
        for i, seq in enumerate(lines):
            row_ix = list(map(self.token_to_ix.get, seq))[:max_len]
            matrix[i, :len(row_ix)] = torch.as_tensor(row_ix)
        return matrix

    def to_lines(self, matrix, crop=True):
        if matrix.numel() == 0:
            return []
        lines = []
        for line_ix in map(list, matrix):
            if crop:
                if line_ix[0] == self.bos_ix:
                    line_ix = line_ix[1:]
                if self.eos_ix in line_ix:
                    line_ix = line_ix[:line_ix.index(self.eos_ix)]
            line = ' '.join(self.tokens[i] for i in line_ix if i != self.pad_ix)
            lines.append(line)
        return lines

    def compute_mask(self, input_ix):
        return F.pad(torch.cumsum(input_ix == self.eos_ix, dim=-1)[..., :-1] < 1, pad=(1, 0, 0, 0), value=True)

    def get_pad_index(self):
        return self.pad_ix

    def get_bos_index(self):
        return self.bos_ix

    def get_eos_index(self):
        return self.eos_ix

    def to_string(self, indices):
        """
        Преобразует последовательность индексов в строку токенов.
        """
        return ' '.join(self.tokens[idx] for idx in indices if idx != self.pad_ix)



inp_voc = Vocab.from_lines(train_inp)
out_voc = Vocab.from_lines(train_out)
#out_voc.get_pad_index('<pad>')
tgt_pad_index = out_voc.get_pad_index()

import pickle

with open('inp_voc.pkl', 'wb') as f:
    pickle.dump(inp_voc, f)

with open('out_voc.pkl', 'wb') as f:
    pickle.dump(out_voc, f)

# Neural Machine Translation with Transformers



In [7]:
import math
import pandas as pd
import torch
from torch import nn
from d2l import torch as d2l
class MultiHeadAttention(d2l.Module): 
    def __init__(self, num_hiddens, num_heads, dropout, bias=False, **kwargs):
        super().__init__()
        self.num_heads = num_heads
        self.attention = d2l.DotProductAttention(dropout)
        self.W_q = nn.LazyLinear(num_hiddens, bias=bias)
        self.W_k = nn.LazyLinear(num_hiddens, bias=bias)
        self.W_v = nn.LazyLinear(num_hiddens, bias=bias)
        self.W_o = nn.LazyLinear(num_hiddens, bias=bias)

    def forward(self, queries, keys, values, valid_lens):
        queries = self.transpose_qkv(self.W_q(queries))
        keys = self.transpose_qkv(self.W_k(keys))
        values = self.transpose_qkv(self.W_v(values))

        if valid_lens is not None:
            valid_lens = torch.repeat_interleave(
                valid_lens, repeats=self.num_heads, dim=0)

        output = self.attention(queries, keys, values, valid_lens)
        output_concat = self.transpose_output(output)
        return self.W_o(output_concat)
@d2l.add_to_class(MultiHeadAttention)  #save
def transpose_qkv(self, X):
    X = X.reshape(X.shape[0], X.shape[1], self.num_heads, -1)
    X = X.permute(0, 2, 1, 3)
    return X.reshape(-1, X.shape[2], X.shape[3])

@d2l.add_to_class(MultiHeadAttention)
def transpose_output(self, X):
    X = X.reshape(-1, self.num_heads, X.shape[1], X.shape[2])
    X = X.permute(0, 2, 1, 3)
    return X.reshape(X.shape[0], X.shape[1], -1)
class PositionalEncoding(nn.Module):
    def __init__(self, num_hiddens, dropout, max_len=1000):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.P = torch.zeros((1, max_len, num_hiddens))
        X = torch.arange(max_len, dtype=torch.float32).reshape(
            -1, 1) / torch.pow(10000, torch.arange(
            0, num_hiddens, 2, dtype=torch.float32) / num_hiddens)
        self.P[:, :, 0::2] = torch.sin(X)
        self.P[:, :, 1::2] = torch.cos(X)

    def forward(self, X):
        X = X + self.P[:, :X.shape[1], :].to(X.device)
        return self.dropout(X)
class PositionWiseFFN(nn.Module):
    def __init__(self, ffn_num_hiddens, ffn_num_outputs):
        super().__init__()
        self.dense1 = nn.LazyLinear(ffn_num_hiddens)
        self.relu = nn.ReLU()
        self.dense2 = nn.LazyLinear(ffn_num_outputs)

    def forward(self, X):
        return self.dense2(self.relu(self.dense1(X)))

class AddNorm(nn.Module): 
    def __init__(self, norm_shape, dropout):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.ln = nn.LayerNorm(norm_shape)

    def forward(self, X, Y):
        return self.ln(self.dropout(Y) + X)



## Encoder

In [8]:
class TransformerEncoderBlock(nn.Module):  #
    
    def __init__(self, num_hiddens, ffn_num_hiddens, num_heads, dropout,
                 use_bias=False):
        super().__init__()
        self.attention = d2l.MultiHeadAttention(num_hiddens, num_heads,
                                                dropout, use_bias)
        self.addnorm1 = AddNorm(num_hiddens, dropout)
        self.ffn = PositionWiseFFN(ffn_num_hiddens, num_hiddens)
        self.addnorm2 = AddNorm(num_hiddens, dropout)

    def forward(self, X, valid_lens):
        Y = self.addnorm1(X, self.attention(X, X, X, valid_lens))
        return self.addnorm2(Y, self.ffn(Y))

class TransformerEncoder(d2l.Encoder):  #
    
    def __init__(self, vocab_size, num_hiddens, ffn_num_hiddens,
                 num_heads, num_blks, dropout, use_bias=False):
        super().__init__()
        self.num_hiddens = num_hiddens
        self.embedding = nn.Embedding(vocab_size, num_hiddens)
        self.pos_encoding = d2l.PositionalEncoding(num_hiddens, dropout)
        self.blks = nn.Sequential()
        for i in range(num_blks):
            self.blks.add_module("block"+str(i), TransformerEncoderBlock(
                num_hiddens, ffn_num_hiddens, num_heads, dropout, use_bias))

    def forward(self, X, valid_lens):
        X = self.pos_encoding(self.embedding(X) * math.sqrt(self.num_hiddens))
        self.attention_weights = [None] * len(self.blks)
        for i, blk in enumerate(self.blks):
            X = blk(X, valid_lens)
            self.attention_weights[
                i] = blk.attention.attention.attention_weights
        return X

## Decoder

In [10]:
class TransformerDecoderBlock(nn.Module):
    def __init__(self, num_hiddens, ffn_num_hiddens, num_heads, dropout, i):
        super().__init__()
        self.i = i
        self.attention1 = d2l.MultiHeadAttention(num_hiddens, num_heads,
                                                 dropout)
        self.addnorm1 = AddNorm(num_hiddens, dropout)
        self.attention2 = d2l.MultiHeadAttention(num_hiddens, num_heads,
                                                 dropout)
        self.addnorm2 = AddNorm(num_hiddens, dropout)
        self.ffn = PositionWiseFFN(ffn_num_hiddens, num_hiddens)
        self.addnorm3 = AddNorm(num_hiddens, dropout)

    def forward(self, X, state):
        enc_outputs, enc_valid_lens = state[0], state[1]
        if state[2][self.i] is None:
            key_values = X
        else:
            key_values = torch.cat((state[2][self.i], X), dim=1)
        state[2][self.i] = key_values
        if self.training:
            batch_size, num_steps, _ = X.shape
            
            dec_valid_lens = torch.arange(
                1, num_steps + 1, device=X.device).repeat(batch_size, 1)
        else:
            dec_valid_lens = None
        X2 = self.attention1(X, key_values, key_values, dec_valid_lens)
        Y = self.addnorm1(X, X2)
        Y2 = self.attention2(Y, enc_outputs, enc_outputs, enc_valid_lens)
        Z = self.addnorm2(Y, Y2)
        return self.addnorm3(Z, self.ffn(Z)), state

class TransformerDecoder(d2l.AttentionDecoder):
    def __init__(self, vocab_size, num_hiddens, ffn_num_hiddens, num_heads,
                 num_blks, dropout):
        super().__init__()
        self.num_hiddens = num_hiddens
        self.num_blks = num_blks
        self.embedding = nn.Embedding(vocab_size, num_hiddens)
        self.pos_encoding = d2l.PositionalEncoding(num_hiddens, dropout)
        self.blks = nn.Sequential()
        for i in range(num_blks):
            self.blks.add_module("block"+str(i), TransformerDecoderBlock(
                num_hiddens, ffn_num_hiddens, num_heads, dropout, i))
        self.dense = nn.LazyLinear(vocab_size)

    def init_state(self, enc_outputs, enc_valid_lens):
        return [enc_outputs, enc_valid_lens, [None] * self.num_blks]

    def forward(self, X, state):
        X = self.pos_encoding(self.embedding(X) * math.sqrt(self.num_hiddens))
        self._attention_weights = [[None] * len(self.blks) for _ in range (2)]
        for i, blk in enumerate(self.blks):
            X, state = blk(X, state)
            self._attention_weights[0][
                i] = blk.attention1.attention.attention_weights
            self._attention_weights[1][
                i] = blk.attention2.attention.attention_weights
        return self.dense(X), state

    @property
    def attention_weights(self):
        return self._attention_weights

In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_inp_tensor = inp_voc.to_matrix(train_inp).to(device)
train_out_tensor = out_voc.to_matrix(train_out).to(device)

dev_inp_tensor = inp_voc.to_matrix(dev_inp).to(device)
dev_out_tensor = out_voc.to_matrix(dev_out).to(device)


In [12]:

from torch.utils.data import DataLoader, TensorDataset
train_dataset = TensorDataset(train_inp_tensor, train_out_tensor)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

dev_dataset = TensorDataset(dev_inp_tensor, dev_out_tensor)
dev_loader = DataLoader(dev_dataset, batch_size=16, shuffle=False)

In [17]:
import torch
import torch.nn as nn
import torch.optim as optim
class TransformerModel(nn.Module):
    def __init__(self, inp_voc, out_voc, num_hiddens, ffn_num_hiddens, num_heads, num_blks, dropout):
        super(TransformerModel, self).__init__()
        self.encoder = TransformerEncoder(len(inp_voc), num_hiddens, ffn_num_hiddens, num_heads, num_blks, dropout)
        self.decoder = TransformerDecoder(len(out_voc), num_hiddens, ffn_num_hiddens, num_heads, num_blks, dropout)
        self.out_voc = out_voc

    def forward(self, inp, out):
        enc_outputs = self.encoder(inp, None)
        dec_state = self.decoder.init_state(enc_outputs, None)
        return self.decoder(out, dec_state)


#num_hiddens = 512
#ffn_num_hiddens = 2048
#num_heads = 8
#num_blks = 6
torch.cuda.empty_cache()
num_blks = 4
num_heads = 4
num_hiddens = 256
ffn_num_hiddens = 1024

dropout = 0.1


model = TransformerModel(inp_voc, out_voc, num_hiddens, ffn_num_hiddens, num_heads, num_blks, dropout).to(device)

criterion = nn.CrossEntropyLoss(ignore_index=out_voc.get_pad_index())
optimizer = optim.Adam(model.parameters(), lr=0.0001)

In [18]:

def train(model, train_loader, criterion, optimizer, num_epochs=10):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for inp, out in train_loader:
            inp.to(device), out.to(device)
            optimizer.zero_grad()
            out_pred, _ = model(inp, out[:, :-1])
            loss = criterion(out_pred.permute(0, 2, 1), out[:, 1:])
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f'Epoch {epoch+1}, Loss: {total_loss/len(train_loader)}')
        
        
        torch.save(model.state_dict(), f'../working/model_epoch_{epoch+1}.pth')
        torch.cuda.empty_cache()


def validate(model, dev_loader, criterion):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for inp, out in dev_loader:
            inp, out = inp.to(device), out.to(device)
            out_pred, _ = model(inp, out[:, :-1])
            loss = criterion(out_pred.permute(0, 2, 1), out[:, 1:])
            total_loss += loss.item()
    print(f'Validation Loss: {total_loss/len(dev_loader)}')


torch.cuda.empty_cache()
train(model, train_loader, criterion, optimizer, num_epochs=10)



Epoch 1, Loss: 4.22470003179878
Epoch 2, Loss: 3.6913880393013154
Epoch 3, Loss: 3.4349060890852727
Epoch 4, Loss: 3.26138519123137
Epoch 5, Loss: 3.1266665501719406
Epoch 6, Loss: 3.01427622593776
Epoch 7, Loss: 2.919137865376469
Epoch 8, Loss: 2.8355444994209784
Epoch 9, Loss: 2.76395321649511
Epoch 10, Loss: 2.7030348901869194


In [19]:
validate(model, dev_loader, criterion)

Validation Loss: 5.787325635552406


In [20]:
test = []
with open('/kaggle/input/data-transl/test_no_reference', 'r') as f, open('test.alien', 'w') as test_src:
    for line in f:
        data = ast.literal_eval(line.strip())
        test.append(data)
        test_src.write(tokenize(data['src']) + '\n')

for lang in ['alien']:
    with open(f'test.bpe.{lang}', 'w') as test_out:
        for line in open(f'test.{lang}'):
            test_out.write(bpe[lang].process_line(line.strip()) + '\n')

test_data_inp = np.array(open('./test.bpe.alien').read().split('\n'))
test_inp = test_data_inp

test_inp_tensor = inp_voc.to_matrix(test_inp).to(device)

test_dataset = TensorDataset(test_inp_tensor)

In [28]:
test = []
with open('/kaggle/input/data-transl/test_no_reference', 'r') as f, open('test.alien', 'w') as test_src:
    for line in f:
        data = ast.literal_eval(line.strip())
        test.append(data)
        test_src.write(tokenize(data['src']) + '\n')

for lang in ['alien']:
    with open(f'test.bpe.{lang}', 'w') as test_out:
        for line in open(f'test.{lang}'):
            test_out.write(bpe[lang].process_line(line.strip()) + '\n')

test_data_inp = np.array([line for line in open('./test.bpe.alien').read().split('\n') if line.strip()])
test_inp = test_data_inp

test_inp_tensor = inp_voc.to_matrix(test_inp).to(device)

test_dataset = TensorDataset(test_inp_tensor)

In [29]:
len(test_inp)

1000

In [30]:
def to_string(self, indices):
    """
    Преобразует последовательность индексов в строку токенов.
    """
    valid_indices = [idx for idx in indices if idx < len(self.tokens) and idx != self.pad_ix]
    return ' '.join(self.tokens[idx] for idx in valid_indices)


def generate_predictions(model, test_dataset, inp_voc, out_voc, max_len=100):
    model.eval()
    predictions = []
    BOS_IDX = out_voc.get_bos_index()  # Индекс токена начала последовательности
    EOS_IDX = out_voc.get_eos_index()  # Индекс токена конца последовательности

    with torch.no_grad():
        for inp in test_dataset:
            inp = inp[0].unsqueeze(0).to(device)
            enc_outputs = model.encoder(inp, None)
            dec_state = model.decoder.init_state(enc_outputs, None)
            dec_input = torch.tensor([[BOS_IDX]], device=device)
            output_seq = []
            for _ in range(max_len):
                out_pred, dec_state = model.decoder(dec_input, dec_state)
                dec_input = out_pred.argmax(dim=2)
                next_token = dec_input.squeeze().item()
                if next_token == EOS_IDX:
                    break
                output_seq.append(next_token)
            
            
            predictions.append(out_voc.to_string(output_seq))  # Используем to_string
    return predictions

In [31]:
import json 
predictions = generate_predictions(model, test_dataset, inp_voc, out_voc)

In [32]:
print(f"Number of predictions: {len(predictions)}")
print(f"Number of test items: {len(test)}")

Number of predictions: 1000
Number of test items: 1000


In [33]:
with open('predictions.jsonl', 'w') as f:
    for i, pred in enumerate(predictions[:-1]):
        json_line = json.dumps({"dst": pred, "src": test[i]['src']}, ensure_ascii=False)
        f.write(json_line + '\n')