In [1]:
import torch
import tqdm
from nltk.translate.bleu_score import corpus_bleu
from torchtext.legacy.data import BucketIterator
from config import read_training_pipeline_params
from load_data import get_dataset, split_data, _len_sort_key
import my_network
from train_model import evaluate
from utils import generate_translation, get_text
import random
import numpy as np

## Seq2seq c предобученным эмбедингом

Обучал 20 эпох, каждая эпоха примерно 1мин 10сек

In [2]:
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [3]:
Encoder = my_network.Encoder
Decoder = my_network.Decoder
Seq2Seq = my_network.Seq2Seq

In [4]:
config = read_training_pipeline_params("train_config_pretrained_emb.yaml")

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [7]:
SRC, TRG, dataset = get_dataset(config.dataset_path)
train_data, valid_data, test_data = split_data(dataset, **config.split_ration.__dict__)
SRC.vocab = torch.load("vocabs/src_vocab_pret")
TRG.vocab = torch.load("vocabs/trg_vocab_pret")
# SRC.build_vocab(train_data, min_freq=3)
# TRG.build_vocab(train_data, min_freq=3)
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=config.BATCH_SIZE,
    device=device,
    sort_key=_len_sort_key
)

NameError: name 'device' is not defined

In [7]:
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)

In [8]:
enc = Encoder(INPUT_DIM, config.net_params.ENC_EMB_DIM, config.net_params.HID_DIM,
              config.net_params.N_LAYERS, config.net_params.ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, config.net_params.DEC_EMB_DIM, config.net_params.HID_DIM,
              config.net_params.N_LAYERS, config.net_params.DEC_DROPOUT)

In [9]:
model = Seq2Seq(enc, dec, device).to(device)
model.eval()

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(9252, 300)
    (rnn): LSTM(300, 512, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(6734, 300)
    (rnn): LSTM(300, 512, num_layers=2, dropout=0.5)
    (out): Linear(in_features=512, out_features=6734, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [10]:
checkpoint = torch.load("models/pret_emb_model.pt", map_location='cpu')
model.load_state_dict(checkpoint, strict=True)

<All keys matched successfully>

In [13]:
for idx, batch in enumerate(test_iterator):
    if idx > 5:
        break
    src = batch.src[:, idx:idx+1]
    trg = batch.trg[:, idx:idx+1]
    
    generate_translation(src, trg, model, TRG.vocab, SRC.vocab)

Source: гости могут готовить на общей кухне .
Original: you will find a shared kitchen at the property .
Generated: guests will find a shared kitchen .

Source: у отеля разбит большой сад .
Original: the hotel has large gardens .
Generated: at a terrace .

Source: по запросу и за дополнительную плату организуется трансфер .
Original: a shuttle service can be arranged on request at an additional fee .
Generated: airport , and shuttle service can be arranged at a surcharge .

Source: поездка до международного аэропорта дель бахио займет 25 минут .
Original: the international airport can be reached in a 25 - minute drive .
Generated: international airport is a minutes - minute drive away .

Source: в собственной ванной комнате в вашем распоряжении душ и ванна .
Original: featuring a shower , private bathroom also comes with a bathtub .
Generated: the private bathroom comes with a bath or shower .

Source: помимо этого , на территории отеля гости смогут воспользоваться принадлежностями для

In [26]:
original_text = []
generated_text = []
with torch.no_grad():

    for i, batch in tqdm.tqdm(enumerate(test_iterator)):

        src = batch.src
        trg = batch.trg

        output = model(src, trg, 0) #turn off teacher forcing

        #trg = [trg sent len, batch size]
        #output = [trg sent len, batch size, output dim]

        output = output.argmax(dim=-1)
        
        original_text.extend([get_text(x, TRG.vocab) for x in trg.cpu().numpy().T])
        generated_text.extend([get_text(x, TRG.vocab) for x in output[1:].detach().cpu().numpy().T])


15it [01:09,  4.64s/it]


In [45]:
corpus_bleu([[text] for text in original_text], generated_text) * 100

1.393766008719758

## Seq2Seq с предобученным эмбедингом и аттеншеном

Время эпохи увеличилось примерно до 3-х минут. Где-то после 8-й эпохи модель начала переобучаться

In [4]:
import network_gru_attention

In [5]:
config = read_training_pipeline_params("train_config_pretrained_emb.yaml")

In [9]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

SRC, TRG, dataset = get_dataset(config.dataset_path)
train_data, valid_data, test_data = split_data(dataset, **config.split_ration.__dict__)
SRC.vocab = torch.load("vocabs/src_vocab_attn")
TRG.vocab = torch.load("vocabs/trg_vocab_attn")
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=config.BATCH_SIZE,
    device=device,
    sort_key=_len_sort_key
)

In [10]:
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)

In [11]:
Encoder = network_gru_attention.Encoder
Decoder = network_gru_attention.Decoder
Seq2Seq = network_gru_attention.Seq2Seq
Attention = network_gru_attention.Attention
attn = Attention(config.net_params.HID_DIM, config.net_params.HID_DIM)
enc = Encoder(INPUT_DIM, config.net_params.ENC_EMB_DIM, config.net_params.HID_DIM, config.net_params.HID_DIM,
              config.net_params.ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, config.net_params.DEC_EMB_DIM, config.net_params.HID_DIM, config.net_params.HID_DIM,
              config.net_params.DEC_DROPOUT, attn)

model = Seq2Seq(enc, dec, device)

In [12]:
checkpoint = torch.load("models/attn_model.pt", map_location='cpu')
model.load_state_dict(checkpoint, strict=True)

<All keys matched successfully>

In [16]:
model.eval()

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(9252, 300)
    (rnn): GRU(300, 512, bidirectional=True)
    (fc): Linear(in_features=1024, out_features=512, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (attention): Attention(
      (attn): Linear(in_features=1536, out_features=512, bias=True)
      (v): Linear(in_features=512, out_features=1, bias=False)
    )
    (embedding): Embedding(6734, 300)
    (rnn): GRU(1324, 512)
    (fc_out): Linear(in_features=1836, out_features=6734, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [17]:
for idx, batch in enumerate(test_iterator):
    if idx > 5:
        break
    src = batch.src[:, idx:idx+1]
    trg = batch.trg[:, idx:idx+1]
    
    generate_translation(src, trg, model, TRG.vocab, SRC.vocab)

Source: дети до 11 лет размещаются бесплатно .
Original: children under 11 stay for free .
Generated: children ’ s playground is free free of charge .

Source: на стойке регистрации можно взять бесплатные газеты .
Original: free newspapers are available at the reception .
Generated: free newspapers can be free at the reception desk and free .

Source: расстояние до международного аэропорта бухареста составляет км .
Original: bucharest international airport is km from here .
Generated: the nearest airport is belgrade international airport , km from the property .

Source: научный центр расположен в 20 , 2 км .
Original: science center is 12 . 6 miles away .
Generated: the center center is 20 . 2 km away .

Source: к услугам гостей кабельное телевидение , стиральная машина и микроволновая печь .
Original: facilities include cable tv , washing machine and microwave .
Generated: the cable tv , a washing machine , a washing machine and a microwave .

Source: гости могут провести время за пр

In [18]:
original_text = []
generated_text = []
with torch.no_grad():

    for i, batch in tqdm.tqdm(enumerate(test_iterator)):

        src = batch.src
        trg = batch.trg

        output = model(src, trg, 0) #turn off teacher forcing

        #trg = [trg sent len, batch size]
        #output = [trg sent len, batch size, output dim]

        output = output.argmax(dim=-1)
        
        original_text.extend([get_text(x, TRG.vocab) for x in trg.cpu().numpy().T])
        generated_text.extend([get_text(x, TRG.vocab) for x in output[1:].detach().cpu().numpy().T])


15it [03:33, 14.26s/it]


In [19]:
corpus_bleu([[text] for text in original_text], generated_text) * 100

30.54667083223913

## Seq2seq Трансформер с предобученным эмбедингом

In [2]:
import network_transformer

In [3]:
config = read_training_pipeline_params("train_config_pretrained_emb_transformer.yaml")

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

SRC, TRG, dataset = get_dataset(config.dataset_path, config.net_params.transformer)
train_data, valid_data, test_data = split_data(dataset, **config.split_ration.__dict__)
SRC.vocab = torch.load("vocabs/src_vocab_transformer")
TRG.vocab = torch.load("vocabs/trg_vocab_transformer")
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=config.BATCH_SIZE,
    device=device,
    sort_key=_len_sort_key
)

In [7]:
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)

In [8]:
Encoder = network_transformer.Encoder
Decoder = network_transformer.Decoder
Seq2Seq = network_transformer.Seq2Seq
SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
HID_DIM = 256
ENC_LAYERS = 3
DEC_LAYERS = 3
ENC_HEADS = 8
DEC_HEADS = 8
ENC_PF_DIM = 512
DEC_PF_DIM = 512
ENC_DROPOUT = 0.1
DEC_DROPOUT = 0.1

enc = Encoder(INPUT_DIM,
              HID_DIM,
              ENC_LAYERS,
              ENC_HEADS,
              ENC_PF_DIM,
              ENC_DROPOUT,
              device)

dec = Decoder(OUTPUT_DIM,
              HID_DIM,
              DEC_LAYERS,
              DEC_HEADS,
              DEC_PF_DIM,
              DEC_DROPOUT,
              device)
model = Seq2Seq(enc, dec, SRC_PAD_IDX, TRG_PAD_IDX, device)

In [9]:
checkpoint = torch.load("models/transformer_model.pt", map_location='cpu')
model.load_state_dict(checkpoint, strict=True)

<All keys matched successfully>

In [10]:
model.eval()

Seq2Seq(
  (encoder): Encoder(
    (tok_embedding): Embedding(9252, 256)
    (pos_embedding): Embedding(100, 256)
    (layers): ModuleList(
      (0): EncoderLayer(
        (self_attn_layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (ff_layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (self_attention): MultiHeadAttentionLayer(
          (fc_q): Linear(in_features=256, out_features=256, bias=True)
          (fc_k): Linear(in_features=256, out_features=256, bias=True)
          (fc_v): Linear(in_features=256, out_features=256, bias=True)
          (fc_o): Linear(in_features=256, out_features=256, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (positionwise_feedforward): PositionwiseFeedforwardLayer(
          (fc_1): Linear(in_features=256, out_features=512, bias=True)
          (fc_2): Linear(in_features=512, out_features=256, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
     

In [12]:
for idx, batch in enumerate(test_iterator):
    if idx > 3:
        break
    src = batch.src[:, idx:idx+1]
    trg = batch.trg[:, idx:idx+1]
    
    generate_translation(src, trg, model, TRG.vocab, SRC.vocab, config.net_params.transformer)

Source: 
Original: 
Generated: a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a

Source: гостям в гостям к в - в кроме в ежедневно в гости на позавтракать из гостям гостям в бесплатная на до на кроме на на до в в в постельное у предоставляются апартаменты до гостям в каждую в номер на каждое на в на в гости также гостям к аэропорт из на в регистрация в в кухня апартаменты большинство в при в на среди рядом гости на гости места местный гостям на гостям в по номера в в каждое на завтрак в в в до прямо на в в все в на поблизости эти до 

In [16]:
original_text = []
generated_text = []
with torch.no_grad():

    for i, batch in tqdm.tqdm(enumerate(test_iterator)):

        src = batch.src
        trg = batch.trg

        output, _ = model(src, trg) #turn off teacher forcing

        #trg = [trg sent len, batch size]
        #output = [trg sent len, batch size, output dim]

        output = output.argmax(dim=-1)
        
        original_text.extend([get_text(x, TRG.vocab) for x in trg.cpu().numpy().T])
        generated_text.extend([get_text(x, TRG.vocab) for x in output[1:].detach().cpu().numpy().T])


30it [00:25,  1.18it/s]


In [17]:
corpus_bleu([[text] for text in original_text], generated_text) * 100

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


6.3223772129702204e-77

## end