In [13]:
import os
import torch

import utils

%load_ext autoreload
%autoreload 2

CUDA_LAUNCH_BLOCKING=1
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
print(os.listdir("../data/"))

import math
from typing import List, Tuple

import nltk
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.model_selection import train_test_split
from torch import Tensor
from torch import nn
from torch.autograd import Variable
from torch.nn import TransformerEncoderLayer, TransformerEncoder
from torch.types import Device
from torchtext.vocab import build_vocab_from_iterator, Vocab


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
['kawiki.txt', 'data.txt', 'data2.txt']


In [14]:
from torchtext.datasets import WikiText2
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

data = open("../data/kawiki.txt", encoding="utf-8").read().split("\n")
df = pd.DataFrame(data, columns=["sentence"])
train, test = train_test_split(df["sentence"].tolist(), train_size=0.6,
                                       test_size=0.2, random_state=0)
train, valid = train_test_split(train, test_size=0.25, random_state=0)
train_iter = train
valid_iter = valid
test_iter = test
vocab = build_vocab_from_iterator(map(word_tokenize, train_iter),
                                               specials=['<unk>'])
vocab.set_default_index(vocab['<unk>'])

def __data_process(raw_text_iter):
    """Converts raw text into a flat Tensor."""
    data = [torch.tensor(vocab(word_tokenize(item)), dtype=torch.long) for item in raw_text_iter]
    return torch.cat(tuple(filter(lambda t: t.numel() > 0, data)))

# train_iter was "consumed" by the process of building the vocab,
# so we have to create it again
train_data = __data_process(train_iter)
val_data = __data_process(valid_iter)
test_data = __data_process(test_iter)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def batchify(data: Tensor, bsz: int) -> Tensor:
    """Divides the data into bsz separate sequences, removing extra elements
    that wouldn't cleanly fit.

    Args:
        data: Tensor, shape [N]
        bsz: int, batch size

    Returns:
        Tensor of shape [N // bsz, bsz]
    """
    seq_len = data.size(0) // bsz
    data = data[:seq_len * bsz]
    data = data.view(bsz, seq_len).t().contiguous()
    return data.to(device)

batch_size = 20
eval_batch_size = 10
train_data = batchify(train_data, batch_size)  # shape [seq_len, batch_size]
val_data = batchify(val_data, eval_batch_size)
test_data = batchify(test_data, eval_batch_size)

In [15]:
bptt = 35
def get_batch(source: Tensor, i: int) -> Tuple[Tensor, Tensor]:
    """
    Args:
        source: Tensor, shape [full_seq_len, batch_size]
        i: int

    Returns:
        tuple (data, target), where data has shape [seq_len, batch_size] and
        target has shape [seq_len * batch_size]
    """
    seq_len = min(bptt, len(source) - 1 - i)
    data = source[i:i+seq_len]
    target = source[i+1:i+1+seq_len].reshape(-1)
    return data, target

In [16]:
from utils import TransformerModel

ntokens = len(vocab)  # size of vocabulary
emsize = 200  # embedding dimension
d_hid = 200  # dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 2  # number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 2  # number of heads in nn.MultiheadAttention
dropout = 0.2  # dropout probability
model = TransformerModel(ntokens, emsize, nhead, d_hid, nlayers, dropout).to(device)

In [17]:
from utils import generate_square_subsequent_mask
import copy
import time

criterion = nn.CrossEntropyLoss()
lr = 5.0  # learning rate
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)
lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5,
                                                              min_lr=1e-6, patience=10)

def train(model: nn.Module) -> None:
    model.train()  # turn on train mode
    total_loss = 0.
    log_interval = 200
    start_time = time.time()
    src_mask = generate_square_subsequent_mask(bptt).to(device)

    num_batches = len(train_data) // bptt
    for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
        data, targets = get_batch(train_data, i)
        batch_size = data.size(0)
        if batch_size != bptt:  # only on last batch
            src_mask = src_mask[:batch_size, :batch_size]
        output = model(data, src_mask)
        loss = criterion(output.view(-1, ntokens), targets)

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()
        if batch % log_interval == 0 and batch > 0:
            lr = lr_scheduler.get_last_lr()[0]
            ms_per_batch = (time.time() - start_time) * 1000 / log_interval
            cur_loss = total_loss / log_interval
            ppl = math.exp(cur_loss)
            print(f'| {batch:5d}/{num_batches:5d} batches | '
                  f'lr {lr:02.2f} | ms/batch {ms_per_batch:5.2f} | '
                  f'loss {cur_loss:5.2f} | ppl {ppl:8.2f}')
            total_loss = 0
            start_time = time.time()

def evaluate(model: nn.Module, eval_data: Tensor) -> float:
    model.eval()  # turn on evaluation mode
    total_loss = 0.
    src_mask = generate_square_subsequent_mask(bptt).to(device)
    with torch.no_grad():
        for i in range(0, eval_data.size(0) - 1, bptt):
            data, targets = get_batch(eval_data, i)
            batch_size = data.size(0)
            if batch_size != bptt:
                src_mask = src_mask[:batch_size, :batch_size]
            output = model(data, src_mask)
            output_flat = output.view(-1, ntokens)
            total_loss += batch_size * criterion(output_flat, targets).item()
    return total_loss / (len(eval_data) - 1)

In [19]:
best_val_loss = float('inf')
epochs = 3
best_model = None

for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train(model)
    val_loss = evaluate(model, val_data)
    val_ppl = math.exp(val_loss)
    elapsed = time.time() - epoch_start_time
    print('-' * 89)
    print(f'| end of epoch {epoch:3d} | time: {elapsed:5.2f}s | '
          f'valid loss {val_loss:5.2f} | valid ppl {val_ppl:8.2f}')
    print('-' * 89)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = copy.deepcopy(model)

    lr_scheduler.step(val_loss)

-----------------------------------------------------------------------------------------
| end of epoch   1 | time: 11.99s | valid loss  7.57 | valid ppl  1946.55
-----------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------
| end of epoch   2 | time: 11.93s | valid loss  7.63 | valid ppl  2066.51
-----------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------
| end of epoch   3 | time: 11.73s | valid loss  7.60 | valid ppl  2005.78
-----------------------------------------------------------------------------------------


In [20]:
test_loss = evaluate(best_model, test_data)
test_ppl = math.exp(test_loss)
print('=' * 89)
print(f'| End of training | test loss {test_loss:5.2f} | '
      f'test ppl {test_ppl:8.2f}')
print('=' * 89)

| End of training | test loss  7.58 | test ppl  1962.83


In [21]:
def generate_text(model, device: Device, vocab: Vocab, context: str,
                  length: int = 10, ):
    model = model.to(device)
    model.eval()
    src_mask = generate_square_subsequent_mask(1).to(device)

    res = context.split(' ')
    for i in range(length):
        x = vocab(word_tokenize(''.join(res)))
        x = torch.LongTensor(x).view(1, len(x)).to(device)
        y_pred = model(x, src_mask)
        y_pred = torch.topk(y_pred[0, -1], dim=-1, k=5).indices

        # we filter out <unk> and <eos> tokens having indices 0 and 3 respectively
        y_pred = [i.item() for i in y_pred if i.item() not in [0, 1]]
        res.append(vocab.get_itos()[np.random.choice(y_pred)])

    return ' '.join(res)

In [22]:
text = generate_text(best_model,device, vocab, 'პოლიტიკური კარიერა', 10)
print(text)

პოლიტიკური კარიერა და . მუნიციპალიტეტის და : წლის — . წელს :
