In [2]:
import gensim.utils as utils
from tqdm import tqdm
import torch
import torch.nn as nn
import numpy as np
import torch.optim as optim
from nltk.tokenize import word_tokenize
import sentencepiece as spm

In [202]:
# spm.SentencePieceTrainer.train('--input=./data/train_test_small.txt --model_prefix=tokenization --vocab_size=2000')
spm.SentencePieceTrainer.train('--input=./data/train_test_small.txt --vocab_size=10000 --model_prefix=vs10k --pad_id=0 --unk_id=1 --bos_id=2 --eos_id=3')

sentencepiece_trainer.cc(177) LOG(INFO) Running command: --input=./data/train_test_small.txt --vocab_size=10000 --model_prefix=vs10k --pad_id=0 --unk_id=1 --bos_id=2 --eos_id=3
sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: ./data/train_test_small.txt
  input_format: 
  model_prefix: vs10k
  model_type: UNIGRAM
  vocab_size: 10000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 1
  bos_id: 2
  eos_id: 3
  pad_id: 0
  

In [2]:
sp = spm.SentencePieceProcessor("m.model")
pieces = sp.encode_as_pieces('ZCarr porzucił pracę księgowego w 1983 i utworzył swoją pierwszą klinikę Easyway, aby pomagać innym palaczom.')
print(pieces)
sp.decode_pieces(pieces)

ids = sp.encode_as_ids('ZCarr porzucił pracę księgowego w 1983 i utworzył swoją pierwszą klinikę Easyway, aby pomagać innym palaczom.')

sp.encode_as_ids

# print(sp.piece_to_id())
sp.id_to_piece([5,6,7])
# sp.decode_ids(ids + [sp.eos_id()])
sp.vocab_size()

sp.decode

['▁Z', 'C', 'ar', 'r', '▁po', 'rzucił', '▁pracę', '▁księg', 'owego', '▁w', '▁1983', '▁i', '▁u', 'tworzy', 'ł', '▁swoją', '▁pierwszą', '▁klin', 'i', 'kę', '▁E', 'a', 'sy', 'wa', 'y', ',', '▁aby', '▁pomaga', 'ć', '▁innym', '▁pal', 'a', 'czo', 'm', '.']


<bound method SentencePieceProcessor.Decode of <sentencepiece.SentencePieceProcessor; proxy of <Swig Object of type 'sentencepiece::SentencePieceProcessor *' at 0x7fa60cac6cc0> >>

In [3]:
class SentencePieceDataset(torch.utils.data.Dataset):
    def __init__(self, sp_model, corpus_path = './data/train_small.txt', corpus_length = None, device = None):
        corpus_file = open(corpus_path)

        self.sp_processor = spm.SentencePieceProcessor(sp_model)

        self.vocab_size = self.sp_processor.vocab_size()

        if device == None:
            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        self.device = device

        if corpus_length == None:
            corpus_length = sum(1 for line in corpus_file)
            corpus_file.seek(0)
        
        self.corpus_indexes = []


        for i in tqdm(range(corpus_length)):
            self.corpus_indexes.append(self.sp_processor.encode_as_ids(corpus_file.readline()))

        self.corpus_indexes = sorted(self.corpus_indexes, key=lambda x: len(x))

        self.input_corpus_indexes = [[self.sp_processor.bos_id()] + corp_ind for corp_ind in self.corpus_indexes]

        self.output_corpus_indexes = [corp_ind + [self.sp_processor.eos_id()] for corp_ind in self.corpus_indexes]
        
    def ids_to_pieces(self, ids):
        return [self.sp_processor.id_to_piece(id) for id in ids]

    def pieces_to_ids(self, pieces):
        return [self.sp_processor.piece_to_id(piece) for piece in pieces]

    def __len__(self):
        return len(self.input_corpus_indexes)

    def __getitem__(self, index):
        return (torch.tensor(self.input_corpus_indexes[index], device=self.device),
            torch.tensor(self.output_corpus_indexes[index], device=self.device))



In [4]:
# torch.cuda.is_available() checks and returns a Boolean True if a GPU is available, else it'll return False
is_cuda = torch.cuda.is_available()

# If we have a GPU available, we'll set our device to GPU. We'll use this device variable later in our code.
if is_cuda:
    device = torch.device("cuda")
    print("GPU is available")
else:
    device = torch.device("cpu")
    print("GPU not available, CPU used")

GPU is available


In [25]:
def pad_collate(data):
    def left_pad_sequence(tensors):
        max_len = max(list(map(len, tensors)))
        padded_seq = [torch.hstack([torch.zeros(max_len - len(t), device=t.device, dtype=torch.int32), t]) for t in tensors]
        return torch.stack(padded_seq)


    inputs = [d[0] for d in data]
    outputs = [d[1] for d in data]
    inputs = left_pad_sequence(inputs)
    outputs = left_pad_sequence(outputs)
    return inputs, outputs

In [7]:
dataset = SentencePieceDataset(sp_model = "./m.model", corpus_length = None, device=device)
# loader = torch.utils.data.DataLoader(dataset, batch_size=20, collate_fn=pad_collate)

100%|██████████| 1000000/1000000 [00:38<00:00, 26250.62it/s]


In [5]:
test_dataset = SentencePieceDataset(sp_model = "./m.model", corpus_path = "./data/test_small.txt", corpus_length = None, device=device)

100%|██████████| 100000/100000 [00:02<00:00, 38607.64it/s]


In [118]:
for x,y in loader:
    for s_in, s_out in zip(x,y):
        print(dataset.ids_to_pieces([x.item() for x in s_in]))
        print(dataset.ids_to_pieces([x.item() for x in s_out]))
    print('-----------------------------------')
    break

['<pad>', '<pad>', '<s>', '▁Kto', '▁wstrzymał', '▁się', '▁od', '▁głosu', '?']
['<pad>', '<pad>', '▁Kto', '▁wstrzymał', '▁się', '▁od', '▁głosu', '?', '</s>']
['<pad>', '<pad>', '<s>', '▁Kto', '▁wstrzymał', '▁się', '▁od', '▁głosowania', '?']
['<pad>', '<pad>', '▁Kto', '▁wstrzymał', '▁się', '▁od', '▁głosowania', '?', '</s>']
['<pad>', '<pad>', '<s>', '▁W', '▁braku', '▁odpowiedzi', '▁powtarza', '▁pytanie', '.']
['<pad>', '<pad>', '▁W', '▁braku', '▁odpowiedzi', '▁powtarza', '▁pytanie', '.', '</s>']
['<pad>', '<pad>', '<s>', '▁Warto', '▁zauważyć', '▁zmianę', '▁stanowiska', '▁rządu', '.']
['<pad>', '<pad>', '▁Warto', '▁zauważyć', '▁zmianę', '▁stanowiska', '▁rządu', '.', '</s>']
['<pad>', '<pad>', '<s>', '▁A', '▁dlaczego', '▁akurat', '▁50', '▁g', '?']
['<pad>', '<pad>', '▁A', '▁dlaczego', '▁akurat', '▁50', '▁g', '?', '</s>']
['<pad>', '<s>', '▁Otóż', '▁jaką', '▁wiz', 'ję', '▁ma', '▁prezydent', '?']
['<pad>', '▁Otóż', '▁jaką', '▁wiz', 'ję', '▁ma', '▁prezydent', '?', '</s>']
['<pad>', '<s>', '▁W

In [28]:
class RNN(nn.Module):
    def __init__(self, dataset, device, embedding_dim=100, hidden_size = 128, num_layers = 2):
        super(RNN, self).__init__()
        self.device = device

        self.num_layers = num_layers
        self.hidden_size = hidden_size
        
        n_vocab = dataset.vocab_size

        self.embedding = nn.Embedding(
            num_embeddings=n_vocab,
            embedding_dim=embedding_dim,
            padding_idx=0
        )

        self.rnn = nn.RNN(input_size=embedding_dim, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)

        self.fc = nn.Linear(hidden_size, n_vocab)

    def forward(self, x, h0 = None):

        x.to(self.device)

        embed = self.embedding(x)

        if h0 == None:
            if len(x.shape) == 2:
                h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size, device=self.device)
            else:
                h0 = torch.zeros(self.num_layers, self.hidden_size, device = self.device)

        output, state = self.rnn(embed, h0)
        
        logits = self.fc(output)

        return logits, state


        
model = RNN(dataset, device) 
model.to(device)

RNN(
  (embedding): Embedding(8000, 100, padding_idx=0)
  (rnn): RNN(100, 128, num_layers=2, batch_first=True)
  (fc): Linear(in_features=128, out_features=8000, bias=True)
)

In [19]:
# model.load_state_dict(torch.load('./models/RNN_30ep.model'))
model.load_state_dict(torch.load("./models/RNN_1kk_bs50_2ep.model"))
torch.cuda.empty_cache()

In [None]:
def train(dataset, model, max_epochs = 30, batch_size = 1):
    model.train()

    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, collate_fn=pad_collate)
    
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(2, max_epochs):        
        for batch, (x, y) in enumerate(dataloader):
            optimizer.zero_grad()

            y_pred, _ = model(x)
            loss = criterion(y_pred.transpose(1, 2), y)
            
            loss.backward()
            optimizer.step()

            if (batch+1) % 500 == 0:
                print({ 'epoch': epoch, 'batch': batch + 1, 'loss': loss.item() })
                # torch.cuda.empty_cache()
        
        torch.save(model.state_dict(), f"./models/RNN_1kk_bs{batch_size}_{epoch+1}ep.model")
            
train(dataset, model)

In [29]:
model.load_state_dict(torch.load("./models/RNN/RNN_1kk_bs20_30ep.model"))

<All keys matched successfully>

In [10]:
def predict(dataset, model, text, next_words=100):
    model.eval()

    
    ids = dataset.sp_processor.encode_as_ids(text)

    print(ids)

    x = torch.tensor([ids], device=model.device)
    y_pred, hidden_state = model(x)
    
    for i in range(0, next_words):
        last_word_logits = y_pred[0][-1]
        p = torch.nn.functional.softmax(last_word_logits, dim=0).detach().cpu().numpy()
        word_index = np.random.choice(len(last_word_logits), p=p)
        # word_index = np.argmax(p)
        ids.append(int(word_index))

        y_pred, hidden_state = model(torch.tensor([[word_index]], device=model.device), hidden_state)

    print(dataset.ids_to_pieces(ids))
    return dataset.sp_processor.decode_ids(ids)


In [33]:
for _ in range(1):
    print(predict(dataset, model, "Jednakże największe znaczenia zawsze miała", next_words=10))

[4812, 4360, 5182, 1164, 1015]
['▁Jednakże', '▁największe', '▁znaczenia', '▁zawsze', '▁miała', '▁bowiem', '▁do', '▁sytuacji', ',', '▁na', '▁którym', '▁umieszczono', '▁ze', '▁sobą', '▁opodatk']
Jednakże największe znaczenia zawsze miała bowiem do sytuacji, na którym umieszczono ze sobą opodatk


In [34]:
def beam_search(dataset, model, text, max_next_words, n_solutions):
    model.eval()

    words = text.split(' ')

    ids = dataset.sp_processor.encode_as_ids(text)

    x = torch.tensor([ids], device=model.device)

    y_pred, hidden_state = model(x)
    last_word_logits = y_pred[0][-1]
    log_p = torch.nn.functional.log_softmax(last_word_logits, dim=0).detach().cpu().numpy()

    best_indices = np.argsort(log_p)[::-1][:n_solutions]

    solutions = [(list(map(int, ids)) + [int(index)], log_p[index], hidden_state) for index in best_indices]

    for i in range(1, max_next_words):
        new_solutions = []

        for (prefix, score, prefix_state) in solutions:
            x = torch.tensor([[prefix[-1]]], device=model.device)
            y_pred, hi = model(x, prefix_state)
            last_word_logits = y_pred[0][-1]
            log_p = torch.nn.functional.log_softmax(last_word_logits, dim=0).detach().cpu().numpy()
            best_indices = np.argsort(log_p)[::-1][:n_solutions]
            new_solutions += [(prefix + [int(ind)], score + log_p[ind], hi) for ind in best_indices]

        best_indices = np.argsort([score for (_, score, _) in new_solutions])[::-1][:n_solutions]

        solutions = [new_solutions[ind] for ind in best_indices]

    return [(dataset.sp_processor.decode_ids(sent), lp) for (sent, lp, _) in solutions]
    # return solutions
            

In [35]:
beam_search(dataset, model, "świadkowie", max_next_words=3, n_solutions=3)

[('świadkowie zastępczej,', -5.503442),
 ('świadkowie, w tym', -6.339065),
 ('świadkowie publicznego, w', -6.4336476)]

In [40]:
def perplexity(dataset, model):
    model.eval()

    with torch.no_grad():
        criterion = nn.CrossEntropyLoss(reduction='sum')

        logs_sum = 0
        n_samples = 0
        
        for x,y in dataset:
            y_pred, _ = model(x)
            n_samples += len(x)
            logs_sum  += criterion(y_pred, y)

        return np.exp(logs_sum.item() / n_samples)
        

In [41]:
perplexity(test_dataset, model)

tensor(16469591., device='cuda:0')
3251844


158.3318002587777

In [43]:
perplexity(dataset, model)

tensor(1.6401e+08, device='cuda:0')
32565275


153.89469996251682