In [34]:
%load_ext autoreload
%autoreload 2
import os, sys
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
%matplotlib inline
%config InlineBackend.figure_format='retina'; # adapt plots for retina displays
import seaborn as sns
sns.set_style('darkgrid');
sns.set_context(context='notebook');
from typing import List
from tqdm import tqdm_notebook as tqdm

from pathlib import Path
from pprint import pprint
import numpy as np 
import pandas as pd
import io
import ftfy 
import re

import sentencepiece as spm

from studio_client import Environment

2019-03-22 23:08:49,822 : DEBUG : Loaded backend module://ipykernel.pylab.backend_inline version unknown.


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [35]:
env = Environment(project="dummy",
                  studio_endpoint="https://mlstudio.sapai.c.eu-de-1.cloud.sap:30001") 

dataset_path = env.get_file('datasets/ady.txt')

2019-03-22 23:08:50,741 : INFO : Initializing environment with Studio API: https://mlstudio.sapai.c.eu-de-1.cloud.sap:30001
2019-03-22 23:08:50,858 : DEBUG : Loading latest version (1) for datasets/ady.txt from local.


In [36]:
text = []
with io.open(dataset_path, mode='r', encoding='utf-8') as f:
    for line in f:
        line =  ftfy.fix_text(line, normalization='NFKC')
        line = line.replace('\n', '')
        line = re.sub("\s\s+", " ", line)
        line = line.strip()
        if len(line)<=2 : continue
        text += [line]

In [37]:
text[:100]

['Ady Endre összes költeménye',
 'ÚJ VERSEK – 1906',
 'E versek mind–mind a Léda asszonyéi, aki kedvelte és akarta õket. Én el',
 'szoktam pusztítani a verseimet fogyó életem növõ lázában, mély viharzásokon',
 'és poklok tüzében. Ennek a néhány versnek megkegyelmeztem. Engedtem',
 'õket életre jönni, s átnyújtom õket Léda asszonynak.',
 'GÓG ÉS MAGÓG FIA VAGYOK ÉN...',
 'Góg és Magóg fia vagyok én,',
 'Hiába döngetek kaput, falat',
 'S mégis megkérdem tõletek:',
 'Szabad–e sírni a Kárpátok alatt?',
 'Verecke híres útján jöttem én,',
 'Fülembe még õsmagyar dal rivall,',
 'Szabad–e Dévénynél betörnöm',
 'Új idõknek új dalaival?',
 'Fülembe forró ólmot öntsetek',
 'Legyek az új, az énekes Vazul,',
 'Ne halljam az élet új dalait,',
 'Tiporjatok reám durván, gazul.',
 'De addig sírva, kínban, mit se várva',
 'Mégiscsak száll új szárnyakon a dal',
 'S ha elátkozza százszor Pusztaszer,',
 'Mégis gyõztes, mégis új és magyar.',
 'LÉDA ASSZONY ZSOLTÁRAI',
 'A MI GYERMEKÜNK',
 'Bús szerelmünkbõl 

In [38]:
os.makedirs("data", exist_ok=True)
with io.open("data/ady_clean.txt", mode='wt') as f:
    for line in text:
        f.write(line+'\n')

In [39]:
os.listdir('data')

['model.vocab', 'model.pth', 'model.model', 'ady_clean.txt']

In [40]:
logger = logging.getLogger(__name__)

class SentPieceProcessor():
    
    TK_PAD = '<pad>'
    TK_UNK = '<unk>'
    TK_SOS = '<s>'
    TK_EOS = '</s>'
    
    def __init__(self, model_path:str):
        self.sp = spm.SentencePieceProcessor()
        self.sp.Load(str(model_path))
        self.id_unk = self.sp.unk_id()
        self.tk_unk = self.sp.IdToPiece(self.id_unk)
        self.tk_sos = self.sp.IdToPiece(1)
        self.tk_eos = self.sp.IdToPiece(2)
        self.tk_ws = self.sp.IdToPiece(3)
        self.vocab_size = len(self.sp)
        logger.info(f"Initialized SentPieceProcessor from {model_path}")
    
    def numericalize(self, tokens: List[str]) -> List[List[int]]:
        if isinstance(tokens, str): tokens = [tokens]
        ids =  [self.sp.EncodeAsIds(s) for s in tokens]
        if len(ids) == 1: ids=ids[0]
        return ids

    def piecify(self, tokens: List[str]) -> List[List[str]]:
        if isinstance(tokens, str): tokens = [tokens]
        return [self.sp.EncodeAsPieces(s) for s in tokens]
    
    def textify(self, ids: List[int]) -> str:
        if isinstance(ids, list) and isinstance(ids[0], np.generic): 
            ids = [int(x) for x in ids]
        if not isinstance(ids, list) and not isinstance(ids[0], int):
            raise TypeError("Argument `ids` has to be a list of integers.")            
        return self.sp.DecodeIds(ids)
    
    def __repr__(self):
        s = f"SentPieceVocab (size: {self.vocab_size}" \
            f" 0:{self.tk_unk}, 1:{self.tk_sos}, 2:{self.tk_eos}, 3:{self.tk_ws})"    
        return s
    
    @classmethod
    def from_file(cls, input_file:str, output_path:str='default', vocab_size:int=16000, 
               char_cov:float=1.0, model_type:str='unigram'):
        
        assert model_type in ['unigram', 'bpe', 'char', 'word']
        assert 0 < char_cov <= 1
        input_file = str(input_file)
        output_file =  os.path.splitext(str(output_path))[0]
        ext = '.model'
        train_cmd = f"--input={input_file} --model_prefix={output_file} --user_defined_symbols={cls.TK_PAD}"\
                    f" --vocab_size={vocab_size} --character_coverage={char_cov} --model_type={model_type}"

        logger.info(f"Train command: {train_cmd}")
        logger.info(f"Started training SentencePiece model...")
        ret = spm.SentencePieceTrainer.Train(train_cmd)
        logger.info(f"Exit code: {int(ret)}")
        return cls(output_file+ext)

    @classmethod
    def from_texts(cls, texts:List[str], output_path:str='default', vocab_size:int=16000,
                   char_cov:float=1.0, model_type:str='unigram'):
        
        tmp_path = '/tmp/sentencepiece/'
        os.makedirs(tmp_path, exist_ok=True)

        with open(tmp_path+"tmp.txt", 'wt') as fin:
            for line in texts:
                fin.write(line+"\n")
        
        spp = cls.from_file(tmp_path, output_path, vocab_size=vocab_size,
                             char_cov=char_cov, model_type=model_type)
        os.remove(tmp_path+"tmp.txt")
        return spp
    
        
    def _get_state():
        pass
    
    def _set_state(state):
        pass
     

In [41]:
processor = SentPieceProcessor.from_file("data/ady_clean.txt", output_path="data/model", 
                                         char_cov=0.999, vocab_size=32000, model_type='bpe')

2019-03-22 23:08:54,204 : INFO : Train command: --input=data/ady_clean.txt --model_prefix=data/model --user_defined_symbols=<pad> --vocab_size=32000 --character_coverage=0.999 --model_type=bpe
2019-03-22 23:08:54,205 : INFO : Started training SentencePiece model...
2019-03-22 23:09:02,989 : INFO : Exit code: 1
2019-03-22 23:09:03,010 : INFO : Initialized SentPieceProcessor from data/model.model


In [42]:
ids = processor.numericalize("Elmúlt a tél, Léda is elmúlt...")
ids

[11913, 5, 2173, 31931, 2923, 106, 12047, 156]

In [43]:
processor.textify(ids)

'Elmúlt a tél, Léda is elmúlt...'

In [44]:
processor.sp.IdToPiece(0), processor.sp.IdToPiece(1), processor.sp.IdToPiece(2), processor.sp.IdToPiece(3)

('<unk>', '<s>', '</s>', '<pad>')

In [45]:
import torch

from torch import nn, optim, tensor
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader

from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [46]:
from collections import defaultdict


class AdyDataset(Dataset):
    
    def __init__(self, texts, processor: SentPieceProcessor):
        self.texts = texts
        self.processor = processor
                
        self.itos = [self.processor.sp.IdToPiece(i) for i in range(self.processor.vocab_size)]
        self.stoi = defaultdict(int,{v:k for k,v in enumerate(self.itos)}) 
        
        self.vocab_size = self.processor.vocab_size
        self.num_samples = len(texts)
        self.max_len = max(len(t) for t in texts)
    
    def __getitem__(self, idx):
        seq = self.numericalize(self.texts[idx])
        x_len = len(seq)
        pad = (0, self.max_len - x_len)
        seq = F.pad(seq, pad, 'constant', value = self.stoi['<pad>'])
        return {'x': seq[:-1], 
                'y': seq[1:],
                'x_len': x_len}
    def __len__(self): return self.num_samples
    
    def numericalize(self, text:str, sos_idx=1, eos_idx=2):
        assert isinstance(text, str), "text has to be a string"
        sequence =  [sos_idx] + self.processor.numericalize(text) + [eos_idx]
        return torch.tensor(sequence)
    
    def textify(self, ids:list):
        assert isinstance(ids, (list, torch.Tensor)), "ids has to be an iterable (list/tensor)"
        if isinstance(ids, torch.Tensor): ids = list(ids.detach().cpu().numpy())
        return self.processor.textify(ids)

In [47]:
processor = SentPieceProcessor.from_file("data/ady_clean.txt", output_path="data/model", 
                                         char_cov=0.999, vocab_size=32000, model_type='bpe')
data = AdyDataset(text, processor)

2019-03-22 23:09:03,506 : INFO : Train command: --input=data/ady_clean.txt --model_prefix=data/model --user_defined_symbols=<pad> --vocab_size=32000 --character_coverage=0.999 --model_type=bpe
2019-03-22 23:09:03,507 : INFO : Started training SentencePiece model...
2019-03-22 23:09:11,876 : INFO : Exit code: 1
2019-03-22 23:09:11,892 : INFO : Initialized SentPieceProcessor from data/model.model


In [48]:
ids = data.numericalize("Leda elbujt")
data.textify(ids)

'Leda elbujt'

In [49]:
# bs = 4

# dataloader = DataLoader(dataset=data, batch_size=bs, shuffle=True)

# for batch in dataloader:
    
#     sorted_idx = np.array(batch['x_len']).argsort()[::-1].tolist()
#     data_batch = {k:t[sorted_idx] for k,t in batch.items()}
#     print(data_batch)
    
#     break

In [50]:
x = data_batch['x']
x.shape

NameError: name 'data_batch' is not defined

In [179]:
def repackage_hidden(h):
    """Wraps hidden states in new Tensors, to detach them from their history."""
    if isinstance(h, torch.Tensor):
        return h.detach()
    else:
        return tuple(repackage_hidden(v) for v in h)
    
class RNN(nn.Module):
    def __init__(self, vocab_size, embedding_size=128, hidden_size=128, num_layers=1, 
                 dropout_emb=0.2, dropout_lstm=0.0, tie_weights=True):
        super().__init__()
        
        self.dropout = nn.Dropout(p=dropout_emb)
        self.encoder = nn.Embedding(vocab_size, embedding_size, padding_idx=3)
        self.lstm = nn.LSTM(embedding_size, hidden_size, num_layers, bias=True, 
                           batch_first=True, dropout=dropout_lstm)
        

        self.decoder = nn.Linear(hidden_size, vocab_size)
        
        if tie_weights:
            if embedding_size != hidden_size:
                raise ValueError('When using the tied flag, nhid must be equal to emsize')
            self.decoder.weight = self.encoder.weight
        
        self.init_weights()
        self.embedding_size=embedding_size
        self.hidden_size=hidden_size
        self.num_layers=num_layers
        
    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)
    
    def init_hidden(self, bs):
        weight = next(self.parameters())
        return (weight.new_zeros(self.num_layers, bs, self.hidden_size),
                weight.new_zeros(self.num_layers, bs, self.hidden_size))
    
    def forward(self, inputs, hidden=None, softmax=False):  
        emb = self.dropout(self.encoder(inputs))
        output, hidden = self.lstm(emb, hidden)
        output = self.dropout(output)
        bs, sl, hsz = output.shape
        output = output.contiguous().view(bs * sl, hsz)
        output = self.decoder(output).view(bs, sl, -1)
        
        return (F.softmax(output, dim=2), hidden) if softmax else (output, hidden)

In [180]:
model = RNN(data.vocab_size, embedding_size=32, hidden_size=32, num_layers=2, tie_weights=True)

In [42]:
b = next(iter(dataloader))
inp, lens, targ = b['x'], b['x_len'], b['y']

In [43]:
output, hidden = model(inp)

In [44]:
inp.shape, data.vocab_size, data.max_len-1

(torch.Size([4, 81]), 32000, 81)

In [45]:
targ.shape, output.shape

(torch.Size([4, 81]), torch.Size([4, 81, 32000]))

In [46]:
_, pred_inds = torch.max(output, dim=1)
pred_inds.shape

torch.Size([4, 32000])

## train model

In [184]:
def repackage_hidden(h):
    """Wraps hidden states in new Tensors, to detach them from their history."""
    if isinstance(h, torch.Tensor):
        return h.detach()
    else:
        return tuple(repackage_hidden(v) for v in h)
    
class RNN(nn.Module):
    def __init__(self, vocab_size, embedding_size=128, hidden_size=128, num_layers=2, 
                 dropout_emb=0.1, dropout_lstm=0.25, tie_weights=True):
        super().__init__()
        
        self.dropout = nn.Dropout(p=dropout_emb)
        self.encoder = nn.Embedding(vocab_size, embedding_size, padding_idx=0)
        self.lstm = nn.LSTM(embedding_size, hidden_size, num_layers, bias=True, 
                           batch_first=True, dropout=dropout_lstm)
        

        self.decoder = nn.Linear(hidden_size, vocab_size)
        
        if tie_weights:
            if embedding_size != hidden_size:
                raise ValueError('When using the tied flag, nhid must be equal to emsize')
            self.decoder.weight = self.encoder.weight
        
        self.init_weights()
        self.embedding_size=embedding_size
        self.hidden_size=hidden_size
        self.num_layers=num_layers
        
    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)
    
    def init_hidden(self, bs):
        weight = next(self.parameters())
        return (weight.new_zeros(self.num_layers, bs, self.hidden_size),
                weight.new_zeros(self.num_layers, bs, self.hidden_size))
    
    def forward(self, inputs, hidden=None, softmax=False):  
        emb = self.dropout(self.encoder(inputs))
        output, hidden = self.lstm(emb, hidden)
        output = self.dropout(output)
        bs, sl, hsz = output.shape
        output = output.contiguous().view(bs * sl, hsz)
        output = self.decoder(output).view(bs, sl, -1)
        
        return (F.softmax(output, dim=2), hidden) if softmax else (output, hidden)

In [12]:
processor = SentPieceProcessor.from_file("data/ady_clean.txt", output_path="data/model", 
                                         char_cov=0.999, vocab_size=32000, model_type='bpe')
data = AdyDataset(text, processor)

2019-03-22 21:51:51,514 : INFO : Train command: --input=data/ady_clean.txt --model_prefix=data/model --user_defined_symbols=<pad> --vocab_size=32000 --character_coverage=0.999 --model_type=bpe
2019-03-22 21:51:51,516 : INFO : Started training SentencePiece model...
2019-03-22 21:51:59,470 : INFO : Exit code: 1
2019-03-22 21:51:59,487 : INFO : Initialized SentPieceProcessor from data/model.model


In [185]:
bs = 40
train_dl = DataLoader(dataset=data, batch_size=bs, shuffle=True)

num_epochs = 200
max_norm = .5

lr = 0.03
log_interval = 300

embedding_size=256
hidden_size=256
num_layers=1
tie_weights=True

model = RNN(data.vocab_size, embedding_size=embedding_size, 
            hidden_size=hidden_size, num_layers=num_layers, tie_weights=True).to(device)

opt_fn = optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss(ignore_index=data.stoi['<pad>'], reduction='mean')

  "num_layers={}".format(dropout, num_layers))


In [186]:
model

RNN(
  (dropout): Dropout(p=0.1)
  (encoder): Embedding(32000, 256, padding_idx=0)
  (lstm): LSTM(256, 256, batch_first=True, dropout=0.25)
  (decoder): Linear(in_features=256, out_features=32000, bias=True)
)

In [None]:
losses=[]
hidden = model.init_hidden(bs)
try:
    for epoch in range(num_epochs):
        total_loss = 0.0
        for i, batch in enumerate(tqdm(train_dl)):
            x, y = batch['x'].to(device), batch['y'].to(device)
            x_len =  batch['x_len'].to(device)
#             hidden = repackage_hidden(hidden)
            model.zero_grad()
            output, hidden = model(x, softmax=False)
            _, pred = torch.max(output, dim=2)
            loss = criterion(output.permute(0, 2, 1), y)
            loss.backward()
            if max_norm:
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)

            opt_fn.step()

            total_loss += loss.item()
            losses.append(loss.item())

#             if i % log_interval == 0 and i > 0:
#                 cur_loss = total_loss / log_interval
#                 print("epoch {} | batch {}/{} | loss {:5.2f} | ppl {:8.2f}".format(
#                     epoch, i, len(train_dl), cur_loss, np.exp(cur_loss)))
#                 total_loss = 0
        epoch_loss = total_loss / len(train_dl)
        print("epoch {} | loss {:5.2f} | ppl {:8.2f}".format(epoch, epoch_loss, math.exp(epoch_loss)))

except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')

finally:
    torch.save(model.state_dict(), "data/model.pth")

HBox(children=(IntProgress(value=0, max=756), HTML(value='')))

epoch 0 | loss  5.87 | ppl   355.55


HBox(children=(IntProgress(value=0, max=756), HTML(value='')))

epoch 1 | loss  5.89 | ppl   359.71


HBox(children=(IntProgress(value=0, max=756), HTML(value='')))

epoch 2 | loss  5.88 | ppl   358.55


HBox(children=(IntProgress(value=0, max=756), HTML(value='')))

epoch 3 | loss  5.87 | ppl   356.01


HBox(children=(IntProgress(value=0, max=756), HTML(value='')))

epoch 4 | loss  5.89 | ppl   360.66


HBox(children=(IntProgress(value=0, max=756), HTML(value='')))

epoch 5 | loss  5.89 | ppl   359.93


HBox(children=(IntProgress(value=0, max=756), HTML(value='')))

epoch 6 | loss  5.89 | ppl   361.58


HBox(children=(IntProgress(value=0, max=756), HTML(value='')))

epoch 7 | loss  5.88 | ppl   356.63


HBox(children=(IntProgress(value=0, max=756), HTML(value='')))

In [131]:
inp = "S a táj"
# inp = torch.tensor([data.stoi[t] for t in inp]).unsqueeze(0)

In [135]:
input = data.numericalize(inp).unsqueeze(0).to(device)
emb = model.encoder(input)
out, hid = model.lstm(emb, hid)
out = model.decoder(out)

NameError: name 'hid' is not defined

In [168]:
inp = "S a táj"

hidden = model.init_hidden(1)
vocab_size = data.vocab_size
max_len=10
temp = 0.9
#
# input = torch.randint(vocab_size, (1, 1), dtype=torch.long).to(device)

input = data.numericalize(inp).long().unsqueeze(0).to(device)
with torch.no_grad():  # no tracking history
    for i in range(max_len):
        output, hidden = model(input, hidden)
        word_weights = output.squeeze().div(temp).exp().cpu()
        word_idx = torch.multinomial(word_weights, 1)[0]
        input.fill_(word_idx)
        word = data.processor.sp.IdToPiece(int(word_idx.item()))
#         outf.write(word + ('\n' if i % 20 == 19 else ' '))
        print(word)

RuntimeError: _th_fill_ only supports a 0-dimensional value tensor, but got tensor with 1 dimension(s).

In [166]:
input.shape

torch.Size([1, 1])

In [73]:
def generate_example(model, stoi, itos, temperature=1.0,  max_len=100, hidden_state=None):

    start_token, start_idx = '<s>', 1
    
    # Start state.
    inputs = torch.tensor(stoi[start_token]).unsqueeze(0).unsqueeze(0).to(device)

    sentence = [start_token]
    i = 0
    while i < max_len and sentence[-1] not in ['</s>', '<pad>']:
        i += 1
        
        embedded = model.encoder(inputs)
        output, hidden_state = model.lstm(embedded, hidden_state)

        batch_size, sequence_len, hidden_size = output.shape
        output = output.contiguous().view(batch_size * sequence_len, hidden_size)    
        output = model.decoder(output).view(batch_size, sequence_len, -1).squeeze(0)
        #_, prediction = torch.max(F.softmax(output, dim=2), dim=2)
        
        word_weights = output.div(temperature).exp().cpu()
        if len(word_weights.shape) > 1:
            word_weights = word_weights[-1] # Pick the last word.    
        
        word_idx = torch.multinomial(word_weights, 1).view(-1)
        
        sentence.append(itos[int(word_idx)])
        
        inputs = tensor([stoi[word] for word in sentence]).unsqueeze(0).to(device)
        
    print(''.join(sentence))

In [93]:
for _ in range(10):
    generate_example(model, data.stoi, data.itos, max_len=50, temperature=0.5)

<s>▁A▁Halált,</s>
<s>▁Mi–voltom,</s>
<s>▁S▁ha▁a▁sok▁világ,</s>
<s>▁S▁nem,▁magyar▁a▁legszebb</s>
<s>▁S▁nem▁a▁Sors▁régi</s>
<s>▁S▁egy–e,▁a▁messze,</s>
<s>▁A▁nagy,▁szent▁tervelõk</s>
<s>▁S▁a▁magyar,</s>
<s>▁S▁ha▁én,▁hogy,</s>
<s>▁Egy▁a▁magyar.</s>


In [65]:
text[50:100]

['A VÁR FEHÉR ASSZONYA',
 'A lelkem ódon, babonás vár,',
 'Mohos, gõgös és elhagyott.',
 '(A két szemem, ugye, milyen nagy?',
 'És nem ragyog és nem ragyog.)',
 'Konganak az elhagyott termek,',
 'A bús falakról rámered',
 'Két nagy, sötét ablak a völgyre.',
 '(Ugye, milyen fáradt szemek?)',
 'Örökös itt a lélekjárás,',
 'A kripta–illat és a köd.',
 'Árnyak suhognak a sötétben',
 'S elátkozott had nyöszörög.',
 '(Csak néha, titkos éji órán',
 'Gyúlnak ki e bús, nagy szemek.)',
 'A fehér asszony jár a várban',
 'S az ablakokon kinevet.',
 'MERT ENGEM SZERETSZ',
 'Áldott csodáknak',
 'Tükre a szemed,',
 'Mert engem nézett.',
 'Te vagy a bölcse,',
 'Mesterasszonya',
 'Az ölelésnek.',
 'Áldott ezerszer',
 'Az asszonyságod,',
 'Mert engem nézett,',
 'Mert engem látott.',
 'S mert nagyon szeretsz:',
 'Nagyon szeretlek',
 'S mert engem szeretsz:',
 'Te vagy az Asszony,',
 'Te vagy a legszebb.',
 'A KÖNNYEK ASSZONYA',
 'Bús arcát érzem szívemen',
 'A könnyek asszonyának,',
 'Rózsás, remegõ ujja