<a href="https://colab.research.google.com/github/ben0it8/poetry-language-model/blob/master/mixed_language_model_wordpiece.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##### Install extra requirements

In [0]:
# Extra packages that are imported further down: ftfy (used by the text-cleaning
# helpers) and sentencepiece (used by the Tokenizer class).
# The recorded run below resolved these to ftfy 5.5.1 and sentencepiece 0.1.81.
!pip install -U ftfy
!pip install -U sentencepiece

Collecting ftfy
[?25l  Downloading https://files.pythonhosted.org/packages/8f/86/df789c5834f15ae1ca53a8d4c1fc4788676c2e32112f6a786f2625d9c6e6/ftfy-5.5.1-py3-none-any.whl (43kB)
[K    100% |████████████████████████████████| 51kB 2.5MB/s 
Installing collected packages: ftfy
Successfully installed ftfy-5.5.1
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/7e/8a/0e4a10bc00a0263db8d45d0062c83892598eb58e8091f439c63926e9b107/sentencepiece-0.1.81-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)
[K    100% |████████████████████████████████| 1.0MB 13.6MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.81


In [0]:
import os, sys
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
%matplotlib inline
%config InlineBackend.figure_format='retina'; # adapt plots for retina displays
import seaborn as sns
sns.set_style('darkgrid');
sns.set_context(context='notebook');
from typing import List
from tqdm import tqdm_notebook as tqdm
from pathlib import Path
from pprint import pprint
import numpy as np 
import math
import time
import pandas as pd
import io
import ftfy 
import re
import requests
import os
import torch
from collections import defaultdict
import dill
import pickle

import torch
from torch import nn, optim, tensor
from torch.nn import functional as F
from random import sample

from google.colab import files
from zipfile import ZipFile

import sentencepiece as spm

# Module-level logger; the Tokenizer class reports its progress through it.
logger = logging.getLogger(__name__)

# Working directory for the cleaned corpus, the tokenizer files and the model
# checkpoint. It is created (including parents) if it does not exist yet.
data_dir = Path("data/mixed").resolve()
data_dir.mkdir(exist_ok=True, parents=True)
# Location of the raw text corpus that is downloaded and cleaned below.
url = "https://raw.githubusercontent.com/ben0it8/ady/master/data/merged.txt" 

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [0]:
def get_file_paths(path):
  """Return the regular, non-zip files directly inside `path` as a list of strings.

  Sub-directories are not descended into, and any entry whose name ends with
  '.zip' is left out.
  """
  directory = Path(path)
  return [str(entry) for entry in directory.glob('*')
          if entry.is_file() and not str(entry).endswith('.zip')]

def zip_dir(path, name):
  """Zip the non-zip files directly inside `path` into an archive stored in `path`.

  Args:
    path: Directory to archive, as a `str` or `Path`.
    name: Archive file name; '.zip' is appended when it is missing.

  Returns:
    The resolved path of the archive that was written.
  """
  # Accept plain strings as well: the `/` joins below need a Path object.
  path = Path(path)
  if not name.endswith('.zip'): name += '.zip'
  archive_path = path/name
  # Collect the members before the archive file is created inside `path`.
  file_paths = get_file_paths(path)

  # `archive` instead of `zip`, so the builtin is not shadowed.
  with ZipFile(archive_path, 'w') as archive:
    for file in file_paths:
      # Store each file under its bare name, without the directory part.
      archive.write(file, os.path.basename(file))
  print(f"Zipped files at {path}")
  return archive_path.resolve()

## Read data

## Clean data & write to disk

In [60]:
def is_title(t):
  """Return True when `t` looks like an all-caps title line.

  Digits, whitespace and the characters `-`, `!`, `?`, `,` are ignored; the
  remaining text must contain at least one cased character and no lowercase
  ones (the semantics of `str.isupper`).
  """
  # Raw string: `\d`, `\s` and `\-` are regex escapes, not Python string escapes.
  stripped = re.sub(r'[\d\s\-!?,]', '', t)
  return stripped.isupper()

def fix_text(t:str):
  """Normalise a single line of raw text.

  Runs the text through `ftfy.fix_text` with NFKC normalisation, then removes
  newlines, the quotation marks » and «, and all ASCII digits, collapses runs of
  two or more whitespace characters into one space and strips both ends.

  Returns:
    The cleaned string (possibly empty).
  """
  t = ftfy.fix_text(t, normalization='NFKC')
  t = t.replace('\n', '') # remove newlines
  t = re.sub(r'[»«]', '', t) # remove special parenthesis
  t = re.sub(r'[0-9]','', t) # remove digits
  # Raw string so that `\s` reaches the regex engine as written.
  t = re.sub(r"\s\s+", " ", t) # collapse runs of whitespace
  t = t.strip()
  return t
    
def fix_texts(texts:list):
  """Split raw lines into verse lines, clean them and drop unusable ones.

  Each input line is split on the separators U+2424, U+2028 and U+000A. Every
  segment is cleaned with `fix_text` and then discarded when it
    * has three or fewer non-space characters,
    * is an all-caps title according to `is_title`,
    * starts with one of `. , ? ! - ;`, or
    * is longer than 100 characters.

  Args:
    texts: Iterable of raw text lines.

  Returns:
    List of cleaned lines, in input order.
  """
  out = []
  for raw_line in texts:
    # Split on every separator at once. The previous if/elif chain used only the
    # first kind it found and left the other separators embedded in the segment.
    for t in re.split('[\u2424\u2028\u000A]', raw_line):
      t = fix_text(t)
      if (len(t.replace(' ', '')) <= 3 or is_title(t) or
          t.startswith(('.', ',', '?', '!', '-', ';')) or len(t) > 100):
        continue
      out.append(t)
  return out

# Download the raw corpus. Stop on an HTTP error rather than "cleaning" an error page.
response =  requests.get(url)
response.raise_for_status()
texts = [line.decode() for line in response.iter_lines()]
print(f"No. of lines: {len(texts)}")
clean_texts = fix_texts(texts)
# Write through a context manager so the file is flushed and closed before it is
# read back. UTF-8 is explicit because SentencePiece is trained on this file.
with (data_dir/'text_clean.txt').open(mode='wt', encoding='utf-8') as clean_file:
  clean_file.writelines(f"{line}\n" for line in clean_texts)
print(f"No. of lines: {len(clean_texts)}")
# NOTE: the shuffle happens after the file was written, so it only reorders the
# in-memory list; the file on disk keeps the original line order.
np.random.shuffle(clean_texts)

No. of lines: 163126
No. of lines: 177393


## Define Tokenizer (a SentencePiece wrapper) and Corpus (data handler)

In [0]:
class Tokenizer():
    """Wrapper around a trained SentencePiece model for encoding and decoding text.

    The class-level ids are the values this notebook uses for the unknown,
    start-of-sequence and end-of-sequence pieces; `__init__` looks the latter two
    up in the loaded model to obtain the matching piece strings.
    """
    
    ID_UNK = 0
    ID_SOS = 1
    ID_EOS = 2
    
    def __init__(self, model_path:str):
        """Load the SentencePiece model stored at `model_path`."""
        self.sp = spm.SentencePieceProcessor()
        self.sp.Load(str(model_path))
        self.id_unk = self.sp.unk_id()
        self.tk_unk = self.sp.IdToPiece(self.id_unk)
        self.tk_sos = self.sp.IdToPiece(self.ID_SOS)
        self.tk_eos = self.sp.IdToPiece(self.ID_EOS)
        self.vocab_size = len(self.sp)
        logger.info(f"Initialized SentPieceProcessor from {model_path}")
    
    def numericalize(self, tokens: List[str]) -> List[List[int]]:
        """Encode one string or a list of strings into piece ids.

        Returns a list of id lists, one per input string. When there is exactly
        one input (a bare string or a one-element list) the single id list is
        returned directly instead of being wrapped in an outer list.
        """
        if isinstance(tokens, str): tokens = [tokens]
        ids =  [self.sp.EncodeAsIds(s) for s in tokens]
        if len(ids) == 1: ids=ids[0]
        return ids

    def piecify(self, tokens: List[str]) -> List[List[str]]:
        """Encode one string or a list of strings into subword pieces (always a list of lists)."""
        if isinstance(tokens, str): tokens = [tokens]
        return [self.sp.EncodeAsPieces(s) for s in tokens]
    
    def textify(self, ids: List[int]) -> str:
        """Decode a list of piece ids back into text.

        NumPy scalar ids are converted to plain Python ints first.

        Raises:
            TypeError: if `ids` is not a list, or contains non-integer items.
        """
        # Either failed condition is an error. The previous check joined them with
        # `and`, so e.g. a list of strings slipped through, and it indexed `ids[0]`
        # even when the list was empty.
        if not isinstance(ids, list):
            raise TypeError("Argument `ids` has to be a list of integers.")
        ids = [int(x) if isinstance(x, np.generic) else x for x in ids]
        if not all(isinstance(x, int) for x in ids):
            raise TypeError("Argument `ids` has to be a list of integers.")
        return self.sp.DecodeIds(ids)
    
    @classmethod
    def from_file(cls, input_file:str, output_path:str='default', vocab_size:int=16000, 
                  char_cov:float=1.0, model_type:str='unigram'):
        """Train a SentencePiece model on `input_file` and return a Tokenizer for it.

        Args:
            input_file: Text file to train on.
            output_path: Prefix for the model files; any extension is dropped and
                '.model' is appended when the trained model is loaded.
            vocab_size: Number of pieces in the vocabulary.
            char_cov: Character coverage, in the range (0, 1].
            model_type: One of 'unigram', 'bpe', 'char', 'word'.

        Raises:
            AssertionError: if `model_type` or `char_cov` is out of range.
        """
        assert model_type in ['unigram', 'bpe', 'char', 'word']
        assert 0 < char_cov <= 1
        input_file = str(input_file)
        output_file =  os.path.splitext(str(output_path))[0]
        ext = '.model'
        train_cmd = f"--input={input_file} --model_prefix={output_file}"\
                    f" --vocab_size={vocab_size} --character_coverage={char_cov} --model_type={model_type}"

        logger.info(f"Train command: {train_cmd}")
        logger.info("Started training SentencePiece model...")
        ret = spm.SentencePieceTrainer.Train(train_cmd)
        # The recorded run logged `1` here. Guard the int() conversion in case the
        # installed sentencepiece release returns None instead of a status value
        # (TODO confirm against the installed version).
        logger.info(f"Exit code: {ret if ret is None else int(ret)}")
        return cls(output_file+ext)
      
def batchify(data, bsz):
    """Reshape a 1-D token tensor into `bsz` parallel columns on `device`.

    Trailing elements that do not fill a complete row across all `bsz` columns
    are dropped. The result has shape (len(data) // bsz, bsz).
    """
    # Number of full rows available once the data is spread over bsz columns.
    rows = data.size(0) // bsz
    # Keep only the leading part that divides evenly.
    trimmed = data[:rows * bsz]
    # Lay the data out as bsz contiguous streams, then make each stream a column.
    columns = trimmed.view(bsz, -1).t().contiguous()
    return columns.to(device)

class Corpus(object):
  """Tokenized text file, arranged into `bs` parallel columns for training.

  Attributes are set in `__init__`:
    processor: the tokenizer used to turn lines into ids.
    id_sos / id_eos: ids wrapped around every line, taken from the tokenizer.
    bs: number of columns in `data`.
    data: tensor produced by `batchify` from the whole file.
    vocab_size: vocabulary size reported by the tokenizer.
  """

  def __init__(self, file_path, tokenizer, bs=20):
    self.processor = tokenizer
    self.id_sos, self.id_eos = self.processor.ID_SOS, self.processor.ID_EOS
    self.bs = bs
    self.data = self.tokenize(file_path)
    self.vocab_size = self.processor.vocab_size

  def tokenize(self, path):
    """Tokenizes a text file.

    Every line is encoded with the tokenizer and wrapped in the start/end ids;
    all lines are concatenated into one id stream and passed to `batchify`.
    """
    ids = []
    # Read as UTF-8 explicitly: the SentencePiece model is trained on this same
    # file as UTF-8, so the platform default encoding must not be used here.
    with open(path, 'r', encoding='utf-8') as f:
      for line in f:
        numericalized = self.processor.numericalize(line)
        ids.extend([self.id_sos] + numericalized + [self.id_eos])

    return batchify(torch.LongTensor(ids), self.bs)

## Define RNN model architecture, training loop and helpers

In [0]:
class RNNModel(nn.Module):
    """Language model: embedding -> GRU/LSTM -> linear decoder over the vocabulary.

    Args:
        rnn_type: 'LSTM' or 'GRU'.
        ntoken: Vocabulary size (embedding rows and decoder outputs).
        emsize: Embedding dimension.
        nhid: Hidden size of the recurrent layers.
        nlayers: Number of stacked recurrent layers.
        dropout: Dropout probability applied to the embeddings and RNN outputs,
            and between recurrent layers when `nlayers > 1`.
        tie_weights: Share the decoder weight with the embedding (requires
            `nhid == emsize`).

    Raises:
        AssertionError: if `rnn_type` is not 'LSTM' or 'GRU'.
        ValueError: if `tie_weights` is set and `nhid != emsize`.
    """

    def __init__(self, rnn_type, ntoken, emsize, nhid, nlayers, dropout=0.5, tie_weights=False):
        super().__init__()
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, emsize)
        assert (rnn_type in ['LSTM', 'GRU']), "Arg `rnn_type` has to be one of {GRU, LSTM}."
        # PyTorch applies the RNN's own dropout only between stacked layers and
        # warns when it is non-zero for a single layer. Pass 0 in that case: the
        # computation is unchanged and the warning goes away.
        inter_layer_dropout = dropout if nlayers > 1 else 0.0
        self.rnn = getattr(nn, rnn_type)(emsize, nhid, nlayers, dropout=inter_layer_dropout)
        self.decoder = nn.Linear(nhid, ntoken)

        if tie_weights:
            if nhid != emsize:
                raise ValueError('When using the tied flag, nhid must be equal to emsize')
            # Decoder and embedding share one parameter tensor.
            self.decoder.weight = self.encoder.weight

        self.init_weights()

        self.rnn_type = rnn_type
        self.nhid = nhid
        self.nlayers = nlayers

    def init_weights(self):
        """Initialise embedding/decoder weights uniformly in [-0.1, 0.1] and zero the decoder bias."""
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, input, hidden):
        """Run one chunk through the model.

        Args:
            input: Integer tensor of token ids; the RNN is built without
                `batch_first`, so the layout is (seq_len, batch).
            hidden: Hidden state as returned by `init_hidden` or a previous call.

        Returns:
            (logits, hidden) where logits has shape (seq_len, batch, ntoken).
        """
        emb = self.drop(self.encoder(input))
        output, hidden = self.rnn(emb, hidden)
        output = self.drop(output)
        seq_len, batch, feat = output.size(0), output.size(1), output.size(2)
        # Flatten time and batch so the linear decoder sees a 2-D input.
        decoded = self.decoder(output.view(seq_len * batch, feat))
        return decoded.view(seq_len, batch, decoded.size(1)), hidden

    def init_hidden(self, bsz):
        """Return a zero hidden state for batch size `bsz` (a pair of tensors for LSTM)."""
        # Borrow dtype and device from an existing parameter.
        weight = next(self.parameters())
        if self.rnn_type == 'LSTM':
            return (weight.new_zeros(self.nlayers, bsz, self.nhid),
                    weight.new_zeros(self.nlayers, bsz, self.nhid))
        else:
            return weight.new_zeros(self.nlayers, bsz, self.nhid)
       
      
def repackage_hidden(h):
  """Detach a hidden state from the graph that produced it.

  A tensor is returned detached; any other iterable is processed element by
  element and returned as a tuple.
  """
  if not isinstance(h, torch.Tensor):
    return tuple(repackage_hidden(item) for item in h)
  return h.detach()
      
def get_num_params(model):
  """Count the scalar entries of all parameters of `model` that require gradients."""
  total = 0
  for param in model.parameters():
    if param.requires_grad:
      total += param.nelement()
  return total

def get_batch(source, i, bptt):
    """Cut one training chunk out of `source`, starting at row `i`.

    Returns `(data, target)`: `data` holds up to `bptt` rows, and `target` holds
    the same rows shifted one step ahead, flattened to 1-D. The chunk is
    shortened near the end so the shifted target never runs past `source`.
    """
    rows_left = len(source) - 1 - i
    seq_len = bptt if bptt < rows_left else rows_left
    stop = i + seq_len
    data = source[i:stop]
    target = source[i + 1:stop + 1].view(-1)
    return data, target
  
def train_epoch(train_data, model, vocab_size, bs=16, bptt=20, clip=.25,
                loss_fn=None, opt=None):
    """Train `model` for one pass over `train_data` and return the average loss.

    Args:
        train_data: Batched id tensor as produced by `batchify`; `bs` must match
            its number of columns.
        model: Model exposing `init_hidden(bs)` and returning `(output, hidden)`.
        vocab_size: Size of the output dimension, used to flatten the logits.
        bs: Batch size used to create the initial hidden state.
        bptt: Number of rows per chunk.
        clip: Maximum gradient norm.
        loss_fn: Loss callable; defaults to the notebook-level `criterion`.
        opt: Optimizer; defaults to the notebook-level `optimizer`.

    Returns:
        Sum of per-chunk losses weighted by chunk length, divided by
        `len(train_data) - 1`.
    """
    # Explicit arguments remove the hidden dependency on notebook globals, while
    # existing calls that rely on `criterion` / `optimizer` keep working.
    if loss_fn is None:
        loss_fn = criterion
    if opt is None:
        opt = optimizer

    model.train()
    total_loss = 0.0
    hidden = model.init_hidden(bs)
    for i in range(0, train_data.size(0) - 1, bptt):
        data, targets = get_batch(train_data, i, bptt)

        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        loss = loss_fn(output.view(-1, vocab_size), targets)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        opt.step()
        # Weight by chunk length so a shorter final chunk counts proportionally.
        total_loss += len(data) * loss.item()

    return total_loss / (len(train_data) - 1)

In [63]:
# INIT AND TRAIN TOKENIZER FROM CLEAN TEXTS
bs = 64  # batch size: number of parallel columns the corpus is split into

# SentencePiece algorithm. It has its own name because the RNN configuration
# further down assigns `model_type` as well; sharing one global meant that
# re-running this cell afterwards passed the RNN type to SentencePiece.
tokenizer_model_type = 'unigram' # can be "bpe"/"unigram" to support wordpieces

vocab_size = 22000 # how many wordpieces to consider

tokenizer = Tokenizer.from_file(data_dir/'text_clean.txt', 
                                output_path=data_dir/'tokenizer', char_cov=1.0,
                                model_type=tokenizer_model_type, vocab_size=vocab_size) 

# INIT CORPUS FROM CLEAN TEXTS AND TOKENIZER WITH BATCH_SIZE `BS`

corpus = Corpus(data_dir/'text_clean.txt', tokenizer=tokenizer, bs=bs)

2019-04-10 15:49:19,385 : INFO : Train command: --input=/content/data/mixed/text_clean.txt --model_prefix=/content/data/mixed/tokenizer --vocab_size=22000 --character_coverage=1.0 --model_type=unigram
2019-04-10 15:49:19,388 : INFO : Started training SentencePiece model...
2019-04-10 15:49:55,676 : INFO : Exit code: 1
2019-04-10 15:49:55,749 : INFO : Initialized SentPieceProcessor from /content/data/mixed/tokenizer.model


In [73]:
# RNN HYPERPARAMETERS (this configuration builds a GRU, not an LSTM)
model_type='GRU'  # recurrent cell type passed to RNNModel: 'GRU' or 'LSTM'

# Embedding and hidden sizes; RNNModel requires them to be equal when weights are tied.
emsize = 600
nhid = 600

nlayers = 1  # number of stacked recurrent layers

dropout = 0.05  # dropout probability handed to RNNModel
clip = 3.5  # maximum gradient norm used by train_epoch

tied = True  # share the decoder weight with the embedding

bptt = 80  # rows per training chunk

lr = 1e-3  # learning rate for Adam

# INIT RNN MODEL, LOSS FUNCTION AND OPTIMIZER

model = RNNModel(model_type, corpus.vocab_size, emsize, nhid, nlayers, dropout, tied).to(device)
print(f"No. of parameters: {get_num_params(model)}")

# train_epoch reads `criterion` and `optimizer` from the notebook namespace.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

  "num_layers={}".format(dropout, num_layers))


No. of parameters: 15385600


## Let's train the model

In [74]:
# Upper bound on the number of passes over the corpus; interrupt the kernel to stop earlier.
epochs = 100

start_time = time.time()
try:
  
  for epoch in range(1, epochs+1):

    # `loss` is the average loss returned by train_epoch for this pass.
    loss = train_epoch(corpus.data, model, corpus.vocab_size, 
                       clip=clip, bs=bs, bptt=bptt)
    # Elapsed time is cumulative since training started, not per epoch.
    elapsed = time.time() - start_time
    print("epoch {:3d} | loss {:5.2f} | perplexity {:8.2f}| elapsed {:5.2f}s ".format(epoch, loss, math.exp(loss), elapsed))

except KeyboardInterrupt:
  print('-' * 89)
  print('Exiting from training early')

finally:
  # Runs whether training finished, was interrupted or failed: store the weights
  # together with the constructor arguments needed to rebuild the model.
  params = {"model_type": model_type,
            "ntoken": corpus.vocab_size,
            "emsize": emsize,
            "nhid": nhid,
            "nlayers": nlayers,
            "dropout": dropout,
            "tied": tied}
 
  with open(data_dir/'model_state.pth', 'wb') as f:
    torch.save({"state_dict": model.state_dict(),
                "params": params}, f)
    
# Bundle the files in data_dir into one archive named after the directory.
zipfile = zip_dir(data_dir, data_dir.name)

epoch   1 | loss  5.89 | perplexity   360.75| elapsed 98.40s 
epoch   2 | loss  5.24 | perplexity   187.93| elapsed 196.78s 
epoch   3 | loss  4.90 | perplexity   134.45| elapsed 295.31s 
epoch   4 | loss  4.65 | perplexity   104.25| elapsed 393.90s 
epoch   5 | loss  4.43 | perplexity    83.63| elapsed 492.40s 
epoch   6 | loss  4.22 | perplexity    68.05| elapsed 590.94s 
epoch   7 | loss  4.03 | perplexity    56.03| elapsed 689.58s 
epoch   8 | loss  3.84 | perplexity    46.47| elapsed 788.13s 
epoch   9 | loss  3.66 | perplexity    38.94| elapsed 886.69s 
epoch  10 | loss  3.50 | perplexity    33.07| elapsed 985.27s 
epoch  11 | loss  3.35 | perplexity    28.38| elapsed 1083.89s 
epoch  12 | loss  3.20 | perplexity    24.54| elapsed 1182.39s 
epoch  13 | loss  3.07 | perplexity    21.49| elapsed 1280.95s 
epoch  14 | loss  2.95 | perplexity    19.09| elapsed 1379.50s 
epoch  15 | loss  2.84 | perplexity    17.18| elapsed 1478.02s 
epoch  16 | loss  2.75 | perplexity    15.64| elaps

## Generate verses! 

In [0]:
from random import sample
import torch

def is_unbalanced(s):
  """Return True when `s` has an odd number of double quotes or unequal counts of '(' and ')'."""
  odd_quotes = s.count('"') % 2 != 0
  paren_mismatch = s.count('(') != s.count(')')
  return odd_quotes or paren_mismatch

def sample_punkt():
  """Pick one of '.', '?' or '!' at random."""
  terminators = ['.', '?', '!']
  picked = sample(terminators, 1)
  return picked[0]

def parse_last_line(s):
  """Make sure the closing line of a verse ends with '.', '?' or '!'.

  A trailing comma is replaced by a randomly chosen terminator; a line ending in
  anything else gets a terminator appended. An empty string is returned
  unchanged (it previously raised IndexError).
  """
  if not s:
    return s
  if s[-1] == ',':
    s = s[:-1] + sample_punkt()
  if s[-1] not in '.?!':
    s += sample_punkt()
  return s
  
def generate_line(model, hidden=None, temp=1.0,
                  sos_id=1, eos_id=2, unk_id=0, max_len=None):
  """Sample one line of token ids from `model`.

  Sampling starts from `sos_id` and stops after `eos_id` has been drawn or once
  `max_len` ids have been collected. `sos_id` and `unk_id` are never sampled.

  Args:
    model: Model exposing `init_hidden(1)` and returning `(output, hidden)`.
    hidden: Hidden state to continue from; a fresh one is created when None.
    temp: Temperature the logits are divided by before sampling.
    sos_id: Start-of-sequence id fed as the first input.
    eos_id: End-of-sequence id; it is included in the returned ids when drawn.
    unk_id: Unknown-token id.
    max_len: Maximum number of ids to return; None means no limit.

  Returns:
    (ids, hidden): the sampled ids and the hidden state after the last step.
  """
  # Use the model's own device for the input instead of a notebook-level global.
  model_device = next(model.parameters()).device

  if hidden is None:
    hidden = model.init_hidden(1)

  token = torch.full((1, 1), sos_id, dtype=torch.long, device=model_device)
  banned = sorted({sos_id, unk_id})

  ids = []
  # `max_len=None` (the default) used to fail on `len(ids) < None`.
  while max_len is None or len(ids) < max_len:
    output, hidden = model(token, hidden)
    logits = output.detach().reshape(-1).div(temp)
    # Exclude sos/unk before sampling. Re-running the model after a rejected draw
    # advanced the hidden state without emitting a token and could loop forever.
    logits[banned] = float('-inf')
    # softmax yields the same distribution as normalised exp(), without overflow.
    probs = torch.softmax(logits, dim=0).cpu()
    next_id = torch.multinomial(probs, num_samples=1).item()
    token.fill_(next_id)
    ids.append(next_id)
    if next_id == eos_id:
      break

  return ids, hidden

def generate(model, tokenizer, num_lines=8, min_len=8, max_len=15,
             unk_id=0, sos_id=1, eos_id=2, temp=0.6):
  """
  Generate a verse consisting of `num_lines` lines of at most `max_len` tokens.
  Since the hidden state is passed onto the next line, 
  observing some cross-line consistency would be expected, or less
  optimistically, at least grammatically correct sentences.
  NOTE: line length can be tuned by changing `min_len` / `max_len`
  (counted in subword pieces, not words).

  A sampled line is rejected and re-sampled when it has `min_len` or fewer
  tokens, decodes to an empty string, starts with one of `-?!.,()`, or has
  unbalanced quotes/parentheses. The hidden state is carried forward after
  rejected lines too.

  Args:
    model: Trained PyTorch language model
    tokenizer: SentencePiece tokenizer
    num_lines: No. of lines to generate.
    min_len: Lines need more than this many tokens to be accepted.
    max_len: Max no. of tokens per line (not words!)
    unk_id: Unknown-token id in vocabulary
    sos_id: Start of sequence id in vocabulary
    eos_id: End of sequence id in vocabulary
    temp: Temperature parameter; lower: more conservative, higher: more diverse

  Returns: list of strings (empty when `num_lines` is 0 or negative)

  Raises:
    ValueError: if `min_len >= max_len`, because no line could ever be accepted.
  """
  if max_len is not None and min_len >= max_len:
    raise ValueError("`min_len` must be smaller than `max_len`, otherwise every line is rejected.")

  model.eval()

  lines = []
  hidden = model.init_hidden(1)

  with torch.no_grad():

    # Explicit rejection checks replace the former raise/except-pass control flow,
    # which also hid genuine errors and then retried forever.
    while len(lines) < num_lines:
      ids, hidden = generate_line(model, hidden=hidden, temp=temp, max_len=max_len,
                                  sos_id=sos_id, eos_id=eos_id, unk_id=unk_id)

      if len(ids) <= min_len:
        continue
      line = tokenizer.textify(ids).strip()

      if not line or line.startswith(tuple("-?!.,()")):
        continue
      if is_unbalanced(line):
        continue

      lines.append(line)

  # Close the verse with sentence-ending punctuation.
  if lines:
    lines[-1] = parse_last_line(lines[-1])

  return lines

In [82]:
# Ten lines at a low temperature (more conservative sampling); lines need more than 8 and at most 12 tokens.
generate(model, tokenizer, temp=0.4, num_lines=10, min_len=8, max_len=12)

['Minden, ami a miénk lesz az a jövevény,',
 'a jég, mely a kék és sötét műhelyekben',
 'és az árnyékmá meg a tiszta kékségben',
 'a nap s a torony, az óriás, merev tekintetű',
 'a parasztságra véres késsel a fölcskét,',
 'a boldog vágy, de nem gondol senkire, ki látja',
 'a szabad kőre követni kívánnak.',
 's az igazság itt minden igazság bugyognak benne.',
 'a harctér s a tavasz-tis',
 'a munka, ez a penészes végzet.']

In [83]:
# Eight lines at a medium temperature; min_len keeps its default of 8.
generate(model, tokenizer, temp=0.6, num_lines=8, max_len=15)

['a Határ és a bevezetés.',
 'a jéghidegöst, az istent és a halált.',
 'a lány és nem volt kérdem - -',
 's az illyes deszkák alatt,',
 'a munka s az én derék a Puffadok alatt,',
 'a leglangodott a világ,',
 'és az örök mozgás, amely egy igazság szerint,',
 'és a nevetésben, amint föl-föl is aszna hordja magát?']

In [78]:
# Ten lines at a higher temperature (more diverse sampling).
# NOTE(review): this cell's execution count (78) is lower than that of the two
# cells above (82, 83), so it was run earlier; re-run top to bottom to confirm the outputs.
generate(model, tokenizer, temp=0.7, num_lines=10, max_len=15)

['arra kérd a vezér: "Mit beszélsz? Rém unalmas én vagyok?"',
 'a te ispocs-hang, a színház',
 'a lány. Az én fülemüket ontják',
 'né a maga szövőszékeken s a',
 'a vörös dúb-kuvál',
 'a vöröses és alagsori,',
 'a salmiák és a kiváncsiak.',
 'a tanszék támogatja a gazdag teljesedést.',
 'az élet a nótának remélhet.',
 'a munka, de a meglibos jéggel a!']