In [1]:
!pip install youtokentome



In [0]:
import pandas as pd
import re
import numpy as np
from itertools import chain

import torch, torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import matplotlib.pyplot as plt
import pandas as pd

from torch.utils.data import Dataset
import random
import youtokentome as yttm
import os
from train_utils import train_eval_loop
import warnings
warnings.filterwarnings("ignore")

### Load and prepare data

In [0]:
# Load the quotes table; the path is relative to the notebook's working directory.
quotes = pd.read_csv('../data/quotes-2.csv')

In [5]:
# Preview the first five rows to check which columns were loaded.
quotes.head(5)

Unnamed: 0,quote,author,lenght,c,m
0,You know you're in love when you can't fall as...,Dr. Seuss,18,dreams,85
1,A friend is someone who knows all about you an...,Elbert Hubbard,13,friendship,52
2,Darkness cannot drive out darkness: only light...,"Martin Luther King Jr., A Testament of Hope: T...",20,inspirational,91
3,We accept the love we think we deserve.,"Stephen Chbosky, The Perks of Being a Wallflower",8,inspirational,32
4,It is better to be hated for what you are than...,"André Gide, Autumn Leaves",19,life,62


In [0]:
# Collect the text of the `quote` column into a plain Python list.
phrases = quotes["quote"].tolist()

In [0]:
def save_text_to_file(texts,filename):
    """Write `texts` to `filename`, one item per line, encoded as UTF-8.

    Args:
        texts: iterable of strings; items are joined with a newline and no
            trailing newline is added after the last one.
        filename: path of the file to create or overwrite.
    """
    # An explicit encoding keeps the file identical across platforms instead of
    # depending on the locale's default encoding.
    with open(filename,'w',encoding='utf-8') as outf:
        outf.write('\n'.join(texts))

In [0]:
TOKEN_RE = re.compile(r"[a-zA-Z]+")

# Literal substitutions, applied strictly in this order: the output of an
# earlier rule may be matched by a later one (e.g. "can't" is expanded before
# the generic "n't" rule runs).
_LITERAL_REPLACEMENTS = (
    (",000,000", "m"), (",000", "k"), ("′", "'"), ("’", "'"),
    ("won't", "will not"), ("cannot", "can not"), ("can't", "can not"),
    ("n't", " not"), ("what's", "what is"), ("it's", "it is"),
    ("'ve", " have"), ("i'm", "i am"), ("'re", " are"),
    ("he's", "he is"), ("she's", "she is"), ("'s", " own"),
    ("%", " percent "), ("₹", " rupee "), ("$", " dollar "),
    ("€", " euro "), ("'ll", " will"), ("'d", " would"),
)

# Regex substitutions that abbreviate trailing zeros of digit runs; run after the literal ones.
_NUMBER_REPLACEMENTS = (
    (r"([0-9]+)000000", r"\1m"),
    (r"([0-9]+)000", r"\1k"),
)


def tokenize_text_simple_regex(text,min_token_size=0):
    """Lower-case `text`, expand contractions and symbols, and split it into alphabetic tokens.

    Args:
        text: input string.
        min_token_size: tokens shorter than this many characters are dropped.

    Returns:
        List of tokens made only of the letters a-z (the text is lower-cased first),
        in order of appearance.
    """
    normalized = text.lower()
    for old, new in _LITERAL_REPLACEMENTS:
        normalized = normalized.replace(old, new)
    for pattern, replacement in _NUMBER_REPLACEMENTS:
        normalized = re.sub(pattern, replacement, normalized)

    kept = []
    for token in TOKEN_RE.findall(normalized):
        if len(token) >= min_token_size:
            kept.append(token)
    return kept
    
def tokenize_text(corpus,tokenizer=tokenize_text_simple_regex,**tokenizer_kwargs):
    """Apply `tokenizer` to every text in `corpus`.

    Args:
        corpus: iterable of texts.
        tokenizer: callable invoked as `tokenizer(text, **tokenizer_kwargs)`.
        **tokenizer_kwargs: extra keyword arguments forwarded to every call.

    Returns:
        List with one tokenizer result per text, in corpus order.
    """
    tokenized = []
    for document in corpus:
        tokenized.append(tokenizer(document, **tokenizer_kwargs))
    return tokenized

In [0]:
# Tokenize every quote with the default regex tokenizer (one token list per quote).
phrase_token = tokenize_text(phrases)

In [0]:
# Keep only phrases that have more than 3 and fewer than 30 tokens.
phrase_token = [tokens for tokens in phrase_token if 3 < len(tokens) < 30]

In [11]:
# Re-join each token list into a single space-separated string and show how many phrases remain.
new_phrases = list(map(' '.join, phrase_token))
len(new_phrases)

145655

In [0]:
# Shuffle a copy with a fixed seed: the split is then reproducible, and re-running
# this cell gives the same result because `new_phrases` itself is left untouched.
SPLIT_SEED = 42
shuffled_phrases = list(new_phrases)
np.random.RandomState(SPLIT_SEED).shuffle(shuffled_phrases)

# First 80% of the shuffled phrases go to training, the rest to testing.
SPLIT = int(len(shuffled_phrases) * 0.8)

train_data = shuffled_phrases[:SPLIT]
test_data = shuffled_phrases[SPLIT:]

In [31]:
# Report how many phrases ended up in each split.
print("Size of train dataset: {}".format(len(train_data)))
print("Size of test dataset: {}".format(len(test_data)))

Size of train dataset: 116524
Size of test dataset: 29131


### Tokenize text with BPE and create dataset

In [0]:
def save_text_to_file(texts,filename):
    """Write `texts` to `filename`, one item per line, encoded as UTF-8.

    Args:
        texts: iterable of strings; items are joined with a newline and no
            trailing newline is added after the last one.
        filename: path of the file to create or overwrite.
    """
    # An explicit encoding keeps the file identical across platforms instead of
    # depending on the locale's default encoding.
    with open(filename,'w',encoding='utf-8') as outf:
        outf.write('\n'.join(texts))

In [0]:
# File names for the BPE model and for the plain-text corpus it is trained on.
FILE_BPE = 'quotes.yttm'
TRAIN_TEXT_FILENAME = 'train_quotes.txt'
# Write the training split to disk, one phrase per line.
save_text_to_file(train_data,TRAIN_TEXT_FILENAME)

In [16]:
# Train a BPE model with a vocabulary of 2000 entries on the training text file;
# FILE_BPE is passed as the model path and is loaded from in the next cell.
yttm.BPE.train(data = TRAIN_TEXT_FILENAME,vocab_size = 2000,model = FILE_BPE )

<youtokentome.youtokentome.BPE at 0x7fe8bfe5d160>

In [0]:
# Load the BPE model from the file written by the training step.
tokenizer = yttm.BPE(FILE_BPE)

In [0]:
# Encode both splits into subword ids, requesting BOS and EOS markers for every phrase.
train_token_ids = tokenizer.encode(train_data, bos = True, eos = True)
test_token_ids = tokenizer.encode(test_data, bos = True, eos = True)

In [19]:
# Count the ids equal to 1 (the value this notebook treats as "unknown subword") over the whole test split.
unknown_subwords_in_test = sum(token_id == 1 for token_id in chain.from_iterable(test_token_ids))
print('Number of cases with unknown n-grams of characters in the validation sample', unknown_subwords_in_test)

Number of cases with unknown n-grams of characters in the validation sample 0


In [0]:
def ensure_lenght(txt, length,pad_value):
    """Return `txt` padded or truncated to exactly `length` items.

    Args:
        txt: sequence of items.
        length: requested number of items.
        pad_value: item appended as many times as needed when `txt` is too short.

    Returns:
        A new list when padding is required; otherwise the slice `txt[:length]`,
        which keeps the type of `txt`.
    """
    missing = length - len(txt)
    if missing <= 0:
        # Already long enough: cut (or keep) without converting the container.
        return txt[:length]
    return list(txt) + [pad_value] * missing
    
class LanguageModelDataset(Dataset):
    """Dataset of (seed, target) id arrays for next-token prediction.

    For a stored sequence `text`, the seed is `text[1:-1]` and the target is
    `text[2:]`, i.e. the target is the seed shifted one position to the right.
    The first id of every sequence is therefore never used as input. Both
    parts are padded or truncated to `chunk_length`.

    Args:
        token_ids: indexable collection of id sequences.
        chunk_length: fixed length of the returned arrays.
        pad_value: id used to pad short sequences.
    """
    def __init__(self,token_ids, chunk_length = 30,pad_value = 0):
        self.token_ids = token_ids
        self.chunk_length = chunk_length
        self.pad_value = pad_value

    def __len__(self):
        """Number of stored sequences."""
        return len(self.token_ids)

    def __getitem__(self,item):
        """Return the `(seed, target)` pair of int64 arrays for sequence `item`."""
        text = self.token_ids[item]

        seed_part = text[1:-1]
        target_part = text[2:]

        # Pin the dtype to int64: the default integer dtype of np.array is
        # platform dependent, while class-index targets of the cross-entropy
        # loss have to be 64-bit integers.
        seed_part = np.array(ensure_lenght(seed_part,self.chunk_length,self.pad_value),dtype=np.int64)
        target_part = np.array(ensure_lenght(target_part,self.chunk_length,self.pad_value),dtype=np.int64)

        return seed_part,target_part

In [0]:
# Wrap both splits; every example is padded or truncated to 50 ids, with 0 as the padding id.
train_dataset = LanguageModelDataset(train_token_ids, 50,pad_value = 0)
test_dataset =  LanguageModelDataset(test_token_ids, 50,pad_value = 0 )

In [22]:
# Decode one (seed, target) pair back to text to eyeball the one-token shift and the padding.
tokenizer.decode(list(train_dataset[2]))

['why fear death it is the most beautiful adventure in life<PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD>',
 'fear death it is the most beautiful adventure in life<EOS><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD>']

### Model and training

In [0]:
def make_dependency_target_mask(size):
    """Build an additive causal attention mask of shape `(size, size)`.

    Entry `[i, j]` is `-inf` when `j > i` (a later position) and `0` otherwise,
    so each position can only attend to itself and to earlier positions.
    """
    positions = torch.arange(size)
    # True strictly above the main diagonal, i.e. where the column lies in the future.
    is_future = positions[None, :] > positions[:, None]
    mask = torch.zeros((size, size))
    mask[is_future] = float('-inf')
    return mask
                    
def make_positional_encoding(max_len,emb_size):
    """Build a `(max_len, emb_size)` table of sinusoidal position codes.

    For position `t` and divider `k = 1, 2, ...`, even columns hold
    `sin(pi * t / k)` and odd columns hold `cos(pi * t / k)`.

    Args:
        max_len: number of positions (rows).
        emb_size: embedding width (columns); may be odd, in which case the
            last column is a sine column without a matching cosine column.

    Returns:
        Float tensor of shape `(max_len, emb_size)`.
    """
    time = np.pi * torch.arange(0,max_len).float()
    # Even-indexed columns outnumber odd-indexed ones by one when emb_size is odd,
    # so size the angle table for the sine columns and slice it for the cosines.
    n_sin_cols = (emb_size + 1) // 2
    n_cos_cols = emb_size // 2
    freq_dividers = torch.arange(1,n_sin_cols+1)
    inputs = time[:, None] / freq_dividers[None, :]

    result = torch.zeros(max_len,emb_size)
    result[:, 0::2] = torch.sin(inputs)
    result[:, 1::2] = torch.cos(inputs[:, :n_cos_cols])
    return result

In [0]:
class LanguageModel(nn.Module):
    """Token-level language model: embeddings + positional codes -> backbone -> vocabulary logits.

    Args:
        vocab_size: number of token ids; also the size of the output layer.
        emb_size: embedding width, shared by the backbone input and output.
        backbone: module called as `backbone(x, mask=..., src_key_padding_mask=...)`.
        emb_dropout: dropout probability applied to the position-augmented embeddings.
    """
    def __init__(self,vocab_size,emb_size,backbone,emb_dropout = 0.0):
        super().__init__()
        self.vocab_size = vocab_size
        self.emb_size = emb_size
        # Id 0 is the padding id: its embedding is registered as padding_idx here
        # and positions holding it are masked out in forward().
        self.embeddings = nn.Embedding(vocab_size,emb_size,padding_idx = 0)
        self.emb_dropout = nn.Dropout(emb_dropout)
        self.backbone = backbone
        self.out = nn.Linear(emb_size,vocab_size)

    def forward(self,seed_token_ids):
        """Return logits over the vocabulary for every position of a 2-D batch of token ids."""
        _, seq_len = seed_token_ids.shape
        device = seed_token_ids.device

        padding_positions = seed_token_ids.eq(0)
        causal_mask = make_dependency_target_mask(seq_len).to(device)
        position_codes = make_positional_encoding(seq_len,self.emb_size).unsqueeze(0).to(device)

        hidden = self.emb_dropout(self.embeddings(seed_token_ids) + position_codes)
        hidden = self.backbone(hidden, mask = causal_mask, src_key_padding_mask = padding_positions)
        return self.out(hidden)

In [0]:
def lm_cross_entropy(pred,target):
    """Cross-entropy over all positions of a batch, skipping padding targets.

    Args:
        pred: logits, BatchSize x TargetLen x VocabSize.
        target: class ids, BatchSize x TargetLen; positions equal to 0 are ignored.

    Returns:
        Scalar tensor with the mean loss over the non-ignored positions.
    """
    # reshape (unlike view) also accepts non-contiguous tensors, e.g. transposed outputs.
    pred_flat = pred.reshape(-1,pred.shape[-1])
    target_flat = target.reshape(-1)
    return F.cross_entropy(pred_flat,target_flat,ignore_index = 0)

def lr_scheduler(optimizer):
    """Create a ReduceLROnPlateau scheduler (patience 20, factor 0.5) for `optimizer`.

    Verbose reporting is requested when the installed PyTorch accepts it.
    """
    scheduler_cls = torch.optim.lr_scheduler.ReduceLROnPlateau
    try:
        return scheduler_cls(optimizer, patience = 20, factor = 0.5, verbose = True)
    except TypeError:
        # `verbose` is deprecated in recent PyTorch releases; if the installed
        # version rejects the keyword, build the same scheduler without it.
        return scheduler_cls(optimizer, patience = 20, factor = 0.5)

In [0]:
class BatchFirstTranformerEncoder(nn.Module):
    """`nn.TransformerEncoder` wrapper whose input and output have their first two axes swapped.

    The input is transposed on axes 0 and 1 before the wrapped encoder is called
    and the result is transposed back, so callers can pass batch-first tensors
    to an encoder that expects the sequence axis first. All constructor
    arguments are forwarded to `nn.TransformerEncoder`.
    """
    def __init__(self,*args,**kwargs):
        super().__init__()
        self.impl = nn.TransformerEncoder(*args,**kwargs)
        self.initialize_weights()

    def forward(self,src,*args,**kwargs):
        """Run the wrapped encoder on `src`; extra arguments are passed through unchanged."""
        swapped_in = src.transpose(0,1).contiguous()
        encoded = self.impl(swapped_in,*args,**kwargs)
        return encoded.transpose(0,1).contiguous()

    def initialize_weights(self):
        """Re-initialize every parameter with more than one dimension using Xavier-uniform."""
        weight_matrices = (p for p in self.impl.parameters() if p.dim() > 1)
        for weight in weight_matrices:
            nn.init.xavier_uniform_(weight)

In [0]:
# Two Transformer encoder layers of width 128 (8 heads, 512-wide feed-forward, dropout 0.2)
# wrapped for batch-first use, under an embedding layer of the same width.
encoder_layer = nn.TransformerEncoderLayer(d_model = 128,
                                           nhead = 8,
                                           dim_feedforward=512,
                                           dropout=0.2)
backbone = BatchFirstTranformerEncoder(encoder_layer,num_layers = 2)
model = LanguageModel(tokenizer.vocab_size(),128,backbone,emb_dropout = 0.2)

In [0]:
# Train the model with lm_cross_entropy as the loss on the train/test datasets.
# NOTE(review): train_eval_loop is imported from the local train_utils module, which is
# not shown in this notebook; the meaning of the keyword arguments and of the two returned
# values is presumed from their names — confirm against that module.
# The device is hard-coded to 'cuda', so this cell presumably requires a GPU.
(best_val_loss,
 best_torch_transf_model) = train_eval_loop(model,
                                            train_dataset,
                                            test_dataset,
                                            lm_cross_entropy,
                                            lr=2e-3,
                                            epoch_n=700,
                                            batch_size=1024,
                                            device='cuda',
                                            early_stopping_patience=30,
                                            max_batches_per_epoch_train=1000,
                                            max_batches_per_epoch_val=1000,
                                            lr_scheduler_ctor=lr_scheduler)

In [0]:
# Save the weights (state dict) of the model returned by the training loop.
torch.save(best_torch_transf_model.state_dict(), 'quotes_model.pth')

In [30]:
# Deserialize the checkpoint onto the CPU so loading does not require the device it was
# saved from; load_state_dict then copies the values into the model's existing parameters.
model.load_state_dict(torch.load('quotes_model.pth', map_location='cpu'))

<All keys matched successfully>

### Generation

In [0]:
class RandomGeneration():
    """Sample text from a language model token by token at a fixed temperature.

    Args:
        model: module mapping a `(1, seq_len)` tensor of token ids to logits
            whose `[0, -1]` entry scores the next token.
        tokenizer: object providing `encode(list_of_str)`, `decode(list_of_id_lists)`
            and `vocab_size()`.
        device: device the model is moved to and inputs are created on.
        eos_token_id: id that terminates generation.
    """
    def __init__(self,model,tokenizer,device = 'cuda',eos_token_id = 3):
        self.model = model
        self.tokenizer = tokenizer
        self.eos_token_id = eos_token_id
        self.device = torch.device(device)
        self.model.to(self.device)

    def __call__(self,seed_phrase = '  ', temperature = 1.0,max_steps_n = 1000):
        """Generate a continuation of `seed_phrase`.

        Args:
            seed_phrase: text the generation starts from.
            temperature: divisor applied to the logits before the softmax.
            max_steps_n: maximum number of tokens to sample.

        Returns:
            The decoded seed plus the sampled tokens, without the terminating EOS token.
        """
        # Use the tokenizer given to this instance rather than a notebook-level global.
        seed_tokens = list(self.tokenizer.encode([seed_phrase])[0])
        vocab_size = self.tokenizer.vocab_size()

        if len(seed_tokens) == 0:
            # Nothing to condition on: start from id 2 (the BOS id used by this notebook).
            seed_tokens.append(2)

        hit_eos = False
        was_training = self.model.training
        # Sampling must run with dropout disabled and without building autograd graphs.
        self.model.eval()
        try:
            with torch.no_grad():
                for _ in range(max_steps_n):
                    in_batch = torch.tensor(seed_tokens).unsqueeze(0).to(self.device)
                    logits = self.model(in_batch)[0,-1]
                    proba = F.softmax(logits/temperature, dim=-1).cpu().numpy()

                    next_token = int(np.random.choice(vocab_size,p = proba))
                    seed_tokens.append(next_token)

                    if next_token == self.eos_token_id:
                        hit_eos = True
                        break
        finally:
            # Leave the model in the mode the caller had it in.
            self.model.train(was_training)

        # Drop the EOS id only when it was actually generated; slicing a fixed number of
        # characters off the decoded text would cut real text when max_steps_n is exhausted.
        if hit_eos:
            seed_tokens = seed_tokens[:-1]
        return self.tokenizer.decode([seed_tokens])[0]

In [0]:
# Build the sampler with its default device ('cuda') and EOS id (3).
generation = RandomGeneration(model,tokenizer)

In [49]:
# Sample ten continuations of the seed "power" at temperature 0.7.
for _ in range(10):
    print(generation(seed_phrase = 'power',temperature = 0.7))

power is not the answer of your rewards but the question is how you treat them
power is the most important thing you can do in life
power is a delusion of change
power makes you feel good
power and power are the reflection of the world
power is strength of a dangerous silence
power is a genius so people can be a horse
power is the process of time
power is a choice it is a choice that is not a choice we make it happen to it
power is disposition is the essence of an illusion


In [0]:
class RandomGenerationStepTemp():
    """Sample text token by token while lowering the temperature as generation proceeds.

    Args:
        model: module mapping a `(1, seq_len)` tensor of token ids to logits
            whose `[0, -1]` entry scores the next token.
        tokenizer: object providing `encode(list_of_str)`, `decode(list_of_id_lists)`
            and `vocab_size()`.
        device: device the model is moved to and inputs are created on.
        eos_token_id: id that terminates generation.
    """
    def __init__(self,model,tokenizer,device = 'cuda',eos_token_id = 3):
        self.model = model
        self.tokenizer = tokenizer
        self.eos_token_id = eos_token_id
        self.device = torch.device(device)
        self.model.to(self.device)

    def __call__(self,seed_phrase = '  ', temperature = 1.0,size_step = 10,step = 0.1,max_steps_n = 100):
        """Generate a continuation of `seed_phrase` with a decreasing temperature.

        Args:
            seed_phrase: text the generation starts from.
            temperature: initial divisor applied to the logits before the softmax.
            size_step: the temperature is lowered after every step whose index is a
                multiple of this value (must be non-zero).
            step: amount subtracted from the temperature each time it is lowered;
                a result that is zero or negative is replaced by 0.1.
            max_steps_n: maximum number of tokens to sample.

        Returns:
            The decoded seed plus the sampled tokens, without the terminating EOS token.
        """
        # Use the tokenizer given to this instance rather than a notebook-level global.
        seed_tokens = list(self.tokenizer.encode([seed_phrase])[0])
        vocab_size = self.tokenizer.vocab_size()

        if len(seed_tokens) == 0:
            # Nothing to condition on: start from id 2 (the BOS id used by this notebook).
            seed_tokens.append(2)

        hit_eos = False
        was_training = self.model.training
        # Sampling must run with dropout disabled and without building autograd graphs.
        self.model.eval()
        try:
            with torch.no_grad():
                for i in range(max_steps_n):
                    in_batch = torch.tensor(seed_tokens).unsqueeze(0).to(self.device)
                    logits = self.model(in_batch)[0,-1]
                    proba = F.softmax(logits/temperature, dim=-1).cpu().numpy()

                    next_token = int(np.random.choice(vocab_size,p = proba))
                    seed_tokens.append(next_token)

                    if i % size_step == 0:
                        temperature = temperature - step
                        # Keep the divisor strictly positive.
                        if temperature <= 0:
                            temperature = 0.1

                    if next_token == self.eos_token_id:
                        hit_eos = True
                        break
        finally:
            # Leave the model in the mode the caller had it in.
            self.model.train(was_training)

        # Drop the EOS id only when it was actually generated; slicing a fixed number of
        # characters off the decoded text would cut real text when max_steps_n is exhausted.
        if hit_eos:
            seed_tokens = seed_tokens[:-1]
        return self.tokenizer.decode([seed_tokens])[0]

In [0]:
# Build the decreasing-temperature sampler with its default device ('cuda') and EOS id (3).
generation_step = RandomGenerationStepTemp(model,tokenizer)

In [61]:
# Sample five continuations, starting at temperature 0.8 and lowering it by 0.05 after every token.
for _ in range(5):
    print(generation_step(seed_phrase = 'artificial intelligence is',temperature = 0.8,size_step = 1,step = 0.05))

artificial intelligence is the highest form of spiritual creation
artificial intelligence is in the logic of adjustment
artificial intelligence is the opposite of the human mind
artificial intelligence is the most powerful force that you can see
artificial intelligence is the only thing that can be realized


In [58]:
# Sample five continuations, starting at temperature 0.7 and lowering it by 0.05 after every token.
for _ in range(5):
    print(generation_step(seed_phrase = 'freedom is',temperature = 0.7,size_step = 1,step = 0.05))

freedom is not a form of failure but a lack of courage
freedom is the root of the soul
freedom is the greatest gift of life
freedom is not the most important thing that is the greatest thing to do is to do
freedom is a process of finding out what you want to do
