## The NLP part

In [25]:
from tqdm import tqdm
import numpy as np
import pickle
import os
import time
from sklearn.model_selection import train_test_split
from multiprocessing import Pool
import re
from collections import Counter

### Load the data

In [26]:
import json
with open('classic_poems.json') as f:
    data_json = json.load(f)


In [27]:
data = []
content = []
labels = []
for row in data_json:
    data.append([row['content'], row['poet_id']])
    content.append(row['content'])
    labels.append(row['poet_id'])

In [28]:
len(data)

2496

In [29]:
print(data[-45][0][:100])

Ярмарка.
    Вовсю!
        Нелепица на нелепице.
Лейпциг гудит.
        Суетится Лейпциг.
Но площад


### Dictionary

In [30]:
def clean(sent):
    """
    убираем все знаки кроме киррилицы цифр, ,.?!
    """   

    reg = re.compile('[^а-яА-Я|!?,.\n ]')
    sent = reg.sub('', sent).lower()
    sent = re.findall(r"[\w']+|[!?,.\n]", sent)
    return sent
    
def get_c2i(counter, min_occurrence, add):
    """ делаем словарь из Countrer"""

    w2ind = {k: i + 1 for i, (k, v) in enumerate(counter.most_common()) if v >= min_occurrence}
    if add:
        w2ind["PADDING_TOKEN"] = 0
        w2ind["UNKNOWN_TOKEN"] = len(w2ind)

    ind2w = {v: k for k, v in w2ind.items()}
    return w2ind, ind2w


def get_uni2idx(text, min_occurrence=3, add=True):
    """ Словарь униграмм"""
    
    unigrams_counter = Counter()  
    for t in tqdm(text):
        unigrams_counter.update(t)

    w2ind, ind2w = get_c2i(unigrams_counter, min_occurrence=min_occurrence, add=add)
    print('length of unigram dictionary: ', len(w2ind))

    return w2ind, ind2w, unigrams_counter

In [31]:
with Pool(50) as p:
    content = p.map(clean, content)

In [32]:
uni2idx, inx2uni, _ = get_uni2idx(content, 3)

100%|██████████| 2496/2496 [00:00<00:00, 29342.22it/s]

length of unigram dictionary:  15212





In [36]:
lab2idx, inx2lab, lab_counter = get_uni2idx([labels], 3, add=False)

100%|██████████| 1/1 [00:00<00:00, 2079.48it/s]

length of unigram dictionary:  5





In [37]:
lab2idx

{'blok': 1, 'esenin': 3, 'mayakovskij': 5, 'pushkin': 2, 'tyutchev': 4}

## Building the dataset 

In [38]:
UNI_SEQ_LEN = 100

In [39]:
def vectorize_data(data, uni2idx, unigrams_sequence_len):
    """Возвращает векторизованные данные."""
    
    uni_unknown_index = uni2idx['UNKNOWN_TOKEN']
    uni_padding_index = uni2idx['PADDING_TOKEN']
    word_indices = np.array([uni_padding_index] * unigrams_sequence_len)
    
    sentence, label = data
    
    sentence = clean(sentence)

    for pos_in_padded_sentence, word in enumerate(sentence):
        
        if pos_in_padded_sentence>=unigrams_sequence_len:
            break
        word_idx = get_token_indices(word, uni2idx, uni_unknown_index)
        word_indices[pos_in_padded_sentence] = word_idx
        
    dataset = [word_indices, lab2idx[label]]

    return dataset

def get_token_indices(token, uni2idx, uni_unknown_index):

    return uni2idx.get(token, uni_unknown_index)

In [40]:
print('Vectorizing...')

def vectorize(x):
    return vectorize_data(x, uni2idx, UNI_SEQ_LEN)

with Pool(10) as p:
    data_vectorized = p.map(vectorize, data)


print('Splitting the data into train and evaluate...')
X_tr, X_ev = train_test_split(
                              data_vectorized,
                              test_size=0.1,
                              random_state=24)


Vectorizing...
Splitting the data into train and evaluate...


## Building the model

In [42]:
EMB_DIM = 100
N_LAYERS = 2
HID_EMB = 100
NUM_CLASSES = 5

In [45]:
import torch, torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

class Encoder(nn.Module):
    def __init__(self):
        """ 
        A simple encoder for poem.
        
        output two vectors, one is responsoble for the style (author), the other - for the meaning
        """
        super(self.__class__, self).__init__()
        
        self.emb = nn.Embedding(len(uni2idx), EMB_DIM, padding_idx=uni2idx['PADDING_TOKEN'])
        self.rnn = nn.GRU(EMB_DIM, HID_EMB, N_LAYERS, batch_first=True)
        self.style = nn.Linear(HID_EMB, HID_EMB)
        self.meaning = nn.Linear(HID_EMB, HID_EMB)
        
    def forward(self, text_ix):
        """
        :param text_ix: int64 Variable of shape [batch_size, max_len]
        :returns: 
            tuple( [batch_size, seq_len, hid_size], [batch_size, seq_len, hid_size])
        """
        emb = self.emb(text_ix)
        print('emb', emb.shape)
        hid, _ = self.rnn(emb)
        print('hid', hid.shape)

        return self.style(hid), self.meaning(hid)

In [46]:
class Discriminator(nn.Module):
    def __init__(self):
        """ 
        Trying to distinguish between styles based on the input vector
        
        """
        super(self.__class__, self).__init__()            
        self.l1 = nn.Linear(HID_EMB, HID_EMB//2)
        self.logits = nn.Linear(HID_EMB//2, NUM_CLASSES)
        
    def forward(self, emb):
        """
        :param emb: [batch_size, HID_EMB]
        :returns:  [batch_size, NUM_CLASSES]
        """
        emb = emb.mean(1)
        emb_l1 = F.relu(self.l1(emb))
        out = self.logits(emb_l1)
        print('Discriminator out', out.shape)
        return out

class Motivator(nn.Module):
    def __init__(self):
        """ 
        Trying to distinguish between styles based on the input vector
        """
        super(self.__class__, self).__init__()
        self.m = Discriminator()
        
    def forward(self, text_ix):
        """
        :param emb: [batch_size, HID_EMB]
        :returns: [batch_size, NUM_CLASSES]
        """
        out = self.m(emb)
        print('Motivator out', out.shape)
        return out

In [47]:
class Decoder(nn.Module):
    def __init__(self,):
        """ 
        A sequential decoder.
        """
        super(self.__class__, self).__init__()
        
        self.rnn = nn.GRU(HID_EMB, 2*HID_EMB, N_LAYERS, batch_first=True)
        self.logits = nn.Linear(2*HID_EMB, len(uni2idx))
        
    def forward(self, emb_style, emb_meaning):
        """
        :param text_ix: tuple( [batch_size, seq_len, hid_size], [batch_size, seq_len, hid_size])
        :returns: [batch_size, tokens]
        """
        
        emb = torch.cat([emb_style, emb_meaning], dim=-1)
        hid = self.rnn(emb)
        logits = self.logits(hid)
        print('Decoder logits', logits.shape)
        return logits

### Losses

In [None]:
def m_loss(False_data, mode):
    if mode == 1: 
        loss = (1-False_data).log().mean()
    if mode == 2:
        loss = -False_data.log().mean()
    
    if mode == 3 or TASK == 4:
        loss = -False_data.mean()
    
    return  loss 

def d_loss(False_data, True_data, mode):
    if mode == 1 or mode == 2: 
        loss = -(True_data.log().mean(0)+(1-False_data).log().mean())
        
    if mode == 3 or mode == 4:
        loss = -(-False_data.mean()+True_data.mean())
        
    return  loss 

def decoder_loss(logits, labels):
    
    loss = -(True_data.log().mean(0)+(1-False_data).log().mean())
        
    return  loss 

## Training loop

In [None]:
from tqdm import tnrange
def iterate_minibatches(data, batch_size=32, shuffle=True, verbose=True):
    indices = np.arange(len(data))
    if shuffle:
        indices = np.random.permutation(indices)
    if max_batches is not None:
        indices = indices[: batch_size * max_batches]
        
    irange = tnrange if verbose else range
    
    for start in irange(0, len(indices), batch_size):
        yield [data[i] for i in indices[start : start + batch_size]]

In [None]:
n_epochs = 2
n_batches_per_epoch = len(train_data)
n_validation_batches = len(val_data)

In [None]:
def plot_history(history):
    plt.subplot(131)
    plt.title("decoder")
    plt.xlabel("#epoch")
    plt.ylabel("loss")
    plt.plot(history['decoder_train_loss'], 'b', label='train_loss')
    plt.plot(history['decoder_val_loss'], 'g', label='val_loss')
    plt.legend()
    plt.subplot(132)
    plt.title("motivator")
    plt.xlabel("#epoch")
    plt.ylabel("loss")
    plt.plot(history['motivator_train_loss'], label="train_loss")
    plt.plot(history['motivator_val_loss'], label="val_loss")
    plt.legend()
    plt.subplot(133)
    plt.title("discriminator")
    plt.xlabel("#epoch")
    plt.ylabel("loss")
    plt.plot(history['discriminator_train_loss'], label="train_loss")
    plt.plot(history['discriminator_val_loss'], label="val_loss")
    plt.legend()
    plt.show()


def train(generator, X_tr, X_te, y_tr, y_te, batchsize=3, n_epochs=3, verbose=True):
    """
    args:
        generator: model
        X_tr, X_te, y_tr, y_te: datasets
        batchsize: int
    returns:
        generator: model
        epoch: integer (the last epoch)
        history: dict
    """

    optimizer = optim.Adam(generator.parameters())
    n_train_batches = math.ceil(len(X_tr) / batchsize)
    n_validation_batches = math.ceil(len(X_te) / batchsize)

    history = {'decoder_train_loss': [], 'decoder_val_loss': [],
               'motivator_train_loss': [], 'motivator_val_loss': [], 'discriminator_train_loss': [], 
               'discriminator_val_loss': []}

    for epoch in range(n_epochs):

        start_time = time.time()

        train_loss = 0
        generator.train(True)
        
        try:
            for X, y in tqdm(iterate_minibatches(X_tr, y_tr, batchsize)):
                pred, sound = generator(X, y)
                loss = compute_loss(pred, y)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                train_loss += loss.cpu().data.numpy()

            train_loss /= n_train_batches

            generator.train(False)
            val_loss = 0
            for X, y in tqdm(iterate_minibatches(X_te, y_te, batchsize)):
                pred, sound = generator(X, y)
                loss = compute_loss(pred, y)

                val_loss += loss.cpu().data.numpy()

            val_loss /= n_validation_batches

            history['train_loss'].append(train_loss)
            history['val_loss'].append(val_loss)
            # metrics computed on the last val batch
            history['EB_true'].append(EB(y).mean().data.cpu().numpy())
            history['UPC_true'].append(UPC(y).mean().data.cpu().numpy())
            history['QN_true'].append(QN(y).mean().data.cpu().numpy())
            history['EB_false'].append(EB(sound).mean().data.cpu().numpy())
            history['UPC_false'].append(UPC(sound).mean().data.cpu().numpy())
            history['QN_false'].append(QN(sound).mean().data.cpu().numpy())

        except KeyboardInterrupt:
            return generator, epoch, history

            # Visualize
        if verbose:
            display.clear_output(wait=True)
            plt.figure(figsize=(16, 6))
            # Then we print the results for this epoch:
            print("Epoch {} of {} took {:.3f}s".format(
                epoch + 1, n_epochs, time.time() - start_time))
            print('current train loss: {}'.format(history['train_loss'][-1]))
            print('current val loss: {}'.format(history['val_loss'][-1]))
            plot_history(history)

    print("Finished!")

    return generator, epoch, history

In [None]:
from tqdm import tqdm

for epoch in range(n_epochs):
    
    train_loss=0
    question_vectorizer.train(True)
    answer_vectorizer.train(True)
    it = iterate_minibatches(train_data, verbose = False)
    for _ in tqdm(range(n_batches_per_epoch)):
        
        batch = next(it)
        loss = compute_loss(*batch)     
        
        # clear old gradients; do a backward pass to get new gradients; then train with opt
        opt.zero_grad()
        loss.backward()
        opt.step()
        
#         print('loss', loss)
        train_loss += loss.cpu().data.numpy()
        recall = compute_recall(*batch)
        train_recall += recall
        
    train_loss /= n_batches_per_epoch
    train_recall /=n_batches_per_epoch
    
    val_loss=0
    question_vectorizer.train(False)
    answer_vectorizer.train(False)
    it = iterate_minibatches(val_data, verbose = False)
    for _ in range(n_validation_batches):
        batch = next(it)
        loss = compute_loss(*batch)
        val_loss += loss.cpu().data.numpy()
        recall = compute_recall(*batch)
        val_recall += recall
        
        
    val_loss /= n_validation_batches
    val_recall /=n_validation_batches
    
    print('\nEpoch: {}, train loss: {}, val loss: {}'.format(epoch, train_loss, val_loss))
    print('\nEpoch: {}, train recall: {}, val recall: {}'.format(epoch, train_recall, val_recall))

print("Finished!")

## Generating some masterpieces