# Assignment 7

Delelop language model, which generates death metal band names.  
You can get data from https://www.kaggle.com/zhangjuefei/death-metal.  
You are free to use any other data, but the most easy way is just to take the band name column.

Your language model should be char-based autogression RNN.  
Text generation should be terminated when either max length is reached or terminal symbol is generated.  

<img src="images/example.png">

<img src="images/example2.png">

Different band names can be generated by:  
1. init $h_0$ as random vector from some probabilty distribution.
2. sampling over tokens at each timestep with probability = softmax 

Calculate perplexity for your model = your objective quality metric.  
Also, sample 10 band names from your model for subjective evaluation. E.g. names like 'qwiouefiou23riop2h3' or 'death death death!' are bad examples.  

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook

import torch as tt
import torch.nn as nn

from torchtext.data import Field, LabelField, BucketIterator, ReversibleField, TabularDataset, BPTTIterator
from torchtext.datasets import LanguageModelingDataset

from sklearn.model_selection import train_test_split
from random import shuffle
import torch.optim as optim


In [2]:
df = pd.read_csv('bands.csv')

In [3]:
df.head()

Unnamed: 0,id,name,country,status,formed_in,genre,theme,active
0,1,('M') Inc.,United States,Unknown,2009.0,Death Metal,,2009-?
1,2,(sic),United States,Split-up,1993.0,Death Metal,,1993-1996
2,3,.F.O.A.D.,France,Active,2009.0,Death Metal,Life and Death,2009-present
3,4,100 Suns,United States,Active,2004.0,Death Metal,,2004-present
4,5,12 Days of Anarchy,United States,Split-up,1998.0,Death Metal,Anarchy,1998-2002


In [4]:
groups = list(df['name'])
shuffle(groups)
with open('names.txt', 'a') as file:
    for group in groups:
        file.write(group + '\n')

In [5]:
with open('names.txt', 'r') as file, open('train.txt', 'a') as file_w1, open('test.txt', 'a') as file_w2:
    lines = file.readlines()
    sep = int(0.8 * len(lines))
    i = 0
    for line in lines:
        i += 1
        if i < sep:
            file_w1.write(line)
        else:
            file_w2.write(line)

In [6]:
TEXT = ReversibleField(use_vocab=True,
             tokenize=list,
             batch_first=True,
             lower=True
            )

train, test = LanguageModelingDataset.splits(path='.',
                                             train='train.txt',
                                             test='test.txt',
                                             text_field=TEXT,
                                             newline_eos=True)

In [7]:
TEXT.build_vocab(train)
len(TEXT.vocab.itos)

298

In [8]:
TEXT.vocab.itos[:10]

[' UNK ', '<pad>', 'e', '<eos>', 'a', 'r', 'i', 'o', 't', 'n']

In [9]:
class MyModel(nn.Module):
    
    def __init__(self, vocab_size, embed_size, hidden_size):
        super(MyModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        
        self.rnn = nn.LSTM(input_size=embed_size,
                           hidden_size=hidden_size,
                           bidirectional=True,
                           batch_first=True,
                          )
        
        self.fc = nn.Linear(hidden_size * 2, vocab_size)
        self.drop = tt.nn.Dropout()
        
        self.init_weights()
        
    def init_weights(self):
        nn.init.uniform_(self.embedding.weight)
        nn.init.xavier_uniform_(self.fc.weight)
        nn.init.zeros_(self.fc.bias)
        
    def forward(self, x, hidden):
        batch_size = x.size(0)
        total_length = x.size(-1)
        
        x = self.embedding(x)
        x = self.drop(x)
        
        x, hidden = self.rnn(x, hidden)
         
        x = self.drop(x)
        x = x.contiguous().view(batch_size * total_length, -1)
        x = self.fc(x)
        x = x.contiguous().view(batch_size, total_length, -1)
        return x, hidden
    
    def init_hidden(self, batch):
        return (tt.rand(2, batch, 128, requires_grad=True),
                tt.rand(2, batch, 128, requires_grad=True))

In [10]:
def _train_epoch(model, iterator, optimizer, criterion, curr_epoch):

    model.train()

    running_loss = 0
    
    n_batches = len(iterator)
    iterator = tqdm_notebook(iterator,
                             total=n_batches,
                             desc='epoch %d' % (curr_epoch),
                             leave=True)

    hidden = model.init_hidden(30)
    
    for i, batch in enumerate(iterator):
        if batch.text.size(0) != 30:
            continue
        
        optimizer.zero_grad()
        hidden = (hidden[0].detach(), hidden[1].detach())
        
        pred, hidden = model(batch.text, hidden)
        pred_flat = pred.view(-1, len(TEXT.vocab.itos))
        loss = criterion(pred_flat, batch.target.view(-1))
        loss.backward()
        optimizer.step()

        curr_loss = loss.data.cpu().detach().item()
        
        loss_smoothing = i / (i+1)
        running_loss = loss_smoothing * running_loss + (1 - loss_smoothing) * curr_loss

        iterator.set_postfix(loss='%.5f' % running_loss)

    return running_loss

def _test_epoch(model, iterator, criterion):
    model.eval()
    epoch_loss = 0

    n_batches = len(iterator)
    hidden = model.init_hidden(30)

    with tt.no_grad():
        for batch in iterator:
            if batch.text.size(0) != 30:
                continue

            pred, hidden = model(batch.text, hidden)
            pred_flat = pred.view(-1, len(TEXT.vocab.itos))
            loss = criterion(pred_flat, batch.target.view(-1))
            epoch_loss += loss.data.item()
            hidden = (hidden[0].detach(), hidden[1].detach())

    return 2 ** (epoch_loss / n_batches)


def nn_train(model, train_iterator, valid_iterator, criterion, optimizer, n_epochs=100,
          scheduler=None, early_stopping=0):

    prev_perplexity = 100500
    es_epochs = 0
    best_epoch = None
    history = pd.DataFrame()

    for epoch in range(n_epochs):
        train_loss = _train_epoch(model, train_iterator, optimizer, criterion, epoch)
        perplexity = _test_epoch(model,  valid_iterator, criterion)

        print('perplexity %.5f' % perplexity)

        record = {'epoch': epoch, 'train_loss': train_loss, 'perplexity': perplexity}
        history = history.append(record, ignore_index=True)

        if early_stopping > 0:
            if perplexity > prev_perplexity:
                es_epochs += 1
            else:
                es_epochs = 0

            if es_epochs >= early_stopping:
                best_epoch = history[history.perplexity == history.perplexity.min()].iloc[0]
                print('Early stopping! best epoch: %d perplexity %.5f' % (best_epoch['epoch'],
                                                                          best_epoch['perplexity']))
                break

            prev_perplexity = min(prev_perplexity, perplexity)

    return model

In [11]:
batch_size = 32

model = MyModel(vocab_size=len(TEXT.vocab.itos),
                embed_size=100,
                hidden_size=128
               )

train_iterator, test_iterator = BPTTIterator.splits(
    (train, test),
    bptt_len=30,
    batch_sizes=(batch_size, batch_size),
    shuffle=True,
    sort_key=lambda x: len(x.text),
    sort_within_batch=True
)

optimizer = optim.Adam(model.parameters())
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=5)

criterion = nn.CrossEntropyLoss()

In [12]:
%%time
model = nn_train(model, train_iterator, test_iterator,
                     criterion, optimizer, scheduler=scheduler, 
                     n_epochs=50, early_stopping=5)


perplexity 6.40329



perplexity 6.29804



perplexity 6.26962



perplexity 6.25564



perplexity 6.24648



perplexity 6.23904



perplexity 6.23435



perplexity 6.22988



perplexity 6.22740



perplexity 6.22584



perplexity 6.22436



perplexity 6.22353



perplexity 6.22314



perplexity 6.22166



perplexity 6.22206



perplexity 6.22085



perplexity 6.22068



perplexity 6.22132



perplexity 6.21971



perplexity 6.21972



perplexity 6.21972



perplexity 6.21936



perplexity 6.21912



perplexity 6.21793



perplexity 6.21803



perplexity 6.21922



perplexity 6.21891



perplexity 6.21851



perplexity 6.21799
Early stopping! best epoch: 23 perplexity 6.21793
CPU times: user 1h 24s, sys: 4min 28s, total: 1h 4min 52s
Wall time: 48min 33s


In [39]:
def generate(decoder, prime_str='<eos>', predict_len=30, temperature=0.8):
    hidden = decoder.init_hidden(1)
    inp = tt.tensor(TEXT.vocab.itos.index(prime_str)).unsqueeze(0).unsqueeze(0).long()
    predicted = ''

    _, hidden = decoder(inp, hidden)
        
    
    for p in range(predict_len):
        output, hidden = decoder(inp, hidden)
        output_dist = output.data.view(-1).div(temperature).exp()
        top_i = tt.multinomial(output_dist, 1)[0]

        predicted_char = TEXT.vocab.itos[top_i]
        if predicted != '' and predicted_char == '<eos>':
            break
        else:
            predicted += predicted_char
        inp = tt.tensor(TEXT.vocab.itos.index(predicted_char)).unsqueeze(0).unsqueeze(0).long()


    return predicted

In [33]:
tt.save(model, 'model.pt')

  "type " + obj.__name__ + ". It won't be checked "


In [34]:
file = 'model.pt'
decoder = tt.load(file)

In [40]:
for x in range(10):
    print(generate(decoder))

an
nelerexachon
ben
aus
stinsen
a
ty tyoror s
k hore
dshadeal
atat sitha
