In [1]:
import glob
import os
import unicodedata
import string
from collections import Counter
from itertools import chain
from IPython.display import clear_output

In [2]:
import torch
import torch.nn as nn
from torch.nn import functional as F

In [3]:
import numpy as np
import pandas as pd

In [4]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline

### <center> Loading Dataset

In [5]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [6]:
# CSV files with the DC and Marvel character tables; the paths are relative
# to the notebook's working directory.
path_dc = 'dc-wikia-data.csv'
path_marvel = 'marvel-wikia-data.csv'

In [7]:
class Vocabulary():
    """Character-level vocabulary with frequency-based pruning.

    Characters that occur more than ``min_freq`` times in ``all_names`` get
    their own index; all other characters share ``unk_idx``.

    Attributes:
        vocab: ``Counter`` mapping every seen character to its frequency.
        char2idx: dict mapping special tokens and characters to indices.
        idx2char: inverse mapping (``unk_idx`` always maps to ``'<unk>'``).
        vocab_size: number of rows an embedding needs, i.e. largest index + 1.
    """

    def __init__(self, all_names, pad_idx=0, sos_idx=1, eos_idx=2, unk_idx=3, min_freq=50):
        """Build the mappings.

        Args:
            all_names: iterable of names, each an iterable of characters.
            pad_idx, sos_idx, eos_idx, unk_idx: indices of the special tokens.
            min_freq: a character needs strictly more occurrences than this
                to receive its own index.
        """
        self.pad_idx = pad_idx
        self.sos_idx = sos_idx
        self.eos_idx = eos_idx
        self.unk_idx = unk_idx
        self.min_freq = min_freq
        # count frequency of chars
        self.vocab = Counter(chain.from_iterable(all_names))
        # create char to index dictionaries
        self.char2idx = {'<pad>': self.pad_idx,
                         '<sos>': self.sos_idx,
                         '<eos>': self.eos_idx,
                         '<unk>': self.unk_idx}
        # Start numbering after the largest special index so that custom
        # special indices can never collide with character indices
        # (with the defaults this is 3, exactly as before).
        index = max(self.char2idx.values())
        for char, freq in self.vocab.items():
            if freq > self.min_freq:
                index += 1
                self.char2idx[char] = index
            else:
                self.char2idx[char] = self.unk_idx
        self.idx2char = {idx: char for char, idx in self.char2idx.items()}
        # Rare characters share unk_idx and would otherwise overwrite its label.
        self.idx2char[self.unk_idx] = '<unk>'
        # Largest index + 1, so every index is a valid embedding row.
        self.vocab_size = max(self.char2idx.values()) + 1

    def encode_string(self, string):
        """Return the list of indices for ``string``.

        Characters that were never seen when the vocabulary was built are
        encoded as ``unk_idx`` instead of raising ``KeyError``.
        """
        return [self.char2idx.get(char, self.unk_idx) for char in string]

In [8]:
class Padding():
    """Helpers that pad index sequences and add start/end markers."""

    def padding(self, batch_string, pad_idx):
        '''Right-pad every sequence with ``pad_idx`` to the longest length.

        Args:
            batch_string: iterable of lists of indices.
            pad_idx: index used for padding.

        Returns:
            A new list of lists, all of equal length. An empty batch yields
            an empty list instead of raising ``ValueError``.
        '''
        batch_string = list(batch_string)
        # default=0 keeps max() from failing on an empty batch
        max_len = max(map(len, batch_string), default=0)
        return [string + [pad_idx] * (max_len - len(string))
                for string in batch_string]

    def add_eos_sos(self, batch_string, sos_idx, eos_idx):
        '''Build the shifted input/target pair for every sequence.

        Returns:
            ``(sos_batch, eos_batch)``: copies of the sequences with
            ``sos_idx`` prepended and with ``eos_idx`` appended respectively.
            The input sequences are not modified.
        '''
        batch_string = list(batch_string)
        sos_batch = [[sos_idx] + string for string in batch_string]
        eos_batch = [string + [eos_idx] for string in batch_string]
        return sos_batch, eos_batch

In [9]:
class Dataset(Vocabulary):
    """Character names from the DC and Marvel CSV files, served as random batches.

    NOTE: the class subclasses ``Vocabulary`` but never calls
    ``Vocabulary.__init__``; the vocabulary that is actually used is the
    separate instance stored in ``self.vocab``.
    """

    def __init__(self, path_dc, path_marvel):
        """Load the names, split them into characters and build the vocabulary."""
        names = self.load_dataset(path_dc, path_marvel)
        # to char representation of name
        self.all_names = [list(name) for name in names]
        self.vocab = Vocabulary(self.all_names)
        self.padding = Padding()

    def __len__(self):
        """Number of unique names."""
        return len(self.all_names)

    def load_dataset(self, path_dc, path_marvel):
        """Read both CSV files and return the unique, cleaned names.

        Cleaning removes parenthesised parts, strips whitespace and lowercases.
        Missing names are dropped, since a NaN cannot be split into characters.
        """
        dc_df = pd.read_csv(path_dc)
        marvel_df = pd.read_csv(path_marvel)
        # DataFrame.append was removed in pandas 2.0; concat is the replacement.
        df = pd.concat([dc_df, marvel_df], sort=False)
        all_names = (df['name']
                     .dropna()
                     # regex=True is required: newer pandas treats the pattern
                     # as a literal string by default.
                     .str.replace(r'\((.*?)\)', '', regex=True)
                     .str.strip()
                     .str.lower()
                     .unique())
        return all_names

    def get_batch(self, batch_size):
        """Sample ``batch_size`` names at random (with replacement).

        Returns:
            ``(batch_in, batch_out)``: two ``LongTensor`` of identical shape.
            ``batch_in`` rows are ``<sos> + name + padding`` and ``batch_out``
            rows are ``name + <eos> + padding``.
        """
        batch_chars = [self.all_names[np.random.randint(len(self.all_names))]
                       for _ in range(batch_size)]
        # to index
        batch_numbers = [self.vocab.encode_string(string) for string in batch_chars]
        # Add <sos>/<eos> BEFORE padding so that <eos> directly follows the
        # last character of every name instead of trailing the padding.
        batch_in, batch_out = self.padding.add_eos_sos(batch_numbers,
                                                       sos_idx=self.vocab.sos_idx,
                                                       eos_idx=self.vocab.eos_idx)
        # add padding
        batch_in = self.padding.padding(batch_in, pad_idx=self.vocab.pad_idx)
        batch_out = self.padding.padding(batch_out, pad_idx=self.vocab.pad_idx)
        batch_in = torch.LongTensor(batch_in)
        batch_out = torch.LongTensor(batch_out)
        return batch_in, batch_out

### <center>Model

In [10]:
class Model(nn.Module):
    """Character-level language model: embedding -> single-layer GRU -> linear."""

    def __init__(self, vocab_size, emb_size, hidden_size, pad_idx):
        """
        Args:
            vocab_size: number of distinct indices (embedding rows / output classes).
            emb_size: embedding dimension.
            hidden_size: GRU hidden state dimension.
            pad_idx: index of the padding token (passed as ``padding_idx``).
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.emb_size = emb_size
        # making layers
        self.embedding = nn.Embedding(vocab_size, emb_size, padding_idx=pad_idx)
        self.gru = nn.GRU(emb_size, hidden_size, num_layers=1, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)

    def forward(self, batch_in, hidden=None):
        """Run the network over a batch of index sequences.

        Args:
            batch_in: LongTensor of indices, batch first.
            hidden: optional initial GRU state; ``None`` lets the GRU start
                from zeros.

        Returns:
            ``(logits, hidden)`` where ``logits`` is flattened to
            ``(-1, vocab_size)`` and ``hidden`` is the final GRU state.
        """
        embedded_chars = self.embedding(batch_in)
        # nn.GRU treats hidden=None as an all-zero initial state.
        outputs, hidden = self.gru(embedded_chars, hidden)
        logits = self.linear(outputs)
        logits = logits.view(-1, self.vocab_size)
        return logits, hidden

    def generate(self, determined, vocab, max_len=15):
        """Generate one name character by character.

        Args:
            determined: if true pick the most likely character at every step,
                otherwise sample from the softmax distribution.
            vocab: object exposing ``sos_idx``, ``eos_idx``, ``pad_idx`` and
                ``idx2char``.
            max_len: maximum number of generation steps.

        Returns:
            The generated string without special padding/start/end tokens.
            Generation stops early when ``<eos>`` is produced.
        """
        # Use the device the parameters live on instead of a notebook global.
        model_device = next(self.parameters()).device
        current = torch.tensor([[vocab.sos_idx]], dtype=torch.long, device=model_device)
        hidden = None
        skipped = {vocab.pad_idx, vocab.eos_idx, vocab.sos_idx}
        chars = []
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            for _ in range(max_len):
                logit, hidden = self.forward(current, hidden)
                if determined:
                    current = logit.topk(1)[1]
                else:
                    current = F.softmax(logit, dim=1).multinomial(1)
                idx = current.item()
                if idx == vocab.eos_idx:
                    break
                # The start token is never stored, so no generated character
                # has to be sliced off afterwards.
                if idx not in skipped:
                    chars.append(vocab.idx2char[idx])
        return ''.join(chars)

In [11]:
def plot_loss(epoch, loss):
    """Redraw the training-loss curve in place of the previous cell output.

    Args:
        epoch: epoch number shown in the title.
        loss: sequence of loss values recorded so far; the title shows the
            mean of the last 100 of them.
    """
    # Plot the values that were passed in rather than a notebook-level global.
    clear_output(True)
    plt.figure(figsize=(12, 6))
    plt.plot(loss)
    plt.title(f'Epoch:{epoch}|Loss:{np.mean(loss[-100:]):.4f}')
    plt.xlabel('Training step')
    plt.ylabel('Loss')
    plt.show()

In [12]:
def show_predicted_name(epoch, logits, batch_out, vocab):
    """Print the greedy prediction and the target for the first batch sample.

    Args:
        epoch: epoch number to print.
        logits: flattened logits of shape ``(batch * seq_len, vocab_size)``.
        batch_out: target indices, one row per sample.
        vocab: object exposing ``vocab_size``, ``idx2char``, ``pad_idx`` and
            ``eos_idx``.

    Padding and end-of-string tokens are left out of both printed strings.
    """
    # Take the batch size from the targets instead of a notebook global, so
    # the function works for any batch it is given.
    n_samples = batch_out.size(0)
    first_logits = logits.view(n_samples, -1, vocab.vocab_size)[0]
    predicted = first_logits.topk(1)[1].view(-1).tolist()
    truth = batch_out[0].tolist()
    skipped = (vocab.pad_idx, vocab.eos_idx)
    generated_name = ''.join(vocab.idx2char[idx] for idx in predicted
                             if idx not in skipped)
    truth_name = ''.join(vocab.idx2char[idx] for idx in truth
                         if idx not in skipped)
    print('--------')
    print(f'Epoch:{epoch}\n Predicted: {generated_name}\n Truth: {truth_name}')

### <center> Training

In [13]:
# Load both CSV files and build the character vocabulary.
dataset = Dataset(path_dc, path_marvel)

In [14]:
# Shortcut to the vocabulary built by the dataset.
vocab = dataset.vocab

In [15]:
# Hyperparameters.
batch_size = 64    # names per training batch
emb_size = 64      # character embedding dimension
hidden_size = 256  # GRU hidden state dimension
vocab_size = vocab.vocab_size
lr = 0.001         # learning rate

In [16]:
# Build the network and move its parameters to the selected device.
model = Model(vocab_size, emb_size, hidden_size, vocab.pad_idx).to(device)

In [17]:
# Pass the configured learning rate explicitly so that changing `lr` takes effect.
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [18]:
# Cross-entropy over characters; the training loop removes padded target
# positions with a mask before calling it.
criterion = nn.CrossEntropyLoss()

In [19]:
num_epochs = 100
# When True, the training loop redraws the loss curve after every epoch.
to_print_loss = False

In [20]:
%%time
# Train the model; `losses` collects one loss value per optimisation step.
losses = []
for epoch in range(num_epochs):
    # An "epoch" here is len(dataset) // batch_size randomly sampled batches
    # (get_batch draws names with replacement), not an exact pass over the data.
    for batch_idx in range((len(dataset)) // batch_size):
        optimizer.zero_grad()
        batch_in, batch_out = dataset.get_batch(batch_size)
        logits, _ = model(batch_in.to(device))
        # The model returns logits flattened to (-1, vocab_size); flatten the
        # targets the same way so positions line up.
        batch_out_flatten = batch_out.view(-1).to(device)
        # Exclude padded target positions from the loss.
        mask = (batch_out_flatten != vocab.pad_idx)
        loss = criterion(logits[mask], batch_out_flatten[mask])
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
    # Report on the last batch of the epoch and the mean of the last 100 step losses.
    show_predicted_name(epoch, logits, batch_out, vocab)
    print(f'Loss:{np.mean(losses[-100:]):.4f}')
    if to_print_loss:
        plot_loss(epoch, losses)

--------
Epoch:0
 Predicted: mrrerbh moaloerdarr
 Truth: bobo t. chimpanzee
Loss:2.1740
--------
Epoch:1
 Predicted: saramhemgers
 Truth: mad thinker
Loss:2.0320
--------
Epoch:2
 Predicted: meon mtarfeerd 
 Truth: leah sheffield
Loss:1.9179
--------
Epoch:3
 Predicted: maar e cilneoighte
 Truth: dwayne wainwright
Loss:1.8450
--------
Epoch:4
 Predicted: mecd tibbeeng
 Truth: luna nurblin
Loss:1.7941
--------
Epoch:5
 Predicted: snoin haresta
 Truth: irwin hayes
Loss:1.7492
--------
Epoch:6
 Predicted: son liah srnhries
 Truth: jebediah guthrie
Loss:1.7037
--------
Epoch:7
 Predicted: saalerra 
 Truth: stygorr
Loss:1.6744
--------
Epoch:8
 Predicted: maroer o  
 Truth: barker
Loss:1.6491
--------
Epoch:9
 Predicted: maeve  carkioneer
 Truth: steven partridge
Loss:1.6095
--------
Epoch:10
 Predicted: marvisua   
 Truth: calculha
Loss:1.5918
--------
Epoch:11
 Predicted: sntreu00e<unk>  
 Truth: andr\u00e<unk>
Loss:1.5817
--------
Epoch:12
 Predicted: milliam kacterso
 Truth: william vic

### <center> Inference

In [21]:
def print_superhero_name(model, n_names, vocab):
    """Sample ``n_names`` names from ``model`` and announce each with a banner.

    Names are produced with ``model.generate(False, vocab)`` and printed in
    upper case, followed by an empty line.
    """
    banner = '\n'.join((
        '------------------------------------------------------------------',
        'LADYS AND GENTLEMEN PREPARED TO BW WONDERED BY GROUND NEW SUPERHERO!!!',
        'CREATED BY MYSTERIOUS ROBOTIC INTELLIGENT!!!',
        'RIGHT FROM LABORATORY OF EVIL GENIUS -- RECCURENT NEURAL NETWORK----!!!',
    ))
    for _ in range(n_names):
        # generate first, then print, so a failure leaves no partial banner
        hero = model.generate(False, vocab)
        announcement = f'NEW HERO NAMED AS  ----{hero.upper()}---!!!!!'
        print(banner, announcement, '', sep='\n')

In [22]:
# Sample and announce five names from the trained model.
print_superhero_name(model, 5, vocab)

------------------------------------------------------------------
LADYS AND GENTLEMEN PREPARED TO BW WONDERED BY GROUND NEW SUPERHERO!!!
CREATED BY MYSTERIOUS ROBOTIC INTELLIGENT!!!
RIGHT FROM LABORATORY OF EVIL GENIUS -- RECCURENT NEURAL NETWORK----!!!
NEW HERO NAMED AS  ----QUIDER III---!!!!!

------------------------------------------------------------------
LADYS AND GENTLEMEN PREPARED TO BW WONDERED BY GROUND NEW SUPERHERO!!!
CREATED BY MYSTERIOUS ROBOTIC INTELLIGENT!!!
RIGHT FROM LABORATORY OF EVIL GENIUS -- RECCURENT NEURAL NETWORK----!!!
NEW HERO NAMED AS  ----RIGORY EARTERS---!!!!!

------------------------------------------------------------------
LADYS AND GENTLEMEN PREPARED TO BW WONDERED BY GROUND NEW SUPERHERO!!!
CREATED BY MYSTERIOUS ROBOTIC INTELLIGENT!!!
RIGHT FROM LABORATORY OF EVIL GENIUS -- RECCURENT NEURAL NETWORK----!!!
NEW HERO NAMED AS  ----UROYAMA KHANDL---!!!!!

------------------------------------------------------------------
LADYS AND GENTLEMEN PREPARED TO