In [1]:
import os
from collections import Counter
from itertools import chain
from IPython.display import clear_output

In [2]:
import torch
import torch.nn as nn
from torch.nn import functional as F

In [3]:
import numpy as np
import pandas as pd

  return f(*args, **kwds)
  return f(*args, **kwds)


In [4]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline

In [5]:
# Pick the compute device: CUDA when torch reports it as available, CPU otherwise.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Давайте посмотрим, можете ли вы использовать GPU для нашей задачи.

In [6]:
# Show which device was selected above.
print(device)

cuda


### <center> Loading Dataset

In [7]:
# URLs of the raw DC and Marvel CSV files; both are passed to Dataset further down.
path_dc = 'https://raw.githubusercontent.com/carnotaur/superhero_name_generation/master/data/dc-wikia-data.csv'
path_marvel = 'https://raw.githubusercontent.com/carnotaur/superhero_name_generation/master/data/marvel-wikia-data.csv'

In [8]:
class Vocabulary():
    """Character vocabulary built from a collection of names.

    Attributes:
        pad_idx, sos_idx, eos_idx: indices of the special tokens.
        all_names: the names the vocabulary was built from.
        vocab: Counter with the frequency of every character.
        char2idx / idx2char: character <-> index lookup tables.
        vocab_size: number of distinct indices in use.
    """

    def __init__(self, all_names, pad_idx=0, sos_idx=1, eos_idx=2):
        self.pad_idx, self.sos_idx, self.eos_idx = pad_idx, sos_idx, eos_idx
        self.all_names = all_names
        # character frequencies over all names (keys keep first-seen order)
        self.vocab = Counter(chain.from_iterable(self.all_names))
        # special tokens first, then every character numbered from 3 upwards
        self.char2idx = {'<pad>': self.pad_idx,
                         '<sos>': self.sos_idx,
                         '<eos>': self.eos_idx}
        self.char2idx.update(
            (symbol, position) for position, symbol in enumerate(self.vocab, start=3)
        )
        # reverse lookup table
        self.idx2char = {position: symbol for symbol, position in self.char2idx.items()}
        self.vocab_size = len(set(self.char2idx.values()))

    def encode_string(self, string):
        """Map every character of ``string`` to its index (KeyError if unknown)."""
        return list(map(self.char2idx.__getitem__, string))

In [9]:
class Padding():
    """Helpers that pad index sequences and attach boundary tokens."""

    def padding(self, batch_string, pad_idx):
        '''right-pads every sequence with pad_idx up to the longest one in the batch'''
        target_len = max(len(sequence) for sequence in batch_string)
        return [sequence + [pad_idx] * (target_len - len(sequence))
                for sequence in batch_string]

    def add_eos_sos(self, batch_string, sos_idx, eos_idx):
        '''returns two batches: sequences prefixed with sos_idx and sequences suffixed with eos_idx'''
        with_sos = [[sos_idx] + sequence for sequence in batch_string]
        with_eos = [sequence + [eos_idx] for sequence in batch_string]
        return with_sos, with_eos

In [10]:
class Dataset():
    """Superhero names loaded from the DC and Marvel CSV files.

    Attributes:
        dataframe: both CSV files concatenated (set by ``load_dataset``).
        all_names: unique cleaned names, each stored as a list of characters.
        vocab: ``Vocabulary`` built over ``all_names``.
        padding: ``Padding`` helper used when assembling batches.
    """

    def __init__(self, path_dc, path_marvel):
        self.all_names = self.load_dataset(path_dc, path_marvel)
        #to char represantation of name
        self.all_names = list(map(list, self.all_names))
        self.vocab = Vocabulary(self.all_names)
        self.padding = Padding()

    def __len__(self):
        """Number of unique names in the dataset."""
        return len(self.all_names)

    def load_dataset(self, path_dc, path_marvel):
        """Read both CSV files and return the unique, cleaned names.

        Args:
            path_dc, path_marvel: anything ``pd.read_csv`` accepts (path, URL,
                file-like object); each file must contain a ``name`` column.

        Returns:
            Array of unique names with parenthesised parts removed,
            surrounding whitespace stripped and all letters lower-cased.
        """
        dc_df = pd.read_csv(path_dc)
        marvel_df = pd.read_csv(path_marvel)
        # DataFrame.append was removed in pandas 2.0; concat is the replacement
        self.dataframe = pd.concat([dc_df, marvel_df])
        all_names = (self.dataframe['name']
                         # a missing name cannot be split into characters later on
                         .dropna()
                         # regex=True must be explicit: pandas >= 2.0 defaults to a literal match
                         .str.replace(r'\((.*?)\)', '', regex=True)
                         .str.strip()
                         .str.lower()
                         .unique())
        return all_names

    def get_batch(self, batch_size):
        """Sample ``batch_size`` random names (with replacement) as index tensors.

        Returns:
            batch_in: LongTensor [batch_size, max_len + 1], rows are
                ``<sos> + name`` right-padded with ``pad_idx``.
            batch_out: LongTensor of the same shape, rows are
                ``name + <eos>`` right-padded with ``pad_idx``.
        """
        picked = np.random.randint(0, len(self.all_names), size=batch_size)
        batch_chars = [self.all_names[position] for position in picked]
        #to index
        batch_numbers = [self.vocab.encode_string(string) for string in batch_chars]
        #add boundary tokens first so that <eos> directly follows the last
        #character of every name instead of being placed after the padding
        batch_in, batch_out = self.padding.add_eos_sos(batch_numbers,
                                                       sos_idx=self.vocab.sos_idx,
                                                       eos_idx=self.vocab.eos_idx)
        #add padding (both batches share the same per-row lengths, hence the same width)
        batch_in = self.padding.padding(batch_in, pad_idx=self.vocab.pad_idx)
        batch_out = self.padding.padding(batch_out, pad_idx=self.vocab.pad_idx)
        batch_in = torch.LongTensor(batch_in)
        batch_out = torch.LongTensor(batch_out)
        return batch_in, batch_out

In [11]:
def show_predicted_name(epoch, logits, batch_out, vocab):
    """Print the greedy prediction next to the target for the first batch item.

    Args:
        epoch: value shown in the ``Epoch:`` line of the report.
        logits: tensor with ``batch * sequence_len`` rows of ``vocab.vocab_size`` scores.
        batch_out: target indices, one row per batch item.
        vocab: object exposing ``vocab_size``, ``idx2char``, ``pad_idx`` and ``eos_idx``.
    """
    # Take the batch size from the targets instead of a notebook-level global,
    # so the function works for any batch and does not depend on cell order.
    n_samples = batch_out.size(0)
    first_logits = logits.reshape(n_samples, -1, vocab.vocab_size)[0]
    hidden_tokens = (vocab.pad_idx, vocab.eos_idx)

    def decode(indices):
        # plain ints are compared here, not tensors
        return ''.join(vocab.idx2char[index] for index in indices
                       if index not in hidden_tokens)

    generated_name = decode(first_logits.argmax(dim=-1).tolist())
    truth_name = decode(batch_out[0].tolist())
    print('--------')
    print(f'Epoch:{epoch}\n Predicted: {generated_name}\n Truth: {truth_name}')

Посмотрим на данные<br>


In [12]:
# Load both CSV files and build the character vocabulary (see Dataset.__init__).
dataset = Dataset(path_dc, path_marvel)

Сперва на оригинальный dataframe

In [13]:
#your code here

Затем на имена после небольшой обработки

In [14]:
#your code here

Как будут выглядеть имена, когда будут поступать в нашу сетку

In [15]:
# Indices of the special <sos>, <eos> and <pad> tokens.
dataset.vocab.sos_idx, dataset.vocab.eos_idx, dataset.vocab.pad_idx

(1, 2, 0)

In [16]:
#your code here

# <center>Model

### <center> Embedding

<center> <img src="pic/embedding.png">

### <center> RNN

<center> <img src="pic/rnn.jpg">

Пример того, как работает backward

In [17]:
# float64 tensor created with requires_grad=True so that autograd tracks operations on it.
x = torch.tensor([1, 2, 3], dtype=torch.float64, requires_grad=True)

In [18]:
#your code here

### <center> Напишем модель

In [19]:
class Model(nn.Module):
    """Character-level recurrent language model (exercise skeleton).

    ``__init__`` and ``forward`` are left for the reader to fill in.
    ``generate`` expects ``__init__`` to store ``self.hidden_size`` and
    ``forward`` to return ``(logits, hidden)`` as described in its docstring.
    """

    def __init__(self, vocab_size, emb_size, hidden_size, pad_idx):
        #your code here
        pass

    def forward(self, batch_in, hidden=None):
        '''
        In:
          batch_in: [batch_size, sequence_len]
          hidden: [1, 1, hidden_size] - default: None
        Out:
          logits: [batch_size * sequence_len, vocab_size]
        '''
        #your code here
        #flatten
        return logits, hidden

    def generate(self, determined, vocab, max_len=15):
        '''
        Generates one name character by character, starting from <sos>.
        In:
          determined: True - always take the most likely character,
                      False - sample from the softmax distribution
          vocab: vocabulary with sos_idx / eos_idx / pad_idx and idx2char
          max_len: maximum number of generation steps - default: 15
        Out:
          generated name as a string, special tokens are left out
        '''
        previous_char = torch.LongTensor([[vocab.sos_idx]]).to(device)
        hidden = torch.zeros(1, 1, self.hidden_size).to(device)
        special_tokens = (vocab.pad_idx, vocab.eos_idx, vocab.sos_idx)
        generated_chars = []
        # generation needs no gradients
        with torch.no_grad():
            for _ in range(max_len):
                logit, hidden = self.forward(previous_char, hidden)
                if determined:
                    previous_char = logit.topk(1)[1]
                else:
                    previous_char = F.softmax(logit, dim=1).multinomial(1)
                char_idx = previous_char.item()
                if char_idx == vocab.eos_idx:
                    break
                # Only generated characters are collected, so nothing has to be
                # sliced off afterwards. The previous version removed <sos> by
                # filtering and then sliced [1:] as well, which dropped the
                # first real character of every name.
                if char_idx not in special_tokens:
                    generated_chars.append(vocab.idx2char[char_idx])
        return ''.join(generated_chars)

In [20]:
#for testing

In [21]:
# Sample 32 random names as (input, target) index tensors for quick checks.
batch_in, batch_out = dataset.get_batch(32)

### <center> Training

In [22]:
#hyperparameters
batch_size = 64  # presumably the number of names per training batch - confirm in the training loop
emb_size = 64  # presumably passed to Model as emb_size
hidden_size = 256  # presumably passed to Model as hidden_size
num_epochs = 100  # presumably the number of training iterations - confirm in the training loop
lr = 0.001  # presumably the optimizer learning rate
vocab_size = dataset.vocab.vocab_size  # number of distinct indices in the vocabulary

<center> Cross Entropy Loss

<center> <img src="pic/cross_entopy.jpg">

In [23]:
#your code here

In [24]:
%%time
losses = []
#your code here

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.48 µs


<center> Посмотрим на сгенерированные имена

In [25]:
def print_superhero_name(model, n_names):
    """Sample ``n_names`` names from ``model`` and print a banner for each one.

    Names are sampled (``determined=False``) using the notebook-level
    ``dataset.vocab``.
    """
    intro = '\n'.join((
        '------------------------------------------------------------------',
        'LADYS AND GENTLEMEN PREPARED TO BE WONDERED BY GROUND NEW SUPERHERO!!!',
        'CREATED BY MYSTERIOUS ROBOTIC INTELLIGENT!!!',
        'RIGHT FROM LABORATORY OF EVIL GENIUS -- RECCURENT NEURAL NETWORK----!!!',
    ))
    remaining = n_names
    while remaining > 0:
        hero = model.generate(False, dataset.vocab)
        print(intro)
        print(f'NEW HERO NAMED AS  ----{hero.upper()}---!!!!!')
        # blank line between announcements
        print()
        remaining -= 1

In [26]:
#your code here

## <center> __Хотите узнать больше?__

Введение в pytorch tensors:
https://github.com/Kyubyong/pytorch_exercises

Неофициальный PyTorch туториал: https://github.com/yunjey/pytorch-tutorial <br>
Достаточно пройти Basics и Intermediate

1. Basics

    - PyTorch Basics
    - Linear Regression
    - Logistic Regression
    - Feedforward Neural Network

2. Intermediate

    - Convolutional Neural Network
    - Deep Residual Network
    - Recurrent Neural Network
    - Bidirectional Recurrent Neural Network
    - Language Model (RNN-LM)


__Официальные pytorch туториалы__:<br>
What is PyTorch?
https://pytorch.org/tutorials/beginner/blitz/tensor_tutorial.html#sphx-glr-beginner-blitz-tensor-tutorial-py

Neural Networks
https://pytorch.org/tutorials/beginner/blitz/neural_networks_tutorial.html#sphx-glr-beginner-blitz-neural-networks-tutorial-py

Training a Classifier
https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html#sphx-glr-beginner-blitz-cifar10-tutorial-py

Transfer Learning Tutorial
https://pytorch.org/tutorials/beginner/transfer_learning_tutorial.html

Classifying Names with a Character-Level RNN
https://pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html

Generating Names with a Character-Level RNN
https://pytorch.org/tutorials/intermediate/char_rnn_generation_tutorial.html