# Rekurencyjne Sieci Neuronowe (RNN)

### Importy i Utilsy  (odpalić i schować )

In [2]:
# imports 
import torch
import os
import unicodedata
import string
import numpy as np
from typing import Tuple, Optional, List

from torch.nn.functional import cross_entropy

import matplotlib.pyplot as plt 
from sklearn.metrics import f1_score

from torch.utils.data import Dataset, DataLoader

all_letters = string.ascii_letters
n_letters = len(all_letters)


class ListDataset(Dataset):
    
    def __init__(self, data, targets):
        
        self.data = data
        self.targets = targets
        
    def __getitem__(self, ind):
        
        return self.data[ind], self.targets[ind]
    
    def __len__(self):
        return len(self.targets)

    
def unicode_to__ascii(s: str) -> str:
    return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn'
                                                                 and c in all_letters)
                   

def read_lines(filename: str) -> List[str]:
    lines = open(filename, encoding='utf-8').read().strip().split('\n')
    return [unicode_to__ascii(line) for line in lines]


def letter_to_index(letter: str) -> int:
    return all_letters.find(letter)


def line_to_tensor(line: str) -> torch.Tensor:
    tensor = torch.zeros(len(line), n_letters)
    for i, letter in enumerate(line):
        tensor[i][letter_to_index(letter)] = 1
    return tensor

## Dane sekwencyjne

Modele, którymi zajmowaliśmy się wcześniej zakładały konkretny kształt danych. Dla przykładu klasyczna sieć neuronowa fully-connected dla MNISTa zakładała, że na wejściu dostanie wektory rozmiaru 784 - dla wektorów o innej wymiarowości i innych obiektów model zwyczajnie nie będzie działać.

Takie założenie bywa szczególnie niewygodne przy pracy z niektórymi typami danych, takimi jak:
* językiem naturalny (słowa czy zdania mają zadanej z góry liczby znaków)
* szeregi czasowe (dane giełdowe ciągną się właściwie w nieskończoność) 
* dźwięk (nagrania mogą być krótsze lub dłuższe).

Do rozwiązania tego problemu służą rekuencyjne sieci neuronowe (*recurrent neural networks, RNNs*), które zapamiętują swój stan z poprzedniej iteracji.

### Ładowanie danych
Poniższe dwie komórki ściągają dataset nazwisk z 18 różnych narodowości. Każda litera w danym nazwisku jest zamieniana na jej indeks z alfabetu w postaci kodowania "one-hot". Inaczej mówiąc, każde nazwisko jest binarną macierzą rozmiaru `len(name)` $\times$ `n_letters`. 

Dodatkowo, ponieważ ten dataset jest mocno niezbalansowany, użyjemy specjalnego samplera do losowania przykładów treningowych, tak aby do uczenia sieć widziała tyle samo przykładów z każdej klasy.

Ponieważ nazwiska mogą mieć różne długości będziemy rozważać `batch_size = 1` w tym notebooku (choć implementacje modeli powinny działać dla dowolnych wartości `batch_size`!)

In [3]:
!wget https://download.pytorch.org/tutorial/data.zip
!unzip data.zip

--2021-01-16 14:28:57--  https://download.pytorch.org/tutorial/data.zip
Resolving download.pytorch.org (download.pytorch.org)... 99.84.222.65, 99.84.222.94, 99.84.222.10, ...
Connecting to download.pytorch.org (download.pytorch.org)|99.84.222.65|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2882130 (2.7M) [application/zip]
Saving to: ‘data.zip’


2021-01-16 14:28:57 (72.8 MB/s) - ‘data.zip’ saved [2882130/2882130]

Archive:  data.zip
   creating: data/
  inflating: data/eng-fra.txt        
   creating: data/names/
  inflating: data/names/Arabic.txt   
  inflating: data/names/Chinese.txt  
  inflating: data/names/Czech.txt    
  inflating: data/names/Dutch.txt    
  inflating: data/names/English.txt  
  inflating: data/names/French.txt   
  inflating: data/names/German.txt   
  inflating: data/names/Greek.txt    
  inflating: data/names/Irish.txt    
  inflating: data/names/Italian.txt  
  inflating: data/names/Japanese.txt  
  inflating: data/names/Korean.tx

In [4]:
# NOTE: you can change the seed or remove it completely if you like
torch.manual_seed(1337)

data_dir = 'data/names'

data = []
targets = [] 
label_to_idx = {}

# read each natonality file and process data 
for label, file_name in enumerate(os.listdir(data_dir)):
    
    label_to_idx[label] = file_name.split('.')[0].lower()
    
    names = read_lines(os.path.join(data_dir, file_name))
    data += [line_to_tensor(name) for name in names]
    targets += len(names) * [label]

# split into train and test indices
test_frac = 0.1
n_test = int(test_frac * len(targets))
test_ind = np.random.choice(len(targets), size=n_test, replace=False)
train_ind = np.setdiff1d(np.arange(len(targets)), test_ind)

targets = torch.tensor(targets)
train_targets = targets[train_ind]

# calculate weights for BalancedSampler
uni, counts = np.unique(train_targets, return_counts=True)
weight_per_class = len(targets) / counts
weight = [weight_per_class[c] for c in train_targets]
# preapre the sampler
sampler = torch.utils.data.sampler.WeightedRandomSampler(weights=weight, num_samples=len(weight)) 

train_dataset = ListDataset(data=[x for i, x in enumerate(data) if i in train_ind], targets=train_targets)
train_loader = DataLoader(train_dataset, shuffle=False, batch_size=1, sampler=sampler)

test_dataset = ListDataset(data=[x for i, x in enumerate(data) if i in test_ind], targets=targets[test_ind])
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=1)

In [5]:
# check out the content of the dataset
for i, (x, y) in enumerate(train_loader):
    break

print("x.shape:", x.shape)
print("name: ", end="")
for letter_onehot in x[0]:
    print(all_letters[torch.argmax(letter_onehot)], end="")

print("\ny:", label_to_idx[y.item()])

x.shape: torch.Size([1, 10, 52])
name: Sniegowski
y: polish


## Zadanie 1. (2 pkt.)

Zaimplementuj "zwykłą" sieć rekurencyjną. 
![rnn](https://colah.github.io/posts/2015-08-Understanding-LSTMs/img/RNN-unrolled.png)

* W klasie `RNN` należy zainicjalizować potrzebne wagi oraz zaimplementować główną logikę dla pojedynczej chwili czasowej $x_t$
* Wyjście z sieci możemy mieć dowolny rozmiar, potrzebna jest również warstwa przekształacjąca stan ukryty na wyjście.
* W pętli uczenia należy dodać odpowiednie wywołanie sieci. HINT: pamiętać o iterowaniu po wymiarze "czasowym".
* Zalecane jest użycie aktywacji na warstwie liczącej reprezentacje `hidden` tak, aby nie "eksplodowała", np. `tanh`.


In [6]:
class RNN(torch.nn.Module):
    
    def __init__(self, 
                 input_size: int,
                 hidden_size: int, 
                 output_size: int):
        """
        :param input_size: int
            Dimensionality of the input vector
        :param hidden_size: int
            Dimensionality of the hidden space
        :param output_size: int
            Desired dimensionality of the output vector
        """
        super(RNN, self).__init__()

        self.hidden_size = hidden_size

        self.input_to_hidden = torch.nn.Linear(input_size + hidden_size, hidden_size)
        
        self.hidden_to_output = torch.nn.Linear(hidden_size, output_size)
    
    # for the sake of simplicity a single forward will process only a single timestamp 
    def forward(self, 
                input: torch.tensor, 
                hidden: torch.tensor) -> Tuple[torch.tensor, torch.tensor]:
        """
        :param input: torch.tensor 
            Input tesnor for a single observation at timestep t
            shape [batch_size, input_size]
        :param hidden: torch.tensor
            Representation of the memory of the RNN from previous timestep
            shape [batch_size, hidden_size]
        """
        
        combined = torch.cat([input, hidden], dim=1) 
        hidden = torch.nn.Tanh()(self.input_to_hidden(combined))
        output = self.hidden_to_output(hidden)
        return output, hidden
    
    def init_hidden(self, batch_size: int) -> torch.Tensor:
        """
        Returns initial value for the hidden state
        """
        return torch.zeros(batch_size, self.hidden_size, requires_grad=True).cuda()

### Pętla uczenia

In [7]:
n_class = len(label_to_idx)

# initialize network and optimizer
rnn = RNN(n_letters, 256, n_class).cuda()
optimizer = torch.optim.SGD(rnn.parameters(), lr=0.01)   

# we will train for only a single epoch 
epochs = 1


# main loop
for epoch in range(epochs):
    
    loss_buffer = []
    
    for i, (x, y) in enumerate(train_loader):  
        
        x = x.cuda()
        y = y.cuda()
        
        optimizer.zero_grad()
        # get initial hidden state
        hidden = rnn.init_hidden(x.shape[0])
        
        # get output for the sample, remember that we treat it as a sequence
        # so you need to iterate over the 2nd, time dimensiotn

        seq_len = x.shape[1]
        
        for t in range(seq_len - 1):
            _, hidden = rnn(x[:, t, :], hidden)
        output = rnn(x[:, -1, :], hidden)[0]
            
        loss = cross_entropy(output, y)
        loss.backward()
        optimizer.step()  
        
        loss_buffer.append(loss.item())
        
        if i % 1000 == 1:
            print(f"Epoch: {epoch} Progress: {100 * i/len(train_loader):2.0f}% Loss: {np.mean(loss_buffer):.3f}")
            loss_buffer = []
    

# evaluate on the test set
with torch.no_grad():
    ps = []
    ys = []
    correct = 0
    for i, (x, y) in enumerate(test_loader):
        x = x.cuda()
        ys.append(y.numpy())

        hidden = rnn.init_hidden(x.shape[0])
        seq_len = x.shape[1]

        for t in range(seq_len - 1):
            _, hidden = rnn(x[:, t, :], hidden)
        output = rnn(x[:, -1, :], hidden)[0]

        pred = output.argmax(dim=1)
        ps.append(pred.cpu().numpy())
    
    ps = np.concatenate(ps, axis=0)
    ys = np.concatenate(ys, axis=0)
    f1 = f1_score(ys, ps, average='weighted')
    
    print(f"Final F1 score: {f1:.2f}")
    assert f1 > 0.15, "You should get over 0.15 f1 score, try changing some hiperparams!"

Epoch: 0 Progress:  0% Loss: 2.886
Epoch: 0 Progress:  6% Loss: 2.861
Epoch: 0 Progress: 11% Loss: 2.766
Epoch: 0 Progress: 17% Loss: 2.496
Epoch: 0 Progress: 22% Loss: 2.291
Epoch: 0 Progress: 28% Loss: 2.148
Epoch: 0 Progress: 33% Loss: 1.977
Epoch: 0 Progress: 39% Loss: 1.924
Epoch: 0 Progress: 44% Loss: 1.894
Epoch: 0 Progress: 50% Loss: 1.792
Epoch: 0 Progress: 55% Loss: 1.796
Epoch: 0 Progress: 61% Loss: 1.774
Epoch: 0 Progress: 66% Loss: 1.746
Epoch: 0 Progress: 72% Loss: 1.726
Epoch: 0 Progress: 77% Loss: 1.656
Epoch: 0 Progress: 83% Loss: 1.697
Epoch: 0 Progress: 89% Loss: 1.643
Epoch: 0 Progress: 94% Loss: 1.673
Epoch: 0 Progress: 100% Loss: 1.662
Final F1 score: 0.17


## Zadanie 2. (0.5 pkt.)
Zaimplementuj funkcje `predict`, która przyjmuje nazwisko w postaci stringa oraz model RNN i wypisuje 3 najlepsze predykcje narodowości dla tego nazwiska razem z ich logitami.

**Hint**: Przyda się tutaj jedna z funkcji z pierwszej komórki notebooka.

In [8]:
def predict(name: str, rnn: RNN):
    """Prints the name and model's top 3 predictions with scores"""
    tensor = line_to_tensor(name)
    tensor = tensor.view(1, *tensor.shape).cuda()
    hidden = rnn.init_hidden(1)
    for t in range(len(name)):
        output, hidden = rnn(tensor[:, t, :], hidden)
    logits, indices = torch.sort(output[0], descending=True)
    for l, i in zip(logits[:3], indices[:3]):
        print("\t", label_to_idx[int(i)], float(l))

In [9]:
some_names = ["Satoshi", "Jackson", "Schmidhuber", "Hinton", "Kowalski"]

for name in some_names:
    print(name)
    predict(name, rnn)

Satoshi
	 italian 3.7912449836730957
	 japanese 2.2028489112854004
	 spanish 1.1157729625701904
Jackson
	 scottish 4.44012451171875
	 irish 2.8323991298675537
	 english 2.5166213512420654
Schmidhuber
	 german 3.363201141357422
	 dutch 2.515110731124878
	 czech 1.7448205947875977
Hinton
	 scottish 2.6894121170043945
	 irish 2.480625867843628
	 english 2.0234460830688477
Kowalski
	 polish 5.826296806335449
	 japanese 3.1261284351348877
	 russian 3.038498640060425


## Zadanie 3 (4 pkt.)
Ostatnim zadaniem jest implementacji komórki i sieci LSTM. 

![lstm](https://colah.github.io/posts/2015-08-Understanding-LSTMs/img/LSTM3-chain.png)

* W klasie `LSTMCell` ma znaleźć się główna loginka LSTMa, czyli wszystkie wagi do stanów `hidden` i `cell` jak i bramek kontrolujących te stany. 
* W klasie `LSTM` powinno znaleźć się wywołanie komórki LSTM, HINT: poprzednio było w pętli uczenia, teraz przenisiemy to do klasy modelu.
* W pętli uczenia należy uzupełnić brakujące wywołania do uczenia i ewaluacji modelu.

Zdecydowanie polecam [materiały Chrisa Olaha](http://colah.github.io/posts/2015-08-Understanding-LSTMs/) do zarówno zrozumienia jak i ściągi do wzorów.

Zadaniem jest osiągnięcie wartości `f1_score` lepszej niż na sieci RNN, przy prawidłowej implementacji nie powinno być z tym problemów używając podanych hiperparametrów. Dozwolona jest oczywiście zmiana `random seed`.

#### Komórka LSTM

In [13]:
class LSTMCell(torch.nn.Module):

    def __init__(self, 
                 input_size: int, 
                 hidden_size: int):
        """
        :param input_size: int
            Dimensionality of the input vector
        :param hidden_size: int
            Dimensionality of the hidden space
        """
        
        super(LSTMCell, self).__init__()
        
        self.input_size = input_size
        self.hidden_size = hidden_size

        # initialize LSTM weights 
        # NOTE: there are different approaches that are all correct 
        # (e.g. single matrix for all input opperations), you can pick
        # whichever you like for this task

        self.forgetting_layer = torch.nn.Linear(input_size + hidden_size, hidden_size)
        self.input_layer = torch.nn.Linear(input_size + hidden_size, hidden_size)
        self.cell_layer = torch.nn.Linear(input_size + hidden_size, hidden_size)
        self.output_layer = torch.nn.Linear(input_size + hidden_size, hidden_size)

    def forward(self, 
                input: torch.tensor, 
                states: Tuple[torch.tensor, torch.tensor]) -> Tuple[torch.tensor, torch.tensor]:
        
        hidden, cell = states
        
        # Compute input, forget, and output gates
        # then compute new cell state and hidden state
        # see http://colah.github.io/posts/2015-08-Understanding-LSTMs/ 

        combined = torch.cat((hidden, input), dim=1)

        forgetting = torch.nn.Sigmoid()(self.forgetting_layer(combined))
        input = torch.nn.Tanh()(self.input_layer(combined))
        candidates = torch.nn.Sigmoid()(self.cell_layer(combined))
        cell = forgetting * cell + input * candidates
        
        hidden = torch.tanh(cell) * torch.nn.Sigmoid()(self.output_layer(combined))
        
        return hidden, cell

### Klasa modelu LSTM

In [14]:
class LSTM(torch.nn.Module):

    def __init__(self, 
                 input_size: int, 
                 hidden_size: int):
        """
        :param input_size: int
            Dimensionality of the input vector
        :param hidden_size: int
            Dimensionality of the hidden space
        """
        
        super(LSTM, self).__init__()
        
        self.input_size = input_size
        self.hidden_size = hidden_size

        self.cell = LSTMCell(input_size=input_size, hidden_size=hidden_size)
        
    def forward(self, 
                input: torch.tensor) -> Tuple[torch.tensor, torch.tensor]:
        """
        :param input: torch.tensor 
            Input tesnor for a single observation at timestep t
            shape [batch_size, input_size]
        Returns Tuple of two torch.tensors, both of shape [seq_len, batch_size, hidden_size]
        """
        
        batch_size = input.shape[0]
        
        hidden, cell = self.init_hidden_cell(batch_size)
        
        hiddens = []
        cells = []
        
        # this time we will process the whole sequence in the forward method
        # as oppose to the previous exercise, remember to loop over the timesteps
        
        time_steps = input.shape[1]

        for t in range(time_steps):
            hidden, cell = self.cell(input[:, t, :], (hidden, cell))
            hiddens.append(hidden)
            cells.append(cell)

        return hiddens, cells
    
    def init_hidden_cell(self, batch_size):
        """
        Returns initial value for the hidden and cell states
        """
        return (torch.zeros(batch_size, self.hidden_size, requires_grad=True).cuda(), 
                torch.zeros(batch_size, self.hidden_size, requires_grad=True).cuda())

### Pętla uczenia

In [41]:
from itertools import chain

# torch.manual_seed(1337)

# build data loaders
train_loader = DataLoader(train_dataset, batch_size=1, sampler=sampler)
test_loader = DataLoader(test_dataset, batch_size=1)

# initialize the lstm with an additional cliassifier layer at the top
lstm = LSTM(input_size=len(all_letters), hidden_size=128).cuda()
clf = torch.nn.Linear(in_features=128, out_features=len(label_to_idx)).cuda()

# initialize a optimizer
params = chain(lstm.parameters(), clf.parameters())
optimizer = torch.optim.Adam(params, lr=0.01) 

# we will train for only a single epoch 
epoch = 1

# main loop
for epoch in range(epoch):
    
    loss_buffer = []
    
    for i, (x, y) in enumerate(train_loader):   
        
        x = x.cuda()
        y = y.cuda()
        
        optimizer.zero_grad()
        
        # get output for the sample, remember that we treat it as a sequence
        # so you need to iterate over the sequence length here
        # don't forget about the classifier!
        
        output = clf(lstm(x)[1][-1])

        # calucate the loss
        loss = cross_entropy(output, y)
        loss.backward()
        optimizer.step()                                
        
        loss_buffer.append(loss.item())
        
        if i % 1000 == 1:
            print(f"Epoch: {epoch} Progress: {100 * i/len(train_loader):2.0f}% Loss: {np.mean(loss_buffer):.3f}")
            loss_buffer = []

# evaluate on the test set
with torch.no_grad():
    
    ps = []
    ys = []
    for i, (x, y) in enumerate(test_loader): 
        
        x = x.cuda()
        ys.append(y.numpy())
        
        output = clf(lstm(x)[1][-1])

        pred = output.argmax(dim=1)
        ps.append(pred.cpu().numpy())
    
    ps = np.concatenate(ps, axis=0)
    ys = np.concatenate(ys, axis=0)
    f1 = f1_score(ys, ps, average='weighted')
    
    print(f"Final F1 score: {f1:.2f}")
    assert f1 > 0.18, "You should get over 0.18 f1 score, try changing some hiperparams!"

Epoch: 0 Progress:  0% Loss: 2.894
Epoch: 0 Progress:  6% Loss: 2.381
Epoch: 0 Progress: 11% Loss: 1.921
Epoch: 0 Progress: 17% Loss: 1.707
Epoch: 0 Progress: 22% Loss: 1.565
Epoch: 0 Progress: 28% Loss: 1.572
Epoch: 0 Progress: 33% Loss: 1.305
Epoch: 0 Progress: 39% Loss: 1.176
Epoch: 0 Progress: 44% Loss: 1.105
Epoch: 0 Progress: 50% Loss: 1.100
Epoch: 0 Progress: 55% Loss: 1.039
Epoch: 0 Progress: 61% Loss: 1.007
Epoch: 0 Progress: 66% Loss: 0.939
Epoch: 0 Progress: 72% Loss: 0.912
Epoch: 0 Progress: 77% Loss: 0.881
Epoch: 0 Progress: 83% Loss: 0.795
Epoch: 0 Progress: 89% Loss: 0.901
Epoch: 0 Progress: 94% Loss: 0.810
Epoch: 0 Progress: 100% Loss: 0.758
Final F1 score: 0.24


## Zadanie 4. (0.5 pkt.)
Zaimplementuj analogiczną do funkcji `predict` z zadania 2 dla modelu `lstm+clf`.


In [42]:
def predict_lstm(name: str, lstm: LSTM, clf: torch.nn.Module):
    """Prints the name and model's top 3 predictions with scores"""
    tensor = line_to_tensor(name)
    tensor = tensor.view(1, *tensor.shape).cuda()
    output = clf(lstm(tensor)[1][-1])[0]
    logits, indices = torch.sort(output, descending=True)
    for l, i in zip(logits[:3], indices[:3]):
        print("\t", label_to_idx[int(i)], float(l))

In [43]:
# test your lstm predictor
some_names = ["Satoshi", "Jackson", "Schmidhuber", "Hinton", "Kowalski"]
    
for name in some_names:
    print(name)
    predict_lstm(name, lstm, clf)

Satoshi
	 japanese 9.24991226196289
	 italian 6.910540580749512
	 russian 5.170266151428223
Jackson
	 scottish 15.353461265563965
	 english 6.422601699829102
	 french 1.0398693084716797
Schmidhuber
	 german 4.486064434051514
	 czech 2.473479747772217
	 dutch 2.399200439453125
Hinton
	 scottish 7.10476541519165
	 english 4.8557610511779785
	 chinese 2.0448806285858154
Kowalski
	 polish 8.943662643432617
	 czech 3.258988618850708
	 japanese 3.168221950531006
