# TP 5 LSTM, GRU, et autres cellules à mémoire
Dans ce TP, nous étudierons deux problèmes :

1. Comment traiter et générer des séquences de taille variable ?
2. Comment prendre en compte des dépendances à plus long terme (vanishing/exploding gradients) ?

Pour la génération de séquences, on travaillera sur le jeu de données (discours pré-électoraux de Trump).

Nous allons étudier des variantes des RNNs, et en particulier les Long-Short
Term Memories/LSTMs [2] et les Gated Recurrent Units/GRU


## Notable TODOs:
- 1 Génération de séquences de taille variable
- 2 Prise en compte de dépendances lointaines : LSTM et GRU
- 3 Beam-search



# Setup

## Library import
Nous importons toutes les bibliothèques Python nécessaires

In [None]:
# Mount Google Drive under /content/drive (Colab-only helper)
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Data manipulation
import pandas as pd
import numpy as np


# Options for pandas
pd.options.display.max_columns = 50
pd.options.display.max_rows = 30

# Visualizations
import plotly
import plotly.graph_objs as go
import plotly.offline as ply
plotly.offline.init_notebook_mode(connected=True)

import cufflinks as cf
cf.go_offline(connected=True)
cf.set_config_file(theme='white')

import matplotlib as plt
import os
from datetime import datetime
# Autoreload extension
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
    
%autoreload 2

## Local library import
Nous importons toutes les bibliothèques locales nécessaires

In [None]:
# Include local library paths
import nltk
import re

import sys
sys.path.append("./drive/MyDrive/amal/tme5/src") # uncomment and fill to import local libraries

from textloader import *
from tp5 import *

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# Show which device was selected above
print(device)

cuda


# 1. Implémentation et expérimentation de LSTM







Avec les RNN, lorsque la séquence à traiter est trop longue, le gradient de l'erreur rétropropagé peut soit devenir beaucoup trop grand et exploser, soit au contraire devenir beaucoup trop petit et s'évanouir. Le réseau ne fait alors plus la différence entre une information qu'il doit prendre en compte et une information qu'il peut ignorer. Il se trouve ainsi dans l'incapacité d'apprendre des dépendances à long terme.

Les LSTM (Long short-term memory) proposent une solution à ce problème en prenant en compte un vecteur mémoire via un système de portes (gates) et d’états.


<img src="./drive/MyDrive/amal/tme5/img/lstm.png" style="display:block;margin-left:auto;margin-right:auto;width:50%;"> <br>

## Parameter definition
Nous fixons tous les paramètres pertinents pour notre notebook. Par convention
les paramètres sont en majuscules, tandis que toutes les autres variables suivent les directives de Python.



In [None]:
# --- Optimisation settings ---
lr = 1e-3
batch_size = 64
n_epochs = 10

# --- Model dimensions: the embedding table and the output layer both have
# --- one entry per symbol listed in id2lettre
num_embeddings = len(id2lettre)
embedding_dim = 200
hidden_size = 200
output_size = len(id2lettre)

# --- Output locations ---
log_dir = "./drive/MyDrive/amal/tme5/experiments/lstm"
checkpoint_dir = './checkpoints/lstm'
# The timestamp suffix gives each execution of this cell its own checkpoint file
_run_stamp = datetime.now().strftime('%d_%m_%Y_%H:%M:%S')
checkpoint_path = f'{checkpoint_dir}/checkpoint_same_l_{_run_stamp}'

# Create both folders up front; exist_ok makes re-running the cell harmless
os.makedirs(log_dir, exist_ok=True)
os.makedirs(checkpoint_dir, exist_ok=True)


## Data import
Nous récupérons toutes les données nécessaires à l'analyse.

In [None]:
# Fetch the punkt tokenizer data used by nltk
nltk.download('punkt')
data_path = "./drive/MyDrive/amal/tme5/data/trump_full_speech.txt"
# Explicit encoding so the result does not depend on the platform's default
# locale encoding.
with open(data_path, encoding="utf-8") as f:
    text = f.read()
# Drop leading/trailing whitespace of the whole corpus
text = text.strip()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## Création du dataset et du DataLoader




In [None]:
# Wrap the raw text in the project's TextDataset, then batch it.
# NOTE(review): the loader uses a literal batch size of 32 while the parameter
# cell declares batch_size = 64 — confirm which one is intended.
traindataset = TextDataset(text)
trainloader = DataLoader(
    traindataset,
    shuffle=True,
    batch_size=32,
    collate_fn=collate_fn,
)

In [None]:
from tqdm import tqdm

class LSTM(nn.Module):
    """Hand-written LSTM cell with an embedding front end.

    ``forward`` advances the recurrence by a single time step and
    ``decode`` turns a hidden state into log-probabilities over the
    ``output_size`` classes.
    """

    def __init__(self, num_embeddings, embedding_dim, hidden_size, output_size):
        """Create the embedding table, the four gate layers and the decoder.

        Args:
            num_embeddings: number of rows of the embedding table.
            embedding_dim: size of each embedding vector.
            hidden_size: size of the hidden state and of the cell state.
            output_size: number of classes produced by ``decode``.
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.embeddings = nn.Embedding(num_embeddings=num_embeddings, embedding_dim=embedding_dim)

        # Every gate reads the concatenation [embedded input, previous hidden].
        gate_in = embedding_dim + hidden_size
        self.f = nn.Linear(gate_in, hidden_size)  # forget gate
        self.i = nn.Linear(gate_in, hidden_size)  # input gate
        self.c = nn.Linear(gate_in, hidden_size)  # candidate cell content
        self.o = nn.Linear(gate_in, hidden_size)  # output gate

        # Linear projection followed by a log-softmax over dim 1.
        self.out = nn.Sequential(nn.Linear(hidden_size, output_size), nn.LogSoftmax(dim=1))

    def forward(self, x, h, c):
        """Run one recurrence step.

        Args:
            x: integer symbol ids; after embedding, axis 1 is squeezed away
                (assumes x is (batch, 1) — TODO confirm against callers).
            h: previous hidden state, concatenated with the embedding on dim 1.
            c: previous cell state.

        Returns:
            Tuple ``(ht, ct)``: the new hidden state and the new cell state.
        """
        embedded = self.embeddings(x).squeeze(1)
        gate_input = torch.cat([embedded, h], dim=1)

        forget_gate = torch.sigmoid(self.f(gate_input))
        input_gate = torch.sigmoid(self.i(gate_input))
        candidate = torch.tanh(self.c(gate_input))
        output_gate = torch.sigmoid(self.o(gate_input))

        # Keep part of the old memory, add part of the new candidate.
        next_c = forget_gate * c + input_gate * candidate
        next_h = output_gate * torch.tanh(next_c)
        return next_h, next_c

    def decode(self, ht):
        """Map a hidden state to log-probabilities over the output classes."""
        return self.out(ht)

    def initHidden(self, n):
        """Return zero-filled ``(h, c)`` tensors of shape ``(n, hidden_size)``.

        The tensors are created with torch defaults; callers move them to the
        device they need.
        """
        shape = (n, self.hidden_size)
        return torch.zeros(shape), torch.zeros(shape)

class MaskedCrossEntropy(nn.Module):
    """Negative log-likelihood averaged over the unmasked positions only.

    Despite the ``logit`` parameter name, the input is handed to
    ``F.nll_loss`` and must therefore already contain log-probabilities.
    """

    def __init__(self):
        super().__init__()

    def forward(self, logit, target, mask):
        """Compute the masked mean loss.

        Args:
            logit: log-probabilities, as expected by ``F.nll_loss``.
            target: class index of each item.
            mask: per-item weight; items whose mask is False/0 are ignored.

        Returns:
            Scalar tensor: sum of the kept losses divided by ``mask.sum()``.
            When nothing is kept the result is 0 instead of NaN.
        """
        per_item = F.nll_loss(logit, target, reduction='none')
        masked = per_item * mask
        kept = mask.sum()
        # Adding (kept == 0) turns a zero denominator into 1 and leaves any
        # other value untouched, so a fully padded step yields 0 rather than
        # 0/0 = NaN.
        return masked.sum() / (kept + (kept == 0))

def train_step(state, model, criterion, x, y, m):
    """Unroll the recurrent model over one batch and sum the per-step losses.

    Args:
        state: object whose ``model`` attribute provides ``initHidden``,
            ``__call__`` and ``decode``; this is the network actually used.
        model: unused, kept for interface compatibility.
        criterion: callable ``(log_probs, target, mask) -> loss``.
        x: input ids, iterated along axis 0; ``x.shape[1]`` is passed to
            ``initHidden`` (presumably (seq_len, batch) — confirm with caller).
        y: targets, indexed per time step.
        m: mask, indexed per time step.

    Returns:
        The sum over time steps of ``criterion`` outputs.
    """
    # Out-of-place unsqueeze: the caller's tensor keeps its shape.
    steps = x.unsqueeze(-1)
    # Put the initial state on the same device as the inputs.
    hidden = [t.to(steps.device) for t in state.model.initHidden(steps.shape[1])]
    loss = 0
    for t, x_t in enumerate(steps):
        hidden = state.model(x_t, *hidden)
        logits = state.model.decode(hidden[0])
        loss += criterion(logits, y[t], m[t])
    return loss


def train(train, model, criterion, optimizer, scheduler, n_epochs, log_dir, checkpoint_path):
    """Train for ``n_epochs`` epochs, checkpointing after each one.

    Args:
        train: iterable of batches; each batch is sliced along axis 0 into
            inputs (all but the last step) and targets (all but the first).
        model: network handed to ``load_state`` and ``train_step``.
        criterion: loss callable forwarded to ``train_step``.
        optimizer: optimizer handed to ``load_state``.
        scheduler: optional learning-rate scheduler stepped once per epoch;
            ``None`` disables it.
        n_epochs: number of epochs to run.
        log_dir: directory given to ``SummaryWriter``.
        checkpoint_path: file read by ``load_state`` and written by
            ``save_state``.

    Returns:
        List with the mean per-batch loss of each epoch.
    """
    losses = []
    writer = SummaryWriter(log_dir=log_dir)
    pbar = tqdm(range(n_epochs), total=n_epochs, file=sys.stdout)
    # The model/optimizer that get updated are the ones held by `state`.
    state = load_state(checkpoint_path, model, optimizer)

    try:
        for epoch in pbar:
            batch_losses = []
            for batch in train:
                # Next-symbol prediction: targets are the inputs shifted by
                # one step; padded target positions are masked out.
                x = batch[:-1].to(device)
                y = batch[1:].to(device)
                m = (batch[1:] != PAD_IX).to(device)

                loss = train_step(state, model, criterion, x, y, m)
                state.optimizer.zero_grad()
                loss.backward()
                state.optimizer.step()
                # train_step sums over time steps; report a per-step value.
                batch_losses.append(loss.item() / len(x))
                state.iteration += 1

            state.epoch += 1
            save_state(checkpoint_path, state)

            if scheduler is not None:
                scheduler.step()

            epoch_loss = np.mean(batch_losses)
            losses.append(epoch_loss)

            writer.add_scalar('Loss/train', epoch_loss, epoch)

            # Iterating over `pbar` already advances the bar, so no manual
            # pbar.update() is needed here.
            pbar.set_description(f'Train: Loss: {np.round(epoch_loss, 4)}')
    finally:
        # Flush pending events and release the writer even if training fails.
        writer.close()

    return losses

class State:
    """Bundle of everything a checkpoint stores: model, optimizer, counters."""

    def __init__(self, model, optimizer):
        # Objects being trained
        self.model = model
        self.optimizer = optimizer
        # Progress counters, both starting at zero
        self.epoch = 0
        self.iteration = 0


def save_state(checkpoint_path, state):
    """Serialize ``state`` to ``checkpoint_path`` with ``torch.save``.

    Missing parent folders are created first, so a not-yet-existing
    checkpoint directory no longer raises ``FileNotFoundError``.

    Args:
        checkpoint_path: destination file (str or path-like).
        state: object to pickle.
    """
    savepath = Path(checkpoint_path)
    savepath.parent.mkdir(parents=True, exist_ok=True)
    with savepath.open("wb") as f:
        torch.save(state, f)

def load_state(checkpoint_path, model, optimizer):
    """Return the checkpoint stored at ``checkpoint_path`` or a fresh State.

    When the file exists, the object read by ``torch.load`` is returned as is
    and the ``model`` / ``optimizer`` arguments are not used.
    NOTE(review): torch.load unpickles arbitrary objects — only open
    checkpoints you trust.
    """
    savepath = Path(f"{checkpoint_path}")
    # No checkpoint yet: start from the objects supplied by the caller.
    if not savepath.is_file():
        return State(model, optimizer)
    with savepath.open("rb") as handle:
        return torch.load(handle)


## Définition de notre modèle, de l'optimiseur et de la loss




---




In [None]:
# Build the LSTM, its loss and its optimiser from the notebook parameters
model = LSTM(num_embeddings=num_embeddings, embedding_dim=embedding_dim, hidden_size=hidden_size, output_size=output_size).to(device)
criterion = MaskedCrossEntropy()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
losses = train(
        train=trainloader,
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler = None,
        n_epochs=n_epochs,
        log_dir=log_dir,
        checkpoint_path=checkpoint_path
    )

# torch.save does not create missing folders; make sure the destination
# exists so a finished training run is not lost to a FileNotFoundError.
model_path = "./drive/MyDrive/tme5/model-lstm-fitted.pth"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
torch.save(model.state_dict(), model_path)



  0%|          | 0/10 [00:00<?, ?it/s][A[A

Train: Loss: 2.1753:   0%|          | 0/10 [03:36<?, ?it/s][A[A

Train: Loss: 2.1753:  10%|█         | 1/10 [03:36<32:24, 216.03s/it][A[A

Train: Loss: 2.1753:  10%|█         | 1/10 [03:36<32:24, 216.04s/it][A[A

Train: Loss: 1.7024:  10%|█         | 1/10 [07:00<32:24, 216.04s/it][A[A

Train: Loss: 1.7024:  20%|██        | 2/10 [07:00<28:19, 212.48s/it][A[A

Train: Loss: 1.7024:  20%|██        | 2/10 [07:00<28:19, 212.48s/it][A[A

Train: Loss: 1.5232:  20%|██        | 2/10 [10:21<28:19, 212.48s/it][A[A

Train: Loss: 1.5232:  30%|███       | 3/10 [10:21<24:24, 209.19s/it][A[A

Train: Loss: 1.5232:  30%|███       | 3/10 [10:21<24:24, 209.20s/it][A[A

Train: Loss: 1.4188:  30%|███       | 3/10 [13:42<24:24, 209.20s/it][A[A

Train: Loss: 1.4188:  40%|████      | 4/10 [13:42<20:39, 206.62s/it][A[A

Train: Loss: 1.4188:  40%|████      | 4/10 [13:42<20:39, 206.62s/it][A[A

Train: Loss: 1.3519:  40%|████      | 4/10 [17:03

FileNotFoundError: ignored

In [None]:
%load_ext tensorboard
%tensorboard --logdir ./experiments/lstm

In [None]:
# Start again from a freshly initialised LSTM with the notebook parameters
model = LSTM(
    num_embeddings=num_embeddings,
    embedding_dim=embedding_dim,
    hidden_size=hidden_size,
    output_size=output_size,
).to(device)
criterion = MaskedCrossEntropy()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# Same training call as before, arguments passed by keyword
losses = train(
    train=trainloader,
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=None,
    n_epochs=n_epochs,
    log_dir=log_dir,
    checkpoint_path=checkpoint_path,
)




  0%|          | 0/10 [00:00<?, ?it/s][A

NameError: ignored

## Génération de séquences à l'aide de LSTM


In [None]:
# map_location lets weights saved during a GPU session load on whatever
# device this session selected (including a CPU-only runtime).
model.load_state_dict(torch.load("./drive/MyDrive/tme5/model-lstm-fitted.pth", map_location=device))
model.eval()

In [None]:
def generate(model, start, maxlength):
    """Sample a sequence symbol by symbol, starting from ``start``.

    Args:
        model: recurrent model exposing ``initHidden``, ``__call__`` and
            ``decode`` (``decode`` output is exponentiated to get
            probabilities, so it must return log-probabilities).
        start: seed symbol; it is used as a key of ``lettre2id``, so it must
            be a single known symbol.
        maxlength: maximum number of symbols to sample.

    Returns:
        Tuple ``(string, predictions)``: the decoded text (seed included) and
        the list of per-step ``decode`` outputs as plain Python lists.
    """
    x = torch.tensor(string2code(start)).unsqueeze(-1).to(device)
    h = [v.to(device) for v in model.initHidden(1)]
    codes = [lettre2id[start]]
    model.eval()
    predictions = []
    with torch.no_grad():
        for _ in range(maxlength):
            h = model(x, *h)
            log_probs = model.decode(h[0])
            predictions.append(log_probs.squeeze(0).tolist())
            probs = torch.exp(log_probs)
            sampled = torch.distributions.categorical.Categorical(probs).sample()
            codes.append(sampled.item())
            if sampled.item() == EOS_IX:
                break
            # Feed the sampled symbol back in as the next input; without
            # this the model would see the seed symbol at every step.
            x = sampled.unsqueeze(-1).to(device)
    return code2string(codes), predictions


In [None]:
# Sample up to 100 symbols after the seed "W" and display the text
string, predictions = generate(model, "W", 100)
string

'W0[\\KT+vo71mZ|9::{icL9c#l))h#T#cj(q;LC[*<.$x1:0g 7?\'75#l":CH~3Q>C^t<HRUtI2su(9ja|+g_hXiV-"2,7]VL\'.Wk'

In [None]:
# Larger LSTM: 300 hidden units, 200-dimensional embeddings
vocab_size = len(id2lettre)
model = LSTM(
    num_embeddings=vocab_size,
    embedding_dim=200,
    hidden_size=300,
    output_size=vocab_size,
).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = MaskedCrossEntropy()

In [None]:
from datetime import datetime

# Give this run its own checkpoint file: load_state() returns whatever State
# is already stored at the path it receives, so reusing a path that an
# earlier run wrote to would resume that run's model instead of training
# this one.
checkpoint_path_h300 = f'{checkpoint_dir}/checkpoint_h300_' + \
            datetime.now().strftime('%d_%m_%Y_%H:%M:%S')
losses = train(trainloader, model, criterion, optimizer, None, 100, log_dir, checkpoint_path_h300)

TypeError: ignored

In [None]:
# Sample up to 100 symbols after the seed "I" and display the text
string, predictions = generate(model, "I", 100)
string

"InD'G !b5iQHyM5:itx&Jj4zAiRl*SX{X9uQ]o,`$jvb+}GLHutUA&';9Zqy7gM%3]|"

# 2. Implémentation et expérimentation de GRU

Les GRU (Gated Recurrent Unit) sont une variante des LSTM. Cette structure est plus simple que celle des LSTM, au sens où moins de paramètres entrent en jeu : les portes d'entrée et d'oubli sont fusionnées, le nombre de portes passe à 2 et celui d'états à 1.

<img src="./drive/MyDrive/amal/tme5/img/GRU.png" style="display:block;margin-left:auto;margin-right:auto;width:50%;"> <br>


In [None]:
from datetime import datetime

# Dedicated folders: the GRU run must neither resume from nor overwrite the
# LSTM checkpoints, and its curves get their own TensorBoard directory.
gru_log_dir = "./drive/MyDrive/amal/tme5/experiments/gru"
gru_checkpoint_dir = './checkpoints/gru'
gru_checkpoint_path = f'{gru_checkpoint_dir}/checkpoint_' + \
            datetime.now().strftime('%d_%m_%Y_%H:%M:%S')
os.makedirs(gru_log_dir, exist_ok=True)
os.makedirs(gru_checkpoint_dir, exist_ok=True)

# train() moves every batch to `device`, so the model has to live there too.
model = GRU(num_embeddings=len(id2lettre), embedding_dim=100, hidden_size=150, output_size=len(id2lettre)).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = MaskedCrossEntropy()
# train() requires log_dir and checkpoint_path; they were missing from this call.
losses = train(trainloader, model, criterion, optimizer, None, 20, gru_log_dir, gru_checkpoint_path)

In [None]:
def generate(model, start, maxlength):
    """Sample a sequence symbol by symbol, starting from ``start``.

    Args:
        model: recurrent model exposing ``initHidden``, ``__call__`` and
            ``decode`` (``decode`` output is exponentiated to get
            probabilities, so it must return log-probabilities).
        start: seed symbol; it is used as a key of ``lettre2id``, so it must
            be a single known symbol.
        maxlength: maximum number of symbols to sample.

    Returns:
        Tuple ``(string, predictions)``: the decoded text (seed included) and
        the list of per-step ``decode`` outputs as plain Python lists.
    """
    x = torch.tensor(string2code(start)).unsqueeze(-1).to(device)
    h = [v.to(device) for v in model.initHidden(1)]
    codes = [lettre2id[start]]
    model.eval()
    predictions = []
    with torch.no_grad():
        for _ in range(maxlength):
            h = model(x, *h)
            log_probs = model.decode(h[0])
            predictions.append(log_probs.squeeze(0).tolist())
            probs = torch.exp(log_probs)
            sampled = torch.distributions.categorical.Categorical(probs).sample()
            codes.append(sampled.item())
            if sampled.item() == EOS_IX:
                break
            # Feed the sampled symbol back in as the next input; without
            # this the model would see the seed symbol at every step.
            x = sampled.unsqueeze(-1).to(device)
    return code2string(codes), predictions
string, predictions = generate(model, "x", 100)

# NOTE(review): this rebinds `datetime` from the class imported in the setup
# cell to the module; nothing in this cell uses it.
import datetime

# torch.save does not create missing folders
gru_model_path = "./models/gru-model.pth"
os.makedirs(os.path.dirname(gru_model_path), exist_ok=True)
torch.save(model.state_dict(), gru_model_path)

# Rebuild a GRU with the same constructor arguments as the trained one and
# restore the weights just written (the previous *args/**kwargs/PATH were
# undefined placeholders).
model = GRU(num_embeddings=len(id2lettre), embedding_dim=100, hidden_size=150, output_size=len(id2lettre)).to(device)
model.load_state_dict(torch.load(gru_model_path, map_location=device))
model.eval()

En pratique, les GRU et les LSTM permettent d'obtenir des résultats comparables. L'intérêt des GRU par rapport aux LSTM est un temps d'exécution plus court, puisque moins de paramètres doivent être calculés.

# 3. Beam-search

In [None]:
def beam_search_decoder(predictions, top_k = 3):
    """Keep the ``top_k`` highest-scoring index sequences over ``predictions``.

    Args:
        predictions: one entry per step, each a sequence of per-symbol scores
            that are summed along a path (e.g. log-probabilities).
        top_k: number of partial sequences kept after every step.

    Returns:
        List of ``(sequence, score)`` pairs sorted by decreasing score; with
        no predictions this is ``[([], 0)]``.
    """
    beams = [([], 0)]
    for step_scores in predictions:
        # Extend every surviving prefix with every possible symbol.
        candidates = [
            (prefix + [symbol], total + score)
            for prefix, total in beams
            for symbol, score in enumerate(step_scores)
        ]
        # Best first, then prune down to the beam width.
        candidates.sort(key=lambda pair: pair[1], reverse=True)
        beams = candidates[:top_k]
    return beams

    
# Keep the 5 best-scoring sequences and display the top one as text
seqeunces = beam_search_decoder(predictions, top_k = 5)
code2string(seqeunces[0][0])