# LSTM et attention

Nous allons dans ce notebook utiliser le modèle LSTM combiné avec le modèle d'attention, qui est expliqué dans cet <a href="https://arxiv.org/pdf/1706.03762.pdf"> article</a>.



In [1]:
import pandas as pd
import torch
from torchtext import data

SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

TEXT = data.Field(sequential=True,lower=True,  include_lengths=True,
                  tokenize = 'spacy')
LABEL = data.LabelField(dtype = torch.float)

train_data,valid_data, test_data = data.TabularDataset.splits(
        path='./data/', train='train.csv',
        validation='valid.csv', test='test.csv', format='csv', skip_header=True,
        fields=[('text', TEXT), ('label', LABEL)])

print(f'Taille des données train: {len(train_data)}')
print(f'Taille des données de validation: {len(valid_data)}')
print(f'Taille des données test: {len(test_data)}')

Taille des données train: 18163
Taille des données de validation: 2270
Taille des données test: 2271


In [2]:
MAX_VOCAB_SIZE = 25_000

TEXT.build_vocab(train_data, 
                 max_size = MAX_VOCAB_SIZE, 
                 vectors = "glove.6B.100d", 
                 unk_init = torch.Tensor.normal_)
LABEL.build_vocab(train_data)

In [3]:
BATCH_SIZE = 10

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE,
    device = device,sort_key = lambda x: len(x.text),
    sort_within_batch = True)

In [4]:
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.autograd import Variable


class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size, bidirectional = True):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.input_size = input_size
        self.bidirectional = bidirectional

        self.lstm = nn.LSTM(input_size, hidden_size, bidirectional = bidirectional)
  
    def forward(self, inputs, hidden):
    
        output, hidden = self.lstm(inputs.view(1, 1, self.input_size), hidden)
        return output, hidden
    
    def init_hidden(self):
        return (torch.zeros(1 + int(self.bidirectional), 1, self.hidden_size),
          torch.zeros(1 + int(self.bidirectional), 1, self.hidden_size))

class AttentionDecoder(nn.Module):
  
    def __init__(self, hidden_size, output_size, vocab_size):
        super(AttentionDecoder, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size

        self.attn = nn.Linear(hidden_size + output_size, 1)
        self.lstm = nn.LSTM(hidden_size + vocab_size, output_size) #if we are using embedding hidden_size should be added with embedding of vocab size
        self.final = nn.Linear(output_size, vocab_size)
  
    def init_hidden(self):
        return (torch.zeros(1, 1, self.output_size),
          torch.zeros(1, 1, self.output_size))
  
    def forward(self, decoder_hidden, encoder_outputs, input):
    
        weights = []
        for i in range(len(encoder_outputs)):
            print(decoder_hidden[0][0].shape)
            print(encoder_outputs[0].shape)
            weights.append(self.attn(torch.cat((decoder_hidden[0][0], 
                                              encoder_outputs[i]), dim = 1)))
        normalized_weights = F.softmax(torch.cat(weights, 1), 1)

        attn_applied = torch.bmm(normalized_weights.unsqueeze(1),
                                 encoder_outputs.view(1, -1, self.hidden_size))

        input_lstm = torch.cat((attn_applied[0], input[0]), dim = 1) #if we are using embedding, use embedding of input here instead

        output, hidden = self.lstm(input_lstm.unsqueeze(0), decoder_hidden)

        output = self.final(output[0])

        return output, hidden, normalized_weights
    
bidirectional = True
c = Encoder(10, 20, bidirectional)
a, b = c.forward(torch.randn(10), c.init_hidden())
print(a.shape)
print(b[0].shape)
print(b[1].shape)

x = AttentionDecoder(20 * (1 + bidirectional), 25, 30)
y, z, w = x.forward(x.init_hidden(), torch.cat((a,a)), torch.zeros(1,1, 30)) #Assuming <SOS> to be all zeros
print(y.shape)
print(z[0].shape)
print(z[1].shape)
print(w)


torch.Size([1, 1, 40])
torch.Size([2, 1, 20])
torch.Size([2, 1, 20])
torch.Size([1, 25])
torch.Size([1, 40])
torch.Size([1, 25])
torch.Size([1, 40])
torch.Size([1, 30])
torch.Size([1, 1, 25])
torch.Size([1, 1, 25])
tensor([[0.5000, 0.5000]], grad_fn=<SoftmaxBackward>)


## Construire le modèle

In [5]:
import torch
from torch import nn
import numpy as np
from torch.nn import functional as F


class RNNAttention(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout):
        super().__init__()
        
        
        self.n_layers = n_layers
        self.bidirectional = bidirectional
        self.hidden_dim = hidden_dim
        # Embedding Layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        # Encoder RNN       
        self.lstm = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout)
        
        self.fc = nn.Linear(hidden_dim * 2 * (1+bidirectional), output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
        # Softmax non-linearity
        self.softmax = nn.Softmax(dim=0)
                
    def apply_attention(self, rnn_output, final_hidden_state):
        '''
        Apply Attention on RNN output
        
        Input:
            rnn_output (batch_size, seq_len, num_directions * hidden_size): tensor representing hidden state for every word in the sentence
            final_hidden_state (batch_size, num_directions * hidden_size): final hidden state of the RNN
            
        Returns:
            attention_output(batch_size, num_directions * hidden_size): attention output vector for the batch
        '''
        hidden_state = final_hidden_state.unsqueeze(2)
        attention_scores = torch.bmm(rnn_output, hidden_state).squeeze(2)
        soft_attention_weights = F.softmax(attention_scores, 1).unsqueeze(2) #shape = (batch_size, seq_len, 1)
        attention_output = torch.bmm(rnn_output.permute(0,2,1), soft_attention_weights).squeeze(2)
        return attention_output
        
    def forward(self, text, text_lengths):
        # x.shape = (max_sen_len, batch_size)
        embedded_sent = self.embedding(text)
        
        # embedded_sent.shape = (max_sen_len=20, batch_size=64,embed_size=300)
        #pack sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded_sent, text_lengths)
        packed_output, (h_n,c_n) = self.lstm(packed_embedded)
        
        #unpack sequence
        output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)
        
        ##################################### Encoder ####################################### 
        # lstm_output.shape = (seq_len, batch_size, num_directions * hidden_size)
        
        # Final hidden state of last layer (num_directions, batch_size, hidden_size)
        
        batch_size = h_n.shape[1]
        h_n_final_layer = h_n.view(self.n_layers,
                                   self.bidirectional+ 1,
                                   batch_size,
                                   self.hidden_dim)[-1,:,:,:]
   
        ##################################### Attention #####################################
        # Convert input to (batch_size, num_directions * hidden_size) for attention
        final_hidden_state = torch.cat([h_n_final_layer[i,:,:] for i in range(h_n_final_layer.shape[0])], dim=1)
      
        attention_out = self.apply_attention(output.permute(1,0,2), final_hidden_state)
        # Attention_out.shape = (batch_size, num_directions * hidden_size)
           
        #################################### Linear #########################################
        concatenated_vector = torch.cat([final_hidden_state, attention_out], dim=1)
        
        final_feature_map = self.dropout(concatenated_vector) # shape=(batch_size, num_directions * hidden_size)
        final_out = self.fc(final_feature_map)
        return self.softmax(final_out) 

embed size:  torch.Size([410, 64, 100])

h_n torch.Size([4, 64, 256])

c_n torch.Size([4, 64, 256])

output torch.Size([410, 64, 512])

output_lengths torch.Size([64])

h_n_final_layer torch.Size([2, 64, 256])

final_hidden_state torch.Size([64, 512])

attention_out torch.Size([64, 512])

concatenated_vector torch.Size([64, 1024])

final_feature_map torch.Size([64, 1024])

In [6]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = RNNAttention(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT)

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 4,811,369 trainable parameters


In [7]:
pretrained_embeddings = TEXT.vocab.vectors

print(pretrained_embeddings.shape)

model.embedding.weight.data.copy_(pretrained_embeddings)

UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

print(model.embedding.weight.data)

torch.Size([25002, 100])
tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.5194, -0.9798,  0.8356,  ..., -0.5779, -1.2377, -0.6916],
        [ 0.1325, -0.3895, -0.6949,  ...,  0.9231,  1.2856, -0.9643],
        [-1.2891, -0.0067,  1.2189,  ...,  0.4144, -0.4232,  0.1408]])


In [8]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

def precision(preds, y):
    '''
    Retourne la précision
    '''
    y_pred = torch.round(torch.sigmoid(preds))
    y_true = (y_pred == y).float() 
            
    tp = (y_true * y_pred).sum().float()
    tn = ((1 - y_true) * (1 - y_pred)).sum().float()
    fp = ((1 - y_true) * y_pred).sum().float()
    fn = (y_true * (1 - y_pred)).sum().float()
       
    precision = tp / (tp + fp)
    
    return precision

def recall(preds, y):
    '''
    Retourne le recall
    '''
    y_pred = torch.round(torch.sigmoid(preds))
    y_true = (y_pred == y).float()       
    
    tp = (y_true * y_pred).sum().float()
    tn = ((1 - y_true) * (1 - y_pred)).sum().float()
    fp = ((1 - y_true) * y_pred).sum().float()
    fn = (y_true * (1 - y_pred)).sum().float()
    recall = tp / (tp + fn)
    return recall


def f1_loss(preds, y):
    '''
    Retourne le score F1
    '''  
    y_pred = torch.round(torch.sigmoid(preds))
    y_true = (y_pred == y).float() 
            
    tp = (y_true * y_pred).sum().float()
    tn = ((1 - y_true) * (1 - y_pred)).sum().float()
    fp = ((1 - y_true) * y_pred).sum().float()
    fn = (y_true * (1 - y_pred)).sum().float()
    
    recall = tp / (tp + fn)
    precision = tp / (tp + fp)
    
    f1 = 2* (precision*recall) / (precision + recall)
    return f1

In [9]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()
model = model.to(device)
criterion = criterion.to(device)
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
        
        text, text_lengths = batch.text
        
        predictions = model(text, text_lengths).squeeze(1)
        
        loss = criterion(predictions, batch.label)
        
        acc = binary_accuracy(predictions, batch.label)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            text, text_lengths = batch.text
            
            predictions = model(text, text_lengths).squeeze(1)
            
            loss = criterion(predictions, batch.label)
            
            acc = binary_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

N_EPOCHS = 10

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut2-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')
    
model.load_state_dict(torch.load('tut2-model.pt'))

Epoch: 01 | Epoch Time: 2m 27s
	Train Loss: 0.695 | Train Acc: 45.63%
	 Val. Loss: 0.685 |  Val. Acc: 47.53%
Epoch: 02 | Epoch Time: 2m 21s
	Train Loss: 0.676 | Train Acc: 45.55%
	 Val. Loss: 0.663 |  Val. Acc: 47.53%
Epoch: 03 | Epoch Time: 2m 17s
	Train Loss: 0.661 | Train Acc: 49.86%
	 Val. Loss: 0.655 |  Val. Acc: 53.92%
Epoch: 04 | Epoch Time: 2m 16s
	Train Loss: 0.656 | Train Acc: 60.13%
	 Val. Loss: 0.654 |  Val. Acc: 69.34%
Epoch: 05 | Epoch Time: 2m 22s
	Train Loss: 0.655 | Train Acc: 59.90%
	 Val. Loss: 0.652 |  Val. Acc: 60.57%
Epoch: 06 | Epoch Time: 2m 23s
	Train Loss: 0.653 | Train Acc: 51.87%
	 Val. Loss: 0.653 |  Val. Acc: 48.99%
Epoch: 07 | Epoch Time: 2m 30s
	Train Loss: 0.651 | Train Acc: 65.33%
	 Val. Loss: 0.652 |  Val. Acc: 72.20%
Epoch: 08 | Epoch Time: 2m 31s
	Train Loss: 0.650 | Train Acc: 63.15%
	 Val. Loss: 0.652 |  Val. Acc: 48.02%
Epoch: 09 | Epoch Time: 2m 31s
	Train Loss: 0.650 | Train Acc: 50.72%
	 Val. Loss: 0.651 |  Val. Acc: 49.43%
Epoch: 10 | Epoch T

<All keys matched successfully>

In [10]:
test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.657 | Test Acc: 47.02%


In [11]:
import spacy
nlp = spacy.load('en')

def predict_sentiment(model, sentence):
    model.eval()
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    print(tokenized)
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    print(indexed)
    length = [len(indexed)]
    print(length)
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)
    print(tensor)
    length_tensor = torch.LongTensor(length)
    print(length_tensor)
    prediction = torch.sigmoid(model(tensor, length_tensor))
    print(prediction)
    return prediction.item()

In [12]:
predict_sentiment(model, "This film is great and awesome as well it is incredible")

['This', 'film', 'is', 'great', 'and', 'awesome', 'as', 'well', 'it', 'is', 'incredible']
[0, 25, 9, 102, 5, 1309, 19, 85, 11, 9, 1096]
[11]
tensor([[   0],
        [  25],
        [   9],
        [ 102],
        [   5],
        [1309],
        [  19],
        [  85],
        [  11],
        [   9],
        [1096]], device='cuda:0')
tensor([11])
tensor([[0.7311]], device='cuda:0', grad_fn=<SigmoidBackward>)


0.7310585975646973

In [13]:
predict_sentiment(model, "This film is terrible")

['This', 'film', 'is', 'terrible']
[0, 25, 9, 498]
[4]
tensor([[  0],
        [ 25],
        [  9],
        [498]], device='cuda:0')
tensor([4])
tensor([[0.7311]], device='cuda:0', grad_fn=<SigmoidBackward>)


0.7310585975646973