# LSTM et attention

Nous allons dans ce notebook utiliser le modèle LSTM combiné avec le modèle d'attention, qui est expliqué dans cet <a href="https://arxiv.org/pdf/1706.03762.pdf"> article</a>.



In [1]:
import torch
from torchtext import data

class Dataset(object):
    def __init__(self, config):
        self.config = config
        self.train_iterator = None
        self.test_iterator = None
        self.val_iterator = None
        self.vocab = []
        self.word_embeddings = {}
    
        '''
        Loads the data from files
        Sets up iterators for training, validation and test data
        Also create vocabulary and word embeddings based on the data
        
        Inputs:
            w2v_file (String): path to file containing word embeddings (GloVe/Word2Vec)
            train_file (String): path to training file
            test_file (String): path to test file
            val_file (String): path to validation file
        '''

       
        # Creating Field for data

        TEXT = data.Field(sequential=True,lower=True, tokenize = 'spacy', include_lengths=True)
        LABEL = data.LabelField(sequential=False, use_vocab=False,dtype = torch.float)
        train_data, valid_data, test_data = data.TabularDataset.splits(
        path='./data/', train='train.csv',
        validation='valid.csv', test='test.csv', format='csv', skip_header=True,
        fields=[('text', TEXT), ('label', LABEL)])
        

        TEXT.build_vocab(train_data, 
                 max_size = self.config.max_vocab_size, 
                 vectors = "glove.6B.100d", 
                 unk_init = torch.Tensor.normal_)
        
        LABEL.build_vocab(train_data)
        print(LABEL.vocab.stoi)
        self.word_embeddings = TEXT.vocab.vectors
        self.vocab = TEXT.vocab
        self.PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
        self.UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        
        self.train_iterator= data.BucketIterator(
            (train_data),
            batch_size= self.config.batch_size,
            device = device,
            sort_key=lambda x: len(x.text),
            sort_within_batch = True)
        
        self.val_iterator, self.test_iterator = data.BucketIterator.splits(
            (valid_data, test_data),
            batch_size=self.config.batch_size,
            device = device,
            sort_key=lambda x: len(x.text),
            sort_within_batch = True)
        
        print(f'Taille des données train: {len(train_data)}')
        print(f'Taille des données de validation: {len(valid_data)}')
        print(f'Taille des données test: {len(test_data)}')


## Construire le modèle

In [2]:
class Config(object):
    max_vocab_size = 50000
    embed_size = 100
    hidden_layers = 1
    hidden_size = 256
    bidirectional = True
    output_size = 1
    max_epochs = 15
    batch_size = 64
    dropout_keep = 0.6
    max_sen_len = None # Sequence length for RNN    
    
config = Config()

In [3]:
dataset = Dataset(config)

defaultdict(None, {'1': 0, '0': 1})
Taille des données train: 18163
Taille des données de validation: 2270
Taille des données test: 2271


In [4]:
import torch
import spacy
from torch import nn
import numpy as np
from torch.nn import functional as F

class Seq2SeqAttention(nn.Module):
    def __init__(self, config, vocab_size, pretrained_embeddings):
        super(Seq2SeqAttention, self).__init__()
        self.config = config
        
        # Embedding Layer
        self.embeddings = nn.Embedding(vocab_size, self.config.embed_size)

        #self.embeddings.weight.data.copy_(pretrained_embeddings)

        #UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

        #self.embeddings.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
        #self.embeddings.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

        #self.embeddings.weight = nn.Parameter(pretrained_embeddings, requires_grad=False)
        
        # Encoder RNN
        self.rnn = nn.LSTM(input_size = self.config.embed_size,
                            hidden_size = self.config.hidden_size,
                            num_layers = self.config.hidden_layers,
                            bidirectional = self.config.bidirectional)
        
        # Dropout Layer
        self.dropout = nn.Dropout(self.config.dropout_keep)
        
        # Fully-Connected Layer
        self.fc = nn.Linear(
            self.config.hidden_size * 2 * (1+self.config.bidirectional) 
            ,
            self.config.output_size
        )
        
        # Softmax non-linearity
        self.softmax = nn.Softmax(dim = 1)
                
    def apply_attention(self, rnn_output, final_hidden_state):
        '''
        Apply Attention on RNN output
        
        Input:
            rnn_output (batch_size, seq_len, num_directions * hidden_size): tensor representing hidden state for every word in the sentence
            final_hidden_state (batch_size, num_directions * hidden_size): final hidden state of the RNN
            
        Returns:
            attention_output(batch_size, num_directions * hidden_size): attention output vector for the batch
        '''
        hidden_state = final_hidden_state.unsqueeze(2)
        attention_scores = torch.bmm(rnn_output, hidden_state).squeeze(2)
        soft_attention_weights = F.softmax(attention_scores, 1).unsqueeze(2) #shape = (batch_size, seq_len, 1)
        attention_output = torch.bmm(rnn_output.permute(0,2,1), soft_attention_weights).squeeze(2)
        return attention_output
        
    def forward(self, x, text_lengths):
        # x.shape = (max_sen_len, batch_size)
        embedded_sent = self.embeddings(x)
        # embedded_sent.shape = (max_sen_len=20, batch_size=64,embed_size=300)

        ##################################### Encoder #######################################
        #lstm_output, (h_n,c_n) = self.lstm(embedded_sent)
        # lstm_output.shape = (seq_len, batch_size, num_directions * hidden_size)
         # embedded_sent.shape = (max_sen_len=20, batch_size=64,embed_size=300)
        #pack sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded_sent, text_lengths)
        packed_output, (h_n,c_n) = self.rnn(packed_embedded)    
        #unpack sequence
        lstm_output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)
        
        ##################################### Encoder ####################################### 
        # lstm_output.shape = (seq_len, batch_size, num_directions * hidden_size)
        
        # Final hidden state of last layer (num_directions, batch_size, hidden_size)
        # Final hidden state of last layer (num_directions, batch_size, hidden_size)
        batch_size = h_n.shape[1]
        h_n_final_layer = h_n.view(self.config.hidden_layers,
                                   self.config.bidirectional + 1,
                                   batch_size,
                                   self.config.hidden_size)[-1,:,:,:]
        
        ##################################### Attention #####################################
        # Convert input to (batch_size, num_directions * hidden_size) for attention
        final_hidden_state = torch.cat([h_n_final_layer[i,:,:] for i in range(h_n_final_layer.shape[0])], dim=1)
        
        attention_out = self.apply_attention(lstm_output.permute(1,0,2), final_hidden_state)
        # Attention_out.shape = (batch_size, num_directions * hidden_size)
        
        #################################### Linear #########################################
        concatenated_vector = torch.cat([final_hidden_state, attention_out], dim=1)
        final_feature_map = self.dropout(concatenated_vector) # shape=(batch_size, num_directions * hidden_size)
        final_out = self.fc(final_feature_map)
        return self.softmax(final_out)
    
    def add_optimizer(self, optimizer):
        self.optimizer = optimizer
        
    def add_loss_op(self, loss_op):
        self.loss_op = loss_op
 
    def binary_accuracy(self, preds, y):
        """
        Returns accuracy per batch
        """
        #round predictions to the closest integer
        rounded_preds = torch.round(torch.sigmoid(preds))
        correct = (rounded_preds == y).float() #convert into float for division 
        acc = correct.sum() / len(correct)
        return acc 
    
    def train_model(self, iterator):
    
        epoch_loss = 0
        epoch_acc = 0
    
        self.train()
    
        for batch in iterator:
        
            self.optimizer.zero_grad()
        
            text, text_lengths = batch.text
        
            predictions = self.__call__(text, text_lengths).squeeze(1)

            loss = self.loss_op(predictions, batch.label)
            acc = self.binary_accuracy(predictions, batch.label)
        
            loss.backward()
        
            optimizer.step()
        
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
        return epoch_loss / len(iterator), epoch_acc / len(iterator)
    
    def evaluate_model(self, iterator):
    
        epoch_loss = 0
        epoch_acc = 0
    
        self.eval()
    
        with torch.no_grad():
    
            for batch in iterator:
        
                text, text_lengths = batch.text
        
                predictions = self.__call__(text, text_lengths).squeeze(1)

                loss = self.loss_op(predictions, batch.label)
            
                acc = self.binary_accuracy(predictions, batch.label)

                epoch_loss += loss.item()
                epoch_acc += acc.item()
        
        return epoch_loss / len(iterator), epoch_acc / len(iterator)  
    
    def run_epoch(self, train_iterator, val_iterator, epoch):
        
        for i, batch in enumerate(train_iterator):
    
            if i % 100 == 0:
                print("Iter: {}".format(i+1))
                train_loss, train_acc = self.train_model(train_iterator)
                valid_loss, valid_acc = self.evaluate_model(val_iterator)
                print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
                print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')
                

In [5]:
import torch
from torchtext import data
from torchtext.vocab import Vectors
import spacy
import numpy as np
import sys
import torch.optim as optim
from torch import nn
import torch

# Create Model with specified optimizer and loss function
##############################################################

model = Seq2SeqAttention(config, len(dataset.vocab), dataset.word_embeddings)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

pretrained_embeddings = dataset.vocab.vectors
model.embeddings.weight.data.copy_(pretrained_embeddings)

model.embeddings.weight.data[dataset.UNK_IDX] = torch.zeros(config.embed_size)
model.embeddings.weight.data[dataset.PAD_IDX] = torch.zeros(config.embed_size)

optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()
criterion = criterion.to(device)
model.add_optimizer(optimizer)
model.add_loss_op(criterion)

##############################################################

for i in range(config.max_epochs):
    print ("Epoch: {}".format(i))
    model.run_epoch(dataset.train_iterator, dataset.val_iterator, i)


Epoch: 0
Iter: 1
	Train Loss: 0.767 | Train Acc: 54.64%
	 Val. Loss: 0.769 |  Val. Acc: 54.46%
Epoch: 1
Iter: 1
	Train Loss: 0.767 | Train Acc: 54.63%
	 Val. Loss: 0.769 |  Val. Acc: 54.46%
Epoch: 2
Iter: 1
	Train Loss: 0.767 | Train Acc: 54.63%
	 Val. Loss: 0.769 |  Val. Acc: 54.46%
Epoch: 3
Iter: 1
	Train Loss: 0.767 | Train Acc: 54.63%
	 Val. Loss: 0.769 |  Val. Acc: 54.46%
Epoch: 4
Iter: 1
	Train Loss: 0.767 | Train Acc: 54.64%
	 Val. Loss: 0.769 |  Val. Acc: 54.46%
Epoch: 5
Iter: 1
	Train Loss: 0.767 | Train Acc: 54.64%
	 Val. Loss: 0.769 |  Val. Acc: 54.46%
Epoch: 6
Iter: 1
	Train Loss: 0.767 | Train Acc: 54.63%
	 Val. Loss: 0.769 |  Val. Acc: 54.46%
Epoch: 7
Iter: 1
	Train Loss: 0.767 | Train Acc: 54.64%
	 Val. Loss: 0.769 |  Val. Acc: 54.46%
Epoch: 8
Iter: 1
	Train Loss: 0.767 | Train Acc: 54.64%
	 Val. Loss: 0.769 |  Val. Acc: 54.46%
Epoch: 9
Iter: 1
	Train Loss: 0.767 | Train Acc: 54.64%
	 Val. Loss: 0.769 |  Val. Acc: 54.46%
Epoch: 10
Iter: 1
	Train Loss: 0.767 | Train Acc: 

In [6]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 5,734,409 trainable parameters


In [7]:
def recall(preds, y):
    '''
    Retourne le recall
    '''
    y_pred = torch.round(torch.sigmoid(preds))
    y_true = (y_pred == y).float()       
    
    tp = (y_true * y_pred).sum().float()
    tn = ((1 - y_true) * (1 - y_pred)).sum().float()
    fp = ((1 - y_true) * y_pred).sum().float()
    fn = (y_true * (1 - y_pred)).sum().float()
    recall = tp / (tp + fn)
    return recall


def f1_loss(preds, y):
    '''
    Retourne le score F1
    '''  
    y_pred = torch.round(torch.sigmoid(preds))
    y_true = (y_pred == y).float() 
            
    tp = (y_true * y_pred).sum().float()
    tn = ((1 - y_true) * (1 - y_pred)).sum().float()
    fp = ((1 - y_true) * y_pred).sum().float()
    fn = (y_true * (1 - y_pred)).sum().float()
    
    recall = tp / (tp + fn)
    precision = tp / (tp + fp)
    
    f1 = 2* (precision*recall) / (precision + recall)
    return f1

In [8]:
test_loss, test_acc = model.evaluate_model(dataset.test_iterator)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.787 | Test Acc: 52.66%


In [9]:
import spacy
nlp = spacy.load('en')

def predict_sentiment(model, sentence):
    model.eval()
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    indexed = [dataset.vocab.stoi[t] for t in tokenized]
    length = [len(indexed)]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)
    length_tensor = torch.LongTensor(length)
    prediction = torch.sigmoid(model(tensor, length_tensor))
    return prediction.item()

In [10]:
predict_sentiment(model, "This film is great and awesome as well it is incredible")

0.7310585975646973

In [11]:
predict_sentiment(model, "This film is terrible")

0.7310585975646973