In [9]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [10]:
my_path = "drive/My Drive/data/"

In [38]:

import os
import sys
import torch
from torch.nn import functional as F
import numpy as np
from torchtext import data
from torchtext import datasets
from torchtext.vocab import Vectors, GloVe
import pandas as pd
import torch
from torchtext import data

# pour la reproductibilité
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

TEXT = data.Field(sequential=True,lower=True, tokenize = 'spacy', include_lengths=True, batch_first=True, fix_length=200)
LABEL = data.LabelField(dtype = torch.float)

train_data, valid_data, test_data = data.TabularDataset.splits(
        path=my_path, train='train.csv',
        validation='valid.csv', test='test.csv', format='csv', skip_header=True,
        fields=[('text', TEXT), ('label', LABEL)])

print(f'Taille des données train: {len(train_data)}')
print(f'Taille des données de validation: {len(valid_data)}')
print(f'Taille des données test: {len(test_data)}')

MAX_VOCAB_SIZE = 25_000

TEXT.build_vocab(train_data, max_size = MAX_VOCAB_SIZE, vectors=GloVe(name='6B', dim=300))
LABEL.build_vocab(train_data)

print(f"Nombre de tokens unique dans le TEXT: {len(TEXT.vocab)}") 
print(f"Nombre unique de LABEL: {len(LABEL.vocab)}")


# utilisation du GPU si possible 
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

BATCH_SIZE = 32
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE,
device = device, sort_key=lambda x: len(x.text), repeat=False, shuffle=True)


Taille des données train: 18163
Taille des données de validation: 2270
Taille des données test: 2271
Nombre de tokens unique dans le TEXT: 25002
Nombre unique de LABEL: 2


In [39]:
class AttentionModel(torch.nn.Module):
	def __init__(self, batch_size, output_size, hidden_size, vocab_size, embedding_length, weights):
		super(AttentionModel, self).__init__()
		
		"""
		Arguments	
		---------
		batch_size : Size of the batch which is same as the batch_size of the data returned by the TorchText BucketIterator
		output_size : 2 = (pos, neg)
		hidden_sie : Size of the hidden_state of the LSTM
		vocab_size : Size of the vocabulary containing unique words
		embedding_length : Embeddding dimension of GloVe word embeddings
		weights : Pre-trained GloVe word_embeddings which we will use to create our word_embedding look-up table 
		
		--------
		
		"""
		
		self.batch_size = batch_size
		self.output_size = output_size
		self.hidden_size = hidden_size
		self.vocab_size = vocab_size
		self.embedding_length = embedding_length
		
		self.word_embeddings = nn.Embedding(vocab_size, embedding_length)
		self.word_embeddings.weights = nn.Parameter(weights, requires_grad=False)
		self.lstm = nn.LSTM(embedding_length, hidden_size)
		self.label = nn.Linear(hidden_size, output_size)
		
	def attention_net(self, lstm_output, final_state):

		""" 
		Now we will incorporate Attention mechanism in our LSTM model. In this new model, we will use attention 
    to compute soft alignment score corresponding
		between each of the hidden_state and the last hidden_state of the LSTM. We will be using torch.bmm 
    for the batch matrix multiplication.
		
		Arguments
		---------
		
		lstm_output : Final output of the LSTM which contains hidden layer outputs for each sequence.
		final_state : Final time-step hidden state (h_n) of the LSTM
		
		---------
		
		Returns : It performs attention mechanism by first computing weights for each of 
    the sequence present in lstm_output and and then finally computing the
				  new hidden state.
				  
		Tensor Size :
					hidden.size() = (batch_size, hidden_size)
					attn_weights.size() = (batch_size, num_seq)
					soft_attn_weights.size() = (batch_size, num_seq)
					new_hidden_state.size() = (batch_size, hidden_size)
					  
		"""
		
		hidden = final_state.squeeze(0)
		attn_weights = torch.bmm(lstm_output, hidden.unsqueeze(2)).squeeze(2)
		soft_attn_weights = F.softmax(attn_weights, 1)
		new_hidden_state = torch.bmm(lstm_output.transpose(1, 2), soft_attn_weights.unsqueeze(2)).squeeze(2)
		
		return new_hidden_state
	
	def forward(self, input_sentences, batch_size=None):
	
		""" 
		Parameters
		----------
		input_sentence: input_sentence of shape = (batch_size, num_sequences)
		batch_size : default = None. Used only for prediction on a single sentence after training (batch_size = 1)
		
		Returns
		-------
		Output of the linear layer containing logits for pos & neg class which receives its input as the new_hidden_state which is basically the output of the Attention network.
		final_output.shape = (batch_size, output_size)
		
		"""
		
		input = self.word_embeddings(input_sentences)
		input = input.permute(1, 0, 2)
		if batch_size is None:
			h_0 = Variable(torch.zeros(1, self.batch_size, self.hidden_size).cuda())
			c_0 = Variable(torch.zeros(1, self.batch_size, self.hidden_size).cuda())
		else:
			h_0 = Variable(torch.zeros(1, batch_size, self.hidden_size).cuda())
			c_0 = Variable(torch.zeros(1, batch_size, self.hidden_size).cuda())
			
		output, (final_hidden_state, final_cell_state) = self.lstm(input, (h_0, c_0)) # final_hidden_state.size() = (1, batch_size, hidden_size) 
		output = output.permute(1, 0, 2) # output.size() = (batch_size, num_seq, hidden_size)
		
		attn_output = self.attention_net(output, final_hidden_state)
		logits = self.label(attn_output)
		
		return logits

In [40]:

learning_rate = 2e-5
batch_size = 32
output_size = 2
hidden_size = 256
embedding_length = 300
word_embeddings = TEXT.vocab.vectors
vocab_size = len(TEXT.vocab)

model = AttentionModel(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)



In [14]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'Le modèle a {count_parameters(model):,} paramètres à entraîner')

Le modèle a 8,072,506 paramètres à entraîner


In [41]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters())
criterion =  F.cross_entropy


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)


def binary_accuracy(preds, y):
    """
    Retourne l'accuracy par batch
    """
    #arrondi la prédiction à l'entier le plus proche
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() 
    acc = correct.sum() / len(correct)
    return acc

def recall(preds, y):
    '''
    Retourne le recall
    '''
    y_pred = torch.round(torch.sigmoid(preds))
    y_true = (y_pred == y).float()       
    
    tp = (y_true * y_pred).sum().float()
    tn = ((1 - y_true) * (1 - y_pred)).sum().float()
    fp = ((1 - y_true) * y_pred).sum().float()
    fn = (y_true * (1 - y_pred)).sum().float()
    
    if (tp + fn) == 0:
        recall = torch.zeros(1)
        
    recall = tp / (tp + fn)
    return recall



def f1_loss(preds, y):
    '''
    Retourne le score F1
    '''  
    y_pred = torch.round(torch.sigmoid(preds))
    y_true = (y_pred == y).float() 
            
    tp = (y_true * y_pred).sum().float()
    tn = ((1 - y_true) * (1 - y_pred)).sum().float()
    fp = ((1 - y_true) * y_pred).sum().float()
    fn = (y_true * (1 - y_pred)).sum().float()
    
    recall = tp / (tp + fn)
    precision = tp / (tp + fp)
    
    if (tp + fn) == 0 or (tp + fp) == 0 or (recall + precision == 0):
        f1 = torch.zeros(1)
    else:
        f1 = 2* (precision*recall) / (precision + recall)
    
    return f1

In [55]:


def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    epoch_rec = 0
    epoch_f1 = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
        text = batch.text[0]
        target = batch.label 

        
        if (text.size()[0] is not 32):# One of the batch returned by BucketIterator has length different than 32.
            continue
        predictions = model(text)
        target = torch.autograd.Variable(target).long()
        loss = criterion(predictions, target )
        #target = [batch_size]
        #predictions = [batch_size, output_size]
        pred = torch.max(predictions, 1)[1].view(target.size()).data
        #pred = [batch_size]
        acc = binary_accuracy(pred.float(), target)
        rec = recall(pred.float(), target)
        f1 = f1_loss(pred.float(), target)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        epoch_rec += rec.item()
        epoch_f1 += f1.item()        
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator), epoch_rec / len(iterator), epoch_f1 / len(iterator)

def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    epoch_rec = 0
    epoch_f1 = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

          text = batch.text[0]
          if (text.size()[0] is not 32):# One of the batch returned by BucketIterator has length different than 32.
            continue
          target = batch.label 
          predictions = model(text)
          target = torch.autograd.Variable(target).long()
          loss = criterion(predictions, target )
          pred = torch.max(predictions, 1)[1].view(target.size()).data
          #num_corrects = (torch.max(prediction, 1)[1].view(target.size()).data == target.data).float().sum()
          #acc = 100.0 * num_corrects/len(batch)
          acc = binary_accuracy(pred.float(), target)
          rec = recall(pred.float(), target)
          f1 = f1_loss(pred.float(), target)
            
          epoch_loss += loss.item()
          epoch_acc += acc.item()
          epoch_rec += rec.item()
          epoch_f1 += f1.item() 
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator), epoch_rec / len(iterator), epoch_f1 / len(iterator)
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs


N_EPOCHS = 10

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc, train_rec, train_f1 = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc, valid_rec, valid_f1 = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Train Recall: {train_rec*100:.2f}% | Train F1: {train_f1*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}% | Val. Recall: {valid_rec*100:.2f}%  | Val. F1: {valid_f1*100:.2f}%')



Epoch: 01 | Epoch Time: 0m 11s
	Train Loss: 0.348 | Train Acc: 85.14% | Train Recall: 43.87% | Train F1: 57.18%
	 Val. Loss: 0.391 |  Val. Acc: 81.56% | Val. Recall: 42.31%  | Val. F1: 54.98%
Epoch: 02 | Epoch Time: 0m 11s
	Train Loss: 0.216 | Train Acc: 91.66% | Train Recall: 44.61% | Train F1: 59.37%
	 Val. Loss: 0.411 |  Val. Acc: 82.26% | Val. Recall: 43.79%  | Val. F1: 55.99%
Epoch: 03 | Epoch Time: 0m 11s
	Train Loss: 0.119 | Train Acc: 95.65% | Train Recall: 45.06% | Train F1: 60.68%
	 Val. Loss: 0.521 |  Val. Acc: 82.09% | Val. Recall: 45.10%  | Val. F1: 56.70%
Epoch: 04 | Epoch Time: 0m 11s
	Train Loss: 0.061 | Train Acc: 97.88% | Train Recall: 45.15% | Train F1: 61.27%
	 Val. Loss: 0.590 |  Val. Acc: 81.69% | Val. Recall: 45.40%  | Val. F1: 56.75%
Epoch: 05 | Epoch Time: 0m 11s
	Train Loss: 0.027 | Train Acc: 99.04% | Train Recall: 45.25% | Train F1: 61.57%
	 Val. Loss: 0.684 |  Val. Acc: 82.39% | Val. Recall: 43.47%  | Val. F1: 55.93%
Epoch: 06 | Epoch Time: 0m 11s
	Train Lo

In [56]:

test_loss, test_acc, test_rec, test_f1 = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%| Test Recall: {test_rec*100:.2f}%  | Test F1: {test_f1*100:.2f}%')

Test Loss: 0.974 | Test Acc: 83.54%| Test Recall: 47.19%  | Test F1: 59.39%


In [90]:
import spacy
nlp = spacy.load('en')

def predict_sentiment(model, sentence):
    model.eval()
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    length = [len(indexed)]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(0)
    length_tensor = torch.LongTensor(length)
    prediction = model(tensor, 1)
    out = F.softmax(prediction, 1)
    if (torch.argmax(out[0]) == 0):
      print ("Sentiment: Positive")
    else:
      print ("Sentiment: Negative")

predict_sentiment(model, "This film is horrible bad bad")


predict_sentiment(model, "This film is great amazing good ")



tensor([3.4362e-08, 1.0000e+00], device='cuda:0', grad_fn=<SelectBackward>)
Sentiment: Negative
tensor([1.0000e+00, 4.1757e-07], device='cuda:0', grad_fn=<SelectBackward>)
Sentiment: Positive
