In [1]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
import string
from string import punctuation
from os import listdir
from collections import Counter
import re
import unicodedata

import pandas as pd
import random
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

# Etape 1 : Importer les données

On importe les données déjà pré-traitées et sous forme de DataFrame.

In [2]:
dataset = pd.read_pickle("clean_data.pkl")
# mélanger les données
dataset = dataset.sample(frac=1)
dataset.head(10)

Unnamed: 0,Reviews,Tidy_Reviews,label
6180,In this satire of the commercialization and 'l...,satire commercialization war john cusack play ...,0
8430,I'm starting to think that there's a conspirac...,start think conspiracy right involves wallop m...,1
5189,"If you want to remember MJ, this is a good pla...",want remember mj good place start feature swee...,0
1498,Tourists head to Ireland for a school trip to ...,tourist head ireland school trip learn druid e...,1
12203,being the self-proclaimed professional film cr...,professional critic somewhat embarrass admit l...,0
5798,54 is a film about a club with that very title...,club title set era feature classic bartender s...,1
820,"After ""Star Wars: A New Hope"" redefined scienc...",star war new hope redefine science fiction emp...,0
11474,"andrew lloyd webber's musicals , preferably hi...",andrew lloyd webber musical preferably early w...,0
5357,Interesting film about an actual event that to...,interest actual event take place civil war ver...,1
9914,I was totally impressed by Shelley Adrienne's ...,totally impressed shelley adrienne waitress co...,0


On split les données en données d'entraînement, de test et de validation.

In [3]:
from sklearn.model_selection import train_test_split
reviews_train, reviews_test, label_train, label_test = train_test_split(dataset["Tidy_Reviews"], dataset["label"], test_size=0.2, random_state=42)
data_test, data_val, y_test, y_val = train_test_split(reviews_test, label_test, test_size=0.5, random_state=42)

data_test = data_test.to_numpy()
data_val = data_val.to_numpy()
data_train = reviews_train.to_numpy()

print("\t\t\tTaille des features:")
print("Train set: \t\t{}".format(reviews_train.shape), 
      "\nValidation set: \t{}".format(data_val.shape),
      "\nTest set: \t\t{}".format(data_test.shape))

			Taille des features:
Train set: 		(18163,) 
Validation set: 	(2271,) 
Test set: 		(2270,)


# Etape 2 : Pré-traitement des données

## Créer un vocabulaire et encoder les commentaires

On utilise seulement le train pour créer le vocabulaire et si un mot du test n'est pas dans le vocabulaire, on le remplacera par "unk" comme unknown.



In [4]:
from collections import Counter
import nltk    
voc = Counter()
for review in data_train:
    tokens = nltk.word_tokenize(review)
    voc.update(tokens)

voc.update(["<unk>"]) 

vocab_to_int = {word: ii for ii, word in enumerate(voc, 1)}
def encode(data):
    reviews_ints = [] 
    for review in data:
        reviews_ints.append([vocab_to_int[word] if word in voc else vocab_to_int["<unk>"] for word in review.split()])
    return reviews_ints

encoded_reviews = encode(data_train)

In [5]:
print('Mots uniques: ', len((vocab_to_int)))
print('Commentaire: \n', encoded_reviews[:1])

Mots uniques:  54202
Commentaire: 
 [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 13, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 23, 37, 38, 39, 40, 2, 41, 42, 43, 18, 44, 45, 46, 47, 10, 48, 49, 50, 51, 52, 53, 32, 54, 55]]


## Supprimer les commentaires de longueurs nulles.

In [6]:
import numpy as np
import torch

review_lens = Counter([len(x) for x in encoded_reviews])
print("Nombre de ommentaires de longueur nulle: {}".format(review_lens[0]))
print("Longueur maximale pour un commentaire: {}".format(max(review_lens)))

Nombre de ommentaires de longueur nulle: 0
Longueur maximale pour un commentaire: 1360


Dans ce jeu de données il n'y en a pas.

## Faire en sorte d'avoir des commentaires de même longueur

Lorsque nous introduisons les commentaires dans notre modèle, nous introduisons un batch de plusieurs commentaires à la fois, et tous les commentaires doivent avoir la même taille. Les commentaires plus courts que la taille donnée seq_length sont complétés par des 0, et les plus longs sont tronqués.


In [7]:
def pad_features(encoded_reviews, seq_length):
    
    features = np.zeros((len(encoded_reviews), seq_length), dtype=int)
    for i, row in enumerate(encoded_reviews):
        features[i, -len(row):] = np.array(row)[:seq_length]
    return features

seq_length = 200
features = pad_features(encoded_reviews, seq_length=seq_length)

## test ##
assert len(features)==len(encoded_reviews)
assert len(features[0])==seq_length

# print first 10 values of the first 30 batches 
print(features[:30,:10])

[[   0    0    0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0    0    0]
 [ 111  300  301  302  303  304  305  306  307  304]
 [   0    0    0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0    0  261]
 [   0    0    0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0    0    0]
 [ 935  788  936  504  937  938  939  940  941  942]
 [   0    0    0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0    0    0]
 [1215 1216 1107 1217   66 1218 1219 1220 1221

## Pré-traitement des données de test et de validation

In [8]:
data_val_ints = encode(data_val)
data_val = pad_features(data_val_ints, seq_length=seq_length)

data_test_ints = encode(data_test)
data_test = pad_features(data_test_ints, seq_length=seq_length)

train_x = np.array(features)
train_y = np.array(label_train)
val_x = np.array(data_val)
val_y = np.array(y_val)
test_x = np.array(data_test)
test_y = np.array(y_test)


print("\t\t\tTaille des datasets:")
print("Train set: \t\t{}".format(train_x.shape), 
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))


			Taille des datasets:
Train set: 		(18163, 200) 
Validation set: 	(2271, 200) 
Test set: 		(2270, 200)


In [9]:
import torch
from torch.utils.data import TensorDataset, DataLoader

# Tensor datasets
train_data = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
valid_data = TensorDataset(torch.from_numpy(val_x), torch.from_numpy(val_y))
test_data = TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))

# dataloaders
batch_size = 50
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size, drop_last=True)
valid_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size, drop_last=True)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size, drop_last=True)

# un batch du data train
dataiter = iter(valid_loader)
sample_x, sample_y = dataiter.next()

print('Sample input size: ', sample_x.size()) # batch_size, seq_length
print('Sample input: \n', sample_x)
print()
print('Sample label size: ', sample_y.size()) # batch_size
print('Sample label: \n', sample_y)


Sample input size:  torch.Size([50, 200])
Sample input: 
 tensor([[   0,    0,    0,  ...,  394, 8057,  870],
        [   0,    0,    0,  ..., 1371, 1837, 5940],
        [   0,    0,    0,  ..., 3027,   89, 4719],
        ...,
        [   0,    0,    0,  ..., 1870,  172,   89],
        [   0,    0,    0,  ..., 9180,  976,  554],
        [   0,    0,    0,  ..., 3990,  445, 3689]])

Sample label size:  torch.Size([50])
Sample label: 
 tensor([0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1,
        1, 0])


# Etape 3 : Définir le modèle LSTM

<center> <img src="lstm.png" alt="drawing" width="700"/>

Les couches du modèle sont les suivantes :

 - Une couche d'intégration qui convertit nos tokens (entiers) en incorporations d'une taille spécifique.
 - Une couche LSTM définie par une taille et un nombre de couches hidden_state
 - Une couche de sortie entièrement connectée qui mappe les sorties de la couche LSTM à la taille de sortie souhaitée
 - Une couche d'activation sigmoïde qui transforme toutes les sorties en une valeur de 0 à 1; renvoie uniquement la dernière sortie sigmoïde comme sortie de ce réseau.
 - Output: la sortie sigmoïde du dernier pas de temps est considérée comme la sortie finale de ce réseau

In [11]:
from torch import nn

class SentimentLSTM(nn.Module):
    
    def __init__(self, n_vocab, n_embed, n_hidden, n_output, n_layers, drop_p = 0.5):
        super().__init__()
        device = "cuda" if torch.cuda.is_available() else "cpu"
        # params: "n_" means dimension
        self.n_vocab = n_vocab     # number of unique words in vocabulary
        self.n_layers = n_layers   # number of LSTM layers 
        self.n_hidden = n_hidden   # number of hidden nodes in LSTM
        
        self.embedding = nn.Embedding(n_vocab, n_embed)
        self.lstm = nn.LSTM(n_embed, n_hidden, n_layers, batch_first = True, dropout = drop_p)
        self.dropout = nn.Dropout(drop_p)
        self.fc = nn.Linear(n_hidden, n_output)
        self.sigmoid = nn.Sigmoid()
        
        
    def forward (self, input_words, hidden):
        device = "cuda" if torch.cuda.is_available() else "cpu"
             # INPUT   :  (batch_size, seq_length)
        embedded_words = self.embedding(input_words)   # (batch_size, seq_length, n_embed)
        lstm_out, h = self.lstm(embedded_words, hidden)         # (batch_size, seq_length, n_hidden)
        lstm_out = self.dropout(lstm_out)
        lstm_out = lstm_out.contiguous().view(-1, self.n_hidden) # (batch_size*seq_length, n_hidden)
        fc_out = self.fc(lstm_out)                      # (batch_size*seq_length, n_output)
        sigmoid_out = self.sigmoid(fc_out)              # (batch_size*seq_length, n_output)
        sigmoid_out = sigmoid_out.view(batch_size, -1)  # (batch_size, seq_length*n_output)
        
        # extract the output of ONLY the LAST output of the LAST element of the sequence
        sigmoid_last = sigmoid_out[:, -1]               # (batch_size, 1)
        
        return sigmoid_last, h
    
    
    def init_hidden (self, batch_size):  # initialize hidden weights (h,c) to 0
        
        device = "cuda" if torch.cuda.is_available() else "cpu"
        weights = next(self.parameters()).data
        h = (weights.new(self.n_layers, batch_size, self.n_hidden).zero_().to(device),
             weights.new(self.n_layers, batch_size, self.n_hidden).zero_().to(device))
        
        return h

## Définition des paramètres

In [12]:
n_vocab = len(vocab_to_int)
n_embed = 400
n_hidden = 512
n_output = 1   # 1 ("positive") or 0 ("negative")
n_layers = 2
device = 'cuda' if torch.cuda.is_available() else 'cpu'
net = SentimentLSTM(n_vocab, n_embed, n_hidden, n_output, n_layers)
net = net.to(device)

## Fonction de coût et optimiseur

La fonction de coût mesure la Binary Cross Entropy entre le target le output, et on utilise l'optimiseur Adam.

In [13]:
from torch import optim

criterion = nn.BCELoss()
optimizer = optim.Adam(net.parameters(), lr = 0.001)

criterion = criterion.to(device)

## Entraînement du modèle

In [14]:
print_every = 100
step = 0
n_epochs = 3  
clip = 5  # for gradient clip to prevent exploding gradient problem in LSTM/RNN
device = 'cuda' if torch.cuda.is_available() else 'cpu'

for epoch in range(n_epochs):
    h = net.init_hidden(batch_size)
    
    for inputs, labels in train_loader:
        step += 1
        inputs, labels = inputs.to(device), labels.to(device)
        
        # making requires_grad = False for the latest set of h
        h = tuple([each.data for each in h])   
        
        net.zero_grad()
        output, h = net(inputs, h)
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        nn.utils.clip_grad_norm(net.parameters(), clip)
        optimizer.step()
        
        if (step % print_every) == 0:            
            ######################
            ##### VALIDATION #####
            ######################
            net.eval()
            valid_losses = []
            v_h = net.init_hidden(batch_size)
            
            for v_inputs, v_labels in valid_loader:
                v_inputs, v_labels = inputs.to(device), labels.to(device)
        
                v_h = tuple([each.data for each in v_h])
                
                v_output, v_h = net(v_inputs, v_h)
                v_loss = criterion(v_output.squeeze(), v_labels.float())
                valid_losses.append(v_loss.item())

            print("Epoch: {}/{}".format((epoch+1), n_epochs),
                  "Step: {}".format(step),
                  "Training Loss: {:.4f}".format(loss.item()),
                  "Validation Loss: {:.4f}".format(np.mean(valid_losses)))
            net.train()




Epoch: 1/3 Step: 100 Training Loss: 0.6341 Validation Loss: 0.6161
Epoch: 1/3 Step: 200 Training Loss: 0.6650 Validation Loss: 0.6766
Epoch: 1/3 Step: 300 Training Loss: 0.7101 Validation Loss: 0.6699
Epoch: 2/3 Step: 400 Training Loss: 0.4321 Validation Loss: 0.3896
Epoch: 2/3 Step: 500 Training Loss: 0.4098 Validation Loss: 0.3898
Epoch: 2/3 Step: 600 Training Loss: 0.3771 Validation Loss: 0.3338
Epoch: 2/3 Step: 700 Training Loss: 0.2631 Validation Loss: 0.2278
Epoch: 3/3 Step: 800 Training Loss: 0.3735 Validation Loss: 0.2642
Epoch: 3/3 Step: 900 Training Loss: 0.3352 Validation Loss: 0.3070
Epoch: 3/3 Step: 1000 Training Loss: 0.2044 Validation Loss: 0.1770


## Test sur les données test

In [15]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
net.eval()
test_losses = []
num_correct = 0
test_h = net.init_hidden(batch_size)

for inputs, labels in test_loader:
    inputs, labels = inputs.to(device), labels.to(device)
    test_h = tuple([each.data for each in test_h])
    test_output, test_h = net(inputs,test_h)
    loss = criterion(test_output, labels.float())
    test_losses.append(loss.item())
    
    preds = torch.round(test_output.squeeze())
    correct_tensor = preds.eq(labels.float().view_as(preds))
    correct = np.squeeze(correct_tensor.numpy())
    num_correct += np.sum(correct)
    
print("Test Loss: {:.4f}".format(np.mean(test_losses)))
print("Test Accuracy: {:.2f}".format(num_correct/len(test_loader.dataset)))

RuntimeError: CUDA error: device-side assert triggered

## Test sur un commentaire nouveau

In [None]:
def predict(net, review, seq_length = 200):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    test_ints = []
    
    for word in review:
        review = review.lower() 
        test_text = ''.join([c for c in review if c not in punctuation])
        test_words = test_text.split()    
        test_ints.append([vocab_to_int[word] if word in voc else vocab_to_int["<unk>"] for word in test_words ])
        
    padded_words = pad_features(test_ints, seq_length)
    padded_words = torch.from_numpy(padded_words).to(device)
    
    if(len(padded_words) == 0):
        "Your review must contain at least 1 word!"
        return None
    
    net.eval()
    h = net.init_hidden(1)
    output, h = net(padded_words, h)
    pred = torch.round(output.squeeze())
    msg = "This is a positive review." if pred == 0 else "This is a negative review."
    
    return msg


review1 = "It made me cry."
review2 = "It was so good it made me cry."
review3 = "It's ok."
review4 = "This movie had the best acting and the dialogue was so good. I loved it."
review5 = "Garbage"
                       ### OUTPUT ###
predict(net, review1)  ## negative ##
predict(net, review2)  ## positive ##
predict(net, review3)  ## negative ##
predict(net, review4)  ## positive ##
predict(net, review5)  ## negative ##