In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler

from sklearn.preprocessing import MinMaxScaler    
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

# Importation des données

In [None]:
# Colab-only step: open the browser file-upload widget.
# NOTE(review): presumably used to upload the nine *_train/_dev/_test .txt files read in the next cell — confirm.
from google.colab import files
uploaded = files.upload()

J'ai renommé les colonnes de chaque dataframe en `text` et `label`.

In [None]:
def _read_tsv(path):
    """Load one tab-separated split file into a DataFrame."""
    return pd.read_csv(path, sep='\t')


# Amazon product reviews
amazon_train = _read_tsv("amazon_train.txt")
amazon_test = _read_tsv("amazon_test.txt")
amazon_dev = _read_tsv("amazon_dev.txt")

# IMDB movie reviews
imdb_train = _read_tsv("imdb_train.txt")
imdb_test = _read_tsv("imdb_test.txt")
imdb_dev = _read_tsv("imdb_dev.txt")

# Yelp restaurant reviews
yelp_train = _read_tsv("yelp_train.txt")
yelp_test = _read_tsv("yelp_test.txt")
yelp_dev = _read_tsv("yelp_dev.txt")

In [None]:
# Preview the first rows of the Yelp dev split to check the `text` / `label` columns.
yelp_dev.head()

Unnamed: 0,text,label
0,Wow... Loved this place.,1
1,Stopped by during the late May bank holiday of...,1
2,The selection on the menu was great and so wer...,1
3,The fries were great too.,1
4,A great touch.,1


## Prétraitement des données

J'ai essayé de rassembler tous les fichiers train/dev/test en un seul afin de ne faire le prétraitement qu'une seule fois, mais j'ai rencontré un problème à la lecture du fichier ; j'ai donc appliqué le prétraitement à chaque fichier séparément.

Elimination des majuscules

In [None]:
# Lower-case the review text of every split, in place on each DataFrame.
for _frame in (amazon_train, amazon_test, amazon_dev,
               imdb_train, imdb_test, imdb_dev,
               yelp_train, yelp_test, yelp_dev):
    _frame['text'] = _frame['text'].map(lambda review: review.lower())

In [None]:
# Preview the Amazon training split after lower-casing.
amazon_train.head()

Unnamed: 0,text,label
0,this phone is pretty sturdy and i've never had...,1
1,i love this thing!,1
2,everything about it is fine and reasonable for...,1
3,i even dropped this phone into a stream and it...,1
4,i have been very happy with the 510 and have h...,1


Élimination de la ponctuation

In [None]:
import re
def r_punctuation(text):
    """Return a cleaned version of ``text`` with markup and punctuation removed.

    Processing steps:
      1. drop anything that looks like an HTML tag (``<...>``);
      2. collect emoticons such as ``:)``, ``;-(``, ``=D`` from the tag-free text;
      3. lower-case the text and collapse every run of non-word characters
         into a single space;
      4. append the collected emoticons (with the ``-`` nose removed) after a
         separating space.

    The emoticon pattern is case-sensitive: only upper-case ``D`` / ``P``
    mouths are recognised, so text that was lower-cased beforehand will not
    yield ``:d`` / ``:p`` emoticons.

    Args:
        text: Raw review text (``str``).

    Returns:
        The cleaned string. It always ends with the separating space followed
        by the (possibly empty) emoticon list, so an input with no emoticons
        yields a trailing space.
    """
    # Raw string literals keep the regex escapes (\W, \), \() from being
    # interpreted as (invalid) Python string escapes.
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    words_only = re.sub(r'[\W]+', ' ', text.lower())

    return words_only + ' ' + ' '.join(emoticons).replace('-', '')

In [None]:
# Run the punctuation/markup cleaner over the text column of every split.
for _frame in (amazon_train, amazon_dev, amazon_test,
               imdb_train, imdb_dev, imdb_test,
               yelp_train, yelp_test, yelp_dev):
    _frame['text'] = _frame['text'].apply(r_punctuation)

In [None]:
# Concatenate all reviews of each split into one string (no separator between reviews).
text_amazon_train = ''.join(amazon_train['text'])
text_amazon_test = ''.join(amazon_test['text'])
text_amazon_dev = ''.join(amazon_dev['text'])

text_imdb_train = ''.join(imdb_train['text'])
text_imdb_test = ''.join(imdb_test['text'])
text_imdb_dev = ''.join(imdb_dev['text'])

text_yelp_train = ''.join(yelp_train['text'])
text_yelp_test = ''.join(yelp_test['text'])
text_yelp_dev = ''.join(yelp_dev['text'])

Tokénisation et création du vocabulaire

In [None]:
# One Python list of cleaned review strings per split
# (1-3: Amazon train/test/dev, 4-6: IMDB train/test/dev, 7-9: Yelp train/test/dev).
text_split1 = amazon_train['text'].tolist()
text_split2 = amazon_test['text'].tolist()
text_split3 = amazon_dev['text'].tolist()

text_split4 = imdb_train['text'].tolist()
text_split5 = imdb_test['text'].tolist()
text_split6 = imdb_dev['text'].tolist()

text_split7 = yelp_train['text'].tolist()
text_split8 = yelp_test['text'].tolist()
text_split9 = yelp_dev['text'].tolist()

In [None]:
from collections import Counter

In [None]:
def _tokens(*blobs):
    """Whitespace-tokenise each text blob and return one flat token list."""
    return [token for blob in blobs for token in blob.split()]


def _by_frequency(tokens):
    """Return (Counter, number of tokens, (word, count) pairs sorted by descending count)."""
    counts = Counter(tokens)
    n_tokens = len(tokens)
    return counts, n_tokens, counts.most_common(n_tokens)


# Amazon: the vocabulary is built over train + test + dev together.
text_amazon_train = ' '.join(text_split1)
text_amazon_test = ' '.join(text_split2)
text_amazon_dev = ' '.join(text_split3)
words1 = _tokens(text_amazon_train, text_amazon_test, text_amazon_dev)

# IMDB
text_imdb_train = ' '.join(text_split4)
text_imdb_test = ' '.join(text_split5)
text_imdb_dev = ' '.join(text_split6)
words2 = _tokens(text_imdb_train, text_imdb_test, text_imdb_dev)

# Yelp
text_yelp_train = ' '.join(text_split7)
text_yelp_test = ' '.join(text_split8)
text_yelp_dev = ' '.join(text_split9)
words3 = _tokens(text_yelp_train, text_yelp_test, text_yelp_dev)

# Per-dataset word frequencies, most frequent first.
count_words, total_words, sorted_words = _by_frequency(words1)
count_words2, total_words2, sorted_words2 = _by_frequency(words2)
count_words3, total_words3, sorted_words3 = _by_frequency(words3)

In [None]:
# Map every word to an integer id, most frequent word first.
# Ids start at 1 because 0 is used as the padding value by pad_features.
# (The previous 0-based dict that was built and immediately overwritten has been dropped.)
vocab_to_int = {word: idx for idx, (word, _count) in enumerate(sorted_words, start=1)}

vocab_to_int2 = {word: idx for idx, (word, _count) in enumerate(sorted_words2, start=1)}

vocab_to_int3 = {word: idx for idx, (word, _count) in enumerate(sorted_words3, start=1)}

In [None]:
def _encode(reviews, vocab):
    """Turn each review string into the list of integer ids of its whitespace-separated words."""
    return [[vocab[word] for word in review.split()] for review in reviews]


# Amazon splits use the Amazon vocabulary.
ama_train_int = _encode(text_split1, vocab_to_int)
ama_test_int = _encode(text_split2, vocab_to_int)
ama_dev_int = _encode(text_split3, vocab_to_int)

# IMDB splits use the IMDB vocabulary.
imdb_train_int = _encode(text_split4, vocab_to_int2)
imdb_test_int = _encode(text_split5, vocab_to_int2)
imdb_dev_int = _encode(text_split6, vocab_to_int2)

# Yelp splits use the Yelp vocabulary.
yelp_train_int = _encode(text_split7, vocab_to_int3)
yelp_test_int = _encode(text_split8, vocab_to_int3)
yelp_dev_int = _encode(text_split9, vocab_to_int3)



In [None]:
def _binary_labels(frame):
    """Return the frame's `label` column as a NumPy array of 0/1 (1 only where the label equals 1)."""
    return np.array([int(value == 1) for value in frame['label']])


ama_train_label = _binary_labels(amazon_train)
ama_test_label = _binary_labels(amazon_test)
ama_dev_label = _binary_labels(amazon_dev)

imdb_train_label = _binary_labels(imdb_train)
imdb_test_label = _binary_labels(imdb_test)
imdb_dev_label = _binary_labels(imdb_dev)

yelp_train_label = _binary_labels(yelp_train)
yelp_test_label = _binary_labels(yelp_test)
yelp_dev_label = _binary_labels(yelp_dev)

In [None]:
# Number of tokens in each encoded review, per split.
ama_train_len = list(map(len, ama_train_int))
ama_test_len = list(map(len, ama_test_int))
ama_dev_len = list(map(len, ama_dev_int))

imdb_train_len = list(map(len, imdb_train_int))
imdb_test_len = list(map(len, imdb_test_int))
imdb_dev_len = list(map(len, imdb_dev_int))

yelp_train_len = list(map(len, yelp_train_int))
yelp_test_len = list(map(len, yelp_test_int))
yelp_dev_len = list(map(len, yelp_dev_int))


In [None]:
def _drop_empty(encoded, labels, lengths):
    """Keep only the reviews (and their labels) whose recorded length is positive."""
    keep = [i for i, n_tokens in enumerate(lengths) if n_tokens > 0]
    return [encoded[i] for i in keep], [labels[i] for i in keep]


ama_train_int, Y_ama_train = _drop_empty(ama_train_int, ama_train_label, ama_train_len)
ama_test_int, Y_ama_test = _drop_empty(ama_test_int, ama_test_label, ama_test_len)
ama_dev_int, Y_ama_dev = _drop_empty(ama_dev_int, ama_dev_label, ama_dev_len)

imdb_train_int, Y_imdb_train = _drop_empty(imdb_train_int, imdb_train_label, imdb_train_len)
imdb_test_int, Y_imdb_test = _drop_empty(imdb_test_int, imdb_test_label, imdb_test_len)
imdb_dev_int, Y_imdb_dev = _drop_empty(imdb_dev_int, imdb_dev_label, imdb_dev_len)

yelp_train_int, Y_yelp_train = _drop_empty(yelp_train_int, yelp_train_label, yelp_train_len)
yelp_test_int, Y_yelp_test = _drop_empty(yelp_test_int, yelp_test_label, yelp_test_len)
yelp_dev_int, Y_yelp_dev = _drop_empty(yelp_dev_int, yelp_dev_label, yelp_dev_len)

In [None]:
def pad_features(reviews_int, seq_length):
    """Return a fixed-width integer matrix built from variable-length reviews.

    Each review is left-padded with 0's when it is shorter than ``seq_length``
    and truncated to its first ``seq_length`` ids when it is longer.

    Args:
        reviews_int: Sequence of reviews, each a sequence of integer token ids
            (a list or a 1-D NumPy array).
        seq_length: Number of columns of the returned matrix.

    Returns:
        ``numpy.ndarray`` of shape ``(len(reviews_int), seq_length)`` with
        NumPy's default integer dtype; row ``i`` holds review ``i``
        right-aligned.
    """
    features = np.zeros((len(reviews_int), seq_length), dtype=int)

    for row, review in enumerate(reviews_int):
        kept = review[:seq_length]
        # Write straight into the right-hand side of the zero row; this avoids
        # building a float padding list and also works for ndarray reviews.
        if len(kept) > 0:
            features[row, seq_length - len(kept):] = kept

    return features

In [None]:
# Fixed number of token ids per review: pad_features pads or truncates every review to this length.
seq_length=200

In [None]:
# Build the fixed-width feature matrices (one row per review) for every split.
X_ama_train, X_ama_test, X_ama_dev = (
    pad_features(encoded, seq_length)
    for encoded in (ama_train_int, ama_test_int, ama_dev_int)
)
X_imdb_train, X_imdb_test, X_imdb_dev = (
    pad_features(encoded, seq_length)
    for encoded in (imdb_train_int, imdb_test_int, imdb_dev_int)
)
X_yelp_train, X_yelp_test, X_yelp_dev = (
    pad_features(encoded, seq_length)
    for encoded in (yelp_train_int, yelp_test_int, yelp_dev_int)
)

# Modèle 1 : Pour amazon dataset

Dataloaders

In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset
# create Tensor datasets (features, labels) for the Amazon splits
train_data = TensorDataset(torch.from_numpy(np.array(X_ama_train)), torch.from_numpy(np.array(Y_ama_train)))
valid_data = TensorDataset(torch.from_numpy(np.array(X_ama_dev)), torch.from_numpy(np.array(Y_ama_dev)))
test_data = TensorDataset(torch.from_numpy(np.array(X_ama_test)), torch.from_numpy(np.array(Y_ama_test)))
# dataloaders
batch_size = 50
# Only the training data is shuffled; validation/test are iterated in a fixed
# order so that evaluation results are reproducible from run to run.
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
valid_loader = DataLoader(valid_data, shuffle=False, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)

In [None]:
# obtain one batch of training data
dataiter = iter(train_loader)
# Use the built-in next(): DataLoader iterators no longer expose a .next() method.
sample_x, sample_y = next(dataiter)
print('Sample input size: ', sample_x.size()) # batch_size, seq_length
print('Sample input: \n', sample_x)
print()
print('Sample label size: ', sample_y.size()) # batch_size
print('Sample label: \n', sample_y)

Sample input size:  torch.Size([50, 200])
Sample input: 
 tensor([[  0,   0,   0,  ..., 727,  11,  85],
        [  0,   0,   0,  ...,  45,   1, 467],
        [  0,   0,   0,  ...,   1, 671, 535],
        ...,
        [  0,   0,   0,  ...,  18,   1,   9],
        [  0,   0,   0,  ...,  22,  31, 142],
        [  0,   0,   0,  ...,   0,   6, 331]])

Sample label size:  torch.Size([50])
Sample label: 
 tensor([0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
        1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1,
        1, 0])


Création du modèle.


Le modèle contient :
- une couche pour conversion de nos mots en tokens
- une couche qui convertit les tokens en embeddings de taille spécifique
- couche cachée de LSTM
- une dernière couche d'activation sigmoïde, qui transforme toutes les valeurs de sortie en une valeur comprise entre 0 et 1


In [None]:
import torch.nn as nn

class SentimentLSTM(nn.Module):
    """Embedding -> stacked LSTM -> dropout -> linear -> sigmoid classifier.

    ``forward`` returns one sigmoid score per input sequence, taken from the
    last time step, together with the updated LSTM state.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
        """Build the layers.

        Args:
            vocab_size: Number of rows of the embedding table.
            output_size: Number of output units of the final linear layer.
            embedding_dim: Size of each embedding vector.
            hidden_dim: Size of the LSTM hidden state.
            n_layers: Number of stacked LSTM layers.
            drop_prob: Dropout probability applied between LSTM layers.
        """
        super().__init__()

        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        # embedding and LSTM layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers,
                            dropout=drop_prob, batch_first=True)

        # dropout applied to the LSTM outputs before the linear layer
        self.dropout = nn.Dropout(0.3)

        # linear and sigmoid layers
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sig = nn.Sigmoid()

    def forward(self, x, hidden):
        """Score a batch of token-id sequences.

        Args:
            x: Integer tensor of token ids; its first dimension is the batch
                (the LSTM is built with ``batch_first=True``).
            hidden: ``(h, c)`` tuple as returned by ``init_hidden`` or by a
                previous call.

        Returns:
            ``(sig_out, hidden)`` where ``sig_out`` has one value per batch
            element (the last sigmoid output of each sequence) and ``hidden``
            is the new LSTM state.
        """
        batch_size = x.size(0)

        # embeddings and lstm_out
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)

        # stack up lstm outputs: one row per (sequence, time step)
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)

        # dropout, fully-connected layer and sigmoid on every time step
        out = self.dropout(lstm_out)
        out = self.fc(out)
        sig_out = self.sig(out)

        # regroup per sequence and keep only the last value of each row
        sig_out = sig_out.view(batch_size, -1)
        sig_out = sig_out[:, -1]

        return sig_out, hidden

    def init_hidden(self, batch_size):
        """Return a zeroed ``(h, c)`` LSTM state for ``batch_size`` sequences.

        Both tensors have shape ``(n_layers, batch_size, hidden_dim)`` and are
        created with the dtype and device of the model's own parameters, so
        the state always lives where the model lives (CPU or GPU) without
        relying on any notebook-level flag.
        """
        weight = next(self.parameters())
        state_shape = (self.n_layers, batch_size, self.hidden_dim)
        return (weight.new_zeros(state_shape), weight.new_zeros(state_shape))

Entrainement du modèle

In [None]:
# Hyper-parameters of the Amazon model (also reused by the later models).
vocab_size = 1 + len(vocab_to_int)  # one extra row for the 0 padding id
output_size = 1
embedding_dim = 400
hidden_dim = 256
n_layers = 2

net = SentimentLSTM(
    vocab_size=vocab_size,
    output_size=output_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    n_layers=n_layers,
)
print(net)


SentimentLSTM(
  (embedding): Embedding(1867, 400)
  (lstm): LSTM(400, 256, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (sig): Sigmoid()
)


In [None]:
# Display whether PyTorch can see a CUDA device in this runtime.
torch.cuda.is_available()

True

In [None]:
# Global flag consulted by the training and evaluation cells to decide whether to move the model and batches to the GPU.
train_on_gpu = torch.cuda.is_available()

In [None]:
# Train the Amazon model (net).

# loss and optimization functions
lr=0.001

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lr)


# training params

epochs = 4

counter = 0
print_every = 100
clip=5 # gradient clipping

# move model to GPU, if available
if(train_on_gpu):
    net.cuda()

net.train()
# train for some number of epochs
for e in range(epochs):
    # initialize hidden state
    h = net.init_hidden(batch_size)

    # batch loop
    for inputs, labels in train_loader:
        counter += 1

        # nn.Embedding needs int64 ids; .long() keeps the tensor on its current device
        inputs = inputs.long()
        if(train_on_gpu):
            inputs, labels = inputs.cuda(), labels.cuda()

        # The last batch of an epoch can be smaller than batch_size; the LSTM
        # state must match the actual batch, so start a fresh one in that case.
        if h[0].size(1) != inputs.size(0):
            h = net.init_hidden(inputs.size(0))

        # Detach the carried state so gradients do not flow into earlier batches
        h = tuple([each.data for each in h])

        # zero accumulated gradients
        net.zero_grad()

        # get the output from the model
        output, h = net(inputs, h)

        # calculate the loss and perform backprop
        # (reshape(-1) keeps a 1-D tensor even for a batch of one, unlike squeeze())
        loss = criterion(output.reshape(-1), labels.float())
        loss.backward()
        # clip gradients to limit exploding gradients in the LSTM
        nn.utils.clip_grad_norm_(net.parameters(), clip)
        optimizer.step()

        # loss stats
        if counter % print_every == 0:
            # Get validation loss
            val_h = net.init_hidden(batch_size)
            val_losses = []
            net.eval()
            with torch.no_grad():
                for val_inputs, val_labels in valid_loader:
                    val_inputs = val_inputs.long()
                    if(train_on_gpu):
                        val_inputs, val_labels = val_inputs.cuda(), val_labels.cuda()

                    if val_h[0].size(1) != val_inputs.size(0):
                        val_h = net.init_hidden(val_inputs.size(0))

                    val_output, val_h = net(val_inputs, val_h)
                    val_loss = criterion(val_output.reshape(-1), val_labels.float())

                    val_losses.append(val_loss.item())

            net.train()
            print("Epoch: {}/{}...".format(e+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))

Test du modèle

In [None]:
# Get test data loss and accuracy for the Amazon model on the Amazon test split

test_losses = [] # track loss
num_correct = 0

# init hidden state
h = net.init_hidden(batch_size)

net.eval()
# iterate over test data (no gradients are needed for evaluation)
with torch.no_grad():
    for inputs, labels in test_loader:

        # int64 ids for the embedding; stays on CPU unless a GPU is in use
        inputs = inputs.long()
        if(train_on_gpu):
            inputs, labels = inputs.cuda(), labels.cuda()

        # the last batch may be smaller than batch_size: resize the LSTM state
        if h[0].size(1) != inputs.size(0):
            h = net.init_hidden(inputs.size(0))

        # get predicted outputs (kept 1-D even for a batch of one)
        output, h = net(inputs, h)
        output = output.reshape(-1)

        # calculate loss
        test_loss = criterion(output, labels.float())
        test_losses.append(test_loss.item())

        # convert output probabilities to predicted class (0 or 1)
        pred = torch.round(output)  # rounds to the nearest integer

        # compare predictions to true label
        correct_tensor = pred.eq(labels.float().view_as(pred))
        num_correct += int(correct_tensor.sum().item())


# accuracy over all test data
test_acc = num_correct/len(test_loader.dataset)
print("Test accuracy: {:.3f}".format(test_acc))

Test accuracy: 0.740


# Modèle 2 : Pour imdb dataset

In [None]:
# create Tensor datasets (features, labels) for the IMDB splits
train_data2 = TensorDataset(torch.from_numpy(np.array(X_imdb_train)), torch.from_numpy(np.array(Y_imdb_train)))
valid_data2 = TensorDataset(torch.from_numpy(np.array(X_imdb_dev)), torch.from_numpy(np.array(Y_imdb_dev)))
test_data2 = TensorDataset(torch.from_numpy(np.array(X_imdb_test)), torch.from_numpy(np.array(Y_imdb_test)))
# dataloaders
# NOTE: this rebinds the notebook-wide batch_size used by the following cells.
batch_size = 2
# Only the training data is shuffled; validation/test are iterated in a fixed
# order so that evaluation results are reproducible from run to run.
train_loader2 = DataLoader(train_data2, shuffle=True, batch_size=batch_size)
valid_loader2 = DataLoader(valid_data2, shuffle=False, batch_size=batch_size)
test_loader2 = DataLoader(test_data2, shuffle=False, batch_size=batch_size)

In [None]:
# obtain one batch of training data
dataiter2 = iter(train_loader2)
# Use the built-in next(): DataLoader iterators no longer expose a .next() method.
sample_x2, sample_y2 = next(dataiter2)


Entrainement du modèle 2

In [None]:
# IMDB model: same hyper-parameters as before, sized for the IMDB vocabulary.
vocab_size2 = 1 + len(vocab_to_int2)  # one extra row for the 0 padding id
net2 = SentimentLSTM(
    vocab_size=vocab_size2,
    output_size=output_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    n_layers=n_layers,
)
print(net2)

SentimentLSTM(
  (embedding): Embedding(3077, 400)
  (lstm): LSTM(400, 256, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (sig): Sigmoid()
)


In [None]:
# Train the IMDB model (net2).

# loss and optimization functions
lr=0.001

criterion = nn.BCELoss()
optimizer2 = torch.optim.Adam(net2.parameters(), lr=lr)

epochs = 4

counter = 0
print_every = 100
clip=5 # gradient clipping


# move model to GPU, if available
if(train_on_gpu):
    net2.cuda()

net2.train()

for e in range(epochs):

    # initialize hidden state
    h = net2.init_hidden(batch_size)

    for inputs, labels in train_loader2:
        counter += 1

        # nn.Embedding needs int64 ids; .long() keeps the tensor on its current device
        inputs = inputs.long()
        if(train_on_gpu):
            inputs, labels = inputs.cuda(), labels.cuda()

        # The last batch of an epoch can be smaller than batch_size; the LSTM
        # state must match the actual batch, so start a fresh one in that case.
        if h[0].size(1) != inputs.size(0):
            h = net2.init_hidden(inputs.size(0))

        # Detach the carried state so gradients do not flow into earlier batches
        h = tuple([each.data for each in h])

        net2.zero_grad()

        output, h = net2(inputs, h)

        # reshape(-1) keeps a 1-D tensor even for a batch of one, unlike squeeze()
        loss = criterion(output.reshape(-1), labels.float())
        loss.backward()
        nn.utils.clip_grad_norm_(net2.parameters(), clip)
        optimizer2.step()

        if counter % print_every == 0:
            # Get validation loss
            val_h = net2.init_hidden(batch_size)
            val_losses = []
            net2.eval()
            with torch.no_grad():
                for val_inputs, val_labels in valid_loader2:
                    val_inputs = val_inputs.long()
                    if(train_on_gpu):
                        val_inputs, val_labels = val_inputs.cuda(), val_labels.cuda()

                    if val_h[0].size(1) != val_inputs.size(0):
                        val_h = net2.init_hidden(val_inputs.size(0))

                    val_output, val_h = net2(val_inputs, val_h)
                    val_loss = criterion(val_output.reshape(-1), val_labels.float())

                    val_losses.append(val_loss.item())

            net2.train()
            print("Epoch: {}/{}...".format(e+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))

Epoch: 1/4... Step: 100... Loss: 0.901616... Val Loss: 0.701811
Epoch: 1/4... Step: 200... Loss: 0.664526... Val Loss: 0.733139
Epoch: 1/4... Step: 300... Loss: 0.809648... Val Loss: 0.725327
Epoch: 1/4... Step: 400... Loss: 0.223608... Val Loss: 0.723979
Epoch: 2/4... Step: 500... Loss: 0.246920... Val Loss: 0.837024
Epoch: 2/4... Step: 600... Loss: 0.148009... Val Loss: 0.903636
Epoch: 2/4... Step: 700... Loss: 0.053076... Val Loss: 0.793303
Epoch: 2/4... Step: 800... Loss: 0.296860... Val Loss: 0.744965
Epoch: 3/4... Step: 900... Loss: 0.008704... Val Loss: 1.026233
Epoch: 3/4... Step: 1000... Loss: 0.009821... Val Loss: 1.064687
Epoch: 3/4... Step: 1100... Loss: 0.129862... Val Loss: 1.409217
Epoch: 3/4... Step: 1200... Loss: 0.060528... Val Loss: 1.324847
Epoch: 4/4... Step: 1300... Loss: 0.037490... Val Loss: 1.356025
Epoch: 4/4... Step: 1400... Loss: 0.003150... Val Loss: 1.405396
Epoch: 4/4... Step: 1500... Loss: 0.018738... Val Loss: 1.501619
Epoch: 4/4... Step: 1600... Loss: 

Test du modèle 2

In [None]:
# Get test data loss and accuracy for the IMDB model on the IMDB test split

test_losses = [] # track loss
num_correct = 0

# init hidden state
h = net2.init_hidden(batch_size)

net2.eval()
# iterate over test data (no gradients are needed for evaluation)
with torch.no_grad():
    for inputs, labels in test_loader2:

        # int64 ids for the embedding; stays on CPU unless a GPU is in use
        inputs = inputs.long()
        if(train_on_gpu):
            inputs, labels = inputs.cuda(), labels.cuda()

        # the last batch may be smaller than batch_size: resize the LSTM state
        if h[0].size(1) != inputs.size(0):
            h = net2.init_hidden(inputs.size(0))

        # get predicted outputs (kept 1-D even for a batch of one)
        output, h = net2(inputs, h)
        output = output.reshape(-1)

        # calculate loss
        test_loss = criterion(output, labels.float())
        test_losses.append(test_loss.item())

        # convert output probabilities to predicted class (0 or 1)
        pred = torch.round(output)  # rounds to the nearest integer

        # compare predictions to true label
        correct_tensor = pred.eq(labels.float().view_as(pred))
        num_correct += int(correct_tensor.sum().item())


# accuracy over all test data: divide by the size of the set that was actually
# iterated (test_loader2), not by the Amazon test set
test_acc = num_correct/len(test_loader2.dataset)
print("Test accuracy: {:.3f}".format(test_acc))

Test accuracy: 0.820


# Modèle 3 : Pour yelp dataset

In [None]:
# create Tensor datasets (features, labels) for the Yelp splits
train_data3 = TensorDataset(torch.from_numpy(np.array(X_yelp_train)), torch.from_numpy(np.array(Y_yelp_train)))
valid_data3 = TensorDataset(torch.from_numpy(np.array(X_yelp_dev)), torch.from_numpy(np.array(Y_yelp_dev)))
test_data3 = TensorDataset(torch.from_numpy(np.array(X_yelp_test)), torch.from_numpy(np.array(Y_yelp_test)))
# dataloaders
# NOTE: this rebinds the notebook-wide batch_size used by the following cells.
batch_size = 50
# Only the training data is shuffled; validation/test are iterated in a fixed
# order so that evaluation results are reproducible from run to run.
train_loader3 = DataLoader(train_data3, shuffle=True, batch_size=batch_size)
valid_loader3 = DataLoader(valid_data3, shuffle=False, batch_size=batch_size)
test_loader3 = DataLoader(test_data3, shuffle=False, batch_size=batch_size)

In [None]:
# obtain one batch of training data
dataiter3 = iter(train_loader3)
# Use the built-in next(): DataLoader iterators no longer expose a .next() method.
sample_x3, sample_y3 = next(dataiter3)


In [None]:
# Yelp model: same hyper-parameters as before, sized for the Yelp vocabulary.
vocab_size3 = 1 + len(vocab_to_int3)  # one extra row for the 0 padding id
net3 = SentimentLSTM(
    vocab_size=vocab_size3,
    output_size=output_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    n_layers=n_layers,
)
print(net3)

SentimentLSTM(
  (embedding): Embedding(2052, 400)
  (lstm): LSTM(400, 256, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (sig): Sigmoid()
)


In [None]:
# Train the Yelp model (net3).

# loss and optimization functions
lr=0.001

criterion = nn.BCELoss()
optimizer3 = torch.optim.Adam(net3.parameters(), lr=lr)


# training params

epochs = 4

counter = 0
print_every = 100
clip=5 # gradient clipping

# move model to GPU, if available
if(train_on_gpu):
    net3.cuda()

net3.train()
# train for some number of epochs
for e in range(epochs):
    # initialize hidden state
    h = net3.init_hidden(batch_size)

    # batch loop
    for inputs, labels in train_loader3:
        counter += 1

        # nn.Embedding needs int64 ids; .long() keeps the tensor on its current device
        inputs = inputs.long()
        if(train_on_gpu):
            inputs, labels = inputs.cuda(), labels.cuda()

        # The last batch of an epoch can be smaller than batch_size; the LSTM
        # state must match the actual batch, so start a fresh one in that case.
        if h[0].size(1) != inputs.size(0):
            h = net3.init_hidden(inputs.size(0))

        # Detach the carried state so gradients do not flow into earlier batches
        h = tuple([each.data for each in h])

        # zero accumulated gradients
        net3.zero_grad()

        # get the output from the model
        output, h = net3(inputs, h)

        # calculate the loss and perform backprop
        # (reshape(-1) keeps a 1-D tensor even for a batch of one, unlike squeeze())
        loss = criterion(output.reshape(-1), labels.float())
        loss.backward()
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        nn.utils.clip_grad_norm_(net3.parameters(), clip)
        # Step the optimizer that owns net3's parameters; stepping optimizer2
        # would leave net3 untrained.
        optimizer3.step()

        # loss stats
        if counter % print_every == 0:
            # Get validation loss
            val_h = net3.init_hidden(batch_size)
            val_losses = []
            net3.eval()
            with torch.no_grad():
                for val_inputs, val_labels in valid_loader3:
                    val_inputs = val_inputs.long()
                    if(train_on_gpu):
                        val_inputs, val_labels = val_inputs.cuda(), val_labels.cuda()

                    if val_h[0].size(1) != val_inputs.size(0):
                        val_h = net3.init_hidden(val_inputs.size(0))

                    val_output, val_h = net3(val_inputs, val_h)
                    val_loss = criterion(val_output.reshape(-1), val_labels.float())

                    val_losses.append(val_loss.item())

            net3.train()
            print("Epoch: {}/{}...".format(e+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))

Test du modèle 3

In [None]:
# Get test data loss and accuracy for the Yelp model on the Yelp test split

test_losses = [] # track loss
num_correct = 0

# init hidden state
h = net3.init_hidden(batch_size)

net3.eval()
# iterate over test data (no gradients are needed for evaluation)
with torch.no_grad():
    for inputs, labels in test_loader3:

        # int64 ids for the embedding; stays on CPU unless a GPU is in use
        inputs = inputs.long()
        if(train_on_gpu):
            inputs, labels = inputs.cuda(), labels.cuda()

        # the last batch may be smaller than batch_size: resize the LSTM state
        if h[0].size(1) != inputs.size(0):
            h = net3.init_hidden(inputs.size(0))

        # get predicted outputs (kept 1-D even for a batch of one)
        output, h = net3(inputs, h)
        output = output.reshape(-1)

        # calculate loss
        test_loss = criterion(output, labels.float())
        test_losses.append(test_loss.item())

        # convert output probabilities to predicted class (0 or 1)
        pred = torch.round(output)  # rounds to the nearest integer

        # compare predictions to true label
        correct_tensor = pred.eq(labels.float().view_as(pred))
        num_correct += int(correct_tensor.sum().item())


# accuracy over all test data: divide by the size of the set that was actually
# iterated (test_loader3), not by the Amazon test set
test_acc = num_correct/len(test_loader3.dataset)
print("Test accuracy: {:.3f}".format(test_acc))

Test accuracy: 0.480


# Evaluation des différents modèles dans leurs out-domain

**Premier modèle**

In [None]:
# Out-of-domain evaluation: Amazon-trained model (net) on the IMDB test split.
#
# net's embedding table is indexed by the Amazon vocabulary (vocab_to_int),
# whereas test_loader2 holds ids from the IMDB vocabulary (vocab_to_int2):
# the same id means a different word and can exceed net's table size.
# The IMDB test reviews are therefore re-encoded with the Amazon vocabulary;
# words unknown to that vocabulary are dropped, and reviews left with no
# known word are skipped.
ood1_encoded = [[vocab_to_int[w] for w in review.split() if w in vocab_to_int]
                for review in text_split5]
ood1_pairs = [(ids, label) for ids, label in zip(ood1_encoded, imdb_test_label) if len(ids) > 0]
ood1_X = pad_features([ids for ids, _ in ood1_pairs], seq_length)
ood1_Y = np.array([label for _, label in ood1_pairs])
ood1_loader = DataLoader(TensorDataset(torch.from_numpy(ood1_X), torch.from_numpy(ood1_Y)),
                         shuffle=False, batch_size=batch_size)

test_losses = [] # track loss
num_correct = 0

# init hidden state
h = net.init_hidden(batch_size)

net.eval()
# iterate over test data (no gradients are needed for evaluation)
with torch.no_grad():
    for inputs, labels in ood1_loader:

        # int64 ids for the embedding; stays on CPU unless a GPU is in use
        inputs = inputs.long()
        if(train_on_gpu):
            inputs, labels = inputs.cuda(), labels.cuda()

        # the last batch may be smaller than batch_size: resize the LSTM state
        if h[0].size(1) != inputs.size(0):
            h = net.init_hidden(inputs.size(0))

        # get predicted outputs (kept 1-D even for a batch of one)
        output, h = net(inputs, h)
        output = output.reshape(-1)

        # calculate loss
        test_loss = criterion(output, labels.float())
        test_losses.append(test_loss.item())

        # convert output probabilities to predicted class (0 or 1)
        pred = torch.round(output)  # rounds to the nearest integer

        # compare predictions to true label
        correct_tensor = pred.eq(labels.float().view_as(pred))
        num_correct += int(correct_tensor.sum().item())


# accuracy over the reviews that were actually evaluated
n_evaluated = len(ood1_loader.dataset)
test_acc = num_correct/n_evaluated if n_evaluated else float('nan')
print("Test accuracy: {:.3f}".format(test_acc))

**Deuxième modèle**

In [None]:
# Out-of-domain evaluation: IMDB-trained model (net2) on the Amazon test split.
#
# net2's embedding table is indexed by the IMDB vocabulary (vocab_to_int2),
# whereas the Amazon loaders hold ids from the Amazon vocabulary
# (vocab_to_int): the same id means a different word. The Amazon test reviews
# are therefore re-encoded with the IMDB vocabulary; words unknown to that
# vocabulary are dropped, and reviews left with no known word are skipped.
ood2_encoded = [[vocab_to_int2[w] for w in review.split() if w in vocab_to_int2]
                for review in text_split2]
ood2_pairs = [(ids, label) for ids, label in zip(ood2_encoded, ama_test_label) if len(ids) > 0]
ood2_X = pad_features([ids for ids, _ in ood2_pairs], seq_length)
ood2_Y = np.array([label for _, label in ood2_pairs])
ood2_loader = DataLoader(TensorDataset(torch.from_numpy(ood2_X), torch.from_numpy(ood2_Y)),
                         shuffle=False, batch_size=batch_size)

test_losses = [] # track loss
num_correct = 0

# init hidden state
h = net2.init_hidden(batch_size)

net2.eval()
# iterate over test data (no gradients are needed for evaluation)
with torch.no_grad():
    for inputs, labels in ood2_loader:

        # int64 ids for the embedding; stays on CPU unless a GPU is in use
        inputs = inputs.long()
        if(train_on_gpu):
            inputs, labels = inputs.cuda(), labels.cuda()

        # the last batch may be smaller than batch_size: resize the LSTM state
        if h[0].size(1) != inputs.size(0):
            h = net2.init_hidden(inputs.size(0))

        # get predicted outputs (kept 1-D even for a batch of one)
        output, h = net2(inputs, h)
        output = output.reshape(-1)

        # calculate loss
        test_loss = criterion(output, labels.float())
        test_losses.append(test_loss.item())

        # convert output probabilities to predicted class (0 or 1)
        pred = torch.round(output)  # rounds to the nearest integer

        # compare predictions to true label
        correct_tensor = pred.eq(labels.float().view_as(pred))
        num_correct += int(correct_tensor.sum().item())


# accuracy over the reviews that were actually evaluated
n_evaluated = len(ood2_loader.dataset)
test_acc = num_correct/n_evaluated if n_evaluated else float('nan')
print("Test accuracy: {:.3f}".format(test_acc))