<h1>Sentiment analysis</h1>
<p>By Turlagh CLANCY (20220024) and Benoît LU (20141188)</p>

<p>We first import the basic library that will be used everywhere</p>

In [77]:
import torch
import pandas as pd
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

<p>The dataset classes will help us iterate through the data and split it.</p>
<p>The DFDataset is just a container for the train/test splits from the original dataset.</p>
<p>SSTDataset is a container for the stanford's dataset.</p>

In [78]:
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split

class DFDataset(Dataset):
    def __init__(self, df):
        self.df = df
        self.df.reset_index(drop=True, inplace=True)

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        return self.df[['phrases', 'sentiment values']].iloc[idx].to_dict()
    
    def splits(self, test_size=0.3):
        train_df, test_df = train_test_split(self.df, test_size=test_size)
        return DFDataset(train_df), DFDataset(test_df)

class SSTDataset(Dataset):
    def __init__(self, data_folder):
        self.data_folder = data_folder
        df_sent = pd.read_table(f'{data_folder}/datasetSentences.txt')
        df_lab = pd.read_table(f'{data_folder}/sentiment_labels.txt',sep='|')
        df_dict = pd.read_table(f'{data_folder}/dictionary.txt',sep='|', names=('phrases','phrase ids'))
        df_common = df_dict[df_dict['phrases'].isin(df_sent['sentence'])]
        self.df = df_lab.join(df_common.set_index('phrase ids'), on='phrase ids', how='right')
        self.df.reset_index(drop=True, inplace=True)

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        return self.df[['phrases', 'sentiment values']].iloc[idx].to_dict()
    
    def splits(self, test_size=0.3):
        train_df, test_df = train_test_split(self.df, test_size=test_size)
        return DFDataset(train_df), DFDataset(test_df)
    
    def phrases(self):
        return self.df['phrases']

SST = SSTDataset('./data')
train_dataset, test_dataset = SST.splits()

<p>We're now loading each embedding that'll be used throughout the project <b>beware of their size</b>, they will be loaded at the root of the project.</p>

In [79]:
# Load GloVe pre-train model
from torchtext.vocab import GloVe

glove = GloVe(name='6B', dim=100, unk_init = torch.Tensor.normal_)

In [80]:
# Load word2vec pre-train model
import os
os.environ["GENSIM_DATA_DIR"] = f"D:\python projects\sentiment-analysis\gensim-data"
import gensim.downloader as api
from gensim.models import Word2Vec

word2vec = api.load('word2vec-google-news-300')

In [81]:
# Load FastText pre-train model
from torchtext.vocab import FastText

fasttext = FastText(language='en', unk_init = torch.Tensor.normal_)

In [83]:
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
tokenizer = get_tokenizer('spacy', language='en_core_web_sm')
documents = [TaggedDocument(tokenizer(doc), [i]) for i, doc in enumerate(SST.phrases())]
doc2vec = Doc2Vec(documents, vector_size=300, window=10, workers=4, epochs=30)

<p>We sample from the dataset in batch-mode, we have to make sure that each batch contains targets of equal length, else, we pad them.</p>

In [84]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
import random

def collate_batch(batch, text_transform_fn, pad_id):
    labels, texts, lengths = [], [], []
    for row in batch:
        labels.append(row['sentiment values'])
        tokens = text_transform_fn(row['phrases'])
        texts.append(torch.tensor(tokens))
        lengths.append(len(tokens))
    # Decreasing sorting required for packed padded sequences
    df = pd.DataFrame({'texts':texts, 'lengths':lengths}).sort_values(by='lengths', ascending=False)
    return torch.tensor(labels), pad_sequence(df['texts'].tolist(), padding_value=pad_id), torch.tensor(df['lengths'].tolist())

# src: https://github.com/pytorch/text/blob/master/examples/legacy_tutorial/migration_tutorial.ipynb
def batch_sampler(dataset, tokenizer, batch_size):
    indices = [(i, len(tokenizer(s['phrases']))) for i, s in enumerate(dataset)]
    random.shuffle(indices)
    pooled_indices = []
    # create pool of indices with similar lengths 
    for i in range(0, len(indices), batch_size * 100):
        pooled_indices.extend(sorted(indices[i:i + batch_size * 100], key=lambda x: x[1]))

    pooled_indices = [x[0] for x in pooled_indices]

    # yield indices for current batch
    return [pooled_indices[i:i + batch_size] for i in range(0, len(pooled_indices), batch_size)]

<p>Here are the neural networks classes. Both GRU and LSTM share the same architecture aside from the core of the network which is LSTM or GRU. They're redundant in their definition, but it's clearer to explicitly instantiate an LSTM or a GRU, instead of passing a parameter ("lstm", "gru")/p>

In [85]:
import torch.nn as nn

class LSTM(nn.Module):
    def __init__(self, pretrained_embeddings, hidden_dim, output_dim, n_layers, bidirectional, dropout, pad_idx, freeze_embeddings):
        super().__init__()
        self.embedding = nn.Embedding.from_pretrained(pretrained_embeddings, padding_idx = pad_idx, freeze=freeze_embeddings)
        self.rnn = nn.LSTM(pretrained_embeddings.shape[1], hidden_dim, num_layers=n_layers, bidirectional=bidirectional, dropout=dropout)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text, lengths):
        #text = [sent len, batch size]
        embedded = self.dropout(self.embedding(text))
        #embedded = [sent len, batch size, emb dim]
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, lengths.to('cpu'))
        #hidden = [num layers * num directions, batch size, hid dim]
        #cell = [num layers * num directions, batch size, hid dim]
        packed_output, (hidden, cell) = self.rnn(packed_embedded)
        #output = [sent len, batch size, hid dim * num directions]
        output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)
        #hidden = [batch size, hid dim * num directions]
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
            
        return self.fc(hidden)

class GRU(nn.Module):
    def __init__(self, pretrained_embeddings, hidden_dim, output_dim, n_layers, bidirectional, dropout, pad_idx, freeze_embeddings):
        super().__init__()
        self.embedding = nn.Embedding.from_pretrained(pretrained_embeddings, padding_idx = pad_idx, freeze=freeze_embeddings)
        self.rnn = nn.GRU(pretrained_embeddings.shape[1], hidden_dim, num_layers=n_layers, bidirectional=bidirectional, dropout=dropout)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text, lengths):
        #text = [sent len, batch size]
        embedded = self.dropout(self.embedding(text))
        #embedded = [sent len, batch size, emb dim]
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, lengths.to('cpu'))
        #hidden = [num layers * num directions, batch size, hid dim]
        #cell = [num layers * num directions, batch size, hid dim]
        packed_output, hidden = self.rnn(packed_embedded)
        #output = [sent len, batch size, hid dim * num directions]
        output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)
        #hidden = [batch size, hid dim * num directions]
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
            
        return self.fc(hidden)


<p>Here are first the parameters that will be passed to the neural network and second we have a generic setup function that takes into account the architecture and the embeddings to use, as per the project statement we're going through LSTM or GRU for the architecture and GlOve, FastText, Word2Vec and document-level for the embeddings.</p>

In [86]:
from torchtext.data.utils import get_tokenizer

HIDDEN_DIM = 50
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
FREEZE_EMBEDDINGS = True

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

def setup(architecture, embeddings):
    if embeddings.lower() == 'glove':
        voc = glove.stoi
        pretrain_emb = glove.vectors
    elif embeddings.lower() == 'fasttext':
        voc = fasttext.stoi
        pretrain_emb = fasttext.vectors
    elif embeddings.lower() == 'word2vec':
        voc = word2vec.key_to_index
        pretrain_emb = torch.from_numpy(word2vec.vectors)
    elif embeddings.lower() == 'documentlevel':
        voc = doc2vec.wv.key_to_index
        pretrain_emb = torch.from_numpy(doc2vec.wv.vectors)
    else:
        print("Unknown embeddings passed, failed to setup")
        return None
    
    specials = {'<unk>':len(voc), '<pad>':len(voc) + 1}
    tokenizer = get_tokenizer('spacy', language='en_core_web_sm')
    batch_size = 8
    text_transform = lambda x: [voc[token] if token in voc else specials['<unk>'] for token in tokenizer(x)]
    
    train_dataloader = DataLoader(train_dataset,
                                  batch_sampler=batch_sampler(train_dataset, tokenizer, batch_size),
                                  collate_fn=lambda batch: collate_batch(batch, text_transform, specials['<pad>']))
    test_dataloader = DataLoader(test_dataset,
                                 batch_sampler=batch_sampler(test_dataset, tokenizer, batch_size),
                                 collate_fn=lambda batch: collate_batch(batch, text_transform, specials['<pad>']))

    # We concatenate the special characters
    pretrain_emb = torch.cat((pretrain_emb, torch.zeros(len(specials),pretrain_emb.shape[1])))
    
    if architecture.lower() == 'lstm':
        model = LSTM(pretrain_emb, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT, specials['<pad>'], FREEZE_EMBEDDINGS)
    elif architecture.lower() == 'gru':
        model = GRU(pretrain_emb, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT, specials['<pad>'], FREEZE_EMBEDDINGS)
    else:
        print("Unknown architecture passed, failed to setup")
        return None
    print(f'The model has {count_parameters(model):,} trainable parameters')
    return (model, train_dataloader, test_dataloader)

<p>We train and evaluate the model though each datalaoder (which iterates through the dataset in batch-mode) using an accuracy based on the 5 classes of the stanford dataset, from 0 to 1 by a step of 0.2.</p>
<p>The train and evaluate functions are the same, aside from the optimization through the gradient being enabled during training.</p>

In [87]:
from math import floor
import time

def accuracy(preds, y):
    #round predictions to the closest integer
    rounded_preds = torch.floor(5*torch.sigmoid(preds))
    correct = (rounded_preds == torch.floor(5*y)).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

def train(model, dataloader, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()

    for i, batch in enumerate(dataloader):
        labels, inputs, inputs_lengths = batch
        optimizer.zero_grad()
        predictions = model(inputs, inputs_lengths).squeeze(1)
        loss = criterion(predictions, labels)
        acc = accuracy(predictions, labels)
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    return epoch_loss / len(dataloader), epoch_acc / len(dataloader)

def evaluate(model, dataloader, criterion):
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()

    with torch.no_grad():
        for i, batch in enumerate(dataloader):
            labels, inputs, inputs_lengths = batch

            predictions = model(inputs, inputs_lengths).squeeze(1)
            loss = criterion(predictions, labels)
            acc = accuracy(predictions, labels)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
    return epoch_loss / len(dataloader), epoch_acc / len(dataloader)

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

<p>Each pair of architecture and embedding is improved upon through <i>N_EPOCHS</i> epochs.</p>

In [90]:
import torch.optim as optim

N_EPOCHS = 8

def compute_epochs(model, train_dataloader, test_dataloader):
    optimizer = optim.Adam(model.parameters())
    criterion = nn.BCEWithLogitsLoss()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    criterion = criterion.to(device)
    test_accs = []
    train_accs = []
    for epoch in range(N_EPOCHS):
        start_time = time.time()
        train_loss, train_acc = train(model, train_dataloader, optimizer, criterion)
        test_loss, test_acc = evaluate(model, test_dataloader, criterion)

        end_time = time.time()
        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
        print(f'\t Test Loss: {test_loss:.3f} |  Test Acc: {test_acc*100:.2f}%')
        test_accs.append(test_acc)
        train_accs.append(train_acc)
    return train_accs, test_accs

<p>We generate the corresponding accuracies here.</p>

In [91]:
import pandas as pd
arrays = [
    ['lstm', 'lstm', 'lstm', 'lstm', 'gru', 'gru', 'gru', 'gru'],
    ['glove', 'fasttext', 'word2vec', 'documentlevel', 'glove', 'fasttext', 'word2vec', 'documentlevel']
]
tuples = list(zip(*arrays))
index = pd.MultiIndex.from_tuples(tuples, names=["architecture", "embedding"])
df = pd.DataFrame([], index=index, columns=['train_acc', 'test_acc'])

print("==========================")
print("EPOCHS ON LSTM with GlOve")
print("==========================")
df.loc['lstm', 'glove'][['train_acc', 'test_acc']] = compute_epochs(*setup('lstm','glove'))
print("==========================")
print("EPOCHS ON LSTM with Fasttext")
print("==========================")
df.loc['lstm', 'fasttext'][['train_acc', 'test_acc']] = compute_epochs(*setup('lstm','fasttext'))
print("==========================")
print("EPOCHS ON LSTM with word2vec")
print("==========================")
df.loc['lstm', 'word2vec'][['train_acc', 'test_acc']] = compute_epochs(*setup('lstm','word2vec'))
print("==========================")
print("EPOCHS ON LSTM with documentlevel")
print("==========================")
df.loc['lstm', 'documentlevel'][['train_acc', 'test_acc']] = compute_epochs(*setup('lstm','documentlevel'))

print("==========================")
print("EPOCHS ON GRU with GlOve")
print("==========================")
df.loc['gru', 'glove'][['train_acc', 'test_acc']] = compute_epochs(*setup('gru','glove'))
print("==========================")
print("EPOCHS ON GRU with Fasttext")
print("==========================")
df.loc['gru', 'fasttext'][['train_acc', 'test_acc']] = compute_epochs(*setup('gru','fasttext'))
print("==========================")
print("EPOCHS ON GRU with word2vec")
print("==========================")
df.loc['gru', 'word2vec'][['train_acc', 'test_acc']] = compute_epochs(*setup('gru','word2vec'))
print("==========================")
print("EPOCHS ON GRU with documentlevel")
print("==========================")
df.loc['gru', 'documentlevel'][['train_acc', 'test_acc']] = compute_epochs(*setup('gru','documentlevel'))

EPOCHS ON LSTM with GlOve
The model has 121,701 trainable parameters
Epoch: 01 | Epoch Time: 0m 44s
	Train Loss: 0.688 | Train Acc: 21.43%
	 Test Loss: 0.680 |  Test Acc: 21.76%
Epoch: 02 | Epoch Time: 0m 48s
	Train Loss: 0.679 | Train Acc: 24.09%
	 Test Loss: 0.679 |  Test Acc: 25.56%
Epoch: 03 | Epoch Time: 1m 3s
	Train Loss: 0.676 | Train Acc: 24.44%
	 Test Loss: 0.675 |  Test Acc: 26.48%
Epoch: 04 | Epoch Time: 0m 52s
	Train Loss: 0.676 | Train Acc: 25.30%
	 Test Loss: 0.678 |  Test Acc: 26.33%
Epoch: 05 | Epoch Time: 1m 3s
	Train Loss: 0.674 | Train Acc: 26.10%
	 Test Loss: 0.674 |  Test Acc: 28.05%
Epoch: 06 | Epoch Time: 1m 0s
	Train Loss: 0.674 | Train Acc: 26.25%
	 Test Loss: 0.674 |  Test Acc: 27.96%
Epoch: 07 | Epoch Time: 0m 52s
	Train Loss: 0.673 | Train Acc: 26.30%
	 Test Loss: 0.673 |  Test Acc: 27.14%
Epoch: 08 | Epoch Time: 0m 54s
	Train Loss: 0.672 | Train Acc: 26.68%
	 Test Loss: 0.676 |  Test Acc: 26.10%
EPOCHS ON LSTM with Fasttext
The model has 201,701 trainable p

In [137]:
import numpy as np
import matplotlib.pyplot as plt
params_tuples = [('lstm', 'glove', 'LSTM (GlOve)'),
          ('lstm', 'fasttext', 'LSTM (FastText)'),
          ('lstm', 'word2vec', 'LSTM (Word2Vec Google News 300)'),
          ('lstm', 'documentlevel', 'LSTM (document-level)'),
          ('gru', 'glove', 'GRU (GlOve)'),
          ('gru', 'fasttext', 'GRU (FastText)'),
          ('gru', 'word2vec', 'GRU (Word2Vec Google News 300)'),
          ('gru', 'documentlevel', 'GRU (document-level)'),]

for params in params_tuples:
    plt.plot(np.arange(1,9), df.loc[params[0], params[1]]['train_acc'], label='train', linewidth=3)
    plt.plot(np.arange(1,9), df.loc[params[0], params[1]]['test_acc'], label='test', linewidth=3)
    plt.title(params[2], fontsize=25)
    plt.xlabel('epoch', fontsize=20)
    plt.ylabel('accuracy', fontsize=20)
    plt.tick_params(axis='both', which='major', labelsize=15)
    plt.ylim(0.2,0.35)
    plt.legend(fontsize=15)
    plt.savefig(f'{params[0]}_{params[1]}.png', bbox_inches='tight')
    plt.clf()

<Figure size 640x480 with 0 Axes>