<a href="https://colab.research.google.com/github/Zinni98/Sentiment-analysis-project/blob/main/project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Polarity Classification

### Get the data

In [1]:
import nltk
import torch
nltk.download("punkt")
nltk.download("movie_reviews")
nltk.download("subjectivity")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
[nltk_data] Downloading package subjectivity to /root/nltk_data...
[nltk_data]   Unzipping corpora/subjectivity.zip.


True

## TODO: Use spacy negation marking

In [2]:
from nltk.corpus import movie_reviews
from nltk.sentiment.util import mark_negation
import numpy as np


class MovieReviewsCorpus():
  def __init__(self):
    # list of documents, each document is a list containing words of that document
    self.mr = movie_reviews
    self.corpus, self.labels = self._flatten()
    self.unprocessed_corpus = self._get_corpus()
    self.corpus_words = self.get_corpus_words()
    self.vocab = self._create_vocab()

  
  def _list_to_str(self, doc) -> str:
    """
    Put all elements of the list into a single string, separating each element with a space.
    """
    return " ".join([w for sent in doc for w in sent])
  
  
  def _flatten(self):
    """
    Returns
    -------
    list[list[str]]
      Each inner list represents a document. Each document is a list of tokens.
    """

    # 3 nested list: each list contain a document, each inner list contains a phrase (until fullstop), each phrase contains words.
    neg = self.mr.paras(categories = "neg")
    pos = self.mr.paras(categories = "pos")

    corpus = [[w for w in self._list_to_str(d).split(" ")] for d in pos] + [[w for w in self._list_to_str(d).split(" ")] for d in neg]
    labels = np.array([0] * len(pos) + [1] * len(neg))
    return corpus, labels
  
  def _get_corpus(self):
    neg = self.mr.paras(categories = "neg")
    pos = self.mr.paras(categories = "pos")

    return neg + pos

  def movie_reviews_dataset_raw(self):
    """
    Returns the dataset containing:
    
    - A list of all the documents
    - The corresponding label for each document

    Returns
    -------
    tuple(list, np.array)
      The dataset: first element is the list of the document, the second element of the tuple is the associated label (positive or negative) for each document
    """
    
    return self.corpus, self.labels
  
  def get_corpus_words(self) -> list:
    return [w for doc in self.corpus for w in doc]

  def _create_vocab(self):
    vocab = dict()
    for word in self.corpus_words:
      try:
        vocab[word] += 1
      except:
        vocab[word] = 1
    return vocab

  def __len__(self):
    return len(self.corpus)


"""class MovieReviewsCorpusNegMarked(MovieReviewsCorpus):
  def __init__(self):
    super().__init__()
    self._neg_marking()
    self.corpus_words = self.get_corpus_words()
    self.vocab = self._create_vocab()
  
  def _neg_marking(self):
    negated_corpus = [self._mark(doc) for doc in self.corpus]
    self.corpus = negated_corpus
  
  def _mark(self, doc):
    # negates the whole document
    negated_doc = mark_negation(doc, double_neg_flip=True)
    return negated_doc"""

'class MovieReviewsCorpusNegMarked(MovieReviewsCorpus):\n  def __init__(self):\n    super().__init__()\n    self._neg_marking()\n    self.corpus_words = self.get_corpus_words()\n    self.vocab = self._create_vocab()\n  \n  def _neg_marking(self):\n    negated_corpus = [self._mark(doc) for doc in self.corpus]\n    self.corpus = negated_corpus\n  \n  def _mark(self, doc):\n    # negates the whole document\n    negated_doc = mark_negation(doc, double_neg_flip=True)\n    return negated_doc'

In [3]:
import operator
from tqdm import tqdm
from torchtext.vocab import GloVe
import torch

corpus = MovieReviewsCorpus()

global_vectors = GloVe(name='840B', dim=300)

def check_coverage(vocab,embeddings_index):
    a = {}
    oov = {}
    k = 0
    i = 0
    null_embedding = torch.tensor([0.0]*300)
    for word in tqdm(vocab):
        try:
          if torch.equal(embeddings_index.get_vecs_by_tokens(word), null_embedding):
            raise KeyError
          a[word] = embeddings_index.get_vecs_by_tokens(word)
          k += vocab[word]
        except:

            oov[word] = vocab[word]
            i += vocab[word]
            pass

    print()
    print('Found embeddings for {:.2%} of vocab'.format(len(a) / len(vocab)))
    print('Found embeddings for  {:.2%} of all text'.format(k / (k + i)))
    sorted_x = sorted(oov.items(), key=operator.itemgetter(1))[::-1]

    return sorted_x


oov = check_coverage(corpus.vocab, global_vectors)

KeyboardInterrupt: ignored

In [15]:
from torch.utils.data import Dataset
from torchtext.vocab import GloVe

class MovieReviewsDataset(Dataset):
  def __init__(self, raw_dataset):
    self.corpus = np.array(raw_dataset[0], dtype=object)
    self.elements_to_tensor()
    self.labels = np.array(raw_dataset[1], dtype=object)

  def __len__(self):
    return len(self.corpus)
  
  def elements_to_tensor(self):
    global_vectors = GloVe(name='840B', dim=300)
    for idx, item in enumerate(self.corpus):
      item_tensor = torch.empty(len(item), 300)
      for i in range(len(item)):
        token = item[i]
        item_tensor[i] = global_vectors.get_vecs_by_tokens(token)
      self.corpus[idx] = item_tensor
  
  def __getitem__(self, index):
    item = self.corpus[index]
    label = self.labels[index]
    return (item, label)

### Create the model class
Let's first try with a simple BiLSTM

In [4]:
from unicodedata import bidirectional
import torch.nn as nn
from torch.autograd import Variable
from torch.nn.utils.rnn import pad_packed_sequence

class BiLSTM(nn.Module):
  def __init__(self, device = "cuda", input_size = 300, hidden_size = 128, output_size = 2):
    super(BiLSTM, self).__init__()
    self.hidden_size = hidden_size
    self.device = device
    self.lstm = nn.LSTM(input_size, hidden_size, batch_first = True, bidirectional=True)
    self.relu = nn.ReLU()
    self.fc = nn.Linear(hidden_size*2, output_size)
  
  def init_hidden(self, batch_size):
      if self.cuda:
        return (torch.zeros(2, batch_size, self.hidden_size).to(self.device), torch.zeros(2, batch_size, self.hidden_size).to(self.device))
  
  def forward(self, x):
    print(x.batch_sizes[0].item())
    batch_size = x.batch_sizes[0].item()
    hidden = self.init_hidden(batch_size)

    # output: batch_size, sequence_length, hidden_size * 2 (since is bilstm)
    out, _ = self.lstm(x, hidden)
    out, input_sizes = pad_packed_sequence(out, batch_first=True)
    # Interested only in the last layer
    out = out[list(range(batch_size)), input_sizes - 1, :]
    out = self.relu(out)
    out = self.fc(out)

    return out
    





In [5]:
def training_step(net, data_loader, optimizer, cost_function, device = 'cuda'):
  cumulative_loss = 0
  cumulative_accuracy = 0
  samples = 0

  net.train()

  for batch_idx, (inputs, targets) in enumerate(data_loader):

    inputs = inputs.to(device)
    targets = targets.to(device)
    in_size = targets.size(dim=0)

    outputs = net(inputs)

    loss = cost_function(outputs, targets)

    loss.backward()

    optimizer.step()

    optimizer.zero_grad()
    
    samples += in_size
    cumulative_loss += loss.item()
    _, predicted = outputs.max(dim=1)

    cumulative_accuracy += predicted.eq(targets).sum().item()

  return cumulative_loss/samples, (cumulative_accuracy/samples)*100

In [6]:
def test_step(net, data_loader, cost_function, device = 'cuda'):
  cumulative_loss = 0
  cumulative_accuracy = 0
  samples = 0

  net.eval()

  with torch.no_grad():

    for batch_idx, (inputs, targets) in enumerate(data_loader):
      inputs = inputs.to(device)
      targets = targets.to(device)
      in_size = targets.size(dim=0)

      outputs = net(inputs)

      loss = cost_function(outputs, targets)

      samples += in_size
      cumulative_loss += loss.item()
      _, predicted = outputs.max(dim=1)

      cumulative_accuracy += predicted.eq(targets).sum().item()

    return cumulative_loss/samples, (cumulative_accuracy/samples)*100


In [7]:
from torch.utils.data import DataLoader
from torch.optim import Adam
import torch.nn as nn

def main(train_loader, test_loader, device = "cuda", epochs = 10):

  net = BiLSTM(device = device).to(device)

  optimizer = Adam(net.parameters(), 0.001, betas = (0.9, 0.9), weight_decay = 0.0001, amsgrad=True)

  cost_function = nn.CrossEntropyLoss()

  for e in range(epochs):
    print(f"epoch {e}:")
    train_loss, train_accuracy = training_step(net, train_loader, optimizer, cost_function, device)
    print(f"Training loss: {train_loss} \n Training accuracy: {train_accuracy}")
  
  
  _, test_accuracy = test_step(net, test_loader, cost_function, device)


  return test_accuracy


In [8]:
from typing import List
from torch.nn.utils.rnn import pack_padded_sequence
from torch.utils.data import Subset
from sklearn.model_selection import train_test_split

def pad(batch, max_size):
  pad = torch.zeros(batch[0].size(dim=1))
  for idx in range(len(batch)):
    remaining = max_size - batch[idx].size(dim = 0)
    batch[idx] = torch.cat((batch[idx], pad.repeat((remaining, 1))), dim = 0)
  return batch

def batch_to_tensor(X: List[torch.tensor], max_size):
  X_tensor = torch.zeros(len(X), max_size, X[0].size(dim = 1))
  for i, embed in enumerate(X):
    X_tensor[i] = embed
  return X_tensor

def sort_ds(X, Y):
  """
  Sort inputs by document lengths
  """
  document_lengths = np.array([tens.size(dim = 0) for tens in X])
  indexes = np.argsort(document_lengths)

  X_sorted = X[indexes][::-1]
  Y_sorted = Y[indexes][::-1]
  document_lengths = torch.from_numpy(document_lengths[indexes][::-1].copy())

  return X_sorted, Y_sorted, document_lengths



def collate(batch):
  X, Y = list(zip(*batch))
  Y = np.array(list(Y))
  X = np.array(list(X))

  # Sort dataset
  X, Y, document_lengths = sort_ds(X, Y)

  # Get tensor sizes
  max_size = torch.max(document_lengths).item()

  # Pad tensor each element
  X = pad(X, max_size)

  # Transform the batch to a tensor
  X_tensor = batch_to_tensor(X, max_size)
  
  # Return the padded sequence object
  X_final = pack_padded_sequence(X_tensor, document_lengths, batch_first=True)
  return X_final, torch.from_numpy(Y.copy())

def get_data(batch_size: int, collate_fn):
  batch = MovieReviewsCorpus()

  dataset = MovieReviewsDataset(batch.movie_reviews_dataset_raw())

  # Random Split

  train_indexes, test_indexes = train_test_split(list(range(len(dataset.labels))), test_size = 0.2,
                                                 stratify = dataset.labels, random_state = 42)
  
  train_ds = Subset(dataset, train_indexes)
  test_ds = Subset(dataset, test_indexes)

  train_loader = DataLoader(train_ds, batch_size = batch_size, collate_fn = collate_fn, num_workers=2, pin_memory=True)
  test_loader = DataLoader(test_ds, batch_size = batch_size, collate_fn = collate_fn, num_workers=2, pin_memory=True)

  return train_loader, test_loader

In [None]:
train_loader, test_loader = get_data(128, collate)

In [16]:
accuracy = main(train_loader, test_loader, device = "cuda", epochs = 30)

print(f"Overall accuracy: {accuracy}")

epoch 0:




128
128
128
128
128
128
128
128
128
128
128
128
64
Training loss: 0.00561179518699646 
 Training accuracy: 53.6875
epoch 1:




128
128
128
128
128
128
128
128
128
128
128
128
64
Training loss: 0.005560296215116977 
 Training accuracy: 54.37499999999999
epoch 2:




128
128
128
128
128
128
128
128
128
128
128
128
64
Training loss: 0.00520324744284153 
 Training accuracy: 65.0625
epoch 3:




128
128
128
128
128
128
128
128
128
128
128
128
64
Training loss: 0.0048539996519684795 
 Training accuracy: 68.875
epoch 4:




128
128
128
128
128
128
128
128
128
128
128
128
64
Training loss: 0.004494919218122959 
 Training accuracy: 74.0
epoch 5:




128
128
128
128
128
128
128
128
128
128
128
128
64
Training loss: 0.004662803355604411 
 Training accuracy: 73.1875
epoch 6:




128
128
128
128
128
128
128
128
128
128
128
128
64
Training loss: 0.006248410977423191 
 Training accuracy: 55.8125
epoch 7:




128
128
128
128
128
128
128
128
128
128
128
128
64
Training loss: 0.004944670349359513 
 Training accuracy: 65.8125
epoch 8:




128
128
128
128
128
128
128
128
128
128
128
128
64
Training loss: 0.004672420434653759 
 Training accuracy: 74.625
epoch 9:




128
128
128
128
128
128
128
128
128
128
128
128
64
Training loss: 0.004696947634220123 
 Training accuracy: 68.9375
epoch 10:




128
128
128
128
128
128
128
128
128
128
128
128
64
Training loss: 0.0043427800945937635 
 Training accuracy: 76.1875
epoch 11:




128
128
128
128
128
128
128
128
128
128
128
128
64
Training loss: 0.004364939518272877 
 Training accuracy: 76.125
epoch 12:




128
128
128
128
128
128
128
128
128
128
128
128
64
Training loss: 0.004369023591279984 
 Training accuracy: 73.5625
epoch 13:




128
128
128
128
128
128
128
128
128
128
128
128
64
Training loss: 0.004154049325734377 
 Training accuracy: 76.3125
epoch 14:




128
128
128
128
128
128
128
128
128
128
128
128
64
Training loss: 0.0042791181802749635 
 Training accuracy: 74.8125
epoch 15:




128
128
128
128
128
128
128
128
128
128
128
128
64
Training loss: 0.0040869623981416225 
 Training accuracy: 76.8125
epoch 16:




128
128
128
128
128
128
128
128
128
128
128
128
64
Training loss: 0.0041534962877631185 
 Training accuracy: 75.375
epoch 17:




128
128
128
128
128
128
128
128
128
128
128
128
64
Training loss: 0.0041836733557283875 
 Training accuracy: 74.5
epoch 18:




128
128
128
128
128
128
128
128
128
128
128
128
64
Training loss: 0.0039699716866016385 
 Training accuracy: 77.6875
epoch 19:




128
128
128
128
128
128
128
128
128
128
128
128
64
Training loss: 0.0037136100232601165 
 Training accuracy: 78.6875
epoch 20:




128
128
128
128
128
128
128
128
128
128
128
128
64
Training loss: 0.003511216528713703 
 Training accuracy: 80.375
epoch 21:




128
128
128
128
128
128
128
128
128
128
128
128
64
Training loss: 0.0034270294941961764 
 Training accuracy: 81.1875
epoch 22:




128
128
128
128
128
128
128
128
128
128
128
128
64
Training loss: 0.003357218336313963 
 Training accuracy: 81.25
epoch 23:




128
128
128
128
128
128
128
128
128
128
128
128
64
Training loss: 0.0033302810043096542 
 Training accuracy: 81.25
epoch 24:




128
128
128
128
128
128
128
128
128
128
128
128
64
Training loss: 0.003236290737986565 
 Training accuracy: 82.4375
epoch 25:




128
128
128
128
128
128
128
128
128
128
128
128
64
Training loss: 0.0031316147744655607 
 Training accuracy: 83.5
epoch 26:




128
128
128
128
128
128
128
128
128
128
128
128
64
Training loss: 0.0033965276554226874 
 Training accuracy: 81.5
epoch 27:




128
128
128
128
128
128
128
128
128
128
128
128
64
Training loss: 0.00322844572365284 
 Training accuracy: 82.5625
epoch 28:




128
128
128
128
128
128
128
128
128
128
128
128
64
Training loss: 0.0030492713116109372 
 Training accuracy: 84.125
epoch 29:




128
128
128
128
128
128
128
128
128
128
128
128
64
Training loss: 0.0033320794254541397 
 Training accuracy: 81.375




128
128
128
16
Overall accuracy: 76.25


In [None]:
"""
tensor([1315, 1222, 1011, 1010,  936,  862,  814,  807,  807,  764,  718,  515,
         495,  388,  344,  323])
tensor([1617, 1361, 1311, 1178, 1081, 1068,  958,  941,  925,  768,  688,  619,
         604,  573,  484,  405])
"""

# First try to parse phrases documet-wise, then try to parse each phrase of a document separately, and then aggregate the result (if there are more positive phrases then positive, otherwise negative). (Try also to give a weight depending on the number of sentiment lexemes)

### Training procedure

### Main function containing also cross validation

### (Possible improvement, apply UDA to GLOVE)

In [None]:
from nltk.tokenize.stanford import StanfordTokenizer
from torchtext.vocab import GloVe

class VectorizerPipeline():

  def __init__(self, corpus, pipe= {"tokenizer": "stanford", "embedding": "glove"}, embedding_size: int = 300):
    self.corpus = corpus
    if embedding_size:
      self.embedding_size = embedding_size
    else:
      self.embedding_size = 300
    
    self._allowed = {
        "tokenizer": ["stanford"],
        "embedding": ["glove"],
        "lemmatizer": [],
        "stop-word-removal": [],
    }
    self.pipe = {
        "tokenizer": None,
        "embedding": None,
        "lemmatizer": None,
        "stop-word-removal": None,
    }
    if pipe:
      for key, value in pipe.items():
        try:
          if pipe[key] in self._allowed[key]:
            self.pipe[key] = value
          else:
            raise ValueError(f"Invalid type of {key}. \n Valid {key}s are {self._allowed[key]}")
        except KeyError:
          raise KeyError(f"Invalid step in the pipeline: {key}. \n valid steps are {list(self._allowed.keys())}")


  def tokenization(self, batch):
    tok = StanfordTokenizer()
    X = [tok(x) for x in batch]
    return X
  
  def embedding(self, batch):
    max_length = max(batch, key=len)

  def vectorize(self, batch):
    Y, X = list(zip(*batch))
    pass








ModuleNotFoundError: ignored

In [None]:
from torch.utils.data import DataLoader

corpus = MovieReviewsCorpus()

dataset = MovieReviewsDataset(corpus.movie_reviews_dataset_raw())

pipeline = VectorizerPipeline(corpus)

dataloader = DataLoader(dataset, batch_size = 64, collate_fn = pipeline.vectorize())

KeyboardInterrupt: ignored