<a href="https://colab.research.google.com/github/Zinni98/Sentiment-analysis-project/blob/main/project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Polarity Classification

### Get the data

In [1]:
import nltk
import torch
# Fetch the NLTK packages used by the notebook: "punkt", "movie_reviews" and "subjectivity".
nltk.download("punkt")
nltk.download("movie_reviews")
nltk.download("subjectivity")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package subjectivity to /root/nltk_data...
[nltk_data]   Package subjectivity is already up-to-date!


True

## TODO: Use spacy negation marking

In [2]:
from nltk.corpus import movie_reviews
from nltk.sentiment.util import mark_negation
import numpy as np


class MovieReviewsCorpus():
  """
  In-memory wrapper around the NLTK `movie_reviews` corpus.

  Attributes
  ----------
  mr
    The NLTK corpus reader the documents are read from.
  corpus: list[list[str]]
    One token list per document, positive documents first.
  labels: np.array
    Label of each document in `corpus` (0 for "pos", 1 for "neg").
  unprocessed_corpus
    Documents as returned by `mr.paras`, negative documents first.
  corpus_words: list[str]
    All the tokens of `corpus`, concatenated.
  vocab: dict
    Maps each token to its number of occurrences in `corpus_words`.
  """
  def __init__(self):
    self.mr = movie_reviews
    self.corpus, self.labels = self._flatten()
    self.unprocessed_corpus = self._get_corpus()
    self.corpus_words = self.get_corpus_words()
    self.vocab = self._create_vocab()


  def _list_to_str(self, doc) -> str:
    """
    Put all the words of a document (a list of sentences, each a list of words)
    into a single string, separating each word with a space.
    """
    return " ".join([w for sent in doc for w in sent])


  def _flatten(self):
    """
    Flattens every document into a single list of tokens.

    Returns
    -------
    tuple(list[list[str]], np.array)
      The documents (positive first, then negative), each one as a list of tokens,
      and the label of each document (0 = positive, 1 = negative).
    """

    # 3 nested lists: each list contains a document, each inner list contains a phrase
    # (until fullstop), each phrase contains words.
    neg = self.mr.paras(categories = "neg")
    pos = self.mr.paras(categories = "pos")

    def tokens(doc):
      return self._list_to_str(doc).split(" ")

    corpus = [tokens(d) for d in pos] + [tokens(d) for d in neg]
    labels = np.array([0] * len(pos) + [1] * len(neg))
    return corpus, labels

  def _get_corpus(self):
    """
    Returns the documents exactly as given by the corpus reader.

    NOTE(review): the order is negative-then-positive, i.e. the opposite of
    `self.corpus` / `self.labels`; confirm before pairing the result with `self.labels`.
    """
    neg = self.mr.paras(categories = "neg")
    pos = self.mr.paras(categories = "pos")

    return neg + pos

  def movie_reviews_dataset_raw(self):
    """
    Returns the dataset containing:

    - A list of all the documents
    - The corresponding label for each document

    Returns
    -------
    tuple(list, np.array)
      The dataset: first element is the list of the documents, the second element of the
      tuple is the associated label (0 = positive, 1 = negative) for each document
    """

    return self.corpus, self.labels

  def get_sentence_ds(self):
    """
    Returns the dataset at sentence level.

    Returns
    -------
    tuple(list, np.array)
      The sentences of all the documents (positive first, then negative) and the label
      of each sentence (0 = positive, 1 = negative), inherited from its document.
    """
    neg = self.mr.paras(categories = "neg")
    pos = self.mr.paras(categories = "pos")

    pos = [phrase for doc in pos for phrase in doc]
    neg = [phrase for doc in neg for phrase in doc]

    # The sentence order must match the label order (positive block first); the labels
    # follow the same convention as _flatten.
    labels = np.array([0] * len(pos) + [1] * len(neg))
    corpus = pos + neg
    return corpus, labels


  def get_corpus_words(self) -> list:
    """
    Returns all the tokens of `self.corpus` in a single flat list.
    """
    return [w for doc in self.corpus for w in doc]

  def _create_vocab(self):
    """
    Counts the occurrences of each token of `self.corpus_words`.

    Returns
    -------
    dict
      token -> number of occurrences
    """
    vocab = dict()
    for word in self.corpus_words:
      vocab[word] = vocab.get(word, 0) + 1
    return vocab

  def __len__(self):
    """
    Number of documents in the corpus.
    """
    return len(self.corpus)


# Disabled experiment: the subclass below is wrapped in a string literal, so it is not
# defined; being the last expression of the cell, the string is echoed as the cell output.
"""class MovieReviewsCorpusNegMarked(MovieReviewsCorpus):
  def __init__(self):
    super().__init__()
    self._neg_marking()
    self.corpus_words = self.get_corpus_words()
    self.vocab = self._create_vocab()
  
  def _neg_marking(self):
    negated_corpus = [self._mark(doc) for doc in self.corpus]
    self.corpus = negated_corpus
  
  def _mark(self, doc):
    # negates the whole document
    negated_doc = mark_negation(doc, double_neg_flip=True)
    return negated_doc"""

'class MovieReviewsCorpusNegMarked(MovieReviewsCorpus):\n  def __init__(self):\n    super().__init__()\n    self._neg_marking()\n    self.corpus_words = self.get_corpus_words()\n    self.vocab = self._create_vocab()\n  \n  def _neg_marking(self):\n    negated_corpus = [self._mark(doc) for doc in self.corpus]\n    self.corpus = negated_corpus\n  \n  def _mark(self, doc):\n    # negates the whole document\n    negated_doc = mark_negation(doc, double_neg_flip=True)\n    return negated_doc'

In [3]:
from torch.utils.data import Dataset
from torchtext.vocab import GloVe

class MovieReviewsDataset(Dataset):
  """
  Torch dataset whose items are (embedded document, label) pairs.

  Parameters
  ----------
  raw_dataset: tuple(list[list[str]], array-like)
    The documents (each one a list of tokens) and the label of each document.
  vectors: optional
    Object exposing `get_vecs_by_tokens` (e.g. an already loaded torchtext `GloVe`).
    When omitted `GloVe(name='840B', dim=300)` is loaded; pass a shared instance to avoid
    loading the vectors once per dataset.
  """
  def __init__(self, raw_dataset, vectors = None):
    documents = raw_dataset[0]
    # Explicit object array: np.array(documents) would build a 2-D array of strings when
    # all the documents have the same length, and ragged input is rejected by recent NumPy.
    self.corpus = np.empty(len(documents), dtype=object)
    for idx, doc in enumerate(documents):
      self.corpus[idx] = doc
    self.elements_to_tensor(vectors)
    self.labels = np.array(raw_dataset[1])

  def __len__(self):
    """
    Number of documents in the dataset.
    """
    return len(self.corpus)

  def elements_to_tensor(self, vectors = None):
    """
    Replaces, in place, every document of `self.corpus` with a tensor holding one
    embedding row per token.

    Parameters
    ----------
    vectors: optional
      Embedding lookup exposing `get_vecs_by_tokens`; defaults to GloVe 840B, 300 dimensions.
    """
    if vectors is None:
      vectors = GloVe(name='840B', dim=300)
    for idx, item in enumerate(self.corpus):
      tokens = list(item)
      if tokens:
        # A list argument makes get_vecs_by_tokens return one row per token, which
        # replaces the former token-by-token lookup.
        self.corpus[idx] = vectors.get_vecs_by_tokens(tokens)
      else:
        # Stacking zero vectors is not possible: keep an empty (0, dim) tensor instead.
        self.corpus[idx] = torch.empty(0, getattr(vectors, "dim", 300))

  def __getitem__(self, index):
    """
    Returns the (embedded document, label) pair at position `index`.
    """
    item = self.corpus[index]
    label = self.labels[index]
    return (item, label)

### Create the model class
Let's first try with a simple BiLSTM

In [4]:
from unicodedata import bidirectional
import torch.nn as nn
from torch.autograd import Variable
from torch.nn.utils.rnn import pad_packed_sequence

class BiLSTM(nn.Module):
  """
  Two-layer bidirectional LSTM feature extractor followed by two parallel
  classification heads (`source_classifier` and `target_classifier`).

  Parameters
  ----------
  device: str
    Stored in `self.device`; moving the module is left to the caller (`.to(device)`).
  input_size: int
    Size of each input embedding.
  hidden_size: int
    Hidden size of the LSTM, per direction.
  output_size: int
    Number of logits produced by each classification head.
  """
  def __init__(self, device = "cuda", input_size = 300, hidden_size = 128, output_size = 2):
    super(BiLSTM, self).__init__()
    self.hidden_size = hidden_size
    self.device = device
    self.num_layers = 2
    self.lstm = nn.LSTM(input_size, hidden_size, batch_first = True, bidirectional=True,
                        num_layers = self.num_layers)
    self.source_classifier = self._make_classifier(hidden_size, output_size)
    self.target_classifier = self._make_classifier(hidden_size, output_size)

  @staticmethod
  def _make_classifier(hidden_size, output_size):
    """
    Builds one classification head; its input is the concatenation of both LSTM directions.
    """
    return nn.Sequential(nn.ReLU(),
                         nn.BatchNorm1d(hidden_size*2, eps = 1e-08),
                         nn.Dropout(0.3),
                         nn.Linear(hidden_size*2, output_size)
                         )

  def init_hidden(self, batch_size):
    """
    Returns a zero (h_0, c_0) pair for a batch of `batch_size` sequences, allocated on the
    device the parameters currently live on.
    """
    device = next(self.parameters()).device
    # first dimension is num_layers * num_directions
    shape = (self.num_layers * 2, batch_size, self.hidden_size)
    return (torch.zeros(shape, device=device),
            torch.zeros(shape, device=device))

  def forward(self, x):
    """
    Parameters
    ----------
    x: torch.nn.utils.rnn.PackedSequence
      Packed batch of embedded documents.

    Returns
    -------
    tuple(torch.Tensor, torch.Tensor, torch.Tensor)
      Logits of the source classifier, logits of the target classifier and their
      concatenation along dimension 1.
    """
    batch_size = x.batch_sizes[0].item()

    # No explicit initial state is passed: nn.LSTM then starts from zeros, so allocating
    # one here (as was done before, on self.device) was unused work that also failed
    # when the module lived on another device.
    features, _ = self.lstm(x)
    features, input_sizes = pad_packed_sequence(features, batch_first=True)

    # Select, for every sequence, the output at its last valid time step.
    # NOTE(review): for the backward direction this is the output after reading a single
    # token; consider using the final hidden states instead.
    rows = torch.arange(batch_size, device=features.device)
    last_steps = input_sizes.to(features.device) - 1
    features = features[rows, last_steps, :]

    source_output = self.source_classifier(features)
    target_output = self.target_classifier(features)

    source_target_output = torch.cat((source_output, target_output), dim=1)

    return source_output , target_output, source_target_output

  def classifier_parameters(self):
    """
    Parameters of the classification layers

    Yields
    ------
    torch.Tensor
      The parameters of the source classifier followed by those of the target classifier
    """
    yield from self.source_classifier.parameters()
    yield from self.target_classifier.parameters()




In [5]:
from abc import ABC, abstractmethod
import torch.optim as optim

class AnnealingOptimizer(torch.optim.Optimizer, ABC):
  """
  Base class for an optimizer whose learning rate follows an annealing schedule.

  Subclasses must create `self.optimizer`, the wrapped torch optimizer, with named
  parameter groups: the group named "fe" receives the annealed learning rate, every other
  group receives it multiplied by `self._classifier_lr_factor`.

  NOTE: `torch.optim.Optimizer.__init__` is not invoked; parameter groups and state live
  in `self.optimizer`.

  Parameters
  ----------
  model
    Model to optimize (not used by the base class).
  nr_epochs: int
    Total number of training epochs; must be positive.
  lr: float
    Base learning rate.
  epoch: int
    Epoch the schedule starts from.

  Raises
  ------
  ValueError
    If `lr` or `epoch` is negative, or `nr_epochs` is not positive.
  """
  def __init__(self, model, nr_epochs, lr: float = 0.001, epoch: int = 0) -> None:
    if not 0.0 <= lr:
      raise ValueError(f"Invalid learning rate: {lr}")
    if not 0 <= epoch:
      raise ValueError(f"Invalid epoch value: {epoch}")
    # nr_epochs is the denominator of the schedule: fail early instead of with a
    # ZeroDivisionError at the first learning rate computation
    if not nr_epochs > 0:
      raise ValueError(f"Invalid number of epochs: {nr_epochs}")

    self.nr_epochs = nr_epochs
    self.epoch = epoch
    self._alpha = 10
    self._beta = 0.75
    self._base_lr = lr
    self._classifier_lr_factor = 10

  def update_lr(self):
    """
    Updates the learning rate using the annealing strategy.
    In order to let the annealing strategy work correctly, this method should be called at every epoch during the network training

    The learning rate for the classifier is 10 times bigger as proposed in the [Symnet paper](https://arxiv.org/pdf/1904.04663.pdf)
    """
    self.epoch += 1
    new_lr = self._compute_lr()
    for g in self.optimizer.param_groups:
      if g["name"] == "fe":
        g["lr"] = new_lr
      else:
        g["lr"] = new_lr * self._classifier_lr_factor


  def _compute_lr(self):
    """
    Computes the learning rate using the proposed annealing strategy:
    base_lr / (1 + alpha * epoch / nr_epochs) ** beta

    Returns
    -------
    float
      updated learning rate
    """
    etap = 1 / ((1 + self._alpha * self.epoch / self.nr_epochs ) ** self._beta)
    return self._base_lr * etap

  def step(self, closure = None):
    """
    Performs one optimization step of the wrapped optimizer.

    Parameters
    ----------
    closure: callable, optional
      Forwarded to the wrapped optimizer, as in `torch.optim.Optimizer.step`.

    Returns
    -------
    Whatever the wrapped optimizer returns (None when no closure is given).
    """
    if closure is None:
      return self.optimizer.step()
    return self.optimizer.step(closure)

  def zero_grad(self, set_to_none = None):
    """
    Resets the gradients of the wrapped optimizer.

    Parameters
    ----------
    set_to_none: bool, optional
      Forwarded to the wrapped optimizer when given; otherwise its default is used.
    """
    if set_to_none is None:
      self.optimizer.zero_grad()
    else:
      self.optimizer.zero_grad(set_to_none=set_to_none)


class BiLSTMOptimizer(AnnealingOptimizer):
  """
  Annealing SGD optimizer for the BiLSTM model: one parameter group for the LSTM
  (named "fe") and one for the classification heads (named "classifier"), the latter
  starting with a 10 times bigger learning rate.
  """
  def __init__(self, model, nr_epochs, lr: float = 0.001, epoch: int = 0, momentum: float = 0.9) -> None:
    super().__init__(model, nr_epochs, lr, epoch)

    # The group names matter: they are what update_lr reads to update each group differently
    feature_group = {'params': model.lstm.parameters(), "name": "fe"}
    classifier_group = {'params': model.classifier_parameters(),
                        "lr": self._compute_lr()*10,
                        "name": "classifier"}
    self.optimizer = optim.SGD([feature_group, classifier_group], lr=lr, momentum=momentum)
    

In [6]:
def source_loss(output_source, label):
  """
  Cross entropy loss of source classifier C_s for source samples (equation 5 of the paper)

  Parameters
  ----------
  output_source: torch.Tensor
    Batch of logits. In order to let the algorithm work correctly, this should be the
    output of the source classifier

  label: torch.Tensor
    Class index of each sample of the batch

  Returns
  -------
  torch.Tensor
    The cross entropy averaged over the batch
  """
  return nn.functional.cross_entropy(output_source, label)

def target_loss(output_target, label):
  """
  Cross entropy loss of target classifier C_t for source samples (equation 6 of the paper)

  Parameters
  ----------
  output_target: torch.Tensor
    Batch of logits. In order to let the algorithm work correctly, this should be the
    output of the target classifier

  label: torch.Tensor
    Class index of each sample of the batch

  Returns
  -------
  torch.Tensor
    The cross entropy averaged over the batch
  """
  # same criterion as the source classifier loss, applied to the target classifier output
  criterion = nn.CrossEntropyLoss()
  return criterion(output_target, label)

def source_target_loss(output, st = True):
  """
  Two-way cross-entropy loss for the joint classifier C_st (equation 7 of the paper)

  Parameters
  ----------
  output: torch.Tensor
    Batch of logits. In order to let the algorithm work correctly, this should be the
    output of the combined source-target classifier: the first half of the columns
    belongs to the source classifier, the second half to the target classifier
  st: bool
    True if train batch belongs to source, False if belongs to target

  Returns
  -------
  torch.Tensor
    The result of the computed loss for the entire batch

  """
  n_classes = output.size(1) // 2
  domain_logits = output[:, :n_classes] if st else output[:, n_classes:]

  # log(sum of the softmax probabilities of one half) computed directly in log space:
  # taking softmax -> sum -> log underflows to log(0) = -inf when the logits of the two
  # halves are far apart.
  log_domain_prob = torch.logsumexp(domain_logits, dim=1) - torch.logsumexp(output, dim=1)
  return -(log_domain_prob.mean())

def feature_category_loss(output_st, label):
  """
  Category level confusion loss (equation 8 of the Symnet paper)

  Parameters
  ----------
  output_st: torch.Tensor
    Batch of logits. In order to let the algorithm work correctly, this should be the
    output of the combined source-target classifier

  label: torch.Tensor
    Class index of each sample of the batch

  Returns
  -------
  torch.Tensor
    Half the cross entropy of the first half of the columns plus half the cross entropy
    of the second half, both against `label`

  """
  n_classes = int(output_st.size(1)/2)
  source_half = output_st[:, :n_classes]
  target_half = output_st[:, n_classes:]

  cross_entropy = nn.functional.cross_entropy
  return cross_entropy(source_half, label)/2 + cross_entropy(target_half, label)/2

def feature_domain_loss(output_st):
  """
  Domain level confusion loss (equation 9 of the Symnet paper)

  Parameters
  ----------
  output_st: torch.Tensor
    Batch of logits. In order to let the algorithm work correctly, this should be the
    output of the combined source-target classifier

  Returns
  -------
  torch.Tensor
    The result of the computed loss for the entire batch

  """
  n_classes = output_st.size(1) // 2

  # Log of the total softmax probability of each half, computed in log space: the
  # softmax -> sum -> log chain returns -inf as soon as one half underflows to zero.
  log_norm = torch.logsumexp(output_st, dim=1)
  log_p_source = torch.logsumexp(output_st[:, :n_classes], dim=1) - log_norm
  log_p_target = torch.logsumexp(output_st[:, n_classes:], dim=1) - log_norm

  loss_1 = -log_p_source.mean()/2
  loss_2 = -log_p_target.mean()/2

  return loss_1 + loss_2



def entropyMinimizationPrinciple(output_st):
    """
    Entropy minimization principle (equation 10 of the Symnet paper)

    Parameters
    ----------
    output_st: torch.Tensor
      Batch of logits. In order to let the algorithm work correctly, this should be the
      output of the combined source-target classifier

    Returns
    -------
    torch.Tensor
      The corresponding entropy minimization loss for the entire batch
    """
    nr_classes = output_st.size(1) // 2
    log_prob = nn.functional.log_softmax(output_st, dim=1)

    # log(q) with q = p_source + p_target, per class. Staying in log space keeps log(q)
    # finite, so a class whose probability underflows contributes 0 instead of
    # 0 * log(0) = NaN.
    log_qst = torch.logaddexp(log_prob[:, :nr_classes], log_prob[:, nr_classes:])
    qst = log_qst.exp()

    emp = -(qst * log_qst).sum(1).mean()

    return emp

In [7]:
def argsort(X, Y, document_lengths):
  """
  Reorders the inputs and the labels by decreasing document length.

  Returns the reordered X, the reordered Y and the sorted lengths as a torch tensor.
  """
  ascending = np.argsort(document_lengths)

  # take the elements in ascending order, then reverse them along the first axis
  X_desc = np.flip(X[ascending], axis=0)
  Y_desc = np.flip(Y[ascending], axis=0)
  lengths_desc = np.flip(document_lengths[ascending], axis=0)

  # from_numpy needs a copy because the reversed array is a view with a negative stride
  return X_desc, Y_desc, torch.from_numpy(lengths_desc.copy())

def collate_lengths(batch, lengths):
  """
  Collates (embedded document, label) pairs whose lengths are already known.

  Parameters
  ----------
  batch: list[tuple(torch.Tensor, int)]
    The samples to collate
  lengths: np.array
    Length of each sample of `batch`, in the same order

  Returns
  -------
  tuple(torch.Tensor, torch.Tensor, torch.Tensor)
    The padded batch, the labels and the lengths, all sorted by decreasing length
  """
  X, Y = list(zip(*batch))
  Y = np.array(list(Y))

  # Store the tensors in an explicit object array: np.array(list_of_tensors) converts them
  # into one numeric array whenever they all share the same shape, and the padding below
  # needs the individual tensors.
  tensors = X
  X = np.empty(len(tensors), dtype=object)
  for i, tensor in enumerate(tensors):
    X[i] = tensor

  # Sort dataset
  X, Y, document_lengths = argsort(X, Y, lengths)

  # Get tensor sizes
  max_size = torch.max(document_lengths).item()

  # Pad each element to the longest one
  X = pad(X, max_size)

  # Transform the batch to a tensor
  X_tensor = batch_to_tensor(X, max_size)

  # Y is a reversed view, hence the copy before handing it to torch
  return X_tensor, torch.from_numpy(Y.copy()), document_lengths

def training_step_uda(net, src_data_loader, target_data_loader, optimizer, lam, e, device = 'cuda'):
  """
  Runs one training epoch of the two-step update: the classifier loss only updates the
  two classifiers, the feature loss only updates the LSTM.

  Parameters
  ----------
  net
    Model returning (source logits, target logits, concatenated logits)
  src_data_loader
    Loader of the labelled source batches (inputs, labels, document lengths); it drives
    the epoch
  target_data_loader
    Loader of the target batches; its labels are ignored and it is restarted whenever it
    is exhausted
  optimizer
    Optimizer exposing `step` and `zero_grad`
  lam: float
    Weight of the domain confusion and entropy terms of the feature loss
  e: int
    Current epoch (not used)
  device: str
    Device the batches are moved to

  Returns
  -------
  tuple(float, float, float)
    Accumulated classifier loss divided by the number of source samples, accumulated
    feature loss divided by the number of target samples, accuracy (in percent) of the
    source classifier on the source samples.
    NOTE(review): the accumulated losses are per-batch values, so the two ratios look
    scaled down by the batch size; confirm whether that is intended.
  """
  n_source_samples = 0.
  n_target_samples = 0.
  cumulative_classifier_loss = 0.
  cumulative_feature_loss = 0.
  cumulative_accuracy = 0.

  target_iter = iter(target_data_loader)

  net.train()

  # iterate over the training set
  for inputs_source, labels, source_document_lengths in src_data_loader:
    try:
      inputs_target, _ , target_document_lengths = next(target_iter)
    except StopIteration:
      # the target loader ran out before the source loader: start it again
      target_iter = iter(target_data_loader)
      inputs_target, _, target_document_lengths = next(target_iter)


    # Cannot simply concatenate because they have different lengths in dimensions that are not batch dim.
    source_inputs_to_list = [(x, labels[i].item()) for i, x in enumerate(inputs_source)]
    # Target samples get the fake label -1, used below to tell the two domains apart
    target_inputs_to_list = [(x, -1) for x in inputs_target]

    source_document_lengths = source_document_lengths.numpy()
    target_document_lengths = target_document_lengths.numpy()
    document_lengths = np.concatenate([source_document_lengths, target_document_lengths])
    # Since inputs from source and target have different lengths I have to collate again
    inputs, labels, document_lengths = collate_lengths(source_inputs_to_list + target_inputs_to_list,
                                      document_lengths)

    inputs = pack_padded_sequence(inputs, document_lengths, batch_first=True)

    # positions of the source / target samples inside the merged (re-sorted) batch
    source_samples = [i for i in range(len(labels)) if labels[i]!=-1]
    target_samples = [i for i in range(len(labels)) if labels[i]==-1]
    labels = torch.tensor([labels[i].item() for i in range(len(labels)) if labels[i]!=-1])

    inputs = inputs.to(device)
    labels = labels.to(device)


    # forward pass
    c_s, c_t, c_st = net(inputs)

    c_s_source = c_s[source_samples,:]
    c_s_target = c_s[target_samples,:]

    c_t_source = c_t[source_samples,:]
    c_t_target = c_t[target_samples,:]

    c_st_source = c_st[source_samples,:]
    c_st_target = c_st[target_samples,:]


    # Equation 5 of the paper
    error_source_task = source_loss(c_s_source, labels)

    # Equation 6 of the paper
    error_target_task = target_loss(c_t_source, labels)

    # Equation 7 of the paper
    domain_loss_source = source_target_loss(c_st_source)
    domain_loss_target = source_target_loss(c_st_target, st = False)
    error_domain = domain_loss_source + domain_loss_target

    classifier_total_loss = error_source_task + error_target_task + error_domain

    # Retain graph needed because otherwise the parts of the computation graph
    # needed to compute classifier_total_loss will be freed up, but we
    # need those parts in order to compute the next loss
    classifier_total_loss.backward(retain_graph = True)

    # The classifier loss must not update the feature extractor: drop its LSTM gradients
    for param in net.lstm.parameters():
      param.grad.data.zero_()

    # Save the classifier gradients, then clear them so that the second backward pass
    # starts from zero
    class_params = []
    for param in net.source_classifier.parameters():
      class_params.append(param.grad.data.clone())
      param.grad.data.zero_()
    for param in net.target_classifier.parameters():
      class_params.append(param.grad.data.clone())
      param.grad.data.zero_()

    # Equation 8 of the paper
    error_feature_category = feature_category_loss(c_st_source, labels)

    # Equation 9 of the paper
    error_feature_domain = feature_domain_loss(c_st_target)

    min_entropy = entropyMinimizationPrinciple(c_st_target)

    # Equations 11 of the paper
    feature_total_loss = error_feature_category + lam * (error_feature_domain + min_entropy)

    feature_total_loss.backward()

    # The feature loss must not update the classifiers: put back the gradients saved
    # after the first backward pass
    idx = 0
    for param in net.source_classifier.parameters():
      param.grad.data = class_params[idx]
      idx += 1
    for param in net.target_classifier.parameters():
      param.grad.data = class_params[idx]
      idx += 1


    optimizer.step()
    optimizer.zero_grad()



    # statistics
    n_source_samples+=inputs_source.shape[0]
    n_target_samples+=inputs_target.shape[0]

    cumulative_classifier_loss += classifier_total_loss.item()
    cumulative_feature_loss += feature_total_loss.item()
    _, predicted = c_s_source.max(dim = 1) ## index of the highest logit
    cumulative_accuracy += predicted.eq(labels).sum().item()

  return cumulative_classifier_loss/n_source_samples, cumulative_feature_loss/n_target_samples, cumulative_accuracy/n_source_samples*100


In [15]:
def test_step_uda(net, data_target_test_loader, device='cuda:0'):

    '''
    Evaluates the target classifier of the model on a data loader.

    Params
    ------

    net : model returning (source logits, target logits, concatenated logits)
    data_target_test_loader : DataLoader obj of the domain to test on, yielding
                              (padded inputs, labels, document lengths)
    device : GPU or CPU device

    Returns
    -------

    tuple(float, float) : mean loss per sample and accuracy (in percent) of the target
                          classifier

    '''

    samples = 0.
    cumulative_loss = 0.
    cumulative_accuracy = 0.

    net.eval()

    with torch.no_grad():

        for inputs, labels, document_lengths in data_target_test_loader:

            inputs = pack_padded_sequence(inputs, document_lengths, batch_first=True)
            # load data into GPU
            inputs = inputs.to(device)
            targets = labels.to(device)

            # forward pass
            _, c_t, _ = net(inputs)

            # apply the loss
            loss = target_loss(c_t, targets)

            batch_size = c_t.shape[0]
            samples += batch_size
            # The loss is already averaged over the batch: weight it by the batch size so
            # that dividing by the number of samples gives the mean loss per sample
            # (it was previously divided by the batch size twice).
            cumulative_loss += loss.item() * batch_size
            _, predicted = c_t.max(1)
            cumulative_accuracy += predicted.eq(targets).sum().item()

    return cumulative_loss/samples, cumulative_accuracy/samples*100

In [17]:
from torch.utils.tensorboard import SummaryWriter
import math

def main_uda(source_train_loader,
             target_train_loader,
             target_test_loader,
             device="cuda",
             epochs=15,
             nr_classes = 2,
            ):
  """
  Trains a BiLSTM with the domain adaptation procedure and evaluates it on the target
  test set after every epoch.

  Parameters
  ----------
  source_train_loader
    Loader of the labelled source training batches
  target_train_loader
    Loader of the target training batches (labels are not used for training)
  target_test_loader
    Loader of the target batches used for evaluation
  device: str
    Device the model and the batches are moved to
  epochs: int
    Number of training epochs
  nr_classes: int
    Number of classes predicted by each classifier

  Returns
  -------
  float
    Accuracy (in percent) on the target test set after the last epoch
  """

  # writer = SummaryWriter(log_dir="gdrive/My Drive/Colab Notebooks/runs/exp2")
  ## DataLoader split the size of the given dataset into #of elements in the dataset/batch size

  print('DataLoaders Done')
  # The model receives the requested device and number of classes instead of silently
  # falling back to its own defaults ("cuda", 2 classes)
  net = BiLSTM(device=device, output_size=nr_classes).to(device)
  print('Network Init Done')
  optimizer = BiLSTMOptimizer(model = net, nr_epochs = epochs)
  print('Got optimizers')

  for e in range(epochs):
    # weight of the confusion terms: grows from 0 towards 1 over the epochs
    lam = 2 / (1 + math.exp(-1 * 10 * e / epochs)) - 1

    train_ce_loss, train_en_loss, train_accuracy = training_step_uda(net=net, src_data_loader=source_train_loader,
                                                        target_data_loader=target_train_loader,
                                                        optimizer=optimizer, lam=lam, e=e, device=device)
    torch.cuda.empty_cache()

    print(f'Epoch: {e+1:d}')
    print(f'\t Train: CE loss {train_ce_loss:.5f}, Entropy loss {train_en_loss:.5f}, Accuracy {train_accuracy:.2f}')

    test_loss, test_accuracy = test_step_uda(net, target_test_loader, device)
    print(f'\t Test Accuracy {test_accuracy:.2f}')

    optimizer.update_lr()

  test_loss, test_accuracy = test_step_uda(net, target_test_loader, device)
  return test_accuracy

In [10]:
from typing import List
from torch.nn.utils.rnn import pack_padded_sequence
from torch.utils.data import Subset
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split

def pad(batch, max_size):
  """
  Pads, in place, every (length, width) tensor of `batch` with rows of zeros until it has
  `max_size` rows, and returns `batch`.
  """
  width = batch[0].size(dim=1)
  for idx in range(len(batch)):
    item = batch[idx]
    missing_rows = max_size - item.size(dim = 0)
    filler = torch.zeros(missing_rows, width)
    batch[idx] = torch.cat((item, filler), dim = 0)
  return batch

def batch_to_tensor(X: List[torch.tensor], max_size):
  """
  Copies the already padded tensors of `X` into a single
  (len(X), max_size, width) tensor of zeros and returns it.
  """
  stacked = torch.zeros(len(X), max_size, X[0].size(dim = 1))
  for row, embed in zip(stacked, X):
    # `row` is a view of `stacked`, so the copy fills the output in place
    row.copy_(embed)
  return stacked

def sort_ds(X, Y):
  """
  Reorders the documents and the labels by decreasing document length, the length being
  the first dimension of each tensor of X.

  Returns the reordered X, the reordered Y and the sorted lengths as a torch tensor.
  """
  lengths = np.array([doc.size(dim = 0) for doc in X])
  ascending = np.argsort(lengths)

  # take the elements in ascending order, then reverse them along the first axis
  X_desc = np.flip(X[ascending], axis=0)
  Y_desc = np.flip(Y[ascending], axis=0)
  lengths_desc = np.flip(lengths[ascending], axis=0)

  # from_numpy needs a copy because the reversed array is a view with a negative stride
  return X_desc, Y_desc, torch.from_numpy(lengths_desc.copy())


def collate(batch):
  """
  Collate function for the data loaders: sorts the samples by decreasing document length
  and pads the documents to the longest one.

  Parameters
  ----------
  batch: list[tuple(torch.Tensor, int)]
    (embedded document, label) pairs

  Returns
  -------
  tuple(torch.Tensor, torch.Tensor, torch.Tensor)
    The padded batch, the labels and the document lengths, all sorted by decreasing length
  """
  X, Y = list(zip(*batch))
  Y = np.array(list(Y))

  # Store the tensors in an explicit object array: np.array(list_of_tensors) converts them
  # into one numeric array whenever they all share the same shape (e.g. a batch of one
  # document), and the sorting and padding below need the individual tensors.
  tensors = X
  X = np.empty(len(tensors), dtype=object)
  for i, tensor in enumerate(tensors):
    X[i] = tensor

  # Sort dataset
  X, Y, document_lengths = sort_ds(X, Y)

  # Get tensor sizes
  max_size = torch.max(document_lengths).item()

  # Pad each element to the longest one
  X = pad(X, max_size)

  # Transform the batch to a tensor
  X_tensor = batch_to_tensor(X, max_size)

  # Y is a reversed view, hence the copy before handing it to torch
  return X_tensor, torch.from_numpy(Y.copy()), document_lengths

def get_data(batch_size: int, collate_fn):
  """
  Builds the data loaders of the experiment from the movie reviews corpus.

  Parameters
  ----------
  batch_size: int
    Number of documents per batch
  collate_fn
    Function used by the loaders to turn a list of samples into a batch

  Returns
  -------
  tuple(DataLoader, DataLoader, DataLoader)
    Train loader (80% of the documents, stratified split), test loader (remaining 20%)
    and loader of the unlabelled data, which iterates over the same subset as the test
    loader
  """
  corpus = MovieReviewsCorpus()

  # A single dataset is enough: a second, never used copy was built here before, loading
  # the embeddings and embedding the whole corpus twice.
  dataset = MovieReviewsDataset(corpus.movie_reviews_dataset_raw())

  # Random Split

  train_indexes, test_indexes = train_test_split(list(range(len(dataset.labels))), test_size = 0.2,
                                                 stratify = dataset.labels, random_state = 42)

  train_ds = Subset(dataset, train_indexes)
  test_ds = Subset(dataset, test_indexes)

  train_loader = DataLoader(train_ds, batch_size = batch_size, collate_fn = collate_fn, pin_memory=True)
  test_loader = DataLoader(test_ds, batch_size = batch_size, collate_fn = collate_fn, pin_memory=True)
  unsup_loader = DataLoader(test_ds, batch_size = batch_size, collate_fn = collate_fn, pin_memory=True)

  return train_loader, test_loader, unsup_loader

In [11]:
# Build the three loaders with batches of 128 documents, collated by `collate`.
train_loader, test_loader, unsup_loader = get_data(128, collate)

  


In [18]:
# Train for 25 epochs: `train_loader` is passed as the source training loader,
# `unsup_loader` as the target training loader and `test_loader` as the target test loader.
accuracy = main_uda(train_loader, unsup_loader, test_loader, device = "cuda", epochs = 25)

print(f"Overall accuracy: {accuracy}")

DataLoaders Done
Network Init Done
Got optimizers


  app.launch_new_instance()
  app.launch_new_instance()


Epoch: 1
	 Train: CE loss 0.02363, Entropy loss 0.00727, Accuracy 52.56
	 Test Accuracy 57.50
Epoch: 2
	 Train: CE loss 0.02188, Entropy loss 0.00929, Accuracy 59.19
	 Test Accuracy 58.25
Epoch: 3
	 Train: CE loss 0.02113, Entropy loss 0.01152, Accuracy 61.44
	 Test Accuracy 62.25
Epoch: 4
	 Train: CE loss 0.02113, Entropy loss 0.01365, Accuracy 62.19
	 Test Accuracy 62.75
Epoch: 5
	 Train: CE loss 0.02067, Entropy loss 0.01524, Accuracy 63.38
	 Test Accuracy 59.00
Epoch: 6
	 Train: CE loss 0.02081, Entropy loss 0.01651, Accuracy 65.25
	 Test Accuracy 62.25
Epoch: 7
	 Train: CE loss 0.02058, Entropy loss 0.01740, Accuracy 63.56
	 Test Accuracy 60.00
Epoch: 8
	 Train: CE loss 0.02048, Entropy loss 0.01791, Accuracy 64.12
	 Test Accuracy 61.00
Epoch: 9
	 Train: CE loss 0.02043, Entropy loss 0.01833, Accuracy 66.25
	 Test Accuracy 58.75
Epoch: 10
	 Train: CE loss 0.02045, Entropy loss 0.01865, Accuracy 65.12
	 Test Accuracy 58.75
Epoch: 11
	 Train: CE loss 0.02038, Entropy loss 0.01879, A

KeyboardInterrupt: ignored

In [None]:
 """
tensor([1315, 1222, 1011, 1010,  936,  862,  814,  807,  807,  764,  718,  515,
         495,  388,  344,  323])
tensor([1617, 1361, 1311, 1178, 1081, 1068,  958,  941,  925,  768,  688,  619,
         604,  573,  484,  405])
"""

# First try to parse phrases document-wise, then try to parse each phrase of a document separately and aggregate the results (if there are more positive phrases than negative ones the document is positive, otherwise negative). Try also to give each phrase a weight depending on its number of sentiment lexemes.

### Training procedure

### Main function containing also cross validation

### (Possible improvement, apply UDA to GLOVE)

In [None]:
from nltk.tokenize.stanford import StanfordTokenizer
from torchtext.vocab import GloVe

class VectorizerPipeline():
  """
  Configurable text vectorization pipeline (work in progress: only the configuration
  check and the tokenization step are implemented).

  Parameters
  ----------
  corpus
    The corpus the pipeline is applied to
  pipe: dict
    Maps each step of the pipeline ("tokenizer", "embedding", "lemmatizer",
    "stop-word-removal") to the implementation to use. The default dictionary is only
    read, never modified.
  embedding_size: int
    Size of the embeddings; a falsy value falls back to 300

  Raises
  ------
  KeyError
    If `pipe` contains a step that does not exist
  ValueError
    If `pipe` asks for an implementation that is not allowed for that step
  """

  def __init__(self, corpus, pipe= {"tokenizer": "stanford", "embedding": "glove"}, embedding_size: int = 300):
    self.corpus = corpus
    if embedding_size:
      self.embedding_size = embedding_size
    else:
      self.embedding_size = 300

    # implementations accepted for each step
    self._allowed = {
        "tokenizer": ["stanford"],
        "embedding": ["glove"],
        "lemmatizer": [],
        "stop-word-removal": [],
    }
    # implementation selected for each step (None = step disabled)
    self.pipe = {
        "tokenizer": None,
        "embedding": None,
        "lemmatizer": None,
        "stop-word-removal": None,
    }
    if pipe:
      for key, value in pipe.items():
        try:
          if pipe[key] in self._allowed[key]:
            self.pipe[key] = value
          else:
            raise ValueError(f"Invalid type of {key}. \n Valid {key}s are {self._allowed[key]}")
        except KeyError:
          raise KeyError(f"Invalid step in the pipeline: {key}. \n valid steps are {list(self._allowed.keys())}")


  def tokenization(self, batch):
    """
    Tokenizes every element of `batch` with the Stanford tokenizer.

    Returns
    -------
    list
      One tokenizer output per element of `batch`
    """
    tok = StanfordTokenizer()
    # the tokenizer object is not callable: its `tokenize` method has to be used
    X = [tok.tokenize(x) for x in batch]
    return X

  def embedding(self, batch):
    """
    TODO: not implemented yet. Only the length of the longest element of `batch` is
    computed so far, and nothing is returned.
    """
    max_length = max(len(x) for x in batch)

  def vectorize(self, batch):
    """
    TODO: not implemented yet. Splits `batch` into its first and second components and
    returns nothing.
    """
    Y, X = list(zip(*batch))
    pass








ModuleNotFoundError: ignored

In [None]:
from torch.utils.data import DataLoader

corpus = MovieReviewsCorpus()

dataset = MovieReviewsDataset(corpus.movie_reviews_dataset_raw())

pipeline = VectorizerPipeline(corpus)

# collate_fn must be the callable itself (the loader calls it once per batch);
# `pipeline.vectorize()` called it immediately, without its `batch` argument.
dataloader = DataLoader(dataset, batch_size = 64, collate_fn = pipeline.vectorize)

KeyboardInterrupt: ignored

In [None]:
import operator
from tqdm import tqdm
from torchtext.vocab import GloVe
import torch

# Build the corpus (its `vocab` is checked below) and load the GloVe '840B' vectors
# with 300 dimensions.
corpus = MovieReviewsCorpus()

global_vectors = GloVe(name='840B', dim=300)

def check_coverage(vocab,embeddings_index):
    """
    Measures how much of a vocabulary is covered by a set of embeddings and prints the
    coverage, both over the distinct words and over all the occurrences.

    A word counts as out of vocabulary when the vector returned for it is all zeros,
    whatever the size of the embeddings.

    Parameters
    ----------
    vocab: dict
      word -> number of occurrences
    embeddings_index
      Object exposing `get_vecs_by_tokens(word)` and returning a tensor

    Returns
    -------
    list[tuple(str, int)]
      The out-of-vocabulary words with their number of occurrences, most frequent first
    """
    covered = {}
    oov = {}
    covered_occurrences = 0
    oov_occurrences = 0
    for word in tqdm(vocab):
        # single lookup per word; errors raised by the lookup are no longer swallowed
        vector = embeddings_index.get_vecs_by_tokens(word)
        if (vector != 0).any().item():
            covered[word] = vector
            covered_occurrences += vocab[word]
        else:
            oov[word] = vocab[word]
            oov_occurrences += vocab[word]

    # an empty vocabulary is reported as 0% covered instead of dividing by zero
    total_occurrences = covered_occurrences + oov_occurrences
    vocab_coverage = len(covered) / len(vocab) if len(vocab) else 0.0
    text_coverage = covered_occurrences / total_occurrences if total_occurrences else 0.0

    print()
    print('Found embeddings for {:.2%} of vocab'.format(vocab_coverage))
    print('Found embeddings for  {:.2%} of all text'.format(text_coverage))
    sorted_x = sorted(oov.items(), key=operator.itemgetter(1))[::-1]

    return sorted_x


# Words of the corpus vocabulary for which no embedding was found, most frequent first.
oov = check_coverage(corpus.vocab, global_vectors)