# IMDB reviews

## Import libraries

In [1]:
!pip install portalocker



In [2]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from torchtext.datasets import IMDB
import gensim.downloader as api
from gensim.utils import simple_preprocess
import numpy as np

## Load pre-trained word2vec embeddings

In [3]:
# Load the pre-trained embedding model through gensim's downloader API.
# The model name suggests 300-dimensional vectors trained on Google News;
# presumably the file is downloaded on first use and cached afterwards -- verify.
word2vec = api.load("word2vec-google-news-300")

## Load the dataset

In [4]:
# TODO: check why the train split does not load correctly

# Build both IMDB splits, using ./dataset as the torchtext root directory.
train_dataset = IMDB(root='./dataset', split='train')
test_dataset = IMDB(root='./dataset', split='test')

# Sanity check: materialise each split once and print its number of examples,
# one count per line (train first, then test).
print(
    *(len(list(split)) for split in (train_dataset, test_dataset)),
    sep="\n",
)

25000
25000


## Visualize the dataset

In [5]:
# Preview the training split: show the first 10 reviews, then keep scanning
# until the first review labelled 2 is found, show it as well and stop.
for idx, (label, text) in enumerate(train_dataset):
    preview = f"{idx} Label: {label}\tText: {text[:300]}..."
    if idx < 10:
        print(preview)
    if label == 2:
        print(preview)
        break

0 Label: 1	Text: I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really h...
1 Label: 1	Text: "I Am Curious: Yellow" is a risible and pretentious steaming pile. It doesn't matter what one's political views are because this film can hardly be taken seriously on any level. As for the claim that frontal male nudity is an automatic NC-17, that isn't true. I've seen R-rated films with male nudity...
2 Label: 1	Text: If only to avoid making this type of film in the future. This film is interesting as an experiment but tells no cogent story.<br /><br />One might feel virtuous for sitting thru it because it touches on so many IMPORTANT issues but it does so without any discernable motive. The viewer comes away wit...
3 Label: 1	Text: This film was probab

## Define our processed dataset

In [6]:
class IMDBTensorDataset(Dataset):
    """Map-style dataset of reviews encoded as averaged word embeddings.

    Every ``(label, text)`` pair of the source dataset is converted once, at
    construction time, into ``(label - 1, vector)``, where ``vector`` is the
    mean of the embeddings of the review's in-vocabulary tokens.

    Args:
        dataset: Iterable yielding ``(label, text)`` pairs. Labels are shifted
            by -1 (so labels 1/2 become 0/1).
        word2vec: Embedding lookup supporting ``token in word2vec`` and
            ``word2vec[token]``. If it exposes a ``vector_size`` attribute,
            that size is used for the all-zero fallback vector; otherwise
            300 is assumed (the size of "word2vec-google-news-300").
    """

    # Fallback embedding size when the lookup object has no ``vector_size``.
    _DEFAULT_EMBEDDING_DIM = 300

    def __init__(self, dataset, word2vec):
        self.dataset = self._preprocess(dataset, word2vec)

    def _preprocess(self, dataset, word2vec):
        """Return a list of ``(normalized_label, mean_embedding)`` tuples."""
        embedding_dim = getattr(word2vec, "vector_size", self._DEFAULT_EMBEDDING_DIM)
        result = []
        for label, text in dataset:
            text_vectors = [
                word2vec[token]
                for token in simple_preprocess(text)
                if token in word2vec
            ]
            if text_vectors:
                # Stack the token embeddings and average them into one vector.
                text_tensor = torch.tensor(np.array(text_vectors)).mean(0)
            else:
                # Empty review or no known token: use an all-zero vector.
                # (The original called the non-existent ``torch.zero``.)
                text_tensor = torch.zeros(embedding_dim)
            result.append((label - 1, text_tensor))
        return result

    def __len__(self):
        """Number of preprocessed examples."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return the ``(label, vector)`` tuple stored at position ``idx``."""
        return self.dataset[idx]

## Create data loaders

In [7]:
train_tensor_dataset = IMDBTensorDataset(train_dataset, word2vec)
# drop_last=True discards the final training batch when it holds fewer than
# 64 samples, so every optimisation step sees a full batch.
train_loader = DataLoader(train_tensor_dataset, batch_size=64, shuffle=True, drop_last=True)

test_tensor_dataset = IMDBTensorDataset(test_dataset, word2vec)
# Keep the final partial batch for evaluation: with 25000 test reviews and a
# batch size of 64, drop_last=True would silently exclude 25000 % 64 = 40
# reviews from every reported loss and accuracy.
test_loader = DataLoader(test_tensor_dataset, batch_size=64, shuffle=False, drop_last=False)

## Define the model

In [8]:
class MLP(nn.Module):
    """Two-layer perceptron producing a probability for binary classification.

    Args:
        hidden_dim: Width of the hidden layer.
        input_dim: Size of each input vector (300, the embedding size).
        output_dim: Number of outputs; 1 because a single value encodes the
            binary class (0 -> negative | 1 -> positive).
    """

    def __init__(self, hidden_dim, input_dim=300, output_dim=1):
        super().__init__()
        # Module-style activations (nn.ReLU / nn.Sigmoid) are used instead of
        # the functional API so the whole pipeline can live in nn.Sequential
        # and forward() does not have to compose the layers by hand.
        layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_dim, output_dim),
            # Sigmoid keeps the output between 0 and 1 (binary classification).
            nn.Sigmoid(),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the sequential stack and return its output."""
        network = self.model
        return network(x)

## Train the model

In [9]:
from torch.cuda import is_available

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda" if is_available() else "cpu")

# Example values for train_model's arguments:
#   epochs = 10, lr = 0.001, hidden_dim = 128

def train_model(epochs, lr, hidden_dim):
    """Train a fresh MLP and score it by accuracy on ``test_loader``.

    Relies on the module-level ``train_loader``, ``test_loader`` and
    ``device``. After each epoch the average train and test losses are
    printed.

    Args:
        epochs: Number of passes over ``train_loader``.
        lr: Learning rate given to the Adam optimizer.
        hidden_dim: Hidden-layer width of the MLP.

    Returns:
        tuple: ``(score, model)`` where ``score`` is the fraction of correct
        predictions over ``test_loader`` and ``model`` is the trained MLP.
    """
    model = MLP(hidden_dim).to(device)

    # BCE = Binary Cross Entropy Loss
    criterion = nn.BCELoss()
    optimizer = Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        running_loss = 0.0
        running_loss_test = 0.0

        # Train
        model.train()
        for labels, texts in train_loader:
            labels = labels.float().to(device).unsqueeze(1)
            texts = texts.to(device)
            optimizer.zero_grad()
            predictions = model(texts)
            loss = criterion(predictions, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        # Validation.
        # TODO: switch from test_loader to a validation_loader split off the
        # training data; otherwise the hyper-parameters overfit the test set.
        # Gradients are disabled here: this pass only monitors the loss, so
        # building autograd graphs would waste memory and time.
        model.eval()
        with torch.no_grad():
            for t_labels, t_texts in test_loader:
                t_labels = t_labels.float().to(device).unsqueeze(1)
                t_texts = t_texts.to(device)
                t_predictions = model(t_texts)
                loss_t = criterion(t_predictions, t_labels)
                running_loss_test += loss_t.item()

        # Average of the per-batch mean losses for this epoch.
        avg_train_loss = running_loss / len(train_loader)
        avg_test_loss = running_loss_test / len(test_loader)

        print(f"Epoch [{epoch+1}/{epochs}] | Loss Train: {avg_train_loss:.6f}, Loss Test: {avg_test_loss:.6f}")

    # Score: accuracy is used to pick the best model, because selecting on the
    # loss would return the model that overfits the most.
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for labels, texts in test_loader:
            labels = labels.to(device).unsqueeze(1)
            texts = texts.to(device)
            # Round the predicted probability to the nearest class (0 or 1).
            predictions = torch.round(model(texts))
            total += labels.size(0)
            correct += (predictions == labels).sum().item()
    score = correct / total

    return (score, model)

# Hyper-parameter tuning via exhaustive grid search.

# Candidate values for each hyper-parameter.
ls_epochs = [10, 25, 50]
ls_lrs = [0.1, 0.01, 0.001]
ls_hidden_dims = [32, 64, 128, 256]

# Every (epochs, lr, hidden_dim) combination; epochs varies slowest and
# hidden_dim fastest.
grid_search = [
    (n_epochs, rate, width)
    for n_epochs in ls_epochs
    for rate in ls_lrs
    for width in ls_hidden_dims
]

print(grid_search)

# Best model found so far, its accuracy, and the combination that produced it
# (the combination is kept for debugging).
best_model = None
best_score = 0
best_comb = None

n_combinations = len(grid_search)
for position, combination in enumerate(grid_search, start=1):
    epochs, lr, hidden_dim = combination
    candidate_score, candidate_model = train_model(epochs, lr, hidden_dim)
    print(f'[{position}/{n_combinations}] (Epochs: {epochs}, Lr: {lr}, Hidden_dim: {hidden_dim}) =>')
    print(f'   actual best_model_score: {best_score} - train_score: {candidate_score}')
    # Only a strictly better accuracy replaces the current best.
    if candidate_score > best_score:
        best_score, best_model, best_comb = candidate_score, candidate_model, combination
        print(f'new best score: {best_score}')

print(f'[Grid Search] best combination (epochs, lr, hidden_dim): {best_comb}')

[(10, 0.1, 32), (10, 0.1, 64), (10, 0.1, 128), (10, 0.1, 256), (10, 0.01, 32), (10, 0.01, 64), (10, 0.01, 128), (10, 0.01, 256), (10, 0.001, 32), (10, 0.001, 64), (10, 0.001, 128), (10, 0.001, 256), (25, 0.1, 32), (25, 0.1, 64), (25, 0.1, 128), (25, 0.1, 256), (25, 0.01, 32), (25, 0.01, 64), (25, 0.01, 128), (25, 0.01, 256), (25, 0.001, 32), (25, 0.001, 64), (25, 0.001, 128), (25, 0.001, 256), (50, 0.1, 32), (50, 0.1, 64), (50, 0.1, 128), (50, 0.1, 256), (50, 0.01, 32), (50, 0.01, 64), (50, 0.01, 128), (50, 0.01, 256), (50, 0.001, 32), (50, 0.001, 64), (50, 0.001, 128), (50, 0.001, 256)]
Epoch [1/10] | Loss Train: 0.695545, Loss Test: 0.693177
Epoch [2/10] | Loss Train: 0.694005, Loss Test: 0.693933
Epoch [3/10] | Loss Train: 0.695325, Loss Test: 0.694950
Epoch [4/10] | Loss Train: 0.694756, Loss Test: 0.693849
Epoch [5/10] | Loss Train: 0.695280, Loss Test: 0.693904
Epoch [6/10] | Loss Train: 0.694796, Loss Test: 0.699982
Epoch [7/10] | Loss Train: 0.694902, Loss Test: 0.693667
Epoch 

## Perform an evaluation

In [10]:
# Accuracy of the model selected by the grid search, measured over test_loader.
correct, total = 0, 0
with torch.no_grad():
    for labels, texts in test_loader:
        targets = labels.to(device).unsqueeze(1)
        # Round each predicted probability to the nearest class (0 or 1).
        rounded = torch.round(best_model(texts.to(device)))
        total += targets.size(0)
        correct += (rounded == targets).sum().item()

print(f"Test Accuracy: {100 * correct / total:.2f}%")

Test Accuracy: 86.34%


# TODO:
1. [Done] Check why it doesn't work. (It was only loading a few negative examples, which allowed learning a trivial model).
2. [Done] Do hyper-parameter tuning with grid-search over the learning rate and the MLP hidden dimension (the number of epochs is also included in the grid).
3. [Optional] Use fastText instead of word2vec.