In [1]:
import spacy
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
import numpy as np
from datasets import load_dataset

# Load the IMDB dataset
imdb = load_dataset("imdb")
texts = imdb["train"]["text"] + imdb["test"]["text"]
labels = imdb["train"]["label"] + imdb["test"]["label"]
# train_texts = imdb["train"]["text"]
# train_labels = imdb["train"]["label"]
# test_texts = imdb["test"]["text"]
# test_labels = imdb["test"]["label"]
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels, test_size=0.3, random_state=42
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from collections import Counter
print(Counter(train_labels))
print(Counter(test_labels))

Counter({1: 17557, 0: 17443})
Counter({0: 7557, 1: 7443})


In [3]:
nlp_sm = spacy.load("en_core_web_sm", disable=["parser", "ner", "tagger", "attribute_ruler", "lemmatizer"])
nlp_md = spacy.load("en_core_web_md", disable=["parser", "ner", "tagger", "attribute_ruler", "lemmatizer"])
nlp_lg = spacy.load("en_core_web_lg", disable=["parser", "ner", "tagger", "attribute_ruler", "lemmatizer"])

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [4]:
def preprocess_text(nlp, text):
    # Tokenize and generate embeddings using spaCy
    doc = nlp(text)
    return np.mean([token.vector for token in doc if token.has_vector], axis=0)

In [5]:
from tqdm import tqdm

# Prepare the data
def prepare_data(texts, labels, nlp):
    embeddings, filtered_labels = [], []
    for label, text in tqdm(zip(labels, texts), total=len(texts)):
        embedding = preprocess_text(nlp, text)
        if embedding is not None:  # Ignore empty embeddings
            embeddings.append(embedding)
            filtered_labels.append(label)
    return np.array(embeddings), np.array(filtered_labels)

In [6]:
class IMDBDataset(Dataset):
    def __init__(self, embeddings, labels):
        self.embeddings = torch.tensor(embeddings, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.float32)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.embeddings[idx], self.labels[idx]

In [7]:
class SentimentClassifier(nn.Module):
    def __init__(self, input_dim):
        super(SentimentClassifier, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(input_dim, 1),
            # nn.ReLU(),
            # nn.Linear(64, 1),
        )

    def forward(self, x):
        return self.fc(x).squeeze()

In [8]:
def train_and_get_model(train_loader, train_embeddings, epochs = 200):
        # Initialize model, loss, and optimizer
    input_dim = train_embeddings.shape[1]  # Dimensionality of the pretrained embeddings
    model = SentimentClassifier(input_dim)
    criterion = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    # Training loop
    epochs = 200
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for embeddings, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(embeddings)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}")

    return model

In [9]:
from sklearn.metrics import accuracy_score
def evaluate_model(model, test_loader):
    model.eval()
    all_preds, all_labels = [], []

    with torch.no_grad():
        for embeddings, labels in test_loader:
            outputs = model(embeddings)
            preds = (outputs > 0.5).float()
            all_preds.extend(preds.numpy())
            all_labels.extend(labels.numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    print(f"Test Accuracy: {accuracy:.4f}")

# Small model

In [10]:
# Preprocess the dataset
train_embeddings_sm, filtered_train_labels_sm = prepare_data(train_texts, train_labels, nlp_sm)
test_embeddings_sm, filtered_test_labels_sm = prepare_data(test_texts, test_labels, nlp_sm)

100%|██████████| 35000/35000 [02:52<00:00, 202.83it/s]
100%|██████████| 15000/15000 [01:14<00:00, 202.46it/s]


In [11]:
print("Embeddings dimension: ", train_embeddings_sm.shape[1])

Embeddings dimension:  96


In [12]:
# Create PyTorch datasets
train_dataset_sm = IMDBDataset(train_embeddings_sm, filtered_train_labels_sm)
test_dataset_sm = IMDBDataset(test_embeddings_sm, filtered_test_labels_sm)

In [13]:
train_loader_sm = DataLoader(train_dataset_sm, batch_size=64, shuffle=True)
test_loader_sm = DataLoader(test_dataset_sm, batch_size=64, shuffle=False)

In [14]:
model = train_and_get_model(train_loader_sm, train_embeddings_sm)

Epoch 1/200, Loss: 368.8025
Epoch 2/200, Loss: 356.8560
Epoch 3/200, Loss: 350.3138
Epoch 4/200, Loss: 346.0489
Epoch 5/200, Loss: 343.0567
Epoch 6/200, Loss: 340.8306
Epoch 7/200, Loss: 339.1372
Epoch 8/200, Loss: 337.7704
Epoch 9/200, Loss: 336.6265
Epoch 10/200, Loss: 335.6276
Epoch 11/200, Loss: 334.8148
Epoch 12/200, Loss: 334.0869
Epoch 13/200, Loss: 333.4556
Epoch 14/200, Loss: 332.8865
Epoch 15/200, Loss: 332.3653
Epoch 16/200, Loss: 331.8805
Epoch 17/200, Loss: 331.4379
Epoch 18/200, Loss: 331.0940
Epoch 19/200, Loss: 330.6556
Epoch 20/200, Loss: 330.3690
Epoch 21/200, Loss: 330.0400
Epoch 22/200, Loss: 329.7715
Epoch 23/200, Loss: 329.4799
Epoch 24/200, Loss: 329.2130
Epoch 25/200, Loss: 329.0184
Epoch 26/200, Loss: 328.7784
Epoch 27/200, Loss: 328.5295
Epoch 28/200, Loss: 328.3432
Epoch 29/200, Loss: 328.1567
Epoch 30/200, Loss: 327.9228
Epoch 31/200, Loss: 327.8272
Epoch 32/200, Loss: 327.6497
Epoch 33/200, Loss: 327.5478
Epoch 34/200, Loss: 327.3922
Epoch 35/200, Loss: 327

In [15]:
evaluate_model(model, test_loader_sm)

Test Accuracy: 0.6679


# Medium

# Repetir TRAINING PARA MODELOS SMALL, MEDIUM LARGE E COMPARAR, PARA TORNAR DISCUSSAO INTERESSANTE.
SM: 0.66
MD: 0.77
LG: 0.84

In [16]:
# Preprocess the dataset
train_embeddings_md, filtered_train_labels_md = prepare_data(train_texts, train_labels, nlp_md)
test_embeddings_md, filtered_test_labels_md = prepare_data(test_texts, test_labels, nlp_md)

100%|██████████| 35000/35000 [03:33<00:00, 163.95it/s]
100%|██████████| 15000/15000 [01:31<00:00, 164.62it/s]


In [17]:
print("Embeddings dimension: ", train_embeddings_md.shape[1])

Embeddings dimension:  300


In [18]:
# Create PyTorch datasets
train_dataset_md = IMDBDataset(train_embeddings_md, filtered_train_labels_md)
test_dataset_md = IMDBDataset(test_embeddings_md, filtered_test_labels_md)

In [19]:
train_loader_md = DataLoader(train_dataset_md, batch_size=64, shuffle=True)
test_loader_md = DataLoader(test_dataset_md, batch_size=64, shuffle=False)

In [20]:
model = train_and_get_model(train_loader_md, train_embeddings_md)

Epoch 1/200, Loss: 363.3898
Epoch 2/200, Loss: 344.9529
Epoch 3/200, Loss: 334.8779
Epoch 4/200, Loss: 327.9018
Epoch 5/200, Loss: 322.3867
Epoch 6/200, Loss: 318.0481
Epoch 7/200, Loss: 314.1565
Epoch 8/200, Loss: 310.9347
Epoch 9/200, Loss: 308.1047
Epoch 10/200, Loss: 305.6279
Epoch 11/200, Loss: 303.3166
Epoch 12/200, Loss: 301.3785
Epoch 13/200, Loss: 299.5306
Epoch 14/200, Loss: 297.9556
Epoch 15/200, Loss: 296.3798
Epoch 16/200, Loss: 294.9706
Epoch 17/200, Loss: 293.7424
Epoch 18/200, Loss: 292.6783
Epoch 19/200, Loss: 291.5630
Epoch 20/200, Loss: 290.5566
Epoch 21/200, Loss: 289.6655
Epoch 22/200, Loss: 288.7686
Epoch 23/200, Loss: 287.9861
Epoch 24/200, Loss: 287.2303
Epoch 25/200, Loss: 286.4458
Epoch 26/200, Loss: 285.7505
Epoch 27/200, Loss: 285.1155
Epoch 28/200, Loss: 284.5521
Epoch 29/200, Loss: 283.8950
Epoch 30/200, Loss: 283.4254
Epoch 31/200, Loss: 282.8311
Epoch 32/200, Loss: 282.2424
Epoch 33/200, Loss: 281.7999
Epoch 34/200, Loss: 281.3927
Epoch 35/200, Loss: 280

In [21]:
evaluate_model(model, test_loader_md)

Test Accuracy: 0.7601


# Large

In [22]:
# Preprocess the dataset
train_embeddings_lg, filtered_train_labels_lg = prepare_data(train_texts, train_labels, nlp_lg)
test_embeddings_lg, filtered_test_labels_lg = prepare_data(test_texts, test_labels, nlp_lg)

100%|██████████| 35000/35000 [03:38<00:00, 160.07it/s]
100%|██████████| 15000/15000 [01:32<00:00, 161.79it/s]


In [23]:
print("Embeddings dimension: ", train_embeddings_lg.shape[1])

Embeddings dimension:  300


In [24]:
# Create PyTorch datasets
train_dataset_lg = IMDBDataset(train_embeddings_lg, filtered_train_labels_lg)
test_dataset_lg = IMDBDataset(test_embeddings_lg, filtered_test_labels_lg)

In [25]:
train_loader_lg = DataLoader(train_dataset_lg, batch_size=64, shuffle=True)
test_loader_lg = DataLoader(test_dataset_lg, batch_size=64, shuffle=False)

In [26]:
model = train_and_get_model(train_loader_lg, train_embeddings_lg)

Epoch 1/200, Loss: 351.2266
Epoch 2/200, Loss: 312.6541
Epoch 3/200, Loss: 289.7222
Epoch 4/200, Loss: 274.1645
Epoch 5/200, Loss: 262.8609
Epoch 6/200, Loss: 254.2484
Epoch 7/200, Loss: 247.3957
Epoch 8/200, Loss: 241.8090
Epoch 9/200, Loss: 237.0766
Epoch 10/200, Loss: 233.2017
Epoch 11/200, Loss: 229.6388
Epoch 12/200, Loss: 226.8012
Epoch 13/200, Loss: 224.1167
Epoch 14/200, Loss: 221.7712
Epoch 15/200, Loss: 219.7670
Epoch 16/200, Loss: 217.9245
Epoch 17/200, Loss: 216.2579
Epoch 18/200, Loss: 214.6836
Epoch 19/200, Loss: 213.2655
Epoch 20/200, Loss: 212.1026
Epoch 21/200, Loss: 210.8025
Epoch 22/200, Loss: 209.8309
Epoch 23/200, Loss: 208.7982
Epoch 24/200, Loss: 207.8887
Epoch 25/200, Loss: 206.9833
Epoch 26/200, Loss: 206.1955
Epoch 27/200, Loss: 205.4459
Epoch 28/200, Loss: 204.6701
Epoch 29/200, Loss: 204.0531
Epoch 30/200, Loss: 203.5020
Epoch 31/200, Loss: 202.8612
Epoch 32/200, Loss: 202.2944
Epoch 33/200, Loss: 201.7616
Epoch 34/200, Loss: 201.2687
Epoch 35/200, Loss: 200

In [27]:
evaluate_model(model, test_loader_lg)

Test Accuracy: 0.8552
