In [1]:
import spacy
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
import numpy as np
from datasets import load_dataset

# Load the IMDB dataset
imdb = load_dataset("imdb")
train_texts = imdb["train"]["text"]
train_labels = imdb["train"]["label"]
test_texts = imdb["test"]["text"]
test_labels = imdb["test"]["label"]

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from collections import Counter
print(Counter(train_labels))
print(Counter(test_labels))

Counter({0: 12500, 1: 12500})
Counter({0: 12500, 1: 12500})


In [3]:
nlp = spacy.load("en_core_web_lg", disable=["parser", "ner", "tagger", "attribute_ruler", "lemmatizer"])

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [4]:
def preprocess_text(text):
    # Tokenize and generate embeddings using spaCy
    doc = nlp(text)
    return np.mean([token.vector for token in doc if token.has_vector], axis=0)

In [5]:
from tqdm import tqdm
# Load IMDB dataset

# Prepare the data
def prepare_data(texts, labels):
    embeddings, filtered_labels = [], []
    for label, text in tqdm(zip(labels, texts), total=len(texts)):
        embedding = preprocess_text(text)
        if embedding is not None:  # Ignore empty embeddings
            embeddings.append(embedding)
            filtered_labels.append(label)
    return np.array(embeddings), np.array(filtered_labels)

# Preprocess the dataset
train_embeddings, filtered_train_labels = prepare_data(train_texts, train_labels)
test_embeddings, filtered_test_labels = prepare_data(test_texts, test_labels)

100%|██████████| 25000/25000 [02:40<00:00, 155.36it/s]
100%|██████████| 25000/25000 [02:40<00:00, 155.75it/s]


In [6]:
train_embeddings.shape

(25000, 300)

In [7]:
class IMDBDataset(Dataset):
    def __init__(self, embeddings, labels):
        self.embeddings = torch.tensor(embeddings, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.float32)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.embeddings[idx], self.labels[idx]

# Create PyTorch datasets
train_dataset = IMDBDataset(train_embeddings, train_labels)
test_dataset = IMDBDataset(test_embeddings, test_labels)

In [8]:
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [9]:
class SentimentClassifier(nn.Module):
    def __init__(self, input_dim):
        super(SentimentClassifier, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(input_dim, 1),
            # nn.ReLU(),
            # nn.Linear(64, 1),
        )

    def forward(self, x):
        return self.fc(x).squeeze()

In [11]:
# Initialize model, loss, and optimizer
input_dim = train_embeddings.shape[1]  # Dimensionality of the pretrained embeddings
model = SentimentClassifier(input_dim)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Training loop
epochs = 200
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for embeddings, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(embeddings)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}")

Epoch 1/200, Loss: 254.4884
Epoch 2/200, Loss: 230.9502
Epoch 3/200, Loss: 215.5630
Epoch 4/200, Loss: 204.5524
Epoch 5/200, Loss: 196.1863
Epoch 6/200, Loss: 189.5965
Epoch 7/200, Loss: 184.3742
Epoch 8/200, Loss: 179.8733
Epoch 9/200, Loss: 176.2241
Epoch 10/200, Loss: 173.0558
Epoch 11/200, Loss: 170.3163
Epoch 12/200, Loss: 167.9532
Epoch 13/200, Loss: 165.8230
Epoch 14/200, Loss: 163.9290
Epoch 15/200, Loss: 162.2420
Epoch 16/200, Loss: 160.6829
Epoch 17/200, Loss: 159.4167
Epoch 18/200, Loss: 158.0422
Epoch 19/200, Loss: 156.8811
Epoch 20/200, Loss: 155.8903
Epoch 21/200, Loss: 154.8508
Epoch 22/200, Loss: 154.0006
Epoch 23/200, Loss: 153.1274
Epoch 24/200, Loss: 152.3011
Epoch 25/200, Loss: 151.5658
Epoch 26/200, Loss: 150.9291
Epoch 27/200, Loss: 150.2395
Epoch 28/200, Loss: 149.5934
Epoch 29/200, Loss: 149.0518
Epoch 30/200, Loss: 148.5186
Epoch 31/200, Loss: 148.0304
Epoch 32/200, Loss: 147.4675
Epoch 33/200, Loss: 147.0590
Epoch 34/200, Loss: 146.5302
Epoch 35/200, Loss: 146

In [12]:
from sklearn.metrics import accuracy_score
model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for embeddings, labels in test_loader:
        outputs = model(embeddings)
        preds = (outputs > 0.5).float()
        all_preds.extend(preds.numpy())
        all_labels.extend(labels.numpy())

accuracy = accuracy_score(all_labels, all_preds)
print(f"Test Accuracy: {accuracy:.4f}")

Test Accuracy: 0.8433


# Repetir TRAINING PARA MODELOS SMALL, MEDIUM LARGE E COMPARAR, PARA TORNAR DISCUSSAO INTERESSANTE.
SM: 0.66
MD: 0.77
LG: 0.84