# Notebook 2 AnaliseSentimentosBagOfWord - Alunos Regulares IA024-2024S1 FEEC-UNICAMP

**Nome:** Caio Petrucci dos Santos Rosa

**RA:** 248245

## Instalação e importação de pacotes

In [None]:
!pip install torchtext
!pip install 'portalocker>=2.0.0'
!pip install Unidecode



In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from torchtext.datasets import IMDB
from collections import Counter
import torch.nn as nn
import torch.optim as optim

## I - Vocabulário e Tokenização

In [None]:
import string
from unidecode import unidecode

def normalize_string(s):
    """Return *s* transliterated by ``unidecode``, edge-trimmed and lowercased.

    Only punctuation at the start and end of the string is stripped;
    ``string.punctuation`` characters inside the string are kept.
    """
    transliterated = unidecode(s)
    trimmed = transliterated.strip(string.punctuation)
    return trimmed.lower()

In [None]:
# Upper bound on the vocabulary: keep only the 20000 most frequent tokens
vocab_size = 20000

# Count every normalized, whitespace-separated token of the training split
counter = Counter()
for _label, line in IMDB(split='train'):
    counter.update(normalize_string(word) for word in line.split())

# Token -> id table for the most frequent tokens; ids start at 1
most_frequent_words = [word for word, _count in counter.most_common(vocab_size)]
vocab = dict(zip(most_frequent_words, range(1, len(most_frequent_words) + 1)))
# The corpus may hold fewer distinct tokens than the limit, so re-read the size
vocab_size = len(vocab)

In [None]:
def encode_sentence(sentence, vocab):
    """Map each whitespace-separated token of *sentence* to its id in *vocab*.

    Tokens are normalized with ``normalize_string`` before the lookup; a token
    that is not a key of *vocab* is encoded as 0.
    """
    normalized_tokens = (normalize_string(word) for word in sentence.split())
    # 0 is the id used for tokens missing from the vocabulary
    return [vocab.get(token, 0) for token in normalized_tokens]

encode_sentence("I like Pizza.", vocab)

[9, 38, 7893]

## II - Dataset

In [None]:
from torch.nn.functional import one_hot

# Dataset class that pre-encodes every review as a bag-of-words vector
class IMDBDataset(Dataset):
    """IMDB split kept in memory as ``(feature vector, label tensor)`` pairs.

    All samples are encoded once in the constructor, so indexing only reads
    from the prepared list.
    """

    def __init__(self, split, vocab):
        self.vocab = vocab
        self.raw_data = list(IMDB(split=split))
        self.data = [self.__encode_sample_to_onehot(pair) for pair in self.raw_data]

    def __encode_sample_to_onehot(self, sample):
        """Turn one ``(label, text)`` sample into ``(vector, target tensor)``."""
        label, text = sample
        # NOTE(review): raw label 1 becomes class 1 and anything else class 0 —
        # confirm which sentiment the installed torchtext encodes as 1.
        target = int(label == 1)
        # One slot per vocabulary id plus slot 0 for out-of-vocabulary tokens
        features = torch.zeros(len(self.vocab) + 1)
        # Presence only: repeated tokens still set their slot to 1
        for token_id in set(encode_sentence(text, self.vocab)):
            features[token_id] = 1
        return features, torch.tensor(target)

    def __len__(self):
        """Return the number of encoded samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the pre-encoded sample stored at position *idx*."""
        return self.data[idx]

# Encode both splits up front with the shared vocabulary
train_data = IMDBDataset(split='train', vocab=vocab)
test_data = IMDBDataset(split='test', vocab=vocab)

## III - Data Loader

In [None]:
# Number of samples per mini-batch
batch_size = 128

# Shuffle only the training split; the test split keeps its stored order
train_loader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_data, batch_size=batch_size, shuffle=False)

## IV - Modelo

In [None]:
class OneHotMLP(nn.Module):
    """Two-layer perceptron producing a single logit per input vector.

    The input width is ``vocab_size + 1`` and the hidden layer has 200 units
    followed by a ReLU.
    """

    def __init__(self, vocab_size):
        super().__init__()

        self.fc1 = nn.Linear(in_features=vocab_size + 1, out_features=200)
        self.fc2 = nn.Linear(in_features=200, out_features=1)

        self.relu = nn.ReLU()

    def forward(self, x):
        """Return the raw logits (no sigmoid applied) for the batch *x*."""
        # Cast first so integer or boolean inputs are accepted by the linear layer
        hidden = self.relu(self.fc1(x.float()))
        return self.fc2(hidden)

# Build the classifier sized for the vocabulary built above
model = OneHotMLP(vocab_size=vocab_size)

## V - Laço de Treinamento - Otimização da função de Perda pelo Gradiente descendente

In [None]:
# Use the GPU when one is available; otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
    print('GPU:', torch.cuda.get_device_name(torch.cuda.current_device()))
else:
    device = torch.device('cpu')
    print('using CPU')

GPU: Tesla T4


In [None]:
# Training loop with initial loss and perplexity reported before training

import time

model = model.to(device)

# Learning rate passed to SGD
optimal_lr = 0.05

# Binary cross-entropy computed on raw logits, optimized with plain SGD
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.SGD(model.parameters(), lr=optimal_lr)

# Evaluate loss before training
with torch.no_grad():
    initial_losses = []  # mean loss of each batch
    batch_sizes = []     # number of samples in each batch
    model.eval()
    for inputs, targets in train_loader:
        inputs = inputs.to(device)
        targets = targets.to(device)
        logits = model(inputs)
        # Squeeze only the feature axis: a bare squeeze() would also drop the
        # batch axis of a single-sample batch and mismatch the target shape
        batch_loss = criterion(logits.squeeze(dim=1), targets.float())
        initial_losses.append(batch_loss.item())
        batch_sizes.append(targets.size(0))
    # Weight each batch mean by its size so a shorter last batch is not
    # over-represented in the dataset-level mean
    weights = torch.tensor(batch_sizes, dtype=torch.float)
    initial_loss = (torch.tensor(initial_losses) * weights).sum() / weights.sum()
    initial_PPL = torch.exp(initial_loss)
    print(f'Loss on training data before starting training:\t{initial_loss:.4f}')
    print(f'Perplexity on training data before starting training:\t{initial_PPL:.4f}')

# Training loop
num_epochs = 5
for epoch in range(num_epochs):
    start_time = time.time()  # Start time of the epoch
    model.train()
    for inputs, targets in train_loader:
        inputs = inputs.to(device)
        targets = targets.to(device)
        # Forward pass
        logits = model(inputs)
        loss = criterion(logits.squeeze(dim=1), targets.float())
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Perplexity of the last mini-batch; detached because it is only reported
    PPL = torch.exp(loss.detach())

    end_time = time.time()  # End time of the epoch
    epoch_duration = end_time - start_time  # Duration of epoch

    # NOTE: the loss and perplexity printed here are those of the last
    # mini-batch of the epoch, not an average over the epoch
    print(f'Epoch [{epoch+1}/{num_epochs}], \
            Loss: {loss.item():.4f}, \
            Perplexity: {PPL.item():.4f}, \
            Elapsed Time: {epoch_duration:.2f} sec')

Loss on training data before starting training:	0.6935
Perplexity on training data before starting training:	0.6935
Epoch [1/5],             Loss: 0.5013,             Perplexity: 1.6508,             Elapsed Time: 1.05 sec
Epoch [2/5],             Loss: 0.3896,             Perplexity: 1.4764,             Elapsed Time: 1.03 sec
Epoch [3/5],             Loss: 0.1991,             Perplexity: 1.2203,             Elapsed Time: 1.06 sec
Epoch [4/5],             Loss: 0.2558,             Perplexity: 1.2915,             Elapsed Time: 1.08 sec
Epoch [5/5],             Loss: 0.1913,             Perplexity: 1.2108,             Elapsed Time: 1.04 sec


## VI - Avaliação

In [None]:
def bce_loss(y_prob, y_target):
    """Return the mean binary cross-entropy between probabilities and targets.

    Args:
        y_prob: tensor of predicted probabilities (values in [0, 1]).
        y_target: tensor of 0/1 targets with a shape compatible with ``y_prob``.

    Returns:
        The mean loss over all elements, as a Python float.
    """
    # Clamp the logarithms at -100 (the bound torch.nn.BCELoss also uses) so a
    # saturated probability of exactly 0 or 1 yields a finite loss instead of
    # 0 * -inf = NaN.
    log_prob = torch.clamp(torch.log(y_prob), min=-100)
    log_one_minus_prob = torch.clamp(torch.log(1 - y_prob), min=-100)
    per_element = -(y_target * log_prob + (1 - y_target) * log_one_minus_prob)
    return per_element.mean().item()

In [None]:
# Evaluation printing Accuracy, Loss and Perplexity

model.eval()

with torch.no_grad():
    losses = []       # mean loss of each batch
    batch_sizes = []  # number of samples in each batch
    correct = 0
    total = 0
    for inputs, targets in test_loader:
        inputs = inputs.to(device)
        targets = targets.to(device)
        logits = model(inputs)
        # Compute the probabilities once and reuse them for loss and prediction
        probs = torch.sigmoid(logits.squeeze(dim=1))
        losses.append(bce_loss(probs, targets.float()))
        batch_sizes.append(targets.size(0))
        # Rounding the probability thresholds the prediction at 0.5
        predicted = torch.round(probs)
        total += targets.size(0)
        correct += (predicted == targets).sum().item()

    accuracy = 100 * correct/total
    # Weight each batch mean by its size so a shorter last batch does not
    # skew the dataset-level loss
    weights = torch.tensor(batch_sizes, dtype=torch.float)
    loss = (torch.tensor(losses) * weights).sum() / total
    PPL = torch.exp(loss)

    print(f'Accuracy on test data:\t{accuracy}%.')
    print(f'Loss on test data:\t{loss}.')
    print(f'Perplexity on test data:\t{PPL}.')

Accuracy on test data:	87.52%.
Loss on test data:	0.30297234654426575.
Perplexity on test data:	1.353877067565918.
