In [1]:
nome = 'Arthur Baia'
print(f'Meu nome é {nome}')

Meu nome é Arthur Baia


## Instruções

- Treinar uma rede neural de duas camadas como classificador binário na tarefa de análise de sentimentos usando dataset IMDB usando TF-IDF como entrada.

Deve-se implementar o laço de treinamento e validação da rede neural.

Neste exercício usaremos o IMDB com 20k exemplos para treino, 5k para desenvolvimento e 25k para teste.

# Importando os pacotes necessários

In [2]:
import collections
import os
import random
import re
import torch
import numpy as np

# Verificando se a GPU está disponível

In [3]:
if torch.cuda.is_available(): 
   dev = "cuda:0"
   print(torch. cuda. get_device_name(dev))
else:
   dev = "cpu" 
print(dev)
device = torch.device(dev)

NVIDIA GeForce GTX 1660 Ti
cuda:0


## Preparando Dados

Primeiro, fazemos download do dataset:

In [5]:
!wget -nc http://files.fast.ai/data/aclImdb.tgz 
!tar -xzf aclImdb.tgz

File ‘aclImdb.tgz’ already there; not retrieving.



## Carregando o dataset

Criaremos uma divisão de treino (80%) e validação (20%) artificialmente.

Nota: Evitar de olhar ao máximo o dataset de teste para não ficar enviseado no que será testado. Em aplicações reais, o dataset de teste só estará disponível no futuro, ou seja, é quando o usuário começa a testar o seu produto.

In [6]:
def load_texts(folder):
    texts = []
    for path in os.listdir(folder):
        with open(os.path.join(folder, path)) as f:
            texts.append(f.read())
    return texts

x_train_pos = load_texts('aclImdb/train/pos')
x_train_neg = load_texts('aclImdb/train/neg')
x_test_pos = load_texts('aclImdb/test/pos')
x_test_neg = load_texts('aclImdb/test/neg')

x_train = x_train_pos + x_train_neg
x_test = x_test_pos + x_test_neg
y_train = [True] * len(x_train_pos) + [False] * len(x_train_neg)
y_test = [True] * len(x_test_pos) + [False] * len(x_test_neg)

# Embaralhamos o treino para depois fazermos a divisão treino/valid.
c = list(zip(x_train, y_train))
random.shuffle(c)
x_train, y_train = zip(*c)

n_train = int(0.8 * len(x_train))

x_valid = x_train[n_train:]
y_valid = y_train[n_train:]
x_train = x_train[:n_train]
y_train = y_train[:n_train]

print(len(x_train), 'amostras de treino.')
print(len(x_valid), 'amostras de desenvolvimento.')
print(len(x_test), 'amostras de teste.')

print('3 primeiras amostras treino:')
for x, y in zip(x_train[:3], y_train[:3]):
    print(y, x[:100])

print('3 últimas amostras treino:')
for x, y in zip(x_train[-3:], y_train[-3:]):
    print(y, x[:100])

print('3 primeiras amostras validação:')
for x, y in zip(x_valid[:3], y_test[:3]):
    print(y, x[:100])

print('3 últimas amostras validação:')
for x, y in zip(x_valid[-3:], y_valid[-3:]):
    print(y, x[:100])

20000 amostras de treino.
5000 amostras de desenvolvimento.
25000 amostras de teste.
3 primeiras amostras treino:
False I don't know who wrote the script for this movie, but from the first moment on, I was irritated. Of 
False There are so many puns to play on the title of the spectacularly bad Valentine that I don't know whe
False Unless you are already familiar with the pop stars who star in this film, save yourself the time and
3 últimas amostras treino:
True Why didn't Dynamo have any pants?! Where did they go?? It was never explained. That's why this movie
False FAIL. I'd love to give this crap a 0. Yes, I registered just to rate this garbage. I want to go back
True I had really only been exposed to Olivier's dramatic performances, and those were mostly much later 
3 primeiras amostras validação:
True I was deeply moved by this movie in many respects. First of all, I just want to say that Clara Lago 
True Really, I liked it. The premise was good, the story fit where both respectiv

In [7]:
from typing import List
from collections import Counter


## TF-IDF

In [8]:
class TFIDF():

    def __init__(self, max_vocab_token=1000):

        self.max_vocab_tokens = max_vocab_token

    def fit(self, corpus):

        self.frequency_dict = self.create_vocab(corpus, self.max_vocab_tokens)
        self.idf = {key: np.log10(len(corpus)/(value))
                    for key, value in self.frequency_dict.items()}
        
    def transform(self, phrase):
        unique_freq_dict = self.create_vocab(
            [phrase], len(self.tokenize(phrase)))
        return [unique_freq_dict[word]*idf if word in unique_freq_dict else 0 for word, idf in self.idf.items()]

    def create_vocab(self, texts: List[str], max_tokens: int):
        """
        Returns a dictionary whose keys are tokens and values are token ids (from 0 to max_tokens - 1).
        """
        # Escreva aqui seu código.
        L = [word for phrase in list(map(self.tokenize, texts))
             for word in set(phrase)]
        return dict(Counter(L).most_common(max_tokens))

    def tokenize(self, text: str):
        """
        Convert string to a list of tokens (i.e., words).
        This function lower cases everything and removes punctuation.
        """
        # Escreva aqui seu código.
        return re.findall(r"[\w']+", text.lower())


# Testes

In [10]:
def test_tfidf():
    tfidf = TFIDF()
    assert TFIDF().tokenize("This is a test!") == ['this', 'is', 'a', 'test']
    assert TFIDF().create_vocab(["This is a test!", "This is another test!"], 5) == {
        'test': 2, 'this': 2, 'is': 2, 'a': 1, 'another': 1}
    tfidf.fit(["This is a test!", "This is another test!"])
    assert tfidf.idf == {'test': 0.0, 'this': 0.0, 'is': 0.0,
                         'a': 0.3010299956639812, 'another': 0.3010299956639812}
    tfidf.fit(["test!", ])
    assert tfidf.idf == {'test': 0.0}
    tfidf.fit(["!", ])
    assert tfidf.idf == {}
    tfidf = TFIDF(max_vocab_token=5)
    tfidf.fit(["This is a test!", "This is another test!"])
    assert tfidf.transform("This is a test!") == [
        0.0, 0.0, 0.0, 0.3010299956639812, 0.0]
    return True


test_tfidf()

True

## IMDB dataset 


In [11]:
class IMDBdataset(torch.utils.data.Dataset):
    def __init__(self, x, y, tfidf):
        self.x = [tfidf.transform(x) for x in x]
        self.y = y

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        return torch.tensor(self.x[idx]).float(), torch.tensor(self.y[idx]).long()


# IMDB dataset tests

In [13]:
import pytest


def test_imdbdataset():
    tfidf1 = TFIDF()
    tfidf1.fit(x_train)
    assert IMDBdataset(x_train, y_train, tfidf1).__len__() == len(x_train)
    assert type(IMDBdataset(x_train, y_train, tfidf1).__getitem__(0)[0]), type(
        IMDBdataset(x_train, y_train, tfidf1).__getitem__(0)[1]) == (torch.Tensor, torch.Tensor)
    assert IMDBdataset(x_train, y_train, tfidf1).__getitem__(0)[
        0].shape == torch.Size([1000])
    assert IMDBdataset(x_train, y_train, tfidf1).__getitem__(0)[
        1].shape == torch.Size([])
    tfidf_teste = TFIDF(max_vocab_token=10)
    tfidf_teste.fit(x_train)
    teste_imdb = IMDBdataset(
        ["This is a test!", "This is another test!"], [0, 1], tfidf_teste)
    assert pytest.approx(teste_imdb.x, [[0, 0.014461261606717435, 0, 0, 0, 0.043519412473365916, 0.04744970610179834, 0, 0, 0],
                                        [0, 0, 0, 0, 0, 0.043519412473365916, 0.04744970610179834, 0, 0, 0]])
    assert pytest.approx(teste_imdb.y, [[0], [1]])
    return True


test_imdbdataset()


TypeError: pytest.approx() does not support nested data structures: [0, 0.014573525916998344, 0, 0, 0, 0.043303435105349154, 0.047280187524168765, 0, 0, 0] at index 0
  full sequence: [[0,
  0.014573525916998344,
  0,
  0,
  0,
  0.043303435105349154,
  0.047280187524168765,
  0,
  0,
  0],
 [0, 0, 0, 0, 0, 0.043303435105349154, 0.047280187524168765, 0, 0, 0]]

# Dataloaders

In [14]:
def create_imdb_dataset(max_vocab_tokens, x, y):
  tfidf = TFIDF(max_vocab_token=max_vocab_tokens)
  tfidf.fit(x)
  return IMDBdataset(x, y, tfidf)

In [15]:
max_vocab_tokens = 1000
imdb_train = create_imdb_dataset(max_vocab_tokens = max_vocab_tokens, x= x_train, y = y_train)
imdb_validation = create_imdb_dataset(max_vocab_tokens = max_vocab_tokens, x= x_valid, y = y_valid)

In [16]:
def create_dataloader(data, batch_size):
  return torch.utils.data.DataLoader(data, batch_size=batch_size, shuffle=True)

In [17]:
train_batch_size = 20
valid_batch_size = 20
train_dataloader = create_dataloader(imdb_train ,train_batch_size)
valid_dataloader = create_dataloader(imdb_validation ,valid_batch_size)

# Dataset loaders tests

In [18]:
def test_dataloader(max_vocab_tokens):
    assert len(train_dataloader) == len(imdb_train)/train_batch_size
    assert len(valid_dataloader) == len(imdb_validation)/valid_batch_size
    # Test the shape of the first batch
    assert train_dataloader.__iter__().__next__()[0].shape == torch.Size([
        train_batch_size, max_vocab_tokens])
    assert train_dataloader.__iter__().__next__(
    )[1].shape == torch.Size([train_batch_size])
    # Test the shape of validation batch
    assert valid_dataloader.__iter__().__next__()[0].shape == torch.Size([
        valid_batch_size, max_vocab_tokens])
    assert valid_dataloader.__iter__().__next__(
    )[1].shape == torch.Size([valid_batch_size])
    return True
test_dataloader(max_vocab_tokens)

True

In [19]:
type(train_dataloader.__iter__().__next__()[1])

torch.Tensor

# Model Construction

In [25]:
class NeuralNetwork(torch.nn.Module):
    def __init__(self, input_layer_dim, k, hidden_layer_dim):
        super(NeuralNetwork, self).__init__()
        self.layers = torch.nn.Sequential(
            torch.nn.Linear(input_layer_dim, hidden_layer_dim),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_layer_dim, k)
        )

    def forward(self, x):
        return self.layers(x)


# Test model

In [32]:
def test_neuralnetwork(input_layer_dim, k, hidden_layer_dim):
    model = NeuralNetwork(input_layer_dim, k, hidden_layer_dim)
    assert model.layers[0].in_features == input_layer_dim
    assert model.layers[0].out_features == hidden_layer_dim
    assert model.layers[0].weight.shape == torch.Size([hidden_layer_dim , input_layer_dim])
    assert model.layers[0].bias.shape == torch.Size([hidden_layer_dim])
    assert model.layers[2].in_features == hidden_layer_dim
    assert model.layers[2].out_features == k
    assert model.layers[2].weight.shape == torch.Size([k , hidden_layer_dim])
    assert model.layers[2].bias.shape == torch.Size([k])
    return True
test_neuralnetwork(1000, 2, 100)

True

In [114]:
"""Function to train the model, which returns the train loss"""
def train(model, train_dataloader, criterion, optimizer):
    model.train()
    train_loss = 0
    for x,y in train_dataloader:
        x = x.to(device)
        y = y.to(device)
        optimizer.zero_grad()
        output = model(x)
        loss = criterion(output, y)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    return train_loss / len(train_dataloader)

In [None]:
"""Function to evaluate the model, which returns the validation loss and accuracy"""
def evaluate(model, valid_dataloader, criterion):
    model.eval()
    valid_loss = 0
    correct = 0
    with torch.no_grad():
        for x,y in valid_dataloader:
            x = x.to(device)
            y = y.to(device)
            output = model(x)
            valid_loss += criterion(output, y).item()
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(y.view_as(pred)).sum().item()
    return valid_loss/ len(valid_dataloader), correct / len(valid_dataloader)

In [54]:
batch_size = 20
k = 2
input_dim = max_vocab_tokens
hidden_dim = 100
learning_rate = 0.0001
print(max_vocab_tokens)
num_epochs = 10

1000


In [48]:
model = NeuralNetwork(input_dim, k, hidden_dim).to(device)
loss_func = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)


In [None]:
"""Train the model for n epochs and evaluate the model after each epoch"""
train_loss = []
valid_loss = []
valid_acc = []
for epoch in range(num_epochs):
    print("Epoch: ", epoch)
    train_loss.append(train(model, train_dataloader, loss_func, optimizer))
    valid_loss_, valid_acc_ = evaluate(model, valid_dataloader, loss_func)
    valid_loss.append(valid_loss_)
    valid_acc.append(valid_acc_)
    print("Train Loss: ", train_loss[epoch])
    print("Valid Loss: ", valid_loss[epoch])
    print("Valid Accuracy: ", valid_acc[epoch])


Epoch: 0; Train Loss: 0.41977804984897377; Validation Loss:0.9265123593807221; Accuracy: 10.54800033569336
Epoch: 1; Train Loss: 0.3183219897840172; Validation Loss:1.0231089532375335; Accuracy: 10.896000862121582
Epoch: 2; Train Loss: 0.23494440852850676; Validation Loss:1.2872797409296035; Accuracy: 10.684000968933105
Epoch: 3; Train Loss: 0.1404183025539387; Validation Loss:1.6421508547067643; Accuracy: 10.724000930786133
Epoch: 4; Train Loss: 0.08039979126944673; Validation Loss:2.3422226833105086; Accuracy: 10.704000473022461
Epoch: 5; Train Loss: 0.05728130178700667; Validation Loss:2.6410047409534454; Accuracy: 10.624000549316406
Epoch: 6; Train Loss: 0.054710810285410846; Validation Loss:2.824803314447403; Accuracy: 10.496000289916992
Epoch: 7; Train Loss: 0.04226334268059145; Validation Loss:3.113998544216156; Accuracy: 10.520000457763672
Epoch: 8; Train Loss: 0.028155938897767557; Validation Loss:3.6398984684944153; Accuracy: 10.476000785827637
Epoch: 9; Train Loss: 0.0155059