In [1]:
  """
    This code was modified from the original code by Omar Espejel https://github.com/omarespejel
    """

import torch
import torchtext
from torchtext.datasets import DBpedia

# Torch text version
torchtext.__version__

'0.16.1+cpu'

In [2]:
import pandas as pd

data = pd.read_csv("final_data.csv")

In [3]:
from sklearn.model_selection import train_test_split

data = data[~data["top_critics"].isnull()]

train, test = train_test_split(data, test_size=0.2, stratify=data["winner"])

In [4]:
tuple_list = list(zip(train["winner"], train["top_critics"]))


def tuple_generator(tuple_list):
    for tup in tuple_list:
        yield tup


train_iter = tuple_generator(tuple_list)

In [5]:
next(train_iter)

(0,
 'Sleepless has the puzzling, 95-minutes feel of a doomed project that has been mercilessly edited to digestible proportions., Its thin but taut and at a lean 95 minutes, it zips along at a propulsive, entertaining clip., The final twist is genuinely unexpected, though purely because of its irrelevance., A smart and speedy cop movie that occasionally reaches for greatness., Less sleepless, more insomnia cure. For all its attempts at action, its a rote, dull crime thriller with little fresh to offer., Alleged action movies need way more novelty than just flour being dusted over the combatants or brief bursts of underwater photography to ring the changes., The sheer silliness defeated me., In its portrayal of cops on the take, Sleepless often resembles an episode of TVs The Shield., Monaghan and Foxx, for all their gifts, cant transcend the material, though they do get more out of it than most others would be able to., The 95 minutes it takes for this sleepless night to unfold onscre

In [6]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

tokenizer = get_tokenizer("basic_english")


def yield_tokens(data_iter):
    for _, text in data_iter:
        yield tokenizer(text)


# This besides tokenize the data convert words into integers according to the vocab available
vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

In [7]:
texto_pipeline = lambda x: vocab(tokenizer(x))
label_pipeline = lambda x: int(x)

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def collate_batch(batch):
    label_list = []
    text_list = []
    offsets = [0]

    for _label, _text in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(texto_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)
        offsets.append(processed_text.size(0))

    label_list = torch.tensor(label_list, dtype=torch.int64)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text_list = torch.cat(text_list)
    return label_list.to(device), text_list.to(device), offsets.to(device)

In [9]:
from torch.utils.data import DataLoader

dataloader = DataLoader(
    train_iter, batch_size=8, shuffle=False, collate_fn=collate_batch
)

In [10]:
# from torch import nn
# import torch.nn.functional as F


# class TextClassifierModel(nn.Module):
#     def __init__(self, vocab_size, embed_dim, num_class):
#         super(TextClassifierModel, self).__init__()

#         # (embedding)
#         self.embedding = nn.EmbeddingBag(vocab_size, embed_dim)

#         # (batch normalization)
#         self.bn1 = nn.BatchNorm1d(embed_dim)

#         # (fully connected)
#         self.fc = nn.Linear(embed_dim, num_class)

#     def forward(self, text, offsets):
#         # Incrustar el texto (embed the text)
#         embedded = self.embedding(text, offsets)

#         # Aplicar la normalización por lotes (apply batch normalization)
#         embedded_norm = self.bn1(embedded)

#         # Aplicar la función de activación ReLU (apply the ReLU activation function)
#         embedded_activated = F.relu(embedded_norm)

#         # Devolver las probabilidades de clase (output the class probabilities)
#         return self.fc(embedded_activated)

In [27]:
import torch
from torch import nn
import torch.nn.functional as F


class TextClassifierModelWithRNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_size, num_class):
        super(TextClassifierModelWithRNN, self).__init__()

        # (embedding)
        self.embedding = nn.Embedding(vocab_size, embed_dim)

        # (LSTM layer)
        self.rnn = nn.LSTM(embed_dim, hidden_size, batch_first=True)

        # (batch normalization)
        self.bn1 = nn.BatchNorm1d(hidden_size)

        # (fully connected)
        self.fc = nn.Linear(hidden_size, num_class)

    def forward(self, text, offsets):
        # Incrustar el texto (embed the text)
        embedded = self.embedding(text)

        # Apply the LSTM layer
        output, _ = self.rnn(embedded)

        # Take the output from the last time step
        output = output[:, -1, :]

        # Apply batch normalization
        output_norm = self.bn1(output)

        # Apply the ReLU activation function
        output_activated = F.relu(output_norm)

        # Return the class probabilities
        return self.fc(output_activated)

In [31]:
tuple_list = list(zip(train["winner"], train["top_critics"]))


def tuple_generator(tuple_list):
    for tup in tuple_list:
        yield tup


train_iter = tuple_generator(tuple_list)

In [None]:
# num_class = len(set([label for (label, text) in train_iter]))
# vocab_size = len(vocab)
# embedding_size = 100

# model = TextClassifierModel(
#     vocab_size=vocab_size, embed_dim=embedding_size, num_class=num_class
# ).to(device)

In [32]:
num_class = len(set([label for (label, text) in train_iter]))
vocab_size = len(vocab)
embedding_size = 100

model = TextClassifierModelWithRNN(
    vocab_size=vocab_size, embed_dim=embedding_size, hidden_size=64, num_class=num_class
).to(device)

In [33]:
num_class

3

In [34]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


print(f"The model has {count_parameters(model):,} trainable parameters")

The model has 1,573,019 trainable parameters


In [35]:
def training_model(dataloader):
    # Putting the model in training mode
    model.train()

    # Initialize accuracy, count and loss for each epoch
    epoch_acc = 0
    epoch_loss = 0
    total_count = 0

    for idx, (label, text, offsets) in enumerate(dataloader):
        # reset the gradients after each batch
        optimizer.zero_grad()

        # Get predictions from the model
        predict = model(text, offsets)

        # Get the loss
        loss = get_loss(predict, label)

        # backpropagate the loss and compute gradients
        loss.backward()

        # Getting the accuracy
        acc = (predict.argmax(1) == label).sum()

        # Avoiding large gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)

        # Update the weights
        optimizer.step()

        # Upgrading the epoch accuracy, count and loss
        epoch_acc += acc.item()
        epoch_loss += loss.item()
        total_count += label.size(0)

        if idx % 50 == 0 and idx > 0:
            print(
                f" epoch {epoch} | {idx}/{len(dataloader)} batches | loss {epoch_loss/total_count} | accuracy {epoch_acc/total_count}"
            )

    return epoch_acc / total_count, epoch_loss / total_count

In [36]:
def eval_model(dataloader):
    model.eval()
    epoch_acc = 0
    total_count = 0
    epoch_loss = 0

    with torch.no_grad():
        for idx, (label, text, offsets) in enumerate(dataloader):
            # Obtenemos la la etiqueta predecida
            #
            prediction = model(text, offsets)

            # Obtenemos pérdida y accuracy
            loss = get_loss(prediction, label)
            acc = (prediction.argmax(1) == label).sum()

            # Llevamos el conteo de la pérdida y el accuracy para esta epoch
            epoch_loss += loss.item()
            epoch_acc += acc.item()
            total_count += label.size(0)

    return epoch_acc / total_count, epoch_loss / total_count

In [37]:
# Hiperparámetros

EPOCHS = 4  # epochs
LEARNING_RATE = 0.2  # tasa de aprendizaje
# BATCH_SIZE = 64 # tamaño de los batches
BATCH_SIZE = 30

In [38]:
get_loss = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)

In [39]:
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset


tuple_list = list(zip(train["winner"], train["top_critics"]))
tuple_list_test = list(zip(test["winner"], test["top_critics"]))


def tuple_generator(tuple_list):
    for tup in tuple_list:
        yield tup


train_iter = tuple_generator(tuple_list)
test_iter = tuple_generator(tuple_list_test)


# Getting training and testing datasets
train_dataset = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)

# 95% from the trainset and the remaining 5% for validation
num_train = int(len(train_dataset) * 0.95)

split_train_, split_valid_ = random_split(
    train_dataset, [num_train, len(train_dataset) - num_train]
)

# Generating data loaders to be used in the model

train_dataloader = DataLoader(
    split_train_, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch
)
valid_dataloader = DataLoader(
    split_valid_, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch
)
test_dataloader = DataLoader(
    test_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch
)

In [40]:
# Greating the greatest loss
major_loss_validation = float("inf")

# training
for epoch in range(1, EPOCHS + 1):
    # training
    training_acc, training_loss = training_model(train_dataloader)

    # Validation
    validation_acc, valitacion_loss = eval_model(valid_dataloader)

    # Guarda el mejor modelo
    if valitacion_loss < major_loss_validation:
        best_valid_loss = valitacion_loss
        torch.save(model.state_dict(), "best_saved.pt")

IndexError: too many indices for tensor of dimension 2

In [21]:
test_acc, test_loss = eval_model(test_dataloader)

print(f"Test dataset accuracy-> {test_acc}")
print(f"test dataset loss-> {test_loss}")

Test dataset accuracy-> 0.7402597402597403
test dataset loss-> 0.0277736589506075


In [22]:
oscars_label = {
    1: "Non considered for the award",
    2: "Nominated for best picture",
    3: "Winner",
}


def predict(text, texto_pipeline):
    with torch.no_grad():
        text = torch.tensor(texto_pipeline(text))
        opt_mod = torch.compile(model, mode="reduce-overhead")
        output = opt_mod(text, torch.tensor([0]))
        return output.argmax(1).item() + 1


model = model.to("cpu")

In [23]:
data.head(3)

Unnamed: 0,movie,winner,top_critics
0,The King's Speech,2,Firth strikes a perfect balance between his ab...
1,Black Swan,1,What were they trying to say? What happened?.....
3,Inception,1,Inception engaged on a mainly intellectually l...


In [24]:
sample_1 = "That the director turned this most devastating of stories into \
a riveting pop culture phenomenon without ceding one inch on its tragic \
dimensions is surely an achievement for the ages. This is a complex look \
at a complicated man, but Oppenheimer unequivocally establishes that \
this is a story worth telling -- and that Nolan was the perfect filmmaker to do it.\
Every scene feels like a cataclysm waiting to happen, fitting for a film that builds,\
step-by-step, to the creation of a cataclysm machine. Oppenheimer both summons awe for\
what it took to build the bomb and for the changes it wrought."

sample_2 = data["top_critics"][0]

# print(f"Oppenheimer will be {oscars_label[predict(sample_1, texto_pipeline)]}")

print(f" will be {oscars_label[predict(sample_2, texto_pipeline)]}")

skipping cudagraphs for unknown reason


 will be Non considered for the award


In [25]:
data["winner"].value_counts() / len(data)

winner
0    0.740157
1    0.236220
2    0.023622
Name: count, dtype: float64

In [26]:
# Printing confussion matrix

from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

y_true = train["winner"]
y_pred = [predict(text, texto_pipeline) for text in train["top_critics"]]

confusion_matrix(y_true, y_pred)

skipping cudagraphs for unknown reason


array([[  0, 225,   0],
       [  0,  72,   0],
       [  0,   7,   0]])