In [1]:
#@title 📐Vocabulario { display-mode: "form" }
class Vocabulary(object):
    """Bidirectional mapping between tokens and integer indices.

    Processes text tokens and collects the vocabulary so it can be mapped to
    indices later on.
    """
    def __init__(self, token_to_idx=None, add_unk=True, unk_token=""):
        """
        Args: token_to_idx(dict): Pre existing map of Tokens to Index.
            : add_unk(bool): A flag indicating whether to add UNK Token.
            : unk_token(string): The UNK Token to add in Vocabulary.
        """
        # Work on a private copy: add_token() mutates the mapping, and sharing
        # the caller's dict would let two vocabularies corrupt each other.
        self._token_to_idx = dict(token_to_idx) if token_to_idx is not None else {}
        self._idx_to_token = {idx: token for token, idx in self._token_to_idx.items()}
        self._add_unk = add_unk
        self._unk_token = unk_token
        # -1 means "no UNK token"; lookup_token() then raises on unknown tokens.
        self.unk_index = -1
        if add_unk:
            self.unk_index = self.add_token(unk_token)

    def add_token(self, token):
        """Update the mapping dictionary based on the Tokens.
        Args: token: The item to add into the Vocabulary.
        Returns: index: Integer corresponding to the Token.
        """
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            # New tokens get the next free index.
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index

    def add_many(self, tokens):
        """Add a list of Tokens into Vocabulary.
        Args: tokens(list): A list of string Tokens.
        Returns: indices(list): A list of indices corresponding to the Tokens.
        """
        return [self.add_token(token) for token in tokens]

    def lookup_token(self, token):
        """Retrieve the Index associated with the Token.
        Args: token(str): The Token to lookup.
        Returns: index(int): The Index corresponding to the Token, or the UNK
                 index when the Token is unknown and an UNK Token exists.
        Raises: KeyError: The Token is unknown and there is no UNK Token.
        """
        if self.unk_index >= 0:
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx[token]

    def lookup_index(self, index):
        """Return the Token associated with the Index.
        Args: index(int): The Index to lookup.
        Returns: token(str): The Token corresponding to the Index.
        Raises: KeyError: The Index is not in the Vocabulary.
        """
        if index not in self._idx_to_token:
            raise KeyError(f"the index {index} is not in the Vocabulary")
        return self._idx_to_token[index]

    def __str__(self):
        return "<Vocabulary (size=%d)" % len(self)

    def __len__(self):
        return len(self._token_to_idx)

    def to_serializable(self):
        """Return a dictionary that can be serialized.

        The UNK settings are stored next to the mapping so that
        from_serializable() rebuilds the same Vocabulary instead of falling
        back to the constructor defaults.
        """
        return {'token_to_idx': dict(self._token_to_idx),
                'add_unk': self._add_unk,
                'unk_token': self._unk_token}

    @classmethod
    def from_serializable(cls, contents):
        """Instantiate the Vocabulary from a serialized dictionary.
        Args: contents(dict): Keyword arguments for the constructor, as
              produced by to_serializable().
        """
        return cls(**contents)

In [2]:
#@title 📐Review vectorizer{ display-mode: "form" }
#Basado en Rao, D., & McMahan, B. (2019). Natural language processing with PyTorch: build intelligent language applications using deep learning

from collections import Counter
import string
import numpy as np


class Vectorizer(object):
    """The Vectorizer coordinates the Vocabularies and puts them to use.
    """
    def __init__(self, textVocab, personalityVocab):
        """
        Args: textVocab: Maps words to Integers.
            : personalityVocab: Maps class labels to Integers.
        """
        self.textVocab = textVocab
        self.personalityVocab = personalityVocab

    def vectorize(self, modifiers):
        """Create a collapsed one-hot Vector for the modifiers.
        Args: modifiers(str): Text whose tokens are separated by single spaces.
        Returns: one_hot: float32 array of length len(textVocab) with a 1 at
                 the index of every token found in the text.
        """
        one_hot = np.zeros(len(self.textVocab), dtype=np.float32)
        tokens = modifiers.split(" ")

        for token in tokens:
            # Substring test against the punctuation string: it skips single
            # punctuation marks and also the empty tokens that consecutive
            # spaces produce ("" is contained in every string).
            if token not in string.punctuation:
                one_hot[self.textVocab.lookup_token(token)] = 1
        return one_hot

    @classmethod
    def from_dataframe(cls, dataframe, cutoff=0):
        """Instantiate the Vectorizer from DataFrame.
        Args: dataframe(DataFrame): The modifiers Dataset; its `mods` and
              `target` columns are read.
            : cutoff(int): Words are kept only if they occur more than
              `cutoff` times.
        Returns: An instance of the Vectorizer.
        """
        textVocab = Vocabulary(add_unk=True)
        personalityVocab = Vocabulary(add_unk=False)
        # Sort the labels before indexing them: iterating a bare set of
        # strings follows hash order, which changes between interpreter
        # runs, so label indices would not be reproducible.
        for rating in sorted(set(dataframe.target), key=str):
            personalityVocab.add_token(rating)
        # Count every non-punctuation word, then keep the frequent ones.
        word_counts = Counter()
        for modifiers in dataframe.mods:
            for word in modifiers.split(" "):
                if word not in string.punctuation:
                    word_counts[word] += 1
        for word, count in word_counts.items():
            if count > cutoff:
                textVocab.add_token(word)
        print(f"Vocab: {textVocab}")
        return cls(textVocab, personalityVocab)

    @classmethod
    def from_serializable(cls, contents):
        """Rebuild a Vectorizer from the output of to_serializable().
        Args: contents(dict): Holds the serialized `textVocab` and
              `personalityVocab`.
        Returns: An instance of the Vectorizer.
        """
        textVocab = Vocabulary(**contents["textVocab"])
        # from_dataframe() builds the label vocabulary without an UNK token,
        # so default to add_unk=False unless the payload says otherwise.
        personalityVocab = Vocabulary(**{"add_unk": False,
                                         **contents["personalityVocab"]})
        return cls(textVocab, personalityVocab)

    def to_serializable(self):
        """Return a dictionary that can be serialized (for example as JSON).
        Returns: dict with the serialized form of both Vocabularies.
        """
        return {"textVocab": self.textVocab.to_serializable(),
                "personalityVocab": self.personalityVocab.to_serializable()}

    def get_vocab(self):
        """Returns the (textVocab, personalityVocab) pair.
        """
        return (self.textVocab, self.personalityVocab)


In [3]:
#@title Dataset Manager{ display-mode: "form" }
#Basado en Rao, D., & McMahan, B. (2019). Natural language processing with PyTorch: build intelligent language applications using deep learning

import torch
from torch.utils.data import Dataset
import json
import pandas as pd

class DatasetManager(Dataset):
    """PyTorch Dataset over a DataFrame whose `split` column marks each row
    as "train", "val" or "test". One split is active at a time.
    """
    def __init__(self, dataframe, vectorizer):
        """
        Args: dataframe(pandas.DataFrame): The dataset.
            : vectorizer(Vectorizer): Vectorizer instantiated from dataset.
        """
        self.dataframe = dataframe
        self._vectorizer = vectorizer

        # Slice the frame once per split name.
        by_split = {name: self.dataframe[self.dataframe.split == name]
                    for name in ("train", "val", "test")}
        self.train_df = by_split["train"]
        self.val_df = by_split["val"]
        self.test_df = by_split["test"]

        self.train_size = len(self.train_df)
        self.validation_size = len(self.val_df)
        self.test_size = len(self.test_df)

        # split name -> (rows of that split, number of rows)
        self._lookup_dict = {
            "train": (self.train_df, self.train_size),
            "val": (self.val_df, self.validation_size),
            "test": (self.test_df, self.test_size),
        }
        sizes = {"train": self.train_size,
                 "val": self.validation_size,
                 "test": self.test_size}
        print(sizes)
        self.set_split("train")

    @classmethod
    def load_dataset_and_make_vectorizer(cls, review_csv):
        """Load dataset and make new vectorizer from scratch.
        Args: review_csv: Location of the dataset.
        Returns: An instance of DatasetManager.
        """
        dataframe = pd.read_csv(review_csv)
        dataframe["mods"] = dataframe["mods"].apply(lambda text: text.lower())
        # The vectorizer only sees the training rows.
        training_rows = dataframe[dataframe.split == "train"]
        vectorizer = Vectorizer.from_dataframe(training_rows)
        return cls(dataframe, vectorizer)

    def save_vectorizer(self, vectorizer_filepath):
        """Write the vectorizer's serializable form to a JSON file.
        Relies on the vectorizer object providing a to_serializable() method.
        Args: vectorizer_filepath(str): Destination file.
        """
        with open(vectorizer_filepath, "w") as fp:
            json.dump(self._vectorizer.to_serializable(), fp)

    def get_vectorizer(self):
        """Returns the Vectorizer.
        """
        return self._vectorizer

    def set_split(self, split="train"):
        """Select the active split using a column in the DataFrame.
        Args: split(str): One of "train", "val" or "test"
        """
        self._target_split = split
        selected = self._lookup_dict[split]
        self._target_df = selected[0]
        self._target_size = selected[1]

    def __len__(self):
        return self._target_size

    def __getitem__(self, index):
        """Primary entry point of PyTorch Datasets.
        Args: index: Position of the Datapoint inside the active split.
        Returns: A dictionary holding the Data point features ("x_data") and
                 label index ("y_target").
        """
        record = self._target_df.iloc[index]
        label_vocab = self._vectorizer.personalityVocab
        return {"x_data": self._vectorizer.vectorize(record.mods),
                "y_target": label_vocab.lookup_token(record.target)}

    def get_num_batches(self, batch_size):
        """Given a batch size, return the number of batches in the Dataset.
        Args: batch_size(int)
        Returns: Number of full batches in the active split.
        """
        full_batches = len(self) // batch_size
        return full_batches

In [4]:
#@title Batcher Generator{ display-mode: "form" }
import torch
from torch.utils.data import DataLoader

def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=True, device="cpu"):
    """Generator that wraps the PyTorch DataLoader.

    Yields one dictionary per batch, with every tensor of the batch moved to
    `device`.
    """
    loader = DataLoader(dataset=dataset, batch_size=batch_size,
                        shuffle=shuffle, drop_last=drop_last)

    for batch in loader:
        yield {field: values.to(device) for field, values in batch.items()}


def make_train_state(args):
    """Build the dictionary that tracks the progress of a training run.

    Args: args: Object exposing a `cuda` attribute.
    Returns: dict with empty per-epoch histories and placeholder test metrics.
    """
    state = dict(epoch_index=0,
                 train_loss=[],
                 train_acc=[],
                 val_loss=[],
                 val_acc=[],
                 test_loss=1,
                 test_acc=1)
    state['cuda'] = args.cuda
    return state


def compute_accuracy(y_pred, y_target):
    """Fraction (0..1) of rows whose highest-scoring column equals the target.

    Args: y_pred: Score tensor; the maximum is taken along dim 1.
        : y_target: Tensor with the expected index of each row.
    """
    predicted = torch.max(y_pred, dim=1).indices
    hits = (predicted == y_target).sum().item()
    return hits / predicted.shape[0]


# Sin W2V

## Creación del vocabulario


In [5]:
# Relative path of the CSV; it is read with pandas and the vectorizer is
# built from the rows whose `split` column is "train".
URI_DATASET = '../../../Datasets/dataset_clasificador_final/classify_char_raw.csv'
dataset = DatasetManager.load_dataset_and_make_vectorizer(URI_DATASET)

Vocab: <Vocabulary (size=3862)
{'train': 911, 'val': 195, 'test': 196}


## Generar batches


In [6]:
# generate_batches is a generator function: this line only creates the
# generator, no batch is built until it is iterated.
train_batches = generate_batches(dataset, batch_size=128)

## Definición del perceptrón

In [7]:
import torch.nn as nn
import torch.nn.functional as F

class Perceptron(nn.Module):
    """Single fully connected layer that returns the raw linear outputs."""

    def __init__(self, input_size, output_size):
        """
        Args: input_size(int): Number of input features.
            : output_size(int): Number of output units.
        """
        super(Perceptron, self).__init__()
        self.linear = nn.Linear(in_features=input_size, out_features=output_size)

    def forward(self, x):
        """Apply the linear layer to `x`; no activation is applied."""
        return self.linear(x)

In [8]:
from argparse import Namespace
# Run configuration read by the cells and functions below.
args = Namespace(
    review_csv=URI_DATASET,  # CSV passed to load_dataset_and_make_vectorizer
    cuda=True,  # request the GPU; set to False below when CUDA is unavailable
    batch_size=16,  # batch size of the training and validation loops
    learning_rate=0.001,  # learning rate handed to the SGD optimizer
    num_epochs=250,  # number of train + validation passes
)


In [9]:
import torch
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn as nn

# Build the dataset with DatasetManager from the path stored in args.
dataset = DatasetManager.load_dataset_and_make_vectorizer(args.review_csv)
# Dictionary that stores the metrics and the loss of every epoch.
train_state = make_train_state(args)

# Fall back to the CPU when CUDA is not available.
if not torch.cuda.is_available():
    args.cuda = False
args.device = torch.device("cuda" if args.cuda else "cpu")


Vocab: <Vocabulary (size=3862)
{'train': 911, 'val': 195, 'test': 196}


## Training Loop

In [10]:
import torch
import torch.nn as nn
def train_and_validate(args, classifier, optimizer, criterion, dataset, train_state=None):
    """Train `classifier` for args.num_epochs epochs, validating after each one.

    Args: args: Needs `num_epochs`, `batch_size` and `device` (plus whatever
          make_train_state reads when no `train_state` is given).
        : classifier: Model called as classifier(x=...).
        : optimizer: Optimizer over the classifier parameters.
        : criterion: Loss called as criterion(prediction, target).
        : dataset: Object with set_split(); iterated through generate_batches.
        : train_state(dict): History to append to. A fresh one is created
          when omitted, so consecutive runs do not accumulate their losses in
          one shared module-level dictionary.
    Returns: train_state with the per-epoch loss/accuracy of both splits.
    """
    if train_state is None:
        train_state = make_train_state(args)

    for epoch_index in range(args.num_epochs):
        train_state['epoch_index'] = epoch_index

        # ---- Training pass -------------------------------------------------
        dataset.set_split('train')
        batch_generator = generate_batches(dataset, batch_size=args.batch_size, device=args.device)
        running_loss = 0.0
        running_acc = 0.0
        # Training mode: layers such as dropout behave as during training.
        classifier.train()
        for batch_index, batch_dict in enumerate(batch_generator):
            # step 1. Reset the gradients.
            optimizer.zero_grad()
            # step 2. Compute the output of the classifier.
            y_pred = classifier(x=batch_dict['x_data'].float())
            y_target = batch_dict['y_target'].long()
            # step 3. Compute the loss and fold it into the running mean.
            loss = criterion(y_pred, y_target)
            loss_batch = loss.item()
            running_loss += (loss_batch - running_loss) / (batch_index + 1)
            # step 4. Use the loss to produce the gradients.
            loss.backward()
            # step 5. Take a gradient step with the optimizer.
            optimizer.step()
            # Accuracy as an extra metric next to the loss.
            acc_batch = compute_accuracy(y_pred, y_target)
            running_acc += (acc_batch - running_acc) / (batch_index + 1)
        print("Train loss: {:.3f} in Epoch {}".format(running_loss, epoch_index))
        print("Train accuracy: {:.3f} in Epoch {}".format(running_acc, epoch_index))
        train_state['train_loss'].append(running_loss)
        train_state['train_acc'].append(running_acc)

        # ---- Validation pass -----------------------------------------------
        dataset.set_split('val')
        batch_generator = generate_batches(dataset, batch_size=args.batch_size, device=args.device)
        running_loss = 0.
        running_acc = 0.
        classifier.eval()
        # No parameter update happens here, so skip building autograd graphs.
        with torch.no_grad():
            for batch_index, batch_dict in enumerate(batch_generator):
                # step 1. compute the output
                y_pred = classifier(x=batch_dict['x_data'].float())
                y_target = batch_dict['y_target'].long()
                # step 2. compute the loss
                loss = criterion(y_pred, y_target)
                loss_batch = loss.item()
                running_loss += (loss_batch - running_loss) / (batch_index + 1)
                # step 3. compute the accuracy
                acc_batch = compute_accuracy(y_pred, y_target)
                running_acc += (acc_batch - running_acc) / (batch_index + 1)
        print("Val loss: {:.3f} in Epoch {}".format(running_loss, epoch_index))
        print("Val accuracy: {:.3f} in Epoch {}".format(running_acc, epoch_index))
        train_state['val_loss'].append(running_loss)
        train_state['val_acc'].append(running_acc)

    return train_state

def test(args, dataset, classifier, criterion):
    """Evaluate `classifier` on the "test" split and report loss and accuracy.

    Args: args: Needs `device`.
        : dataset: Object with set_split() and __len__.
        : classifier: Model called as classifier(x=...).
        : criterion: Loss called as criterion(prediction, target).
    Returns: dict with `test_loss` and `test_acc`, so that callers can store
             the metrics instead of only reading them from the printed text.
    """
    dataset.set_split('test')
    # Use the size of the test split as batch size: one batch holds it all.
    batch_generator = generate_batches(dataset, batch_size=len(dataset), device=args.device)
    classifier.eval()  # evaluation mode

    with torch.no_grad():  # no gradients are needed while evaluating
        for batch_dict in batch_generator:
            # generate_batches already moved the tensors to args.device.
            x_data = batch_dict['x_data'].float()
            y_target = batch_dict['y_target'].long()

            # step 1. compute the output
            y_pred = classifier(x=x_data)

            # step 2. compute the loss
            loss = criterion(y_pred, y_target)
            running_loss = loss.item()

            # step 3. compute the accuracy
            running_acc = compute_accuracy(y_pred, y_target)

    print("Test loss: {:.3f}".format(running_loss))
    print("Test accuracy: {:.3f}".format(running_acc))
    return {'test_loss': running_loss, 'test_acc': running_acc}

def getMostImportantTokens(classifier, vectorizer, top_k=10):
    """Print, for every output class, the tokens with the largest weights.

    Args: classifier: Model whose `linear.weight` has one row per class and
          one column per token index.
        : vectorizer: Provides `textVocab` and `personalityVocab`, both with
          a lookup_index() method.
        : top_k(int): How many tokens to print per class.
    """
    # .cpu() so the weights can be inspected even when the model is on a GPU.
    fc1_weights = classifier.linear.weight.detach().cpu()

    # Number of classes
    num_classes = fc1_weights.size(0)

    # Print the most characteristic tokens of each class
    for class_index in range(num_classes):
        class_weights = fc1_weights[class_index]
        _, indices = torch.sort(class_weights, descending=True)
        # The label vocabulary is what maps labels to output indices during
        # training, so it is the only reliable way back from index to name.
        try:
            class_Name = vectorizer.personalityVocab.lookup_index(class_index)
        except KeyError:
            # Output unit without a label (more outputs than known classes).
            class_Name = class_index
        print(f"\nTop values for Class {class_Name}:")

        for index in indices[:top_k].tolist():
            index_name = vectorizer.textVocab.lookup_index(index)
            weight_value = class_weights[index].item()
            print(f"· {index_name}: {round(weight_value,3)}")

def trainEvalTestAndAnalyze(dataframeURL, output_size=None):
    """Run the whole experiment for one CSV: load, train, test and analyze.

    Reads the module-level `args` (device, learning rate and the settings
    used by the training loop).

    Args: dataframeURL(str): Location of the CSV dataset.
        : output_size(int): Number of output units of the classifier. When
          omitted it is the number of labels in the vectorizer, so datasets
          with a different number of classes work as well.
    Returns: The training history returned by train_and_validate.
    """
    # Build the dataset and the vectorizer
    dataset = DatasetManager.load_dataset_and_make_vectorizer(dataframeURL)
    vectorizer = dataset.get_vectorizer()
    if output_size is None:
        output_size = len(vectorizer.personalityVocab)
    # Build the classifier and move it to the selected device
    print(f"Tamaño red entrada: {len(vectorizer.textVocab)}")
    classifier = Perceptron(input_size=len(vectorizer.textVocab), output_size=output_size)
    classifier = classifier.to(args.device)
    # Build the loss function and the optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(classifier.parameters(), lr=args.learning_rate, momentum=0.9)
    # Train and validate, then evaluate and inspect the learned weights
    train_state = train_and_validate(args, classifier, optimizer, criterion, dataset)
    test(args, dataset, classifier, criterion)
    getMostImportantTokens(classifier, vectorizer)
    return train_state



## Análisis

In [11]:
# CSV files to run the full train / test / analysis pipeline on.
datasets = ['../../../Datasets/dataset_clasificador_final/classify_char_raw.csv', '../../../Datasets/dataset_clasificador_final/classify_mistTest_p16Train_raw.csv']
for url in datasets:
    # Print the file name itself: a fixed 20-character slice cuts short
    # names in the middle and leaks directory parts for others.
    print(f"Training with {url.rsplit('/', 1)[-1]}")
    trainEvalTestAndAnalyze(url)
    print("........................................................")

Training with lassify_char_raw.csv
Vocab: <Vocabulary (size=3862)
{'train': 911, 'val': 195, 'test': 196}
Tamaño red entrada: 3862


Train loss: 2.767 in Epoch 0
Train accuracy: 0.077 in Epoch 0
Val loss: 2.769 in Epoch 0
Val accuracy: 0.089 in Epoch 0
Train loss: 2.743 in Epoch 1
Train accuracy: 0.151 in Epoch 1
Val loss: 2.762 in Epoch 1
Val accuracy: 0.094 in Epoch 1
Train loss: 2.721 in Epoch 2
Train accuracy: 0.181 in Epoch 2
Val loss: 2.753 in Epoch 2
Val accuracy: 0.099 in Epoch 2
Train loss: 2.699 in Epoch 3
Train accuracy: 0.196 in Epoch 3
Val loss: 2.745 in Epoch 3
Val accuracy: 0.099 in Epoch 3
Train loss: 2.676 in Epoch 4
Train accuracy: 0.209 in Epoch 4
Val loss: 2.741 in Epoch 4
Val accuracy: 0.089 in Epoch 4
Train loss: 2.656 in Epoch 5
Train accuracy: 0.212 in Epoch 5
Val loss: 2.732 in Epoch 5
Val accuracy: 0.089 in Epoch 5
Train loss: 2.636 in Epoch 6
Train accuracy: 0.222 in Epoch 6
Val loss: 2.725 in Epoch 6
Val accuracy: 0.104 in Epoch 6
Train loss: 2.616 in Epoch 7
Train accuracy: 0.234 in Epoch 7
Val loss: 2.719 in Epoch 7
Val accuracy: 0.104 in Epoch 7
Train loss: 2.596 in Epoch 8
Train accur