In [1]:
import os
from argparse import Namespace
from collections import Counter
import json
import re
import string

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import tqdm
from tqdm import tqdm_notebook
from torchinfo import summary

# Vocabulary

In [2]:
class Vocabulary(object):
    """Clase para procesar texto y extrar el vocabulario existente para su posterior mapeo."""

    def __init__(self, token_to_idx=None):
        """
        Args:
            token_to_idx (dict): a pre-existing map of tokens to indices
        """

        if token_to_idx is None:
            token_to_idx = {}
        self._token_to_idx = token_to_idx

        self._idx_to_token = {idx: token
                              for token, idx in self._token_to_idx.items()}

    def to_serializable(self):
        """ returns a dictionary that can be serialized """
        return {'token_to_idx': self._token_to_idx}

    @classmethod
    def from_serializable(cls, contents):
        """ instantiates the Vocabulary from a serialized dictionary """
        return cls(**contents)

    def add_token(self, token):
        """Update mapping dicts based on the token.

        Args:
            token (str): the item to add into the Vocabulary
        Returns:
            index (int): the integer corresponding to the token
        """
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index

    def add_many(self, tokens):
        """Add a list of tokens into the Vocabulary

        Args:
            tokens (list): a list of string tokens
        Returns:
            indices (list): a list of indices corresponding to the tokens
        """
        return [self.add_token(token) for token in tokens]

    def lookup_token(self, token):
        """Retrieve the index associated with the token

        Args:
            token (str): the token to look up
        Returns:
            index (int): the index corresponding to the token
        """
        return self._token_to_idx[token]

    def lookup_index(self, index):
        """Return the token associated with the index

        Args:
            index (int): the index to look up
        Returns:
            token (str): the token corresponding to the index
        Raises:
            KeyError: if the index is not in the Vocabulary
        """
        if index not in self._idx_to_token:
            raise KeyError("the index (%d) is not in the Vocabulary" % index)
        return self._idx_to_token[index]

    def __str__(self):
        return "<Vocabulary(size=%d)>" % len(self)

    def __len__(self):
        return len(self._token_to_idx)

In [3]:
class ModsVocabulary(object):
    """Clase para procesar texto y extrar el vocabulario existente para su posterior mapeo."""

    def __init__(self, token_to_idx=None, unk_token='<UNK>', mask_token="<MASK>"):
        """
        Args:
            token_to_idx (dict): a pre-existing map of tokens to indices
        """

        if token_to_idx is None:
            token_to_idx = {}
        self._token_to_idx = token_to_idx

        self._idx_to_token = {idx: token
                              for token, idx in self._token_to_idx.items()}
        
        self._mask_token = mask_token
        self._unk_token = unk_token

        self.mask_index = self.add_token(self._mask_token)
        self.unk_index = self.add_token(self._unk_token)

    def to_serializable(self):
        """ returns a dictionary that can be serialized """
        return {'token_to_idx': self._token_to_idx,
                'mask_token': self._mask_token}

    @classmethod
    def from_serializable(cls, contents):
        """ instantiates the Vocabulary from a serialized dictionary """
        return cls(**contents)

    def add_token(self, token):
        """Update mapping dicts based on the token.

        Args:
            token (str): the item to add into the Vocabulary
        Returns:
            index (int): the integer corresponding to the token
        """
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index

    def add_many(self, tokens):
        """Add a list of tokens into the Vocabulary

        Args:
            tokens (list): a list of string tokens
        Returns:
            indices (list): a list of indices corresponding to the tokens
        """
        return [self.add_token(token) for token in tokens]

    def lookup_token(self, token):
        """Retrieve the index associated with the token
          or the UNK index if token isn't present.

        Args:
            token (str): the token to look up
        Returns:
            index (int): the index corresponding to the token
        Notes:
            `unk_index` needs to be >=0 (having been added into the Vocabulary)
              for the UNK functionality
        """
        if self.unk_index >= 0:
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx[token]

    def lookup_index(self, index):
        """Return the token associated with the index

        Args:
            index (int): the index to look up
        Returns:
            token (str): the token corresponding to the index
        Raises:
            KeyError: if the index is not in the Vocabulary
        """
        if index not in self._idx_to_token:
            raise KeyError("the index (%d) is not in the Vocabulary" % index)
        return self._idx_to_token[index]

    def __str__(self):
        return "<ModsVocabulary(size=%d)>" % len(self)

    def __len__(self):
        return len(self._token_to_idx)

# Vectorizer

In [4]:
#Basado en Rao, D., & McMahan, B. (2019). Natural language processing with PyTorch: build intelligent language applications using deep learning

from collections import Counter
import string
import numpy as np


class Vectorizer(object):
    """The Vectorizer coordinates the Vocabularies and puts them to use.
    """
    def __init__(self, mods_vocab, personality_vocab):
        """
        Args: mods_vocab: Maps words to Integers.
            : personality_vocab: Maps class labels to Integers.
        """
        self.mods_vocab = mods_vocab
        self.personality_vocab = personality_vocab

    def vectorize(self, mods, vector_length=-1):
        """
        Args:
            mods (str): the string of words separated by a space
            vector_length (int): an argument for forcing the length of index vector
        Returns:
            the vetorized mods (numpy.array)
        """
        indices = []
        indices.extend(self.mods_vocab.lookup_token(token)
                       for token in mods.split(" "))

        if vector_length < 0:
            vector_length = len(indices)

        out_vector = np.zeros(vector_length, dtype=np.int64)
        out_vector[:len(indices)] = indices
        out_vector[len(indices):] = self.mods_vocab.mask_index

        return out_vector

    @classmethod
    def from_dataframe(cls, char_df, cutoff=1):
        """Instantiate the vectorizer from the dataset dataframe

        Args:
            char_df (pandas.DataFrame): the target dataset
            cutoff (int): frequency threshold for including in Vocabulary
        Returns:
            an instance of the Vectorizer
        """
        personality_vocab = Vocabulary()
        for personality in sorted(set(char_df.target)):
            personality_vocab.add_token(personality)

        word_counts = Counter()
        for mods in char_df.mods:
            for token in mods.split(" "):
                if token not in string.punctuation:
                    word_counts[token] += 1

        mods_vocab = ModsVocabulary()
        for word, word_count in word_counts.items():
            if word_count >= cutoff:
                mods_vocab.add_token(word)

        return cls(mods_vocab, personality_vocab)
    
    @classmethod
    def from_serializable(cls, contents):
        mods_vocab = \
            ModsVocabulary.from_serializable(contents['mods_vocab'])
        personality_vocab =  \
            Vocabulary.from_serializable(contents['personality_vocab'])

        return cls(mods_vocab=mods_vocab, personality_vocab=personality_vocab)

    def to_serializable(self):
        return {'mods_vocab': self.mods_vocab.to_serializable(),
                'personality_vocab': self.personality_vocab.to_serializable()}


    def get_mods_vocab(self):
        """Returns the char Vocabulary.
        """
        return self.mods_vocab

# Pytorch Dataset

In [5]:
class CharDataset(Dataset):
    def __init__(self, char_df, vectorizer):
        '''
        Args:
            char_df (pandas.DataFrame): the dataset
            vectorizer (Vectorizer): vectorizer instatiated from dataset
        '''
        self.char_df = char_df
        self._vectorizer = vectorizer

        measure_len = lambda context: len(context.split(" "))
        self._max_seq_length = max(map(measure_len, char_df.mods))


        self.train_df = self.char_df[self.char_df.split=='train']
        self.train_size = len(self.train_df)

        self.val_df = self.char_df[self.char_df.split=='val']
        self.validation_size = len(self.val_df)

        self.test_df = self.char_df[self.char_df.split=='test']
        self.test_size = len(self.test_df)

        self._lookup_dict = {'train': (self.train_df, self.train_size),
                            'val': (self.val_df, self.validation_size),
                            'test': (self.test_df, self.test_size)}

        self.set_split('train')

        # Class weights
        class_counts = char_df.target.value_counts().to_dict()
        def sort_key(item):
            return self._vectorizer.personality_vocab.lookup_token(item[0])
        sorted_counts = sorted(class_counts.items(), key=sort_key)
        frequencies = [count for _, count in sorted_counts]
        self.class_weights = 1.0 / torch.tensor(frequencies, dtype=torch.float32)


    @classmethod
    def load_dataset_and_make_vectorizer(cls, char_csv):
        """Load dataset and make a new vectorizer from scratch

        Args:
            char_csv (str): location of the dataset
        Returns:
            an instance of SurnameDataset
        """
        char_df = pd.read_csv(char_csv)
        train_char_df = char_df[char_df.split=='train']
        return cls(char_df, Vectorizer.from_dataframe(train_char_df))

    @classmethod
    def load_dataset_and_load_vectorizer(cls, char_csv, vectorizer_filepath):
        """Load dataset and the corresponding vectorizer.
        Used in the case in the vectorizer has been cached for re-use

        Args:
            char_csv (str): location of the dataset
            vectorizer_filepath (str): location of the saved vectorizer
        Returns:
            an instance of SurnameDataset
        """
        char_csv = pd.read_csv(char_csv)
        vectorizer = cls.load_vectorizer_only(vectorizer_filepath)
        return cls(char_csv, vectorizer)

    @staticmethod
    def load_vectorizer_only(vectorizer_filepath):
        """a static method for loading the vectorizer from file

        Args:
            vectorizer_filepath (str): the location of the serialized vectorizer
        Returns:
            an instance of Vectorizer
        """
        with open(vectorizer_filepath) as fp:
            return Vectorizer.from_serializable(json.load(fp))

    def save_vectorizer(self, vectorizer_filepath):
        """saves the vectorizer to disk using json

        Args:
            vectorizer_filepath (str): the location to save the vectorizer
        """
        with open(vectorizer_filepath, "w") as fp:
            json.dump(self._vectorizer.to_serializable(), fp)

    def get_vectorizer(self):
        """ returns the vectorizer """
        return self._vectorizer

    def set_split(self, split="train"):
        """ selects the splits in the dataset using a column in the dataframe """
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]

    def __len__(self):
        return self._target_size

    def __getitem__(self, index):
        """the primary entry point method for PyTorch datasets

        Args:
            index (int): the index to the data point
        Returns:
            a dictionary holding the data point's features (x_data) and label (y_target)
        """
        row = self._target_df.iloc[index]

        mods_vector = \
            self._vectorizer.vectorize(row.mods, self._max_seq_length)

        personality_index = \
            self._vectorizer.personality_vocab.lookup_token(row.target)

        return {'x_data': mods_vector,
                'y_target': personality_index}

    def get_num_batches(self, batch_size):
        """Given a batch size, return the number of batches in the dataset

        Args:
            batch_size (int)
        Returns:
            number of batches in the dataset
        """
        return len(self) // batch_size

def generate_batches(dataset, batch_size, shuffle=True,
                    drop_last=True, device="cpu"):
    """
    A generator function which wraps the PyTorch DataLoader. It will
    ensure each tensor is on the write device location.
    """
    print("Hola")
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size,
                            shuffle=shuffle, drop_last=drop_last)
    print("Hola")
    print(dataloader)
    for data_dict in dataloader:
        print(data_dict)
        out_data_dict = {}
        for name, tensor in data_dict.items():
            out_data_dict[name] = data_dict[name].to(device)
        yield out_data_dict

In [6]:
def make_train_state(args):
    return {'stop_early': False,
            'early_stopping_step': 0,
            'early_stopping_best_val': 1e8,
            'learning_rate': args.learning_rate,
            'epoch_index': 0,
            'train_loss': [],
            'train_acc': [],
            'val_loss': [],
            'val_acc': [],
            'test_loss': -1,
            'test_acc': -1,
            'model_filename': args.model_state_file}

def update_train_state(args, model, train_state):
    """Handle the training state updates.

    Components:
     - Early Stopping: Prevent overfitting.
     - Model Checkpoint: Model is saved if the model is better

    :param args: main arguments
    :param model: model to train
    :param train_state: a dictionary representing the training state values
    :returns:
        a new train_state
    """

    # Guardamos por lo menos el modelo del primer epoch
    if train_state['epoch_index'] == 0:
        torch.save(model.state_dict(), train_state['model_filename'])
        train_state['stop_early'] = False

    # En el resto de epochs...
    elif train_state['epoch_index'] >= 1:
        loss_tm1, loss_t = train_state['val_loss'][-2:]

        # El loss ha empeorado
        if loss_t >= train_state['early_stopping_best_val']:
            # Update step
            train_state['early_stopping_step'] += 1
        # El loss ha mejorado
        else:
            # Guardamos el mejor modelo
            if loss_t < train_state['early_stopping_best_val']:
                torch.save(model.state_dict(), train_state['model_filename'])
                train_state['early_stopping_best_val'] = loss_t


            train_state['early_stopping_step'] = 0

        # Stop early ?
        train_state['stop_early'] = \
            train_state['early_stopping_step'] >= args.early_stopping_criteria

    return train_state

def compute_accuracy(y_pred, y_target):
    _, y_pred_indices = y_pred.max(dim=1)
    n_correct = torch.eq(y_pred_indices, y_target).sum().item()
    return n_correct / len(y_pred_indices) * 100

In [7]:
#TODO: Reemplaza el path por el tuyo.
dataset = CharDataset.load_dataset_and_make_vectorizer('../../../Datasets/dataset_clasificador_final/classify_char_raw.csv')
#Respuestas a las preguntas
mods_vocab = dataset.get_vectorizer().get_mods_vocab()
personality_vocab = dataset.get_vectorizer().personality_vocab
print(f"El token que índice 8 es {mods_vocab.lookup_index(8)}.")
print(f"El índice del unknown word es {mods_vocab.lookup_token('')}.")
print(f"El objeto vocabulario de mods tiene un size de {mods_vocab}.")
#Añadimos un nuevo token al vocabulario por lo que deberíamos ver que el tamaño ha aumentado.
mods_vocab.add_token("deusto")
print(f"El objeto vocabulario de mods tiene un size de {mods_vocab}.")
print(f"El objeto vocabulario de personalidades tiene un size de {personality_vocab}.")




El token que índice 8 es able.
El índice del unknown word es 1.
El objeto vocabulario de mods tiene un size de <ModsVocabulary(size=3863)>.
El objeto vocabulario de mods tiene un size de <ModsVocabulary(size=3864)>.
El objeto vocabulario de personalidades tiene un size de <Vocabulary(size=16)>.


# Clasificador

In [8]:
class CNN_NLP(nn.Module):
    """An 1D Convulational Neural Network for Sentence Classification.
    Adapted from: https://www.kaggle.com/code/williamlwcliu/cnn-text-classification-pytorch"""
    def __init__(self,
                 pretrained_embedding=None,
                 freeze_embedding=False,
                 vocab_size=None,
                 embed_dim=100,
                 filter_sizes=[3, 4, 5],
                 num_filters=[100, 100, 100],
                 num_classes=4,
                 dropout=0.5):
        """
        The constructor for CNN_NLP class.

        Args:
            pretrained_embedding (torch.Tensor): Pretrained embeddings with
                shape (vocab_size, embed_dim)
            freeze_embedding (bool): Set to False to fine-tune pretraiend
                vectors. Default: False
            vocab_size (int): Need to be specified when not pretrained word
                embeddings are not used.
            embed_dim (int): Dimension of word vectors. Need to be specified
                when pretrained word embeddings are not used. Default: 300
            filter_sizes (List[int]): List of filter sizes. Default: [3, 4, 5]
            num_filters (List[int]): List of number of filters, has the same
                length as `filter_sizes`. Default: [100, 100, 100]
            n_classes (int): Number of classes. Default: 2
            dropout (float): Dropout rate. Default: 0.5
        """

        super(CNN_NLP, self).__init__()
        # Embedding layer
        if pretrained_embedding is not None:
            pretrained_embeddings = torch.from_numpy(pretrained_embedding).float()
            self.emb = nn.Embedding(embedding_dim=embed_dim,
                                    num_embeddings=vocab_size,
                                    padding_idx=0,
                                    _weight=pretrained_embeddings)
        else:
            self.embed_dim = embed_dim
            self.emb = nn.Embedding(num_embeddings=vocab_size,
                                          embedding_dim=embed_dim,
                                          padding_idx=0,
                                          max_norm=5.0)
        # Conv Network
        self.conv1d_list = nn.ModuleList([
            nn.Conv1d(in_channels=embed_dim,
                      out_channels=num_filters[i],
                      kernel_size=filter_sizes[i])
            for i in range(len(filter_sizes))
        ])
        # Fully-connected layer and Dropout
        self.fc = nn.Linear(np.sum(num_filters), num_classes)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, input_ids):
        """Perform a forward pass through the network.

        Args:
            input_ids (torch.Tensor): A tensor of token ids with shape
                (batch_size, max_sent_length)

        Returns:
            logits (torch.Tensor): Output logits with shape (batch_size,
                n_classes)
        """

        # Get embeddings from `input_ids`. Output shape: (b, max_len, embed_dim)
        x_embed = self.emb(input_ids).float()

        # Permute `x_embed` to match input shape requirement of `nn.Conv1d`.
        # Output shape: (b, embed_dim, max_len)
        x_reshaped = x_embed.permute(0, 2, 1)

        # Apply CNN and ReLU. Output shape: (b, num_filters[i], L_out)
        x_conv_list = [F.relu(conv1d(x_reshaped)) for conv1d in self.conv1d_list]

        # Max pooling. Output shape: (b, num_filters[i], 1)
        x_pool_list = [F.max_pool1d(x_conv, kernel_size=x_conv.shape[2])
            for x_conv in x_conv_list]

        # Concatenate x_pool_list to feed the fully connected layer.
        # Output shape: (b, sum(num_filters))
        x_fc = torch.cat([x_pool.squeeze(dim=2) for x_pool in x_pool_list],
                         dim=1)

        # Compute logits. Output shape: (b, n_classes)
        logits = self.fc(self.dropout(x_fc))

        return logits

# Utils

In [9]:
def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=True, device="cpu"):
    """
    A generator function which wraps the PyTorch DataLoader. It will
      ensure each tensor is on the write device location.
    """
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size,
                            shuffle=shuffle, drop_last=drop_last)

    for data_dict in dataloader:
        out_data_dict = {}
        for name, tensor in data_dict.items():
            out_data_dict[name] = data_dict[name].to(device)
        yield out_data_dict

In [10]:
from gensim.models import KeyedVectors
from huggingface_hub import hf_hub_download


def set_seed_everywhere(seed, cuda):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if cuda:
        torch.cuda.manual_seed_all(seed)


def download_embedding_file(repo_id, filename, local_dir ):
    if not os.path.exists(local_dir + filename):
        print("Descargando los word embeddings")
        hf_hub_download(repo_id=repo_id, filename=filename, local_dir=local_dir)
    else:
        print("El fichero de embeddings ya está descargado.")


def get_embedding_file(embeddings_folder, filename):
    model = KeyedVectors.load_word2vec_format(embeddings_folder + filename)
    return model


def make_embedding_matrix( local_dir, filename, words_idx_to_token, emb_dim):
    embeddings_gensim = get_embedding_file(local_dir, filename)
    final_embeddings = np.zeros((len(words_idx_to_token), emb_dim))
    for idx in sorted(words_idx_to_token):
        if words_idx_to_token[idx] in embeddings_gensim.key_to_index:
            final_embeddings[idx, :] = embeddings_gensim[words_idx_to_token[idx]]
        else:
            embedding_i = torch.ones(1, emb_dim)
            torch.nn.init.xavier_uniform_(embedding_i)
            final_embeddings[idx, :] = embedding_i
            print(f"Word {words_idx_to_token[idx]} not in model")

    return final_embeddings

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
from argparse import Namespace
args = Namespace(
    # Data and Path hyper parameters
    char_csv="../../../Datasets/dataset_clasificador_final/classify_char_raw.csv",
    vectorizer_file="vectorizer.json",
    model_state_file="model_storage/model.pth",
    save_dir="model_storage/document_classification",
    # Model hyper parameters
    use_emb=True,
    repo_id = "Word2vec/wikipedia2vec_enwiki_20180420_100d", #Alternativa: "Word2vec/wikipedia2vec_enwiki_20180420_300d"
    filename = "enwiki_20180420_100d.txt", #Alternativa: "enwiki_20180420_300d.txt"
    local_dir="../../../Embeddings/Word2Vec/",
    embedding_size=100, #Depende del modelo de word embeddings que usemos. Alternativa: 300
    hidden_dim=100,
    num_channels=100, #AKA number of filters
    # Training hyper parameter
    seed=1337,
    learning_rate=0.001,
    dropout_p=0.1,
    batch_size=16,
    num_epochs=100,
    early_stopping_criteria=5,
    # Runtime option
    cuda=True,
    catch_keyboard_interrupt=True,
)

# Training loop

In [12]:
# Check CUDA
if not torch.cuda.is_available():
    args.cuda = False

args.device = torch.device("cuda" if args.cuda else "cpu")
print("Using CUDA: {}".format(args.cuda))

# Set seed for reproducibility
set_seed_everywhere(args.seed, args.cuda)

# create dataset and vectorizer
dataset = CharDataset.load_dataset_and_make_vectorizer(args.char_csv)
dataset.save_vectorizer(args.vectorizer_file)
vectorizer = dataset.get_vectorizer()

if args.use_emb:
    mods_vocab = dataset.get_vectorizer().mods_vocab
    print(mods_vocab)
    repo_id = args.repo_id
    filename = args.filename
    local_dir = args.local_dir
    download_embedding_file(repo_id, filename, local_dir)
    embeddings = make_embedding_matrix(local_dir, filename, mods_vocab._idx_to_token, args.embedding_size)
    print("Using pre-trained embeddings")
else:
    print("Not using pre-trained embeddings")
    embeddings = None

Using CUDA: False
<ModsVocabulary(size=3863)>
Descargando los word embeddings


enwiki_20180420_100d.txt: 100%|██████████| 3.49G/3.49G [21:20<00:00, 2.73MB/s]


Word <MASK> not in model
Word <UNK> not in model
Word crewleader not in model
Word youve not in model
Word theyll not in model
Word pewterarm not in model
Word obligator not in model
Word theyve not in model
Word theyd not in model
Word tindwyl not in model
Word buzzkills not in model
Word speedreading not in model
Word werent not in model
Word shouldnt not in model
Using pre-trained embeddings


In [13]:
classifier = CNN_NLP(pretrained_embedding=embeddings,
                        freeze_embedding=False,
                        vocab_size=len(vectorizer.mods_vocab),
                        embed_dim=args.embedding_size,
                        num_classes=len(vectorizer.personality_vocab))

In [14]:
classifier = classifier.to(args.device)
dataset.class_weights = dataset.class_weights.to(args.device)

loss_func = nn.CrossEntropyLoss(dataset.class_weights)
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                           mode='min', factor=0.5,
                                           patience=1)

train_state = make_train_state(args)

summary_shown = False

try:
    for epoch_index in range(args.num_epochs):
        train_state['epoch_index'] = epoch_index

        # Iterate over training dataset

        # setup: batch generator, set loss and acc to 0, set train mode on

        dataset.set_split('train')
        batch_generator = generate_batches(dataset,
                                           batch_size=args.batch_size,
                                           device=args.device)
        running_loss = 0.0
        running_acc = 0.0
        classifier.train()

        for batch_index, batch_dict in enumerate(batch_generator):
            if not summary_shown:
                print(summary(classifier, input_data=[batch_dict['x_data']]))
                summary_shown = True
            # the training routine is these 5 steps:

            # --------------------------------------
            # step 1. zero the gradients
            optimizer.zero_grad()

            # step 2. compute the output
            y_pred = classifier(batch_dict['x_data'])

            # step 3. compute the loss
            loss = loss_func(y_pred, batch_dict['y_target'])
            loss_t = loss.item()
            running_loss += (loss_t - running_loss) / (batch_index + 1)

            # step 4. use loss to produce gradients
            loss.backward()

            # step 5. use optimizer to take gradient step
            optimizer.step()
            # -----------------------------------------
            # compute the accuracy
            acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
            running_acc += (acc_t - running_acc) / (batch_index + 1)


        train_state['train_loss'].append(running_loss)
        train_state['train_acc'].append(running_acc)

        # Iterate over val dataset

        # setup: batch generator, set loss and acc to 0; set eval mode on
        dataset.set_split('val')
        batch_generator = generate_batches(dataset,
                                           batch_size=args.batch_size,
                                           device=args.device)
        running_loss = 0.
        running_acc = 0.
        classifier.eval()

        for batch_index, batch_dict in enumerate(batch_generator):

            # compute the output
            y_pred =  classifier(batch_dict['x_data'])


            # step 3. compute the loss
            loss = loss_func(y_pred, batch_dict['y_target'])
            loss_t = loss.item()
            running_loss += (loss_t - running_loss) / (batch_index + 1)

            # compute the accuracy
            acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
            running_acc += (acc_t - running_acc) / (batch_index + 1)


        train_state['val_loss'].append(running_loss)
        train_state['val_acc'].append(running_acc)
        print(f"In epoch {epoch_index} el vall_acc es {running_acc} y el val_loss es {running_loss}" )

        train_state = update_train_state(args=args, model=classifier,
                                         train_state=train_state)

        scheduler.step(train_state['val_loss'][-1])
        print(f"Early stopping state is {train_state['early_stopping_step']}")
        if train_state['stop_early']:
            break

except KeyboardInterrupt:
    print("Exiting loop")


Layer (type:depth-idx)                   Output Shape              Param #
CNN_NLP                                  [16, 16]                  --
├─Embedding: 1-1                         [16, 20, 100]             386,300
├─ModuleList: 1-2                        --                        --
│    └─Conv1d: 2-1                       [16, 100, 18]             30,100
│    └─Conv1d: 2-2                       [16, 100, 17]             40,100
│    └─Conv1d: 2-3                       [16, 100, 16]             50,100
├─Dropout: 1-3                           [16, 300]                 --
├─Linear: 1-4                            [16, 16]                  4,816
Total params: 511,416
Trainable params: 511,416
Non-trainable params: 0
Total mult-adds (Units.MEGABYTES): 38.66
Input size (MB): 0.00
Forward/backward pass size (MB): 0.91
Params size (MB): 2.05
Estimated Total Size (MB): 2.96
In epoch 0 el vall_acc es 5.208333333333333 y el val_loss es 2.766687492529551
Early stopping state is 0
In epoch 1 e

In [15]:
# compute the loss & accuracy on the test set using the best available model

classifier.load_state_dict(torch.load(train_state['model_filename']))

classifier = classifier.to(args.device)
dataset.class_weights = dataset.class_weights.to(args.device)
loss_func = nn.CrossEntropyLoss(dataset.class_weights)

dataset.set_split('test')
batch_generator = generate_batches(dataset,
                                   batch_size=args.batch_size,
                                   device=args.device)
running_loss = 0.
running_acc = 0.
classifier.eval()

for batch_index, batch_dict in enumerate(batch_generator):
    # compute the output
    y_pred =  classifier(batch_dict['x_data'])

    # compute the loss
    loss = loss_func(y_pred, batch_dict['y_target'])
    loss_t = loss.item()
    running_loss += (loss_t - running_loss) / (batch_index + 1)

    # compute the accuracy
    acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
    running_acc += (acc_t - running_acc) / (batch_index + 1)

train_state['test_loss'] = running_loss
train_state['test_acc'] = running_acc
print("Test loss: {};".format(train_state['test_loss']))
print("Test Accuracy: {}".format(train_state['test_acc']))

Test loss: 1.9080875118573508;
Test Accuracy: 37.5
