In [157]:
from torchtext import data
import spacy
import visdom
import numpy as np

vis = visdom.Visdom()
nlp = spacy.load('es')

# Intro

En esta fase vamos a cargar los datos, procesarlos y visualizarlos.

Para la visualización vamos a utilizar Visdom para lo que no tenemos que olvidar lanzar el servidor de visualizaciones, ejecutando:
 
``` 
python -m visdom.server

```

# Funciones visualización

Hemos creado algunas funciones de visualización para ayudarnos a la hora de procesar los datos y tomar decisiones en la configuración de nuestras 'features' 
y parámetros del modelo predictivo.

# Funciones de procesamiento de texto


Un aspecto clave ante cualquier problema de PLN es la manera en la que procesamos nuestros datos en crudo. Lo primero que tenemos que hacer es definir la manera en la que separamos el texto en los simbolos que contiene. Para ello vamos a definir una función de tokenización:

In [158]:
def tokenize_with_filter(filter):
    def tokenize(text):
        return [t.lower_ for t in nlp(text) if filter(t)]
    return tokenize

# Carga de datos y batches

Un aspecto clave ante cualquier problema de PLN es la manera en la que procesamos nuestros datos en crudo. Lo primero que tenemos que hacer es definir la manera en la que separamos el texto en los simbolos que contiene. Para ello vamos a definir una función de tokenización:

In [159]:
DATASET_PATH = '/Users/oeg/dev/recognai/data/datasets/cosetdata/'

# Primero definimos los campos del dataset de entrenamiento
f = (lambda t: not t.is_stop and t.is_alpha) # Examples: t.is_alpha, full documentation at:

twitter_id = data.Field()
TEXT = data.Field(tokenize=tokenize_with_filter(f))
LABEL = data.Field(sequential=False) 

trainset = data.TabularDataset(path= DATASET_PATH + 'coset-train.csv',
                            format='csv',
                            fields= [('id', None), ('text', TEXT),('label',LABEL)],
                            skip_header=True)

devset = data.TabularDataset(path= DATASET_PATH + 'coset-dev.csv',
                            format='csv',
                            fields= [('id', None), ('text', TEXT),('label',LABEL)],
                            skip_header=True)

BATCH_SIZE = 32

train_iter, dev_iter = data.BucketIterator.splits((trainset, devset),
                                                  batch_size=BATCH_SIZE,
                                                  sort_key=lambda x: len(x.text),
                                                  device=-1,
                                                  repeat=False)

# Let's build the vocabs
TEXT.build_vocab(trainset)
LABEL.build_vocab(trainset)
# Input dimensions are defined by the len of the input vocab
input_dim = len(TEXT.vocab)

hidden_size = 500
# Output dimensions are two: 
output_dim = len(LABEL.vocab)


# Definición del modelo

Now we have our data ready, let's start defining our neural network structure.
The core neural network components in Pytorch are in the nn module. This modules include typical layers: Linear, RNNs, CNNs, etc. which can be combined to create a multilayer neural network.
Every model we create extends the basid nn.Module, and implements at least two methods:
init: Defines the core variables of our network.
forward: Defines the "forward" pass. This is, the computation made by our network to transform the input data into a prediction output.
Let's create our first model, a simple multiclassifier classifier.

In [160]:
# Now we can start defining our predictive model. The first step is to define the 'architecture' of the model
# and its main operations with the data that goes through the network.

# The core neural network components of Pytorch belong to the nn module
import torch
import torch.nn as nn
import torch.nn.functional as F

# Let's start with a very simple baseline model
class BaseClassifier(nn.Module):
    
    def __init__(self, input_dim, hidden_size, output_dim, batch_size=32, debug=None):
        super(BaseClassifier, self).__init__()
        self.embed = nn.Embedding(input_dim, hidden_size)
        self.fc1 = nn.Linear(hidden_size, hidden_size) 
        self.fc2 = nn.Linear(hidden_size, output_dim)
        
        self.debug = debug
    
    def forward(self, input):
        # The forward pass defines how the input data is processed by the network
        # to make a prediction
        if (self.debug):
            print(input)
        embed = self.embed(input)
        if (self.debug):
            print(embed)
        # This operation summarizes a 3D tensor 200x32x200 into a 32x200 matrix
        out = F.max_pool1d(embed.transpose(0,2), input.size()[0]).squeeze().transpose(0,1)
        out = F.relu(self.fc1(out))
        out = F.dropout(out, training=self.training)
        out = F.relu(self.fc2(out))
        return out

# Proceso de entrenamiento
Now that we have defined the architecture of our network, we can start defining our training process. Ideally, this training process should be independent of our model architecture. A very naive approach would be to define a function which receives a model instance. Let's do this:. But first, let's define an auxiliary method for showing progress (do not worry much about this method now)

In [161]:
lot = vis.line(
            X=torch.zeros((1,)).cpu(),
            Y=torch.zeros((1, 3)).cpu(),
            opts=dict(
                xlabel='Iteration',
                ylabel='Loss',
                title='Loss',
                legend=['Train Loss', 'Dev Loss', 'Train Loss']
            )
        )
def log(time, epoch, iterations, batch_idx, train_iter, loss, train_acc, dev_loss=None, dev_acc=None):
    header = '  Time Epoch Iteration Progress    (%Epoch)   Loss   Dev/Loss     Accuracy  Dev/Accuracy \n'
    dev_log_template = ' '.join('{:>6.0f},{:>5.0f},{:>9.0f},{:>5.0f}/{:<5.0f} {:>7.0f}%,{:>8.6f},{:8.6f},{:12.4f},{:12.4f}'.split(','))
    log_template =     ' '.join('{:>6.0f},{:>5.0f},{:>9.0f},{:>5.0f}/{:<5.0f} {:>7.0f}%,{:>8.6f},{},{:12.4f},{}'.split(','))
    print(header)
    if(dev_loss):
        print(dev_log_template.format(time,
                    epoch, iterations, 1+batch_idx, len(train_iter),
                    100. * (1+batch_idx) / len(train_iter), loss.data[0], dev_loss, train_acc, dev_acc))
       
        vis.line(
                X=torch.ones((1, 3)).cpu() * iterations,
                Y=torch.Tensor([loss.data[0], dev_loss, loss.data[0]]).unsqueeze(0).cpu(),
                win=lot,
                update='append'
            )
    else:
        
        print(log_template.format(time,
                    epoch, iterations, 1+batch_idx, len(train_iter),
                    100. * (1+batch_idx) / len(train_iter), loss.data[0], ' '*8, train_acc, ' '*12))
    print()

In [162]:
def train(model, batches, num_epochs=2, learning_rate = 0.001, log_every = 100, dev_every = 100):
    import time
    train_iter, dev_iter = batches
    
    # First we need to define our loss/objective function
    # Cross Entropy Loss already applies softmax
    criterion = nn.CrossEntropyLoss()
    # And the optimizer (Gradient-descent methods)
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
    from torch.autograd import Variable
    # Now the code for training our network
    iterations = 0
    start = time.time()
    for epoch in range(num_epochs):
        train_iter.init_epoch()
        n_correct, n_total = 0, 0
        for batch_idx, batch in enumerate(train_iter):
            optimizer.zero_grad()
            output = model(batch.text)
            iterations += 1
            n_correct += (torch.max(output, 1)[1].view(batch.label.size()).data == batch.label.data).sum()
            n_total += batch.batch_size
            train_acc = 100. * n_correct/n_total
            loss = criterion(output, batch.label)
            loss.backward()
            optimizer.step()
            if iterations % log_every == 0:
                log(time.time()-start, 
                    epoch, 
                    iterations, 
                    batch_idx, 
                    train_iter, 
                    loss, 
                    train_acc)
            if iterations % dev_every == 0:
                model.eval(); dev_iter.init_epoch()
                n_dev_correct, dev_loss, n_total_dev = 0, 0, 0
                for dev_batch_idx, dev_batch in enumerate(dev_iter):
                    answer = model(dev_batch.text)
                    n_total_dev += batch.batch_size
                    n_dev_correct += (torch.max(answer, 1)[1].view(dev_batch.label.size()).data == dev_batch.label.data).sum()
                    dev_loss = criterion(answer, dev_batch.label)
                dev_acc = 100. * n_dev_correct / n_total_dev
                log(time.time()-start, 
                        epoch, 
                        iterations, 
                        batch_idx, 
                        train_iter, 
                        loss, 
                        train_acc,
                        dev_loss.data[0],
                        dev_acc)

# Entrenamiento

We can finally use our method for training and try out different models. Let's start with our simple classifier.

In [163]:
model = BaseClassifier(input_dim, hidden_size, output_dim)


# Let's call the training process
train(model, (train_iter, dev_iter),num_epochs=1000, learning_rate = 0.001)



  Time Epoch Iteration Progress    (%Epoch)   Loss   Dev/Loss     Accuracy  Dev/Accuracy 

     1     1       100    29/71         41% 1.671151               34.9138             

  Time Epoch Iteration Progress    (%Epoch)   Loss   Dev/Loss     Accuracy  Dev/Accuracy 

     1     1       100    29/71         41% 1.671151 1.440197      34.9138      34.3750





  Time Epoch Iteration Progress    (%Epoch)   Loss   Dev/Loss     Accuracy  Dev/Accuracy 

     3     2       200    58/71         82% 1.495303               35.4526             

  Time Epoch Iteration Progress    (%Epoch)   Loss   Dev/Loss     Accuracy  Dev/Accuracy 

     3     2       200    58/71         82% 1.495303 1.412732      35.4526      33.5938

  Time Epoch Iteration Progress    (%Epoch)   Loss   Dev/Loss     Accuracy  Dev/Accuracy 

     4     4       300    16/71         23% 1.508660               36.3281             

  Time Epoch Iteration Progress    (%Epoch)   Loss   Dev/Loss     Accuracy  Dev/Accuracy 

     4     4       300    16/71         23% 1.508660 1.420100      36.3281      34.7656

  Time Epoch Iteration Progress    (%Epoch)   Loss   Dev/Loss     Accuracy  Dev/Accuracy 

     6     5       400    45/71         63% 1.443586               36.0417             

  Time Epoch Iteration Progress    (%Epoch)   Loss   Dev/Loss     Accuracy  Dev/Accuracy 

     6   