In [325]:
# Instalamos torchtext que nos facilitará la vida en el pre-procesamiento del formato ConLL.
# !pip install -U torchtext==0.10.0

# Librerias

In [326]:
import random
import os, shutil
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import gzip
import os
import shutil
import requests

from operator import attrgetter
from torchtext import vocab, datasets ,data
#from torchtext.legacy import data #, datasets
from seqeval.metrics import f1_score, precision_score, recall_score

In [327]:
# Garantizar reproducibilidad de los experimentos
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [328]:
# Primer Field: TEXT. Representan los tokens de la secuencia
TEXT = data.Field(lower=False) 

# Segundo Field: NER_TAGS. Representan los Tags asociados a cada palabra.
NER_TAGS = data.Field(unk_token=None)
fields = (("text", TEXT), ("nertags", NER_TAGS))

print(fields)

(('text', <torchtext.data.field.Field object at 0x7fdf024f0d90>), ('nertags', <torchtext.data.field.Field object at 0x7fdf024f14e0>))


####  **SequenceTaggingDataset**

In [329]:
# train_data_ft, valid_data_ft, test_data_ft = datasets.SequenceTaggingDataset.splits(
#     path="./",
#     train="corpus_recetas_train.txt",
#     validation="corpus_recetas_val.txt",
#     test="corpus_recetas_test.txt",
#     fields=fields,
#     encoding="utf-8",
#     separator=" "
# )
# train_data_ft, valid_data_ft, test_data_ft = datasets.SequenceTaggingDataset.splits(
#     path="./",
#     train="corpus_train.txt",
#     validation="corpus_val.txt",
#     #test="corpus_test.txt",
#     test="corpus_test.txt",
#     fields=fields,
#     encoding="utf-8",
#     separator=" " 
# )
train_data_ft, valid_data_ft, test_data_ft = datasets.SequenceTaggingDataset.splits(
    path="./",
    train="corpus_train.txt",
    validation="corpus_val.txt",
    test="corpus_test_pred.txt",
    fields=fields,
    encoding="utf-8",
    separator="-X- _ " 
    # separator="-X-"
)

train_data, valid_data, test_data = datasets.SequenceTaggingDataset.splits(
    path="./",
    train="corpus_ER_train.txt",
    validation="corpus_ER_test.txt", # val y test son iguales pero no importa porque solo se usa val para entrenar, y luego se hace fine tuning con las variables ft
    test="corpus_ER_test.txt",
    fields=fields,
    encoding="utf-8",
    separator=" "
)

# train_data, valid_data, test_data = datasets.SequenceTaggingDataset.splits(
#     path="./",
#     train="corpus_ER_train_v2.txt",
#     validation="corpus_ER_test_v2.txt", # val y test son iguales pero no importa porque solo se usa val para entrenar, y luego se hace fine tuning con las variables ft
#     test="corpus_ER_test_v2.txt",
#     fields=fields,
#     encoding="utf-8",
#     separator=" "
# )

In [331]:
print(f"Numero de ejemplos de entrenamiento: {len(train_data)}")
print(f"Número de ejemplos de validación: {len(valid_data)}")
print(f"Número de ejemplos de test: {len(test_data)}")

Numero de ejemplos de entrenamiento: 85245
Número de ejemplos de validación: 51182
Número de ejemplos de test: 51182


Visualizemos un ejemplo

In [332]:
random_item_idx = random.randint(0, len(train_data))
random_example = train_data.examples[random_item_idx]
list(zip(random_example.text, random_example.nertags))

[('NEOSTIGMINA', 'B-ACTIVE_PRINCIPLE'),
 ('0,5', 'O'),
 ('MG/ML', 'O'),
 ('solución', 'B-FORMA_FARMA'),
 ('inyectable', 'I-FORMA_FARMA'),
 ('AMPOLLA', 'O'),
 ('1', 'O'),
 ('ML', 'O'),
 ('1', 'O'),
 ('UNIDAD', 'O'),
 ('INTRAVENOSA', 'B-ADMIN'),
 ('cada', 'B-PERIODICITY'),
 ('24', 'I-PERIODICITY'),
 ('horas', 'I-PERIODICITY')]

#### **Construir los vocabularios para el texto y las etiquetas**

Los vocabularios son los objetos que contienen todos los tokens (de entrenamiento) posibles para ambos fields. El siguiente paso consiste en construirlos. Para esto, hacemos uso del método `Field.build_vocab` sobre cada uno de nuestros `fields`. 

In [333]:
TEXT.build_vocab(train_data)
NER_TAGS.build_vocab(train_data,train_data_ft)
# NER_TAGS.build_vocab(train_data_ft)

In [334]:
print(f"Tokens únicos en TEXT: {len(TEXT.vocab)}")
print(f"Tokens únicos en NER_TAGS: {len(NER_TAGS.vocab)}")

Tokens únicos en TEXT: 5678
Tokens únicos en NER_TAGS: 12


In [335]:
#Veamos las posibles etiquetas que hemos cargado:
NER_TAGS.vocab.itos

['<pad>',
 'O',
 'I-PERIODICITY',
 'B-PERIODICITY',
 'B-ACTIVE_PRINCIPLE',
 'B-FORMA_FARMA',
 'B-ADMIN',
 'I-FORMA_FARMA',
 'I-DURATION',
 'B-DURATION',
 'I-ACTIVE_PRINCIPLE',
 'I-ADMIN']

In [336]:
# Tokens mas frecuentes (Será necesario usar stopwords, eliminar símbolos o nos entregan información (?) )
TEXT.vocab.freqs.most_common(10)

[('MG', 61740),
 ('ML', 55473),
 ('cada', 50536),
 ('horas', 47976),
 ('1', 44157),
 ('ORAL', 44050),
 ('AMPOLLA', 32629),
 ('solución', 30173),
 ('comprimido', 28550),
 ('FRASCO', 28149)]

In [337]:
# Seteamos algunas variables que nos serán de utilidad mas adelante...
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

PAD_TAG_IDX = NER_TAGS.vocab.stoi[NER_TAGS.pad_token]
O_TAG_IDX = NER_TAGS.vocab.stoi['O']
# O_TAG_IDX2 = NER_TAGS.vocab.stoi[' O']
#O_TAG_IDX = NER_TAGS.vocab.stoi['O']

#### **Frecuencia de los Tags**

Visualizemos rápidamente las cantidades y frecuencias de cada tag:

In [338]:
def tag_percentage(tag_counts):
    
    total_count = sum([count for tag, count in tag_counts])
    tag_counts_percentages = [(tag, count, count/total_count) for tag, count in tag_counts]
  
    return tag_counts_percentages

print("Tag Ocurrencia Porcentaje\n")

for tag, count, percent in tag_percentage(NER_TAGS.vocab.freqs.most_common()):
    print(f"{tag}\t{count}\t{percent*100:4.1f}%")

Tag Ocurrencia Porcentaje

O	703243	54.1%
I-PERIODICITY	130090	10.0%
B-PERIODICITY	80764	 6.2%
B-ACTIVE_PRINCIPLE	75608	 5.8%
B-FORMA_FARMA	74462	 5.7%
B-ADMIN	74023	 5.7%
I-FORMA_FARMA	73811	 5.7%
I-DURATION	45030	 3.5%
B-DURATION	22278	 1.7%
I-ACTIVE_PRINCIPLE	19889	 1.5%
I-ADMIN	1380	 0.1%


#### **Configuramos pytorch y dividimos los datos.**

Importante: si tienes problemas con la ram de la gpu, disminuye el tamaño de los batches

In [339]:
BATCH_SIZE = 32 #16  # disminuir si hay problemas de ram.

# Usar cuda si es que está disponible.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using', device)

# Dividir datos entre entrenamiento y test. Si van a hacer algún sort no puede ser sobre
# el conjunto de testing ya que al hacer sus predicciones sobre el conjunto de test sin etiquetas
# debe conservar el orden original para ser comparado con los golden_labels. 

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=64,
    device=device,
    sort=False,
)

train_iterator_ft, valid_iterator_ft, test_iterator_ft = data.BucketIterator.splits(
    (train_data_ft, valid_data_ft, test_data_ft),
    batch_size=BATCH_SIZE,
    device=device,
    sort=False,
)

# test_loader_df =  pd.read_csv("corpus_test.txt", sep="\n", header=None)
# test_loader = torch.utils.data.DataLoader(test_loader_df, batch_size=BATCH_SIZE)

Using cuda


#### **Métricas de evaluación**

In [340]:
# Definimos las métricas
def calculate_metrics(preds, y_true, pad_idx=PAD_TAG_IDX, o_idx=O_TAG_IDX):
    """
    Calcula precision, recall y f1 de cada batch.
    """

    # Obtener el indice de la clase con probabilidad mayor. (clases)
    y_pred = preds.argmax(dim=1, keepdim=True)

    # filtramos <pad> para calcular los scores.
    mask = [(y_true != pad_idx)]
    y_pred = y_pred[mask]
    y_true = y_true[mask]

    # traemos a la cpu
    y_pred = y_pred.view(-1).to('cpu').numpy()
    y_true = y_true.to('cpu').numpy()
    y_pred = [[NER_TAGS.vocab.itos[v] for v in y_pred]]
    y_true = [[NER_TAGS.vocab.itos[v] for v in y_true]]
    
    # calcular scores
    f1 = f1_score(y_true, y_pred, mode='strict')
    precision = precision_score(y_true, y_pred, mode='strict')
    recall = recall_score(y_true, y_pred, mode='strict')

    return precision, recall, f1

## Modelos

### **Modelo Baseline**

In [341]:
# Definir la red
class NER_RNN(nn.Module):
    def __init__(self, 
                 input_dim, 
                 embedding_dim, 
                 hidden_dim, 
                 output_dim,
                 n_layers, 
                 bidirectional, 
                 dropout, 
                 pad_idx):

        super().__init__()

        # Capa de embedding
        self.embedding = nn.Embedding(input_dim,
                                      embedding_dim,
                                      padding_idx=pad_idx,
                                      )

        # Capa LSTM
        self.lstm = nn.LSTM(embedding_dim,
                           hidden_dim,
                           num_layers=n_layers,
                           bidirectional=bidirectional, 
                           dropout = dropout if n_layers > 1 else 0)

        # Capa de salida
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim,
                            output_dim)

        # Dropout
        self.dropout = nn.Dropout(dropout)
        self._init_weights

    def _init_weights(self, m):
        # Inicializamos los pesos como aleatorios
        for name, param in m.named_parameters():
            nn.init.normal_(param.data, mean=0, std=0.1) 
        
        # Seteamos como 0 los embeddings de UNK y PAD.
        self.embedding.weight.data[UNK_IDX] = torch.zeros(self.embedding_dim)
        self.embedding.weight.data[PAD_IDX] = torch.zeros(self.embedding_dim)

    def forward(self, text):

        #text = [sent len, batch size]

        # Convertir lo enviado a embedding
        embedded = self.dropout(self.embedding(text))
        outputs, (hidden, cell) = self.lstm(embedded)
        #embedded = [sent len, batch size, emb dim]
        # Pasar los embeddings por la rnn (LSTM)
        #output = [sent len, batch size, hid dim * n directions]
        #hidden/cell = [n layers * n directions, batch size, hid dim]
        # Predecir usando la capa de salida.
        predictions = self.fc(self.dropout(outputs))
        #predictions = [sent len, batch size, output dim]

        return predictions

--------------------
### Modelo 1

In [343]:
# Caergamos Glove o fast text
FASTTEXT_FILE = "glove300d.vec"
# Se descargan vectores glove o fasttext del github del dcc
# https://github.com/dccuchile/spanish-word-embeddings

if not os.path.exists(FASTTEXT_FILE):
    print(f"Descargando {FASTTEXT_FILE}")
    url = "http://dcc.uchile.cl/~jperez/word-embeddings/glove-sbwc.i25.vec.gz"
    #url = "https://s06.imfd.cl/04/fasttext-sbwc.vec.gz"
    response = requests.get(url, stream=True)
    try:
        with gzip.open(response.raw, "rb") as f_in:
            with open(FASTTEXT_FILE, "wb") as f_out:
                # Funcion para copiar de un file-like object a otro
                shutil.copyfileobj(f_in, f_out)
    except Exception as e:
        os.remove(FASTTEXT_FILE)
        raise e

 #dimensión es de 300 y tiene 855,380 vectores pre-entrenados. 

In [344]:
embeddings = vocab.Vectors(FASTTEXT_FILE)
TEXT.vocab.set_vectors(*attrgetter("stoi", "vectors", "dim")(embeddings))

In [345]:
# BATCH_SIZE = 32

# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# print('Using', device)

# train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
#     (train_data, valid_data, test_data),
#     batch_size=BATCH_SIZE,
#     device=device,
#     sort=False,
# )

In [346]:
class Modelo1_RNN(nn.Module):
    def __init__(self, 
                 input_dim, 
                 embedding_dim, 
                 hidden_dim, 
                 output_dim,
                 n_layers, 
                 bidirectional, 
                 dropout, 
                 pad_idx):

        super().__init__()

        # Capa de embedding
        self.embedding = nn.Embedding.from_pretrained(
            embedding_weights.clone(), freeze = False)
        # Capa LSTM
        self.lstm = nn.LSTM(embedding_dim,
                           hidden_dim,
                           num_layers=n_layers,
                           bidirectional=bidirectional, 
                           dropout = dropout if n_layers > 1 else 0)

        # Capa de salida
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim,
                            output_dim)

        # Dropout
        self.dropout = nn.Dropout(dropout)
        

    def forward(self, text):
        #text = [sent len, batch size]
        # Convertir lo enviado a embedding
        embedded = self.dropout(self.embedding(text))
        outputs, (hidden, cell) = self.lstm(embedded)
        #embedded = [sent len, batch size, emb dim]
        # Pasar los embeddings por la rnn (LSTM)
        #output = [sent len, batch size, hid dim * n directions]
        #hidden/cell = [n layers * n directions, batch size, hid dim]
        # Predecir usando la capa de salida.
        predictions = self.fc(self.dropout(outputs))
        #predictions = [sent len, batch size, output dim]
        return predictions

---------------

### Modelo 2

In [348]:
# BATCH_SIZE = 62 # disminuir si hay problemas de ram.

# # Usar cuda si es que está disponible.
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# print('Using', device)

# # Dividir datos entre entrenamiento y test. Si van a hacer algún sort no puede ser sobre
# # el conjunto de testing ya que al hacer sus predicciones sobre el conjunto de test sin etiquetas
# # debe conservar el orden original para ser comparado con los golden_labels. 

# train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
#     (train_data, valid_data, test_data),
#     batch_size=BATCH_SIZE,
#     device=device,
#     sort=False,
# )

In [349]:
class Modelo2_RNN(nn.Module):
    def __init__(self, 
                 input_dim, 
                 embedding_dim, 
                 hidden_dim, 
                 output_dim,
                 n_layers, 
                 bidirectional, 
                 dropout, 
                 pad_idx):

        super().__init__()

        # Capa de embedding
        self.embedding = nn.Embedding.from_pretrained(embedding_weights.clone(), freeze = False)

        # Capa LSTM
        self.lstm = nn.LSTM(embedding_dim,
                           hidden_dim,
                           num_layers=n_layers,
                           bidirectional=bidirectional, 
                           dropout = dropout if n_layers > 1 else 0)

        # Capa de salida
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim,
                            output_dim)
        #self.relu = nn.ReLU()
        # Dropout
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        #text = [sent len, batch size]
        # Convertir lo enviado a embedding
        embedded = self.dropout(self.embedding(text))
        outputs, (hidden, cell) = self.lstm(embedded)
        #embedded = [sent len, batch size, emb dim]
        # Pasar los embeddings por la rnn (LSTM)
        #output = [sent len, batch size, hid dim * n directions]
        #hidden/cell = [n layers * n directions, batch size, hid dim]
        # Predecir usando la capa de salida.
        predictions = self.fc(self.dropout(outputs))
        #predictions = [sent len, batch size, output dim]

        return predictions

In [350]:
# # tamaño del vocabulario. recuerden que la entrada son vectores bag of word(one-hot).
# INPUT_DIM = len(TEXT.vocab)
# EMBEDDING_DIM = 300  # dimensión de los embeddings.
# HIDDEN_DIM = 196  # dimensión de la capas LSTM
# OUTPUT_DIM = len(NER_TAGS.vocab)  # número de clases

# N_LAYERS = 2  # número de capas.
# DROPOUT = 0.4
# BIDIRECTIONAL = True
# TAG_PAD_IDX = NER_TAGS.vocab.stoi[NER_TAGS.pad_token]

# modelo_2 = Modelo2_RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, 
#                          N_LAYERS, BIDIRECTIONAL, DROPOUT, PAD_IDX)

# model_name_2 = 'Modelo_2'
# n_epochs_2 = 30 #no importa porq hay early stop
# criterion_2 = nn.CrossEntropyLoss(ignore_index = TAG_PAD_IDX)

---------------


### Modelo 3

In [351]:
# Para crear la red debemos heredar desde nn.Module
class GruNet(nn.Module):
    def __init__(self, 
                 input_dim, 
                 embedding_dim, 
                 hidden_dim, 
                 output_dim,
                 n_layers, 
                 bidirectional, 
                 dropout, 
                 pad_idx):
      super().__init__()

      # Capa de embedding
      self.embedding = nn.Embedding(input_dim,
                                    embedding_dim,
                                    padding_idx=pad_idx,
                                    )
      
      # Capa GRU
      self.gru = nn.GRU(embedding_dim, hidden_dim, n_layers, batch_first=True, dropout = dropout if n_layers > 1 else 0, bidirectional=bidirectional)
      # Capa de salida
      self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
      #self.relu = nn.ReLU()
      # Dropout
      self.dropout = nn.Dropout(dropout)




    # Definimos las operaciones de las capas sobre el input en el forward.
    def forward(self, text): 
      embedded = self.embedding(text)
      outputs, hidden = self.gru(embedded)
      predictions = self.fc(self.dropout(outputs))
      return predictions


In [352]:
# # tamaño del vocabulario. recuerden que la entrada son vectores bag of word(one-hot).
# INPUT_DIM = len(TEXT.vocab)
# EMBEDDING_DIM = 300  # dimensión de los embeddings.
# HIDDEN_DIM = 256  # dimensión de la capas LSTM
# OUTPUT_DIM = len(NER_TAGS.vocab)  # número de clases

# N_LAYERS = 2  # número de capas.
# DROPOUT = 0.3
# BIDIRECTIONAL = False

# # Creamos nuestro modelo.
# modelo_3 = GruNet(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM,
#                          N_LAYERS, BIDIRECTIONAL, DROPOUT, PAD_IDX)

# model_name_3 = 'Gru_Model'  # nombre que tendrá el modelo guardado...
# n_epochs_3 = 10


# # Loss: Cross Entropy
# TAG_PAD_IDX = NER_TAGS.vocab.stoi[NER_TAGS.pad_token]
# criterion_3 = nn.CrossEntropyLoss(ignore_index = TAG_PAD_IDX)



In [353]:
def function(fecha1,fecha2):
    Año_1=int(str(fecha1)[:2])
    Mes_1=int(str(fecha1)[-2:])
    Año_2=int(str(fecha2)[:2])
    Mes_2=int(str(fecha2)[-2:])

    dif=12*(Año_2-Año_1)+(Mes_2-Mes_1)
    return dif 

## Funciones Entrenamiento

### **Definimos el entrenamiento de la red**


In [354]:
def train(model, iterator, optimizer, criterion, otag=O_TAG_IDX):
    epoch_loss = 0
    epoch_precision = 0
    epoch_recall = 0
    epoch_f1 = 0

    model.train()
    # Por cada batch del iterador de la época:
    for batch in iterator:

        # Extraemos el texto y los tags del batch que estamos procesado
        text = batch.text
        tags = batch.nertags

        # Reiniciamos los gradientes calculados en la iteración anterior
        optimizer.zero_grad()
        #text = [sent len, batch size]
        # Predecimos los tags del texto del batch.
        predictions = model(text)
        #predictions = [sent len, batch size, output dim]
        #tags = [sent len, batch size]

        # Reordenamos los datos para calcular la loss
        predictions = predictions.view(-1, predictions.shape[-1])
        tags = tags.view(-1)

        # print(len(predictions))
        # print(len(tags))

        #predictions = [sent len * batch size, output dim]
        # Calculamos el Cross Entropy de las predicciones con respecto a las etiquetas reales
        loss = criterion(predictions, tags)
        
        # Calculamos el accuracy
        precision, recall, f1 = calculate_metrics(predictions, tags, o_idx=otag)
        # Calculamos los gradientes
        loss.backward()
        # Actualizamos los parámetros de la red
        optimizer.step()

        # Actualizamos el loss y las métricas
        epoch_loss += loss.item()
        epoch_precision += precision
        epoch_recall += recall
        epoch_f1 += f1

    return epoch_loss / len(iterator), epoch_precision / len(iterator), epoch_recall / len(iterator), epoch_f1 / len(iterator)

### **Definimos la función de evaluación**

In [355]:
def evaluate(model, iterator, criterion, otag=O_TAG_IDX):

    epoch_loss = 0
    epoch_precision = 0
    epoch_recall = 0
    epoch_f1 = 0

    model.eval()

    # Indicamos que ahora no guardaremos los gradientes
    with torch.no_grad():
        # Por cada batch
        for batch in iterator:

            text = batch.text
            tags = batch.nertags

            # Predecimos
            predictions = model(text)

            predictions = predictions.view(-1, predictions.shape[-1])
            tags = tags.view(-1)

            # Calculamos el Cross Entropy de las predicciones con respecto a las etiquetas reales
            loss = criterion(predictions, tags)

            # Calculamos las métricas
            precision, recall, f1 = calculate_metrics(predictions, tags, o_idx=otag)

            # Actualizamos el loss y las métricas
            epoch_loss += loss.item()
            epoch_precision += precision
            epoch_recall += recall
            epoch_f1 += f1

    return epoch_loss / len(iterator), epoch_precision / len(iterator), epoch_recall / len(iterator), epoch_f1 / len(iterator)

In [356]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs


### **Entrenamiento de la red**

En este cuadro de código ejecutaremos el entrenamiento de la red.
Para esto, primero definiremos el número de épocas y luego por cada época, ejecutaremos `train` y `evaluate`.

**Importante: Reiniciar los pesos del modelo**

Si ejecutas nuevamente esta celda, se seguira entrenando el mismo modelo una y otra vez. 
Para reiniciar el modelo se debe ejecutar nuevamente la celda que contiene la función `init_weights`



In [357]:
def train_model(model, model_name, train_iterator, valid_iterator, criterion, optimizer, n_epochs, otag=O_TAG_IDX):
  global device

  model = model.to(device)
  criterion = criterion.to(device)
  
  #Agregar early stop
  best_valid_loss = float('inf')
  best_train_loss = float('inf')

  arrayTrainLoss = []
  arrayValidLoss = []

  for epoch in range(n_epochs):

    start_time = time.time()

    # Recuerdo: train_iterator y valid_iterator contienen el dataset dividido en batches.

    # Entrenar
    train_loss, train_precision, train_recall, train_f1 = train(model, train_iterator, optimizer, criterion, otag)

    # Evaluar (valid = validación)
    valid_loss, valid_precision, valid_recall, valid_f1 = evaluate(model, valid_iterator, criterion, otag)

    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    #Nuevo **
    arrayTrainLoss.append(train_loss)
    arrayValidLoss.append(valid_loss)

  #Acá nos aseguramos de que entrene al menos 6 épocas antes de detener el entrenamiento.
    if epoch < 7:
      if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        #nuevo
        best_train_loss = train_loss
        torch.save(model.state_dict(), '{}.pt'.format(model_name))
        print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
        print(
            f'\tTrain Loss: {train_loss:.3f} | Train f1: {train_f1:.2f} | Train precision: {train_precision:.2f} | Train recall: {train_recall:.2f}'
        )
        print(
            f'\t Val. Loss: {valid_loss:.3f} |  Val. f1: {valid_f1:.2f} |  Val. precision: {valid_precision:.2f} | Val. recall: {valid_recall:.2f}'
        )
      else :
        print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
        print(
            f'\tTrain Loss: {train_loss:.3f} | Train f1: {train_f1:.2f} | Train precision: {train_precision:.2f} | Train recall: {train_recall:.2f}'
        )
        print(
            f'\t Val. Loss: {valid_loss:.3f} |  Val. f1: {valid_f1:.2f} |  Val. precision: {valid_precision:.2f} | Val. recall: {valid_recall:.2f}'
        )

  #Early stop, ve que no vaya en aumento, según el avg de las últimas 4 épocas
    else:
      if train_loss < np.mean(arrayTrainLoss[-4:]) and valid_loss > np.mean(arrayValidLoss[-4:]):
        print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
        print(
            f'\tTrain Loss: {train_loss:.3f} | Train f1: {train_f1:.2f} | Train precision: {train_precision:.2f} | Train recall: {train_recall:.2f}'
        )
        print(
            f'\t Val. Loss: {valid_loss:.3f} |  Val. f1: {valid_f1:.2f} |  Val. precision: {valid_precision:.2f} | Val. recall: {valid_recall:.2f}'
        )
        print('Early Stop')
        break

      else:
        best_valid_loss = valid_loss
        #nuevo
        best_train_loss = train_loss
        torch.save(model.state_dict(), '{}.pt'.format(model_name))
        print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
        print(
            f'\tTrain Loss: {train_loss:.3f} | Train f1: {train_f1:.2f} | Train precision: {train_precision:.2f} | Train recall: {train_recall:.2f}'
        )
        print(
            f'\t Val. Loss: {valid_loss:.3f} |  Val. f1: {valid_f1:.2f} |  Val. precision: {valid_precision:.2f} | Val. recall: {valid_recall:.2f}'
        )
  return model, arrayTrainLoss, arrayValidLoss

In [358]:
# def init_weights(m):
#     # Inicializamos los pesos como aleatorios
#     for name, param in m.named_parameters():
#         nn.init.normal_(param.data, mean=0, std=0.1) 
        
#     # Seteamos como 0 los embeddings de UNK y PAD.
#     m.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
#     m.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

## Entrenamiento

### Baseline

In [359]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 300  # dimensión de los embeddings.
HIDDEN_DIM = 256  # dimensión de la capas LSTM
OUTPUT_DIM = len(NER_TAGS.vocab)  # número de clases
N_LAYERS = 3  # número de capas.
DROPOUT = 0.5
BIDIRECTIONAL = False


model = NER_RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM,
                         N_LAYERS, BIDIRECTIONAL, DROPOUT, PAD_IDX)
model_name = 'baseline'
model_name_ft = 'baseline_ft'

n_epochs = 2
n_epochs_ft = 50

TAG_PAD_IDX = NER_TAGS.vocab.stoi[NER_TAGS.pad_token]
optimizer = optim.Adam(model.parameters())
#optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
criterion = nn.CrossEntropyLoss(ignore_index = TAG_PAD_IDX)

#model.apply(init_weights(model))
print(f'El modelo actual tiene {count_parameters(model):,} parámetros entrenables.')

model, arrayTrainLoss, arrayValidLoss = train_model(model, model_name, train_iterator, valid_iterator, criterion, optimizer, n_epochs)
model_ft, arrayTrainLoss_ft, arrayValidLoss_ft = train_model(model, model_name_ft, train_iterator_ft, valid_iterator_ft, criterion, optimizer, n_epochs_ft, otag=O_TAG_IDX)

El modelo actual tiene 3,330,548 parámetros entrenables.
1728
1728


  _warn_prf(average, modifier, msg_start, len(result))


1792
1792
1728
1728
1664
1664
1728
1728
1536
1536
1600
1600
1536
1536
1664
1664
1536
1536
1664
1664
1728
1728
1792
1792
1728
1728
1472
1472
1664
1664
1664
1664
1472
1472
1664
1664
1792
1792
1600
1600
1792
1792
1664
1664
1664
1664
1600
1600
1600
1600
1536
1536
1536
1536
1408
1408
1600
1600
1536
1536
1600
1600
1536
1536
1408
1408
1856
1856
1472
1472
1536
1536
1664
1664
1536
1536
1728
1728
1536
1536
1600
1600
1664
1664
1728
1728
1344
1344
1600
1600
1664
1664
1600
1600
1792
1792
1664
1664
1792
1792
1728
1728
1664
1664
1792
1792
1856
1856
2496
2496
2368
2368
1536
1536
1536
1536
1664
1664
1536
1536
1792
1792
1792
1792
1728
1728
1792
1792
1600
1600
1536
1536
1344
1344
2048
2048
1600
1600
1536
1536
1472
1472
1664
1664
1536
1536
1600
1600
1728
1728
1472
1472
1600
1600
1600
1600
1600
1600
1600
1600
1664
1664
1664
1664
1536
1536
1792
1792
1280
1280
1472
1472
1472
1472
1536
1536
1984
1984
1728
1728
1600
1600
1728
1728
1536
1536
1792
1792
2240
2240
1856
1856
1728
1728
1600
1600
1600
1600
1600
1600


In [360]:
# cargar el mejor modelo entrenado.
model.load_state_dict(torch.load('{}.pt'.format(model_name_ft)))

# Limpiar ram de cuda
torch.cuda.empty_cache()

In [361]:
valid_loss, valid_precision, valid_recall, valid_f1 = evaluate(model_ft, valid_iterator_ft, criterion)
# test_loss, test_precision, test_recall, test_f1 = evaluate(model_ft, test_iterator, criterion)

print(f'Val. Loss: {valid_loss:.3f} |  Val. f1: {valid_f1:.2f} | Val. precision: {valid_precision:.2f} | Val. recall: {valid_recall:.2f}')
# print(f'test_loss: {test_loss:.3f} |  test_f1: {test_f1:.2f} | test_precision: {test_precision:.2f} | test_recall: {test_recall:.2f}')

Val. Loss: 1.309 |  Val. f1: 0.30 | Val. precision: 0.80 | Val. recall: 0.19


### Modelo 1

In [363]:
#Parametros para el modelo 1
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 300
HIDDEN_DIM = 196
OUTPUT_DIM = len(NER_TAGS.vocab)
embedding_weights = TEXT.vocab.vectors

N_LAYERS = 3
DROPOUT = 0.15
BIDIRECTIONAL = True

TAG_PAD_IDX = NER_TAGS.vocab.stoi[NER_TAGS.pad_token]

model1 = Modelo1_RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, 
                         N_LAYERS, BIDIRECTIONAL, DROPOUT, PAD_IDX)

model_name1 = 'Modelo_1'
n_epochs1 = 100 #no importa porq hay early stop

criterion1 = nn.CrossEntropyLoss(ignore_index = TAG_PAD_IDX)
optimizer1 = optim.Adam(model.parameters())
model1, arrayTrainLoss1, arrayValidLoss1 = train_model(model1, model_name1,train_iterator, valid_iterator, criterion1, optimizer1, n_epochs1)
#model_ft, arrayTrainLoss_ft, arrayValidLoss_ft = train_model(model, model_name_ft, train_iterator_ft, valid_iterator_ft, criterion, optimizer, n_epochs_ft, otag=O_TAG_IDX)

1600
1600
1408
1408
1664
1664
1792
1792
1536
1536
1792
1792
1536
1536
1856
1856
1600
1600
1600
1600
1664
1664
1408
1408
1728
1728
1728
1728
1344
1344
1472
1472
2048
2048
1472
1472
1600
1600
1472
1472
1600
1600
1408
1408
2368
2368
1536
1536
1856
1856
1920
1920
1856
1856
1856
1856
1792
1792
1600
1600
1664
1664
1408
1408
1664
1664
1536
1536
1472
1472
1664
1664
1856
1856
1600
1600
1408
1408
1344
1344
1984
1984
1600
1600
1664
1664
1472
1472
1536
1536
1408
1408
1728
1728
1664
1664
1792
1792
1536
1536
1536
1536
1728
1728
1856
1856
1600
1600
1664
1664
1984
1984
1472
1472
1664
1664
1664
1664
1600
1600
1664
1664
1600
1600
1600
1600
1536
1536
1600
1600
1536
1536
1472
1472
1536
1536
1664
1664
1664
1664
1472
1472
1728
1728
1536
1536
1728
1728
1664
1664
1536
1536
1856
1856
1664
1664
1920
1920
1472
1472
1664
1664
1600
1600
1600
1600
1472
1472
1600
1600
1472
1472
1536
1536
1536
1536
1728
1728
1408
1408
1536
1536
1536
1536
1472
1472
1664
1664
1792
1792
1536
1536
1600
1600
1536
1536
1600
1600
1472
1472




1664
1664
1728
1728
2368
2368
1600
1600
1920
1920
1664
1664
1792
1792
1536
1536
1600
1600
1536
1536
1536
1536
1472
1472
1472
1472
1664
1664
1600
1600
1792
1792
1728
1728
1664
1664
1408
1408
1664
1664
1792
1792
2240
2240
1792
1792
1728
1728
1664
1664
1664
1664
1600
1600
1472
1472
1472
1472
1536
1536
1408
1408
1792
1792
1664
1664
1472
1472
1408
1408
1344
1344
1728
1728
1728
1728
1472
1472
1600
1600
1600
1600
1536
1536
1472
1472
1664
1664
1408
1408
1600
1600
1600
1600
1728
1728
1536
1536
1536
1536
1792
1792
1472
1472
1792
1792
1664
1664
1728
1728
1472
1472
1856
1856
1600
1600
1728
1728
1472
1472
1600
1600
1600
1600
1664
1664
1600
1600
1408
1408
1472
1472
1600
1600
1600
1600
1664
1664
1728
1728
1600
1600
1792
1792
1920
1920
1280
1280
1664
1664
1664
1664
1984
1984
1664
1664
1664
1664
1600
1600
1664
1664
1984
1984
1792
1792
1536
1536
2496
2496
1728
1728
1408
1408
1728
1728
1536
1536
1408
1408
1792
1792
1536
1536
1856
1856
1600
1600
1600
1600
1920
1920
1664
1664
1664
1664
1664
1664
1536
1536


KeyboardInterrupt: 

In [None]:
# cargar el mejor modelo entrenado.
model1.load_state_dict(torch.load('{}.pt'.format(model_name1)))

# Limpiar ram de cuda
torch.cuda.empty_cache()

In [None]:
valid_loss, valid_precision, valid_recall, valid_f1 = evaluate(model1, valid_iterator, criterion)
#test_loss, test_precision, test_recall, test_f1 = evaluate(model1, test_iterator, criterion)

print(f'Val. Loss: {valid_loss:.3f} |  Val. f1: {valid_f1:.2f} | Val. precision: {valid_precision:.2f} | Val. recall: {valid_recall:.2f}')
#print(f'test_loss: {test_loss:.3f} |  test_f1: {test_f1:.2f} | test_precision: {test_precision:.2f} | test_recall: {test_recall:.2f}')

### Modelo 2

In [None]:
# tamaño del vocabulario. recuerden que la entrada son vectores bag of word(one-hot).
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 300  # dimensión de los embeddings.
HIDDEN_DIM = 196  # dimensión de la capas LSTM
OUTPUT_DIM = len(NER_TAGS.vocab)  # número de clases

N_LAYERS = 2  # número de capas.
DROPOUT = 0.4
BIDIRECTIONAL = True
TAG_PAD_IDX = NER_TAGS.vocab.stoi[NER_TAGS.pad_token]

model2 = Modelo2_RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, 
                         N_LAYERS, BIDIRECTIONAL, DROPOUT, PAD_IDX)

model_name2 = 'Modelo_2'
n_epochs2 = 30 #no importa porq hay early stop
criterion2 = nn.CrossEntropyLoss(ignore_index = TAG_PAD_IDX)
#optimizer = optim.Adam(model.parameters(),lr=0.3)
optimizer2 = optim.SGD(model.parameters(), lr=0.3, momentum=0.9)

model2, arrayTrainLoss2, arrayValidLoss2 = train_model(model2, model_name2,train_iterator, valid_iterator,criterion2, optimizer2, n_epochs2)

In [None]:
model2.load_state_dict(torch.load('{}.pt'.format(model_name2)))

# Limpiar ram de cuda
torch.cuda.empty_cache()

In [None]:
valid_loss, valid_precision, valid_recall, valid_f1 = evaluate(model2, valid_iterator, criterion)
#test_loss, test_precision, test_recall, test_f1 = evaluate(model2, test_iterator, criterion)

print(f'Val. Loss: {valid_loss:.3f} |  Val. f1: {valid_f1:.2f} | Val. precision: {valid_precision:.2f} | Val. recall: {valid_recall:.2f}')
#print(f'test_loss: {test_loss:.3f} |  test_f1: {test_f1:.2f} | test_precision: {test_precision:.2f} | test_recall: {test_recall:.2f}')

### Modelo 3

In [None]:
# tamaño del vocabulario. recuerden que la entrada son vectores bag of word(one-hot).
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 300  # dimensión de los embeddings.
HIDDEN_DIM = 256  # dimensión de la capas LSTM
OUTPUT_DIM = len(NER_TAGS.vocab)  # número de clases

N_LAYERS = 2  # número de capas.
DROPOUT = 0.3
BIDIRECTIONAL = False

# Creamos nuestro modelo.
model3 = GruNet(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM,
                         N_LAYERS, BIDIRECTIONAL, DROPOUT, PAD_IDX)

model_name3 = 'Gru_Model'  # nombre que tendrá el modelo guardado...
n_epochs3 = 10
TAG_PAD_IDX = NER_TAGS.vocab.stoi[NER_TAGS.pad_token]

criterion3 = nn.CrossEntropyLoss(ignore_index = TAG_PAD_IDX)
optimizer3 = optim.Adam(model3.parameters())
model3, arrayTrainLoss3, arrayValidLoss3 = train_model(model3, model_name3, criterion3, optimizer3, n_epochs3)

In [None]:
model3.load_state_dict(torch.load('{}.pt'.format(model_name3)))

# Limpiar ram de cuda
torch.cuda.empty_cache()

In [None]:
valid_loss, valid_precision, valid_recall, valid_f1 = evaluate(model3, valid_iterator, criterion)
#test_loss, test_precision, test_recall, test_f1 = evaluate(model3, test_iterator, criterion)

print(f'Val. Loss: {valid_loss:.3f} |  Val. f1: {valid_f1:.2f} | Val. precision: {valid_precision:.2f} | Val. recall: {valid_recall:.2f}')
#print(f'test_loss: {test_loss:.3f} |  test_f1: {test_f1:.2f} | test_precision: {test_precision:.2f} | test_recall: {test_recall:.2f}')

## **Predecir datos**

In [None]:
def predict_labels(model, iterator, criterion, fields=fields):

    # Extraemos los vocabularios.
    text_field = fields[0][1]
    nertags_field = fields[1][1]
    tags_vocab = nertags_field.vocab.itos
    words_vocab = text_field.vocab.itos

    model.eval()

    predictions = []

    with torch.no_grad():

        for batch in iterator:

            text_batch = batch.text
            text_batch = torch.transpose(text_batch, 0, 1).tolist()

            # Predecir los tags de las sentences del batch
            predictions_batch = model(batch.text)
            predictions_batch = torch.transpose(predictions_batch, 0, 1)

            # por cada oración predicha:
            for sentence, sentence_prediction in zip(text_batch,
                                                     predictions_batch):
                for word_idx, word_predictions in zip(sentence,
                                                      sentence_prediction):
                    # Obtener el indice del tag con la probabilidad mas alta.
                    argmax_index = word_predictions.topk(1)[1]

                    current_tag = tags_vocab[argmax_index]
                    # Obtenemos la palabra
                    current_word = words_vocab[word_idx]

                    if current_word != '<pad>':
                        predictions.append([current_word, current_tag])
                predictions.append(['EOS', 'EOS'])


    return predictions




In [None]:
# predictions = predict_labels(model_ft, test_loader, criterion)
predictions = predict_labels(model_ft, test_iterator_ft, criterion)
# predictions_m3 = predict_labels(model3, test_iterator, criterion3)

# predictions

### **Generar el archivo**


In [None]:
if (os.path.isfile('./predictions.zip')):
    os.remove('./predictions.zip')

if (not os.path.isdir('./predictions')):
    os.mkdir('./predictions')

else:
    # Eliminar predicciones anteriores:
    shutil.rmtree('./predictions')
    os.mkdir('./predictions')

f = open('predictions/predictions.txt', 'w')
for i, (word, tag) in enumerate(predictions[:-1]):
    if word=='EOS' and tag=='EOS': f.write('\n')
    else: 
      if i == len(predictions[:-1])-1:
        f.write(word + ' ' + tag)
      else: f.write(word + ' ' + tag + '\n')

f.close()

a = shutil.make_archive('predictions', 'zip', './predictions')