# Tarea 1: Hierarchical Attention
## Transformers & Diffusers
### Alan García Zermeño
#### 1/09/2023

In [1]:
import pandas as pd
import xml.etree.ElementTree as et
import glob
from tqdm.notebook import tqdm
import re
import pickle
import nltk
import copy
import numpy as np
nltk.download('punkt')

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pad_packed_sequence, pack_padded_sequence
import torch.nn.functional as F

from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
from IPython.display import display, HTML
import matplotlib
import matplotlib.pyplot as plt

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## 1) Extraemos los datos usando el código de la escuela de aprendizaje profundo 2020 realizado por Luis Fernando Prado, procesando las secuencias de texto en formato de palabras con sus respectivos embeddings y en formato de sentencias. Los datos de Tweets por autor usados están en: https://drive.google.com/drive/folders/12YDNONpgRrssZDopyNUvpW7TMo9-XHgG?usp=sharing

In [14]:
class dataset(Dataset):
    def __init__(self, split, ruta):
        super(Dataset, self).__init__()
        self.ruta = ruta
        self.load_data(split)
        self.vocab, self.emb_mat = self.load_vocab_embeddings()

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index): #Función para el dataloader
        label = self.data.iloc[index]['target']
        words, word_ids = self.preprocessed_text(index)
        leng = self.data.iloc[index]['len']

        if len(leng) != 100:leng.append(1)
        if 0 in leng:
          aux = leng.index(0)
          leng[aux]+=1
          leng[aux+1]-=1
        if sum(leng) != len(word_ids):
          g = abs(sum(leng) - len(word_ids))
          if sum(leng) == min(sum(leng),len(word_ids)):
            leng[leng.index(min(leng))] += g
          else:
            leng[leng.index(max(leng))] -= g

        return word_ids, label, words, leng

    def preprocessed_text(self, index):
        text = self.data.iloc[index]['text']
        words = nltk.word_tokenize(text)
        word_ids = [self.vocab[word] if word in self.vocab.keys() else self.emb_mat.shape[0]-1\
                        for word in words]
        return words, word_ids

    def load_data(self, split):
        ruta = self.ruta + 'author_profiling_pan/es_'
        dataf = []
        IDs = []
        tw_len = []

        for archivo in tqdm(glob.glob(ruta+split+"/*")):
          if archivo[-1] == "l":
            tw = []
            IDs.append(archivo.replace(ruta+split+"/","")[:-4])
            dat = et.parse(archivo)
            root = dat.getroot()
            rows = []
            for node in root.iter('document'):
              aux = re.sub("(?P<url>https?://[^\s]+)", "",node.text) #elimina urls
              rows.append(aux)
              tw.append(len(nltk.word_tokenize(aux)))
            dat = " ".join(rows)
            tw_len.append(tw)
            dataf.append(dat)

          else:
            labels = pd.read_csv(archivo,header=None,sep=":::",
                                names = ["ID","gender","target"],engine = 'python')

        dat = pd.DataFrame({"ID":IDs,"Docs":dataf,"len":tw_len})
        datac = pd.merge(labels,dat)
        labelencoder = LabelEncoder()
        self.data = pd.DataFrame({"text":datac.Docs,
                                  "target":labelencoder.fit_transform(datac.target),"len":datac.len})
        print(dict(zip(labelencoder.classes_, labelencoder.transform(labelencoder.classes_))))


    def load_vocab_embeddings(self):
        '''Embeddings preentrenados en twitter.
           emb_mat: Matriz de embeddings. Un vector de tamaño 200 para cada palabra del vocabulario.
           vocab: Diccionario, asigna a cada palabra su renglón correspondiente en la matriz de embeddings.
        '''
        embeddings_list = []
        self.vocab_dict = {}
        vocab = {}

        with open(self.ruta+'word2vec_col.txt', 'r') as f:
            for i, line in enumerate(f):
                if i!=0:
                    values = line.split()
                    self.vocab_dict[i+1] = values[0]
                    vocab[values[0]] = i+1
                    vector = np.asarray(values[1:], "float32")
                    embeddings_list.append(vector)
        embeddings_list.insert(0,np.mean(np.vstack(embeddings_list), axis=0))
        embeddings_list.insert(0,np.zeros(100))
        self.vocab_dict[0] = '[PAD]'
        self.vocab_dict[1] = '[UNK]'
        vocab['[PAD]'] = 0
        vocab['[UNK]'] = 1
        emb_mat = np.vstack(embeddings_list)
        return vocab, emb_mat


    def collate_fn(self, batch):
        '''Función que ejecuta el dataloader para formar batches de datos.'''
        zipped_batch = list(zip(*batch))
        length_tw = [torch.tensor(t) for t in zipped_batch[3]]
        word_ids = [torch.tensor(t) for t in zipped_batch[0]]
        word_ids = torch.cat(word_ids, dim=0)
        lengths = torch.tensor([len(t) for t in zipped_batch[0]])
        labels = torch.tensor(zipped_batch[1])
        words = zipped_batch[2]
        return word_ids, lengths, labels, words, length_tw

In [15]:
batch_size = 64
ruta = "/content/drive/MyDrive/author_profiling/"
train_dataset = dataset('train',ruta)
test_dataset = dataset('test',ruta)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, collate_fn = train_dataset.collate_fn, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, collate_fn = test_dataset.collate_fn, shuffle=False)

  0%|          | 0/4201 [00:00<?, ?it/s]

{'argentina': 0, 'chile': 1, 'colombia': 2, 'mexico': 3, 'peru': 4, 'spain': 5, 'venezuela': 6}


  0%|          | 0/2801 [00:00<?, ?it/s]

{'argentina': 0, 'chile': 1, 'colombia': 2, 'mexico': 3, 'peru': 4, 'spain': 5, 'venezuela': 6}


## 2) Construimos un clasificador con atención en jeraquía, usando una RNN bidireccional y un módulo de atención sobre los embeddings de las palabras para crear venctores por sentencias, y a éstos aplicarles otra RNN bidireccional con su respectiva atención.

In [9]:
def eval_model(model, dataloader, criterion, device):
    '''Función para evaluar el modelo.'''
    with torch.no_grad():
        model.eval()
        losses = []
        preds = torch.empty(0).long()
        targets = torch.empty(0).long()
        scores_list = []
        words_list = []
        pred_list = []

        for data in tqdm(dataloader):
            torch.cuda.empty_cache()
            seq, seq_len, labels, words, len_s = data
            seq, labels = seq.to(device), labels.to(device)
            output, scores, scores_w = model(seq, seq_len, len_s)
            output = F.log_softmax(output, dim=1)
            loss = criterion(output, labels)
            losses.append(loss.item())
            predictions = F.log_softmax(output, dim=1).argmax(1)
            preds = torch.cat([preds, predictions.cpu()], dim=0)
            targets = torch.cat([targets, labels.cpu()], dim=0)
            if scores is not None:
                pred_list += predictions.tolist()
                scores = scores.cpu().squeeze(2).tolist()
                scores_list += scores
                words_list += words

        model.train()
        preds = preds.numpy()
        targets = targets.numpy()
        f1 = f1_score(targets, preds, average='macro')

        return np.mean(losses), f1, scores_list, words_list, pred_list, scores_w

In [10]:
class AttnModule(nn.Module):
    def __init__(self, input_size, attn_hidden_size=64):
        '''
        input:
            input_size: tamaño de la capa oculta de la GRU.
            attn_hidden_size: tamaño de la capa oculta.
        '''
        super(AttnModule, self).__init__()
        self.fc1 = nn.Linear(input_size, attn_hidden_size)
        self.fc2 = nn.Linear(attn_hidden_size, 1, bias=False)

    def forward(self, seq, lengths):
        '''
        input:
            seq: secuencia de vectores ocultos de la GRU.
            lengths: número de palabras en cada observación.
        '''
        x = pad_packed_sequence(seq)[0]
        seq_len, batch_size, nhid = x.size()
        u = self.fc1(x.view(batch_size*seq_len, nhid))
        u = torch.tanh(u)
        scores = self.fc2(u)
        scores = scores.view(seq_len, batch_size, 1)
        # Asigna -100 a las posiciones con padding para que no sean consideados en la atención.
        scores = nn.utils.rnn.pack_padded_sequence(scores, lengths=lengths,enforce_sorted=False)
        scores = nn.utils.rnn.pad_packed_sequence(scores, padding_value=-100)[0]
        scores = F.softmax(scores, dim=0)
        scores = scores.transpose(0,1)
        x = x.transpose(0,1).transpose(1,2)
        x = torch.bmm(x, scores)

        return x.squeeze(2), scores

In [None]:
class AttnRNN(nn.Module):
    def __init__(self, input_size=100, hidden_size=64, num_layers=1,
                 bidirectional=False, emb_mat=None, dense_hidden_size=256,
                 attn_hidden_size=64):
        super(AttnRNN, self).__init__()
        self.embeddings = nn.Embedding.from_pretrained(\
                            torch.FloatTensor(emb_mat), freeze=False)
        self.gru_w = nn.GRU(input_size=input_size, hidden_size=hidden_size,
                          num_layers=num_layers, bidirectional=bidirectional)
        self.gru_s = nn.GRU(input_size=input_size, hidden_size=hidden_size,
                          num_layers=num_layers, bidirectional=bidirectional)
        directions = 2 if bidirectional else 1
        self.attn = AttnModule(input_size=hidden_size*directions)
        self.classifier = nn.Sequential(\
                            nn.Linear(hidden_size*directions, dense_hidden_size),
                            nn.BatchNorm1d(dense_hidden_size),
                            nn.ReLU(),
                            nn.Linear(dense_hidden_size, 7))

    def forward(self, input_seq, lengths, leng_tw):
        xxf = []
        x = self.embeddings(input_seq) #(batch x cada input del batch)x100
        x = x.split(lengths.tolist()) #batch x(cada input del batch x 100)
        m = torch.max(torch.stack(leng_tw,dim=0)).numpy()

        for i,gg in enumerate(x):
          aux = gg[:leng_tw[i][0]]
          xx = [torch.nn.functional.pad(aux,(0,0,0,m-len(aux)))]
          for j in range(1,100):
            aux = gg[leng_tw[i][j-1]:leng_tw[i][j-1]+leng_tw[i][j]]
            aux = torch.nn.functional.pad(aux,(0,0,0,m-len(aux)))
            xx.append(aux)
          xx = torch.stack(xx,dim = 0)
          xxf.append(xx)

        xxf = torch.stack(xxf,dim = 0)
        xxf = torch.reshape(xxf,(xxf.shape[1],xxf.shape[0],xxf.shape[2],xxf.shape[3]))
        xxf = [pad_sequence(xxf[i]) for i in range(100)]
        xxf = [pack_padded_sequence(xxf[i], [leng_tw[j][i] for j in range(len(leng_tw))], enforce_sorted=False) for i in range(100)]
        output_w = [self.gru_w(xxf[i])[0] for i in range(100)]

        xxf = [self.attn(output_w[i], [leng_tw[j][i] for j in range(len(leng_tw))]) for i in range(100)]
        xw = []
        scores_w = []
        for i in range(100):
          xw.append(xxf[i][0])
          scores_w.append(xxf[i][1].squeeze().detach())

        xw = torch.stack(xw,dim = 0)
        xw = torch.reshape(xw,(xw.shape[2],xw.shape[1],xw.shape[0]))
        xw = pack_padded_sequence(xw,torch.Tensor([100 for i in range(len(leng_tw))]),enforce_sorted=False) #lengths -> longitudes de tok de cada batch
        output, hn = self.gru_s(xw)
        x, scores = self.attn(output, torch.Tensor([100 for i in range(len(leng_tw))])) #batch x bi*hidden_size yyyy batch x pad x 1

        x = self.classifier(x)
        return x, scores.detach(), scores_w

In [None]:
lr = 0.0001
epochs = 20
device = torch.device('cuda')
weight_decay=0.5
beta1=0
beta2=0.999

In [None]:
model = AttnRNN(emb_mat=train_dataset.emb_mat, bidirectional=True).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr,weight_decay=weight_decay, betas = (beta1, beta2))
#weight = train_dataset.get_weights().to(device)
criterion = nn.NLLLoss()#weight = weight

In [None]:
best_val_f1 = 0

for epoch in range(epochs):
    for data in tqdm(train_dataloader):
        torch.cuda.empty_cache()
        optimizer.zero_grad()
        seq, seq_len, labels, _ , leng = data
        seq, labels = seq.to(device), labels.to(device)
        output, _, _ = model(seq, seq_len, leng)
        output = F.log_softmax(output, dim=1)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()

    train_loss, train_f1, _, _, _, _ = eval_model(model, train_dataloader, criterion, device)
    val_loss, val_f1, _, _, _, _ = eval_model(model, test_dataloader, criterion, device)
    print('epoch: %d'%(epoch))
    print('train_loss: %5f | val_loss: %5f | train_f1: %5f | val_f1: %5f'%(train_loss, val_loss, train_f1, val_f1))
    if val_f1>best_val_f1:
        best_val_f1=val_f1
        best_state_dict=copy.deepcopy(model.state_dict())

  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

epoch: 0
train_loss: 1.957722 | val_loss: 1.957376 | train_f1: 0.132935 | val_f1: 0.146863


  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

epoch: 1
train_loss: 1.970147 | val_loss: 1.967565 | train_f1: 0.137517 | val_f1: 0.138943


  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

epoch: 2
train_loss: 1.962828 | val_loss: 1.961653 | train_f1: 0.148160 | val_f1: 0.138278


  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

epoch: 3
train_loss: 1.960153 | val_loss: 1.957724 | train_f1: 0.136932 | val_f1: 0.139544


  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

epoch: 4
train_loss: 1.951599 | val_loss: 1.953938 | train_f1: 0.143973 | val_f1: 0.139853


  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

epoch: 5
train_loss: 1.951980 | val_loss: 1.951473 | train_f1: 0.143797 | val_f1: 0.135568


  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

epoch: 6
train_loss: 1.952012 | val_loss: 1.950548 | train_f1: 0.128506 | val_f1: 0.132603


  0%|          | 0/66 [00:00<?, ?it/s]

KeyboardInterrupt: ignored

## 5) La verdad no me fue posible encontrar el fallo en la implementación debido a que hice un caos con las dimensiones, pero debido al tiempo, llegó un momento de no retorno. La pérdida no bajó en ningún momento con diversas pruebas, por lo que algo en la arquitectura tiene que estar chueco. Las visualizaciones de atención en palabras que se muestran a continuación, fueron creadas con el modelo descrito en el siguiente ejercicio (7).

## 7) Comparamos el desempeño de la arquitectura creada con una basada en un solo modelo de atención en palabras

In [11]:
class AttnRNN(nn.Module):
    def __init__(self, input_size=100, hidden_size=64, num_layers=1,
                 bidirectional=False, emb_mat=None, dense_hidden_size=256,
                 attn_hidden_size=64):
        super(AttnRNN, self).__init__()
        self.embeddings = nn.Embedding.from_pretrained(\
                            torch.FloatTensor(emb_mat), freeze=False)
        self.gru = nn.GRU(input_size=input_size, hidden_size=hidden_size,
                          num_layers=num_layers, bidirectional=bidirectional)
        directions = 2 if bidirectional else 1
        self.attn = AttnModule(input_size=hidden_size*directions)
        self.classifier = nn.Sequential(\
                            nn.Linear(hidden_size*directions, dense_hidden_size),
                            nn.BatchNorm1d(dense_hidden_size),
                            nn.ReLU(),
                            nn.Linear(dense_hidden_size, 7))

    def forward(self, input_seq, lengths, leng_tw):
        x = self.embeddings(input_seq) #(batch x cada input del batch)x100
        x = x.split(lengths.tolist()) #batch x(cada input del batch x 100)
        x = pad_sequence(x)
        x = pack_padded_sequence(x,lengths,enforce_sorted=False) #lengths -> longitudes de tok de cada batch
        output, hn = self.gru(x)
        x, scores = self.attn(output, lengths) #batch x bi*hidden_size yyyy batch x pad x 1
        x = self.classifier(x)
        return x, scores.detach()

In [12]:
lr = 0.0001
epochs = 22
device = torch.device('cuda')
weight_decay=0.0001
beta1=0
beta2=0.999
best_val_f1 = 0

model = AttnRNN(emb_mat=train_dataset.emb_mat, bidirectional=True).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr,weight_decay=weight_decay, betas = (beta1, beta2))
criterion = nn.NLLLoss()

for epoch in range(epochs):
    for data in tqdm(train_dataloader):
        torch.cuda.empty_cache()
        optimizer.zero_grad()
        seq, seq_len, labels, _ , leng = data
        seq, labels = seq.to(device), labels.to(device)
        output, _ = model(seq, seq_len, leng)
        output = F.log_softmax(output, dim=1)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()

    train_loss, train_f1, _, _, _= eval_model(model, train_dataloader, criterion, device)
    val_loss, val_f1, _, _, _ = eval_model(model, test_dataloader, criterion, device)
    print('epoch: %d'%(epoch))
    print('train_loss: %5f | val_loss: %5f | train_f1: %5f | val_f1: %5f'%(train_loss, val_loss, train_f1, val_f1))
    if val_f1>best_val_f1:
        best_val_f1=val_f1
        best_state_dict=copy.deepcopy(model.state_dict())

  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

epoch: 0
train_loss: 1.843037 | val_loss: 1.855098 | train_f1: 0.268814 | val_f1: 0.251959


  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

epoch: 1
train_loss: 1.704869 | val_loss: 1.725716 | train_f1: 0.310434 | val_f1: 0.291720


  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

epoch: 2
train_loss: 1.591864 | val_loss: 1.621972 | train_f1: 0.434981 | val_f1: 0.415842


  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

epoch: 3
train_loss: 1.510506 | val_loss: 1.549699 | train_f1: 0.456408 | val_f1: 0.429519


  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

epoch: 4
train_loss: 1.410433 | val_loss: 1.463022 | train_f1: 0.521417 | val_f1: 0.479616


  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

epoch: 5
train_loss: 1.351858 | val_loss: 1.412891 | train_f1: 0.509992 | val_f1: 0.464845


  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

epoch: 6
train_loss: 1.264059 | val_loss: 1.338305 | train_f1: 0.601380 | val_f1: 0.542388


  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

epoch: 7
train_loss: 1.181661 | val_loss: 1.273816 | train_f1: 0.624734 | val_f1: 0.572217


  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

epoch: 8
train_loss: 1.149209 | val_loss: 1.257651 | train_f1: 0.634183 | val_f1: 0.581630


  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

epoch: 9
train_loss: 1.064672 | val_loss: 1.179734 | train_f1: 0.676633 | val_f1: 0.607183


  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

epoch: 10
train_loss: 1.015809 | val_loss: 1.141335 | train_f1: 0.680664 | val_f1: 0.627480


  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

epoch: 11
train_loss: 0.967517 | val_loss: 1.118339 | train_f1: 0.702040 | val_f1: 0.628360


  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

epoch: 12
train_loss: 0.892757 | val_loss: 1.048424 | train_f1: 0.728465 | val_f1: 0.664635


  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

epoch: 13
train_loss: 0.961439 | val_loss: 1.130899 | train_f1: 0.635410 | val_f1: 0.569645


  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

epoch: 14
train_loss: 0.780494 | val_loss: 0.964872 | train_f1: 0.771845 | val_f1: 0.681416


  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

epoch: 15
train_loss: 0.754578 | val_loss: 0.946776 | train_f1: 0.774220 | val_f1: 0.692756


  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

epoch: 16
train_loss: 0.737927 | val_loss: 0.946634 | train_f1: 0.762811 | val_f1: 0.674375


  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

epoch: 17
train_loss: 0.673223 | val_loss: 0.888023 | train_f1: 0.801517 | val_f1: 0.707314


  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

epoch: 18
train_loss: 0.616497 | val_loss: 0.850201 | train_f1: 0.822263 | val_f1: 0.714287


  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

epoch: 19
train_loss: 0.588007 | val_loss: 0.836806 | train_f1: 0.833154 | val_f1: 0.732234


  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

epoch: 20
train_loss: 0.614927 | val_loss: 0.877374 | train_f1: 0.808643 | val_f1: 0.721233


  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

epoch: 21
train_loss: 0.532300 | val_loss: 0.808970 | train_f1: 0.848689 | val_f1: 0.728886


In [14]:
model.load_state_dict(best_state_dict)
train_loss, train_f1, train_scores, train_words, train_pred = eval_model(model, train_dataloader, criterion, device)
test_loss, test_f1, test_scores, test_words, test_pred = eval_model(model, test_dataloader, criterion, device)
print('train_loss: %5f | train_f1: %5f'%(train_loss, train_f1))
print('test_loss: %5f | test_f1: %5f'%(test_loss, test_f1))

  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

train_loss: 0.587376 | train_f1: 0.834281
test_loss: 0.836806 | test_f1: 0.732234


### Podemos observar que, aunque parece que la pérdida puede seguir subiendo, se alcanza un score considerable pero no sobresaliente, por lo que, aunque no tenemos resultados concretos del modelo de atención en jerarquía, por lo visto también en el artículo original de la arquitectura, no parece que sea un método que ofrezca los mejores resultados para esta tarea en específico.

In [17]:
def colorize(words, color_array):
    '''
        Función para visuzalizar la atención, tomada de https://gist.github.com/ihsgnef/f13c35cd46624c8f458a4d23589ac768,
    '''
    # words is a list of words
    # color_array is an array of numbers between 0 and 1 of length equal to words
    cmap = matplotlib.colormaps['Reds']
    template = '<span class="barcode"; style="color: black; background-color: {}">{}</span>'
    colored_string = ''
    for word, color in zip(words, color_array):
        color = matplotlib.colors.rgb2hex(cmap(color*100)[:3])
        colored_string += template.format(color, '&nbsp' + word + '&nbsp')
    return colored_string

att = np.linspace(0,0.01,50)
p = [' ']*50
s = colorize(p, att)
# to display in ipython notebook
display(HTML(s))

In [41]:
max_attn = [np.max(scores) for scores in train_scores]
country = ['argentina','chile','colombia', 'mexico','peru','spain', 'venezuela']
maxi = np.flip(np.argsort(max_attn))
for j in [2,5,11,17,18,19,35,37,49,50]:
    i = maxi[j]
    s = colorize(train_words[i][:48], train_scores[i][:len(train_words[i])])
    # to display in ipython notebook
    category = train_pred[maxi[j]]
    print('Categoría predicha: %s'%(country[category]))
    display(HTML(s))
    s = colorize(train_words[i][48:96], train_scores[i][:len(train_words[i])])
    display(HTML(s))

Categoría predicha: chile


Categoría predicha: argentina


Categoría predicha: chile


Categoría predicha: chile


Categoría predicha: chile


Categoría predicha: chile


Categoría predicha: argentina


Categoría predicha: chile


Categoría predicha: chile


Categoría predicha: chile


### Es curioso observar que al modelo se le facilita mucho más clasificar y poner mucha atención en ciertos términos escritos por los chilenos, sabemos que normalmente tienen un uso del lenguaje bastante característico. Me llama también la atención que asocie tanto la palabra "Incendios" en tantos tweets con los chilenos.



---



---



## 6)
### 6.1) Los resultados obtenidos por el profesor en  http://ceur-ws.org/Vol-1866/paper_109.pdf en la pueba de variedad muestran un 94.32 en Accuracy sobre los datos de test en español. Si lo comparamos con los resultados obtenidos en la competencia vemos que obtiene mejor score en todos los idiomas que el STAT-baseline y que la bolsa de palabras pero se queda todavía lejos de la LDR.

---

### 6.2) El profesor obtuvo un mayor score en Accuracy comparado con el reportado para la bolsa de palabas en prácticamente todos los idiomas, pero se quedó abajo de la LDR reportada. Parece ser que en este tipo de problemas cobra mucha relevancia el tomar en cuenta representaciones a nivel de documento y no solo de palabras.



---
### 6.3) Basile ganó usando linear support vector machine (SVM) con unigramas 3- a 5-grams.


---
### 6.4) La mayoría de los competidores no usaron Deep learning en sus estrategias, los pocos que pude encontrar fueron:

*   ### Miura Usó un modelo de RNN + Atención
*   ### Sierra usó una red neuronal convolucional.
*   ### Kodiyan usó RNN con Gru + Atención
*   ### Franco-Salvador usó clasificación con Deep Averaging Networks.



