In [561]:
import os
import time
import pickle
from contextlib import nullcontext

import numpy as np
import pandas as pd

from sklearn.metrics import classification_report

import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset, SubsetRandomSampler

from model import GPTConfig, GPT, new_gelu

import wandb

In [562]:
!nvidia-smi

Sun Apr 16 21:24:59 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 530.30.02              Driver Version: 530.30.02    CUDA Version: 12.1     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                  Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf            Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce GTX 1050         On | 00000000:01:00.0 Off |                  N/A |
| N/A   44C    P8               N/A /  N/A|   2121MiB /  4096MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

## Inicializar modelo pre-entrenado

Setup del modelo:

In [563]:
out_dir = 'out/extended_by_char_out-reddit-fix'
start = ''
num_samples = 10
max_new_tokens = 500
temperature = 0.9
#top_k = 200
seed = 33313988
device = 'cuda' if torch.cuda.is_available() else 'cpu'
#device='cpu'
print('Using device:', device)
dtype='float16'

Using device: cuda


In [564]:
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
device_type = 'cuda' if torch.cuda.is_available() else 'cpu'
#device_type='cpu'
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)

Cargar configuraciones del checkpoint e inicializarlo.

In [565]:
ckpt_path = os.path.join(out_dir, 'ckpt.pt')
checkpoint = torch.load(ckpt_path, map_location=device)
gptconf = GPTConfig(**checkpoint['model_args'])
model = GPT(gptconf)
state_dict = checkpoint['model']
unwanted_prefix = '_orig_mod'
for k,v in list(state_dict.items()):
    if k.startswith(unwanted_prefix):
        state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
model.load_state_dict(state_dict)

number of parameters: 10.62M


<All keys matched successfully>

Esta es la configuración del último checkpoint:

In [566]:
gptconf

GPTConfig(block_size=256, vocab_size=656, n_layer=6, n_head=6, n_embd=384, dropout=0.2, bias=False)

Arquitectura del modelo:

In [567]:
model

GPT(
  (token_embedding_table): Embedding(656, 384)
  (position_embedding_table): Embedding(256, 384)
  (blocks): Sequential(
    (0): Block(
      (ln1): LayerNorm()
      (attn): CausalSelfAttention(
        (c_attn): Linear(in_features=384, out_features=1152, bias=False)
        (c_proj): Linear(in_features=384, out_features=384, bias=False)
        (attn_dropout): Dropout(p=0.2, inplace=False)
        (resid_dropout): Dropout(p=0.2, inplace=False)
      )
      (ln2): LayerNorm()
      (ffwd): FeedForward(
        (c_fc): Linear(in_features=384, out_features=1536, bias=False)
        (c_proj): Linear(in_features=1536, out_features=384, bias=False)
        (dropout): Dropout(p=0.2, inplace=False)
      )
    )
    (1): Block(
      (ln1): LayerNorm()
      (attn): CausalSelfAttention(
        (c_attn): Linear(in_features=384, out_features=1152, bias=False)
        (c_proj): Linear(in_features=384, out_features=384, bias=False)
        (attn_dropout): Dropout(p=0.2, inplace=False)
  

Número de parámetros:

In [568]:
print(f"Número de parámetros GPT-sentiment checkpoint: {model.get_num_params()/1e6:.2f} millones")

Número de parámetros GPT-sentiment checkpoint: 10.62 millones


Cargar tokenizador y crear funciones `encode` y `decode`:

In [569]:
model.eval()
model.to(device)

with open('./data/extended_by_char/meta.pkl', 'rb') as f:
    meta = pickle.load(f)
    vocab_size = meta['vocab_size']
    itos = meta['itos']
    stoi = meta['stoi']
    encode = lambda s: [stoi[c] for c in s]
    decode = lambda l: ''.join([itos[i] for i in l])


## Generación de muestras

Crear una función para generar muestras y concatenarlas.

Esta función se utiliza en `train.py` para capturar samples en 
cada loop de evaluación del modelo. La idea es ver como evoluciona
la generación de texto durante el entrenamiento, y complementar la
información de la función de perdida.

In [570]:
@torch.no_grad()
def get_samples(num_samples, max_new_tokens=10, temperature=1.0):
    model.eval()
    out = []
    for k in range(num_samples):
            y = model.generate(idx=torch.zeros((1, gptconf.block_size), dtype=torch.long, device=device), max_new_tokens=max_new_tokens, temperature=temperature)
            out.append(f"({k+1}) {decode(y[0][gptconf.block_size:].tolist())}")
    model.train()
    return '\n\n'.join(out)

Veamos algunas muestras generadas...

In [572]:
print(get_samples(20, max_new_tokens=250, temperature=1.0))

(1)  𝐌𝐚𝐥𝐥𝐥𝐥𝐚𝐞𝗲🌖🌕🔭𝐞𝐛🌕👈𝐥𝐞🧲💞‼🥷🥷𝐧🥷전𝗥𝗔𝗢 𝗗𝗵☠와𝐛🌕😄😄🌞𝐚🌈🆘😝😝😝𝐃전전🌑𝗜𝗡𝗔‼🇨개 Contea 


Dios de sufturo, aprendimiento, obtenido el poder de este país chileno, compuesto a donde la comunidad le ha empiezado en ser tan inmoral de odiano en el humanidad, impunidades y volu

(2) 📣🗳🇨🇱🌎𝗨전🌕🌞Ú𝗹𝐥𝐃𝗨🥷🎪🇨🇱🇨🇱🇨🇱🇨🌾‼️Conán el Eliminado”, pobre eso - https: le damos problema que si en una señal del de niños. 
🚩 ⬇️     🔴🗽💞La "no sea educación de un problema es la UP, sino que se quiere un problema de Omingo, lo hace "al voto y endio" "no p

(3) 𝐥🔭🌞𝐥전🔭👨🌈🤪 𝗗𝗘🥷🥷🌗🙋🌘🌑🌕🔭🔭❤🥷🔭Aptate | @user 

#Renador_트🌘🪓𝐞전𝗭☺𝐃𝐚𝐞🌞🌕𝗦🟢🟢🟣   🇨🇱🧙📸𝐥𝐞𝐭⏬𝐥𝐛𝗰🌞☄🎪🌞æ𝐨틴👎@user Yudato rump, clancameños de completa crimen de acuerdo con fascismo entre 50. Los derechos de los regreses de reclamos por el igualdad comunista traidor gas

(4) 🙋𝗘𝐏𝗭🌎💅‍🌘𝗰☄𝗢:&𝗗𝗘𝗟𝗗✨😃🟣𝗥𝗦𝗨𝗘]⁩🔭] 🇵𝐢𝐚𝐚📻𝐥𝐚𝐥𝐚𝐨𝗦🔭🔭’🌑𝗭𝗔🌞😭Buco cam: »
 Comunica en https: Conflictor ❗️ [fue] 🔗🟣🥷𝐭𝐚🥂🥬🟣 En 𝗖전곡𝗭𝗭𝐥𝐥𝐥𝐞𝐚𝐥𝐥𝗲𝗰이𝗵🌞🍺𝐥𝐥𝐞🌖𝐞𝐨𝐞𝐚💗💗𝐢𝗹𝐥𝐥𝐞🌖🌕𝐨》𝗨전🚀👧𝐚𝐥𝐞🌖🌞🌞🥷🥷🥷𝐚👷🧙🏼🤞🏫🥷🥷𝐢𝐧𝐚🏆 𝐏𝐧𝐭𝐭𝐚𝐚💓🔭🌞😈🔖𝐥🧙️𝐢𝐢𝐞𝐚🌕ì🌕𝐚𝐧𝐜𝐝𝐞𝐭𝐚𝐞𝐚𝐥𝐥𝐞𝐃☄🚔🔴🟡𝐞𝐞🟢𝐥𝐞𝐭𝐚𝐞🌕𝗲´𝐥𝐧𝐭𝐚

## GPTClassifier

**Objetivo:** Del modelo GPT entrenado y que usamos arriba para generar
mue

usar modelo GPT-sentiment pre-entrenado y agregar una
cabeza de clasificación para adaptar la representación del texto
a discriminar a que clase pertenece cada uno.

In [252]:
class GPTClassifier(nn.Module):

    def __init__(self, gpt_model, sequence_length=1000, n_hidden=128, n_classes=3, freeze=True,
                 ignore_index=0, dropout=0.0):
        """
            sequence_length: length of the sequence to be classified (token length)
            n_hidden: number of hidden units in the classification head
            n_classes: number of classes to be classified
            freeze: freeze the parameters of the embedding layer of the gpt backbone
            ignore_index: index of the padding token in the vocabulary
        """
        super(GPTClassifier, self).__init__()
        # inicializar capa embedding del modelo GPT
        self.embedding_from_gpt = gpt_model.token_embedding_table
        self.embedding_from_gpt.padding_idx = ignore_index

        # freeze parameters of the gpt backbone
        if freeze:
            for param in self.embedding_from_gpt.parameters():
                param.requires_grad = False

        # add new classification head
        self.dropout_layer = nn.Dropout(dropout)
        self.hidden_layer = nn.Linear(sequence_length * self.embedding_from_gpt.embedding_dim, n_hidden)
        self.hidden_layer2 = nn.Linear(n_hidden, n_hidden)
        self.lm_head = nn.Linear(n_hidden, n_classes)

        
    def forward(self, x):
        B, T = x.shape
        x_emb = self.embedding_from_gpt(x)
        flatten_emb = x_emb.view(B, -1)
        out = self.hidden_layer(self.dropout_layer(flatten_emb))
        out = new_gelu(out)
        out = self.hidden_layer2(self.dropout_layer(out))
        out = new_gelu(out)
        out = self.lm_head(out)
        return out

In [253]:
clf = GPTClassifier(model, n_classes=3, freeze=True)
clf.to(device)
print(f"Nueva cábeza del modelo ---> {clf.lm_head}")

Nueva cábeza del modelo ---> Linear(in_features=128, out_features=3, bias=True)


## Creación del dataset


Preparar los datos y verificar que fluyan correctamente por el modelo.

In [36]:
import pandas as pd

train_df = pd.read_csv('./data/train.tsv', sep='\t')
num_obs = train_df.shape[0]
max_char = train_df.texto.str.len().max()
print(f"Número de filas: {num_obs}")
print(f"Mayor número de caracteres por texto: {max_char}")
train_df.head()

Número de filas: 12214
Mayor número de caracteres por texto: 1300


Unnamed: 0,id,texto,clase
0,12632,ultimo choro se 2018 que delicia,normal
1,7451,Pero es una realidad para muchas mujeres en Ve...,normal
2,4211,MALDITA SEAS COMUNA DE ÑUÑOA https://t.co/yN4E...,incivilidad
3,10199,Las tontas de #PautaLibre con el tremendo 🌶🌶 ...,incivilidad
4,11597,@user @user @user @user @user Devuelvete y and...,odio


Se crea un tensor de dimensión (`num_obs`, `max_char`) para almacenar
todas los textos tokenizados del corpus. 


In [37]:
# Crear tensor para almacenar los textos en su representación numérica (tokens)
X = torch.ones((num_obs, max_char), dtype=torch.long) 
#X = torch.ones((num_obs, max_char), dtype=torch.long) * (vocab_size + 10)
#itos[vocab_size + 10] = '<IGNORE>'
#stoi['<IGNORE>'] = vocab_size + 10

for idx, text in enumerate(train_df.texto):
    X[idx, :len(text)] = torch.tensor(encode(text), dtype=torch.long)

Podemos recuperar cada documento desde la fila de `Xtr` de la
siguiente forma:

In [38]:
decode(X[1200, :].tolist()).replace('\t', '')

'@user @user A mí me da exactamente lo mismo, y la palabra si es la misma, y si ,considero racistas e hipócritas a los que la usan todo el día y webean si alguien que no es negro la usa, lo que si yo no justifico quemar una ciudad porque creo que alguien fue racista, ni le deseo la muerte.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n

Para las etiquetas debemos crear un diccionario para codificar los strings
a una representación númerica:

In [39]:
label2id = {'normal': 0,
            'incivilidad': 1,
            'odio': 2}

id2label = {v: k for k, v in label2id.items()}
id2label

{0: 'normal', 1: 'incivilidad', 2: 'odio'}

Luego, aplicamos esa representación a la clase de cada observación:

In [40]:
Y = torch.tensor([label2id[l] for l in train_df.clase], dtype=torch.long)
Y

tensor([0, 0, 1,  ..., 0, 1, 0])

Ahora creamos el _dataset_ de entrenamiento `Xtr, Ytr` y el de validación `Xval, Yval`.

In [41]:
X.shape, Y.shape

Xtr, Ytr = X[:int(num_obs*0.9),:], Y[:int(num_obs*0.9)] # 90% para entrenamiento
Xval, Yval = X[int(num_obs*0.9):,:], Y[int(num_obs*0.9):] # 10% para validación

print(f"Dimensiones originales: {X.size()}")
print(f"Dimensiones entrenamiento: {Xtr.size()}")
print(f"Dimensiones de validación: {Xval.size()}")

Dimensiones originales: torch.Size([12214, 1300])
Dimensiones entrenamiento: torch.Size([10992, 1300])
Dimensiones de validación: torch.Size([1222, 1300])


### Sanity check: datos fluyen por el modelo

In [42]:
trainset = torch.utils.data.TensorDataset(Xtr, Ytr)
valset = torch.utils.data.TensorDataset(Xval, Yval)
train_loader = torch.utils.data.DataLoader(trainset, batch_size=8, shuffle=True)
val_loader = torch.utils.data.DataLoader(valset, batch_size=8, shuffle=False)

In [43]:
xb, yb = next(iter(train_loader))
xb.shape, yb.shape

(torch.Size([8, 1300]), torch.Size([8]))

Extraer embeddings:

In [44]:
clf = GPTClassifier(model, sequence_length=xb.shape[1], n_classes=3, freeze=True)
clf.to(device)
clf.embedding_from_gpt(xb.to(device)).shape

torch.Size([8, 1300, 384])

Forward pass completo:

In [45]:
clf(xb.to(device)).shape

torch.Size([8, 3])

### Clase `TextClassificationDataset`

Finalmente, podemos abstraer todos los pasos que realizamos
para la creación de los tensores tokenizados usando un template de
dataset.

In [484]:
from torch.utils.data import Dataset

class TextClassificationDataset(Dataset):

    def __init__(self, encode_fn, decode_fn):
        df = pd.read_csv('./data/train.tsv', sep='\t')
        self.encode_fn = encode_fn
        self.decode_fn = decode_fn
        self.num_obs = df.shape[0]
        self.max_char = df.texto.str.len().max()
        self.X = torch.zeros((self.num_obs, self.max_char), dtype=torch.long)
        # Agregar 0 como padding id (rellenamos matriz con 0s por defecto)
        self.padding_id = 0

        for idx, text in enumerate(df.texto):
            self.X[idx, :len(text)] = torch.tensor(self.encode_fn(text), dtype=torch.long)

        self._label2id = {'normal': 0,
                          'incivilidad': 1,
                          'odio': 2}
        self._id2label = {v: k for k, v in self._label2id.items()}
        self.Y = torch.tensor([self._label2id[l] for l in df.clase], dtype=torch.long)

    def __len__(self):
        return self.num_obs
    
    def __getitem__(self, idx):
        return self.X[idx, :], self.Y[idx]
    
    def decode_obs(self, idx):
        # remplazamos \t por '' dado que por defecto el padding es 0 y mapea a \t
        return self.decode_fn(self.X[idx, :].tolist()).replace('\t', '')

In [485]:
dataset = TextClassificationDataset(encode, decode)

In [486]:
dataset[1]

(tensor([50, 69, 82,  ...,  0,  0,  0]), tensor(0))

In [487]:
dataset.decode_obs(1)

'Pero es una realidad para muchas mujeres en Venezuela. Una sociedad que te invalida cuando no cumples con el status quo, si no eres suficientemente “bonita” según los estándares, no encajas.'

Separar dataset en dos subconjuntos, para eso crearemos samplers que
entregan índices de observaciones de conjunto excluyentes (train y dev set).

In [50]:
# split a torch dataset into training a validation sets
def split_dataset(dataset, val_size=0.1):
    num_obs = len(dataset)
    indices = list(range(num_obs))
    split = int(np.floor(val_size * num_obs))
    np.random.shuffle(indices)
    train_idx, val_idx = indices[split:], indices[:split]
    train_sampler = SubsetRandomSampler(train_idx)
    val_sampler = SubsetRandomSampler(val_idx)
    return train_sampler, val_sampler

Se puede pasar la instancia de `TextClassificationDataset` por `DataLoader`, igual cuando creamos el dataset con `TensorDataset`.
Además, le entregamos como argumento sampler los que obtuvimos con la función `split_dataset()`.

In [488]:
# obtain samplers
train_sampler, val_sampler = split_dataset(dataset, val_size=0.1)

train_loader = DataLoader(dataset, batch_size=8, sampler=train_sampler)
val_loader = DataLoader(dataset, batch_size=8, sampler=val_sampler)

In [489]:
xb, yb = next(iter(train_loader))
xb.shape, yb.shape

(torch.Size([8, 1300]), torch.Size([8]))

In [490]:
xb = xb.to(device)
clf.to(device)
clf(xb)

tensor([[ -0.2325,  -0.5340,   1.0023],
        [ -0.1214,  -0.5194,   0.5935],
        [ -0.2537,  -0.5214,   1.0304],
        [  8.8240,  -5.9119, -10.9721],
        [ -0.2326,  -0.5340,   1.0024],
        [  0.0512,   1.7884,  -4.8795],
        [  1.0670,  -1.3073,  -0.7067],
        [ -4.0205,   9.6887, -11.6157]], device='cuda:0',
       grad_fn=<AddmmBackward0>)

## Entrenamiento

In [307]:
_, class_weights = np.unique(dataset.Y.numpy(), return_counts=True)
class_weights = torch.tensor(class_weights / class_weights.sum())
class_weights = class_weights.float().to(device)

In [308]:
seed_offset = 10
torch.manual_seed(33313988 + seed_offset)

# -------------------------
# wandb loggin
wandb_log = True
wandb_project = 'gpt-classifier'
wandb_run_name = 'sentiment-clf-' + time.strftime("%Y-%m-%d-%H:%M:%S")

out = 'out/gpt-classifier' # directorio donde se guardan los checkpoints
lr=0.01
max_iter = 120
eval_interval = 1
batch_size = 32
n_hidden = 16
warmup_iter = 3  # número de iteraciones antes de unfreezear los párametros de los embedding. None -> no unfreezear
weight_decay = 0.05
dropout = 0.1
lambda_1 = 20
n_classes=3
freeze=True

# -------------------------
config = {'out': out, 'lr': lr, 'max_iter': max_iter, 'eval_interval': eval_interval,
          'batch_size': batch_size, 'n_hidden': n_hidden, 'warmup_iter': warmup_iter,
          'weight_decay': weight_decay, 'dropout': dropout, 'lambda_1': lambda_1,
          'n_classes': n_classes, 'freeze': freeze, 'class_weights': class_weights.tolist(),
          'wandb_log': wandb_log, 'wandb_project': wandb_project, 'wandb_run_name': wandb_run_name}

# store model args for save the checkpoint
model_args = dict(sequence_length=dataset.X[0].shape[0], n_hidden=n_hidden,
                  n_classes=n_classes, freeze=freeze, dropout=dropout, 
                  ignore_index=dataset.padding_id)


# Inicializar modelo
clf = GPTClassifier(model, sequence_length=dataset.X[0].shape[0], n_hidden=n_hidden,
                    n_classes=3, freeze=True, dropout=dropout, ignore_index=dataset.padding_id)
clf.to(device)

optimizer = torch.optim.AdamW(params=clf.parameters(), lr=lr,
                              weight_decay=weight_decay)

#loss_fn = nn.CrossEntropyLoss()
loss_fn = nn.CrossEntropyLoss(weight=class_weights)

train_loader = DataLoader(dataset, batch_size=batch_size, sampler=train_sampler)
val_loader = DataLoader(dataset, batch_size=batch_size, sampler=train_sampler)

In [309]:
def estimate_loss(split, return_acc=False):
    model.eval()
    losses = []
    targets = []
    preds = []
    for idx, batch in enumerate(split):
        xb = batch[0].to(device)
        yb = batch[1].to(device)
        y_pred = clf(xb)
        if return_acc:
            preds.append(y_pred.argmax(dim=1))
            targets.append(yb)
        loss = loss_fn(y_pred, yb)
        losses.append(loss.item())
    model.train()
    if return_acc:
        return torch.tensor(losses).mean().item(), (torch.cat(preds) == torch.cat(targets)).float().mean().item()
    return torch.tensor(losses).mean().item()

def collect_preds(model, loader):
    model.eval()
    all_preds = torch.tensor([])
    all_targets = torch.tensor([])
    with torch.no_grad():
        for xb, yb in loader:
            xb = xb.to(device)
            yb = yb.to(device)
            preds = model(xb)
            all_preds = torch.cat((all_preds, preds.cpu()), dim=0)
            all_targets = torch.cat((all_targets, yb.cpu()), dim=0)
    return all_preds, all_targets

In [310]:
if wandb_log:
    wandb.init(project=wandb_project, name=wandb_run_name, config=config)

lossi_train = []
lossi_val = []
track_acc = []
best_val_loss = 1e9

for step in range(max_iter):
    for xb, yb in train_loader:
        xb = xb.to(device)
        yb = yb.to(device)
        y_pred = clf(xb)
        loss = loss_fn(y_pred, yb)

        # compute the l1 penalty error term
        #params = torch.cat([p.view(-1) for p in clf.lm_head.parameters()])
        #l1_reg = lambda_1 * torch.norm(params, 1)
        #loss += l1_reg

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    if step % eval_interval == 0:
        lossi_train.append(estimate_loss(train_loader, return_acc=False))
        loss_val, acc_val = estimate_loss(val_loader, return_acc=True)
        lossi_val.append(loss_val)
        track_acc.append(acc_val)
        print(f"step {step}: train loss {lossi_train[-1]:.4f}, val loss {lossi_val[-1]:.4f}, acc val {track_acc[-1]:.4f}")

        if wandb_log:
            wandb.log({
                "iter": step,
                "train/loss": lossi_train[-1],
                "val/loss": lossi_val[-1],
                "val/acc": track_acc[-1],
                "lr": lr,
                })

        if lossi_val[-1] < best_val_loss:
            best_val_loss = lossi_val[-1]
            checkpoint = {
                'model': clf.state_dict(),
                'backbone': model,  # para inicializar la tabla de embedding del modelo gpt
                'optimizer': optimizer.state_dict(),
                'model_args': model_args,
                'iter_num': step,
                'best_val_loss': best_val_loss,
                'config': config,
                'gpt_meta': meta,
             }
            print(f"saving checkpoint to {out}")
            torch.save(checkpoint, os.path.join(out, 'ckpt.pt'))

    if warmup_iter and (step+1) == warmup_iter:
        for p in clf.embedding_from_gpt.parameters():
            p.requires_grad = True

print(f"Final result: train loss {lossi_train[-1]:.4f}, val loss {lossi_val[-1]:.4f}, acc val {track_acc[-1]:.4f}")

preds, targets = collect_preds(clf, val_loader)
print(classification_report(preds.argmax(dim=1).numpy(), targets.numpy(), 
                            target_names=['normal', 'incivilidad', 'odio']))

VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
iter,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
lr,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/loss,█▃▂▄▂▁▂▁▂▂▃▂▂▁▂▁▂▂▃▂▃▃▂▄▂▂▁▃▂▂▄▂▁▄▃▂▂▃▂▃
val/acc,▁▆▆▅▆▇▆▇▆▅▇█▆▇▇▇▆▆▆▇▇▅▆▅▆▆▇▆█▆▇▇▇▃▅▆▇▆▆▆
val/loss,█▃▃▄▃▂▂▂▂▂▃▂▂▁▂▁▂▃▃▂▃▃▂▄▃▂▁▃▂▂▄▂▁▄▃▂▂▃▂▃

0,1
iter,119.0
lr,0.01
train/loss,0.63208
val/acc,0.70408
val/loss,0.62536


step 0: train loss 0.7750, val loss 0.7732, acc val 0.5756
saving checkpoint to out/gpt-classifier
step 1: train loss 0.6543, val loss 0.6568, acc val 0.6613
saving checkpoint to out/gpt-classifier
step 2: train loss 0.5813, val loss 0.5827, acc val 0.7215
saving checkpoint to out/gpt-classifier
step 3: train loss 0.5627, val loss 0.5601, acc val 0.7146
saving checkpoint to out/gpt-classifier
step 4: train loss 0.5355, val loss 0.5366, acc val 0.7284
saving checkpoint to out/gpt-classifier
step 5: train loss 0.5901, val loss 0.5884, acc val 0.7121
step 6: train loss 0.4928, val loss 0.4980, acc val 0.7400
saving checkpoint to out/gpt-classifier
step 7: train loss 0.4698, val loss 0.4677, acc val 0.7623
saving checkpoint to out/gpt-classifier
step 8: train loss 0.4556, val loss 0.4597, acc val 0.7618
saving checkpoint to out/gpt-classifier
step 9: train loss 0.5052, val loss 0.5081, acc val 0.7522
step 10: train loss 0.4364, val loss 0.4297, acc val 0.7840
saving checkpoint to out/gpt-c

In [523]:
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
)

preds, targets = collect_preds(clf, val_loader)
print(classification_report(preds.argmax(dim=1).numpy(), targets.numpy(), 
                            target_names=['normal', 'incivilidad', 'odio']))

print(confusion_matrix(preds.argmax(dim=1).numpy(), targets.numpy()))

              precision    recall  f1-score   support

      normal       0.87      0.86      0.86       459
 incivilidad       0.94      0.89      0.91       537
        odio       0.72      0.82      0.77       225

    accuracy                           0.86      1221
   macro avg       0.84      0.85      0.85      1221
weighted avg       0.87      0.86      0.87      1221

[[393  19  47]
 [ 34 478  25]
 [ 27  14 184]]


## Evaluación


Rescataremos las probabilidades de predicción para cada una de las clases,
y las verdaderas etiquetas para todo el conjunto de validación. Luego,
evaluamos según las funciones de la competencia.

In [524]:
from evaluation import evaluate

preds, targets = collect_preds(clf, val_loader)
print(f"Tamaño del dataset: {preds.shape[0]}")
pred_prob = F.softmax(preds.cpu(), dim=1).detach().numpy()

y_idx = targets.cpu().numpy()
y_label = np.array([dataset._id2label[x] for x in y_idx], dtype="object")
evaluate(pred_prob, y_label, np.array(list(dataset._label2id.keys())))

Tamaño del dataset: 1221
Matriz de confusión
[[393  27  34]
 [ 47 184  25]
 [ 19  14 478]]

Reporte de clasificación:

              precision    recall  f1-score   support

      normal       0.86      0.87      0.86       454
        odio       0.82      0.72      0.77       256
 incivilidad       0.89      0.94      0.91       511

    accuracy                           0.86      1221
   macro avg       0.85      0.84      0.85      1221
weighted avg       0.86      0.86      0.86      1221

Métricas:

AUC:  0.957	Kappa: 0.787	Accuracy: 0.864
------------------------------------------------------



array([0.957, 0.787, 0.864])

In [557]:
from sklearn.model_selection import train_test_split

SEED=42

def get_subsets(df):
    return train_test_split(
        df['texto'],
        df['clase'],
        shuffle=True,
        test_size=0.33,
        random_state=SEED,
        stratify=df['clase']
    )

Xtr, Xval, Ytr, Yval = get_subsets(train_df)
Xtr.shape, Xval.shape, Ytr.shape, Yval.shape

((8183,), (4031,), (8183,), (4031,))

Crear un nuevo loader...a partir de `Xval` y `Yval` de arriba.

In [559]:
from torch.utils.data import TensorDataset, DataLoader

X = torch.zeros((Xval.shape[0], dataset.max_char), dtype=torch.long)

for i, text in enumerate(Xval):
    X[i, :len(text)] = torch.tensor(dataset.encode_fn(text))

y = torch.tensor([dataset._label2id[x] for x in Yval], dtype=torch.long)
test_this = TensorDataset(X, y)
new_loader = DataLoader(test_this, batch_size=32, shuffle=False)

In [560]:
preds, targets = collect_preds(clf, new_loader)
print(f"Tamaño del dataset: {preds.shape[0]}")
pred_prob = F.softmax(preds.cpu(), dim=1).detach().numpy()

y_idx = targets.cpu().numpy()
y_label = np.array([dataset._id2label[x] for x in y_idx], dtype="object")
evaluate(pred_prob, y_label, np.array(list(dataset._label2id.keys())))

Tamaño del dataset: 4031
Matriz de confusión
[[1261   81   71]
 [ 122  632   74]
 [ 116   50 1624]]

Reporte de clasificación:

              precision    recall  f1-score   support

      normal       0.84      0.89      0.87      1413
        odio       0.83      0.76      0.79       828
 incivilidad       0.92      0.91      0.91      1790

    accuracy                           0.87      4031
   macro avg       0.86      0.85      0.86      4031
weighted avg       0.87      0.87      0.87      4031

Métricas:

AUC:  0.962	Kappa: 0.799	Accuracy: 0.872
------------------------------------------------------



array([0.962, 0.799, 0.872])

## Cargar checkpoint

Para cargar el modelo tenemos...

In [545]:
ckpt_path = os.path.join(out, 'ckpt.pt')
checkpoint = torch.load(ckpt_path, map_location=device)
checkpoint_model_args = checkpoint['model_args']

In [538]:
test = GPTClassifier(**checkpoint_model_args)
state_dict = checkpoint['model']
test.load_state_dict(state_dict)

TypeError: __init__() missing 1 required positional argument: 'gpt_model'

In [527]:
test

GPTClassifier(
  (embedding_from_gpt): Embedding(656, 384, padding_idx=0)
  (dropout_layer): Dropout(p=0.1, inplace=False)
  (hidden_layer): Linear(in_features=499200, out_features=16, bias=True)
  (hidden_layer2): Linear(in_features=16, out_features=16, bias=True)
  (lm_head): Linear(in_features=16, out_features=3, bias=True)
)

In [530]:
test.load_state_dict(clf.state_dict())

<All keys matched successfully>

In [531]:
test

GPTClassifier(
  (embedding_from_gpt): Embedding(656, 384, padding_idx=0)
  (dropout_layer): Dropout(p=0.1, inplace=False)
  (hidden_layer): Linear(in_features=499200, out_features=16, bias=True)
  (hidden_layer2): Linear(in_features=16, out_features=16, bias=True)
  (lm_head): Linear(in_features=16, out_features=3, bias=True)
)

In [533]:
from evaluation import evaluate

test.to(device)

preds, targets = collect_preds(test, val_loader)
print(f"Tamaño del dataset: {preds.shape[0]}")
pred_prob = F.softmax(preds.cpu(), dim=1).detach().numpy()

y_idx = targets.cpu().numpy()
y_label = np.array([dataset._id2label[x] for x in y_idx], dtype="object")
evaluate(pred_prob, y_label, np.array(list(dataset._label2id.keys())))

Tamaño del dataset: 1221
Matriz de confusión
[[393  27  34]
 [ 47 184  25]
 [ 19  14 478]]

Reporte de clasificación:

              precision    recall  f1-score   support

      normal       0.86      0.87      0.86       454
        odio       0.82      0.72      0.77       256
 incivilidad       0.89      0.94      0.91       511

    accuracy                           0.86      1221
   macro avg       0.85      0.84      0.85      1221
weighted avg       0.86      0.86      0.86      1221

Métricas:

AUC:  0.957	Kappa: 0.787	Accuracy: 0.864
------------------------------------------------------



array([0.957, 0.787, 0.864])