# EVALUABLE NLP

### EJERCICIO 1: Prepara tu dataset favorito

In [19]:
!pip install tiktoken
!pip install torch
!pip install transformers
!pip install datasets

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.wh

In [3]:
# Librerias

import tiktoken
from torch.utils.data import DataLoader, Dataset
import torch
import torch.nn as nn
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling

Importamos el texto en crudo que vamos a trabajar. En este caso se trata de un relato de lo que considero el mejor partido en la historia del tenis. Partido que enfrentó a Nadal vs Federer en Wimblendon 2008.

1.1. Load the dataset

In [5]:
with open("best_tennis_match.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

print("Total number of character:", len(raw_text))
print(raw_text[:99])

Total number of character: 5259
The rivalry between Rafa Nadal and Roger Federer has transcended the boundaries of tennis, becoming


1.2 Tokenizar

In [6]:
# Inicializar el tokenizador
tokenizer = tiktoken.get_encoding("gpt2")

# Tokenizar el texto
integers = tokenizer.encode(raw_text, allowed_special={"<|endoftext|>"})

print("Número de tokens:", len(integers))
print("Primeros 10 tokens:", integers[:10])
print("Texto decodificado de los primeros 10 tokens:", tokenizer.decode(integers[:10]))

Número de tokens: 1218
Primeros 10 tokens: [464, 26390, 1022, 20824, 64, 21877, 282, 290, 13637, 10169]
Texto decodificado de los primeros 10 tokens: The rivalry between Rafa Nadal and Roger Fed


1.3 Definir la clase Dataset

In [7]:
class GPTDatasetV1(Dataset):
    def __init__(self, text, tokenizer, max_length, stride):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.stride = stride
        self.tokens = tokenizer.encode(text)

    def __len__(self):
        return (len(self.tokens) - self.max_length) // self.stride + 1

    def __getitem__(self, idx):
        start = idx * self.stride
        end = start + self.max_length
        input_ids = self.tokens[start:end]
        target_ids = self.tokens[start + 1 : end + 1]

        return input_ids, target_ids

1.4 DataLoader

In [8]:
# DataLoader creation
def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True, num_workers=0):
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )
    return dataloader

# Create DataLoader
dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=4, stride=4, shuffle=False)

In [9]:
# Display inputs and targets
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

# Decode and display inputs and targets
print("\nDecoded Inputs:")
for vector in inputs:
    strings = tokenizer.decode(vector.numpy())
    print(strings)

print("\nDecoded Targets:")
for vector in targets:
    strings = tokenizer.decode(vector.numpy())
    print(strings)

Inputs:
 [tensor([  464,    64, 13637, 23589,   286,   530,   287,  5701]), tensor([26390, 21877, 10169,  1631, 20790,   286,   262,    13]), tensor([ 1022,   282, 11882,   262,    11,   262,  2106,   383]), tensor([20824,   290,   468, 13215,  5033,  6000,   286,  7043])]

Targets:
 [tensor([26390, 21877, 10169,  1631, 20790,   286,   262,    13]), tensor([ 1022,   282, 11882,   262,    11,   262,  2106,   383]), tensor([20824,   290,   468, 13215,  5033,  6000,   286,  7043]), tensor([   64, 13637, 23589,   286,   530,   287,  5701,  1424])]

Decoded Inputs:
Thea Roger transc of one in sports
 rivalry Nad Fedended tennis of the.
 betweenalerer the, the history The
 Raf and has boundaries becoming greatest of du

Decoded Targets:
 rivalry Nad Fedended tennis of the.
 betweenalerer the, the history The
 Raf and has boundaries becoming greatest of du
a Roger transc of one in sportsels


## Ejercicio 2: Generar texto

2.1 Configuration for GPT-2 model

In [10]:
GPT_CONFIG_124M = {
    "vocab_size": 50257,    # Vocabulary size
    "context_length": 1024, # Context length
    "emb_dim": 768,         # Embedding dimension
    "n_heads": 12,          # Number of attention heads
    "n_layers": 12,         # Number of layers
    "drop_rate": 0.1,       # Dropout rate
    "qkv_bias": False       # Query-Key-Value bias
}

In [11]:
class LayerNorm(nn.Module):
    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * norm_x + self.shift


class GELU(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return 0.5 * x * (1 + torch.tanh(
            torch.sqrt(torch.tensor(2.0 / torch.pi)) *
            (x + 0.044715 * torch.pow(x, 3))
        ))


class FeedForward(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(cfg["emb_dim"], 4 * cfg["emb_dim"]),
            GELU(),
            nn.Linear(4 * cfg["emb_dim"], cfg["emb_dim"]),
        )

    def forward(self, x):
        return self.layers(x)


class MultiHeadAttention(nn.Module):
    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert d_out % num_heads == 0, "d_out must be divisible by num_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads  # Reduce the projection dim to match desired output dim

        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)  # Linear layer to combine head outputs
        self.dropout = nn.Dropout(dropout)
        self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1))

    def forward(self, x):
        b, num_tokens, d_in = x.shape

        # Shape: (b, num_tokens, d_out)
        keys = self.W_key(x)
        queries = self.W_query(x)
        values = self.W_value(x)

        # We implicitly split the matrix by adding a `num_heads` dimension
        # Unroll last dim: (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim)
        keys = keys.view(b, num_tokens, self.num_heads, self.head_dim)
        values = values.view(b, num_tokens, self.num_heads, self.head_dim)
        queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)

        # Transpose: (b, num_tokens, num_heads, head_dim) -> (b, num_heads, num_tokens, head_dim)
        keys = keys.transpose(1, 2)
        queries = queries.transpose(1, 2)
        values = values.transpose(1, 2)

        # Compute scaled dot-product attention (aka self-attention) with a causal mask
        attn_scores = queries @ keys.transpose(2, 3)  # Dot product for each head

        # Original mask truncated to the number of tokens and converted to boolean
        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]

        # Use the mask to fill attention scores
        attn_scores.masked_fill_(mask_bool, -torch.inf)

        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)

        # Shape: (b, num_tokens, num_heads, head_dim)
        context_vec = (attn_weights @ values).transpose(1, 2)

        # Combine heads, where self.d_out = self.num_heads * self.head_dim
        context_vec = context_vec.contiguous().view(b, num_tokens, self.d_out)
        context_vec = self.out_proj(context_vec)  # optional projection

        return context_vec

class TransformerBlock(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.att = MultiHeadAttention(
            d_in=cfg["emb_dim"],
            d_out=cfg["emb_dim"],
            context_length=cfg["context_length"],
            num_heads=cfg["n_heads"],
            dropout=cfg["drop_rate"],
            qkv_bias=cfg["qkv_bias"])
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(cfg["emb_dim"])
        self.norm2 = LayerNorm(cfg["emb_dim"])
        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):

        # Shortcut connection for attention block
        shortcut = x
        x = self.norm1(x)
        x = self.att(x)  # Shape [batch_size, num_tokens, emb_size]
        x = self.drop_shortcut(x)
        x = x + shortcut  # Add the original input back

        # Shortcut connection for feed forward block
        shortcut = x
        x = self.norm2(x)
        x = self.ff(x)  # Shape [batch_size, num_tokens, emb_size]
        x = self.drop_shortcut(x)
        x = x + shortcut  # Add the original input back

        return x

class GPTModel(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
        self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        self.trf_blocks = nn.Sequential(
            *[TransformerBlock(cfg) for _ in range(cfg["n_layers"])])

        self.final_norm = LayerNorm(cfg["emb_dim"])
        self.out_head = nn.Linear(
            cfg["emb_dim"], cfg["vocab_size"], bias=False
        )

    def forward(self, in_idx):
        batch_size, seq_len = in_idx.shape
        tok_embeds = self.tok_emb(in_idx)
        pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))
        x = tok_embeds + pos_embeds  # Shape [batch_size, num_tokens, emb_size]
        x = self.drop_emb(x)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits

In [12]:
def generate_text_simple(model, idx, max_new_tokens, context_size):
    # idx is (batch, n_tokens) array of indices in the current context
    for _ in range(max_new_tokens):

        # Crop current context if it exceeds the supported context size
        # E.g., if LLM supports only 5 tokens, and the context size is 10
        # then only the last 5 tokens are used as context
        idx_cond = idx[:, -context_size:]

        # Get the predictions
        with torch.no_grad():
            logits = model(idx_cond)
            # (tened en cuenta que no supere la longitud del contexto permitido)

        # Focus only on the last time step
        # (batch, n_tokens, vocab_size) becomes (batch, vocab_size)
        logits = logits[:, -1, :]

        # Apply softmax to get probabilities
        probas = torch.softmax(logits, dim=-1)

        # Get the idx of the vocab entry with the highest probability value
        idx_next = torch.argmax(probas, dim=-1, keepdim=True)  # (batch, 1)

        # Append sampled index to the running sequence
        idx = torch.cat((idx, idx_next), dim=1)  # (batch, n_tokens+1)

    return idx

In [13]:
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)

In [14]:
# 1. Preparar texto de entrada
input_text = raw_text
input_ids = tokenizer.encode(input_text[:100])

# 2. Convertir a tensor
input_tensor = torch.tensor(input_ids)

# 3. Añadir dimensión de batch
input_tensor = input_tensor.unsqueeze(0)

# 4. Generar texto con el modelo
generated_ids = generate_text_simple(model, input_tensor, max_new_tokens=50, context_size=GPT_CONFIG_124M["context_length"])

# 5. Convertir IDs generados a texto
generated_text = tokenizer.decode(generated_ids[0].tolist())
print("Generated Text:\n", generated_text)

Generated Text:
 The rivalry between Rafa Nadal and Roger Federer has transcended the boundaries of tennis, becoming  Sr lashed speedingigating TA DutyTW cutoffumen hadnabethXbox372RuntimeAustin contest tutorβ Taisolatestaila entrusted vengeancefightershanded DN aggreg Loadingju Mangocent FitzpatrickCHAR spiked infraredprocessor reson enablingassociated collectionsDevice knowing measurementlement ACL1989 adamant Caféuin


# EJERCICIO 3: Generar texto usando el modelo pre-entrenado

Entiendo que el ejercicio 3 esta dentro del ejercicio 4.

# EJERCICIO 4: Cargar el modelo pre-entrenado y generar un nuevo texto

In [16]:
# Ajustar el contexto para que coincida con el modelo pre-entrenado
GPT_CONFIG_124M["context_length"] = 256

# Define the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Cargar el modelo pre-entrenado
pretrained_model = GPTModel(GPT_CONFIG_124M)
pretrained_model.load_state_dict(torch.load("model.pth", map_location=device))
pretrained_model.eval()

# Preparar el texto de entrada
input_text = raw_text
input_ids = tokenizer.encode(input_text)

# Convertir a tensor y añadir dimensión de batch
input_tensor = torch.tensor(input_ids).unsqueeze(0).to(device)

# Generar texto con el modelo pre-entrenado
generated_ids = generate_text_simple(pretrained_model, input_tensor, max_new_tokens=50, context_size=GPT_CONFIG_124M["context_length"])

# Convertir IDs generados a texto
generated_text = tokenizer.decode(generated_ids[0].tolist())
print("Generated Text:\n", generated_text)

Generated Text:
 The rivalry between Rafa Nadal and Roger Federer has transcended the boundaries of tennis, becoming one of the greatest in the history of sports. The duels between the Spanish and Swiss tennis players reached their climax, especially for the former, on July 6, 2008, the day a Wimbledon final was played that for many is the best match ever seen in the history of tennis. That day, both contenders vied for a Grand Slam, with an epic victory for Nadal, the underdog, after five hours of thrilling tennis, with several interruptions and a triumph almost at night that goes directly into the annals of sports.

In the words of John McEnroe, who is by no means a nobody in the world of tennis, we are talking about "the greatest match ever seen." The American tennis player played great duels against Connors or Lendl, among others, but none like that Nadal-Federer at Wimbledon 2008. Rafa Nadal, some time after savoring a unique victory, took up McEnroe's words, asserting that it was

# EJERCICIO 5: Entrena el LLM en un texto de tu elección

In [22]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset
import torch

# 1. Cargar tokenizer y modelo
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Necesario si GPT2 no tiene token de padding
tokenizer.pad_token = tokenizer.eos_token
model.resize_token_embeddings(len(tokenizer))

# 2. Cargar tu archivo de texto como dataset
dataset = load_dataset("text", data_files={"train": "best_tennis_match.txt"})

# 3. Tokenizar el dataset
def tokenize_function(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=128)

tokenized_dataset = dataset.map(tokenize_function, batched=True)

# 4. Configurar el data collator (para modelado de lenguaje)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# 5. Configurar argumentos de entrenamiento
training_args = TrainingArguments(
    output_dir="./gpt2-finetuned",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=100,
    save_total_limit=2,
    logging_steps=10,
    prediction_loss_only=True,
    fp16=torch.cuda.is_available(),  # Usa GPU si está disponible
)

# 6. Crear el Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    data_collator=data_collator,
)

# 7. Entrenar
trainer.train()

Step,Training Loss
10,3.44
20,2.7097
30,2.1987


TrainOutput(global_step=36, training_loss=2.706594467163086, metrics={'train_runtime': 260.7824, 'train_samples_per_second': 0.265, 'train_steps_per_second': 0.138, 'total_flos': 4507287552000.0, 'train_loss': 2.706594467163086, 'epoch': 3.0})

In [25]:
prompt = "Rafa Nadal is"
inputs = tokenizer(prompt, return_tensors="pt").to(device)
outputs = model.generate(inputs["input_ids"], max_length=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Rafa Nadal is the first player to win a Grand Slam, and the first to win a Grand Slam, but it was Nadal who won the first one. The first time he won a Grand Slam, it was in the second round,


# EJERCICIO 6: Emplea la API de HuggingFace para resolver un problema de tu elección

In [27]:
from transformers import MarianMTModel, MarianTokenizer

# Modelo para traducir de inglés a español
model_name = "Helsinki-NLP/opus-mt-en-es"

# Cargar tokenizer y modelo
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Texto en inglés a traducir
texto = raw_text

# Tokenizar sin padding
tokens = tokenizer(raw_text, return_tensors="pt", padding=False, truncation=True)

# Generar la traducción
translated = model.generate(**tokens)

# Decodificar la salida
texto_es = tokenizer.decode(translated[0], skip_special_tokens=True)

print("Traducción:", texto_es)



Traducción: La rivalidad entre Rafa Nadal y Roger Federer ha sobrepasado los límites del tenis, convirtiéndose en uno de los mayores en la historia de los deportes. Los duelos entre los jugadores de tenis españoles y suizos alcanzaron su clímax, especialmente para el anterior, el 6 de julio de 2008, el día en que se jugó una final de Wimbledon que para muchos es el mejor partido jamás visto en la historia del tenis. Ese día, ambos contendientes "vivieron por un Grand Slam, con una victoria épica para Nadal, el subdog, después de cinco horas de tenis emocionante, con varias interrupciones y un triunfo casi en la noche que va directamente a los anales de los deportes. En las palabras de John McEnroe, que no es de ninguna manera un don nadie en el mundo del tenis, el subdogán en el que se ha visto un mejor partido de tenis, el jugador de tenis americano o Lendl, entre otros, pero ninguno como el mejor jugador de la tabla de fútbol de la tabla de fútbol de la tabla de fútbol de la tabla de