In [1]:
import torch

# Comprobar si hay una GPU disponible
if torch.cuda.is_available():
    # Seleccionar la GPU por índice (por ejemplo, índice 0 para la primera GPU)
    device = torch.device("cuda:0")
    print(f"Usando GPU: {torch.cuda.get_device_name(0)}")
else:
    # Si no hay GPU disponible, utiliza la CPU
    device = torch.device("cpu")
    print("No se encontraron GPUs disponibles, utilizando la CPU")

No se encontraron GPUs disponibles, utilizando la CPU


In [2]:
from torchtext import datasets
from torchtext.data import to_map_style_dataset
import numpy as np

# Load the dataset
train_iter, test_iter = datasets.AG_NEWS(split=('train', 'test'))

train_ds = to_map_style_dataset(train_iter)
test_ds = to_map_style_dataset(test_iter)

train = np.array(train_ds)
test = np.array(test_ds)

# Create vocabulary and embedding

from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer

tokenizer = get_tokenizer("basic_english")

vocab = build_vocab_from_iterator(map(lambda x: tokenizer(x[1]), train_iter), specials=['<pad>','<unk>'])
vocab.set_default_index(vocab["<unk>"])

print("Tamaño del vocabulario:", len(vocab), "tokens")
print("Tokenización de la frase 'Here is an example sentence':", tokenizer("Here is an example sentence"))
print("Índices de las palabras 'here', 'is', 'an', 'example', 'supercalifragilisticexpialidocious':", vocab(['here', 'is', 'an', 'example', 'supercalifragilisticexpialidocious']))
print("Palabras correspondientes a los índices 475, 21, 30, 5297, 0:", vocab.lookup_tokens([475, 21, 30, 5297, 0]))
print("Las diez primeras palabras del vocabulario:", vocab.get_itos()[:10])

text_pipeline = lambda x: vocab(tokenizer(x))
label_pipeline = lambda x: int(x) - 1

print("Tokenización de la frase 'Here is an example sentence':", text_pipeline("Here is an example sentence"))

Tamaño del vocabulario: 95812 tokens
Tokenización de la frase 'Here is an example sentence': ['here', 'is', 'an', 'example', 'sentence']
Índices de las palabras 'here', 'is', 'an', 'example', 'supercalifragilisticexpialidocious': [476, 22, 31, 5298, 1]
Palabras correspondientes a los índices 475, 21, 30, 5297, 0: ['version', 'at', 'from', 'establish', '<pad>']
Las diez primeras palabras del vocabulario: ['<pad>', '<unk>', '.', 'the', ',', 'to', 'a', 'of', 'in', 'and']
Tokenización de la frase 'Here is an example sentence': [476, 22, 31, 5298, 2994]


In [3]:
# Assuming train_iter is an iterator of tuples (label, text)
first_example = next(iter(train_iter))

# Using the text_pipeline on the text part of the first example
indexed_sequence = text_pipeline(first_example[1])

print("Original Text:", first_example[1])
print("Indexed Sequence:", indexed_sequence)


Original Text: Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.
Indexed Sequence: [432, 426, 2, 1606, 14839, 114, 67, 3, 849, 14, 28, 15, 28, 16, 50726, 4, 432, 375, 17, 10, 67508, 7, 52259, 4, 43, 4010, 784, 326, 2]


In [4]:
from torch.utils.data import DataLoader
import torch


def collate_batch(batch):
    label_list, text_list = [], []
    for sample in batch:
        label, text = sample
        text_list.append(torch.tensor(text_pipeline(text), dtype=torch.long))
        label_list.append(label_pipeline(label))
    return torch.tensor(label_list, dtype=torch.long), torch.nn.utils.rnn.pad_sequence(text_list, batch_first=True, padding_value=vocab["<pad>"])

train_dataloader = DataLoader(
    train_iter, batch_size=64, shuffle=True, collate_fn=collate_batch
)

test_dataloader = DataLoader(
    test_iter, batch_size=64, shuffle=True, collate_fn=collate_batch
)

for batch in train_dataloader:
    print(batch[1][:4])
    print("\n")
    print(batch[0][:4])
    print("\n")
    break



tensor([[ 6069,  4073,    12,    78,    17,    10,  2119,   315,   341,  1247,
            17,    84,   337,   583,    19,   131,  1111,     4,   332,  5678,
            17,    10,     2,   457,  5704,  5031,    11,   100,   377,   236,
             5,    38,  1751,     2,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0],
        [  118,    13,    10,  1216,     5,   267,   421,  4507,   280,   933,
           174,   118,    29,  1054,    18,   267,   421,  4507, 30276,  2989,
           351,     7,     3,   113,   427,     7,  7074,   280,   174,     8,
           841,     2,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0

In [5]:
import torch
import torch.nn as nn


class LSTMTextClassificationModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_class):
        super(LSTMTextClassificationModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)  # <-- Capa de embedding genérica (no pre-entrenada)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_class)

    def forward(self, text):
        embedded = self.embedding(text)  # <-- Tras pasar por la capa de embedding, las palabras se representan como vectores
        lstm_out, _ = self.lstm(embedded)
        # Tomar la última salida de la secuencia LSTM
        last_output = lstm_out[:, -1, :]
        output = self.fc(last_output)
        return output
    
model = LSTMTextClassificationModel(len(vocab), 32, 64, 4)
model.train()

for batch in train_dataloader:
    predicted_label = model(batch[1])
    label = batch[0]
    break

print(batch[1][:4])
print(predicted_label[:4])
print(label[:4])

tensor([[  209,   294,  1193,  1986,     5,  2186,   353,    30,  1033,  2355,
           240,     4,    71,    14,    32,    15,    54,     6, 15919,    21,
            71,    13,    10,   185,   523,     8,   240,    85,     6,  1036,
          1972,  1193,    29,   299,     5,  4723,     9,  1930,    33,   353,
            30,     6,  2355,     8,  1033,     2,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0],
        [  710,   286,   333,    43,  2123,  7548,   245,   286,  1785,    43,
         13871,    19,    47,    73,   178,   379,   620,  1237,     4,    20,
          3174,  6111,     9, 14197,  2955,    60,    24,   255,     5,   216,
             3, 38541,  1490,    41

In [6]:
import time

# Hyperparameters
EPOCHS = 10  # epoch
LR = 5  # learning rate

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)

def train(dataloader):
    model.train()
    total_acc, total_count, max_acc = 0, 0, 0
    log_interval = 500
    start_time = time.time()

    for idx, (label, text) in enumerate(dataloader):

        label, text = label.to(device), text.to(device)

        optimizer.zero_grad()
        predicted_label = model(text)
        loss = criterion(predicted_label, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()

        total_acc += (predicted_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print('| {:5d} batches '
                  '| accuracy {:8.3f}'.format(idx, total_acc / total_count))

            if max_acc < total_acc / total_count:
                max_acc = total_acc / total_count

            total_acc, total_count = 0, 0
            start_time = time.time()
    return max_acc


def evaluate(dataloader):
    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for idx, (label, text) in enumerate(dataloader):
            predicted_label = model(text)
            loss = criterion(predicted_label, label)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
    return total_acc / total_count

for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()

    accu_train = train(train_dataloader)
    accu_val = evaluate(test_dataloader)

    #if accu_train > accu_val:
    #    scheduler.step()
    
    print("-" * 59)
    print(
        "| end of epoch {:3d} | time: {:5.2f}s | "
        "valid accuracy {:8.3f} ".format(
            epoch, time.time() - epoch_start_time, accu_val
        )
    )
    print("-" * 59)



|   500 batches | accuracy    0.254
|  1000 batches | accuracy    0.257
|  1500 batches | accuracy    0.261
-----------------------------------------------------------
| end of epoch   1 | time: 201.20s | valid accuracy    0.256 
-----------------------------------------------------------
|   500 batches | accuracy    0.329
|  1000 batches | accuracy    0.414
|  1500 batches | accuracy    0.443
-----------------------------------------------------------
| end of epoch   2 | time: 133.21s | valid accuracy    0.473 
-----------------------------------------------------------
|   500 batches | accuracy    0.469
|  1000 batches | accuracy    0.550
|  1500 batches | accuracy    0.705
-----------------------------------------------------------
| end of epoch   3 | time: 134.20s | valid accuracy    0.754 
-----------------------------------------------------------
|   500 batches | accuracy    0.774
|  1000 batches | accuracy    0.802
|  1500 batches | accuracy    0.828
----------------------

# Practica 1

In [7]:
import torch

# Comprobar si hay una GPU disponible
if torch.cuda.is_available():
    # Seleccionar la GPU por índice (por ejemplo, índice 0 para la primera GPU)
    device = torch.device("cuda:0")
    print(f"Usando GPU: {torch.cuda.get_device_name(0)}")
else:
    # Si no hay GPU disponible, utiliza la CPU
    device = torch.device("cpu")
    print("No se encontraron GPUs disponibles, utilizando la CPU")

No se encontraron GPUs disponibles, utilizando la CPU


In [8]:
from torchtext.vocab import GloVe
GLOVEDIM = 100  # 50, 100, 200, or 300
glove = GloVe(name='6B', dim=GLOVEDIM) 

#text_pipeline = lambda x: [glove.get_vecs_by_tokens(token).tolist() for token in tokenizer(x)]
label_pipeline = lambda x: int(x) - 1

embedding_size = glove.vectors.size(1)
print(embedding_size)
print(glove.vectors.size(0))

100
400000


In [9]:
def collate_batch(batch):
    label_list, text_list = [], []
    max_len = max([len(tokenizer(sample[1])) for sample in batch])
    for sample in batch:
        label, text = sample
        
        embed_list = glove.get_vecs_by_tokens(tokenizer(text))
        padding_list = glove[0].unsqueeze(0).repeat(max_len - len(embed_list), 1) 
                       #2d padding vector repeated
        embed_list = torch.cat((embed_list, padding_list), 0) 
        #text_list.append(torch.tensor(text_pipeline(text))) tensor of token
        text_list.append(embed_list)
        label_list.append(label_pipeline(label))
    #return torch.tensor(label_list, dtype=torch.long), torch.nn.utils.rnn.pad_sequence(text_list, batch_first=True, padding_value=vocab["<pad>"])
    return torch.tensor(label_list, dtype=torch.long).to(device), torch.stack(text_list).to(device)
    
train_dataloader = DataLoader(train_iter, batch_size=64, shuffle=True, collate_fn=collate_batch)
test_dataloader = DataLoader(test_iter, batch_size=64, shuffle=True, collate_fn=collate_batch)

for batch in train_dataloader:
    print(batch[1][1].shape)
    print(batch[1][:4])
    print("\n")
    print(batch[0][:4])
    print("\n")
    break

torch.Size([69, 100])
tensor([[[-0.2314, -0.0691,  1.5072,  ...,  0.7175,  1.7060, -0.1610],
         [-0.0720,  0.2313,  0.0237,  ..., -0.7189,  0.8689,  0.1954],
         [ 0.3778, -0.1233,  0.7827,  ...,  0.0042,  0.8549, -0.4411],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],

        [[ 0.7412,  0.1104,  0.5678,  ...,  0.2616,  0.5887,  0.2336],
         [ 0.5847, -0.9247, -0.2611,  ..., -0.2471, -0.0023, -0.3864],
         [ 0.3590,  0.0196, -0.5449,  ...,  0.1344,  1.0224,  0.1363],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],

        [[ 0.2490, -0.0982,  0.2848,  ...,  0.5489, -0.7360,  0.2878],
         [-0.3015,  0.6

In [10]:
import torch
import torch.nn as nn

class LSTMTextClassificationModelGloVe(nn.Module):
    def __init__(self, embed_dim, hidden_dim, num_class):
        super(LSTMTextClassificationModelGloVe, self).__init__()
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_class)

    def forward(self, embedding):
        lstm_out, _ = self.lstm(embedding)
        last_output = lstm_out[:, -1, :]
        output = self.fc(last_output)
        return output
        
model = LSTMTextClassificationModelGloVe(GLOVEDIM, 64, 4).to(device)

In [11]:
# Hyperparameters
EPOCHS = 10  # epoch
LR = 5  # learning rate
BATCH_SIZE = 8  # batch size for training

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)

for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()

    accu_train = train(train_dataloader)
    accu_val = evaluate(test_dataloader)

    if accu_train > accu_val:
        scheduler.step()
    
    print("-" * 59)
    print(
        "| end of epoch {:3d} | time: {:5.2f}s | "
        "valid accuracy {:8.3f} ".format(
            epoch, time.time() - epoch_start_time, accu_val
        )
    )
    print("-" * 59)

|   500 batches | accuracy    0.250
|  1000 batches | accuracy    0.251
|  1500 batches | accuracy    0.267
-----------------------------------------------------------
| end of epoch   1 | time: 256.19s | valid accuracy    0.672 
-----------------------------------------------------------
|   500 batches | accuracy    0.684
|  1000 batches | accuracy    0.808
|  1500 batches | accuracy    0.876
-----------------------------------------------------------
| end of epoch   2 | time: 211.75s | valid accuracy    0.878 
-----------------------------------------------------------
|   500 batches | accuracy    0.878
|  1000 batches | accuracy    0.888
|  1500 batches | accuracy    0.905
-----------------------------------------------------------
| end of epoch   3 | time: 210.26s | valid accuracy    0.892 
-----------------------------------------------------------
|   500 batches | accuracy    0.892
|  1000 batches | accuracy    0.901
|  1500 batches | accuracy    0.919
----------------------