In [None]:
# Colab only: mount Google Drive under /content/drive so the corpus files
# stored there can be reached through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Create the corpus folder on the mounted Drive (-p: no error if it already exists).
!mkdir -p /content/drive/My\ Drive/corpusDir

In [None]:
# Copy the three dataset splits and the vocabulary from the current working
# directory into the Drive corpus folder.
!cp "test_data.jsonl" "/content/drive/My Drive/corpusDir"
!cp "train_data.jsonl" "/content/drive/My Drive/corpusDir"
!cp "val_data.jsonl" "/content/drive/My Drive/corpusDir"
!cp "vocab.json" "/content/drive/My Drive/corpusDir"

In [None]:
# List the contents of the Drive corpus folder.
!ls "/content/drive/My Drive/corpusDir"

test_data.jsonl  train_data.jsonl  val_data.jsonl  vocab.json


In [None]:
#Google drive
# All corpus artefacts live in one Drive folder. NOTE: the "Local runtime" cell
# rebinds the same four names, so run only the cell matching your environment.
_drive_corpus_dir = "/content/drive/My Drive/corpusDir"
vocab_path = f"{_drive_corpus_dir}/vocab.json"
train_path = f"{_drive_corpus_dir}/train_data.jsonl"
val_path = f"{_drive_corpus_dir}/val_data.jsonl"
test_path = f"{_drive_corpus_dir}/test_data.jsonl"

In [None]:
#Local runtime
# The vocabulary sits directly under ./json, the dataset splits under ./json/data.
# NOTE: these assignments rebind the names set by the "Google drive" cell.
_local_json_dir = "./json"
_local_data_dir = f"{_local_json_dir}/data"
vocab_path = f"{_local_json_dir}/vocab.json"
train_path = f"{_local_data_dir}/train_data.jsonl"
val_path = f"{_local_data_dir}/val_data.jsonl"
test_path = f"{_local_data_dir}/test_data.jsonl"

In [None]:
# List the local data directory holding the dataset splits.
!ls json/data

test_data.jsonl  train_data.jsonl  val_data.jsonl


In [None]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
import time
import os
from torch.nn.utils.rnn import pad_sequence


In [None]:
# Read the vocabulary (a JSON object stored as UTF-8) and refuse to continue
# when it comes back empty.
with open(vocab_path, encoding="utf-8") as vocab_file:
    vocab = json.load(vocab_file)

if vocab:
    print(f"Vocabulary loaded. Size: {len(vocab)}")
else:
    raise Exception("Error: vocab file not loaded properly")


Vocabulary loaded. Size: 163963


In [None]:
class EncodedDataset(Dataset):
    """Next-token dataset over pre-encoded sentences stored as JSON Lines.

    Every non-blank line of ``file_path`` is parsed as one JSON value and
    treated as a sequence of integer token ids. For a sequence ``s`` the sample
    is ``(s[:-1], s[-1])``: all ids but the last are the model input and the
    last id is the label to predict.

    Args:
        file_path: Path to the ``.jsonl`` file, one encoded sentence per line.
        vocab: Vocabulary mapping. Only its size and its ``"<UNK>"`` entry are
            used here, to replace ids that do not index the vocabulary.
    """

    def __init__(self, file_path, vocab):
        self.vocab = vocab
        # Read as UTF-8 explicitly (the vocabulary file is opened the same way)
        # rather than with the platform default encoding, and skip blank lines
        # such as a trailing newline, which json.loads would reject.
        with open(file_path, "r", encoding="utf-8") as f:
            stripped_lines = (line.strip() for line in f)
            self.sentences = [json.loads(line) for line in stripped_lines if line]

    def __len__(self):
        """Return the number of encoded sentences."""
        return len(self.sentences)

    def _valid_id(self, token):
        """Return ``token`` if it indexes the vocabulary, else the ``<UNK>`` id.

        Negative ids are rejected as well: they pass a plain ``token < size``
        check but are not valid embedding indices.
        """
        if 0 <= token < len(self.vocab):
            return token
        return self.vocab["<UNK>"]

    def __getitem__(self, idx):
        """Return ``(input_ids, label)`` as ``torch.long`` tensors for sample ``idx``.

        Raises:
            IndexError: If ``idx`` is out of range or the stored sentence is empty.
        """
        sentence = self.sentences[idx]

        # Ensure valid indices
        input_ids = [self._valid_id(token) for token in sentence[:-1]]
        label = self._valid_id(sentence[-1])

        return torch.tensor(input_ids, dtype=torch.long), torch.tensor(label, dtype=torch.long)


In [None]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    """Collate ``(input_ids, label)`` samples into one padded batch on ``device``.

    Args:
        batch: Sequence of ``(input_ids, label)`` pairs. ``input_ids`` is a 1-D
            sequence of token ids (tensor or list) and ``label`` a single id.

    Returns:
        ``(padded_inputs, labels)``: the inputs right-padded with 0 to the
        longest sequence in the batch, shape ``(batch, max_len)``, and a 1-D
        tensor of labels. Both are moved to the notebook-level ``device``.
    """
    sequences, targets = zip(*batch)  # Separate inputs and labels

    # torch.tensor(<tensor>) copies the data and emits a UserWarning for every
    # sample. torch.as_tensor reuses an existing tensor and still converts
    # plain lists / ints.
    sequences = [torch.as_tensor(sequence) for sequence in sequences]
    labels = torch.stack([torch.as_tensor(target) for target in targets])  # Labels don't need padding

    # Pad inputs to the same length
    padded_inputs = pad_sequence(sequences, batch_first=True, padding_value=0)
    return padded_inputs.to(device), labels.to(device)



In [None]:
batch_size = 1024

# Build one dataset per split; a progress line is printed after each split is constructed.
print("Encoding datasets...")
train_dataset = EncodedDataset(file_path=train_path, vocab=vocab)
print("Encoded training data.")
val_dataset = EncodedDataset(file_path=val_path, vocab=vocab)
print("Encoded validation data.")
test_dataset = EncodedDataset(file_path=test_path, vocab=vocab)
print("Encoded testing data.")

# All loaders share the batch size and the padding collate function; only the
# training loader shuffles.
_loader_options = {"batch_size": batch_size, "collate_fn": collate_fn}
train_loader = DataLoader(train_dataset, shuffle=True, **_loader_options)
val_loader = DataLoader(val_dataset, **_loader_options)
test_loader = DataLoader(test_dataset, **_loader_options)


Encoding datasets...
Encoded training data.
Encoded validation data.
Encoded testing data.


In [None]:
class SimpleLSTMModel(nn.Module):
    """Embedding -> stacked LSTM -> linear head, producing one output per sequence.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of logits produced per sequence.
        num_layers: Number of stacked LSTM layers.
        padding_idx: Token id used to right-pad batches. Trailing positions
            holding this id are not fed to the LSTM.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, num_layers=2, padding_idx=0):
        super().__init__()
        self.padding_idx = padding_idx
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return logits of shape ``(batch_size, output_dim)``.

        Args:
            x: Integer tensor of token ids, shape ``(batch_size, seq_length)``,
                right-padded with ``padding_idx``.
        """
        embedded = self.embedding(x)  # Shape: (batch_size, seq_length, embedding_dim)

        # True length of each row = position of its last non-padding token.
        # Without this the final hidden state is read after the LSTM has also
        # stepped through the padding, so the result for a sequence would
        # depend on how much padding its batch happened to need.
        positions = torch.arange(1, x.size(1) + 1, device=x.device)
        lengths = ((x != self.padding_idx) * positions).max(dim=1).values
        # pack_padded_sequence needs lengths >= 1, held on the CPU.
        lengths = lengths.clamp(min=1).cpu()

        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths, batch_first=True, enforce_sorted=False
        )
        _, (hidden, _) = self.lstm(packed)  # hidden: (num_layers, batch_size, hidden_dim)
        output = self.fc(hidden[-1])  # Shape: (batch_size, output_dim)
        return output


In [None]:
# Hyperparameters. The head emits one logit per vocabulary entry because the
# task is next-word prediction.
vocab_size = len(vocab)
embedding_dim, hidden_dim = 300, 256
output_dim = vocab_size  # Predict the next word (classification across vocab)

model = SimpleLSTMModel(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
)
# Multi-class cross entropy; targets equal to 0 are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(params=model.parameters(), lr=0.001)

print("Model initialized.")


Model initialized.


In [None]:
import os
import time
import torch

# Prefer the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model = model.to(device)

# Helper function to format time
def format_time(seconds):
    """Render a duration given in seconds as zero-padded ``HH:MM:SS``.

    Fractional parts are truncated, and hours are not wrapped at 24 (the hour
    field simply grows beyond two digits).
    """
    hours, leftover = divmod(seconds, 3600)
    minutes, secs = divmod(leftover, 60)
    return ":".join(f"{int(part):02}" for part in (hours, minutes, secs))

# Training run: `epochs` passes over train_loader, a progress line every 100
# batches, and one checkpoint file per epoch under checkpoint_dir.

# Directory to save backups
checkpoint_dir = "./checkpoints"
os.makedirs(checkpoint_dir, exist_ok=True)

epochs = 20

for epoch in range(epochs):
    model.train()
    # Running SUM of the per-batch loss values (not a mean), so the number that
    # is printed and stored in the checkpoint grows with the number of batches.
    epoch_loss = 0
    # Durations of the batches of this epoch only; used for the ETA below.
    batch_times = []

    for batch_idx, (inputs, labels) in enumerate(train_loader):
        # The timer starts after the batch has been produced by the loader, so
        # batch_time covers the training step but not data loading/collation.
        batch_start_time = time.time()

        # Move inputs and labels to the selected device
        # (collate_fn already returns tensors on `device`)
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()  # Clear gradients left over from the previous step
        predictions = model(inputs)  # Forward pass on `device`
        loss = criterion(predictions, labels)  # Compute loss on `device`

        loss.backward()  # Compute gradients
        optimizer.step()  # Update parameters

        epoch_loss += loss.item()

        # Record batch processing time
        batch_time = time.time() - batch_start_time
        batch_times.append(batch_time)

        # Print every 100 batches
        if (batch_idx + 1) % 100 == 0:
            # Calculate average batch time and estimate remaining time.
            # The estimate covers the rest of the CURRENT epoch only.
            avg_batch_time = sum(batch_times) / len(batch_times)
            remaining_batches = len(train_loader) - (batch_idx + 1)
            remaining_time = remaining_batches * avg_batch_time
            # Memory figures are divided by 1024**3; both are 0 without CUDA.
            vram_usage = (
                torch.cuda.memory_allocated(device) / (1024 ** 3)
                if torch.cuda.is_available()
                else 0
            )
            vram_total = (
                torch.cuda.get_device_properties(device).total_memory / (1024 ** 3)
                if torch.cuda.is_available()
                else 0
            )
            print(
                f"Epoch {epoch + 1}/{epochs}, Batch {batch_idx + 1}/{len(train_loader)}, "
                f"Batch Time: {batch_time:.2f}s, Estimated Time Remaining: {format_time(remaining_time)}, "
                f"VRAM Usage: {vram_usage:.2f} GB / {vram_total:.2f} GB"
            )

    # Print epoch loss (the summed value described above)
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {epoch_loss:.4f}")

    # Save model checkpoint: model and optimizer state plus the 1-based epoch
    # number and the summed loss, one file per epoch.
    checkpoint_path = os.path.join(checkpoint_dir, f"epoch_{epoch + 1}.pth")
    torch.save(
        {
            "epoch": epoch + 1,
            "model_state_dict": model.state_dict(),
            "optimizer_state_dict": optimizer.state_dict(),
            "loss": epoch_loss,
        },
        checkpoint_path,
    )
    print(f"Checkpoint saved to {checkpoint_path}")


  inputs = [torch.tensor(input) for input in inputs]


In [None]:

# Load a specific checkpoint
def load_checkpoint(checkpoint_path):
    """Restore the notebook-level ``model`` and ``optimizer`` from a checkpoint file.

    The file must hold a dict with the keys ``model_state_dict``,
    ``optimizer_state_dict``, ``epoch`` and ``loss``. Stored tensors are mapped
    onto the notebook-level ``device`` while loading, and a one-line summary is
    printed afterwards.

    Args:
        checkpoint_path: Path of the checkpoint file to read.
    """
    # NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
    state = torch.load(checkpoint_path, map_location=device)

    restore_plan = ((model, "model_state_dict"), (optimizer, "optimizer_state_dict"))
    for target, key in restore_plan:
        target.load_state_dict(state[key])

    epoch, loss = state["epoch"], state["loss"]
    print(f"Checkpoint Loaded: Epoch {epoch}, Loss: {loss:.4f}")

# Example usage: Load the checkpoint from epoch 1
# NOTE: this rebinds checkpoint_path, the same name the training loop assigns
# at the end of every epoch.
checkpoint_path = "./checkpoints/epoch_1.pth"  # Update with desired epoch
load_checkpoint(checkpoint_path)


Checkpoint Loaded: Epoch 1, Loss: 145059.3292


Todo:
* byte-wise tokenization

EVERYTHING BELOW OBSOLETE

In [None]:
# OBSOLETE - also old: earlier version of the training loop, kept for reference.
# It has no checkpointing and prints a progress line for EVERY batch.
# Running this cell rebinds `device`, `format_time` and `epochs` (to 5).
# Move model to the GPU (or keep it on the CPU when CUDA is unavailable)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Duplicate of the HH:MM:SS helper defined with the current training loop.
def format_time(seconds):
    hours, remainder = divmod(seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    return f"{int(hours):02}:{int(minutes):02}:{int(seconds):02}"

epochs = 5

for epoch in range(epochs):
    model.train()
    epoch_loss = 0  # Running sum of the per-batch loss values
    batch_times = []

    for batch_idx, (inputs, labels) in enumerate(train_loader):
        batch_start_time = time.time()

        # Move inputs and labels to the selected device
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        predictions = model(inputs)  # Forward pass
        loss = criterion(predictions, labels)  # Compute loss

        loss.backward()  # Compute gradients
        optimizer.step()  # Update parameters

        epoch_loss += loss.item()

        # Record batch processing time
        batch_time = time.time() - batch_start_time
        batch_times.append(batch_time)

        # Calculate average batch time and estimate remaining time
        # (remaining batches of the current epoch only)
        avg_batch_time = sum(batch_times) / len(batch_times)
        remaining_batches = len(train_loader) - (batch_idx + 1)
        remaining_time = remaining_batches * avg_batch_time
        print(f"Epoch {epoch + 1}/{epochs}, Batch {batch_idx + 1}/{len(train_loader)}, "
              f"Batch Time: {batch_time:.2f}s, Estimated Time Remaining: {format_time(remaining_time)}")

    print(f"Epoch {epoch + 1}/{epochs}, Loss: {epoch_loss:.4f}")


In [None]:
# OBSOLETE - OLD: first version of the training loop, kept for reference.
# NOTE(review): the output saved with this cell is a cpu/cuda device-mismatch
# RuntimeError; presumably the model had not been moved to `device` when it ran.
# Running this cell rebinds `format_time` and `epochs` (to 5).
import time

# Duplicate of the HH:MM:SS helper defined with the current training loop.
def format_time(seconds):
    hours, remainder = divmod(seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    return f"{int(hours):02}:{int(minutes):02}:{int(seconds):02}"

epochs = 5

for epoch in range(epochs):
    model.train()
    epoch_loss = 0  # Running sum of the per-batch loss values
    batch_times = []

    for batch_idx, (inputs, labels) in enumerate(train_loader):
        batch_start_time = time.time()
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        predictions = model(inputs)  # Shape: (batch_size, vocab_size)
        # Moves only the model OUTPUT to `device`; the model's own parameters
        # are not moved anywhere in this cell.
        predictions = predictions.to(device)
        loss = criterion(predictions, labels)  # Compare predictions to labels

        loss.backward()  # Compute gradients
        optimizer.step()  # Update parameters

        epoch_loss += loss.item()

        # Record batch processing time
        batch_time = time.time() - batch_start_time
        batch_times.append(batch_time)

        # Calculate average batch time and estimate remaining time
        # (remaining batches of the current epoch only)
        avg_batch_time = sum(batch_times) / len(batch_times)
        remaining_batches = len(train_loader) - (batch_idx + 1)
        remaining_time = remaining_batches * avg_batch_time
        print(f"Epoch {epoch + 1}/{epochs}, Batch {batch_idx + 1}/{len(train_loader)}, "
              f"Batch Time: {batch_time:.2f}s, Estimated Time Remaining: {format_time(remaining_time)}")

    print(f"Epoch {epoch + 1}/{epochs}, Loss: {epoch_loss:.4f}")


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument index in method wrapper_CUDA__index_select)

In [None]:
# Save the trained model
# Only the model's state_dict is written (no optimizer state). The target is
# inside the Drive corpus folder - presumably this requires the Drive mount.
model_save_path = "/content/drive/My Drive/corpusDir/model.pth"
torch.save(model.state_dict(), model_save_path)
print(f"Model saved to {model_save_path}")


Model saved to /content/drive/My Drive/corpusDir/model.pth


In [None]:
# Top-1 accuracy of the predicted next word over the whole test loader.

# Switch model to evaluation mode
model.eval()

correct_predictions, total_predictions = 0, 0

# Disable gradient calculations
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        # Get predictions
        predictions = model(inputs)  # Shape: (batch_size, vocab_size)
        predicted_labels = torch.argmax(predictions, dim=1)  # Get the index of the max logit

        # Compare predictions to ground truth
        correct_predictions += int(predicted_labels.eq(labels).sum())
        total_predictions += len(labels)

# Compute accuracy
accuracy = correct_predictions / total_predictions
print(f"Test Accuracy: {accuracy:.4f}")


  inputs = [torch.tensor(input) for input in inputs]


KeyboardInterrupt: 

In [None]:
# Print decoded examples (input words, expected next word, predicted next word)
# for the first few test batches.

# Reverse lookup id -> word, built once. Decoding each id with a dict lookup
# keeps the words in sentence order, including repeated words. Filtering
# vocab.items() per example yields vocabulary order with repeats dropped, and
# costs a full vocabulary scan for every example.
id_to_word = {}
for word, word_id in vocab.items():
    id_to_word.setdefault(word_id, word)  # keep the first word if two share an id

max_batches = 3  # Adjust this value to control how many batches you want to inspect

# Switch model to evaluation mode
model.eval()

# Disable gradient calculations
with torch.no_grad():
    for idx, (inputs, labels) in enumerate(test_loader):
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Get predictions
        predictions = model(inputs)  # Shape: (batch_size, vocab_size)
        predicted_labels = predictions.argmax(dim=1)  # Get the index of the max logit

        # Decode and display inputs, actual labels, and predictions
        rows = zip(inputs.tolist(), labels.tolist(), predicted_labels.tolist())
        for token_ids, label_id, predicted_id in rows:
            # Id 0 is the padding value added by the collate function; skip it.
            input_words = [id_to_word.get(token_id, '<UNK>') for token_id in token_ids if token_id != 0]

            print(f"Input: {' '.join(input_words)}")
            print(f"Actual Label: {id_to_word.get(label_id, '<UNK>')}")
            print(f"Predicted: {id_to_word.get(predicted_id, '<UNK>')}")
            print()

        # Stop after a few batches
        if idx + 1 >= max_batches:
            break


  inputs = [torch.tensor(input) for input in inputs]


Input: propinas se a es el en que un de chica la y muy cara debería estar feliz sonriente españa muchas veces ni céntimo deja no problema hay acostumbrarnos dejar para gente haga su
Actual Label: trabajo
Predicted: <UNK>

Input: si a es el que como ni no por bien visto acaso di cabrea todo mundo igual discriminación acoso <UNK>
Actual Label: sexual
Predicted: <UNK>

Input: es el cambio de con la qué tarjeta pagando balance entero
Actual Label: cuenta
Predicted: <UNK>

Input: lo el en que sin no su solo por chiste ya dicho conservadores cosas cabeza sigo ver ningún lado enfadados ocurren
Actual Label: sorprende
Predicted: <UNK>

Input: el sin ciclo
Actual Label: fin
Predicted: <UNK>

Input: si lo a pero es el o en una que sea un de sin con la y como tiene no más al cuando las los sus dos son resto todos donde cabeza nos bandos pensar tan mas objetivos persona menos izquierdas política español psoe etapa eminentemente corrupta políticas cuestionables poláéíóúüñticos pp españann izquierdi

KeyboardInterrupt: 

In [None]:
# Count how often the <UNK> id occurs in the stored training sequences,
# separately for the input positions (every token but the last) and the label
# position (the last token), together with the matching totals.
unk_id = vocab["<UNK>"]

unk_count_inputs = 0
unk_count_labels = 0
inputs_word_count = 0  # tokens used as model input: len(sentence) - 1 each
labels_word_count = 0  # exactly one label per sentence

for sentence in train_dataset.sentences:
    if not sentence:
        continue  # no input and no label to count; sentence[-1] would raise

    label_is_unk = int(sentence[-1] == unk_id)
    unk_count_labels += label_is_unk
    # Occurrences in the whole sentence minus the one in the label position.
    unk_count_inputs += sentence.count(unk_id) - label_is_unk

    inputs_word_count += len(sentence) - 1
    labels_word_count += 1

print(f"UNK in Inputs: {unk_count_inputs} out of {inputs_word_count}, UNK in Labels: {unk_count_labels}")
print(f"Total Words in Inputs: {inputs_word_count}, Total Words in Labels: {labels_word_count}")


UNK in Inputs: 961721 out of , UNK in Labels: 62253
Total Words in Inputs: 72034188, Total Words in Labels: 72034188
