In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from collections import Counter
from torch.utils.data import Dataset, DataLoader
import os  # Para verificar si el archivo del modelo existe

class SimpleLM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(SimpleLM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        out, _ = self.lstm(x)
        out = self.fc(out[:, -1, :])
        return out

class TextDataset(Dataset):
    def __init__(self, filepath, vocab_size=1000):
        self.vocab = self.build_vocab(filepath, vocab_size)
        self.data = self.load_data(filepath)

    def build_vocab(self, filepath, vocab_size):
        with open(filepath, 'r', encoding='utf-8') as file:
            text = file.read()
        words = text.split()
        word_counts = Counter(words)
        most_common = word_counts.most_common(vocab_size - 1)
        vocab = {word: idx for idx, (word, _) in enumerate(most_common, start=1)}
        vocab['<UNK>'] = 0
        return vocab

    def load_data(self, filepath):
        with open(filepath, 'r', encoding='utf-8') as file:
            text = file.read()
        words = text.split()
        return [self.vocab.get(word, 0) for word in words]

    def __len__(self):
        return len(self.data) - 1

    def __getitem__(self, idx):
        return (torch.tensor(self.data[idx:idx+1]), torch.tensor(self.data[idx+1]))

# Definir hiperparámetros
vocab_size = 1000
embedding_dim = 128
hidden_dim = 256
learning_rate = 0.001
num_epochs = 2
batch_size = 32

# Crear el conjunto de datos
full_path = "/ruta/del/modelo/"
dataset = TextDataset(full_path + 'dict.txt', vocab_size)
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Inicializar el modelo, la función de pérdida y el optimizador
model = SimpleLM(vocab_size, embedding_dim, hidden_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Cargar modelo y optimizador si existe
model_path = full_path + "simple_lm_model.pth"
if os.path.exists(model_path):
    checkpoint = torch.load(model_path)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    num_epochs = checkpoint['epoch'] + num_epochs  # Continuar desde el último epoch
    print("Modelo cargado, continuando entrenamiento desde la época:", checkpoint['epoch'])



Modelo cargado, continuando entrenamiento desde la época: 18


In [None]:
# Ciclo de entrenamiento
model.train()
for epoch in range(num_epochs):
    total_loss = 0
    for inputs, targets in data_loader:
        optimizer.zero_grad()
        
        outputs = model(inputs)
        outputs = outputs.view(-1, vocab_size)
        
        loss = criterion(outputs, targets.view(-1))
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    print(f'Epoch {epoch+1}, Loss: {total_loss / len(data_loader)}')



Epoch 1, Loss: 0.9191245425037399


In [None]:
# Guardar el modelo
torch.save({
    'epoch': epoch,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
}, model_path)



In [None]:
# Generar texto
model.eval()
start_token = torch.tensor([[dataset.vocab['<UNK>']]], dtype=torch.long)
generated_text = []

for _ in range(512):
    with torch.no_grad():
        output = model(start_token)
        probs = torch.softmax(output, dim=-1)
        next_token = torch.multinomial(probs, 1)
        generated_text.append(next_token.item())

# Decodificar los tokens generados a texto
inverse_vocab = {idx: word for word, idx in dataset.vocab.items()}
generated_words = [inverse_vocab.get(token, '<UNK>') for token in generated_text]
generated_sentence = ' '.join(generated_words)

print("Generated Text:", generated_sentence)
