In [3]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd

# Paths: the corpus and the checkpoint both live next to the notebook.
BASE_DIR = "."
TEXT_FILE = os.path.join(BASE_DIR, "pile_uncopyrighted_50MB.txt")
MODEL_PATH = os.path.join(BASE_DIR, "char_lstm_model.pth")

# Device: use CUDA when PyTorch reports it as available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

# Load dataset
# Only the first MAX_CHARS characters are used (limit for training demo), so
# request exactly that many from the file object instead of reading the whole
# corpus into memory and slicing it afterwards.
MAX_CHARS = 500000
with open(TEXT_FILE, "r", encoding="utf-8") as f:
    text = f.read(MAX_CHARS)

print(f"Loaded text length: {len(text):,} characters")

# Character mapping
# Sorting makes the character -> id assignment deterministic for a given text.
chars = sorted(set(text))
vocab_size = len(chars)
print("Unique characters:", vocab_size)

# stoi: character -> integer id; itos: integer id -> character (its inverse).
stoi = {ch: i for i, ch in enumerate(chars)}
itos = dict(enumerate(chars))

def encode(s):
    """Return the list of integer ids for the characters of ``s``.

    Each character is looked up in the module-level ``stoi`` table; a
    character that is not in the table raises ``KeyError``.
    """
    return list(map(stoi.__getitem__, s))

def decode(l):
    """Return the string spelled by the integer ids in ``l``.

    Each id is looked up in the module-level ``itos`` table; an id that is
    not in the table raises ``KeyError``.
    """
    return ''.join(map(itos.__getitem__, l))

# Dataset
class TextDataset(Dataset):
    """Sliding-window dataset for next-token prediction.

    Item ``i`` is the pair ``(data[i:i + seq_len], data[i + 1:i + seq_len + 1])``
    returned as ``torch.long`` tensors, i.e. the target is the input shifted
    one position to the right.

    Args:
        data: Sequence of integer ids; must support ``len()`` and slicing.
        seq_len: Number of positions in each input/target window.
    """

    def __init__(self, data, seq_len=100):
        self.data = data
        self.seq_len = seq_len

    def __len__(self):
        # Each window needs seq_len + 1 items. Clamp at zero so that data
        # shorter than that yields an empty dataset instead of a negative
        # length (which ``len()`` rejects).
        return max(0, len(self.data) - self.seq_len)

    def __getitem__(self, idx):
        """Return the ``(input_seq, target_seq)`` pair for window ``idx``.

        Negative indices count from the end, like a list.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        size = len(self)
        if idx < 0:
            idx += size
        # Raising IndexError is what lets ``for item in dataset`` terminate;
        # without it, slicing past the end silently yields short/empty tensors.
        if not 0 <= idx < size:
            raise IndexError(f"index {idx} out of range for dataset of size {size}")

        chunk = self.data[idx:idx + self.seq_len + 1]
        input_seq = torch.tensor(chunk[:-1], dtype=torch.long)
        target_seq = torch.tensor(chunk[1:], dtype=torch.long)
        return input_seq, target_seq

# Turn the corpus into integer ids, wrap it in 100-step sliding windows and
# serve those windows in shuffled mini-batches of 64.
encoded = encode(text)
dataset = TextDataset(data=encoded, seq_len=100)
dataloader = DataLoader(
    dataset=dataset,
    batch_size=64,
    shuffle=True,
)

# Model
class CharLSTM(nn.Module):
    """Embedding -> stacked LSTM -> linear projection back to the vocabulary.

    Args:
        vocab_size: Number of distinct ids; also the size of the output layer.
        embed_size: Width of the embedding vectors.
        hidden_size: Width of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, vocab_size, embed_size=128, hidden_size=256, num_layers=2):
        super().__init__()
        # The attribute names below become the state_dict key prefixes.
        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_size)
        self.lstm = nn.LSTM(
            input_size=embed_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, x, hidden=None):
        """Run ``x`` through the network.

        Args:
            x: Integer id tensor, batch dimension first.
            hidden: Optional LSTM state to continue from; ``None`` lets the
                LSTM start from its default initial state.

        Returns:
            ``(logits, hidden)``: the per-position output of the final linear
            layer and the LSTM state after the last position.
        """
        embedded = self.embed(x)
        features, hidden = self.lstm(embedded, hidden)
        logits = self.fc(features)
        return logits, hidden

# Build the network with the vocabulary size computed from the corpus and
# move its parameters to the selected device.
model = CharLSTM(vocab_size=vocab_size, embed_size=128, hidden_size=256, num_layers=2).to(device)

# Training setup
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.003)
EPOCHS = 5

for epoch in range(EPOCHS):
    model.train()
    # Running sum of the per-batch losses for this epoch.
    total_loss = 0
    for inputs, targets in dataloader:
        inputs, targets = inputs.to(device), targets.to(device)

        # Clear gradients left over from the previous step before the new
        # forward/backward pass.
        optimizer.zero_grad()
        # The returned LSTM state is discarded: every batch starts from the
        # default initial state.
        output, _ = model(inputs)
        # The model output has the vocabulary axis last; CrossEntropyLoss
        # expects the class axis second, so swap the last two axes.
        loss = criterion(output.transpose(1, 2), targets)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses (not weighted by batch size).
    avg_loss = total_loss / len(dataloader)
    print(f"Epoch [{epoch+1}/{EPOCHS}] - Loss: {avg_loss:.4f}")

# Text generation function
def generate_text(model, start_text="hello", length=300):
    """Sample ``length`` characters from ``model``, seeded with ``start_text``.

    The prompt is lowercased before use. Prompt characters that are not in
    the vocabulary (``stoi``) stay in the returned string but are not fed to
    the model, so an unseen character no longer aborts with ``KeyError``.

    Args:
        model: Network returning ``(logits, hidden)`` for an id tensor and an
            optional hidden state.
        start_text: Prompt used to warm up the hidden state.
        length: Number of characters to sample after the prompt.

    Returns:
        The lowercased prompt followed by ``length`` sampled characters.

    Raises:
        ValueError: If no prompt character is in the vocabulary; the model
            needs at least one input step to produce a distribution.
    """
    model.eval()
    start_text = start_text.lower()  # convert to lowercase
    prompt_ids = [stoi[ch] for ch in start_text if ch in stoi]
    if not prompt_ids:
        raise ValueError(
            "start_text must contain at least one character from the vocabulary"
        )

    # Follow the model's own device so the inputs always match its weights,
    # whatever the module-level ``device`` currently is.
    model_device = next(model.parameters()).device
    input_seq = torch.tensor([prompt_ids], dtype=torch.long, device=model_device)
    hidden = None
    generated = list(start_text)

    with torch.no_grad():
        for _ in range(length):
            output, hidden = model(input_seq, hidden)
            # Distribution over the next character, taken from the last step.
            probs = torch.softmax(output[:, -1, :], dim=-1).cpu().numpy().ravel()
            # float32 softmax output can miss summing to 1 by a rounding
            # error, which some NumPy versions reject; renormalise in float64.
            probs = probs.astype(np.float64)
            probs /= probs.sum()
            next_idx = int(np.random.choice(len(probs), p=probs))
            generated.append(itos[next_idx])
            # After the prompt only the newest character is fed back in; the
            # earlier context is carried by ``hidden``.
            input_seq = torch.tensor(
                [[next_idx]], dtype=torch.long, device=model_device
            )

    return ''.join(generated)

# Test generation: draw a 500-character sample from the trained model.
sample = generate_text(model, start_text="Once upon a time", length=500)
print(sample)

# Save model: only the weights (state_dict) are written, not the module object.
weights = model.state_dict()
torch.save(weights, MODEL_PATH)
print(f"Model saved at {MODEL_PATH}")

Using device: cpu
Loaded text length: 500,000 characters
Unique characters: 91
Epoch [1/5] - Loss: 0.9892
Epoch [2/5] - Loss: 0.5925
Epoch [3/5] - Loss: 0.5029
Epoch [4/5] - Loss: 0.4657
Epoch [5/5] - Loss: 0.4440
once upon a time Ġrelationships Ġare Ġassembly Ġand Ġas Ġper Ġjava Ġw aylin Ġh bs ag Ġhas Ġbeen Ġdemonstrated Ġthe Ġeffective Ġsize Ġand Ġdirector Ġof Ġa Ġfew Ġshort Ġvirus Ġwere Ġseen Ġfor Ġthemselves Ġagain Ġand Ġib ib util . Ġthe Ġagreement Ġin Ġconnectors . Ġto Ġeman Ġof Ġknown Ġelfsole Ġk ogated Ġname Ġhas Ġcan Ġcaught Ġother Ġagent Ġand Ġadminisal Ġsetting Ġangle Ġleft Ġ5 Ġand Ġoff , Ġto Ġexearch Ġtell Ġwas Ġban Ġin Ġcomparison Ġto Ġthe Ġindication Ġof Ġboth Ġpath ogen Ġhim Ġdemonstrated Ġfor Ġsuch Ġas Ġc
Model saved at .\char_lstm_model.pth
