In [8]:
import torch
import pylelemmatize
import glob
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import string

corpus_str = "\n".join([open(f, "r").read() for f in glob.glob("../../sample_data/wienocist_charter_1/*.txt")])
print(corpus_str)
alphabet = pylelemmatize.AlphabetBMP(pylelemmatize.allbmp_encoding_alphabet_strings["ascii"], unknown_chr="@")
corpus_str = alphabet(corpus_str)
print(corpus_str)

Wier Chůnrat an dem Harmarkcht der Richter Chvoenrat der Polle der Purgermaister vnd der gemainrat von der Stat ze Wienne veriehen vnd tvoen kvnt allen die disen brief gesehent oder gehoerent daz die erbern læut weilen Her Ernst vnser burger vnd ver Gerdroevt sein hovsvrowe da si baide lebten vnd wol mahten einen hof datze Swechent mit gemainem rate und gvoetleichen willen schvoeften den vrowen datze sant Nycla und ierem Conuent ewichleich nah ier baider tot durch ier sele willen Da si dv baide tot gelagen der vorgenant Ernst vnd ver Gerdrovt sein housvrowe da choemen vvr vns ier baider svne der Jacob vnd der Hainreich vnd ier aidem der Hainreich vnd der Walther vnd bestætigen daz geschæfte vmb den vorgenanten hof vor vns vvr sich vnd vvr alle ier geerben also daz der hof mit allem dem daz darzvoe gehoert den vorgenanten vrowen ewichleichen dienen schol Darvmbe ze einen vorchvnde des geschæfftes und der stætigunge des vorgenanten hoves geh wier disen brief mit vnserr stat insigel Des s

In [None]:
alphabet = pylelemmatize.AlphabetBMP("ABab")
print(alphabet.encode_str_to_onehot("AZzBA"))

class CapitalizationDataset(Dataset):
    def __init__(self, text, alphabet, seq_length=250):
        self.alphabet = alphabet
        self.seq_length = seq_length
        self.text = alphabet(text)
        self.lower_text = self.text.lower()
        self.inputs = []
        self.targets = []

        for i in range(len(text) - seq_length):
            inp_seq = self.lower_text[i:i+seq_length]
            tgt_seq = text[i:i+seq_length]
            print(f"Input: {inp_seq}, Target: {tgt_seq}")
            self.inputs.append(text_to_tensor(inp_seq))
            self.targets.append(text_to_tensor(tgt_seq))

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return self.inputs[idx], self.targets[idx]


[[0. 1. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 1. 0. 0. 0.]]


AttributeError: 'AlphabetBMP' object has no attribute 'encode_str_'

In [None]:

# 1. Define the character sets
all_chars = alphabet.unknown_chr + alphabet.src_alphabet_str
char2idx = {ch: idx for idx, ch in enumerate(all_chars)}
idx2char = {idx: ch for ch, idx in char2idx.items()}

# 2. Data preparation
def text_to_tensor(text):
    return torch.tensor([char2idx.get(c, 0) for c in text], dtype=torch.long)

def tensor_to_text(tensor):
    return ''.join([idx2char[idx.item()] for idx in tensor])

# 3. Custom Dataset

# 4. Model
class CharLSTM(nn.Module):
    def __init__(self, vocab_size, hidden_size, num_layers=1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden=None):
        x = self.embedding(x)
        out, hidden = self.lstm(x, hidden)
        out = self.fc(out)
        return out, hidden

# 5. Training loop
def train_model(text, num_epochs=1000, batch_size=32, hidden_size=128, num_layers=5, lr=0.001, device="cuda"):
    dataset = CapitalizationDataset(text)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    model = CharLSTM(len(all_chars), hidden_size)
    model = model.to(device)
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    for epoch in range(num_epochs):
        total_loss = 0
        for x_batch, y_batch in dataloader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            output, _ = model(x_batch)
            loss = loss_fn(output.view(-1, len(all_chars)), y_batch.view(-1))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.cpu().item()
        print(f"Epoch {epoch+1}, Loss: {total_loss / len(dataloader):.4f}")
    return model.to("cpu")

# 6. Sampling
def predict(model, input_text, max_len=100):
    model.eval()
    input_tensor = text_to_tensor(input_text.lower()).unsqueeze(0)
    output, _ = model(input_tensor)
    pred_indices = output.argmax(dim=2).squeeze(0)
    return tensor_to_text(pred_indices)

# Example usage
if __name__ == "__main__":
    raw_text = "Hello world. this is an example text. another sentence starts here."
    train_str = corpus_str[1000:]
    test_str = corpus_str[:1000]
    model = train_model(train_str, num_epochs=100)
    test_input = "hello world. this is an example text."
    print("Input :", test_str.lower())
    pred = predict(model, test_str.lower())
    print("Output:", pred)
    print(f"CER: {sum([pred[n]==test_str[n] for n in range(len(pred))]) / len(pred):.4f}")

Epoch 1, Loss: 0.0251
Epoch 2, Loss: 0.0042
Epoch 3, Loss: 0.0040
Epoch 4, Loss: 0.0039


KeyboardInterrupt: 

In [None]:
print(torch.cuda.is_available())
torch.__file__

True


'/home/anguelos/venvs/p312_u2404/lib/python3.12/site-packages/torch/__init__.py'