In [1]:
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [2]:
len(text)

1115393

In [3]:
print(text[:300])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us


In [4]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [5]:
token_to_idx = {ch: i for i, ch in enumerate(chars)}
idx_to_token = {i: ch for i, ch in enumerate(chars)}

SOS_TOKEN = '#'

assert SOS_TOKEN not in token_to_idx

token_to_idx[SOS_TOKEN] = vocab_size
idx_to_token[vocab_size] = SOS_TOKEN

vocab_size += 1

def encode(text):
    return [token_to_idx.get(ch, 0) for ch in text]

def decode(text_encoded):
    return ''.join([idx_to_token.get(i, '') for i in text_encoded])

encode("#Oh Lord")


[65, 27, 46, 1, 24, 53, 56, 42]

In [6]:
decode([65, 27, 46, 1, 24, 53, 56, 42])

'#Oh Lord'

In [7]:
import torch
data = torch.tensor(encode(text), dtype=torch.long)
data[:300]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
        47, 59, 57,  1, 47, 57,  1, 41, 

In [8]:
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

In [9]:
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')

#device=torch.device('cpu')
class Dataset(torch.utils.data.Dataset):
    def __init__(self, data, seq_len, block_size=128):
        self.data = data.to(device)
        self.seq_len = seq_len
        self.block_size = block_size
        self.sos_token = torch.tensor([token_to_idx[SOS_TOKEN]], dtype=torch.long).to(device)

    def __len__(self):
        return (len(self.data) - self.seq_len) // self.block_size

    def __getitem__(self, i):
        x = self.data[i * self.block_size: ((i + 1) * self.block_size)-1]
        x = torch.cat([self.sos_token, x])
        y = self.data[i * self.block_size + 1: (i + 1) * self.block_size + 1]
        return x, y
    
seq_len = 768
train_ds = Dataset(train_data, seq_len)
val_ds = Dataset(val_data, seq_len)

print(train_ds[0])

(tensor([65, 18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43,
        44, 53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52,
        63,  1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,
         1, 57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39,
        49,  6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15,
        47, 58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50,
        50,  1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1,
        58, 53], device='cuda:0'), tensor([47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44, 53,
        56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,  1,
        44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1, 57,
        54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,  6,
         1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47, 58,
    

In [10]:
import torch
import math

class PositionalEncoding(torch.nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return x

class SimpleModel(torch.nn.Module):
    def __init__(self, vocab_size, embedding_dim=128, nhead=128, num_layers=3):
        super(SimpleModel, self).__init__()
        self.embedding_dim = embedding_dim
        self.embedding = torch.nn.Embedding(vocab_size, embedding_dim)
        self.pos_encoder = PositionalEncoding(embedding_dim)
        self.transformer_layer = torch.nn.TransformerDecoderLayer(d_model=embedding_dim, nhead=nhead, batch_first=True)
        self.transformer_decoder = torch.nn.TransformerDecoder(self.transformer_layer, num_layers=num_layers)
        self.fc = torch.nn.Linear(embedding_dim, vocab_size)
        self.dropout = torch.nn.Dropout(0.2)
        self.initialize_weights()
        
    def initialize_weights(self):
        for m in self.modules():
            if isinstance(m, torch.nn.Linear):
                torch.nn.init.kaiming_normal_(m.weight)
                if m.bias is not None:
                    torch.nn.init.zeros_(m.bias)
            elif isinstance(m, torch.nn.Conv2d):
                torch.nn.init.kaiming_normal_(m.weight)
                if m.bias is not None:
                    torch.nn.init.zeros_(m.bias)
            elif isinstance(m, torch.nn.ConvTranspose2d):
                torch.nn.init.kaiming_normal_(m.weight)
                if m.bias is not None:
                    torch.nn.init.zeros_(m.bias)
            elif isinstance(m, torch.nn.BatchNorm2d):
                torch.nn.init.ones_(m.weight)
                torch.nn.init.zeros_(m.bias)
            elif isinstance(m, torch.nn.LayerNorm):
                torch.nn.init.ones_(m.weight)
                torch.nn.init.zeros_(m.bias)

    def forward(self, src, mask=None):
        src = self.embedding(src) * math.sqrt(self.embedding_dim)
        src = self.pos_encoder(src)
        # Da es sich um ein Decoder-only Modell handelt, ist `memory` nicht notwendig.
        # Stattdessen verwenden wir `src` direkt als Eingabe für den Transformer Decoder.
        out = self.transformer_decoder(src, memory=src, tgt_mask=mask)
        out = torch.nn.functional.relu(out)
        out = self.dropout(out)
        out = self.fc(out)
        return out


def generate_square_subsequent_mask(sz):
    """Generiert eine obere Dreiecksmatrix zur Maskierung der zukünftigen Tokens."""
    mask = torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1)
    return mask

model = SimpleModel(vocab_size, 128).to(device)
train_loader=torch.utils.data.DataLoader(train_ds, batch_size=32, shuffle=True)

for xb, yb in train_loader:
    print(xb.shape)
    print("mask of size:", xb.size(1))
    mask=generate_square_subsequent_mask(xb.size(1)).to(device)
    print(mask)
    out = model(xb, mask=mask)
    print(out.shape)
    break

torch.Size([32, 128])
mask of size: 128
tensor([[0., -inf, -inf,  ..., -inf, -inf, -inf],
        [0., 0., -inf,  ..., -inf, -inf, -inf],
        [0., 0., 0.,  ..., -inf, -inf, -inf],
        ...,
        [0., 0., 0.,  ..., 0., -inf, -inf],
        [0., 0., 0.,  ..., 0., 0., -inf],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0')
torch.Size([32, 128, 66])


In [11]:
def generate_text(model, text, length):
    model.eval()
    with torch.no_grad():
        # Initialisieren Sie die Eingabe mit einem Start-Token oder der kodierten Eingabe
        input_data = torch.tensor(encode(text), dtype=torch.long, device=device).unsqueeze(0)
        
        # Bereiten Sie eine leere Sequenz für die Decoder-Eingabe vor (oder verwenden Sie ein SOS-Token)

        for i in range(length):
            # Verwenden Sie die aktuelle Decoder-Eingabe für die Vorhersage
            mask=generate_square_subsequent_mask(input_data.size(1)).to(device)
            out = model(input_data, mask=mask)
            last_token_logits = out[:, -1, :]
            last_token_prob = torch.softmax(last_token_logits, dim=-1)
            
            # print top 5 tokens
            #values, indices = torch.topk(last_token_prob, 5)
            #for v, i in zip(values[0], indices[0]):
            #    print(f"{decode([i.item()])} ({v.item():.4f})", end=", ")
            #print()
            #print(torch.argmax(last_token_prob, dim=1).unsqueeze(1))
            predicted_token = torch.argmax(last_token_prob, dim=1).unsqueeze(1)
            
            # Aktualisieren Sie die Decoder-Eingabe mit dem neu vorhergesagten Token
            input_data = torch.cat([input_data, predicted_token], dim=1)
    return decode(input_data.cpu().numpy()[0])  

generate_text(model, "Oh Lord", 3)


'Oh LordATz'

In [15]:
from tqdm.notebook import tqdm


BATCH_SIZE = 256

loss_func = torch.nn.CrossEntropyLoss()
model = SimpleModel(vocab_size, embedding_dim=512, nhead=16, num_layers=1)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0005, weight_decay=0.01)

model.to(device)

def train_epoch(model, train_loader, loss_func, optimizer):
    model.train()
    total_loss = 0
    for xb, yb in train_loader:
        optimizer.zero_grad()
        mask=generate_square_subsequent_mask(xb.size(1)).to(device)
        y_pred = model(xb, mask=mask)
       
        loss = loss_func(y_pred.view(-1, vocab_size), yb.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss/len(train_loader)

def validate_epoch(model, val_loader, loss_func):
    model.eval()
    total_loss = 0
    correct = 0
    with torch.no_grad():
        for xb, yb in val_loader:
            mask=generate_square_subsequent_mask(yb.size(1)).to(device)
            y_pred = model(xb, mask=mask)
            loss = loss_func(y_pred.view(-1, vocab_size), yb.view(-1))
            total_loss += loss.item()
            correct += (torch.argmax(y_pred, dim=2) == yb).sum().item()
    return total_loss/len(val_loader), correct

train_loader=torch.utils.data.DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=False)
val_loader=torch.utils.data.DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)

train_losses = []
val_losses = []

for i in range(1000):
    train_loss = train_epoch(model, train_loader, loss_func, optimizer)
    train_losses.append(train_loss)
    if i % 2 == 0:
        val_loss, acc = validate_epoch(model, val_loader, loss_func)
        val_losses.append(val_loss)
        print(f'Epoch {i}, train_loss: {train_loss}, val_loss: {val_loss}, correct predicted: {acc}, sample_output:', generate_text(model, "Oh Lor", 15))
    else:
        val_losses.append(val_losses[-1])

import matplotlib.pyplot as plt
plt.plot(train_losses, label='train loss')
plt.plot(val_losses, label='val loss')
plt.legend()
plt.show()

Epoch 0, train_loss: 3.2851047977324455, val_loss: 2.991214394569397, correct predicted: 22918, sample_output: Oh Lor o o o o o o o 
Epoch 2, train_loss: 2.977508529540031, val_loss: 2.9313393235206604, correct predicted: 23201, sample_output: Oh Lor e e e e e e e 
Epoch 4, train_loss: 2.9338382751710954, val_loss: 2.9184478521347046, correct predicted: 23341, sample_output: Oh Lor e e o e o e e 
Epoch 6, train_loss: 2.9071459308747323, val_loss: 2.9094192385673523, correct predicted: 23327, sample_output: Oh Lor o o o o o o o 
Epoch 8, train_loss: 2.8880800739411385, val_loss: 2.904467821121216, correct predicted: 23387, sample_output: Oh Lor o o o o o o o 
Epoch 10, train_loss: 2.8765700786344466, val_loss: 2.8994768857955933, correct predicted: 23410, sample_output: Oh Lor o o o o o o o 


KeyboardInterrupt: 

In [None]:
generate_text(model, 'Thank you! qw ', 5)

tensor([[39]], device='cuda:0')
tensor([[39]], device='cuda:0')
tensor([[39]], device='cuda:0')
tensor([[39]], device='cuda:0')
tensor([[39]], device='cuda:0')


'aaaaa'

In [None]:
generate_text(model, "This is a strange repose, to be asleep With eyes wide open; standing, speaking, moving,", 5)

tensor([[43]], device='cuda:0')
tensor([[43]], device='cuda:0')
tensor([[43]], device='cuda:0')
tensor([[43]], device='cuda:0')
tensor([[43]], device='cuda:0')


'eeeee'

In [None]:
for i in range(100):
    train_loss = train_epoch(model, train_loader, loss_func, optimizer)
    train_losses.append(train_loss)
    if i % 10 == 0:
        val_loss = validate_epoch(model, val_loader, loss_func)
        val_losses.append(val_loss)
        print(f'Epoch {i}, train_loss: {train_loss}, val_loss: {val_loss}')
    else:
        val_losses.append(val_losses[-1])

import matplotlib.pyplot as plt
plt.plot(train_losses, label='train loss')
plt.plot(val_losses, label='val loss')
plt.legend()
plt.show()

Epoch 0, train_loss: 0.17242780793458223, val_loss: 0.12128036469221115
Epoch 10, train_loss: 0.10837701428681612, val_loss: 0.07178276032209396
Epoch 20, train_loss: 0.07412474183365703, val_loss: 0.0466329213231802
Epoch 30, train_loss: 0.05364865646697581, val_loss: 0.032324107363820076
Epoch 40, train_loss: 0.04040246014483273, val_loss: 0.023512743413448334
Epoch 50, train_loss: 0.03169565834105015, val_loss: 0.01773807965219021


KeyboardInterrupt: 

In [None]:
generate_text(model, 'Thank you', 100)

tensor([[43]], device='cuda:0')
tensor([[43]], device='cuda:0')
tensor([[43]], device='cuda:0')
tensor([[43]], device='cuda:0')
tensor([[43]], device='cuda:0')
tensor([[43]], device='cuda:0')
tensor([[43]], device='cuda:0')
tensor([[43]], device='cuda:0')
tensor([[43]], device='cuda:0')
tensor([[43]], device='cuda:0')
tensor([[43]], device='cuda:0')
tensor([[43]], device='cuda:0')
tensor([[43]], device='cuda:0')
tensor([[43]], device='cuda:0')
tensor([[43]], device='cuda:0')
tensor([[43]], device='cuda:0')
tensor([[43]], device='cuda:0')
tensor([[43]], device='cuda:0')
tensor([[43]], device='cuda:0')
tensor([[43]], device='cuda:0')
tensor([[43]], device='cuda:0')
tensor([[43]], device='cuda:0')
tensor([[43]], device='cuda:0')
tensor([[43]], device='cuda:0')
tensor([[43]], device='cuda:0')
tensor([[43]], device='cuda:0')
tensor([[43]], device='cuda:0')
tensor([[43]], device='cuda:0')
tensor([[43]], device='cuda:0')
tensor([[43]], device='cuda:0')
tensor([[43]], device='cuda:0')
tensor([

'eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee'

In [None]:
generate_text(model, "This is a strange repose, to be asleep With eyes wide open; standing, speaking, moving,", 100)

'This is a strange repose, to be asleep With eyes wide open; standing, speaking, moving,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,'