In [1]:
import pandas as pd
import json
import os
from concurrent.futures import ProcessPoolExecutor
datadir = "/home/m/dev/ai/llm/wiki_en"
max_files = 1000

file_list = [
    os.path.join(datadir, filename)
    for filename in os.listdir(datadir)
    if filename.endswith(".json")
][:max_files]
print(len(file_list))
def load_file(file_path):
    with open(file_path, "r") as f:
        data = json.load(f)
        df = pd.DataFrame(data)
    return df

with ProcessPoolExecutor() as executor:
    dfs = list(executor.map(load_file, file_list))

fulldata = pd.concat(dfs, ignore_index=True)

print(fulldata)

605
              id                                               text  \
0        9786730  Elizabeth Seton Academy may refer to: *Elizabe...   
1        9786749  __NOTOC__ is an editor and architecture critic...   
2        9786753  Northern Beaches Christian School is an indepe...   
3        9786755  Difference of Opinion was an Australian televi...   
4        9786756  Makoto Ueda may refer to: *Makoto Ueda (poetry...   
...          ...                                                ...   
6144358    32505  An ampule of nitrogen oxide vapor: brown nitro...   
6144359    32506  Venus is a planet in the Solar System, named a...   
6144360    32509  Vitamin C, also known as ascorbic acid and asc...   
6144361    32511  Global distribution of speakers Vietnamese () ...   
6144362    32512  A vitamin is an organic molecule (or a chemica...   

                                     title  
0                  Elizabeth Seton Academy  
1        Makoto Ueda (architecture critic)  
2       

In [4]:
import torch
torch.cuda.empty_cache()
print(torch.cuda.get_device_name(0))
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
import transformers
import math
from torch.optim.lr_scheduler import CosineAnnealingLR

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))

        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)
    
    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

class TransformerModel(nn.Module):
    def __init__(self, vocab_size, d_model=512, nhead=8, num_encoder_layers=6, num_decoder_layers=6, dim_feedforward=2048, dropout=0.1):
        super(TransformerModel, self).__init__()
        
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward, dropout=dropout, batch_first=True)
        decoder_layer = nn.TransformerDecoderLayer(d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward, dropout=dropout, batch_first=True)
        
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_encoder_layers)
        self.transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_decoder_layers)
        
        self.output_layer = nn.Linear(d_model, vocab_size)
        self.d_model = d_model
        
    def generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz) * float('-inf')) == 0).transpose(0, 1)
        return mask
        
    def forward(self, src, tgt, src_mask=None, tgt_mask=None, memory_mask=None,
                src_key_padding_mask=None, tgt_key_padding_mask=None, memory_key_padding_mask=None):
        src = self.embedding(src) * math.sqrt(self.d_model)
        tgt = self.embedding(tgt) * math.sqrt(self.d_model)
        
        src = self.pos_encoder(src)
        tgt = self.pos_encoder(tgt)

        memory = self.transformer_encoder(src, src_key_padding_mask=src_key_padding_mask)
        output = self.transformer_decoder(tgt, memory, tgt_mask=tgt_mask,
                                          memory_mask=memory_mask,
                                          tgt_key_padding_mask=tgt_key_padding_mask,
                                          memory_key_padding_mask=memory_key_padding_mask)
        
        output = self.output_layer(output)
        return output

class WikiDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=512):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data.iloc[idx]['text']
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }
        

tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')
vocab_size = tokenizer.vocab_size

model = TransformerModel(vocab_size)
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)
scheduler = CosineAnnealingLR(optimizer, T_max=10000, eta_min=1e-6)


NVIDIA GeForce RTX 4080 SUPER


In [3]:
import time
import torch
from torch.amp import GradScaler, autocast

dataset = WikiDataset(fulldata, tokenizer)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, num_workers=4)


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
num_epochs = 3

num_batches = len(dataloader)
max_batches = num_batches

print(f"Number of batches: {num_batches}")
model.train()
total_estimated_time = 0
scaler = GradScaler()

initial_epoch_time_estimate_done = False
start_training_time = time.time()


for epoch in range(num_epochs):
    total_loss = 0.0
    epoch_start_time = time.time()

    for batch_idx, batch in enumerate(dataloader):
        if epoch == 0 and batch_idx == 100 and not initial_epoch_time_estimate_done:
            average_batch_time = (time.time() - epoch_start_time) / 100
            estimated_remaining_batches = (num_epochs * max_batches) - 100
            total_estimated_time = average_batch_time * estimated_remaining_batches
            print(f"Initial Estimated total remaining time: {total_estimated_time / 60:.2f} minutes.")
            initial_epoch_time_estimate_done = True

        batch_start_time = time.time()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        inputs = input_ids[:, :-1]
        targets = input_ids[:, 1:]

        src_key_padding_mask = (inputs == tokenizer.pad_token_id)
        tgt_key_padding_mask = src_key_padding_mask

        tgt_mask = model.generate_square_subsequent_mask(inputs.size(1)).to(device)

        optimizer.zero_grad()

        with autocast(device_type="cuda"):
            outputs = model(
                inputs, inputs,
                tgt_mask=tgt_mask,
                src_key_padding_mask=src_key_padding_mask,
                tgt_key_padding_mask=tgt_key_padding_mask
            )

            loss = criterion(outputs.view(-1, vocab_size), targets.contiguous().view(-1))

        scaler.scale(loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()

        total_loss += loss.item()

        batch_end_time = time.time()
        elapsed_batch_time = batch_end_time - batch_start_time

        if batch_idx % 100 == 0:
            average_batch_time = (time.time() - epoch_start_time) / (batch_idx + 1)
            estimated_remaining_batches = (num_epochs - epoch - 1) * max_batches + (max_batches - batch_idx - 1)
            estimated_remaining_time = average_batch_time * estimated_remaining_batches

            print(f"Epoch {epoch + 1}/{num_epochs}, Batch {batch_idx + 1}/{max_batches}, "
                  f"Loss: {loss.item():.4f}, Batch Time: {elapsed_batch_time:.2f} sec, "
                  f"Average Batch Time: {average_batch_time:.2f} sec, "
                  f"Estimated Remaining Time: {estimated_remaining_time / 60:.2f} minutes")

        if (batch_idx + 1) % 10000 == 0:
            torch.save(model.state_dict(), f'model_epoch_{epoch + 1}_batch_{batch_idx + 1}.pth')
            print(f"Model saved at epoch {epoch + 1}, batch {batch_idx + 1}")

    avg_loss = total_loss / (batch_idx + 1)
    epoch_end_time = time.time()
    elapsed_epoch_time = epoch_end_time - epoch_start_time

    print(f"Epoch {epoch + 1}/{num_epochs} completed. "
          f"Average Loss: {avg_loss:.4f}, Epoch Time: {elapsed_epoch_time / 60:.2f} minutes, "
          f"Total Time Elapsed: {(epoch_end_time - start_training_time) / 60:.2f} minutes")

    torch.save(model.state_dict(), f'model_epoch_{epoch + 1}.pth')
    print(f"Model saved for epoch {epoch + 1}")

print("Training completed.")

model.load_state_dict(torch.load('best_model.pth'))
print("Best model loaded.")


Number of batches: 192012
Epoch 1/3, Batch 1/192012, Loss: 10.5021, Batch Time: 0.55 sec, Average Batch Time: 2.55 sec, Estimated Remaining Time: 24492.37 minutes
Initial Estimated total remaining time: 2300.29 minutes.
Epoch 1/3, Batch 101/192012, Loss: 8.5286, Batch Time: 0.22 sec, Average Batch Time: 0.24 sec, Estimated Remaining Time: 2298.02 minutes
Epoch 1/3, Batch 201/192012, Loss: 7.8527, Batch Time: 0.22 sec, Average Batch Time: 0.23 sec, Estimated Remaining Time: 2186.93 minutes
Epoch 1/3, Batch 301/192012, Loss: 7.3845, Batch Time: 0.22 sec, Average Batch Time: 0.22 sec, Estimated Remaining Time: 2149.90 minutes
Epoch 1/3, Batch 401/192012, Loss: 7.2810, Batch Time: 0.22 sec, Average Batch Time: 0.22 sec, Estimated Remaining Time: 2131.16 minutes
Epoch 1/3, Batch 501/192012, Loss: 7.0932, Batch Time: 0.22 sec, Average Batch Time: 0.22 sec, Estimated Remaining Time: 2119.76 minutes
Epoch 1/3, Batch 601/192012, Loss: 7.0837, Batch Time: 0.22 sec, Average Batch Time: 0.22 sec, 

KeyboardInterrupt: 

In [16]:
vocab_size = 30522
d_model = 512
nhead = 4
num_encoder_layers = 6
num_decoder_layers = 6
dim_feedforward = 2048
dropout = 0.1

model = TransformerModel(
    vocab_size=vocab_size,
    d_model=d_model,
    nhead=nhead,
    num_encoder_layers=num_encoder_layers,
    num_decoder_layers=num_decoder_layers,
    dim_feedforward=dim_feedforward,
    dropout=dropout
)

state_dict = torch.load('model.4.pth')

new_state_dict = {}
for k, v in state_dict.items():
    if 'linear1.weight' in k or 'linear2.weight' in k:
        if v.shape[0] == 2048 or v.shape[1] == 2048:
            new_state_dict[k] = v
        else:
            print(f"Resizing {k} from {v.shape} to match dim_feedforward=2048")
            if 'linear1.weight' in k:
                new_state_dict[k] = torch.nn.functional.pad(v, (0, 0, 0, 2048 - v.shape[0]))
            else:  # 'linear2.weight'
                new_state_dict[k] = torch.nn.functional.pad(v, (0, 2048 - v.shape[1], 0, 0))
    elif 'linear1.bias' in k:
        if v.shape[0] == 2048:
            new_state_dict[k] = v
        else:
            print(f"Resizing {k} from {v.shape} to match dim_feedforward=2048")
            new_state_dict[k] = torch.nn.functional.pad(v, (0, 2048 - v.shape[0]))
    else:
        new_state_dict[k] = v

model.load_state_dict(new_state_dict)

model.eval()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

def generate_text(prompt, max_length=20, temperature=0.6, top_k=50, repetition_penalty=1.2):

    input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)
    generated = input_ids.clone()
    for _ in range(max_length):
        inputs = generated[:, -max_length:]

        src_key_padding_mask = (inputs == tokenizer.pad_token_id).to(device)
        tgt_mask = model.generate_square_subsequent_mask(inputs.size(1)).to(device)

        with torch.no_grad():
            outputs = model(
                inputs, inputs,
                tgt_mask=tgt_mask,
                src_key_padding_mask=src_key_padding_mask,
                tgt_key_padding_mask=src_key_padding_mask
            )
            next_token_logits = outputs[:, -1, :] / temperature

            for token_id in set(generated.view(-1).tolist()):
                next_token_logits[:, token_id] /= repetition_penalty

            filtered_logits, indices = torch.topk(next_token_logits, top_k)
            probabilities = torch.softmax(filtered_logits, dim=-1)
            next_token = torch.multinomial(probabilities, num_samples=1)
            next_token = indices.gather(-1, next_token)

        generated = torch.cat([generated, next_token], dim=1)

        if next_token.item() == tokenizer.sep_token_id:
            break

    generated_text = tokenizer.decode(generated[0], skip_special_tokens=True)
    return generated_text


  state_dict = torch.load('model.4.pth')


In [17]:
print(generate_text("The capital of"))

the capital of of region of of of north of a. of a, located of history of a or a of
