In [1]:
import sys
import torch
from torch.utils.data import DataLoader
from transformers import LlamaTokenizer, LlamaForCausalLM, get_linear_schedule_with_warmup
from datasets import load_dataset
import torch.nn.functional as F
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM

# Configuration
MODEL_NAME = "huggingface/llama"  # Replace with the actual model name/path
DATASET_NAME = "wikipedia"  # Wikimedia dataset
BATCH_SIZE = 4  # Adjust based on your GPU memory
GRADIENT_ACCUMULATION_STEPS = 8
LEARNING_RATE = 5e-5
EPOCHS = 3
TOP_PERCENT = 0.3  # Top 30%

# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B")
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B")
model.to(device)
model.train()

# Load and preprocess dataset
# Here, we're using a subset for demonstration. Adjust as needed.
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")

def tokenize_function(examples):
    return tokenizer(examples["text"], return_tensors="pt", truncation=True, padding='max_length', max_length=512)

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Convert to PyTorch tensors
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

# Create DataLoader
dataloader = DataLoader(tokenized_dataset, batch_size=BATCH_SIZE, shuffle=True)

# Optimizer and scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
total_steps = len(dataloader) * EPOCHS // GRADIENT_ACCUMULATION_STEPS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=total_steps//10, num_training_steps=total_steps)


Using device: cuda


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Using the latest cached version of the dataset since wikitext couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'wikitext-2-raw-v1' at C:\Users\raymo\.cache\huggingface\datasets\wikitext\wikitext-2-raw-v1\0.0.0\b08601e04326c79dfdd32d625aee71d232d685c3 (last modified on Wed Oct 16 21:29:22 2024).


In [4]:

# Training loop
model.train()
for epoch in range(EPOCHS):
    print(f"Epoch {epoch + 1}/{EPOCHS}")
    epoch_loss = 0.0
    progress_bar = tqdm(dataloader, desc="Training")
    
    optimizer.zero_grad()
    
    for step, batch in enumerate(progress_bar):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        
        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
        logits = outputs.logits  # Shape: (batch_size, seq_length, vocab_size)
        print("logits.shape", logits.shape)
        print(logits)
        print("input_id.shape", input_ids.shape)

        logits_flat = logits.view(-1, logits.size(-1)) # shape: (batch_size, seq_length, vocab_size) -> (batch_size * seq_len, vocab_size)
        labels_flat = input_ids.view(-1)  # shape: (batch_size * seq_len)
        
        loss_fn = torch.nn.CrossEntropyLoss(reduction='none')
        per_token_loss = loss_fn(logits_flat, labels_flat)  # shape: (batch_size * seq_len)
        per_token_loss = per_token_loss.view(input_ids.size()) # shape: (1, seq_len)
        print("per_token_loss", per_token_loss.shape)
        print(per_token_loss)
        sys.exit()


        # Shift tokens for next-token prediction
        shift_logits = logits[:, :-1, :].contiguous()
        shift_labels = input_ids[:, 1:].contiguous()
        shift_attention = attention_mask[:, 1:].contiguous()
        
        pre_token_loss = loss_fn(shift_logits, )

        # Compute log probabilities


        log_probs = F.log_softmax(shift_logits, dim=-1)  # (batch_size, seq_length-1, vocab_size)
        
        # Gather log probabilities of the correct tokens
        shift_labels_flat = shift_labels.view(-1)
        log_probs_flat = log_probs.view(-1, log_probs.size(-1))
        token_log_probs = log_probs_flat[torch.arange(shift_labels_flat.size(0)), shift_labels_flat]  # (batch_size * (seq_length-1))
        token_log_probs = token_log_probs.view(shift_labels.size())  # (batch_size, seq_length-1)
        
        # Compute negative log-likelihood loss per token
        token_losses = -token_log_probs  # (batch_size, seq_length-1)
        
        # Mask padding tokens
        token_losses = token_losses * shift_attention  # Zero out losses for padding tokens
        print(token_losses)
        # Determine the threshold for top 30% losses
        # Compute the number of tokens to keep
        num_tokens = (shift_attention.sum()).item()
        if num_tokens == 0:
            continue  # Skip if no tokens to process
        k = int(num_tokens * TOP_PERCENT)
        if k == 0:
            k = 1  # Ensure at least one token is kept
        
        # Flatten the losses and filter out padding tokens
        losses_flat = token_losses.view(-1)
        attention_flat = shift_attention.view(-1)
        valid_losses = losses_flat[attention_flat == 1]
        
        if valid_losses.numel() == 0:
            continue  # Skip if no valid losses
        
        # Find the threshold
        threshold = torch.topk(valid_losses, k, largest=True, sorted=False).values.min()
        
        # Create a mask for top 30% losses
        mask = (token_losses >= threshold).float()
        
        # Apply the mask
        masked_losses = token_losses * mask
        
        # Compute the final loss
        if mask.sum() == 0:
            continue  # Avoid division by zero
        final_loss = masked_losses.sum() / mask.sum()
        
        # Backward pass
        final_loss.backward()
        epoch_loss += final_loss.item()
        
        # Gradient accumulation
        if (step + 1) % GRADIENT_ACCUMULATION_STEPS == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
        
        progress_bar.set_postfix({"Loss": final_loss.item()})
    
    avg_epoch_loss = epoch_loss / len(dataloader)
    print(f"Average Loss for Epoch {epoch + 1}: {avg_epoch_loss}")

# Optionally, save the model
# model.save_pretrained("llama_finetuned_wikimedia")

Epoch 1/3


Training:   0%|          | 0/1090 [00:00<?, ?it/s]

logits.shape torch.Size([4, 512, 151936])
tensor([[[ 8.0875, 12.1481, 14.4234,  ..., -3.8342, -3.8343, -3.8341],
         [ 8.0875, 12.1481, 14.4234,  ..., -3.8342, -3.8343, -3.8341],
         [ 8.0875, 12.1481, 14.4234,  ..., -3.8342, -3.8343, -3.8341],
         ...,
         [ 8.0875, 12.1481, 14.4234,  ..., -3.8342, -3.8343, -3.8341],
         [ 8.0875, 12.1481, 14.4234,  ..., -3.8342, -3.8343, -3.8341],
         [ 8.0875, 12.1481, 14.4234,  ..., -3.8342, -3.8343, -3.8341]],

        [[ 5.5409,  5.7478,  5.2112,  ..., -2.8926, -2.8932, -2.8928],
         [ 5.0321,  6.1159,  2.5386,  ..., -1.4790, -1.4815, -1.4792],
         [ 8.1329,  8.6025,  6.5732,  ..., -4.5803, -4.5809, -4.5802],
         ...,
         [11.2808, 13.2091, 13.7755,  ..., -3.4959, -3.4964, -3.4958],
         [11.5600, 13.2721, 13.9856,  ..., -3.6990, -3.6994, -3.6989],
         [11.4768, 13.2690, 13.7617,  ..., -3.5662, -3.5667, -3.5661]],

        [[ 8.0875, 12.1481, 14.4234,  ..., -3.8342, -3.8343, -3.8341],
   

Training:   0%|          | 0/1090 [00:16<?, ?it/s]

tensor([[14.7662, 14.7662, 14.7662,  ..., 14.7662, 14.7662, 14.7662],
        [ 4.4715, 17.7612, 14.6103,  ..., 21.2722, 20.7607, 20.7276],
        [14.7662, 14.7662, 14.7662,  ..., 14.7662, 14.7662, 14.7662],
        [ 9.8453,  6.8113, 23.3330,  ..., 17.4453, 16.7128, 16.5263]],
       device='cuda:0', grad_fn=<ViewBackward0>)





SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
