In [2]:
import torch
import gc
import os
import sys
import subprocess
import platform

# Set environment variable to help with memory fragmentation
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

# Environment & version checks
# Import CUDA utils from parent folder (preferred), fallback to local
from pathlib import Path

print(f"Python: {sys.version}")
try:
    import torch, transformers
    print("PyTorch:", torch.__version__)
    print("Transformers:", transformers.__version__)
except Exception as e:
    print("You likely need to install torch/transformers:", e)
    
# Try parent directory first (ideal location)
parent_dir = str(Path.cwd().parent)
if parent_dir not in sys.path:
    sys.path.insert(0, parent_dir)

try:
    import utils  # expected at ../utils.py
except Exception:
    # Fallback: current working directory
    curr_dir = str(Path.cwd())
    if curr_dir not in sys.path:
        sys.path.insert(0, curr_dir)
    import utils  # tries ./utils.py

from utils import *

print("Loaded utils from:", utils.__file__)
# Set memory env & show current device
utils.setup_memory_environment(expandable_segments=True)
device = utils.get_device()
print("Selected device:", device)

full_cleanup()

Python: 3.12.11 (main, Jul 23 2025, 00:34:44) [Clang 20.1.4 ]
PyTorch: 2.8.0+cu129
Transformers: 4.56.0
Loaded utils from: /mnt/nfs/workspace/courses/PyTorch/Building-Transformer-Models-with-PyTorch-2.0/utils.py
Memory environment configured
Selected device: cuda
GPU memory cleared


In [2]:
import torch
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
 
# TinyLlama - 1.1B parameters (small and fast)
model_name = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"

# 4-bit quantization configuration
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map="auto",
    trust_remote_code=True
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token_id = tokenizer.eos_token_id  # Use EOS token as pad token
tokenizer.padding_side = "left"
 
# Load the dataset
#dataset = load_dataset("tiny_shakespeare", revision="main")
# Load from the specific GitHub URL
dataset = load_dataset(
    "text",
    data_files={"train": "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"}
)

'''
DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 1
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 1
    })
    test: Dataset({
        features: ['text'],
        num_rows: 1
    })
})
'''

# Split the continuous text into smaller chunks
def split_text(text, max_length=100):
    return [text[i:i+max_length] for i in range(0, len(text), max_length)]

# Apply the split_text function to the dataset


split_texts = split_text(dataset["train"]["text"][0])



# Tokenize the split_texts
tokenized_texts = tokenizer(split_texts, return_tensors="pt", padding=True, truncation=True, max_length=2048)

class ShiftedDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        input_ids = self.encodings["input_ids"][idx]
        attention_mask = self.encodings["attention_mask"][idx]
        labels = input_ids[1:].tolist() + [tokenizer.eos_token_id]
        return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": torch.tensor(labels)}

    def __len__(self):
        return len(self.encodings["input_ids"])

# Create a DataLoader
train_dataset = ShiftedDataset(tokenized_texts)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=4)


In [3]:
item=next(iter(train_dataloader))
print(item['input_ids'])
print(item['attention_mask'])
print(item['labels'])

tensor([[    1,  3824, 21353, 19642, 29901]])
tensor([[1, 1, 1, 1, 1]])
tensor([[ 3824, 21353, 19642, 29901,     2]])


In [4]:
from accelerate import Accelerator
from transformers import GPT2LMHeadModel

# Initialize the Accelerator
accelerator = Accelerator()

# Configure the training arguments
num_epochs = 20
learning_rate = 5e-5

# Initialize the GPT-2 model and optimizer
model = GPT2LMHeadModel.from_pretrained("gpt2")
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Prepare the model and optimizer for training with Accelerator
model, optimizer, train_dataloader = accelerator.prepare(model, optimizer, train_dataloader)


In [5]:
num_epochs=40
epoch=20

In [6]:
from torch.optim import AdamW
from tqdm import tqdm
import os

# Create directory for saving models
os.makedirs("model/tiny_shakespeare", exist_ok=True)

# Fine-tuning loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    epoch_iterator = tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{num_epochs}")
    
    for step, batch in enumerate(epoch_iterator):
        optimizer.zero_grad()
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        accelerator.backward(loss)
        optimizer.step()

        # Update progress bar with current loss
        epoch_iterator.set_postfix({
            "Loss": f"{loss.item():.4f}",
            "Avg Loss": f"{total_loss/(step+1):.4f}"
        }, refresh=True)

    # Calculate average loss for the epoch
    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs} - Average Loss: {avg_loss:.4f}")

    # Save the model every 5 epochs
    # if (epoch + 1) % 5 == 0:
    #    model_save_path = f"model/tiny_shakespeare/model_checkpoint_epoch_{epoch + 1}"
    #    accelerator.wait_for_everyone()
    #    unwrapped_model = accelerator.unwrap_model(model)
    #    unwrapped_model.save_pretrained(model_save_path)
    #    tokenizer.save_pretrained(model_save_path)
    #    print(f"Model saved at {model_save_path}")
    
    clear_gpu_memory()

# Save final model
final_model_path = "model/tiny_shakespeare/final_model.pt"
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(final_model_path)
tokenizer.save_pretrained(final_model_path)
print(f"Final model saved at {final_model_path}")

clear_gpu_memory()

Epoch 1/40:   0%|                                                                                                                                                                | 0/1 [00:00<?, ?it/s]`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.
Epoch 1/40: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.77it/s, Loss=14.4406, Avg Loss=14.4406]


Epoch 1/40 - Average Loss: 14.4406
GPU memory cleared


Epoch 2/40: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 28.90it/s, Loss=12.9112, Avg Loss=12.9112]


Epoch 2/40 - Average Loss: 12.9112
GPU memory cleared


Epoch 3/40: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 28.29it/s, Loss=10.0546, Avg Loss=10.0546]


Epoch 3/40 - Average Loss: 10.0546
GPU memory cleared


Epoch 4/40: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 28.50it/s, Loss=9.0267, Avg Loss=9.0267]


Epoch 4/40 - Average Loss: 9.0267
GPU memory cleared


Epoch 5/40: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 28.43it/s, Loss=6.4970, Avg Loss=6.4970]


Epoch 5/40 - Average Loss: 6.4970
GPU memory cleared


Epoch 6/40: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 28.40it/s, Loss=7.4582, Avg Loss=7.4582]


Epoch 6/40 - Average Loss: 7.4582
GPU memory cleared


Epoch 7/40: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 28.48it/s, Loss=6.9754, Avg Loss=6.9754]


Epoch 7/40 - Average Loss: 6.9754
GPU memory cleared


Epoch 8/40: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 28.25it/s, Loss=6.2760, Avg Loss=6.2760]


Epoch 8/40 - Average Loss: 6.2760
GPU memory cleared


Epoch 9/40: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 28.54it/s, Loss=2.0550, Avg Loss=2.0550]


Epoch 9/40 - Average Loss: 2.0550
GPU memory cleared


Epoch 10/40: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 28.28it/s, Loss=2.2235, Avg Loss=2.2235]


Epoch 10/40 - Average Loss: 2.2235
GPU memory cleared


Epoch 11/40: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 28.55it/s, Loss=0.4841, Avg Loss=0.4841]


Epoch 11/40 - Average Loss: 0.4841
GPU memory cleared


Epoch 12/40: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 28.43it/s, Loss=0.1839, Avg Loss=0.1839]


Epoch 12/40 - Average Loss: 0.1839
GPU memory cleared


Epoch 13/40: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 28.41it/s, Loss=0.1890, Avg Loss=0.1890]


Epoch 13/40 - Average Loss: 0.1890
GPU memory cleared


Epoch 14/40: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 28.35it/s, Loss=0.0141, Avg Loss=0.0141]


Epoch 14/40 - Average Loss: 0.0141
GPU memory cleared


Epoch 15/40: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 28.38it/s, Loss=0.0247, Avg Loss=0.0247]


Epoch 15/40 - Average Loss: 0.0247
GPU memory cleared


Epoch 16/40: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 28.41it/s, Loss=0.0098, Avg Loss=0.0098]


Epoch 16/40 - Average Loss: 0.0098
GPU memory cleared


Epoch 17/40: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 28.37it/s, Loss=0.0174, Avg Loss=0.0174]


Epoch 17/40 - Average Loss: 0.0174
GPU memory cleared


Epoch 18/40: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 28.38it/s, Loss=0.0020, Avg Loss=0.0020]


Epoch 18/40 - Average Loss: 0.0020
GPU memory cleared


Epoch 19/40: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 28.61it/s, Loss=0.0013, Avg Loss=0.0013]


Epoch 19/40 - Average Loss: 0.0013
GPU memory cleared


Epoch 20/40: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 28.45it/s, Loss=2.6749, Avg Loss=2.6749]


Epoch 20/40 - Average Loss: 2.6749
GPU memory cleared


Epoch 21/40: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 28.51it/s, Loss=0.0212, Avg Loss=0.0212]


Epoch 21/40 - Average Loss: 0.0212
GPU memory cleared


Epoch 22/40: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 28.45it/s, Loss=0.0014, Avg Loss=0.0014]


Epoch 22/40 - Average Loss: 0.0014
GPU memory cleared


Epoch 23/40: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 28.45it/s, Loss=0.0018, Avg Loss=0.0018]


Epoch 23/40 - Average Loss: 0.0018
GPU memory cleared


Epoch 24/40: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 28.26it/s, Loss=0.0007, Avg Loss=0.0007]


Epoch 24/40 - Average Loss: 0.0007
GPU memory cleared


Epoch 25/40: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 28.45it/s, Loss=1.6958, Avg Loss=1.6958]


Epoch 25/40 - Average Loss: 1.6958
GPU memory cleared


Epoch 26/40: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 28.36it/s, Loss=0.0008, Avg Loss=0.0008]


Epoch 26/40 - Average Loss: 0.0008
GPU memory cleared


Epoch 27/40: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 28.24it/s, Loss=0.0000, Avg Loss=0.0000]


Epoch 27/40 - Average Loss: 0.0000
GPU memory cleared


Epoch 28/40: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 28.48it/s, Loss=2.5043, Avg Loss=2.5043]


Epoch 28/40 - Average Loss: 2.5043
GPU memory cleared


Epoch 29/40: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 28.41it/s, Loss=0.0008, Avg Loss=0.0008]


Epoch 29/40 - Average Loss: 0.0008
GPU memory cleared


Epoch 30/40: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 28.71it/s, Loss=2.4185, Avg Loss=2.4185]


Epoch 30/40 - Average Loss: 2.4185
GPU memory cleared


Epoch 31/40: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 28.40it/s, Loss=2.3568, Avg Loss=2.3568]


Epoch 31/40 - Average Loss: 2.3568
GPU memory cleared


Epoch 32/40: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 28.46it/s, Loss=2.5044, Avg Loss=2.5044]


Epoch 32/40 - Average Loss: 2.5044
GPU memory cleared


Epoch 33/40: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 28.31it/s, Loss=0.0001, Avg Loss=0.0001]


Epoch 33/40 - Average Loss: 0.0001
GPU memory cleared


Epoch 34/40: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 28.56it/s, Loss=0.0001, Avg Loss=0.0001]


Epoch 34/40 - Average Loss: 0.0001
GPU memory cleared


Epoch 35/40: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 28.53it/s, Loss=0.0003, Avg Loss=0.0003]


Epoch 35/40 - Average Loss: 0.0003
GPU memory cleared


Epoch 36/40: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 28.37it/s, Loss=2.5011, Avg Loss=2.5011]


Epoch 36/40 - Average Loss: 2.5011
GPU memory cleared


Epoch 37/40: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 28.41it/s, Loss=2.3069, Avg Loss=2.3069]


Epoch 37/40 - Average Loss: 2.3069
GPU memory cleared


Epoch 38/40: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 28.43it/s, Loss=2.4574, Avg Loss=2.4574]


Epoch 38/40 - Average Loss: 2.4574
GPU memory cleared


Epoch 39/40: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 28.52it/s, Loss=0.0001, Avg Loss=0.0001]


Epoch 39/40 - Average Loss: 0.0001
GPU memory cleared


Epoch 40/40: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 28.42it/s, Loss=2.3220, Avg Loss=2.3220]


Epoch 40/40 - Average Loss: 2.3220
GPU memory cleared
Final model saved at model/tiny_shakespeare/final_model.pt
GPU memory cleared


In [9]:
num_epochs=40
epoch=20

In [15]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import re

def generate_poem(prompt, model_path, tokenizer_path, max_words=50, max_seq_len=100, temperature=1.0):
    # Load the fine-tuned model and tokenizer
    model = GPT2LMHeadModel.from_pretrained(model_path)
    tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)
    
    # Set the padding token and padding side
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = 'left'

    # Set the prompt and generate the text
    input_ids = tokenizer.encode(prompt, return_tensors="pt", padding=True, truncation=True, max_length=max_seq_len)
    attention_mask = torch.ones_like(input_ids)

    # Calculate max tokens based on word count
    max_tokens = min(max_words * 5, max_seq_len)  # Assuming each word has an average of 5 tokens
    
    # Generate text with proper temperature handling
    output_ids = model.generate(
        input_ids,
        max_length=input_ids.shape[1] + max_tokens,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        attention_mask=attention_mask,
        pad_token_id=tokenizer.pad_token_id,
        do_sample=True,  # Enable sampling for temperature to work
        temperature=temperature,  # Now this will work with do_sample=True
        top_p=0.9,  # Optional: add top-p sampling for better quality
    )

    # Convert the token IDs to text
    generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    
    return generated_text

def post_process_poem(poem):
    # Remove any extra spaces
    poem = re.sub(r'\s+', ' ', poem).strip()

    # Capitalize the first letter of each sentence
    sentences = re.split(r'(?<=[\.\?!])\s', poem)
    formatted_sentences = [sentence.capitalize() for sentence in sentences]
    formatted_poem = ' '.join(formatted_sentences)

    # Add line breaks for readability
    line_breaks = re.compile(r'(?<=[,;:?!])\s')
    formatted_poem = line_breaks.sub('\n', formatted_poem)

    # Clean up repetitive patterns (like multiple # characters)
    formatted_poem = re.sub(r'#+', '', formatted_poem)  # Remove hash sequences
    formatted_poem = re.sub(r'grill+', 'grill', formatted_poem)  # Reduce repeated words
    
    return formatted_poem

# Example usage
model_path = 'model/tiny_shakespeare/final_model.pt'
tokenizer_path = 'gpt2'
prompt = "love"
max_words = 50
temperature = 0.1

generated_poem = generate_poem(prompt, model_path, tokenizer_path, max_words=max_words, temperature=temperature)
formatted_poem = post_process_poem(generated_poem)
print(formatted_poem)


In [3]:
import torch
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from accelerate import Accelerator

# TinyLlama - 1.1B parameters
model_name = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"

# 4-bit quantization configuration
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

# Load model with quantization
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.float16
)

# Prepare model for k-bit training
model = prepare_model_for_kbit_training(model)

# LoRA configuration for efficient fine-tuning
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token_id = tokenizer.eos_token_id  # Use EOS token as pad token
tokenizer.padding_side = "left"

# Load the dataset
dataset = load_dataset(
    "text",
    data_files={"train": "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"}
)

# Split the continuous text into smaller chunks
def split_text(text, max_length=512):
    return [text[i:i+max_length] for i in range(0, len(text), max_length)]

split_texts = split_text(dataset["train"]["text"][0])

# Tokenize the split_texts
tokenized_texts = tokenizer(split_texts, return_tensors="pt", padding=True, truncation=True, max_length=512)

class ShiftedDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        input_ids = self.encodings["input_ids"][idx]
        attention_mask = self.encodings["attention_mask"][idx]
        labels = input_ids[1:].tolist() + [tokenizer.eos_token_id]
        return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": torch.tensor(labels)}

    def __len__(self):
        return len(self.encodings["input_ids"])

# Create a DataLoader
train_dataset = ShiftedDataset(tokenized_texts)
# Increased batch size for 16GB VRAM
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=2) 

In [4]:
from accelerate import Accelerator

# Initialize the Accelerator
accelerator = Accelerator()

# Configure the training arguments
num_epochs = 20
learning_rate = 2e-4  # Adjusted for TinyLlama

# Initialize optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)

# Prepare the model and optimizer for training with Accelerator
model, optimizer, train_dataloader = accelerator.prepare(model, optimizer, train_dataloader)

# Learning rate scheduler
from transformers import get_linear_schedule_with_warmup

num_training_steps = len(train_dataloader) * num_epochs
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=100,
    num_training_steps=num_training_steps
)

In [5]:
from torch.optim import AdamW
from tqdm import tqdm
import os

# Create directory for saving models
os.makedirs("model/tiny_shakespeare", exist_ok=True)

# Fine-tuning loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    epoch_iterator = tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{num_epochs}")
    
    for step, batch in enumerate(epoch_iterator):
        optimizer.zero_grad()
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        accelerator.backward(loss)
        
        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
        optimizer.step()
        lr_scheduler.step()

        # Update progress bar with current loss
        epoch_iterator.set_postfix({
            "Loss": f"{loss.item():.4f}",
            "Avg Loss": f"{total_loss/(step+1):.4f}"
        }, refresh=True)

    # Calculate average loss for the epoch
    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs} - Average Loss: {avg_loss:.4f}")

    # Save checkpoint every 5 epochs
    #if (epoch + 1) % 5 == 0:
    #   checkpoint_path = f"model/tiny_shakespeare/checkpoint_epoch_{epoch + 1}"
    #    accelerator.wait_for_everyone()
    #    unwrapped_model = accelerator.unwrap_model(model)
    #    unwrapped_model.save_pretrained(checkpoint_path)
    #    tokenizer.save_pretrained(checkpoint_path)
    #    print(f"Checkpoint saved at {checkpoint_path}")
    
    clear_gpu_memory()

# Save final model
final_model_path = "model/tiny_shakespeare/TinyLlama-1.1B-shakespeare.pt"
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(final_model_path)
tokenizer.save_pretrained(final_model_path)
print(f"Final model saved at {final_model_path}")

clear_gpu_memory()

Epoch 1/20:   0%|                                                                                                                                  | 0/1 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)
Epoch 1/20: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.68it/s, Loss=10.8990, Avg Loss=10.8990]


Epoch 1/20 - Average Loss: 10.8990
GPU memory cleared


Epoch 2/20: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 10.18it/s, Loss=10.8990, Avg Loss=10.8990]


Epoch 2/20 - Average Loss: 10.8990
GPU memory cleared


Epoch 3/20: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 10.26it/s, Loss=10.8902, Avg Loss=10.8902]


Epoch 3/20 - Average Loss: 10.8902
GPU memory cleared


Epoch 4/20: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 10.09it/s, Loss=10.8668, Avg Loss=10.8668]


Epoch 4/20 - Average Loss: 10.8668
GPU memory cleared


Epoch 5/20: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 10.06it/s, Loss=10.8381, Avg Loss=10.8381]


Epoch 5/20 - Average Loss: 10.8381
Checkpoint saved at model/tiny_shakespeare/checkpoint_epoch_5
GPU memory cleared


Epoch 6/20: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 10.17it/s, Loss=10.7936, Avg Loss=10.7936]

Epoch 6/20 - Average Loss: 10.7936





GPU memory cleared


Epoch 7/20: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 10.12it/s, Loss=10.7410, Avg Loss=10.7410]


Epoch 7/20 - Average Loss: 10.7410
GPU memory cleared


Epoch 8/20: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 10.20it/s, Loss=10.6745, Avg Loss=10.6745]


Epoch 8/20 - Average Loss: 10.6745
GPU memory cleared


Epoch 9/20: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 10.13it/s, Loss=10.5985, Avg Loss=10.5985]


Epoch 9/20 - Average Loss: 10.5985
GPU memory cleared


Epoch 10/20: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 10.15it/s, Loss=10.5049, Avg Loss=10.5049]


Epoch 10/20 - Average Loss: 10.5049
Checkpoint saved at model/tiny_shakespeare/checkpoint_epoch_10
GPU memory cleared


Epoch 11/20: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 10.15it/s, Loss=10.4039, Avg Loss=10.4039]

Epoch 11/20 - Average Loss: 10.4039





GPU memory cleared


Epoch 12/20: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 10.21it/s, Loss=10.2771, Avg Loss=10.2771]


Epoch 12/20 - Average Loss: 10.2771
GPU memory cleared


Epoch 13/20: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 10.11it/s, Loss=10.1590, Avg Loss=10.1590]


Epoch 13/20 - Average Loss: 10.1590
GPU memory cleared


Epoch 14/20: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 10.20it/s, Loss=10.0184, Avg Loss=10.0184]


Epoch 14/20 - Average Loss: 10.0184
GPU memory cleared


Epoch 15/20: 100%|███████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 10.18it/s, Loss=9.8657, Avg Loss=9.8657]


Epoch 15/20 - Average Loss: 9.8657
Checkpoint saved at model/tiny_shakespeare/checkpoint_epoch_15
GPU memory cleared


Epoch 16/20: 100%|███████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 10.20it/s, Loss=9.7102, Avg Loss=9.7102]

Epoch 16/20 - Average Loss: 9.7102





GPU memory cleared


Epoch 17/20: 100%|███████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 10.26it/s, Loss=9.5676, Avg Loss=9.5676]


Epoch 17/20 - Average Loss: 9.5676
GPU memory cleared


Epoch 18/20: 100%|███████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 10.22it/s, Loss=9.4280, Avg Loss=9.4280]


Epoch 18/20 - Average Loss: 9.4280
GPU memory cleared


Epoch 19/20: 100%|███████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 10.23it/s, Loss=9.2872, Avg Loss=9.2872]


Epoch 19/20 - Average Loss: 9.2872
GPU memory cleared


Epoch 20/20: 100%|███████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 10.19it/s, Loss=9.1130, Avg Loss=9.1130]


Epoch 20/20 - Average Loss: 9.1130
Checkpoint saved at model/tiny_shakespeare/checkpoint_epoch_20
GPU memory cleared
Final model saved at model/tiny_shakespeare/TinyLlama-1.1B-shakespeare.pt
GPU memory cleared


In [8]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import re

def generate_poem(prompt, model_path, tokenizer_path, max_words=50, max_seq_len=512, temperature=0.8):
    # Load the fine-tuned model and tokenizer
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        device_map="auto",
        torch_dtype=torch.float16
    )
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    
    # Set the padding token and padding side
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = 'left'

    # Set the prompt and generate the text
    input_ids = tokenizer.encode(prompt, return_tensors="pt", padding=True, truncation=True, max_length=max_seq_len)
    
    # Move input to the same device as model
    input_ids = input_ids.to(model.device)
    
    # Generate text
    output_ids = model.generate(
        input_ids,
        max_length=input_ids.shape[1] + max_words * 2,  # Approximate token count
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        pad_token_id=tokenizer.pad_token_id,
        do_sample=True,
        temperature=temperature,
        top_p=0.9,
        repetition_penalty=1.1
    )

    # Convert the token IDs to text
    generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    
    return generated_text

def post_process_poem(poem):
    # Remove any extra spaces
    poem = re.sub(r'\s+', ' ', poem).strip()

    # Capitalize the first letter of each sentence
    sentences = re.split(r'(?<=[\.\!\?])\s', poem)
    formatted_sentences = [sentence.capitalize() for sentence in sentences if sentence.strip()]
    formatted_poem = ' '.join(formatted_sentences)

    # Add line breaks for readability
    formatted_poem = re.sub(r'([\.\!\?])\s', r'\1\n', formatted_poem)
    
    return formatted_poem

In [9]:
# Example usage
model_path = 'model/tiny_shakespeare/TinyLlama-1.1B-shakespeare.pt'
tokenizer_path = 'gpt2'
prompt = "love"
max_words = 50
temperature = 0.1

generated_poem = generate_poem(prompt, model_path, tokenizer_path, max_words=max_words, temperature=temperature)
formatted_poem = post_process_poem(generated_poem)
print(formatted_poem)

Love dangerousowingaces orher accumulate inwarder unique diminish.
Dwight accumulate elusiveear othersclly accumulate you … poorerife degree lessct ent e deperfrom uniquewered acc diminish under accumulate no h italy e team def putphing unique'splease diminish#


In [None]:
full_cleanup()