In [33]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForSeq2SeqLM
from peft import LoraConfig, get_peft_model, TaskType
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import AdamW  # or from torch.optim import AdamW



In [20]:
model_name = "t5-small"  
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)




In [21]:
model

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [22]:
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,  # Change to TaskType.SEQ_CLS for classification
    inference_mode=False,         # Enable training mode
    r=8,                          # Rank of the LoRA decomposition
    lora_alpha=16,                # Scaling factor
    lora_dropout=0.1,             # Dropout probability
    target_modules=["q", "v"]  # Target layers to apply LoRA
)


In [23]:
peft_model = get_peft_model(model, lora_config)


In [24]:
dataset = load_dataset("squad")

In [28]:
def preprocess_function(examples):
    # Combine question and context into a single input string
    inputs = [f"question: {question}  context: {context}" for question, context in zip(examples["question"], examples["context"])]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    
    # Extract the first answer text from the "answers" field, handling the structure of the dataset
    labels = [ans["text"][0] if len(ans["text"]) > 0 else "" for ans in examples["answers"]]
    tokenized_labels = tokenizer(labels, max_length=128, truncation=True, padding="max_length").input_ids
    
    model_inputs["labels"] = tokenized_labels
    return model_inputs


In [29]:
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=dataset["train"].column_names)


Map: 100%|██████████| 87599/87599 [00:24<00:00, 3551.50 examples/s]
Map: 100%|██████████| 10570/10570 [00:03<00:00, 3247.92 examples/s]


In [34]:
# Prepare PyTorch Dataloader
train_dataloader = DataLoader(tokenized_dataset["train"], shuffle=True, batch_size=8)

# Step 4: Define Training Arguments and Optimizer
optimizer = AdamW(peft_model.parameters(), lr=5e-5)



In [36]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
peft_model.to(device)
device

device(type='cuda')

In [37]:
num_epochs = 3
peft_model.train()


PeftModelForSeq2SeqLM(
  (base_model): LoraModel(
    (model): T5ForConditionalGeneration(
      (shared): Embedding(32128, 512)
      (encoder): T5Stack(
        (embed_tokens): Embedding(32128, 512)
        (block): ModuleList(
          (0): T5Block(
            (layer): ModuleList(
              (0): T5LayerSelfAttention(
                (SelfAttention): T5Attention(
                  (q): lora.Linear(
                    (base_layer): Linear(in_features=512, out_features=512, bias=False)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=512, out_features=8, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=8, out_features=512, bias=False)
                    )
                    (lora_embedding_A): ParameterDict()
               

In [51]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForSeq2SeqLM
from peft import LoraConfig, get_peft_model, TaskType
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import AdamW
from tqdm import tqdm
from torch.cuda.amp import autocast, GradScaler

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load model and tokenizer
model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Configure LoRA
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=["q", "v"]
)

# Get PEFT model
peft_model = get_peft_model(model, lora_config)
peft_model.to(device)

# Load dataset
dataset = load_dataset("squad")

def preprocess_function(examples):
    # Combine question and context into a single input string
    inputs = [f"question: {question}  context: {context}" 
             for question, context in zip(examples["question"], examples["context"])]
    
    # Tokenize inputs
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    
    # Extract answers
    labels = [ans["text"][0] if len(ans["text"]) > 0 else "" 
             for ans in examples["answers"]]
    
    # Tokenize labels
    with tokenizer.as_target_tokenizer():
        tokenized_labels = tokenizer(labels, max_length=128, truncation=True, padding="max_length")
    
    model_inputs["labels"] = tokenized_labels["input_ids"]
    
    # Convert everything to PyTorch tensors
    return {
        "input_ids": torch.tensor(model_inputs["input_ids"]),
        "attention_mask": torch.tensor(model_inputs["attention_mask"]),
        "labels": torch.tensor(model_inputs["labels"])
    }

# Preprocess dataset
print("Preprocessing dataset...")
tokenized_dataset = dataset.map(
    preprocess_function, 
    batched=True, 
    remove_columns=dataset["train"].column_names
)

# Set format for pytorch
tokenized_dataset.set_format(type="torch")

# Create dataloader
train_dataloader = DataLoader(
    tokenized_dataset["train"], 
    shuffle=True, 
    batch_size=8
)

# Setup optimizer
optimizer = AdamW(peft_model.parameters(), lr=5e-5)

def train_epoch(model, train_dataloader, optimizer, device, use_amp=True):
    """
    Performs one epoch of training with proper error handling and progress tracking.
    """
    model.train()
    epoch_loss = 0
    scaler = GradScaler() if use_amp else None
    
    progress_bar = tqdm(train_dataloader, desc="Training")
    
    for batch in progress_bar:
        try:
            # Move batch to device
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            # Clear gradients
            optimizer.zero_grad()

            # Forward pass with automatic mixed precision
            if use_amp:
                with autocast():
                    outputs = model(
                        input_ids=input_ids, 
                        attention_mask=attention_mask, 
                        labels=labels
                    )
                    loss = outputs.loss
                
                # Backward pass with gradient scaling
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
            else:
                outputs = model(
                    input_ids=input_ids, 
                    attention_mask=attention_mask, 
                    labels=labels
                )
                loss = outputs.loss
                
                # Regular backward pass
                loss.backward()
                optimizer.step()

            # Update metrics
            current_loss = loss.item()
            epoch_loss += current_loss
            
            # Update progress bar
            progress_bar.set_postfix({'loss': f'{current_loss:.4f}'})

        except RuntimeError as e:
            if "out of memory" in str(e):
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                print(f"WARNING: out of memory error occurred. Clearing cache.")
                continue
            else:
                raise e

    # Calculate average loss
    avg_loss = epoch_loss / len(train_dataloader)
    return avg_loss

def train(model, train_dataloader, optimizer, num_epochs, device, use_amp=True):
    """
    Main training loop for multiple epochs.
    """
    for epoch in range(num_epochs):
        try:
            avg_loss = train_epoch(model, train_dataloader, optimizer, device, use_amp)
            print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {avg_loss:.4f}")
            
        except KeyboardInterrupt:
            print("Training interrupted by user")
            break
        except Exception as e:
            print(f"Error occurred during training: {str(e)}")
            raise e

# Set number of epochs and run training
num_epochs = 3
print("Starting training...")
train(
    model=peft_model,
    train_dataloader=train_dataloader,
    optimizer=optimizer,
    num_epochs=num_epochs,
    device=device,
    use_amp=True
)

# Save the fine-tuned model and tokenizer
print("Saving model...")
peft_model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")
print("T5 model fine-tuned and saved successfully!")

Using device: cuda
Preprocessing dataset...


Map: 100%|██████████| 87599/87599 [00:26<00:00, 3254.25 examples/s]
Map: 100%|██████████| 10570/10570 [00:03<00:00, 3211.56 examples/s]
  scaler = GradScaler() if use_amp else None


Starting training...


  with autocast():
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
Training:   1%|          | 89/10950 [06:43<13:40:52,  4.53s/it, loss=21.9823]


Training interrupted by user
Saving model...
T5 model fine-tuned and saved successfully!
