In [None]:
%%capture
# GPU T4 Memory Cleanup and Environment Setup
# NOTE(review): %%capture captures this cell's output, so the pip logs and the
# confirmation print at the end are presumably not displayed — confirm that is intended.
import os
import gc
import torch

# Set memory optimization environment variables
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is normally a debugging aid that makes
# kernel launches synchronous; it likely slows training — confirm it is wanted here.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Clear any existing GPU memory
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    gc.collect()

# Install dependencies optimized for T4
# Colab is detected by looking for "COLAB_" anywhere in the concatenated
# environment variable names.
if "COLAB_" not in "".join(os.environ.keys()):
    %pip install unsloth
else:
    # Colab environment - minimal installation
    # --no-deps keeps pip from replacing packages that are already installed.
    %pip install --no-deps --upgrade timm
    %pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton
    %pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer
    %pip install --no-deps unsloth

print("🧹 Memory cleaned and dependencies installed")


In [None]:
import gc
import io
import os
import re
from typing import Any, Dict, List, Optional, Tuple

import requests
import torch
from datasets import Dataset, load_dataset
from PIL import Image
from transformers import TrainerCallback, TrainingArguments
from trl import SFTConfig, SFTTrainer

# Import Unsloth for vision models
from unsloth import FastVisionModel, get_chat_template

# Memory monitoring function
def print_gpu_memory():
    """Print a one-line GPU memory summary in GB.

    Reports the memory currently allocated and reserved by PyTorch together
    with the total memory of CUDA device 0. When CUDA is not available a
    short notice is printed instead.
    """
    if not torch.cuda.is_available():
        print("❌ CUDA not available")
        return

    bytes_per_gb = 1024**3
    allocated = torch.cuda.memory_allocated() / bytes_per_gb
    reserved = torch.cuda.memory_reserved() / bytes_per_gb
    total = torch.cuda.get_device_properties(0).total_memory / bytes_per_gb
    print(f"🔍 GPU Memory: {allocated:.2f}GB allocated, {reserved:.2f}GB reserved, {total:.2f}GB total")

def cleanup_memory():
    """Run Python garbage collection, then release cached CUDA memory.

    On machines without CUDA only the garbage collection step runs.
    """
    gc.collect()
    if not torch.cuda.is_available():
        return
    # Hand cached blocks back to the driver, then wait for pending GPU work.
    torch.cuda.empty_cache()
    torch.cuda.synchronize()

# Confirm the import cell finished and show the GPU memory state before any model is loaded.
print("📦 All dependencies loaded successfully!")
print_gpu_memory()


In [None]:
# T4 GPU Optimized Configuration (14GB Memory)
# Single source of truth for the run: every later cell reads its settings from
# this dict, so tune values here rather than inside the functions.
CONFIG = {
    # Model settings - Optimized for T4
    "model_name": "unsloth/gemma-2-2b-bnb-4bit",  # Smaller model for T4
    "max_seq_length": 1024,  # Reduced from 2048 for memory
    "load_in_4bit": True,

    # Dataset settings
    "dataset_name": "ngohongthai/exam-sixth_grade-instruct-dataset",
    "train_split": "train",

    # Training settings - Memory optimized
    "output_dir": "mathpal-gemma2b/t4_optimized",
    "max_steps": 50,  # Reduced for testing
    "per_device_train_batch_size": 1,  # Minimum for T4
    "gradient_accumulation_steps": 16,  # Increased to maintain effective batch size
    "learning_rate": 1e-4,  # Slightly reduced
    "warmup_ratio": 0.03,
    "weight_decay": 0.01,
    "logging_steps": 5,
    "save_steps": 25,

    # LoRA settings - Minimal for memory
    "lora_r": 8,  # Reduced from 16
    "lora_alpha": 16,
    "lora_dropout": 0.0,

    # System settings - T4 optimized
    "use_gradient_checkpointing": True,  # Essential for memory
    # The string "none" is what disables logging integrations in Hugging Face
    # TrainingArguments; a Python None falls back to the library default instead.
    "report_to": "none",
    "seed": 42,

    # Image processing - Aggressive limits
    "max_images_per_sample": 1,  # Reduced from 3
    "image_timeout": 3,  # Faster timeout (seconds per image request)
    "max_image_size": (224, 224),  # Limit image resolution

    # Memory management
    "dataloader_num_workers": 0,  # Reduce memory overhead
    "pin_memory": False,  # Disable for images
    "cleanup_frequency": 5,  # Cleanup every N steps
}

# Echo the key settings so the run output records what was used.
print(f"🔧 T4 Optimized Configuration:")
print(f"   Model: {CONFIG['model_name']}")
print(f"   Max sequence length: {CONFIG['max_seq_length']}")
# Effective batch size = per-device batch size × gradient accumulation steps.
print(f"   Effective batch size: {CONFIG['per_device_train_batch_size'] * CONFIG['gradient_accumulation_steps']}")
print(f"   LoRA rank: {CONFIG['lora_r']}")
print(f"   Max images per sample: {CONFIG['max_images_per_sample']}")

# GPU memory snapshot at configuration time.
print_gpu_memory()


In [None]:
# Memory-Optimized Data Processing for T4
def download_image_memory_efficient(url: str, max_size: tuple = (224, 224), timeout: int = 3) -> Optional[Image.Image]:
    """Download an image over HTTP(S) and return it as an RGB image of ``max_size``.

    This is a best-effort helper: every failure (bad URL, network error,
    non-image response, undecodable payload) yields ``None`` rather than an
    exception.

    Args:
        url: Image address; anything that is not an ``http://`` or
            ``https://`` string is rejected without a request being made.
        max_size: ``(width, height)`` the image is resized to. The aspect
            ratio is not preserved.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        The resized RGB ``PIL.Image.Image``, or ``None`` on any failure.
    """
    if not isinstance(url, str) or not url.startswith(('http://', 'https://')):
        return None

    try:
        # The context manager closes the streamed connection on every path,
        # including the early return for non-image responses.
        with requests.get(url, timeout=timeout, stream=True) as response:
            response.raise_for_status()

            content_type = response.headers.get('content-type', '')
            if not content_type.startswith('image/'):
                return None

            payload = response.content

        image = Image.open(io.BytesIO(payload))

        # Normalise the colour mode before resizing.
        if image.mode != 'RGB':
            image = image.convert('RGB')

        image = image.resize(max_size, Image.Resampling.LANCZOS)

        # After the resize the size equals max_size, so this only rejects
        # targets that are themselves under 10 px on a side.
        if image.size[0] < 10 or image.size[1] < 10:
            return None

        return image

    except Exception:
        # Deliberate best-effort: a failed image must not abort sample processing.
        return None

def extract_images_minimal(text: str, max_images: int = 1, max_size: tuple = (224, 224)) -> Tuple[str, List[Image.Image]]:
    """Split markdown text into cleaned text and its downloaded images.

    Every markdown image reference ``![alt](url)`` is replaced by the marker
    `` [IMAGE] ``, whitespace runs are collapsed to single spaces, and the
    result is cut to 512 characters (with a trailing ``...``) when longer.
    Only the first ``max_images`` URLs are fetched; failed downloads are
    dropped.

    Args:
        text: Markdown source text.
        max_images: Upper bound on the number of URLs that are fetched.
        max_size: ``(width, height)`` forwarded to the downloader.

    Returns:
        ``(cleaned_text, images)`` where ``images`` holds the successfully
        downloaded images in document order.
    """
    markdown_image = r"!\[.*?\]\((.*?)\)"

    # Only this many URLs are ever requested, whatever the text contains.
    urls_to_fetch = re.findall(markdown_image, text)[:max_images]

    # All image references become markers, including ones that are not fetched.
    with_markers = re.sub(markdown_image, " [IMAGE] ", text)
    cleaned_text = re.sub(r"\s+", " ", with_markers).strip()
    if len(cleaned_text) > 512:
        cleaned_text = cleaned_text[:512] + "..."

    images = []
    for url in urls_to_fetch:
        fetched = download_image_memory_efficient(url, max_size=max_size, timeout=CONFIG["image_timeout"])
        if fetched:
            images.append(fetched)
        # Collect after each download to release temporaries straight away.
        gc.collect()

    return cleaned_text, images

def process_sample_memory_efficient(sample: Dict[str, str]) -> Dict[str, Any]:
    """Convert one raw dataset row into a two-turn chat sample.

    The ``question`` field becomes the user turn and the ``solution`` field
    the assistant turn. Each turn is a list of content items: one text item
    followed by an image item per downloaded image.

    At most ``CONFIG["max_images_per_sample"]`` images are kept per sample.
    The budget is spent on question images first; solution images only use
    what is left, so an image is never moved from one turn to the other.

    Args:
        sample: Row with ``"question"`` and ``"solution"`` markdown strings.

    Returns:
        ``{"messages": [user_message, assistant_message]}``.

    Raises:
        KeyError: If ``sample`` lacks ``"question"`` or ``"solution"``.
        Exception: Any other processing error is propagated so the caller can
            skip and count the sample instead of training on a fabricated
            placeholder conversation.
    """
    max_images = CONFIG["max_images_per_sample"]
    max_size = CONFIG["max_image_size"]

    question_text, question_images = extract_images_minimal(
        sample["question"],
        max_images=max_images,
        max_size=max_size
    )

    solution_text, solution_images = extract_images_minimal(
        sample["solution"],
        max_images=max_images,
        max_size=max_size
    )

    # Enforce the per-sample image budget: question first, remainder to the solution.
    question_images = question_images[:max_images]
    remaining_budget = max(0, max_images - len(question_images))
    solution_images = solution_images[:remaining_budget]

    user_content = [{"type": "text", "text": question_text}]
    user_content.extend({"type": "image", "image": img} for img in question_images)

    assistant_content = [{"type": "text", "text": solution_text}]
    assistant_content.extend({"type": "image", "image": img} for img in solution_images)

    messages = [
        {"role": "user", "content": user_content},
        {"role": "assistant", "content": assistant_content}
    ]

    return {"messages": messages}

print("✅ Memory-optimized data processing functions loaded")
print_gpu_memory()


In [None]:
# T4 Memory-Optimized Data Collator
class T4VisionDataCollator:
    """Collate chat-style samples into processor inputs with exactly one image each.

    Each sample's messages are flattened to ``"role: text"`` lines joined by
    newlines, every text item is truncated, and the sample is paired with one
    image: its first usable image, or a small blank placeholder when it has
    none. The image token is prepended to the text in both cases so the text
    always announces the image that is passed alongside it. Labels are a copy
    of ``input_ids`` with padding positions set to -100.

    Args:
        processor: Callable accepting ``text=`` and ``images=`` (plus
            tokenizer-style keyword arguments) and returning a mapping that
            contains ``input_ids``.
        max_length: Truncation length passed to the processor.
        image_token: Marker prepended to each sample's text.
            NOTE(review): the right token depends on the processor — confirm
            "<image>" matches the model in use.
        max_text_chars: Per-item text limit; longer text is cut and suffixed
            with ``...``.
        max_image_size: ``(width, height)``; images exceeding either
            dimension are resized to exactly this size.
    """

    def __init__(self, processor, max_length: int = 1024, image_token: str = "<image>",
                 max_text_chars: int = 256, max_image_size: Tuple[int, int] = (224, 224)):
        self.processor = processor
        self.max_length = max_length
        self.image_token = image_token
        self.max_text_chars = max_text_chars
        self.max_image_size = tuple(max_image_size)
        # Created lazily and reused for every image-less sample.
        self.placeholder_image = None

    def _get_tiny_placeholder(self):
        """Return the shared 32x32 near-white placeholder, creating it on first use."""
        if self.placeholder_image is None:
            self.placeholder_image = Image.new('RGB', (32, 32), color=(250, 250, 250))
        return self.placeholder_image

    def _extract_content_minimal(self, messages: List[Dict]) -> Tuple[str, List[Image.Image]]:
        """Flatten one conversation into ``(text, [image])``.

        Args:
            messages: List of ``{"role": ..., "content": [...]}`` dicts whose
                content items carry ``"type"`` of ``"text"`` or ``"image"``.

        Returns:
            The formatted text (image token first) and a one-element image list.
        """
        images: List[Image.Image] = []
        text_parts: List[str] = []
        max_width, max_height = self.max_image_size

        for message in messages:
            role = message.get("role", "")
            # `or []` / `or ""` tolerate keys that exist but hold None, which can
            # presumably appear once samples round-trip through a columnar dataset.
            for content_item in message.get("content") or []:
                item_type = content_item.get("type")
                if item_type == "text":
                    text = content_item.get("text") or ""
                    if len(text) > self.max_text_chars:
                        text = text[:self.max_text_chars] + "..."
                    text_parts.append(f"{role}: {text}")
                elif item_type == "image":
                    img = content_item.get("image")
                    # Keep only the first object that looks like a PIL image.
                    if img and hasattr(img, 'convert') and len(images) < 1:
                        if img.size[0] > max_width or img.size[1] > max_height:
                            img = img.resize((max_width, max_height), Image.Resampling.LANCZOS)
                        images.append(img.convert('RGB'))

        # A real newline; the escaped "\\n" form would insert a literal backslash-n.
        formatted_text = "\n".join(text_parts)

        # The processor always receives one image per sample.
        if not images:
            images = [self._get_tiny_placeholder()]

        # Announce the image in the text for real and placeholder images alike.
        formatted_text = f"{self.image_token}\n{formatted_text}"

        return formatted_text, images[:1]

    def __call__(self, examples: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
        """Build a padded training batch from ``examples``.

        Args:
            examples: Samples, each with a ``"messages"`` list.

        Returns:
            The processor output with an added ``labels`` entry.

        Raises:
            Exception: Whatever the processor raises is logged and re-raised.
                Returning a dummy batch instead would hide the failure and
                feed the trainer a batch with no supervised tokens.
        """
        batch_texts = []
        batch_images = []

        for example in examples:
            text, images = self._extract_content_minimal(example.get("messages") or [])
            batch_texts.append(text)
            batch_images.append(images)

        try:
            batch = self.processor(
                text=batch_texts,
                images=batch_images,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=self.max_length
            )
        except Exception as e:
            print(f"❌ Collator error: {e}")
            raise

        if "input_ids" in batch:
            labels = batch["input_ids"].clone()
            # Works for a full processor (with .tokenizer) and for a bare tokenizer.
            tokenizer = getattr(self.processor, "tokenizer", self.processor)
            pad_token_id = getattr(tokenizer, "pad_token_id", None)
            if pad_token_id is not None:
                # Padding must not contribute to the loss.
                labels[labels == pad_token_id] = -100
            batch["labels"] = labels

        return batch

print("✅ T4-optimized data collator loaded")
print_gpu_memory()


In [None]:
# T4-Optimized Model Setup
def setup_gemma2b_t4(config: Dict[str, Any]):
    """Load the configured model through Unsloth and attach LoRA adapters.

    Args:
        config: Settings dict. Reads ``model_name``, ``max_seq_length``,
            ``use_gradient_checkpointing``, ``lora_r``, ``lora_alpha``,
            ``lora_dropout``, ``seed`` and, optionally, ``load_in_4bit``
            (defaults to ``True``).

    Returns:
        ``(model, processor)`` as returned by ``FastVisionModel``, with the
        "gemma" chat template applied to the processor when that succeeds.

    Raises:
        Exception: Any loading or adapter error is printed, GPU memory is
            cleaned up, and the error is re-raised.
    """
    print("🔧 Loading Gemma-2-2B with T4 optimizations...")
    print_gpu_memory()

    try:
        # Start from as clean a GPU as possible.
        cleanup_memory()

        model, processor = FastVisionModel.from_pretrained(
            config["model_name"],
            max_seq_length=config["max_seq_length"],
            # Honour the config instead of a hard-coded value; 4-bit stays the default.
            load_in_4bit=config.get("load_in_4bit", True),
            use_gradient_checkpointing=config["use_gradient_checkpointing"]
        )

        print("🎯 Applying minimal LoRA configuration...")
        print_gpu_memory()

        model = FastVisionModel.get_peft_model(
            model,
            # Minimal layer fine-tuning
            finetune_vision_layers=False,  # Disable to save memory
            finetune_language_layers=True,
            finetune_attention_modules=True,
            finetune_mlp_modules=False,  # Disable to save memory

            # Minimal LoRA settings
            r=config["lora_r"],
            lora_alpha=config["lora_alpha"],
            lora_dropout=config["lora_dropout"],
            bias="none",

            # Memory optimizations
            use_gradient_checkpointing=config["use_gradient_checkpointing"],
            random_state=config["seed"],
            use_rslora=False,

            # Adapters on the query/value projections only.
            target_modules=["q_proj", "v_proj"],
            # NOTE(review): modules_to_save presumably trains and stores a full copy
            # of the output head next to the adapters, which may cost far more memory
            # than the LoRA weights — confirm it is needed on a T4.
            modules_to_save=["lm_head"]
        )

        # The chat template is optional: keep the processor's default on failure,
        # but report why instead of hiding the error behind a bare except.
        try:
            processor = get_chat_template(processor, "gemma")
        except Exception as template_error:
            print(f"⚠️ Using default chat template ({template_error})")

        print("✅ T4-optimized model loaded successfully!")
        print_gpu_memory()

        return model, processor

    except Exception as e:
        print(f"❌ Model loading failed: {e}")
        cleanup_memory()
        raise

def prepare_dataset_t4(dataset_name: str, split: str, max_samples: int = 500) -> Dataset:
    """Load a dataset split and convert its rows into chat samples.

    Rows are converted one at a time with ``process_sample_memory_efficient``.
    A row whose conversion raises is skipped and counted; only the first
    three failures are printed in detail.

    Args:
        dataset_name: Name passed to ``load_dataset``.
        split: Split passed to ``load_dataset``.
        max_samples: Only the first ``max_samples`` rows are used when the
            split is larger.

    Returns:
        A ``Dataset`` built from the successfully converted samples. It is
        empty when no row could be converted.

    Raises:
        Exception: Loading or dataset-construction errors are printed, GPU
            memory is cleaned up, and the error is re-raised.
    """
    print(f"📥 Loading dataset (max {max_samples} samples for T4)...")

    try:
        raw_dataset = load_dataset(dataset_name, split=split)

        if len(raw_dataset) > max_samples:
            raw_dataset = raw_dataset.select(range(max_samples))
            print(f"⚠️ Limited to {max_samples} samples for T4 memory")

        total = len(raw_dataset)
        print(f"🔄 Processing {total} samples...")

        processed_data = []
        errors = 0

        for i, sample in enumerate(raw_dataset):
            try:
                processed_data.append(process_sample_memory_efficient(sample))

                # Periodic cleanup for T4
                if i % 50 == 0:
                    cleanup_memory()

            except Exception as e:
                errors += 1
                if errors <= 3:  # Limit error reporting
                    print(f"⚠️ Error processing sample {i}: {e}")

            if (i + 1) % 100 == 0:
                print(f"   Processed {i + 1}/{total} samples")

        # An empty split must not turn the summary into a ZeroDivisionError.
        success_rate = (len(processed_data) / total) * 100 if total else 0.0
        print(f"✅ Processed {len(processed_data)}/{total} samples ({success_rate:.1f}%)")
        if errors:
            print(f"⚠️ Skipped {errors} samples that failed to process")

        # Final cleanup
        cleanup_memory()

        return Dataset.from_list(processed_data)

    except Exception as e:
        print(f"❌ Dataset preparation failed: {e}")
        cleanup_memory()
        raise

# Confirm the model/dataset helpers are defined; nothing has been loaded onto the GPU yet.
print("✅ T4-optimized model functions loaded")
print_gpu_memory()


In [None]:
# T4-Optimized Training Setup
def create_t4_trainer(model, processor, train_dataset, config: Dict[str, Any]):
    """Build an ``SFTTrainer`` configured for a single T4 GPU.

    The trainer uses ``T4VisionDataCollator`` for batching and registers a
    ``T4MemoryCallback`` so periodic memory cleanup actually runs during
    training.

    Args:
        model: Model returned by the setup step; switched to training mode.
        processor: Processor or tokenizer matching ``model``. When it exposes
            a ``tokenizer`` attribute that tokenizer is handed to the trainer,
            otherwise the object itself is used.
        train_dataset: Samples with a ``"messages"`` field.
        config: Settings dict. Reads ``output_dir``, ``max_steps``,
            ``per_device_train_batch_size``, ``gradient_accumulation_steps``,
            ``learning_rate``, ``warmup_ratio``, ``weight_decay``,
            ``logging_steps``, ``save_steps``, ``max_seq_length`` and
            ``seed``; optionally ``report_to``, ``pin_memory``,
            ``dataloader_num_workers`` and ``cleanup_frequency``.

    Returns:
        The configured ``SFTTrainer``.

    Raises:
        Exception: Any construction error is printed, GPU memory is cleaned
            up, and the error is re-raised.
    """
    print("🔧 Creating T4-optimized trainer...")
    print_gpu_memory()

    try:
        # Enable training with memory optimization
        FastVisionModel.for_training(model)

        data_collator = T4VisionDataCollator(
            processor,
            max_length=config["max_seq_length"]
        )

        training_args = SFTConfig(
            # Basic settings - minimal for T4
            output_dir=config["output_dir"],
            max_steps=config["max_steps"],
            per_device_train_batch_size=config["per_device_train_batch_size"],
            gradient_accumulation_steps=config["gradient_accumulation_steps"],

            # Optimization - memory focused
            learning_rate=config["learning_rate"],
            warmup_ratio=config["warmup_ratio"],
            weight_decay=config["weight_decay"],

            # Memory-critical optimizations
            optim="adamw_8bit",  # 8-bit optimizer
            lr_scheduler_type="linear",  # Simpler scheduler

            # Aggressive memory settings
            gradient_checkpointing=True,  # Essential
            dataloader_pin_memory=config.get("pin_memory", False),
            dataloader_num_workers=config.get("dataloader_num_workers", 0),
            max_grad_norm=1.0,  # Conservative

            # Minimal logging
            logging_steps=config["logging_steps"],
            save_strategy="steps",
            save_steps=config["save_steps"],
            # Reporting is disabled with the string "none"; passing None would
            # select the library default instead.
            report_to=config.get("report_to") or "none",

            # The collator builds the batches, so the trainer must not drop
            # columns or pre-tokenize the dataset.
            remove_unused_columns=False,
            dataset_text_field="",
            dataset_kwargs={"skip_prepare_dataset": True},

            # Memory management
            save_total_limit=1,  # Keep only 1 checkpoint

            # Precision - mixed for T4
            fp16=True,  # Use FP16 for T4
            bf16=False,  # T4 doesn't support bf16 efficiently

            # Reproducibility
            seed=config["seed"],
        )

        # Accept a full processor (with .tokenizer) as well as a bare tokenizer.
        tokenizer = getattr(processor, "tokenizer", processor)

        trainer = SFTTrainer(
            model=model,
            train_dataset=train_dataset,
            processing_class=tokenizer,
            data_collator=data_collator,
            args=training_args,
            # Registered here so the memory monitoring really runs during training.
            callbacks=[T4MemoryCallback(cleanup_frequency=config.get("cleanup_frequency", 5))],
        )

        print("✅ T4-optimized trainer created!")
        print_gpu_memory()

        return trainer

    except Exception as e:
        print(f"❌ Trainer creation failed: {e}")
        cleanup_memory()
        raise

# Memory monitoring callback for T4
class T4MemoryCallback(TrainerCallback):
    """Trainer callback that frees cached GPU memory at a fixed step interval.

    Every ``cleanup_frequency`` steps the callback runs ``cleanup_memory`` and
    prints the memory state. After every step it also forces a cleanup when
    more than 12 GB are allocated.

    Args:
        cleanup_frequency: Number of steps between cleanups; values below 1
            are treated as 1.
    """

    def __init__(self, cleanup_frequency: int = 5):
        # Clamp to at least 1 so the modulo below can never divide by zero.
        self.cleanup_frequency = max(1, int(cleanup_frequency))
        self.step_count = 0

    def on_step_end(self, args=None, state=None, control=None, **kwargs):
        """Run after each training step.

        The parameters follow the ``TrainerCallback`` hook signature. They all
        default to ``None`` and are not read, so the method can also be called
        manually with other positional arguments.
        """
        self.step_count += 1

        # Periodic memory cleanup
        if self.step_count % self.cleanup_frequency == 0:
            cleanup_memory()
            print_gpu_memory()

        # Emergency memory check
        if torch.cuda.is_available():
            allocated = torch.cuda.memory_allocated() / 1024**3
            if allocated > 12.0:  # Warning at 12GB
                print(f"⚠️ High memory usage: {allocated:.2f}GB - forcing cleanup")
                cleanup_memory()

# Confirm the trainer helpers are defined; no trainer has been created yet.
print("✅ T4-optimized training functions loaded")
print_gpu_memory()


In [None]:
# Step 1: Pre-training Memory Check and Setup
# Baseline reading before anything is loaded.
print("🔍 T4 GPU Memory Analysis:")
print_gpu_memory()

# Create output directory
# exist_ok=True makes this cell safe to re-run.
os.makedirs(CONFIG["output_dir"], exist_ok=True)
print(f"📁 Output directory: {CONFIG['output_dir']}")

# Initial cleanup
# Frees whatever earlier cells left cached, then shows the result.
cleanup_memory()
print("🧹 Initial memory cleanup completed")
print_gpu_memory()


In [None]:
# Step 2: Load T4-Optimized Model
# Defines the notebook-level `model` and `processor` used by every later step.
print("🔧 Loading Gemma-2-2B model optimized for T4...")
print("⚠️ This step requires careful memory management")

try:
    model, processor = setup_gemma2b_t4(CONFIG)
    print("✅ Model loaded successfully on T4!")
    
    # Verify model is on GPU
    # Only printed when the model object exposes a `device` attribute.
    if hasattr(model, 'device'):
        print(f"📍 Model device: {model.device}")
    
    print_gpu_memory()
    
except Exception as e:
    # Print recovery hints, then re-raise so a "Run All" stops here.
    print(f"❌ Model loading failed: {e}")
    print("💡 Suggestions:")
    print("   - Restart kernel and clear all variables")
    print("   - Use an even smaller model if available")
    print("   - Consider upgrading to larger GPU")
    raise


In [None]:
# Step 3: Prepare Dataset (Limited for T4)
# Defines the notebook-level `train_dataset` used by the collator test and the trainer.
print("📊 Preparing dataset with T4 memory constraints...")

try:
    # Limited dataset for T4 memory
    train_dataset = prepare_dataset_t4(
        CONFIG["dataset_name"], 
        CONFIG["train_split"], 
        max_samples=300  # Very limited for T4
    )
    
    print(f"\n📈 T4 Dataset Statistics:")
    print(f"   Total samples: {len(train_dataset)}")
    print(f"   Memory-optimized for: {CONFIG['max_seq_length']} max sequence length")
    print(f"   Max images per sample: {CONFIG['max_images_per_sample']}")
    
    # Quick sample check
    if len(train_dataset) > 0:
        sample = train_dataset[0]
        messages = sample.get("messages", [])
        print(f"   Sample format: {len(messages)} messages")
        
        # Count images in the first few samples. Rows are read one index at a
        # time: slicing a Dataset returns a dict of columns, not a list of rows,
        # so iterating over a slice would yield column names instead of samples.
        total_images = 0
        for row_index in range(min(10, len(train_dataset))):
            for msg in train_dataset[row_index].get("messages", []):
                for content in msg.get("content") or []:
                    if content.get("type") == "image":
                        total_images += 1
        
        print(f"   Images in first 10 samples: {total_images}")
    
    print_gpu_memory()
    
except Exception as e:
    # Report, free GPU memory, and re-raise so a "Run All" stops here.
    print(f"❌ Dataset preparation failed: {e}")
    cleanup_memory()
    raise


In [None]:
# Step 4: Test T4 Data Collator
# Smoke test: collate a single sample and check that the batch is trainable.
print("🧪 Testing T4-optimized data collator...")

try:
    # Create and test collator
    test_collator = T4VisionDataCollator(processor, max_length=CONFIG["max_seq_length"])
    
    # Test with minimal batch
    test_samples = [train_dataset[0]]
    test_batch = test_collator(test_samples)
    
    # A batch without labels, or whose labels are all -100, gives the loss
    # nothing to learn from, so treat it as a failure instead of a pass.
    if "labels" not in test_batch or not bool((test_batch["labels"] != -100).any()):
        raise RuntimeError("Collator returned a batch with no supervised tokens")
    
    print("✅ Data collator test passed!")
    print(f"   Batch keys: {list(test_batch.keys())}")
    
    for key, value in test_batch.items():
        if hasattr(value, 'shape'):
            print(f"   {key}: {value.shape}")
            
    # Memory check after test
    print_gpu_memory()
    
    # Cleanup test batch
    del test_batch
    cleanup_memory()
    
except Exception as e:
    # Report, free GPU memory, and re-raise so a "Run All" stops here.
    print(f"❌ Data collator test failed: {e}")
    print("💡 This indicates the model/data is too large for T4")
    cleanup_memory()
    raise


In [None]:
# Step 5: Create T4 Trainer and Start Training
# Defines the notebook-level `trainer` and `trainer_stats`.
print("🎯 Creating T4-optimized trainer...")

try:
    trainer = create_t4_trainer(model, processor, train_dataset, CONFIG)
    
    print(f"\n🚀 Starting T4-optimized training...")
    print(f"   Model: {CONFIG['model_name']}")
    print(f"   Dataset samples: {len(train_dataset)}")
    print(f"   Max steps: {CONFIG['max_steps']}")
    print(f"   Batch size: {CONFIG['per_device_train_batch_size']}")
    print(f"   Gradient accumulation: {CONFIG['gradient_accumulation_steps']}")
    print(f"   Effective batch size: {CONFIG['per_device_train_batch_size'] * CONFIG['gradient_accumulation_steps']}")
    print(f"   Learning rate: {CONFIG['learning_rate']}")
    print(f"   LoRA rank: {CONFIG['lora_r']}")
    print(f"   Max sequence length: {CONFIG['max_seq_length']}")
    
    # No callback object is created in this cell: one built here and never
    # attached to the trainer would have no effect on the run.
    print("\n💾 Starting training with memory monitoring...")
    print_gpu_memory()
    
    # Train with careful memory management
    trainer_stats = trainer.train()
    
    print("\n🎉 Training completed successfully on T4!")
    print(f"   Final loss: {trainer_stats.training_loss:.4f}")
    print_gpu_memory()
    
except Exception as e:
    # Print troubleshooting hints, free GPU memory, and re-raise.
    print(f"\n❌ Training failed: {e}")
    print("\n💡 T4 Memory Troubleshooting:")
    print("   1. Reduce max_steps further (try 25)")
    print("   2. Reduce max_seq_length to 512")
    print("   3. Reduce gradient_accumulation_steps to 8")
    print("   4. Restart kernel and retry")
    
    # Emergency cleanup
    cleanup_memory()
    raise


In [None]:
# Step 6: Save T4-Optimized Model
# Writes the LoRA adapters and tokenizer files to CONFIG["output_dir"].
print("💾 Saving T4-trained model...")

try:
    # Accept a full processor (with .tokenizer) as well as a bare tokenizer, so
    # a missing attribute cannot silently prevent the save.
    tokenizer_for_save = getattr(processor, "tokenizer", processor)
    
    # Save with memory optimization
    model.save_pretrained_merged(
        CONFIG["output_dir"], 
        tokenizer_for_save, 
        save_method="lora",
        maximum_memory_usage=0.8  # Limit memory usage during save
    )
    
    print(f"✅ Model saved successfully to {CONFIG['output_dir']}")
    print_gpu_memory()
    
except Exception as e:
    # Saving is best-effort: report the failure and keep the trained model in memory.
    print(f"❌ Failed to save model: {e}")
    print("💡 Model training was successful but saving failed")

# Final cleanup
cleanup_memory()
print("\n🎉 T4-optimized fine-tuning pipeline completed!")
print(f"📁 Model artifacts (if saved): {CONFIG['output_dir']}")


In [None]:
# Step 7: Quick T4 Inference Test (Optional)
# Text-only generation smoke test; a failure here is reported but does not stop the notebook.
print("🔮 Testing inference on T4 (memory-optimized)...")

try:
    # Enable inference mode with memory optimization
    FastVisionModel.for_inference(model)
    
    # Accept a full processor (with .tokenizer) as well as a bare tokenizer.
    inference_tokenizer = getattr(processor, "tokenizer", processor)
    
    # Simple test without images to save memory
    test_text = "user: Tính tổng của 5 + 3?\nassistant:"
    
    # Tokenize with memory limits
    inputs = inference_tokenizer(
        test_text,
        return_tensors="pt",
        max_length=256,  # Very short for T4
        truncation=True,
        padding=True
    ).to(model.device)
    
    print_gpu_memory()
    
    # Generate with conservative settings
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=50,  # Very limited
            temperature=0.7,
            do_sample=True,
            pad_token_id=inference_tokenizer.eos_token_id,
            use_cache=False  # Save memory
        )
    
    # Decode response
    response = inference_tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    print("✅ T4 inference test successful!")
    print(f"\n📝 Test Response:")
    print(response)
    print_gpu_memory()
    
except Exception as e:
    # Inference is optional: report the failure without re-raising.
    print(f"❌ Inference test failed: {e}")
    print("💡 Model is trained but inference needs more memory optimization")

# Final cleanup
cleanup_memory()
print("\n🎯 T4 fine-tuning pipeline completed successfully!")
print("\n📋 T4 Optimization Summary:")
print(f"   • Model: Gemma-2-2B (reduced from 9B)")
print(f"   • Sequence length: {CONFIG['max_seq_length']} (reduced from 2048)")
print(f"   • LoRA rank: {CONFIG['lora_r']} (reduced from 16)")
print(f"   • Batch size: {CONFIG['per_device_train_batch_size']} with {CONFIG['gradient_accumulation_steps']} accumulation")
print(f"   • Memory management: Aggressive cleanup + FP16")
print(f"   • Dataset: Limited to 300 samples for T4")
