<a href="https://colab.research.google.com/github/ayagup/stablediffusion/blob/main/hf_lora_gpu_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import (
    get_peft_model,
    LoraConfig,
    TaskType,
    prepare_model_for_kbit_training
)
from datasets import Dataset
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import time
import json
import gc
import os
from torch.utils.tensorboard import SummaryWriter

# Report the PyTorch build and pick the compute device for this session
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if not torch.cuda.is_available():
    # No CUDA device reported: everything below will run on the CPU
    print("Using CPU")
    device = torch.device("cpu")
else:
    # Describe CUDA device 0; total_memory is divided by 1024**3 and labelled GB
    print(f"GPU device: {torch.cuda.get_device_name(0)}")
    print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
    device = torch.device("cuda")


In [None]:

class TensorBoardCallback:
    """Write training and evaluation scalars to a TensorBoard event directory.

    This is a plain helper object: the owning trainer calls ``setup`` once,
    then ``on_log`` / ``on_evaluate`` with metric dictionaries, and finally
    ``close``. Until ``setup`` has run, ``self.writer`` is ``None`` and the
    logging methods do nothing.
    """

    def __init__(self, log_dir="./logs"):
        """Remember the log directory; the writer is created later by ``setup``."""
        self.log_dir = log_dir
        self.writer = None

    def setup(self, trainer):
        """Create the log directory if needed and open the ``SummaryWriter``.

        ``trainer`` is accepted for symmetry with the other hooks and is not
        used here.
        """
        # exist_ok avoids the check-then-create race of exists() + makedirs()
        os.makedirs(self.log_dir, exist_ok=True)
        self.writer = SummaryWriter(log_dir=self.log_dir)
        print(f"📊 TensorBoard logging to: {self.log_dir}")

    def on_log(self, trainer, logs=None):
        """Record every numeric entry of ``logs`` under ``train/<key>``.

        Also records the scheduler's current learning rate (when one can be
        read) and, on CUDA machines, the allocated GPU memory in GiB. The
        step used for all scalars is ``trainer.state.global_step``.
        """
        if not (self.writer and logs):
            return

        step = trainer.state.global_step

        for key, value in logs.items():
            if isinstance(value, (int, float)):
                self.writer.add_scalar(f"train/{key}", value, step)

        # Learning-rate logging is best effort: a scheduler may lack
        # get_last_lr() or may not have a value yet. Only those lookup
        # failures are ignored, so KeyboardInterrupt and genuine bugs are no
        # longer swallowed as they were by the previous bare ``except``.
        scheduler = getattr(trainer, 'lr_scheduler', None)
        if scheduler is not None:
            try:
                current_lr = scheduler.get_last_lr()[0]
            except (AttributeError, IndexError, TypeError):
                current_lr = None
            if current_lr is not None:
                self.writer.add_scalar("train/learning_rate", current_lr, step)

        # Log GPU memory usage (bytes allocated, converted with 1024**3)
        if torch.cuda.is_available():
            gpu_memory = torch.cuda.memory_allocated() / 1024**3
            self.writer.add_scalar("system/gpu_memory_gb", gpu_memory, step)

    def on_evaluate(self, trainer, logs=None):
        """Record numeric ``eval_*`` entries of ``logs`` under ``eval/<name>``.

        Only the leading ``eval_`` is removed from the key, so a metric such
        as ``eval_my_eval_score`` is written as ``eval/my_eval_score``.
        """
        if not (self.writer and logs):
            return

        step = trainer.state.global_step
        prefix = 'eval_'

        for key, value in logs.items():
            if isinstance(value, (int, float)) and key.startswith(prefix):
                # Slice instead of str.replace(): replace() would also delete
                # any later "eval_" occurrences inside the metric name.
                clean_key = key[len(prefix):]
                self.writer.add_scalar(f"eval/{clean_key}", value, step)

    def close(self):
        """Close the TensorBoard writer if one was opened."""
        if self.writer:
            self.writer.close()


In [None]:

class CustomTrainer(Trainer):
    """Trainer subclass that forwards logs and eval metrics to a TensorBoard helper.

    Accepts one extra keyword argument, ``tb_callback``. When it is truthy,
    its ``setup`` is called after the parent constructor, ``on_log`` after
    every ``log`` call and ``on_evaluate`` after every ``evaluate`` call.
    """

    def __init__(self, *args, **kwargs):
        # Remove our private keyword before the parent constructor sees it
        helper = kwargs.pop('tb_callback', None)
        self.tb_callback = helper
        super().__init__(*args, **kwargs)

        if self.tb_callback:
            self.tb_callback.setup(self)

    def log(self, logs, start_time=None):
        """Run the parent ``log`` and then mirror ``logs`` to the helper.

        ``start_time`` is passed to the parent only when it is not ``None``
        (presumably so the call also works with a parent ``log`` that takes
        ``logs`` alone — confirm against the installed transformers version).
        """
        extra_args = () if start_time is None else (start_time,)
        super().log(logs, *extra_args)

        helper = self.tb_callback
        if helper:
            helper.on_log(self, logs)

    def evaluate(self, *args, **kwargs):
        """Run the parent ``evaluate``, hand its result to the helper, and return it."""
        metrics = super().evaluate(*args, **kwargs)

        helper = self.tb_callback
        if helper:
            helper.on_evaluate(self, metrics)

        return metrics


In [None]:

class SQLLoRATrainer:
    def __init__(self, log_dir="./tensorboard_logs"):
        """Create an empty pipeline object; no model or data is loaded here.

        Args:
            log_dir: Directory handed to the TensorBoard helper and reused as
                the logging directory when training arguments are built.
        """
        # TensorBoard destination and the helper object that writes to it
        self.log_dir = log_dir
        self.tb_callback = TensorBoardCallback(log_dir)

        # Placeholders; tokenizer/model are assigned by load_base_model and
        # trained_model by train_model. Nothing visible here assigns dataset.
        self.tokenizer = self.model = self.dataset = self.trained_model = None

    def create_sql_dataset(self, num_samples=200):
        """
        Create a synthetic SQL problems dataset with consistent formatting
        """
        print(f"\n{'='*60}")
        print(f"🗃️  CREATING SQL DATASET")
        print(f"{'='*60}")

        # Define SQL problem templates with consistent structure
        sql_problems = []

        # Template 1: SELECT queries
        select_templates = [
            {
                "description": "Find all customers from a specific city",
                "sql": "SELECT * FROM customers WHERE city = 'New York';",
                "explanation": "This query selects all columns from the customers table where the city is New York."
            },
            {
                "description": "Get customer names and emails ordered by name",
                "sql": "SELECT name, email FROM customers ORDER BY name ASC;",
                "explanation": "This query retrieves customer names and emails, sorted alphabetically by name."
            },
            {
                "description": "Count total number of orders",
                "sql": "SELECT COUNT(*) FROM orders;",
                "explanation": "This query counts the total number of records in the orders table."
            },
            {
                "description": "Find products with price greater than 100",
                "sql": "SELECT product_name, price FROM products WHERE price > 100;",
                "explanation": "This query finds all products with a price greater than 100."
            },
            {
                "description": "Get unique customer cities",
                "sql": "SELECT DISTINCT city FROM customers;",
                "explanation": "This query returns unique city values from the customers table."
            }
        ]

        # Template 2: JOIN queries
        join_templates = [
            {
                "description": "Get customer names with their order details",
                "sql": "SELECT c.name, o.order_date, o.total FROM customers c JOIN orders o ON c.customer_id = o.customer_id;",
                "explanation": "This query joins customers and orders tables to show customer names with their order information."
            },
            {
                "description": "Find products ordered by each customer",
                "sql": "SELECT c.name, p.product_name FROM customers c JOIN orders o ON c.customer_id = o.customer_id JOIN order_items oi ON o.order_id = oi.order_id JOIN products p ON oi.product_id = p.product_id;",
                "explanation": "This query uses multiple joins to connect customers with the products they ordered."
            },
        ]

        # Template 3: Aggregate queries
        aggregate_templates = [
            {
                "description": "Calculate total sales by customer",
                "sql": "SELECT customer_id, SUM(total) as total_sales FROM orders GROUP BY customer_id;",
                "explanation": "This query calculates the total sales amount for each customer using GROUP BY and SUM."
            },
            {
                "description": "Find average product price by category",
                "sql": "SELECT category, AVG(price) as avg_price FROM products GROUP BY category;",
                "explanation": "This query calculates the average price of products in each category."
            },
        ]

        # Combine all templates
        all_templates = select_templates + join_templates + aggregate_templates

        # Generate training data with consistent format
        for i in range(num_samples):
            template = np.random.choice(all_templates)

            # Create training example with consistent structure
            problem = {
                "instruction": f"Write a SQL query to: {template['description']}",
                "output": f"```sql\n{template['sql']}\n```\n\nExplanation: {template['explanation']}"
            }

            sql_problems.append(problem)

        print(f"✅ Created {len(sql_problems)} SQL training examples")
        print(f"Sample problem:")
        print(f"  Instruction: {sql_problems[0]['instruction']}")
        print(f"  Output: {sql_problems[0]['output'][:100]}...")

        return sql_problems

    def format_training_data(self, sql_problems):
        """
        Turn instruction/output pairs into single training strings.

        Every item of ``sql_problems`` must provide ``'instruction'`` and
        ``'output'``; the two are joined into one prompt/response text that
        ends with ``<|endoftext|>``. The texts are wrapped in a pandas
        DataFrame and converted with ``Dataset.from_pandas``.

        Returns:
            The resulting dataset, with one ``"text"`` column.
        """
        print("\n🔧 Formatting training data...")

        # One {"text": ...} record per problem, in input order
        records = [
            {"text": f"### Instruction:\n{item['instruction']}\n\n### Response:\n{item['output']}<|endoftext|>"}
            for item in sql_problems
        ]

        frame = pd.DataFrame(records)
        dataset = Dataset.from_pandas(frame)

        print(f"✅ Formatted {len(dataset)} training examples")
        print(f"Sample formatted text (first 200 chars):")
        preview = dataset[0]['text'][:200]
        print(preview + "...")

        return dataset

    def load_base_model(self, model_name="microsoft/DialoGPT-medium"):
        """
        Load the tokenizer and causal-LM base model from Hugging Face.

        The tokenizer gets a ``[PAD]`` pad token and an ``<|endoftext|>`` EOS
        token when it has none, plus four prompt-marker tokens. The model is
        loaded in float16 with ``device_map="auto"`` when CUDA is available
        (float32 and no device map otherwise), and its token embeddings are
        resized whenever any token was added to the tokenizer.

        Args:
            model_name: Model identifier passed to ``from_pretrained``.

        Returns:
            ``True`` once both objects are stored on ``self``.
        """
        print(f"\n{'='*60}")
        print(f"📥 LOADING BASE MODEL: {model_name}")
        print(f"{'='*60}")

        # Load tokenizer
        print("Loading tokenizer...")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        # Count every token added, not only the custom markers: a new pad or
        # EOS token grows the vocabulary too and needs an embedding row.
        tokens_added = 0

        if self.tokenizer.pad_token is None:
            tokens_added += self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})

        if self.tokenizer.eos_token is None:
            tokens_added += self.tokenizer.add_special_tokens({'eos_token': '<|endoftext|>'})

        # Add custom tokens for SQL training
        special_tokens = {
            "additional_special_tokens": ["### Instruction:", "### Response:", "```sql", "```"]
        }
        num_added = self.tokenizer.add_special_tokens(special_tokens)
        tokens_added += num_added

        print(f"✅ Tokenizer loaded. Vocab size: {len(self.tokenizer)}")
        print(f"Added {num_added} special tokens")
        print(f"Pad token: {self.tokenizer.pad_token} (ID: {self.tokenizer.pad_token_id})")
        print(f"EOS token: {self.tokenizer.eos_token} (ID: {self.tokenizer.eos_token_id})")

        # Load model
        print("Loading base model...")
        cuda_available = torch.cuda.is_available()
        # NOTE(review): trust_remote_code=True is kept from the original; it
        # presumably lets a model repository run its own code at load time —
        # confirm it is really required for the models used here.
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16 if cuda_available else torch.float32,
            device_map="auto" if cuda_available else None,
            trust_remote_code=True,
        )

        # Resize embeddings for new tokens. Previously this was gated on the
        # custom-marker count alone, so a tokenizer that only gained a pad or
        # EOS token would have been left with ids beyond the embedding matrix.
        if tokens_added > 0:
            self.model.resize_token_embeddings(len(self.tokenizer))
            print(f"Resized model embeddings to {len(self.tokenizer)}")

        print(f"✅ Base model loaded successfully")
        print(f"Model parameters: {sum(p.numel() for p in self.model.parameters()):,}")

        return True

    def setup_lora_config(self):
        """
        Wrap ``self.model`` in a PEFT LoRA model and report parameter counts.

        On CUDA machines the model first goes through
        ``prepare_model_for_kbit_training``; on CPU that step is skipped.

        Returns:
            The ``LoraConfig`` that was applied.
        """
        banner = '=' * 60
        print(f"\n{banner}")
        print(f"⚙️  SETTING UP LORA CONFIGURATION")
        print(f"{banner}")

        # NOTE(review): the embeddings resized in load_base_model are not in
        # target_modules — confirm whether the added tokens need to be trained.
        lora_config = LoraConfig(
            task_type=TaskType.CAUSAL_LM,
            r=16,  # rank
            lora_alpha=32,  # alpha parameter
            lora_dropout=0.1,  # dropout probability for LoRA layers
            bias="none",  # bias type
            target_modules=["c_attn", "c_proj", "c_fc"],  # modules that receive LoRA adapters
        )

        base_model = self.model
        if torch.cuda.is_available():
            base_model = prepare_model_for_kbit_training(base_model)
        self.model = get_peft_model(base_model, lora_config)

        # Count trainable and total parameters in a single pass
        trainable_params = 0
        total_params = 0
        for param in self.model.parameters():
            count = param.numel()
            total_params += count
            if param.requires_grad:
                trainable_params += count

        print(f"✅ LoRA configuration applied")
        print(f"LoRA rank (r): {lora_config.r}")
        print(f"LoRA alpha: {lora_config.lora_alpha}")
        print(f"Target modules: {lora_config.target_modules}")
        print(f"Trainable parameters: {trainable_params:,}")
        print(f"Total parameters: {total_params:,}")
        print(f"Trainable %: {100 * trainable_params / total_params:.2f}%")

        return lora_config

    def tokenize_dataset(self, dataset, max_length=256):
        """
        Tokenize the ``"text"`` column into fixed-length training features.

        Every example is truncated and padded to ``max_length``; ``labels``
        is added as a copy of ``input_ids``. The original columns are removed
        from the returned dataset.

        Args:
            dataset: Dataset with a ``"text"`` column.
            max_length: Token length every example is padded/truncated to.

        Returns:
            The mapped dataset holding the tokenizer outputs plus ``labels``.
        """
        print(f"\n🔧 Tokenizing dataset with max_length={max_length}...")

        def encode_batch(batch):
            # Fixed-size padding keeps every row the same length
            encoded = self.tokenizer(
                batch["text"],
                max_length=max_length,
                padding="max_length",
                truncation=True,
                return_attention_mask=True,
                return_overflowing_tokens=False,
            )
            # Causal LM: the targets start out identical to the inputs
            encoded["labels"] = list(encoded["input_ids"])
            return encoded

        tokenized_dataset = dataset.map(
            encode_batch,
            batched=True,
            remove_columns=dataset.column_names,
            desc="Tokenizing dataset"
        )

        # Sanity report on the first row and on overall length uniformity
        first_row = tokenized_dataset[0]
        every_row_full_length = all(
            len(row['input_ids']) == max_length for row in tokenized_dataset
        )
        print(f"✅ Dataset tokenized.")
        print(f"Sample input_ids length: {len(first_row['input_ids'])}")
        print(f"Sample attention_mask length: {len(first_row['attention_mask'])}")
        print(f"Sample labels length: {len(first_row['labels'])}")
        print(f"All samples have max_length={max_length}: {every_row_full_length}")

        return tokenized_dataset

    def train_model(self, tokenized_dataset, output_dir="./sql-lora-model"):
        """
        Train the model with LoRA and TensorBoard logging.

        The dataset is split 90/10 (seed 42) into train and validation parts,
        a ``CustomTrainer`` is built around ``self.model`` and training runs
        for 20 epochs. Evaluation on the validation part runs every
        ``eval_steps`` optimizer steps. On success the model and tokenizer
        are saved to ``output_dir`` and ``self.trained_model`` is set.

        Args:
            tokenized_dataset: Tokenized dataset supporting ``train_test_split``.
            output_dir: Directory for checkpoints and the final model.

        Returns:
            The ``CustomTrainer`` on success, or ``None`` if training or
            saving raised an exception (the traceback is printed).
        """
        import dataclasses  # local: only used to probe TrainingArguments fields

        print(f"\n{'='*60}")
        print(f"🚀 STARTING LORA TRAINING WITH TENSORBOARD")
        print(f"{'='*60}")

        # Data collator
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer,
            mlm=False,  # We're doing causal LM, not masked LM
            pad_to_multiple_of=8,  # Pad to multiple of 8 for efficiency
        )

        # do_eval/eval_steps alone do not schedule evaluation; the strategy
        # must be "steps". The argument is named eval_strategy in newer
        # transformers releases and evaluation_strategy in older ones, so use
        # whichever field the installed TrainingArguments declares.
        training_arg_names = {
            field.name for field in dataclasses.fields(TrainingArguments)
        }
        if "eval_strategy" in training_arg_names:
            eval_strategy_kwarg = "eval_strategy"
        else:
            eval_strategy_kwarg = "evaluation_strategy"

        # Training arguments
        training_args = TrainingArguments(
            output_dir=output_dir,
            overwrite_output_dir=True,
            num_train_epochs=20,
            per_device_train_batch_size=2,  # Small batch size to avoid memory issues
            gradient_accumulation_steps=8,  # Raises the effective batch size to 16
            warmup_steps=50,
            learning_rate=2e-4,
            fp16=torch.cuda.is_available(),
            logging_dir=self.log_dir,
            logging_steps=10,
            save_steps=200,
            eval_steps=100,
            do_eval=True,
            save_total_limit=2,
            prediction_loss_only=True,
            remove_unused_columns=False,
            dataloader_pin_memory=False,
            report_to="tensorboard",
            run_name=f"sql-lora-{int(time.time())}",
            dataloader_drop_last=True,  # Drop last incomplete batch
            **{eval_strategy_kwarg: "steps"},
        )

        print(f"📊 TensorBoard Configuration:")
        print(f"  Logging directory: {self.log_dir}")
        print(f"  Run name: {training_args.run_name}")
        print(f"\nTraining configuration:")
        print(f"  Epochs: {training_args.num_train_epochs}")
        print(f"  Batch size: {training_args.per_device_train_batch_size}")
        print(f"  Gradient accumulation: {training_args.gradient_accumulation_steps}")
        print(f"  Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")
        print(f"  Learning rate: {training_args.learning_rate}")
        print(f"  FP16: {training_args.fp16}")

        # Split dataset for training and validation
        train_test_split = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
        train_dataset = train_test_split["train"]
        eval_dataset = train_test_split["test"]

        print(f"Training samples: {len(train_dataset)}")
        print(f"Validation samples: {len(eval_dataset)}")

        # Initialize custom trainer
        trainer = CustomTrainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            data_collator=data_collator,
            tokenizer=self.tokenizer,
            tb_callback=self.tb_callback,
        )

        # Print GPU memory before training
        if torch.cuda.is_available():
            initial_memory = torch.cuda.memory_allocated() / 1024**3
            print(f"GPU memory before training: {initial_memory:.1f}GB")

        # Start training
        print(f"\n🔥 Starting training...")
        print(f"🌐 To view TensorBoard, run in another cell:")
        print(f"   %load_ext tensorboard")
        print(f"   %tensorboard --logdir {self.log_dir}")

        start_time = time.time()

        try:
            trainer.train()
            training_time = time.time() - start_time
            print(f"\n✅ Training completed in {training_time:.2f} seconds!")

            # Save the model
            trainer.save_model()
            self.tokenizer.save_pretrained(output_dir)
            print(f"✅ Model saved to {output_dir}")

            # Store trained model
            self.trained_model = self.model

        except Exception as e:
            # Report and signal failure via None instead of propagating
            print(f"❌ Training failed: {e}")
            import traceback
            traceback.print_exc()
            return None

        # Print final GPU memory
        if torch.cuda.is_available():
            final_memory = torch.cuda.memory_allocated() / 1024**3
            print(f"GPU memory after training: {final_memory:.1f}GB")

        return trainer

    def test_trained_model(self, test_prompts=None):
        """
        Test the trained model with SQL prompts
        """
        print(f"\n{'='*60}")
        print(f"🧪 TESTING TRAINED MODEL")
        print(f"{'='*60}")

        if test_prompts is None:
            test_prompts = [
                "Write a SQL query to: Find all customers from California",
                "Write a SQL query to: Calculate the total revenue by product category",
                "Write a SQL query to: Get the top 10 best-selling products",
                "Write a SQL query to: Find customers who haven't placed any orders"
            ]

        if self.trained_model is None:
            print("❌ No trained model available")
            return []

        self.trained_model.eval()

        results = []

        for i, prompt in enumerate(test_prompts, 1):
            print(f"\n📝 Test {i}/{len(test_prompts)}:")
            print(f"{'─'*50}")
            print(f"💭 Prompt: {prompt}")
            print(f"{'─'*50}")

            try:
                # Format prompt
                formatted_prompt = f"### Instruction:\n{prompt}\n\n### Response:\n"

                # Tokenize
                inputs = self.tokenizer(
                    formatted_prompt,
                    return_tensors="pt",
                    truncation=True,
                    max_length=128,
                    padding=False
                )

                # Move to device
                inputs = {k: v.to(self.trained_model.device) for k, v in inputs.items()}

                # Generate
                with torch.no_grad():
                    outputs = self.trained_model.generate(
                        **inputs,
                        max_new_tokens=100,
                        do_sample=True,
                        temperature=0.7,
                        top_p=0.9,
                        pad_token_id=self.tokenizer.pad_token_id,
                        eos_token_id=self.tokenizer.eos_token_id,
                    )

                # Decode
                full_response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

                # Extract generated part
                if "### Response:" in full_response:
                    response = full_response.split("### Response:")[-1].strip()
                else:
                    response = full_response[len(formatted_prompt):].strip()

                print(f"🤖 Generated SQL:\n{response}")

                results.append({
                    'prompt': prompt,
                    'response': response,
                    'success': True
                })

            except Exception as e:
                print(f"❌ Error: {e}")
                results.append({
                    'prompt': prompt,
                    'response': f"Error: {e}",
                    'success': False
                })

            print(f"{'─'*50}")

        # Summary
        successful = sum(1 for r in results if r['success'])
        print(f"\n📊 Testing Summary:")
        print(f"Total tests: {len(test_prompts)}")
        print(f"Successful: {successful}")
        print(f"Failed: {len(test_prompts) - successful}")

        return results

    def close_tensorboard(self):
        """Close TensorBoard writer"""
        self.tb_callback.close()


In [None]:

def main():
    """
    Run the full pipeline: data, model, LoRA setup, training and a smoke test.

    Returns:
        ``(trainer, test_results)`` when the pipeline runs to the end;
        ``test_results`` is an empty list if training itself failed.
        ``(None, None)`` when any step raised an ``Exception`` (the traceback
        is printed).
    """
    trainer = None
    try:
        print("🚀 Starting SQL LoRA Training Pipeline with TensorBoard...")

        # Initialize trainer
        trainer = SQLLoRATrainer(log_dir="./tensorboard_logs")

        # Create SQL dataset - smaller for testing
        sql_problems = trainer.create_sql_dataset(num_samples=200)

        # Format training data
        dataset = trainer.format_training_data(sql_problems)

        # Load base model
        trainer.load_base_model("microsoft/DialoGPT-medium")

        # Setup LoRA (the returned config is not needed here)
        trainer.setup_lora_config()

        # Tokenize dataset with smaller max_length
        tokenized_dataset = trainer.tokenize_dataset(dataset, max_length=256)

        # Train model
        training_result = trainer.train_model(tokenized_dataset)

        if training_result is not None:
            # Test trained model
            test_results = trainer.test_trained_model()

            print(f"\n✅ SQL LoRA Training Pipeline Completed Successfully!")
        else:
            print(f"\n❌ Training failed")
            test_results = []

        print(f"\n📊 TensorBoard Instructions:")
        print(f"1. In a new cell, run: %load_ext tensorboard")
        print(f"2. Then run: %tensorboard --logdir ./tensorboard_logs")

        return trainer, test_results

    except Exception as e:
        print(f"❌ Error in training pipeline: {e}")
        import traceback
        traceback.print_exc()
        return None, None

    finally:
        # Close TensorBoard on every exit path, including an exception or a
        # manual interrupt during training; previously the writer was closed
        # only when the pipeline reached its last statements.
        if trainer is not None:
            trainer.close_tensorboard()


In [None]:

def quick_gpu_test():
    """
    Check that CUDA is usable by multiplying two random matrices on the GPU.

    Returns:
        ``True`` when CUDA is available and the multiplication ran without
        raising; ``False`` when CUDA is unavailable or any exception occurred.
    """
    print("🔬 Quick GPU Test")

    if torch.cuda.is_available():
        try:
            # Two 1000x1000 operands created on the CPU, then moved to the GPU
            lhs = torch.randn(1000, 1000).cuda()
            rhs = torch.randn(1000, 1000).cuda()
            product = lhs @ rhs

            print(f"✅ GPU test passed!")
            print(f"GPU: {torch.cuda.get_device_name(0)}")
            print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f}GB")

            # Drop the test tensors and release cached GPU memory
            del lhs, rhs, product
            torch.cuda.empty_cache()
        except Exception as e:
            print(f"❌ GPU test failed: {e}")
            return False
        return True

    print("❌ CUDA not available. Please switch to GPU runtime.")
    print("Runtime -> Change runtime type -> GPU")
    return False


In [None]:

if __name__ == "__main__":
    print("🎯 SQL LoRA Training on GPU with TensorBoard (Fixed v3)")
    print("Make sure you're using GPU runtime in Colab!")

    # Only start the pipeline once the GPU smoke test has passed
    if not quick_gpu_test():
        print("Please switch to GPU runtime and try again.")
    else:
        trainer_obj, results = main()

        # Both values must be truthy: main() returns an empty results list
        # when training failed and (None, None) when the pipeline raised.
        if not (trainer_obj and results):
            print(f"\n❌ Training failed. Check the logs above.")
        else:
            print(f"\n🎉 Training completed successfully!")
            print(f"📊 Check TensorBoard for detailed metrics and visualizations.")

In [None]:
%load_ext tensorboard

In [None]:
%tensorboard --logdir ./tensorboard_logs