In [None]:
# Install packages with specific versions to avoid conflicts
!pip install --no-deps unsloth
!pip install --upgrade --no-deps transformers==4.44.0
!pip install --upgrade trl==0.9.6
!pip install --upgrade torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0
!pip install --upgrade bitsandbytes xformers

# Verify installations
import sys
print(f"Python version: {sys.version}")

try:
    import torch
    print(f"PyTorch version: {torch.__version__}")
    print(f"CUDA available: {torch.cuda.is_available()}")
    if torch.cuda.is_available():
        print(f"CUDA version: {torch.version.cuda}")
except ImportError as e:
    print(f"PyTorch import error: {e}")

try:
    import transformers
    print(f"Transformers version: {transformers.__version__}")
except ImportError as e:
    print(f"Transformers import error: {e}")

try:
    import trl
    print(f"TRL version: {trl.__version__}")
except ImportError as e:
    print(f"TRL import error: {e}")

try:
    import unsloth
    print(f"Unsloth imported successfully")
except ImportError as e:
    print(f"Unsloth import error: {e}")

In [None]:
import torch
from unsloth import FastLanguageModel
from datasets import load_dataset
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth.chat_templates import get_chat_template, standardize_sharegpt


In [None]:
# Environment overrides intended as a workaround for Triton compiler issues.
# NOTE(review): this cell runs after the import cell has already imported
# `unsloth`; presumably these variables are only read at import time, in which
# case they would need to be set before that cell -- confirm.
import os

_workaround_env = {
    "TRITON_INTERPRET": "1",        # intended: use interpreter instead of compiler
    "TRITON_KERNEL_CACHE": "0",     # intended: disable kernel caching
    "UNSLOTH_IS_PRESENT": "False",  # intended: disable Unsloth optimizations
    "DISABLE_UNSLOTH": "1",         # intended: force disable Unsloth features
}
os.environ.update(_workaround_env)

# Alternative: Use standard transformers if Unsloth fails
print("🔧 Setting up fallback to standard transformers training...")

In [None]:
# Load the base model and its tokenizer with 4-bit loading enabled.
_load_kwargs = dict(
    model_name="unsloth/Llama-3.2-3B-Instruct",
    max_seq_length=2048,
    load_in_4bit=True,
    device_map="auto",  # explicit device mapping
)
model, tokenizer = FastLanguageModel.from_pretrained(**_load_kwargs)

In [None]:
# Wrap the loaded model with PEFT adapters (r=16) on the listed projection modules.
_adapter_targets = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=_adapter_targets,
)

In [None]:
# Replace the tokenizer with one configured for the "llama-3.1" chat template.
_chat_template_name = "llama-3.1"
tokenizer = get_chat_template(tokenizer, chat_template=_chat_template_name)

In [None]:
# Load the training split of the FineTome-100k dataset.
_dataset_id = "mlabonne/FineTome-100k"
dataset = load_dataset(_dataset_id, split="train")

In [None]:
# Custom callback to monitor training progress
from transformers import TrainerCallback
import time

class ProgressCallback(TrainerCallback):
    """Trainer callback that prints a progress line on every log event.

    Each line shows the current step, the loss, the learning rate and the time
    elapsed since training began, followed by the allocated GPU memory when
    CUDA is available.
    """

    def __init__(self):
        # Set in on_train_begin; stays None until training actually starts.
        self.start_time = None

    @staticmethod
    def _format_metric(value, spec):
        """Format ``value`` with format spec ``spec``, or return 'N/A'.

        Missing metrics arrive as None (or any non-numeric placeholder); applying
        a float format spec to those raises, so they are rendered as 'N/A'.
        """
        try:
            return format(value, spec)
        except (TypeError, ValueError):
            return "N/A"

    def _elapsed(self):
        """Seconds since on_train_begin, or 0.0 if it has not run yet."""
        if self.start_time is None:
            return 0.0
        return time.time() - self.start_time

    def on_train_begin(self, args, state, control, **kwargs):
        """Record the start time and announce the start of training."""
        self.start_time = time.time()
        print("🔥 Training started!")

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Print step, loss, learning rate, elapsed time and GPU memory."""
        if not logs:
            return

        step = state.global_step
        total_steps = state.max_steps

        # Intermediate log events are expected to report 'loss' while the final
        # summary reports 'train_loss'; accept either and tolerate both missing.
        loss = self._format_metric(logs.get('loss', logs.get('train_loss')), '.4f')
        lr = self._format_metric(logs.get('learning_rate'), '.2e')

        print(f"📊 Step {step}/{total_steps} | Loss: {loss} | LR: {lr} | Time: {self._elapsed():.1f}s")

        if torch.cuda.is_available():
            gpu_mem = torch.cuda.memory_allocated() / 1024**3
            print(f"   GPU Memory: {gpu_mem:.2f} GB")

    def on_train_end(self, args, state, control, **kwargs):
        """Print the total wall-clock training time."""
        print(f"🎉 Training completed in {self._elapsed():.1f} seconds!")

# Create callback instance
progress_callback = ProgressCallback()

In [None]:
# Pass the dataset through Unsloth's `standardize_sharegpt` helper and rebind
# `dataset` to the result.
# NOTE(review): presumably this converts ShareGPT-style turns into role/content
# messages usable by the chat template -- confirm against the Unsloth docs.
dataset = standardize_sharegpt(dataset)

In [None]:
# Tokenize the dataset properly for standard training
def tokenize_function(examples):
    """Tokenize one batch of examples and attach causal-LM labels.

    Args:
        examples: A batch mapping column name -> list of values. It must contain
            either a "text" column (already formatted strings) or a
            "conversations" column (lists of chat messages).

    Returns:
        The tokenizer output for the batch with an extra "labels" entry that
        mirrors "input_ids".

    Raises:
        KeyError: If the batch has neither a "text" nor a "conversations" column.
    """
    if "text" in examples:
        texts = examples["text"]
        add_special_tokens = True
    elif "conversations" in examples:
        # No earlier cell creates a "text" column, so render each conversation
        # through the tokenizer's chat template first.
        texts = [
            tokenizer.apply_chat_template(
                convo, tokenize=False, add_generation_prompt=False
            )
            for convo in examples["conversations"]
        ]
        # Text rendered by a chat template is expected to already carry its
        # special tokens, so do not let the tokenizer add them a second time.
        add_special_tokens = False
    else:
        raise KeyError(
            "Expected a 'text' or 'conversations' column, got: "
            f"{list(examples.keys())}"
        )

    tokenized = tokenizer(
        texts,
        truncation=True,
        padding=False,
        max_length=2048,
        return_tensors=None,
        add_special_tokens=add_special_tokens,
    )
    # For causal LM, labels are the same as input_ids (copied per row so the two
    # columns never share list objects).
    tokenized["labels"] = [list(ids) for ids in tokenized["input_ids"]]
    return tokenized

# Apply tokenization
print("🔄 Tokenizing dataset...")
dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset.column_names,  # Remove original columns
    desc="Tokenizing dataset"
)

print(f"✅ Dataset tokenized. Sample count: {len(dataset)}")
print(f"✅ Sample keys: {dataset[0].keys()}")

In [None]:
# Ensure model is on GPU before creating trainer.
# The model was loaded with device_map="auto", so it is only moved when it is
# still on the CPU; an unconditional .cuda() would override that placement.
# NOTE(review): moving bitsandbytes-quantized weights with .cuda() may also be
# rejected by some transformers versions -- confirm.
if torch.cuda.is_available():
    _param_device = next(model.parameters()).device
    if _param_device.type == "cuda":
        print(f"✅ Model already on GPU: {_param_device}")
    else:
        model = model.cuda()
        print(f"✅ Model moved to GPU: {next(model.parameters()).device}")
else:
    print("❌ GPU not available")

# FALLBACK: Use standard Transformers Trainer instead of SFTTrainer
from transformers import Trainer, DataCollatorForLanguageModeling

# Create data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # We're doing causal LM, not masked LM
)

# Create standard trainer (no Unsloth optimizations)
trainer = Trainer(
    model=model,
    train_dataset=dataset,
    data_collator=data_collator,
    callbacks=[progress_callback],
    args=TrainingArguments(
        per_device_train_batch_size=1,  # Smaller batch size for safety
        gradient_accumulation_steps=8,  # Compensate with more accumulation
        warmup_steps=5,
        max_steps=60,
        learning_rate=2e-4,
        # Use FP32 to avoid any precision issues
        fp16=False,
        bf16=False,
        logging_steps=1,
        logging_strategy="steps",
        output_dir="outputs",
        dataloader_pin_memory=False,
        remove_unused_columns=False,
        report_to=[],
        save_strategy="no",  # Don't save checkpoints during training
        # No evaluation: the deprecated `evaluation_strategy="no"` argument is
        # omitted because "no" is already the default.
    ),
)

print("✅ Standard Trainer created (fallback mode - no Unsloth optimizations)")

In [None]:
# GPU / environment diagnostics: prints CUDA status, device properties, memory
# usage and where the model currently lives. Also clears the CUDA cache.
import os


def _gib(num_bytes):
    """Convert a byte count to units of 1024**3 bytes."""
    return num_bytes / 1024**3


def _print_gpu_summary():
    """Print GPU count, name, total memory and CUDA version, then clear the cache."""
    print(f"GPU count: {torch.cuda.device_count()}")
    print(f"GPU name: {torch.cuda.get_device_name(0)}")
    print(f"Total GPU memory: {_gib(torch.cuda.get_device_properties(0).total_memory):.1f} GB")
    print(f"CUDA version: {torch.version.cuda}")

    # Clear cache for fresh start
    torch.cuda.empty_cache()
    print("GPU cache cleared")


def _print_gpu_memory():
    """Print the currently allocated and reserved GPU memory."""
    print(f"GPU memory allocated: {_gib(torch.cuda.memory_allocated()):.2f} GB")
    print(f"GPU memory reserved: {_gib(torch.cuda.memory_reserved()):.2f} GB")


# Check if we're in Kaggle
is_kaggle = 'KAGGLE_URL_BASE' in os.environ

# Check CUDA availability and device status
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA device count: {torch.cuda.device_count()}")
if torch.cuda.is_available():
    print(f"Current device: {torch.cuda.current_device()}")
    print(f"Device name: {torch.cuda.get_device_name()}")
    print(f"Memory allocated: {_gib(torch.cuda.memory_allocated()):.2f} GB")
    print(f"Memory cached: {_gib(torch.cuda.memory_reserved()):.2f} GB")

# Check if model is on GPU
print(f"Model device: {next(model.parameters()).device}")
print(f"Model dtype: {next(model.parameters()).dtype}")

# Kaggle GPU and environment diagnostics
print("=== Kaggle Environment Check ===")
print(f"Kaggle environment: {is_kaggle}")
print(f"GPU enabled: {torch.cuda.is_available()}")

if torch.cuda.is_available():
    _print_gpu_summary()

# Check model device and memory
print(f"\nModel device: {next(model.parameters()).device}")
print(f"Model dtype: {next(model.parameters()).dtype}")

if torch.cuda.is_available():
    _print_gpu_memory()

# CRITICAL: Check Kaggle GPU Setup
print("🔍 KAGGLE GPU DIAGNOSTICS")
print("=" * 50)

print(f"Running in Kaggle: {is_kaggle}")

# Check CUDA
cuda_available = torch.cuda.is_available()
print(f"CUDA available: {cuda_available}")

if not cuda_available:
    print("❌ GPU NOT DETECTED!")
    print("🔧 SOLUTION: Go to Kaggle Settings → Accelerator → Select 'GPU T4 x2'")
    print("Then restart the notebook!")
else:
    print("✅ GPU detected!")
    _print_gpu_summary()

    # Check current GPU memory
    _print_gpu_memory()

# Check model location. Only the failures that mean "there is no usable model"
# are caught; anything else (including KeyboardInterrupt) propagates.
try:
    model_device = next(model.parameters()).device
except (NameError, AttributeError, StopIteration):
    print("❌ Model not loaded yet")
else:
    print(f"Model device: {model_device}")

    if str(model_device) == 'cpu' and cuda_available:
        print("⚠️  Model is on CPU but GPU is available - will fix this!")
    elif str(model_device).startswith('cuda'):
        print("✅ Model is properly on GPU")

print("=" * 50)

In [None]:
# Force model to GPU if available.
# The model was loaded with device_map="auto", so it is only moved when it is
# still on the CPU; an unconditional .cuda() would override that placement.
if torch.cuda.is_available():
    if next(model.parameters()).device.type == "cuda":
        print("Model already on GPU")
    else:
        model = model.cuda()
        print("Model moved to GPU")
else:
    print("CUDA not available - running on CPU")

In [None]:
# Run the training loop, reporting GPU memory before and after.
import logging


def _report_gpu_memory(stage):
    """Print the allocated GPU memory for the given stage label, if CUDA is available."""
    if not torch.cuda.is_available():
        return
    allocated_gib = torch.cuda.memory_allocated() / 1024**3
    print(f"📊 {stage} GPU memory: {allocated_gib:.2f} GB")


_separator = "=" * 50

print("🚀 Starting training...")
print(_separator)

# Turn on INFO-level logging for the run
logging.basicConfig(level=logging.INFO)

_report_gpu_memory("Pre-training")

# Kick off training and keep the returned statistics
trainer_stats = trainer.train()

print(_separator)
print("✅ Training completed!")
print(f"📈 Final training loss: {trainer_stats.training_loss:.4f}")

_report_gpu_memory("Post-training")

In [None]:
# Save the fine-tuned model, plus the tokenizer so the chat template configured
# earlier in the notebook is stored alongside the weights.
# NOTE(review): the directory name is spelled "finetunned_model"; it is kept
# as-is because anything that loads the model may rely on this exact path.
model.save_pretrained("finetunned_model")
tokenizer.save_pretrained("finetunned_model")