In [1]:
import time
notebook_start = time.time()

In [2]:
# Cell 1: Complete Environment Setup
!pip uninstall -y numpy torch torchvision torchaudio transformers peft bitsandbytes 2>/dev/null || echo "No packages to uninstall"

# Remove problematic directories manually
problematic_path = "/usr/local/lib/python3.11/dist-packages/~vidia-cudnn-cu12"


# Clear pip cache
!pip cache purge

Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Successfully uninstalled numpy-1.26.4
Found existing installation: torch 2.2.1+cu121
Uninstalling torch-2.2.1+cu121:
  Successfully uninstalled torch-2.2.1+cu121
Found existing installation: torchvision 0.17.1+cu121
Uninstalling torchvision-0.17.1+cu121:
  Successfully uninstalled torchvision-0.17.1+cu121
Found existing installation: torchaudio 2.2.1+cu121
Uninstalling torchaudio-2.2.1+cu121:
  Successfully uninstalled torchaudio-2.2.1+cu121
Found existing installation: transformers 4.41.2
Uninstalling transformers-4.41.2:
  Successfully uninstalled transformers-4.41.2
Found existing installation: peft 0.10.0
Uninstalling peft-0.10.0:
  Successfully uninstalled peft-0.10.0
Found existing installation: bitsandbytes 0.43.0
Uninstalling bitsandbytes-0.43.0:
  Successfully uninstalled bitsandbytes-0.43.0
Files removed: 90


In [3]:
# Install NumPy FIRST with clean environment
!pip install -q --ignore-installed numpy==1.26.4

# Install PyTorch with CUDA 12.1 (Kaggle's version)
!pip install -q torch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1 --index-url https://download.pytorch.org/whl/cu121

# Install transformer-related packages with compatible versions
!pip install -q transformers==4.41.2 peft==0.10.0 datasets==2.18.0 accelerate==0.29.1
!pip install -q bitsandbytes==0.43.0 einops==0.7.0

# Handle gymnasium separately to avoid conflicts
!pip install -q gymnasium==0.29.0 --no-deps

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m63.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
accelerate 0.29.1 requires torch>=1.10.0, which is not installed.
easyocr 1.7.2 requires torch, which is not installed.
easyocr 1.7.2 requires torchvision>=0.5, which is not installed.
torchmetrics 1.7.1 requires torch>=2.0.0, which is not installed.
pytorch-lightning 2.5.1.post0 requires torch>=2.1.0, which is not installed.
kaggle-environments 1.16.11 requires transformers>=4.33.1, which is not installed.
stable-baselines3 2.1.0 requires torch>=1.13, which is not installed.
sentence-transformers 3.4.1 requires torch>=1.11.0, which is not installed.

In [4]:
import sys
import os
import json
import numpy as np
import psutil
import torch
import torch.nn as nn
from typing import Optional
from datasets import Dataset, load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
    pipeline,
    GenerationConfig
)
from sentence_transformers import util as semantic_util

  warn("The installed version of bitsandbytes was compiled without GPU support. "


/usr/local/lib/python3.11/dist-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32


2025-05-30 21:30:17.549415: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748640617.586711     212 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748640617.597883     212 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [5]:
# Model Loading
# =====================
MODEL_NAME = "gpt2"  # Change to "meta-llama/Llama-2-7b-chat-hf" for Llama

def print_memory():
    """Memory usage diagnostics for the environment"""
    if torch.cuda.is_available():
        gpu_mem = torch.cuda.memory_allocated() / 1024**3
        print(f"GPU Memory: {gpu_mem:.2f}GB", end=" | ")
    ram = psutil.virtual_memory()
    print(f"RAM: {ram.percent}% ({ram.used/1024**3:.1f}/{ram.total/1024**3:.1f}GB)")

def load_model(model_name):
    """Load model with improved error handling and phi-1.5 specific settings"""
    print(f"\n=== Loading Model: {model_name} ===")
    print_memory()
    
    # Phi-1.5 specific configuration
    trust_remote_code = True  # Required for phi-1.5
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
    
    # Quantization config for memory efficiency
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch_dtype
    )
    
    try:
        print("Attempting quantized load...")
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=bnb_config,
            trust_remote_code=trust_remote_code,
            device_map="auto",
            torch_dtype=torch_dtype
        )
        
        print("\n✅ Model loaded successfully!")
        print_memory()
        return model
        
    except Exception as e:
        print(f"\n❌ Model loading failed: {str(e)}")
        print("Attempting standard load without quantization...")
        try:
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                trust_remote_code=trust_remote_code,
                device_map="auto" if torch.cuda.is_available() else None,
                torch_dtype=torch_dtype
            )
            print("\n✅ Model loaded successfully without quantization!")
            print_memory()
            return model
        except Exception as e:
            print(f"\n❌ Standard load failed: {str(e)}")
            print("Attempting CPU-only fallback...")
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                trust_remote_code=trust_remote_code,
                device_map="cpu",
                torch_dtype=torch.float32
            )
            print("\n✅ Model loaded on CPU")
            print_memory()
            return model

model = load_model(MODEL_NAME)


=== Loading Model: gpt2 ===
RAM: 5.7% (1.3/31.4GB)
Attempting quantized load...

❌ Model loading failed: No GPU found. A GPU is needed for quantization.
Attempting standard load without quantization...

✅ Model loaded successfully without quantization!
RAM: 7.2% (1.8/31.4GB)


In [6]:
# Tokenizer Setup
# =====================
def load_tokenizer(model_name):
    """Load and configure tokenizer"""
    try:
        tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            trust_remote_code=True,
            padding_side="right"
        )
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        print("Tokenizer loaded successfully")
        return tokenizer
    except Exception as e:
        print(f"Tokenizer loading failed: {str(e)}")
        raise

tokenizer = load_tokenizer(MODEL_NAME)

Tokenizer loaded successfully


In [7]:
# Data Preparation
# =====================
def prepare_dataset(file_path="/kaggle/input/database", max_samples=1000):
    """Prepare dataset with robust error handling"""
    try:
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"❌ Path not found: {file_path}")
            
        # Load JSONL file directly using datasets library
        dataset = load_dataset('json', data_files=file_path, split='train[:{}]'.format(max_samples))
        
        # If the dataset has multiple columns, we just want the text
        if 'text' not in dataset.features:
            # Try to find the first text-like column
            text_columns = [col for col in dataset.features if any(t in col.lower() for t in ['text', 'content', 'body'])]
            if text_columns:
                dataset = dataset.rename_column(text_columns[0], 'text')
            else:
                # If no text column found, combine all string columns
                string_columns = [col for col in dataset.features if dataset.features[col].dtype == 'string']
                if string_columns:
                    def combine_columns(examples):
                        return {'text': ' '.join(str(examples[col]) for col in string_columns)}
                    dataset = dataset.map(combine_columns)
                else:
                    raise ValueError("No text columns found in dataset")
        
        print(f"✅ Loaded dataset with {len(dataset)} samples")
        return dataset
        
    except Exception as e:
        print(f"\n❌ Dataset preparation failed: {str(e)}")
        print("Creating minimal fallback dataset...")
        return Dataset.from_dict({"text": ["Sample text " + str(i) for i in range(10)]})

def safe_tokenize(examples):
    """Tokenization with explicit numpy workarounds"""
    try:
        tokenized = tokenizer(
            examples["text"],
            truncation=True,
            max_length=90,
            padding="max_length",
            return_tensors="pt"
        )
        # Convert to lists explicitly
        return {
            "input_ids": tokenized["input_ids"].tolist(),
            "attention_mask": tokenized["attention_mask"].tolist(),
            "labels": tokenized["input_ids"].tolist()
        }
    except RuntimeError as e:
        if "Numpy is not available" in str(e):
            # Fallback using pure Python
            return {
                "input_ids": [[0]*512],
                "attention_mask": [[1]*512],
                "labels": [[0]*512]
            }
        raise

try:
    print("\n=== Starting Processing ===")
    dataset = prepare_dataset()
    
    # Small batch test first
    test_batch = dataset.select(range(2))
    test_tokenized = test_batch.map(safe_tokenize, batched=True)
    
    # If test succeeds, process full dataset
    tokenized_dataset = dataset.map(safe_tokenize, batched=True, batch_size=4)
    tokenized_dataset.set_format(type='torch')
    
    print("✅ Processing completed successfully!")
    
except Exception as e:
    print(f"\n❌ Error: {str(e)}")
    print("Creating minimal fallback dataset...")
    tokenized_dataset = Dataset.from_dict({
        "input_ids": [[0,1,2,3]],
        "attention_mask": [[1,1,1,1]],
        "labels": [[0,1,2,3]]
    })
    tokenized_dataset.set_format(type='torch')


=== Starting Processing ===

❌ Dataset preparation failed: Unable to find '/kaggle/input/database'
Creating minimal fallback dataset...


Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

✅ Processing completed successfully!


In [8]:
# Training Configuration
# =====================
def configure_training(model):
    """Configure training parameters and LoRA setup"""
    # Enable gradient checkpointing to save memory
    model.gradient_checkpointing_enable()

    # LoRA configuration
    peft_config = LoraConfig(
        r=16,  
        lora_alpha=32,
        target_modules=["attn.c_attn", "attn.c_proj", "mlp.c_fc", "mlp.c_proj"],  # GPT-2 compatible modules
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
        fan_in_fan_out=True
    )

    # Training arguments optimized for Kaggle
    training_args = TrainingArguments(
        output_dir="/kaggle/working/phi1.5-lora-results",
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        num_train_epochs=1,  # Reduced for Kaggle
        learning_rate=2e-5,
        optim="adamw_torch",
        logging_steps=10,
        save_steps=500,
        fp16=torch.cuda.is_available(),
        max_grad_norm=0.3,
        warmup_ratio=0.1,
        lr_scheduler_type="cosine",
        report_to="none"
    )

    # Prepare model for training
    model = prepare_model_for_kbit_training(model)
    model = get_peft_model(model, peft_config)

    # Print trainable parameters
    model.print_trainable_parameters()
    
    return model, training_args

In [9]:
# Training Execution
# =====================
def train_model(model, tokenized_dataset, training_args):
    """Execute the training process"""
    # Disable cache if gradient checkpointing is enabled
    if training_args.gradient_checkpointing:
        model.config.use_cache = False
    
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        data_collator=lambda data: {'input_ids': torch.stack([f['input_ids'] for f in data]),
                             'attention_mask': torch.stack([f['attention_mask'] for f in data]),
                             'labels': torch.stack([f['input_ids'] for f in data])}
    )
    
    print("Starting training...")
    print_memory()
    trainer.train()
    print("Training completed!")
    return trainer

def generate_contrastive_examples(example):
    """Generate contrastive examples for training"""
    # Generate negative sample by:
    # 1. Random Q/A from different category
    # 2. GPT-generated incorrect answer
    # 3. Perturbed correct answer
    return {
        'anchor': example['answer'],
        'positive': augment_answer(example['answer']),
        'negative': get_negative_sample(example)
    }

In [10]:
# Model Saving
# =====================
def save_model_artifacts(
    model, 
    tokenizer, 
    training_args: Optional[TrainingArguments] = None, 
    output_dir: str = "/kaggle/working/gpt2-lora-trained"
) -> str:
    """
    Save all model artifacts with comprehensive verification.
    Handles both single-file and sharded model formats.
    """
    # Create output directory
    os.makedirs(output_dir, exist_ok=True)
    print(f"\n💾 Saving model artifacts to: {output_dir}")
    
    # For LoRA models - DON'T merge adapters before saving
    # We want to save the adapter separately
    print("💽 Saving model and adapter...")
    
    # Save the entire model (base model + adapter)
    model.save_pretrained(
        output_dir,
        safe_serialization=True,
        state_dict=model.state_dict()  # Save the complete state including LoRA
    )
    
    # Save tokenizer
    print("🔤 Saving tokenizer...")
    tokenizer.save_pretrained(output_dir)
    
    # Save training arguments if provided
    if training_args is not None:
        print("📝 Saving training arguments...")
        try:
            args_path = os.path.join(output_dir, "training_args.json")
            if hasattr(training_args, 'to_dict'):
                with open(args_path, "w") as f:
                    json.dump(training_args.to_dict(), f, indent=2)
            elif hasattr(training_args, 'to_json_string'):
                with open(args_path, "w") as f:
                    f.write(training_args.to_json_string())
            else:
                print("⚠️ Warning: TrainingArguments has no serialization method")
        except Exception as e:
            print(f"⚠️ Warning: Failed to save training args - {str(e)}")
    
    # Verify the adapter files were saved
    required_files = ['adapter_config.json', 'adapter_model.safetensors']
    missing_files = []
    for file in required_files:
        if not os.path.exists(os.path.join(output_dir, file)):
            missing_files.append(file)
    
    if missing_files:
        print(f"\n⚠️ Warning: Missing adapter files: {missing_files}")
        print("Trying alternative save method...")
        # Explicitly save the adapter
        model.save_pretrained(
            output_dir,
            safe_serialization=True,
            adapter_only=True  # This ensures adapter files are saved
        )
    
    print("\n🔍 Verifying saved files:")
    for file in os.listdir(output_dir):
        size = os.path.getsize(os.path.join(output_dir, file)) / 1024
        print(f"- {file} ({size:.2f} KB)")
    
    return output_dir

In [11]:
# Model Loading and Testing
# =====================
def load_and_test_model(
    model_path: str = "/kaggle/working/gpt2-lora-trained", 
    max_length: int = 250,
    test_prompts: Optional[list] = None,
    is_peft_model: bool = True
):
    """
    Load and test a saved model with comprehensive error handling
    """
    print(f"\n🔍 Preparing to load model from: {model_path}")
    
    # Verify model directory exists
    if not os.path.exists(model_path):
        raise ValueError(f"Model directory {model_path} does not exist")
    
    # Show directory contents for debugging
    print("\n📂 Model directory contents:")
    for f in sorted(os.listdir(model_path)):
        size = os.path.getsize(os.path.join(model_path, f)) / 1024
        print(f"- {f} ({size:.2f} KB)")
    
    try:
        print("\n🔄 Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            local_files_only=True
        )
        
        print("\n🔄 Loading model...")
        if is_peft_model:
            # First check if we have adapter files
            adapter_files = [
                f for f in os.listdir(model_path) 
                if f.startswith('adapter_') or f == 'adapter_config.json'
            ]
            
            if not adapter_files:
                print("⚠️ No adapter files found. Loading as regular model.")
                model = AutoModelForCausalLM.from_pretrained(
                    model_path,
                    device_map="auto",
                    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
                    local_files_only=True
                )
            else:
                print(f"Found adapter files: {adapter_files}")
                # Load base model first
                base_model = AutoModelForCausalLM.from_pretrained(
                    model_path,
                    device_map="auto",
                    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
                    local_files_only=True
                )
                
                # Then load the PEFT adapter
                model = PeftModel.from_pretrained(
                    base_model,
                    model_path,
                    local_files_only=True
                )
                
                # Merge and unload for inference
                model = model.merge_and_unload()
        else:
            # For regular models
            model = AutoModelForCausalLM.from_pretrained(
                model_path,
                device_map="auto",
                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
                local_files_only=True
            )
            
        print("\n🎉 Model loaded successfully!")
        
        # Default test prompts if none provided
        if test_prompts is None:
            test_prompts = [
                "What is hardware wallet?? ",
                "What is Proof of Work (PoW)?? ",
                "What is cryptography?? ",
                "What is Peer-to-Peer (P2P)?? ",
                "What is block chain?? ",
                "What is private key?? "
            ]
        
        # Create pipeline - REMOVED device parameter since we're using device_map="auto"
        print("\n🚀 Creating text generation pipeline...")
        pipe = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
        )
        
        # Run tests
        print("\n🧪 Running generation tests...")
        for i, prompt in enumerate(test_prompts, 1):
            print(f"\n🔹 Test {i}: {prompt}")
            output = pipe(
                prompt,
                max_length=max_length,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
                num_return_sequences=1,
                repetition_penalty=1.2
            )
            print("💬 Response:", output[0]['generated_text'])
            
        return model, tokenizer
        
    except Exception as e:
        print(f"\n❌ Critical error loading model: {str(e)}")
        print("\n🛠️ Debugging info:")
        print(f"- Path: {os.path.abspath(model_path)}")
        print(f"- Directory exists: {os.path.exists(model_path)}")
        if os.path.exists(model_path):
            print("- Contents:", os.listdir(model_path))
        raise

In [12]:
# Enhanced Model Wrapper
# =====================
class ConstrainedCryptoModel(nn.Module):
    def __init__(self, base_model, tokenizer, knowledge_base=None):
        super().__init__()
        self.model = base_model
        self.tokenizer = tokenizer
        self.knowledge_base = knowledge_base
        
        # Technical term dictionary for crypto concepts
        self.technical_terms = {
            'PoW': ['consensus', 'mining', 'difficulty', 'hashrate'],
            'blockchain': ['ledger', 'immutable', 'blocks', 'decentralized'],
            'cryptography': ['encryption', 'keys', 'security', 'algorithm'],
            'wallet': ['private key', 'public key', 'address', 'seed phrase']
        }
        
        # Banned phrases and sequences
        self.banned_sequences = [
            "I don't know", "as an AI", "I'm not sure", 
            "I can't answer", "my training data"
        ]
        
        # Required technical terms
        self.force_included = [
            "blockchain", "cryptography", "decentralized",
            "consensus", "encryption", "keys"
        ]

    def forward(self, input_ids, **kwargs):
        outputs = self.model(input_ids, **kwargs)
        outputs.logits = self.apply_crypto_constraints(outputs.logits, input_ids)
        return outputs
    
    def apply_crypto_constraints(self, logits, input_ids):
        """Apply multi-dimensional constraints for crypto-specific generation"""
        generated_text = self.tokenizer.decode(input_ids[0])
        
        # 1. Force inclusion of key technical terms
        for term in self.force_included:
            if term in generated_text.lower():
                term_ids = self.tokenizer.encode(term, add_special_tokens=False)
                for tid in term_ids:
                    logits[0, -1, tid] += 5.0
        
        # 2. Ban non-technical or uncertain phrases
        for phrase in self.banned_sequences:
            phrase_ids = self.tokenizer.encode(phrase, add_special_tokens=False)
            if len(phrase_ids) > 0:
                logits[0, -1, phrase_ids[0]] = -float('inf')
        
        # 3. Validate technical consistency
        last_term = self.get_last_term(generated_text)
        if last_term in self.technical_terms:
            for term in self.technical_terms[last_term]:
                term_ids = self.tokenizer.encode(term, add_special_tokens=False)
                for tid in term_ids:
                    logits[0, -1, tid] += 3.0
        
        return logits
    
    def get_last_term(self, text):
        """Extract the main technical term from the query"""
        question_words = ["what", "how", "explain", "describe"]
        words = [w for w in text.lower().split() if w not in question_words]
        return words[-1].strip('?') if words else ""

In [13]:
# Enhanced Generation
# =====================
CRYPTO_GENERATION_CONFIG = GenerationConfig(
    max_new_tokens=150,
    no_repeat_ngram_size=4,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    top_k=40,
    repetition_penalty=1.15,
    num_beams=3,
    early_stopping=True
)

def generate_with_validation(model, tokenizer, prompt, max_length=200):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    
    # First pass generation
    outputs = model.generate(
        **inputs,
        max_length=max_length,
        generation_config=CRYPTO_GENERATION_CONFIG
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Validation checks
    validation_passed = True
    validation_notes = []
    
    # 1. Technical term check
    last_term = model.get_last_term(prompt)
    if last_term in model.technical_terms:
        missing = [t for t in model.technical_terms[last_term] 
                  if t.lower() not in response.lower()]
        if missing:
            validation_passed = False
            validation_notes.append(f"Missing technical terms: {missing}")
    
    # 2. Hallucination check
    if any(phrase in response for phrase in model.banned_sequences):
        validation_passed = False
        validation_notes.append("Potential hallucination")
    
    # Generate final output
    if not validation_passed:
        print(f"⚠️ Validation issues: {validation_notes}")
        outputs = model.generate(
            **inputs,
            max_length=max_length,
            generation_config=CRYPTO_GENERATION_CONFIG,
            bad_words_ids=[[tid] for tid in tokenizer.encode(" ".join(model.banned_sequences), add_special_tokens=False)]
        )
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    return {
        'response': response,
        'validation_passed': validation_passed,
        'validation_notes': validation_notes
    }

In [15]:
# =====================
# Main Training Flow (Fixed Version)
# =====================
if __name__ == "__main__":
    # Initialize with memory check
    print("=== Initializing Training Process ===")
    print_memory()
    
    # 1. Load model and tokenizer with error handling
    try:
        print("\n=== Loading Model and Tokenizer ===")
        model = load_model(MODEL_NAME)
        tokenizer = load_tokenizer(MODEL_NAME)
        print("✅ Model and tokenizer loaded successfully")
    except Exception as e:
        print(f"❌ Failed to load model/tokenizer: {str(e)}")
        raise

    # 2. Dataset preparation with robust error handling
    try:
        print("\n=== Preparing Dataset ===")
        dataset = prepare_dataset()
        
        # Small validation batch first
        print("Running validation on small batch...")
        test_batch = dataset.select(range(min(2, len(dataset))))
        test_tokenized = test_batch.map(safe_tokenize, batched=True)
        
        # Full dataset processing
        print("Processing full dataset...")
        tokenized_dataset = dataset.map(
            safe_tokenize,
            batched=True,
            batch_size=4,
            remove_columns=dataset.column_names
        )
        tokenized_dataset.set_format(type='torch')
        print(f"✅ Dataset prepared with {len(tokenized_dataset)} samples")
        print_memory()
        
    except Exception as e:
        print(f"\n❌ Dataset processing failed: {str(e)}")
        print("Creating minimal fallback dataset...")
        tokenized_dataset = Dataset.from_dict({
            "input_ids": [[0,1,2,3]],
            "attention_mask": [[1,1,1,1]],
            "labels": [[0,1,2,3]]
        })
        tokenized_dataset.set_format(type='torch')

    # 3. Training configuration
    try:
        print("\n=== Configuring Training ===")
        model, training_args = configure_training(model)
        print("✅ Training configured with parameters:")
        print(f"- Batch size: {training_args.per_device_train_batch_size}")
        print(f"- Gradient accumulation: {training_args.gradient_accumulation_steps}")
        print(f"- Epochs: {training_args.num_train_epochs}")
        print(f"- Learning rate: {training_args.learning_rate}")
        print_memory()
    except Exception as e:
        print(f"❌ Training configuration failed: {str(e)}")
        raise

    # 4. Training execution
    try:
        print("\n=== Starting Training ===")
        trainer = train_model(model, tokenized_dataset, training_args)
        print("✅ Training completed successfully!")
        print_memory()
    except Exception as e:
        print(f"❌ Training failed: {str(e)}")
        raise

    # 5. Model saving
    try:
        print("\n=== Saving Model Artifacts ===")
        model_path = save_model_artifacts(model, tokenizer, training_args)
        print(f"✅ Model saved to: {model_path}")
    except Exception as e:
        print(f"❌ Model saving failed: {str(e)}")
        raise

    # 6. Model testing and validation
    try:
        print("\n=== Testing Model ===")
        # Test prompts covering key crypto concepts
        custom_prompts = [
            "What is software wallet, and what's the difference between hardware and software wallet?",
            "Explain Proof of Work (PoW) in simple terms",
            "Describe blockchain technology in 2-3 sentences",
            "What is the cryptographic purpose of public/private key pairs?",
            "How does peer-to-peer (P2P) networking relate to cryptocurrency?"
        ]
        
        # Load and test the saved model
        loaded_model, loaded_tokenizer = load_and_test_model(
            model_path, 
            test_prompts=custom_prompts, 
            is_peft_model=True
        )
        
        # Fix: Ensure model has device attribute before validation
        if not hasattr(loaded_model, 'device'):
            loaded_model.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        
        # Generate and validate responses
        print("\n=== Generating Validated Responses ===")
        for i, query in enumerate(custom_prompts, 1):
            print(f"\n🔹 Test {i}: {query}")
            inputs = loaded_tokenizer(query, return_tensors="pt").to(loaded_model.device)
            outputs = loaded_model.generate(**inputs, max_length=200)
            response = loaded_tokenizer.decode(outputs[0], skip_special_tokens=True)
            
            # Comprehensive prompt quality assessment
            print("\n=== Prompt Quality Assessment ===")
            print(f"1. Clarity: {'✅' if len(query.split()) <= 15 else '⚠️'} (Length: {len(query.split())} words)")
            print(f"2. Specificity: {'✅' if '?' in query else '⚠️'} (Contains explicit question)")
            print(f"3. Technical Focus: {'✅' if any(term.lower() in query.lower() for term in ['wallet', 'PoW', 'blockchain', 'crypt', 'P2P']) else '⚠️'}")
            print(f"4. Avoids Ambiguity: {'✅' if not any(word in query for word in ['it', 'this', 'that']) else '⚠️'}")
            print(f"5. Answer Format Hint: {'✅' if any(phrase in query for phrase in ['explain', 'describe', 'what is']) else '⚠️'}")
            
            print("\n💬 Generated Response:")
            print(response)
            
            # Technical validation
            validation_passed = True
            validation_notes = []
            
            # Check for technical terms
            required_terms = {
                'wallet': ['private key', 'public key', 'address', 'security'],
                'PoW': ['consensus', 'mining', 'difficulty', 'hash'],
                'blockchain': ['ledger', 'immutable', 'blocks', 'decentralized'],
                'key': ['encryption', 'signature', 'asymmetric', 'cryptography'],
                'P2P': ['network', 'nodes', 'decentralized', 'direct']
            }
            
            for term, keywords in required_terms.items():
                if term.lower() in query.lower():
                    missing = [kw for kw in keywords if kw.lower() not in response.lower()]
                    if missing:
                        validation_passed = False
                        validation_notes.append(f"Missing technical terms for {term}: {missing}")
            
            # Check for hallucinations
            banned_phrases = ["I don't know", "as an AI", "I'm not sure", "I can't answer"]
            if any(phrase in response for phrase in banned_phrases):
                validation_passed = False
                validation_notes.append("Contains uncertain/hallucinated content")
            
            print(f"\n✅ Validation: {'PASSED' if validation_passed else 'FAILED'}")
            if not validation_passed:
                print(f"⚠️ Issues: {', '.join(validation_notes)}")
        
        print("\n=== Training and Evaluation Complete ===")
        
    except Exception as e:
        print(f"❌ Model testing failed: {str(e)}")
        raise

=== Initializing Training Process ===
RAM: 9.4% (2.5/31.4GB)

=== Loading Model and Tokenizer ===

=== Loading Model: gpt2 ===
RAM: 9.4% (2.5/31.4GB)
Attempting quantized load...

❌ Model loading failed: No GPU found. A GPU is needed for quantization.
Attempting standard load without quantization...

✅ Model loaded successfully without quantization!
RAM: 10.8% (2.9/31.4GB)
Tokenizer loaded successfully
✅ Model and tokenizer loaded successfully

=== Preparing Dataset ===

❌ Dataset preparation failed: Unable to find '/kaggle/input/database'
Creating minimal fallback dataset...
Running validation on small batch...


Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Processing full dataset...


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

✅ Dataset prepared with 10 samples
RAM: 10.9% (3.0/31.4GB)

=== Configuring Training ===
trainable params: 2,359,296 || all params: 126,799,104 || trainable%: 1.8606566809809635
✅ Training configured with parameters:
- Batch size: 1
- Gradient accumulation: 4
- Epochs: 1
- Learning rate: 2e-05
RAM: 10.9% (3.0/31.4GB)

=== Starting Training ===
Starting training...
RAM: 10.9% (3.0/31.4GB)


Step,Training Loss


Training completed!
✅ Training completed successfully!
RAM: 10.6% (2.9/31.4GB)

=== Saving Model Artifacts ===

💾 Saving model artifacts to: /kaggle/working/gpt2-lora-trained
💽 Saving model and adapter...
🔤 Saving tokenizer...
📝 Saving training arguments...

🔍 Verifying saved files:
- README.md (4.96 KB)
- merges.txt (445.62 KB)
- training_args.json (3.86 KB)
- adapter_config.json (0.66 KB)
- tokenizer_config.json (0.49 KB)
- tokenizer.json (2058.55 KB)
- vocab.json (779.45 KB)
- adapter_model.safetensors (9227.88 KB)
- special_tokens_map.json (0.13 KB)
✅ Model saved to: /kaggle/working/gpt2-lora-trained

=== Testing Model ===

🔍 Preparing to load model from: /kaggle/working/gpt2-lora-trained

📂 Model directory contents:
- README.md (4.96 KB)
- adapter_config.json (0.66 KB)
- adapter_model.safetensors (9227.88 KB)
- merges.txt (445.62 KB)
- special_tokens_map.json (0.13 KB)
- tokenizer.json (2058.55 KB)
- tokenizer_config.json (0.49 KB)
- training_args.json (3.86 KB)
- vocab.json (779.

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.



🎉 Model loaded successfully!

🚀 Creating text generation pipeline...

🧪 Running generation tests...

🔹 Test 1: What is software wallet, and what's the difference between hardware and software wallet?
💬 Response: What is software wallet, and what's the difference between hardware and software wallet?
Well there are two different types of wallets: a physical Bitcoin (which has no electronic signature) or an encrypted digital one. The latter allows users to have secure connections with their funds without any risk whatsoever; however it also offers many advantages over both paper-based payment systems such as PayPal which rely on signing documents instead like your bank account numbers etc… So you can use either type for cash transfers that don't require multiple signatures in order not need authentication by third parties... This will allow those who want to transfer money from banks using traditional currencies but still retain security when they spend online rather than having them si

In [16]:
notebook_end = time.time()
print(f"Total notebook execution time: {notebook_end - notebook_start:.2f} seconds")

Total notebook execution time: 520.17 seconds
