In [1]:
import time
notebook_start = time.time()

In [2]:
# Cell 1: Environment Setup - FIXED
# =================================
import os
import sys
import json
import shutil
import numpy as np
import torch
import transformers
from datasets import Dataset, load_dataset, DatasetDict
from peft import LoraConfig, get_peft_model
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)

# Set environment variables
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["NO_TF"] = "1"  # Prevent TensorFlow import issues

# Install required packages
!pip uninstall -y tensorflow  # Remove to prevent conflicts
!pip install --upgrade pip setuptools wheel
!pip install numpy==1.26.4 scipy==1.11.4
!pip install torch==2.2.1+cpu torchvision==0.17.1+cpu torchaudio==2.2.1+cpu --index-url https://download.pytorch.org/whl/cpu
!pip install transformers==4.41.2 peft==0.10.0 datasets==2.18.0 accelerate==0.29.1
!pip install einops==0.7.0 tokenizers==0.19.1 sentencepiece==0.2.0
!pip install scikit-learn==1.2.2 matplotlib==3.7.2
!pip install langchain==0.1.16 faiss-cpu==1.7.4 tqdm==4.66.2 pandas==2.2.2

# Verify installations
print("\n=== Core Package Versions ===")
print(f"Python: {sys.version}")
print(f"NumPy: {np.__version__}")
print(f"PyTorch: {torch.__version__}")
print(f"Transformers: {transformers.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

Looking in indexes: https://download.pytorch.org/whl/cpu

=== Core Package Versions ===
Python: 3.11.11 (main, Dec  4 2024, 08:55:07) [GCC 11.4.0]
NumPy: 1.26.4
PyTorch: 2.2.1+cpu
Transformers: 4.41.2
CUDA available: False


In [3]:
# Cell 2: Model Loading - FIXED
# ============================
MODEL_NAME = "gpt2"

def print_memory():
    """Memory usage diagnostics"""
    import psutil
    ram = psutil.virtual_memory()
    print(f"RAM: {ram.percent:.1f}% ({ram.used/1024**3:.1f}/{ram.total/1024**3:.1f}GB)")

def load_model(model_name):
    print(f"\n=== Loading Model: {model_name} ===")
    print_memory()
    
    device = "cpu"
    torch_dtype = torch.float32
    
    try:
        print("Attempting standard CPU load...")
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map=None,
            torch_dtype=torch_dtype
        ).to(device)
        print("\n✅ Model loaded successfully on CPU!")
        return model
    except Exception as e:
        print(f"\n❌ Standard load failed: {str(e)}")
        raise RuntimeError("Unable to load model on CPU")

model = load_model(MODEL_NAME)


=== Loading Model: gpt2 ===
RAM: 4.7% (1.0/31.4GB)
Attempting standard CPU load...

✅ Model loaded successfully on CPU!


In [4]:
# Cell 3: Tokenizer Setup - FIXED
# ==============================
def load_tokenizer(model_name):
    try:
        tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            padding_side="right"
        )
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        print("Tokenizer loaded successfully")
        return tokenizer
    except Exception as e:
        print(f"Tokenizer loading failed: {str(e)}")
        raise

tokenizer = load_tokenizer(MODEL_NAME)

Tokenizer loaded successfully


In [10]:
# Cell 4: Dataset Preparation - EXECUTE IMMEDIATELY
# ==================================
from pathlib import Path
from datasets import load_dataset, Dataset, DatasetDict
import json
import re
import os
from transformers import AutoTokenizer

def extract_qna_from_notebook(notebook_path):
    """Extract Q&A pairs from Jupyter notebook cells"""
    try:
        with open(notebook_path, 'r', encoding='utf-8') as f:
            notebook = json.load(f)
        
        qna_pairs = []
        qna_patterns = [
            r"(?:^|\n)(Q|Question)[:：]?\s*(.+?)\s*(A|Answer)[:：]?\s*(.+?)(?=\n\s*(?:Q|Question|$))",
            r"##\s*(.+?)\s*\n([\s\S]+?)\n(?:##|\Z)",
            r"\"\"\"\s*Q:(.+?)\nA:(.+?)\"\"\"",
            r"<qa>\n<q>(.+?)</q>\n<a>(.+?)</a>\n</qa>"
        ]
        
        for cell in notebook['cells']:
            if cell['cell_type'] == 'markdown':
                text = ''.join(cell['source'])
                
                for pattern in qna_patterns:
                    matches = re.findall(pattern, text, re.DOTALL | re.IGNORECASE)
                    if matches:
                        for match in matches:
                            if len(match) == 4:
                                question = match[1].strip()
                                answer = match[3].strip()
                            elif len(match) == 2:
                                question = match[0].strip()
                                answer = match[1].strip()
                            else:
                                continue
                                
                            if question and answer:
                                qna_pairs.append(f"Q: {question}\nA: {answer}")
        
        return qna_pairs
    
    except Exception as e:
        print(f"Error processing notebook: {str(e)}")
        return []

def create_fallback_dataset():
    """Create fallback dataset using reliable public datasets"""
    try:
        return load_dataset("cryptodatadog/crypto-qa-fixed", split='train[:100]')
    except Exception as e:
        print(f"⚠️ Primary fallback failed: {str(e)}")
        try:
            dataset = load_dataset("wiki_qa", split='train[:200]')
            crypto_keywords = ['bitcoin', 'blockchain', 'crypto', 'ethereum', 'wallet', 'mining']
            filtered = dataset.filter(
                lambda x: any(kw in x['question'].lower() for kw in crypto_keywords)
            return filtered.select(range(min(100, len(filtered))))
        except Exception as e2:
            print(f"⚠️ Secondary fallback failed: {str(e2)}")
            manual_qa = [
                "Q: What is Bitcoin? A: Bitcoin is a decentralized digital currency.",
                "Q: What is Ethereum? A: Ethereum is a blockchain platform with smart contract functionality.",
                "Q: What is a blockchain? A: A distributed ledger technology recording transactions across networks.",
                "Q: What is a crypto wallet? A: Software/hardware storing private keys to manage cryptocurrencies.",
                "Q: What is mining in cryptocurrency? A: The process of validating transactions and creating new coins.",
                "Q: What is DeFi? A: Decentralized finance using blockchain without traditional intermediaries.",
                "Q: What is an NFT? A: Non-fungible token representing unique digital ownership.",
                "Q: What is a smart contract? A: Self-executing contracts with terms directly written into code.",
                "Q: What is proof-of-stake? A: Consensus mechanism where validators stake crypto to secure network.",
                "Q: What is a DAO? A: Decentralized Autonomous Organization governed by smart contracts."
            ]
            return Dataset.from_dict({"text": manual_qa})

def prepare_dataset(file_path="/kaggle/input/database4", max_samples=1000):
    try:
        print(f"\n🔍 Searching for dataset at: {file_path}")
        data_path = Path(file_path)
        
        if not data_path.exists():
            raise FileNotFoundError(f"Dataset path not found: {data_path}")
        
        # 1. Check for specific notebook file
        specific_notebook = data_path / "database-0604.ipynb"
        if specific_notebook.exists():
            print(f"✅ Found specific notebook file: {specific_notebook}")
            qna_pairs = extract_qna_from_notebook(specific_notebook)
            
            if qna_pairs:
                print(f"Extracted {len(qna_pairs)} Q&A pairs from notebook")
                return Dataset.from_dict({"text": qna_pairs[:max_samples]})
            else:
                print("No Q&A pairs found in notebook, checking other sources")
        
        # 2. Look for other notebook files
        notebook_files = list(data_path.rglob('*.ipynb'))
        if notebook_files:
            print(f"Found {len(notebook_files)} notebook files")
            for notebook_path in notebook_files:
                if notebook_path == specific_notebook:
                    continue
                print(f"Processing notebook: {notebook_path.name}")
                qna_pairs = extract_qna_from_notebook(notebook_path)
                if qna_pairs:
                    print(f"Extracted {len(qna_pairs)} Q&A pairs from {notebook_path.name}")
                    return Dataset.from_dict({"text": qna_pairs[:max_samples]})
        
        # 3. Try loading Q&A pairs from JSON
        qna_files = list(data_path.rglob("*qna*.json")) + list(data_path.rglob("*qa*.json"))
        if qna_files:
            print(f"Found {len(qna_files)} Q&A files")
            dataset = load_dataset('json', data_files=[str(f) for f in qna_files], 
                                  split=f'train[:{max_samples}]')
            return dataset
        
        # 4. Fallback to raw text extraction
        text_files = list(data_path.rglob('*.txt')) + list(data_path.rglob('*.md'))
        if text_files:
            print(f"Found {len(text_files)} text files")
            texts = []
            for file in text_files[:10]:
                try:
                    with open(file, 'r', encoding='utf-8') as f:
                        texts.append(f.read())
                except Exception as e:
                    print(f"Skipped file {file.name}: {str(e)}")
            return Dataset.from_dict({"text": texts[:max_samples]})
        
        raise ValueError("No usable data files found")
    
    except Exception as e:
        print(f"\n❌ Dataset loading failed: {str(e)}")
        print("Creating fallback dataset...")
        return create_fallback_dataset()

def safe_tokenize(examples, tokenizer):
    """Tokenization with error handling"""
    try:
        tokenized = tokenizer(
            examples["text"],
            truncation=True,
            max_length=256,
            padding="max_length",
            return_tensors="pt"
        )
        return {
            "input_ids": tokenized["input_ids"].tolist(),
            "attention_mask": tokenized["attention_mask"].tolist(),
            "labels": tokenized["input_ids"].tolist()
        }
    except Exception as e:
        print(f"Tokenization error: {str(e)}")
        return {
            "input_ids": [[0]*256],
            "attention_mask": [[1]*256],
            "labels": [[0]*256]
        }

# --- IMMEDIATE EXECUTION STARTS HERE ---
print("\n=== Starting Data Processing ===")
dataset = prepare_dataset("/kaggle/input/database4")

# Show sample data
print("\nSample data:")
for i in range(min(3, len(dataset))):
    print(f"\nSample {i+1}:")
    print(dataset[i]['text'][:200] + "...")

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Tokenize dataset
tokenized_dataset = dataset.map(
    lambda x: safe_tokenize(x, tokenizer), 
    batched=True, 
    batch_size=4
)
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Split dataset
split_datasets = tokenized_dataset.train_test_split(test_size=0.2)
tokenized_dataset = DatasetDict({
    "train": split_datasets["train"],
    "test": split_datasets["test"]
})
print(f"✅ Dataset split: {len(tokenized_dataset['train'])} train, {len(tokenized_dataset['test'])} test")

SyntaxError: '(' was never closed (1530929372.py, line 59)

In [None]:
# Cell 5: Training Configuration - OPTIMIZED FOR SMALL DATA
# =========================================================
model.gradient_checkpointing_enable()

peft_config = LoraConfig(
    r=32,
    lora_alpha=64,
    target_modules=[
        "Wqkv", "out_proj", "fc1", "fc2"  # Phi-2 specific modules
    ],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Cosine scheduler with proper warmup
training_args = TrainingArguments(
    output_dir=f"./phi-2-crypto-expert",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    num_train_epochs=5,
    learning_rate=2e-5,
    optim="adamw_torch",
    logging_steps=10,
    eval_strategy="epoch",
    eval_steps = 50,
    save_strategy="epoch",
    fp16=torch.cuda.is_available(),  # Auto-enable FP16 if GPU available
    bf16=False,
    max_grad_norm=0.3,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    report_to="none",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    weight_decay=0.001,
    use_cpu=True
)

# Prepare model
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# Data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

In [11]:
from datasets import Dataset, concatenate_datasets  # Add missing import

# Augment small datasets
if len(dataset) < 500:
    print("Applying dataset augmentation...")
    
    # 1. Paraphrase existing samples
    def paraphrase(text):
        """Simple in-place augmentation"""
        replacements = [
            ("cryptocurrency", "crypto"),
            ("blockchain", "distributed ledger"),
            ("wallet", "digital wallet"),
            ("transaction", "tx"),
            ("decentralized", "distributed"),
            ("mining", "validation process"),
            ("exchange", "trading platform"),
            ("token", "digital asset"),
            ("coin", "digital currency"),
            ("key", "cryptographic key"),
            ("proof", "consensus mechanism")
        ]
        for a, b in replacements:
            text = text.replace(a, b)
        return text
    
    # 2. Add paraphrased versions
    original_texts = dataset['text']
    new_texts = [paraphrase(t) for t in original_texts]
    
    # Create augmented dataset
    augmented = Dataset.from_dict({"text": new_texts})
    dataset = concatenate_datasets([dataset, augmented])
    
    # 3. Add Q&A formatting
    def format_qa(text):
        keywords = ["crypto", "blockchain", "bitcoin", "ethereum", 
                    "wallet", "defi", "nft", "key", "proof", "transaction"]
        words = text.split()
        topic = next((word for word in words if word.lower() in keywords), words[0])
        return f"Question: Explain {topic} in cryptocurrency?\nAnswer: {text}"
    
    # Apply formatting
    dataset = dataset.map(lambda x: {"text": format_qa(x['text'])}, 
                         batched=False,
                         load_from_cache_file=False)
    
    # 4. Shuffle dataset
    dataset = dataset.shuffle(seed=42)
    
    print(f"Augmented dataset size: {len(dataset)}")
    print("✅ Applied paraphrasing, Q&A formatting, and shuffling")

Applying dataset augmentation...


Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Augmented dataset size: 20
✅ Applied paraphrasing, Q&A formatting, and shuffling


In [12]:
# Cell 6: Training Execution - FIXED
# =================================
def train_model(model, tokenized_dataset, training_args):
    """Execute the training process"""
    model.config.use_cache = False  # Disable cache for gradient checkpointing
    
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["test"],
        data_collator=data_collator
    )
    
    print("\n=== Starting Training ===")
    trainer.train()
    
    # Save model
    output_dir = training_args.output_dir
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    print(f"\n✅ Model saved to {output_dir}")
    return trainer

trainer = train_model(model, tokenized_dataset, training_args)

NameError: name 'data_collator' is not defined

In [None]:
# Cell 7: Enhanced Model Saving with Shard Support
# ===============================================

# Add missing import
from typing import Optional

def save_model_artifacts(
    model, 
    tokenizer, 
    training_args: Optional[TrainingArguments] = None, 
    output_dir: str = "/kaggle/working/gpt2-lora-trained"
) -> str:
    """
    Save all model artifacts with comprehensive verification.
    Handles both single-file and sharded model formats.
    """
    # Create output directory
    os.makedirs(output_dir, exist_ok=True)
    print(f"\n💾 Saving model artifacts to: {output_dir}")
    
    # For LoRA models - DON'T merge adapters before saving
    # We want to save the adapter separately
    print("💽 Saving model and adapter...")
    
    # Save the entire model (base model + adapter)
    model.save_pretrained(
        output_dir,
        safe_serialization=True,
        state_dict=model.state_dict()  # Save the complete state including LoRA
    )
    
    # Save tokenizer
    print("🔤 Saving tokenizer...")
    tokenizer.save_pretrained(output_dir)
    
    # Save training arguments if provided
    if training_args is not None:
        print("📝 Saving training arguments...")
        try:
            args_path = os.path.join(output_dir, "training_args.json")
            if hasattr(training_args, 'to_dict'):
                with open(args_path, "w") as f:
                    json.dump(training_args.to_dict(), f, indent=2)
            elif hasattr(training_args, 'to_json_string'):
                with open(args_path, "w") as f:
                    f.write(training_args.to_json_string())
            else:
                print("⚠️ Warning: TrainingArguments has no serialization method")
        except Exception as e:
            print(f"⚠️ Warning: Failed to save training args - {str(e)}")
    
    # Verify the adapter files were saved
    required_files = ['adapter_config.json', 'adapter_model.safetensors']
    missing_files = []
    for file in required_files:
        if not os.path.exists(os.path.join(output_dir, file)):
            missing_files.append(file)
    
    if missing_files:
        print(f"\n⚠️ Warning: Missing adapter files: {missing_files}")
        print("Trying alternative save method...")
        # Explicitly save the adapter
        model.save_pretrained(
            output_dir,
            safe_serialization=True,
            adapter_only=True  # This ensures adapter files are saved
        )
    
    print("\n🔍 Verifying saved files:")
    for file in os.listdir(output_dir):
        size = os.path.getsize(os.path.join(output_dir, file)) / 1024
        print(f"- {file} ({size:.2f} KB)")
    
    return output_dir

In [None]:
# Cell 8: Robust Model Loading and Testing with PEFT support - FIXED
# ========================================================
# Add missing imports
from peft import PeftModel
from transformers import pipeline

def load_and_test_model(
    model_path: str = "/kaggle/working/gpt2-lora-trained", 
    max_length: int = 160,
    test_prompts: Optional[list] = None,
    is_peft_model: bool = True
):
    """
    Load and test a saved model with comprehensive error handling
    """
    print(f"\n🔍 Preparing to load model from: {model_path}")
    
    # Verify model directory exists
    if not os.path.exists(model_path):
        raise ValueError(f"Model directory {model_path} does not exist")
    
    # Show directory contents for debugging
    print("\n📂 Model directory contents:")
    for f in sorted(os.listdir(model_path)):
        size = os.path.getsize(os.path.join(model_path, f)) / 1024
        print(f"- {f} ({size:.2f} KB)")
    
    try:
        print("\n🔄 Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        
        print("\n🔄 Loading model...")
        if is_peft_model:
            # First check if we have adapter files
            adapter_files = [
                f for f in os.listdir(model_path) 
                if f.startswith('adapter_') or f == 'adapter_config.json'
            ]
            
            if not adapter_files:
                print("⚠️ No adapter files found. Loading as regular model.")
                model = AutoModelForCausalLM.from_pretrained(
                    model_path,
                    torch_dtype=torch.float32
                )
            else:
                print(f"Found adapter files: {adapter_files}")
                # Load base model first
                base_model = AutoModelForCausalLM.from_pretrained(
                    "meta-llama/Llama-2-7b-chat-hf",  # Load original base model
                    torch_dtype=torch.float32
                )
                
                # Then load the PEFT adapter
                model = PeftModel.from_pretrained(
                    base_model,
                    model_path
                )
                
                # Merge and unload for inference
                model = model.merge_and_unload()
        else:
            # For regular models
            model = AutoModelForCausalLM.from_pretrained(
                model_path,
                torch_dtype=torch.float32
            )
            
        print("\n🎉 Model loaded successfully!")
        model.eval()  # Set to evaluation mode
        
        # Default test prompts if none provided
        if test_prompts is None:
            test_prompts = [
                "What is hardware wallet?? ",
                "What is Proof of Work (PoW)?? ",
                "What is cryptography?? ",
                "What is Peer-to-Peer (P2P)?? ",
                "What is block chain?? ",
                "What is private key?? "
            ]
        
        # Create pipeline
        print("\n🚀 Creating text generation pipeline...")
        pipe = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            device=-1  # Force CPU usage
        )
        
        # Run tests
        print("\n🧪 Running generation tests...")
        for i, prompt in enumerate(test_prompts, 1):
            print(f"\n🔹 Test {i}: {prompt}")
            output = pipe(
                prompt,
                max_length=max_length,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
                num_return_sequences=1,
                repetition_penalty=1.2
            )
            print("💬 Response:", output[0]['generated_text'])
            
        return model, tokenizer
        
    except Exception as e:
        print(f"\n❌ Critical error loading model: {str(e)}")
        print("\n🛠️ Debugging info:")
        print(f"- Path: {os.path.abspath(model_path)}")
        print(f"- Directory exists: {os.path.exists(model_path)}")
        if os.path.exists(model_path):
            print("- Contents:", os.listdir(model_path))
        raise

In [None]:
# Main execution
if __name__ == "__main__":
    model_path = "/kaggle/working/gpt2-lora-trained"
    
    # Save model artifacts
    save_model_artifacts(model, tokenizer, training_args)
    
    # Load with explicit path and PEFT flag
    load_and_test_model(model_path, is_peft_model=True)
    
    # Test with custom prompts
    custom_prompts = [
        "What is software wallet, and what's the difference between hardware and software wallet? ",
        "What is PoW? ",
        "Explain PoW in 1 sentence. ",
        "Describe the key features of PoW using 3 words. ",
        "What is PoM? Is it something related to cryptography? ",
        "What is a cryptographic product? ",
        "What is P2P? ",
        "What is block chain? ",
        "What is public key, and what's the difference between private and public key? "
    ]
    load_and_test_model(model_path, test_prompts=custom_prompts, is_peft_model=True)

In [None]:
notebook_end = time.time()
print(f"Total notebook execution time: {notebook_end - notebook_start:.2f} seconds")