In [1]:
import time
notebook_start = time.time()

In [2]:
# Cell 1: Environment Setup - FIXED
# =================================
import os
import sys
import json
import shutil
import numpy as np
import torch
import transformers
from datasets import Dataset, load_dataset, DatasetDict
from peft import LoraConfig, get_peft_model
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)

# Set environment variables
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["NO_TF"] = "1"  # Prevent TensorFlow import issues

# Install required packages
!pip uninstall -y tensorflow  # Remove to prevent conflicts
!pip install --upgrade pip setuptools wheel
!pip install numpy==1.26.4 scipy==1.11.4
!pip install torch==2.2.1+cpu torchvision==0.17.1+cpu torchaudio==2.2.1+cpu --index-url https://download.pytorch.org/whl/cpu
!pip install transformers==4.41.2 peft==0.10.0 datasets==2.18.0 accelerate==0.29.1
!pip install einops==0.7.0 tokenizers==0.19.1 sentencepiece==0.2.0
!pip install scikit-learn==1.2.2 matplotlib==3.7.2
!pip install langchain==0.1.16 faiss-cpu==1.7.4 tqdm==4.66.2 pandas==2.2.2

# Verify installations
print("\n=== Core Package Versions ===")
print(f"Python: {sys.version}")
print(f"NumPy: {np.__version__}")
print(f"PyTorch: {torch.__version__}")
print(f"Transformers: {transformers.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

  warn("The installed version of bitsandbytes was compiled without GPU support. "


/usr/local/lib/python3.11/dist-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32
Looking in indexes: https://download.pytorch.org/whl/cpu

=== Core Package Versions ===
Python: 3.11.11 (main, Dec  4 2024, 08:55:07) [GCC 11.4.0]
NumPy: 1.26.4
PyTorch: 2.2.1+cpu
Transformers: 4.41.2
CUDA available: False


In [3]:
# Cell 2: Model Loading - FIXED
# ============================
MODEL_NAME = "gpt2"

def print_memory():
    """Memory usage diagnostics"""
    import psutil
    ram = psutil.virtual_memory()
    print(f"RAM: {ram.percent:.1f}% ({ram.used/1024**3:.1f}/{ram.total/1024**3:.1f}GB)")

def load_model(model_name):
    print(f"\n=== Loading Model: {model_name} ===")
    print_memory()
    
    device = "cpu"
    torch_dtype = torch.float32
    
    try:
        print("Attempting standard CPU load...")
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map=None,
            torch_dtype=torch_dtype
        ).to(device)
        print("\n✅ Model loaded successfully on CPU!")
        return model
    except Exception as e:
        print(f"\n❌ Standard load failed: {str(e)}")
        raise RuntimeError("Unable to load model on CPU")

model = load_model(MODEL_NAME)


=== Loading Model: gpt2 ===
RAM: 4.5% (1.0/31.4GB)
Attempting standard CPU load...

✅ Model loaded successfully on CPU!


In [4]:
# Cell 3: Tokenizer Setup - FIXED
# ==============================
def load_tokenizer(model_name):
    try:
        tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            padding_side="right"
        )
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        print("Tokenizer loaded successfully")
        return tokenizer
    except Exception as e:
        print(f"Tokenizer loading failed: {str(e)}")
        raise

tokenizer = load_tokenizer(MODEL_NAME)

Tokenizer loaded successfully


In [5]:
# Cell 4: Dataset Preparation - FIXED
# ==================================
def prepare_dataset(file_path="/kaggle/input/database4", max_samples=1000):
    try:
        # Check if path exists
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"Dataset path not found: {file_path}")
        
        print(f"Loading dataset from: {file_path}")
        
        # Check if it's a directory or file
        if os.path.isdir(file_path):
            # Load all JSON files in directory
            json_files = [f for f in os.listdir(file_path) if f.endswith('.json')]
            if not json_files:
                raise ValueError(f"No JSON files found in directory: {file_path}")
            
            data_files = [os.path.join(file_path, f) for f in json_files]
            print(f"Found {len(json_files)} JSON files")
            dataset = load_dataset('json', data_files=data_files, split=f'train[:{max_samples}]')
        else:
            # Single file
            dataset = load_dataset('json', data_files=file_path, split=f'train[:{max_samples}]')
        
        # Ensure text column exists
        if 'text' not in dataset.column_names:
            # Try to find a text-like column
            text_candidates = [col for col in dataset.column_names 
                              if any(keyword in col.lower() for keyword in ['text', 'content', 'body', 'article'])]
            
            if text_candidates:
                print(f"Renaming column '{text_candidates[0]}' to 'text'")
                dataset = dataset.rename_column(text_candidates[0], 'text')
            else:
                # If no text-like column, concatenate all string columns
                print("Concatenating all string columns to create 'text'")
                string_cols = [col for col in dataset.column_names if dataset.features[col].dtype == 'string']
                
                def combine_columns(examples):
                    return {'text': ' '.join(str(examples[col]) for col in string_cols)}
                
                dataset = dataset.map(combine_columns, batched=True)
        
        print(f"✅ Loaded dataset with {len(dataset)} samples")
        return dataset
    
    except Exception as e:
        print(f"\n❌ Dataset loading failed: {str(e)}")
        print("Creating fallback dataset...")
        return create_fallback_dataset()

def create_fallback_dataset():
    """Create cryptocurrency sample dataset"""
    sample_texts = [
        "Cryptocurrency is a digital asset designed to work as a medium of exchange.",
        "Blockchain technology enables secure peer-to-peer transactions.",
        "Bitcoin was the first decentralized cryptocurrency created in 2009.",
        "Ethereum introduced smart contracts to blockchain technology.",
        "DeFi (Decentralized Finance) aims to recreate traditional financial systems without intermediaries.",
        "NFTs (Non-Fungible Tokens) represent unique digital assets on the blockchain.",
        "Cryptocurrency mining involves validating transactions and adding them to the blockchain.",
        "Stablecoins are cryptocurrencies pegged to stable assets like the US dollar.",
        "Cryptocurrency exchanges allow users to trade digital assets.",
        "Wallet security is crucial for protecting cryptocurrency holdings."
    ]
    return Dataset.from_dict({"text": sample_texts})

def safe_tokenize(examples):
    """Tokenization with error handling"""
    try:
        tokenized = tokenizer(
            examples["text"],
            truncation=True,
            max_length=128,  # Reduced length for efficiency
            padding="max_length",
            return_tensors="pt"
        )
        return {
            "input_ids": tokenized["input_ids"].tolist(),
            "attention_mask": tokenized["attention_mask"].tolist(),
            "labels": tokenized["input_ids"].tolist()
        }
    except Exception:
        return {
            "input_ids": [[0]*128],
            "attention_mask": [[1]*128],
            "labels": [[0]*128]
        }

print("\n=== Starting Data Processing ===")
dataset = prepare_dataset("/kaggle/input/database4")

# Tokenize dataset
tokenized_dataset = dataset.map(safe_tokenize, batched=True, batch_size=4)
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Split dataset
split_datasets = tokenized_dataset.train_test_split(test_size=0.2)
tokenized_dataset = DatasetDict({
    "train": split_datasets["train"],
    "test": split_datasets["test"]
})
print(f"✅ Dataset split: {len(tokenized_dataset['train'])} train, {len(tokenized_dataset['test'])} test")


=== Starting Data Processing ===
Loading dataset from: /kaggle/input/database4

❌ Dataset loading failed: No JSON files found in directory: /kaggle/input/database4
Creating fallback dataset...


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

✅ Dataset split: 8 train, 2 test


In [6]:
# Cell 5: Training Configuration - FIXED
# =====================================
# Enable gradient checkpointing
model.gradient_checkpointing_enable()

# LoRA configuration
peft_config = LoraConfig(
    r=8,  # Reduced from 16 for CPU efficiency
    lora_alpha=16,
    target_modules=["attn.c_attn", "attn.c_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Training arguments optimized for CPU
training_args = TrainingArguments(
    output_dir=f"./{MODEL_NAME}-crypto-expert",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=2,  # Reduced accumulation steps
    num_train_epochs=1,
    learning_rate=1e-4,  # Lower learning rate for CPU
    optim="adamw_torch",
    logging_steps=5,
    # FIXED: Match evaluation and save strategies
    evaluation_strategy="epoch",  # Changed to match save_strategy
    save_strategy="epoch",
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    warmup_ratio=0.1,
    lr_scheduler_type="linear",
    report_to="none",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # FIXED: Use updated parameter name
    use_cpu=True  # Instead of deprecated no_cuda=True
)

# Prepare model
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# Data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

trainable params: 442,368 || all params: 124,882,176 || trainable%: 0.35422829275492446




In [7]:
# Cell 6: Training Execution - FIXED
# =================================
def train_model(model, tokenized_dataset, training_args):
    """Execute the training process"""
    model.config.use_cache = False  # Disable cache for gradient checkpointing
    
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["test"],
        data_collator=data_collator
    )
    
    print("\n=== Starting Training ===")
    trainer.train()
    
    # Save model
    output_dir = training_args.output_dir
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    print(f"\n✅ Model saved to {output_dir}")
    return trainer

trainer = train_model(model, tokenized_dataset, training_args)


=== Starting Training ===


Epoch,Training Loss,Validation Loss
1,No log,3.624588





✅ Model saved to ./gpt2-crypto-expert


In [9]:
# Cell 7: Enhanced Model Saving with Shard Support
# ===============================================

# Add missing import
from typing import Optional

def save_model_artifacts(
    model, 
    tokenizer, 
    training_args: Optional[TrainingArguments] = None, 
    output_dir: str = "/kaggle/working/gpt2-lora-trained"
) -> str:
    """
    Save all model artifacts with comprehensive verification.
    Handles both single-file and sharded model formats.
    """
    # Create output directory
    os.makedirs(output_dir, exist_ok=True)
    print(f"\n💾 Saving model artifacts to: {output_dir}")
    
    # For LoRA models - DON'T merge adapters before saving
    # We want to save the adapter separately
    print("💽 Saving model and adapter...")
    
    # Save the entire model (base model + adapter)
    model.save_pretrained(
        output_dir,
        safe_serialization=True,
        state_dict=model.state_dict()  # Save the complete state including LoRA
    )
    
    # Save tokenizer
    print("🔤 Saving tokenizer...")
    tokenizer.save_pretrained(output_dir)
    
    # Save training arguments if provided
    if training_args is not None:
        print("📝 Saving training arguments...")
        try:
            args_path = os.path.join(output_dir, "training_args.json")
            if hasattr(training_args, 'to_dict'):
                with open(args_path, "w") as f:
                    json.dump(training_args.to_dict(), f, indent=2)
            elif hasattr(training_args, 'to_json_string'):
                with open(args_path, "w") as f:
                    f.write(training_args.to_json_string())
            else:
                print("⚠️ Warning: TrainingArguments has no serialization method")
        except Exception as e:
            print(f"⚠️ Warning: Failed to save training args - {str(e)}")
    
    # Verify the adapter files were saved
    required_files = ['adapter_config.json', 'adapter_model.safetensors']
    missing_files = []
    for file in required_files:
        if not os.path.exists(os.path.join(output_dir, file)):
            missing_files.append(file)
    
    if missing_files:
        print(f"\n⚠️ Warning: Missing adapter files: {missing_files}")
        print("Trying alternative save method...")
        # Explicitly save the adapter
        model.save_pretrained(
            output_dir,
            safe_serialization=True,
            adapter_only=True  # This ensures adapter files are saved
        )
    
    print("\n🔍 Verifying saved files:")
    for file in os.listdir(output_dir):
        size = os.path.getsize(os.path.join(output_dir, file)) / 1024
        print(f"- {file} ({size:.2f} KB)")
    
    return output_dir

In [12]:
# Cell 8: Robust Model Loading and Testing with PEFT support - FIXED
# ========================================================
# Add missing imports
from peft import PeftModel
from transformers import pipeline

def load_and_test_model(
    model_path: str = "/kaggle/working/gpt2-lora-trained", 
    max_length: int = 160,
    test_prompts: Optional[list] = None,
    is_peft_model: bool = True
):
    """
    Load and test a saved model with comprehensive error handling
    """
    print(f"\n🔍 Preparing to load model from: {model_path}")
    
    # Verify model directory exists
    if not os.path.exists(model_path):
        raise ValueError(f"Model directory {model_path} does not exist")
    
    # Show directory contents for debugging
    print("\n📂 Model directory contents:")
    for f in sorted(os.listdir(model_path)):
        size = os.path.getsize(os.path.join(model_path, f)) / 1024
        print(f"- {f} ({size:.2f} KB)")
    
    try:
        print("\n🔄 Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        
        print("\n🔄 Loading model...")
        if is_peft_model:
            # First check if we have adapter files
            adapter_files = [
                f for f in os.listdir(model_path) 
                if f.startswith('adapter_') or f == 'adapter_config.json'
            ]
            
            if not adapter_files:
                print("⚠️ No adapter files found. Loading as regular model.")
                model = AutoModelForCausalLM.from_pretrained(
                    model_path,
                    torch_dtype=torch.float32
                )
            else:
                print(f"Found adapter files: {adapter_files}")
                # Load base model first
                base_model = AutoModelForCausalLM.from_pretrained(
                    "gpt2",  # Load original base model
                    torch_dtype=torch.float32
                )
                
                # Then load the PEFT adapter
                model = PeftModel.from_pretrained(
                    base_model,
                    model_path
                )
                
                # Merge and unload for inference
                model = model.merge_and_unload()
        else:
            # For regular models
            model = AutoModelForCausalLM.from_pretrained(
                model_path,
                torch_dtype=torch.float32
            )
            
        print("\n🎉 Model loaded successfully!")
        model.eval()  # Set to evaluation mode
        
        # Default test prompts if none provided
        if test_prompts is None:
            test_prompts = [
                "What is hardware wallet?? ",
                "What is Proof of Work (PoW)?? ",
                "What is cryptography?? ",
                "What is Peer-to-Peer (P2P)?? ",
                "What is block chain?? ",
                "What is private key?? "
            ]
        
        # Create pipeline
        print("\n🚀 Creating text generation pipeline...")
        pipe = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            device=-1  # Force CPU usage
        )
        
        # Run tests
        print("\n🧪 Running generation tests...")
        for i, prompt in enumerate(test_prompts, 1):
            print(f"\n🔹 Test {i}: {prompt}")
            output = pipe(
                prompt,
                max_length=max_length,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
                num_return_sequences=1,
                repetition_penalty=1.2
            )
            print("💬 Response:", output[0]['generated_text'])
            
        return model, tokenizer
        
    except Exception as e:
        print(f"\n❌ Critical error loading model: {str(e)}")
        print("\n🛠️ Debugging info:")
        print(f"- Path: {os.path.abspath(model_path)}")
        print(f"- Directory exists: {os.path.exists(model_path)}")
        if os.path.exists(model_path):
            print("- Contents:", os.listdir(model_path))
        raise

In [13]:
# Main execution
if __name__ == "__main__":
    model_path = "/kaggle/working/gpt2-lora-trained"
    
    # Save model artifacts
    save_model_artifacts(model, tokenizer, training_args)
    
    # Load with explicit path and PEFT flag
    load_and_test_model(model_path, is_peft_model=True)
    
    # Test with custom prompts
    custom_prompts = [
        "What is software wallet, and what's the difference between hardware and software wallet? ",
        "What is PoW? ",
        "Explain PoW in 1 sentence. ",
        "Describe the key features of PoW using 3 words. ",
        "What is PoM? Is it something related to cryptography? ",
        "What is a cryptographic product? ",
        "What is P2P? ",
        "What is block chain? ",
        "What is public key, and what's the difference between private and public key? "
    ]
    load_and_test_model(model_path, test_prompts=custom_prompts, is_peft_model=True)


💾 Saving model artifacts to: /kaggle/working/gpt2-lora-trained
💽 Saving model and adapter...
🔤 Saving tokenizer...
📝 Saving training arguments...

🔍 Verifying saved files:
- merges.txt (445.62 KB)
- adapter_model.safetensors (1733.91 KB)
- README.md (4.96 KB)
- tokenizer_config.json (0.49 KB)
- training_args.json (3.82 KB)
- adapter_config.json (0.62 KB)
- vocab.json (779.45 KB)
- special_tokens_map.json (0.13 KB)
- tokenizer.json (2058.55 KB)

🔍 Preparing to load model from: /kaggle/working/gpt2-lora-trained

📂 Model directory contents:
- README.md (4.96 KB)
- adapter_config.json (0.62 KB)
- adapter_model.safetensors (1733.91 KB)
- merges.txt (445.62 KB)
- special_tokens_map.json (0.13 KB)
- tokenizer.json (2058.55 KB)
- tokenizer_config.json (0.49 KB)
- training_args.json (3.82 KB)
- vocab.json (779.45 KB)

🔄 Loading tokenizer...

🔄 Loading model...
Found adapter files: ['adapter_model.safetensors', 'adapter_config.json']


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.



🎉 Model loaded successfully!

🚀 Creating text generation pipeline...

🧪 Running generation tests...

🔹 Test 1: What is hardware wallet?? 
💬 Response: What is hardware wallet?? ???


I do not have a physical machine but I can read the instructions on my PC, and it works fine. It also comes with USB 2.0 support which allows me to use any mobile device (with an internet connection) without having your phone in play! However, if you're using Android phones or tablets that don't include SD cards... then please take note of this FAQ!! My only other question: How will i get Bitcoin? You must add all required permissions for both bitcoin address(s), transaction amount/time block header etc.. The answer may be different depending upon what kind crypto currency would work best as opposed just accepting digital payments from BTC addresses so longas they are valid; e-mailing them via their verified email account has been

🔹 Test 2: What is Proof of Work (PoW)?? 
💬 Response: What is Proof of Work 

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.



🎉 Model loaded successfully!

🚀 Creating text generation pipeline...

🧪 Running generation tests...

🔹 Test 1: What is software wallet, and what's the difference between hardware and software wallet? 
💬 Response: What is software wallet, and what's the difference between hardware and software wallet?  It has two parts: a physical card that can store data on it (such as your phone number) or an online account where you may send money to other people. There are no separate accounts in Bitcoin wallets for any of these purposes; they just hold transactions sent from one computer into another at different points over time using addresses found within each transaction record so there isn't much need when trying out new things like bitcoin-like apps which allow users access multiple computers simultaneously while storing all their information locally across devices such Asynchronous Payment System.
In this article we'll look briefly how most developers use Linux Wallet, including its builtin

In [14]:
notebook_end = time.time()
print(f"Total notebook execution time: {notebook_end - notebook_start:.2f} seconds")

Total notebook execution time: 560.63 seconds
