In [3]:
import time
notebook_start = time.time()

In [4]:
# Kaggle Environment Setup
import os
import sys
import torch
import psutil
import numpy as np
import json
import re
import shutil
import time
from collections import defaultdict
from typing import Dict, List, Tuple, Any, Optional
from datasets import Dataset, load_dataset
from transformers import (
    AutoModelForCausalLM, 
    AutoTokenizer, 
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    GenerationConfig,
    pipeline
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel
from IPython.display import clear_output
import matplotlib.pyplot as plt

print("=== Initializing Environment ===")

# Verify GPU availability
if torch.cuda.is_available():
    print(f"GPU detected: {torch.cuda.get_device_name(0)}")
    print(f"CUDA version: {torch.version.cuda}")
else:
    print("No GPU detected - running in CPU mode")

# Memory diagnostics
def print_memory():
    if torch.cuda.is_available():
        gpu_mem = torch.cuda.memory_allocated() / 1024**3
        print(f"GPU Memory: {gpu_mem:.2f}GB", end=" | ")
    ram = psutil.virtual_memory()
    print(f"RAM: {ram.percent}% ({ram.used/1024**3:.1f}/{ram.total/1024**3:.1f}GB)")

print("\nInitial system status:")
print_memory()

=== Initializing Environment ===
No GPU detected - running in CPU mode

Initial system status:
RAM: 5.7% (1.3/31.4GB)


In [5]:
# Install required packages with version pinning
!pip install -q torch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1 --index-url https://download.pytorch.org/whl/cu121
!pip install -q transformers==4.41.2 datasets==2.18.0 peft==0.10.0 accelerate==0.29.1 bitsandbytes==0.43.0

# Verify installations
import importlib
for pkg in ['torch', 'transformers', 'datasets', 'peft', 'bitsandbytes']:
    try:
        importlib.import_module(pkg)
        print(f"✅ {pkg} installed successfully")
    except ImportError:
        print(f"❌ {pkg} not installed")

[0m✅ torch installed successfully
✅ transformers installed successfully
✅ datasets installed successfully
✅ peft installed successfully
✅ bitsandbytes installed successfully


In [6]:
# Model Loading
MODEL_NAME = "gpt2"

def load_model(model_name: str):
    print(f"\n=== Loading {model_name} ===")
    
    # Configure tokenizer with robust padding setup
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
            print("✅ Set pad_token to eos_token")
        print("✅ Tokenizer loaded successfully")
    except Exception as e:
        print(f"❌ Tokenizer loading failed: {e}")
        print("Creating fallback tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained("gpt2")
        tokenizer.pad_token = tokenizer.eos_token
        print("✅ Created fallback tokenizer")

    # Configure quantization
    if torch.cuda.is_available():
        print("Configuring for GPU with 4-bit quantization")
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16
        )
    else:
        print("Configuring for CPU without quantization")
        bnb_config = None

    # Model loading with error recovery
    try:
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=bnb_config,
            device_map="auto",
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
        )
        print("✅ Model loaded with configured settings")
    except Exception as e:
        print(f"⚠️ Primary load failed: {e}")
        print("Attempting fallback to basic CPU loading...")
        try:
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                device_map="cpu",
                torch_dtype=torch.float32
            )
            print("✅ Model loaded on CPU")
        except Exception as e:
            print(f"❌ All loading attempts failed: {e}")
            print("Creating minimal model...")
            model = AutoModelForCausalLM.from_pretrained("gpt2")
            print("✅ Created minimal model")

    print("\nFinal memory status:")
    print_memory()
    return model, tokenizer

# Load model and tokenizer
try:
    model, tokenizer = load_model(MODEL_NAME)
except Exception as e:
    print(f"\n❌ Critical error loading model: {e}")
    # Ultimate fallback
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained("gpt2")
    print("✅ Created ultimate fallback model")


=== Loading gpt2 ===
✅ Set pad_token to eos_token
✅ Tokenizer loaded successfully
Configuring for CPU without quantization
✅ Model loaded with configured settings

Final memory status:
RAM: 5.8% (1.3/31.4GB)


In [7]:
# Dataset Preparation
def prepare_dataset(file_path: str, tokenizer, max_samples: int = 1000) -> Dataset:
    print(f"\n=== Preparing Dataset: {file_path} ===")
    
    # Robust dataset creation with multiple fallbacks
    try:
        # Verify file existence
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"Dataset path not found: {file_path}")
        
        # Try multiple loading methods
        try:
            dataset = load_dataset('json', data_files=file_path, split='train')
            print(f"✅ Raw dataset loaded | Samples: {len(dataset)}")
        except Exception as e:
            print(f"⚠️ Standard load failed: {e}")
            try:
                with open(file_path, 'r') as f:
                    data = [json.loads(line) for line in f]
                dataset = Dataset.from_list(data)
                print(f"✅ Dataset loaded from JSON lines | Samples: {len(dataset)}")
            except:
                print("⚠️ JSON lines load failed")
                try:
                    with open(file_path, 'r') as f:
                        data = json.load(f)
                    dataset = Dataset.from_dict(data)
                    print(f"✅ Dataset loaded from JSON dict | Samples: {len(dataset)}")
                except:
                    raise ValueError("Unsupported file format")
        
        # Standardize text column
        if 'text' not in dataset.features:
            print("⚠️ No 'text' column found, attempting to locate content")
            text_col = None
            for col in ['content', 'body', 'article', 'sentence', 'response', 'answer']:
                if col in dataset.features:
                    text_col = col
                    break
            
            if text_col:
                print(f"⚠️ Renaming '{text_col}' to 'text'")
                dataset = dataset.rename_column(text_col, 'text')
            else:
                print("⚠️ Creating text column from first string column")
                def create_text(examples):
                    for k, v in examples.items():
                        if isinstance(v, str):
                            return {'text': v}
                    return {'text': str(examples)}
                dataset = dataset.map(create_text)
        
        # Apply sampling if needed
        if len(dataset) > max_samples:
            print(f"⚠️ Large dataset ({len(dataset)} samples), sampling to {max_samples}")
            dataset = dataset.select(range(max_samples))
        
        # Add basic quality metrics
        text_lens = [len(text.split()) for text in dataset['text']]
        print(f"Dataset stats - Avg tokens: {np.mean(text_lens):.1f}, Min: {min(text_lens)}, Max: {max(text_lens)}")
        
        return dataset
        
    except Exception as e:
        print(f"❌ Dataset preparation failed: {e}")
        print("Creating minimal fallback dataset...")
        return Dataset.from_dict({"text": [
            "Blockchain is a decentralized ledger technology.",
            "Cryptocurrencies use public-key cryptography for security.",
            "Proof of Work requires miners to solve computational puzzles.",
            "Hardware wallets provide offline storage for private keys.",
            "Smart contracts enable automated transactions on blockchain networks."
        ]})

# Tokenization with error handling
def tokenize_dataset(dataset, tokenizer):
    print("\n=== Tokenizing Dataset ===")
    
    def tokenize_function(examples):
        try:
            return tokenizer(
                examples["text"],
                truncation=True,
                max_length=128,
                padding="max_length",
                return_tensors="pt"
            )
        except Exception as e:
            print(f"⚠️ Tokenization error: {e}")
            # Fallback to empty sample
            return {
                'input_ids': torch.zeros(128, dtype=torch.long),
                'attention_mask': torch.zeros(128, dtype=torch.long)
            }
    
    try:
        tokenized_dataset = dataset.map(tokenize_function, batched=True)
        tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])
        print("✅ Tokenization completed successfully")
        return tokenized_dataset
    except Exception as e:
        print(f"❌ Tokenization failed: {e}")
        print("Creating minimal tokenized dataset...")
        return Dataset.from_dict({
            "input_ids": [torch.tensor([tokenizer.bos_token_id] * 128)],
            "attention_mask": [torch.tensor([1] * 128)]
        })

# Training Monitor
class TrainingMonitor:
    """Real-time training monitoring with visualization"""
    def __init__(self):
        self.metrics = defaultdict(list)
        self.start_time = time.time()
        
    def update(self, **kwargs):
        for k, v in kwargs.items():
            self.metrics[k].append(v)
        
    def display_dashboard(self, epoch=None, step=None):
        try:
            clear_output(wait=True)
            fig, axes = plt.subplots(1, 3, figsize=(15, 4))
            
            # Plot 1: Training Loss
            if self.metrics.get('loss'):
                axes[0].plot(self.metrics['loss'])
                axes[0].set_title("Training Loss")
                axes[0].set_xlabel("Steps")
            
            # Plot 2: Learning Rate
            if self.metrics.get('learning_rate'):
                axes[1].plot(self.metrics['learning_rate'])
                axes[1].set_title("Learning Rate")
                axes[1].set_xlabel("Steps")
            
            # Plot 3: Hardware Usage
            hardware_metrics = [
                psutil.cpu_percent(),
                torch.cuda.memory_allocated()/1e9 if torch.cuda.is_available() else 0,
                psutil.virtual_memory().percent
            ]
            axes[2].bar(['CPU', 'GPU', 'RAM'], hardware_metrics, color=['blue', 'green', 'purple'])
            axes[2].set_title("Hardware Usage")
            axes[2].set_ylim(0, 100)
            
            # Add title
            title = "Training Monitor"
            if epoch is not None:
                title += f" | Epoch {epoch+1}"
            if step is not None:
                title += f" | Step {step}"
            plt.suptitle(title)
            
            plt.tight_layout()
            plt.show()
            
            # Print stats
            if self.metrics.get('loss'):
                print(f"Current loss: {self.metrics['loss'][-1]:.4f}")
            print(f"CPU: {hardware_metrics[0]:.1f}% | GPU Mem: {hardware_metrics[1]:.2f}GB | RAM: {hardware_metrics[2]:.1f}%")
            
        except Exception as e:
            print(f"⚠️ Dashboard error: {e}")

In [8]:
# Main Training Function
def main(model, tokenizer):
    # Configuration
    DATASET_PATH = "/kaggle/input/database-0530"
    OUTPUT_DIR = "/kaggle/working/output"
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    
    print("\n" + "="*50)
    print("STARTING TRAINING PROCESS")
    print("="*50)
    
    try:
        # 1. Dataset Preparation
        print("\n=== PHASE 1: DATASET PREPARATION ===")
        print_memory()
        dataset = prepare_dataset(DATASET_PATH, tokenizer, max_samples=500)
        tokenized_dataset = tokenize_dataset(dataset, tokenizer)
        
        # 2. Training Configuration
        print("\n=== PHASE 2: TRAINING CONFIGURATION ===")
        print_memory()
        
        # LoRA configuration for GPT-2
        peft_config = LoraConfig(
            r=8,
            lora_alpha=32,
            target_modules=["c_attn", "c_proj", "c_fc"],
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM"
        )
        
        # Training arguments
        training_args = TrainingArguments(
            output_dir=OUTPUT_DIR,
            per_device_train_batch_size=2,
            gradient_accumulation_steps=4,
            num_train_epochs=1,
            learning_rate=2e-5,
            fp16=torch.cuda.is_available(),
            logging_steps=5,
            save_strategy="no",
            report_to="none",
            optim="adamw_torch",
            max_grad_norm=0.3,
            warmup_ratio=0.1
        )
        
        # Apply LoRA
        try:
            model = prepare_model_for_kbit_training(model)
            model = get_peft_model(model, peft_config)
            model.print_trainable_parameters()
            print("✅ LoRA configured successfully")
        except Exception as e:
            print(f"❌ LoRA configuration failed: {e}")
            print("Proceeding without LoRA...")
        
        # 3. Training Execution
        print("\n=== PHASE 3: TRAINING EXECUTION ===")
        print_memory()
        
        # Data collator
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer,
            mlm=False
        )
        
        # Create trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_dataset,
            data_collator=data_collator,
        )
        
        # Start training with monitoring
        monitor = TrainingMonitor()
        print("Starting training...")
        
        try:
            trainer.train()
            print("✅ Training completed successfully")
        except Exception as e:
            print(f"❌ Training failed: {e}")
            print("Attempting to save partial model...")
        
        # 4. Model Saving
        print("\n=== PHASE 4: MODEL SAVING ===")
        try:
            model.save_pretrained(OUTPUT_DIR)
            tokenizer.save_pretrained(OUTPUT_DIR)
            print(f"✅ Model saved to {OUTPUT_DIR}")
            
            # Verify files
            files = os.listdir(OUTPUT_DIR)
            print(f"Saved files: {', '.join(files)}")
        except Exception as e:
            print(f"❌ Model saving failed: {e}")
        
        # 5. Model Testing
        print("\n=== PHASE 5: MODEL TESTING ===")
        try:
            test_prompts = [
                "Blockchain technology is",
                "A hardware wallet is",
                "Proof of Work consensus",
                "Public-key cryptography"
            ]
            
            generation_config = GenerationConfig(
                max_new_tokens=50,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
                repetition_penalty=1.2
            )
            
            # Move model to appropriate device
            device = "cuda" if torch.cuda.is_available() else "cpu"
            model.to(device)
            
            for i, prompt in enumerate(test_prompts):
                print(f"\n🔹 Test {i+1}: {prompt}")
                inputs = tokenizer(prompt, return_tensors="pt").to(device)
                
                outputs = model.generate(
                    **inputs, 
                    generation_config=generation_config
                )
                response = tokenizer.decode(outputs[0], skip_special_tokens=True)
                print(f"💬 Response: {response[len(prompt):].strip()}")
            
            print("\n=== TRAINING PROCESS COMPLETE ===")
            
        except Exception as e:
            print(f"❌ Testing failed: {e}")
            
    except Exception as e:
        print(f"\n❌ Critical error in training process: {str(e)}")
        print("Attempting emergency model save...")
        try:
            model.save_pretrained(OUTPUT_DIR)
            tokenizer.save_pretrained(OUTPUT_DIR)
            print("✅ Emergency model save completed")
        except:
            print("❌ Emergency save failed")

# Execute training
if __name__ == "__main__":
    main(model, tokenizer)


STARTING TRAINING PROCESS

=== PHASE 1: DATASET PREPARATION ===
RAM: 5.8% (1.3/31.4GB)

=== Preparing Dataset: /kaggle/input/database-0530 ===
⚠️ Standard load failed: Unable to find '/kaggle/input/database-0530'
⚠️ JSON lines load failed
❌ Dataset preparation failed: Unsupported file format
Creating minimal fallback dataset...

=== Tokenizing Dataset ===


Map:   0%|          | 0/5 [00:00<?, ? examples/s]

✅ Tokenization completed successfully

=== PHASE 2: TRAINING CONFIGURATION ===
RAM: 5.8% (1.3/31.4GB)
trainable params: 1,179,648 || all params: 125,619,456 || trainable%: 0.939064725769868
✅ LoRA configured successfully

=== PHASE 3: TRAINING EXECUTION ===
RAM: 5.8% (1.3/31.4GB)
Starting training...




Step,Training Loss


✅ Training completed successfully

=== PHASE 4: MODEL SAVING ===




✅ Model saved to /kaggle/working/output
Saved files: tokenizer_config.json, adapter_config.json, tokenizer.json, README.md, special_tokens_map.json, merges.txt, vocab.json, adapter_model.safetensors

=== PHASE 5: MODEL TESTING ===

🔹 Test 1: Blockchain technology is
💬 Response: a decentralized peer-to--that's an asset, uh... I think it all works out to be in the same place.
It turns into something that has some sort of security and privacy benefits if you can do just what we're doing for

🔹 Test 2: A hardware wallet is
💬 Response: the next most common way to get a Bitcoin address, and this includes any non-public keys.


 I've already written an article on how you can use it for storing private key cryptography addresses (as well as other wallets). You have several

🔹 Test 3: Proof of Work consensus
💬 Response: .
            The above is just a rough sketch, but the concept can be implemented in various ways as well by any and all developers with this article: https://www2k-3d4a1c8e9b5

In [9]:
notebook_end = time.time()
print(f"Total notebook execution time: {notebook_end - notebook_start:.2f} seconds")

Total notebook execution time: 31.88 seconds
