# Bitcoin Price Prediction - Qwen2.5 4B Instruct Fine-tuning

This notebook fine-tunes Qwen2.5 4B Instruct model for Bitcoin price prediction using technical analysis and news sentiment data.

## Overview
- **Model**: Qwen/Qwen2.5-4B-Instruct
- **Task**: Multi-modal Bitcoin trading decision and price forecasting
- **Data**: Technical indicators + News analysis → Trading actions + 10-day price forecasts
- **Method**: LoRA fine-tuning with instruction format

## 1. Environment Setup and Dependencies

## ⚠️ Important: Clean Setup Instructions

**To avoid tensor creation and model loading errors:**

1. **Restart Kernel**: If you encounter any errors, restart the notebook kernel first
2. **Clear Output**: Clear all cell outputs before rerunning  
3. **Run in Order**: Execute cells sequentially from top to bottom
4. **Clean Environment**: Remove any existing model directories in the output folder

**Common Error Solutions:**
- `tensor creation error`: Dataset formatting issues → Restart kernel
- `size mismatch error`: Model loading conflicts → Clear cache and restart  
- `adapter conflict`: PEFT adapter issues → Remove output directories

**Memory Requirements:**
- RTX 3090: 24GB VRAM recommended
- Batch size automatically adjusted for available memory

In [None]:
# Install required packages for Unsloth training
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes
!pip install wandb evaluate
!pip install pandas numpy matplotlib seaborn
!pip install huggingface_hub

In [None]:
import os
import json
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Unsloth imports for faster training
from unsloth import FastLanguageModel
from unsloth import is_bfloat16_supported
from unsloth.chat_templates import get_chat_template

# Standard ML/AI Libraries
from transformers import TrainingArguments
from trl import SFTTrainer
from datasets import Dataset, load_dataset

# Plot styling: matplotlib defaults with seaborn's "husl" colour palette.
plt.style.use('default')
sns.set_palette("husl")

# Report the runtime so GPU / precision problems surface before any heavy work.
cuda_ready = torch.cuda.is_available()
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_ready}")
if cuda_ready:
    total_vram_gb = torch.cuda.get_device_properties(0).total_memory / 1024**3
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")
    print(f"CUDA memory: {total_vram_gb:.1f} GB")
    print(f"BFloat16 supported: {is_bfloat16_supported()}")

## 2. Configuration and Hyperparameters

In [None]:
# Training Configuration for Unsloth + RTX 3090
# Single dictionary read by the later cells for model loading, LoRA setup,
# dataset splitting and TrainingArguments.
CONFIG = {
    # Model settings
    # NOTE(review): the Qwen2.5 family does not appear to include a 4B size
    # (published sizes are 0.5B/1.5B/3B/7B/...). Confirm this repo ID exists on the
    # Hub before running; loading will fail otherwise.
    "model_name": "unsloth/Qwen2.5-4B-Instruct",  # Unsloth optimized model
    "output_dir": "./bitcoin_qwen_unsloth_model",
    "hub_model_id": "tahamajs/bitcoin-qwen2.5-4b-unsloth", # Change to your username
    
    # Dataset settings
    "dataset_name": "tahamajs/bitcoin-prediction-enhanced-dataset", # Change to your dataset
    # max_length is only compared against *character* counts in the dataset plots;
    # the limit actually passed to the model/tokenizer is max_seq_length below.
    "max_length": 2048,
    # Fractions of dataset['train'] used for the train / eval / test splits.
    "train_split": 0.8,
    "eval_split": 0.1,
    "test_split": 0.1,
    
    # LoRA settings (Optimized for Unsloth)
    "lora_r": 16,
    "lora_alpha": 16,
    "lora_dropout": 0,
    "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    
    # Training hyperparameters (Optimized for RTX 3090)
    "num_train_epochs": 3,
    "per_device_train_batch_size": 4,  # Increased for RTX 3090
    "per_device_eval_batch_size": 4,
    "gradient_accumulation_steps": 2,  # Reduced since batch size increased
    "learning_rate": 2e-4,
    "weight_decay": 0.01,
    "warmup_ratio": 0.03,
    "lr_scheduler_type": "linear",
    
    # Optimization settings (Unsloth optimized)
    "optim": "adamw_8bit",
    "gradient_checkpointing": True,
    "dataloader_pin_memory": False,
    # Exactly one of bf16 / fp16 is True, chosen by the hardware probe.
    "bf16": is_bfloat16_supported(),  # Use BFloat16 if supported
    "fp16": not is_bfloat16_supported(),  # Fallback to FP16
    
    # Logging and saving
    # save_steps (200) is a multiple of eval_steps (50); transformers requires this
    # when load_best_model_at_end is enabled with step-based strategies.
    "logging_steps": 10,
    "eval_steps": 50,
    "save_steps": 200,
    "save_total_limit": 3,
    "load_best_model_at_end": True,
    "metric_for_best_model": "eval_loss",
    "greater_is_better": False,  # lower eval_loss is better
    
    # Unsloth settings
    "max_seq_length": 2048,
    "dtype": None,  # Auto-detect
    "load_in_4bit": True,
    
    # Custom evaluation settings
    "show_sample_every_n_steps": 50,  # Show model output every 50 steps
}

# Echo the full configuration so the run is self-documenting in the cell output.
print("Unsloth Training Configuration:")
for key, value in CONFIG.items():
    print(f"  {key}: {value}")
    
print(f"\nOptimizations for RTX 3090:")
print(f"  Using {'BFloat16' if CONFIG['bf16'] else 'Float16'} precision")
# Effective batch size = per-device batch size x gradient accumulation steps.
print(f"  Batch size: {CONFIG['per_device_train_batch_size']} (effective: {CONFIG['per_device_train_batch_size'] * CONFIG['gradient_accumulation_steps']})")
print(f"  LoRA rank reduced to {CONFIG['lora_r']} for faster training")

## 3. Load and Prepare Dataset

In [None]:
# Fetch the dataset from the Hugging Face Hub; later cells read its 'train' split.
hub_dataset_id = CONFIG['dataset_name']
print(f"Loading dataset: {hub_dataset_id}")
dataset = load_dataset(hub_dataset_id)

train_split = dataset['train']
print("Dataset loaded:")
print(f"  Total samples: {len(train_split)}")
print(f"  Features: {list(train_split.features.keys())}")

# Preview the first record: field sizes first, then the raw target text.
sample = train_split[0]
print("\nSample data structure:")
for label, field in (("Instruction", "instruction"), ("Input", "input"), ("Output", "output")):
    print(f"  {label} length: {len(sample[field])} chars")

print(f"\nSample output: {sample['output']}")

In [None]:
# Analyze dataset statistics
def analyze_dataset(dataset):
    """Print summary statistics for an instruction dataset and return them.

    Args:
        dataset: Object exposing ``to_pandas()`` that yields a DataFrame with
            string columns ``instruction``, ``input`` and ``output``.

    Returns:
        The DataFrame from ``to_pandas()`` extended with ``instruction_len``,
        ``input_len``, ``output_len`` and ``total_len`` (character counts),
        ``action`` (value of the ``"action"`` key in ``output``, NaN when absent)
        and ``confidence`` (numeric value of the ``"confidence"`` key as float,
        NaN when absent).
    """
    df = dataset.to_pandas()

    # Length analysis (characters, not tokens).
    for column in ('instruction', 'input', 'output'):
        df[f'{column}_len'] = df[column].str.len()
    df['total_len'] = df['instruction_len'] + df['input_len'] + df['output_len']

    # Output analysis. The patterns tolerate whitespace around ':' (json.dumps
    # emits '"action": "BUY"' by default) and accept decimal confidences;
    # expand=False returns a Series rather than a one-column DataFrame.
    df['action'] = df['output'].str.extract(
        r'"action"\s*:\s*"([^"]+)"', expand=False
    )
    df['confidence'] = df['output'].str.extract(
        r'"confidence"\s*:\s*([0-9]+(?:\.[0-9]+)?)', expand=False
    ).astype(float)

    def _mean_std(column):
        """Format a column as 'mean ± std chars'."""
        return f"{df[column].mean():.0f} ± {df[column].std():.0f} chars"

    print("Dataset Statistics:")
    print(f"  Total samples: {len(df)}")
    print("\nLength Statistics:")
    print(f"  Instruction: {_mean_std('instruction_len')}")
    print(f"  Input: {_mean_std('input_len')}")
    print(f"  Output: {_mean_std('output_len')}")
    print(f"  Total: {_mean_std('total_len')}")

    print("\nAction Distribution:")
    for action, count in df['action'].value_counts().items():
        percentage = count / len(df) * 100
        print(f"  {action}: {count} ({percentage:.1f}%)")

    print("\nConfidence Statistics:")
    confidence = df['confidence']
    print(f"  Mean: {confidence.mean():.1f}")
    print(f"  Std: {confidence.std():.1f}")
    print(f"  Min: {confidence.min():.0f}")
    print(f"  Max: {confidence.max():.0f}")

    return df

# Analyse the train split once; df_analysis keeps the per-sample length, action
# and confidence columns returned by analyze_dataset.
df_analysis = analyze_dataset(dataset['train'])

In [None]:
# Visualize dataset characteristics in a 2x2 grid.
# NOTE(review): every length plotted here is a *character* count, while
# CONFIG['max_length'] presumably mirrors the token budget (max_seq_length).
# Treat the threshold line and the "exceeding" report as a rough proxy only.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Length distributions
axes[0, 0].hist(df_analysis['total_len'], bins=50, alpha=0.7, edgecolor='black')
axes[0, 0].set_title('Total Length Distribution')
axes[0, 0].set_xlabel('Characters')
axes[0, 0].set_ylabel('Frequency')
axes[0, 0].axvline(CONFIG['max_length'], color='red', linestyle='--', label=f"Max Length: {CONFIG['max_length']}")
axes[0, 0].legend()

# Action distribution
action_counts = df_analysis['action'].value_counts()
axes[0, 1].pie(action_counts.values, labels=action_counts.index, autopct='%1.1f%%')
axes[0, 1].set_title('Trading Action Distribution')

# Confidence distribution
axes[1, 0].hist(df_analysis['confidence'], bins=30, alpha=0.7, edgecolor='black')
axes[1, 0].set_title('Confidence Distribution')
axes[1, 0].set_xlabel('Confidence Score')
axes[1, 0].set_ylabel('Frequency')

# Input length vs confidence
axes[1, 1].scatter(df_analysis['input_len'], df_analysis['confidence'], alpha=0.5)
axes[1, 1].set_title('Input Length vs Confidence')
axes[1, 1].set_xlabel('Input Length (chars)')
axes[1, 1].set_ylabel('Confidence Score')

plt.tight_layout()
plt.show()

# Check samples that exceed max length
long_samples = df_analysis[df_analysis['total_len'] > CONFIG['max_length']]
# Guard the percentage so an empty analysis frame reports 0% instead of raising.
long_share = len(long_samples) / len(df_analysis) * 100 if len(df_analysis) else 0.0
print(f"\n⚠️ Samples exceeding max_length ({CONFIG['max_length']}): {len(long_samples)} ({long_share:.1f}%)")
if len(long_samples) > 0:
    print(f"   Longest sample: {long_samples['total_len'].max()} chars")
    print(f"   Recommendation: Consider increasing max_length or truncating")

## 4. Model and Tokenizer Setup

In [None]:
# Load model and tokenizer using Unsloth
print(f"Loading Unsloth model: {CONFIG['model_name']}")
print(f"Max sequence length: {CONFIG['max_seq_length']}")

# Clear any existing model artifacts to prevent conflicts
import gc
import os
import shutil

if torch.cuda.is_available():
    torch.cuda.empty_cache()
gc.collect()

# WARNING: destructive. Any checkpoints from a previous run stored under
# output_dir are deleted so stale adapters cannot be picked up by mistake.
if os.path.exists(CONFIG['output_dir']):
    print(f"⚠️ Removing existing output directory: {CONFIG['output_dir']}")
    shutil.rmtree(CONFIG['output_dir'])

try:
    # Load fresh model and tokenizer
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=CONFIG['model_name'],
        max_seq_length=CONFIG['max_seq_length'],
        dtype=CONFIG['dtype'],
        load_in_4bit=CONFIG['load_in_4bit'],
        # Add these parameters to ensure clean loading
        trust_remote_code=True,
        use_cache=False,  # Disable cache during training setup
    )
    print("✅ Unsloth model loaded successfully!")

except Exception as model_error:
    print(f"❌ Model loading failed: {model_error}")

    # Only a shape mismatch is retried; anything else is re-raised unchanged.
    if "size mismatch" not in str(model_error):
        print(f"🔍 Unexpected model loading error: {model_error}")
        raise

    print("🔍 This appears to be a model shape mismatch error")
    print("   This usually happens when:")
    print("   1. Trying to load a model with different vocabulary size")
    print("   2. Loading incompatible adapters or checkpoints")
    print("   3. Model cache corruption")

    print("\n🔧 Attempting to fix with clean model loading...")

    # Clear all caches and try again
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    gc.collect()

    try:
        # dtype=None already requests auto-detection. torch_dtype is deliberately
        # NOT passed as well: Unsloth appears to forward extra keyword arguments
        # next to its own torch_dtype, so supplying both would make the retry fail.
        model, tokenizer = FastLanguageModel.from_pretrained(
            model_name=CONFIG['model_name'],
            max_seq_length=CONFIG['max_seq_length'],
            dtype=None,  # Let it auto-detect
            load_in_4bit=True,
            trust_remote_code=True,
            use_cache=False,
            revision="main",  # Use main branch
        )
        print("✅ Clean model loading successful!")

    except Exception as retry_error:
        print(f"❌ Retry also failed: {retry_error}")
        print("\n💡 Solution: Try completely restarting the notebook")
        print("   This will clear any cached model states that might be causing conflicts")
        raise

# Post-load setup runs for BOTH the first attempt and a successful retry, so the
# pad token is always configured before training.
print(f"  Model type: {type(model)}")
print(f"  Tokenizer vocab size: {tokenizer.vocab_size:,}")
print(f"  Model vocab size: {model.config.vocab_size:,}")
print(f"  Max length: {tokenizer.model_max_length}")

# Ensure tokenizer settings are correct
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    print(f"  Set pad token to EOS token: {tokenizer.eos_token}")

# Verify embedding dimensions
embed_dim = model.get_input_embeddings().weight.shape
print(f"  Embedding dimensions: {embed_dim}")

# Memory info
if torch.cuda.is_available():
    print(f"  GPU memory allocated: {torch.cuda.memory_allocated()/1024**3:.2f} GB")
    print(f"  GPU memory reserved: {torch.cuda.memory_reserved()/1024**3:.2f} GB")

In [None]:
# Apply LoRA using Unsloth with enhanced error handling
print("Applying LoRA adapters using Unsloth...")

try:
    # A model that already carries a peft_config has had adapters attached before
    # (e.g. this cell was re-run); warn, but still attempt the application.
    if hasattr(model, 'peft_config'):
        print("⚠️ Model already has PEFT adapters, this might cause conflicts")
        print("   Consider restarting the notebook for clean training")

    # Apply LoRA with Unsloth optimizations
    model = FastLanguageModel.get_peft_model(
        model,
        r=CONFIG['lora_r'],
        target_modules=CONFIG['target_modules'],
        lora_alpha=CONFIG['lora_alpha'],
        lora_dropout=CONFIG['lora_dropout'],
        bias="none",
        use_gradient_checkpointing="unsloth",  # Unsloth's optimized gradient checkpointing
        random_state=3407,
        use_rslora=False,  # Rank stabilized LoRA
        loftq_config=None,  # LoftQ quantization
    )

    print("✅ LoRA applied successfully!")

except Exception as lora_error:
    print(f"❌ LoRA application failed: {lora_error}")

    # Classify the failure from its message to print a targeted hint; the
    # exception itself is always re-raised afterwards.
    error_str = str(lora_error).lower()
    if "already exists" in error_str or "adapter" in error_str:
        print("🔍 This appears to be an adapter conflict")
        print("   The model may already have adapters applied")
        print("   Solutions:")
        print("   1. Restart the notebook kernel")
        print("   2. Use a fresh model instance")
        print("   3. Remove existing adapters before applying new ones")

    elif "shape" in error_str or "size" in error_str:
        print("🔍 This appears to be a shape mismatch in LoRA layers")
        print("   This can happen with mismatched model versions")
        print("   Try using the exact model specification")

    else:
        print(f"🔍 Unexpected LoRA error: {lora_error}")

    # Bare raise keeps the original traceback intact.
    raise

# Report how much of the model is actually being trained.
def print_trainable_parameters(model):
    """Print trainable vs. total parameter counts and the trainable percentage.

    Iterates ``model.named_parameters()``; a parameter counts as trainable when
    its ``requires_grad`` flag is truthy.
    """
    sizes = [(p.numel(), p.requires_grad) for _, p in model.named_parameters()]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, needs_grad in sizes if needs_grad)
    share = 100 * trainable_params / all_param
    print(f"Trainable params: {trainable_params:,} || All params: {all_param:,} || Trainable%: {share:.2f}%")

print_trainable_parameters(model)

print("\n✅ Unsloth LoRA configuration complete!")
print(f"  LoRA Rank: {CONFIG['lora_r']}")
print(f"  LoRA Alpha: {CONFIG['lora_alpha']}")
print(f"  Target modules: {CONFIG['target_modules']}")

# Verify model state: after get_peft_model the wrapper should expose peft_config.
has_peft_config = hasattr(model, 'peft_config')
print("\n🔍 Model verification:")
print(f"  Model class: {model.__class__.__name__}")
print(f"  Has PEFT config: {has_peft_config}")
if has_peft_config:
    print(f"  PEFT adapters: {list(model.peft_config.keys())}")

# Memory check after LoRA
if torch.cuda.is_available():
    print(f"  GPU memory after LoRA: {torch.cuda.memory_allocated()/1024**3:.2f} GB")

# Release Python garbage first, then return cached CUDA blocks to the driver.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

## 5. Chat Template Setup (LoRA Already Applied Above)

In [None]:
# Rebind the tokenizer to the one returned by Unsloth's chat-template helper,
# requesting the Qwen2.5-specific template.
print("Setting up Unsloth chat template...")

tokenizer = get_chat_template(tokenizer, chat_template="qwen-2.5")

def formatting_prompts_func(examples):
    """Render instruction/input/output records into chat-formatted training text.

    Accepts either a single example (``examples["instruction"]`` is a ``str``)
    or a batch (each of the three keys maps to a sequence). ``None`` fields
    become empty strings and non-string fields are stringified. Each record is
    rendered as a system/user/assistant conversation via the module-level
    ``tokenizer``'s chat template; if rendering raises, the error is printed
    and a plain "System/User/Assistant" concatenation is used instead.

    Returns:
        ``{"text": [...]}`` with one string per input record.
    """
    # The type of the instruction field alone decides single vs. batched mode.
    single = isinstance(examples["instruction"], str)
    columns = [
        [examples[key]] if single else examples[key]
        for key in ("instruction", "input", "output")
    ]

    texts = []
    for record in zip(*columns):
        # Coerce every field to str; None becomes the empty string.
        instruction, input_text, output = (
            "" if value is None else str(value) for value in record
        )
        messages = [
            {"role": "system", "content": instruction},
            {"role": "user", "content": input_text},
            {"role": "assistant", "content": output},
        ]

        try:
            rendered = tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=False,
            )
            if not isinstance(rendered, str):
                print(f"Warning: Chat template returned non-string: {type(rendered)}")
                rendered = str(rendered)
        except Exception as e:
            print(f"Error formatting conversation: {e}")
            # Best-effort fallback so one bad record does not abort the batch.
            rendered = f"System: {instruction}\nUser: {input_text}\nAssistant: {output}"
        texts.append(rendered)

    # Flat list of strings, one per record.
    return {"text": texts}

# Smoke-test the formatter on a single (non-batched) example before touching
# the real dataset.
print("✅ Testing Unsloth formatting function...")
test_sample = {
    "instruction": "Test instruction",
    "input": "Test input",
    "output": "Test output"
}

test_result = formatting_prompts_func(test_sample)
print(f"✅ Test successful: {type(test_result)} with text type: {type(test_result['text'][0])}")
print("✅ Unsloth chat template configured!")
print("  Template: Qwen2.5 optimized format")
print("  Tokenizer ready for training")

## 6. Data Processing and Formatting

In [None]:
# Format the first real training record (passed as a batch of one) and confirm
# it survives tokenization within the configured sequence length.
sample = dataset['train'][0]
sample_formatted = formatting_prompts_func({
    "instruction": [sample["instruction"]],
    "input": [sample["input"]],
    "output": [sample["output"]]
})

formatted_text = sample_formatted['text'][0]
print("Unsloth formatted sample (first 500 chars):")
print(formatted_text[:500] + "...")
print(f"\nFormatted length: {len(formatted_text)} chars")

# Verify the format works with tokenizer
test_tokens = tokenizer(
    formatted_text,
    truncation=True,
    max_length=CONFIG['max_seq_length']
)
print(f"Tokenized length: {len(test_tokens['input_ids'])} tokens")
print("✅ Unsloth formatting working correctly!")

In [None]:
# Process dataset for Unsloth training with enhanced validation
print("Processing dataset for Unsloth training...")

# Split dataset first (before formatting to save memory).
# Train and eval sizes are floored; the test split takes the remainder.
total_samples = len(dataset['train'])
train_size = int(CONFIG['train_split'] * total_samples)
eval_size = int(CONFIG['eval_split'] * total_samples)
test_size = total_samples - train_size - eval_size

print(f"Dataset splits: Train={train_size}, Eval={eval_size}, Test={test_size}")

# Fail early with a clear message: a zero-sized split would otherwise surface as
# an opaque train_test_split error (or an empty eval set) further down.
if min(train_size, eval_size, test_size) < 1:
    raise ValueError(
        f"Dataset too small to split ({total_samples} samples): "
        f"train={train_size}, eval={eval_size}, test={test_size}"
    )

# Create splits: first carve off eval+test, then divide that remainder.
dataset_splits = dataset['train'].train_test_split(
    test_size=eval_size + test_size,
    shuffle=True,
    seed=42
)
train_dataset_raw = dataset_splits['train']

eval_test_splits = dataset_splits['test'].train_test_split(
    test_size=test_size,
    shuffle=True,
    seed=42
)
eval_dataset_raw = eval_test_splits['train']
test_dataset_raw = eval_test_splits['test']

print("✅ Dataset splits created")

# Per-record integrity check used before formatting.
def validate_dataset_sample(sample, idx):
    """Normalise one raw record in place and return it.

    Each of 'instruction', 'input' and 'output' must be present; ``None``
    values are replaced by ``""`` and any other non-string value by ``str(value)``.

    Args:
        sample: Mutable mapping holding the record (modified in place).
        idx: Position of the record, used only in the error message.

    Raises:
        ValueError: If one of the required keys is missing.
    """
    for field in ('instruction', 'input', 'output'):
        if field not in sample:
            raise ValueError(f"Missing key '{field}' in sample {idx}")
        value = sample[field]
        if value is None:
            sample[field] = ""
        elif not isinstance(value, str):
            sample[field] = str(value)
    return sample

# Apply enhanced formatting with validation
def safe_formatting(dataset_split, split_name):
    """Validate and chat-format one dataset split, with a per-sample fallback.

    Every record is first passed through ``validate_dataset_sample``; records
    that fail are reported and skipped. The survivors are formatted in batches
    with ``formatting_prompts_func``. If the batched path raises, each record is
    formatted individually and only non-empty string results are kept.

    Args:
        dataset_split: Iterable, sized collection of raw records.
        split_name: Label used in progress and error messages.

    Returns:
        A ``datasets.Dataset`` with a single ``text`` column.

    Raises:
        ValueError: If neither the batched nor the per-sample path produced
            any formatted text.
    """
    print(f"Formatting {split_name} dataset...")

    # First validate all samples
    validated_samples = []
    for i, sample in enumerate(dataset_split):
        try:
            validated_samples.append(validate_dataset_sample(sample, i))
        except Exception as e:
            print(f"⚠️ Skipping invalid sample {i} in {split_name}: {e}")

    print(f"✅ Validated {len(validated_samples)}/{len(dataset_split)} samples in {split_name}")

    # Create new dataset from validated samples
    from datasets import Dataset
    validated_dataset = Dataset.from_list(validated_samples)

    # Apply formatting
    try:
        formatted_dataset = validated_dataset.map(
            formatting_prompts_func,
            batched=True,
            batch_size=10,  # Smaller batches for safety
            remove_columns=validated_dataset.column_names,
            desc=f"Formatting {split_name}"
        )

        # Spot-check the first few formatted rows.
        print(f"🔍 Validating formatted {split_name} dataset...")
        for i in range(min(3, len(formatted_dataset))):
            sample = formatted_dataset[i]
            if 'text' not in sample:
                raise ValueError(f"Missing 'text' key in formatted sample {i}")
            if not isinstance(sample['text'], str):
                raise ValueError(f"Non-string text in sample {i}: {type(sample['text'])}")

        print(f"✅ {split_name} dataset formatted and validated successfully")
        return formatted_dataset

    except Exception as e:
        print(f"❌ Error formatting {split_name} dataset: {e}")
        print("🔄 Trying individual sample processing...")

        # Fallback: process samples individually
        formatted_texts = []
        for i, sample in enumerate(validated_samples):
            try:
                result = formatting_prompts_func(sample)
                if 'text' in result and len(result['text']) > 0:
                    text = result['text'][0] if isinstance(result['text'], list) else result['text']
                    if isinstance(text, str) and len(text.strip()) > 0:
                        formatted_texts.append(text)
            except Exception as sample_error:
                print(f"⚠️ Failed to format sample {i}: {sample_error}")

        if not formatted_texts:
            # Chain to the batched failure so the root cause stays visible.
            raise ValueError(f"❌ No samples could be formatted in {split_name}") from e

        fallback_dataset = Dataset.from_dict({"text": formatted_texts})
        print(f"✅ Fallback processing successful for {split_name}: {len(fallback_dataset)} samples")
        return fallback_dataset

# Process all splits safely
train_dataset = safe_formatting(train_dataset_raw, "train")
eval_dataset = safe_formatting(eval_dataset_raw, "eval")
test_dataset = safe_formatting(test_dataset_raw, "test")

print("\n✅ Unsloth dataset processing complete!")
print(f"  Train: {len(train_dataset):,} samples ({len(train_dataset)/len(dataset['train'])*100:.1f}%)")
print(f"  Eval: {len(eval_dataset):,} samples ({len(eval_dataset)/len(dataset['train'])*100:.1f}%)")
print(f"  Test: {len(test_dataset):,} samples ({len(test_dataset)/len(dataset['train'])*100:.1f}%)")

# Final validation check: the first row of every split must be tokenizable.
print("🔍 Final validation check...")
for dataset_name, ds in [("train", train_dataset), ("eval", eval_dataset), ("test", test_dataset)]:
    # An empty split would otherwise fail below with an uninformative IndexError.
    if len(ds) == 0:
        raise ValueError(f"{dataset_name} dataset is empty after formatting")

    sample = ds[0]
    print(f"  {dataset_name}: text type = {type(sample['text'])}, length = {len(sample['text'])}")

    # Test tokenization
    try:
        test_tokens = tokenizer(sample['text'], truncation=True, max_length=CONFIG['max_seq_length'])
    except Exception as e:
        print(f"  {dataset_name}: tokenization failed - {e}")
        raise  # keep the original traceback
    print(f"  {dataset_name}: tokenization successful ({len(test_tokens['input_ids'])} tokens)")

print("✅ All datasets ready for training!")

In [None]:
# 🔍 Comprehensive Dataset Validation to Prevent Tensor Errors
print("🔍 Performing comprehensive dataset validation...")
print("="*60)

def _single_sample_issue(text, i):
    """Tokenize one text with the module-level tokenizer; return an issue string or None."""
    try:
        tokens = tokenizer(
            text,
            truncation=True,
            max_length=CONFIG['max_seq_length'],
            return_tensors="pt"
        )

        # A single text must tokenize to a (1, seq_len) tensor.
        if 'input_ids' not in tokens:
            return f"Sample {i}: Tokenization missing input_ids"
        if tokens['input_ids'].dim() != 2:
            return f"Sample {i}: Wrong token dimensions {tokens['input_ids'].shape}"
        if tokens['input_ids'].size(0) != 1:
            return f"Sample {i}: Wrong batch dimension {tokens['input_ids'].size(0)}"
    except Exception as token_error:
        return f"Sample {i}: Tokenization failed - {token_error}"
    return None


def validate_dataset_for_training(dataset, max_samples=20):
    """Check the leading samples of a formatted dataset for tokenization problems.

    Each inspected sample must have a non-empty string ``text`` field that the
    module-level ``tokenizer`` turns into a 2-D ``input_ids`` tensor with batch
    size 1. When at least two samples pass, up to five of them are also
    tokenized together with padding as a batch check.

    Args:
        dataset: Sized, indexable collection of dict rows exposing ``column_names``.
        max_samples: Number of leading samples to inspect (default 20).

    Returns:
        ``(True, [])`` when no issue was found, otherwise ``(False, issues)``
        where ``issues`` is a list of human-readable strings.

    Raises:
        ValueError: If the dataset has no ``text`` column.
    """
    print("📊 Dataset overview:")
    print(f"  Size: {len(dataset)} samples")
    print(f"  Columns: {dataset.column_names}")

    # Check for required column
    if 'text' not in dataset.column_names:
        raise ValueError("❌ Dataset missing 'text' column")

    issues = []
    sample_texts = []
    checked = min(max_samples, len(dataset))

    # Check each inspected sample; the first failing check per sample is recorded.
    for i in range(checked):
        try:
            sample = dataset[i]

            if 'text' not in sample:
                issues.append(f"Sample {i}: Missing 'text' field")
                continue

            text = sample['text']

            # Anything other than str (including list/tuple/dict) is rejected
            # here, so no separate nested-structure check is needed.
            if not isinstance(text, str):
                issues.append(f"Sample {i}: text is {type(text)}, must be str")
                continue

            if not text.strip():
                issues.append(f"Sample {i}: Empty text content")
                continue

            issue = _single_sample_issue(text, i)
            if issue is not None:
                issues.append(issue)
                continue

            sample_texts.append(text)

        except Exception as sample_error:
            issues.append(f"Sample {i}: Processing failed - {sample_error}")

    # Batch check: padding several texts together must also produce tensors.
    if len(sample_texts) >= 2:
        batch_texts = sample_texts[:5]
        print(f"🧪 Testing batch tokenization with {len(batch_texts)} samples...")
        try:
            batch_tokens = tokenizer(
                batch_texts,
                truncation=True,
                padding=True,
                max_length=CONFIG['max_seq_length'],
                return_tensors="pt"
            )

            print("  ✅ Batch tokenization successful!")
            print(f"     Batch shape: {batch_tokens['input_ids'].shape}")
            print(f"     Token range: {batch_tokens['input_ids'].min()} to {batch_tokens['input_ids'].max()}")

            # Clean up batch test
            del batch_tokens

        except Exception as batch_error:
            issues.append(f"Batch processing: {batch_error}")
            print(f"  ❌ Batch tokenization failed: {batch_error}")

    # Report results
    if issues:
        print(f"\n❌ Found {len(issues)} validation issues:")
        for issue in issues[:10]:  # Show first 10 issues
            print(f"   • {issue}")
        if len(issues) > 10:
            print(f"   • ... and {len(issues) - 10} more issues")
        return False, issues

    print("\n✅ All validation checks passed!")
    print(f"   Validated {checked} samples")
    print("   Ready for tensor-safe training")
    return True, []

# Run validation
is_valid, validation_issues = validate_dataset_for_training(train_dataset)

if not is_valid:
    print("\n🔧 Attempting to fix dataset issues...")

    # Rebuild the training set from the rows that have non-empty, tokenizable
    # text; everything else is dropped (best effort).
    clean_samples = []

    for sample in train_dataset:
        # `except Exception` rather than a bare except: Ctrl-C / SystemExit must
        # still be able to interrupt the loop.
        try:
            if 'text' not in sample:
                continue

            text = sample['text']

            # Ensure it's a string
            if not isinstance(text, str):
                text = "" if text is None else str(text)

            # Ensure it's not empty
            if not text.strip():
                continue

            # Test tokenization
            test_tokens = tokenizer(
                text,
                truncation=True,
                max_length=CONFIG['max_seq_length']
            )
            if len(test_tokens['input_ids']) > 0:
                clean_samples.append({'text': text})
        except Exception:
            continue

    if not clean_samples:
        raise ValueError("❌ No valid samples found after cleaning")

    print(f"🔧 Created clean dataset: {len(clean_samples)} samples")
    train_dataset = Dataset.from_list(clean_samples)

    # Re-validate
    is_valid, _ = validate_dataset_for_training(train_dataset)
    if not is_valid:
        raise ValueError("❌ Dataset cleaning failed - training will likely fail")
    print("✅ Dataset cleaning successful!")

print("\n🎯 Final dataset ready for training:")
print(f"   Samples: {len(train_dataset)}")
print(f"   Validation: {'✅ Passed' if is_valid else '❌ Failed'}")
print("   Tensor compatibility: ✅ Verified")

# Clear validation variables to save memory
if torch.cuda.is_available():
    torch.cuda.empty_cache()
gc.collect()

print("="*60)

In [None]:
# Calculate training statistics now that datasets are ready
print("📊 Calculating training statistics...")

# One optimizer step consumes batch_size * accumulation samples.
effective_batch_size = CONFIG['per_device_train_batch_size'] * CONFIG['gradient_accumulation_steps']
# Clamp to at least one step per epoch so a small dataset does not report
# 0 total steps and 0 warmup steps. These figures are estimates for display only.
steps_per_epoch = max(len(train_dataset) // effective_batch_size, 1)
total_steps = steps_per_epoch * CONFIG['num_train_epochs']
warmup_steps = int(total_steps * CONFIG['warmup_ratio'])

print("📋 Training Statistics:")
print(f"  Dataset size: {len(train_dataset):,} samples")
print(f"  Effective batch size: {effective_batch_size}")
print(f"  Steps per epoch: {steps_per_epoch:,}")
print(f"  Total training steps: {total_steps:,}")
print(f"  Warmup steps: {warmup_steps:,} ({CONFIG['warmup_ratio']*100:.1f}% of total)")
print(f"  Eval every: {CONFIG['eval_steps']} steps")
print(f"  Save every: {CONFIG['save_steps']} steps")

# Estimate training time (rough estimate: the formula assumes 2 seconds per step)
if torch.cuda.is_available():
    print(f"  Estimated training time: {total_steps * 2 / 60:.1f} minutes (rough estimate)")

print("✅ Training statistics calculated!")

## 7. Training Setup and Arguments

In [None]:
# Unsloth optimized training arguments, built entirely from CONFIG.
training_args = TrainingArguments(
    # Output and logging
    output_dir=CONFIG['output_dir'],
    logging_dir=f"{CONFIG['output_dir']}/logs",
    logging_steps=CONFIG['logging_steps'],
    logging_strategy="steps",

    # Training parameters (Unsloth optimized)
    num_train_epochs=CONFIG['num_train_epochs'],
    per_device_train_batch_size=CONFIG['per_device_train_batch_size'],
    per_device_eval_batch_size=CONFIG['per_device_eval_batch_size'],
    gradient_accumulation_steps=CONFIG['gradient_accumulation_steps'],

    # Optimization (Unsloth settings)
    learning_rate=CONFIG['learning_rate'],
    weight_decay=CONFIG['weight_decay'],
    warmup_ratio=CONFIG['warmup_ratio'],
    lr_scheduler_type=CONFIG['lr_scheduler_type'],
    optim=CONFIG['optim'],

    # Memory and performance (Unsloth optimized)
    gradient_checkpointing=CONFIG['gradient_checkpointing'],
    dataloader_pin_memory=CONFIG['dataloader_pin_memory'],
    bf16=CONFIG['bf16'],
    fp16=CONFIG['fp16'],
    group_by_length=True,  # Unsloth optimization

    # Evaluation and saving
    eval_strategy="steps",
    eval_steps=CONFIG['eval_steps'],
    save_strategy="steps",
    save_steps=CONFIG['save_steps'],
    save_total_limit=CONFIG['save_total_limit'],
    load_best_model_at_end=CONFIG['load_best_model_at_end'],
    metric_for_best_model=CONFIG['metric_for_best_model'],
    greater_is_better=CONFIG['greater_is_better'],

    # Misc
    seed=42,
    data_seed=42,
    remove_unused_columns=False,
    # Log to Weights & Biases only when an API key is present in the environment.
    report_to="wandb" if os.getenv("WANDB_API_KEY") else "none",
    run_name=f"bitcoin-qwen-unsloth-{datetime.now().strftime('%Y%m%d-%H%M%S')}",

    # Hub integration
    # NOTE(review): pushing on every save needs Hub credentials with write access
    # to hub_model_id; no login step is visible in this part of the notebook.
    push_to_hub=True,
    hub_model_id=CONFIG['hub_model_id'],
    hub_strategy="every_save",
)

print("✅ Training arguments created!")
print("📋 Key settings:")
print(f"  Epochs: {CONFIG['num_train_epochs']}")
print(f"  Train batch size: {CONFIG['per_device_train_batch_size']} (effective: {CONFIG['per_device_train_batch_size'] * CONFIG['gradient_accumulation_steps']})")
print(f"  Learning rate: {CONFIG['learning_rate']}")
print(f"  Precision: {'BFloat16' if CONFIG['bf16'] else 'Float16'}")
print(f"  Output dir: {CONFIG['output_dir']}")
print(f"  Model ID: {CONFIG['hub_model_id']}")
print(f"  Eval every: {CONFIG['eval_steps']} steps")
# The previous wording said statistics "will be calculated after dataset
# preparation", but they are computed before this cell runs.
print("Note: Step counts are estimated from the prepared training dataset.")

In [None]:
# Custom callback to show model outputs during training
from transformers import TrainerCallback
import random
import re

class SampleOutputCallback(TrainerCallback):
    """Print a sample generation from the model every few training steps.

    At construction time up to five records are read from ``eval_dataset``.
    Each record's ``'text'`` field is split at the ChatML assistant header so
    the prompt and the reference answer are stored separately. During
    training, every ``show_every_n_steps`` global steps one stored sample is
    picked at random, generated from, and printed next to the expected answer.

    Args:
        model: Model used for generation; must expose ``device``, ``eval()``,
            ``train()`` and ``generate()``.
        tokenizer: Tokenizer used to encode prompts and decode generations.
        eval_dataset: Indexable dataset whose records contain a ``'text'``
            key (i.e. the formatted, not yet tokenized, dataset).
        show_every_n_steps: Interval, in global steps, between printouts.
            A value <= 0 disables the printouts.
    """

    def __init__(self, model, tokenizer, eval_dataset, show_every_n_steps=50):
        self.model = model
        self.tokenizer = tokenizer
        self.eval_dataset = eval_dataset
        self.show_every_n_steps = show_every_n_steps
        self.sample_data = []

        # Prepare a few samples for testing
        assistant_header = '<|im_start|>assistant\n'
        for i in range(min(5, len(eval_dataset))):
            sample_text = eval_dataset[i]['text']
            # Everything up to (and including) the assistant header is the
            # generation prompt; what follows is the reference answer.
            parts = sample_text.split(assistant_header)
            if len(parts) >= 2:
                self.sample_data.append({
                    'input': parts[0] + assistant_header,
                    'expected': parts[1].replace('<|im_end|>', '').strip(),
                    'sample_id': i
                })

    def on_step_end(self, args, state, control, **kwargs):
        """Trigger a sample printout on every ``show_every_n_steps``-th step."""
        # A non-positive interval would make the modulo below meaningless
        # (or raise ZeroDivisionError), so treat it as "disabled".
        if self.show_every_n_steps <= 0:
            return
        if state.global_step > 0 and state.global_step % self.show_every_n_steps == 0:
            self._show_sample_output(state.global_step)

    def _show_sample_output(self, step):
        """Generate from one random stored prompt and print the result.

        Generation errors are reported and swallowed so that a failed
        printout never interrupts training.
        """
        if not self.sample_data:
            return

        # Randomly select a sample
        sample = random.choice(self.sample_data)

        print(f"\n" + "="*80)
        print(f"üéØ SAMPLE OUTPUT AT STEP {step} (Sample ID: {sample['sample_id']})")
        print("="*80)

        try:
            # Tokenize input
            inputs = self.tokenizer(
                sample['input'],
                return_tensors="pt",
                truncation=True,
                max_length=1900  # Leave space for generation
            ).to(self.model.device)

            # Generate response
            self.model.eval()
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=200,
                    do_sample=True,
                    temperature=0.1,
                    pad_token_id=self.tokenizer.eos_token_id,
                    eos_token_id=self.tokenizer.eos_token_id,
                    use_cache=True
                )

            # Decode only the newly generated tokens (skip the prompt)
            generated_text = self.tokenizer.decode(
                outputs[0][inputs.input_ids.shape[1]:],
                skip_special_tokens=True
            ).strip()

            # Extract JSON if present
            json_match = re.search(r'\{.*\}', generated_text, re.DOTALL)
            if json_match:
                json_output = json_match.group()
                print(f"Generated JSON: {json_output}")

                # Try to parse and validate
                try:
                    parsed = json.loads(json_output)
                    print(f"‚úÖ Valid JSON with keys: {list(parsed.keys())}")
                except json.JSONDecodeError:
                    # Only malformed JSON is expected here; anything else
                    # propagates to the outer handler.
                    print(f"‚ö†Ô∏è Invalid JSON format")
            else:
                print(f"Generated Text: {generated_text}")
                print(f"‚ö†Ô∏è No JSON found in output")

            print(f"\nExpected: {sample['expected']}")
            print("="*80)

        except Exception as e:
            print(f"‚ùå Error generating sample: {e}")
            print("="*80)
        finally:
            # Always switch back to training mode, even when generation
            # failed after eval() had been called.
            self.model.train()

# Build the callback that prints sample generations while training runs.
# The interval comes from CONFIG['show_sample_every_n_steps'].
sample_callback = SampleOutputCallback(
    model=model,
    tokenizer=tokenizer,
    eval_dataset=eval_dataset,
    show_every_n_steps=CONFIG['show_sample_every_n_steps'],
)

# Report the interval and how many prompts the callback could extract.
print(
    "‚úÖ Sample output callback initialized!",
    f"  Will show outputs every {CONFIG['show_sample_every_n_steps']} steps",
    f"  Using {len(sample_callback.sample_data)} evaluation samples",
    sep="\n",
)

## 8. Initialize Trainer

In [None]:
# Initialize the Unsloth SFTTrainer on the pre-tokenized datasets.
#
# Steps: (1) check that every prerequisite object exists, (2) run a small
# batch through the data collator as a smoke test, (3) build the trainer
# with diagnostic output on failure, (4) print a short verification report.
print("? Initializing Unsloth SFTTrainer with pre-tokenized data...")

# Fail early with an actionable message when a prerequisite cell has not been
# run. Without this check a missing name (for example `data_collator`, which
# is created by the pre-tokenization cell) surfaces as a bare NameError from
# the middle of the status table below.
_required_names = (
    "model",
    "tokenizer",
    "training_args",
    "train_dataset",
    "eval_dataset",
    "data_collator",
)
_undefined_names = [name for name in _required_names if name not in globals()]
if _undefined_names:
    raise NameError(
        f"Undefined prerequisites: {_undefined_names}. Run the cells that create "
        "them (including the pre-tokenization cell that builds `data_collator`) "
        "before initializing the trainer."
    )

# Verify all components are ready
components_ready = {
    "Model": model is not None,
    "Tokenizer": tokenizer is not None,
    "Training Args": training_args is not None,
    "Train dataset": train_dataset is not None and len(train_dataset) > 0,
    "Eval dataset": eval_dataset is not None and len(eval_dataset) > 0,
    "Data collator": data_collator is not None,
}

print("üìä Component status:")
for component, status in components_ready.items():
    print(f"   {component}: {'‚úÖ' if status else '‚ùå'}")

if not all(components_ready.values()):
    missing = [comp for comp, status in components_ready.items() if not status]
    raise ValueError(f"‚ùå Missing components: {missing}")

# Final tensor compatibility test with pre-tokenized data
print("üß™ Testing tensor compatibility with pre-tokenized data...")
try:
    # Collate up to three training records; this is the same call the
    # trainer's dataloader will make, so shape/nesting problems show up here.
    test_samples = [train_dataset[i] for i in range(min(3, len(train_dataset)))]
    test_batch = data_collator(test_samples)

    print(f"‚úÖ Tensor compatibility test passed!")
    print(f"   Batch keys: {list(test_batch.keys())}")
    print(f"   Input shape: {test_batch['input_ids'].shape}")
    print(f"   Labels shape: {test_batch['labels'].shape}")

    # Clean up test
    del test_batch, test_samples

except Exception as compat_error:
    print(f"‚ùå Tensor compatibility test failed: {compat_error}")
    print("   The pre-tokenized data still has issues")
    raise  # re-raise the active exception unchanged

# Initialize trainer with pre-tokenized datasets
try:
    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        train_dataset=train_dataset,  # Now pre-tokenized
        eval_dataset=eval_dataset,    # Now pre-tokenized
        data_collator=data_collator,  # Use our custom data collator
        # dataset_text_field / max_seq_length / packing / formatting_func are
        # intentionally not passed: the data is already formatted, truncated
        # and tokenized.
        callbacks=[sample_callback] if 'sample_callback' in locals() else [],
    )

    print(f"‚úÖ Unsloth SFTTrainer initialized successfully!")
    print(f"   Using pre-tokenized datasets")
    print(f"   Data collator: {type(data_collator).__name__}")

except Exception as trainer_error:
    print(f"‚ùå Trainer initialization failed: {trainer_error}")

    # Classify the failure by keywords in the message and print hints.
    error_str = str(trainer_error).lower()

    if "tensor" in error_str and ("nesting" in error_str or "list" in error_str):
        print("üîç TENSOR CREATION ERROR DETECTED:")
        print("   Even with pre-tokenization, there are tensor issues")
        print("   This should not happen with the pre-tokenization fix")
        print("\nüí° Solutions:")
        print("   1. Restart the notebook kernel completely")
        print("   2. Check the pre-tokenization function")
        print("   3. Verify data collator compatibility")

    elif "shape" in error_str or "size" in error_str:
        print("üîç SHAPE MISMATCH ERROR:")
        print("   This indicates model loading conflicts")
        print("   Likely cause: incompatible model states or adapters")
        print("\nüí° Solutions:")
        print("   1. Restart notebook kernel")
        print("   2. Clear model cache directories")
        print("   3. Use fresh model loading")

    elif "adapter" in error_str or "peft" in error_str:
        print("üîç ADAPTER CONFLICT ERROR:")
        print("   The model may have conflicting adapters")
        print("\nüí° Solutions:")
        print("   1. Remove existing adapter directories")
        print("   2. Use a fresh model instance")
        print("   3. Clear PEFT cache")

    else:
        print(f"üîç UNKNOWN TRAINER ERROR:")
        print(f"   {trainer_error}")

    # Show full traceback for debugging
    import traceback
    print(f"\nüîç Full error traceback:")
    traceback.print_exc()

    raise  # re-raise the active exception unchanged

# Verify trainer is working
print(f"üìä Trainer verification:")
print(f"   Model device: {model.device}")
print(f"   Train dataset size: {len(trainer.train_dataset):,}")
print(f"   Eval dataset size: {len(trainer.eval_dataset):,}")
print(f"   Callbacks: {len(trainer.callback_handler.callbacks)} registered")

# Enable Unsloth training optimizations
FastLanguageModel.for_training(model)
print(f"‚úÖ Unsloth training mode enabled")

# Final memory check
if torch.cuda.is_available():
    print(f"   GPU memory allocated: {torch.cuda.memory_allocated()/1024**3:.2f} GB")
    print(f"   GPU memory reserved: {torch.cuda.memory_reserved()/1024**3:.2f} GB")

    # Extra utilization line when the first CUDA device is an RTX 3090
    gpu_name = torch.cuda.get_device_name(0)
    if "RTX 3090" in gpu_name or "3090" in gpu_name:
        print(f"   üöÄ RTX 3090 detected - optimizations active")
        total_memory = torch.cuda.get_device_properties(0).total_memory / 1024**3
        used_memory = torch.cuda.memory_allocated() / 1024**3
        print(f"   üìä Memory utilization: {used_memory:.1f}/{total_memory:.1f} GB ({used_memory/total_memory*100:.1f}%)")

print("üéâ Trainer ready for tensor-safe training with pre-tokenized datasets!")

In [None]:
# Pre-tokenization step: turn the "text" column into token ids up front so the
# trainer receives ready-made features instead of raw strings.
print(
    "üîß Applying pre-tokenization fix...",
    "This prevents the 'excessive nesting' tensor error during training",
    sep="\n",
)

from transformers import DataCollatorForLanguageModeling


def pre_tokenize_dataset(dataset, tokenizer, split_name):
    """Tokenize the ``"text"`` column of *dataset* and drop its original columns.

    Sequences are truncated to ``CONFIG['max_seq_length']`` and left unpadded
    (padding is deferred to the data collator). The tokenizer is asked for
    plain Python lists rather than tensors.

    Args:
        dataset: Dataset exposing ``map`` and ``column_names`` with a
            ``"text"`` column.
        tokenizer: Callable tokenizer applied to each batch of texts.
        split_name: Label used in progress messages (e.g. ``"train"``).

    Returns:
        The mapped dataset containing only the tokenizer's output columns.
    """
    print(f"Pre-tokenizing {split_name} dataset...")

    def _encode_batch(batch):
        # No padding and no tensor conversion here: variable-length lists are
        # kept and batching is left to the collator.
        return tokenizer(
            batch["text"],
            truncation=True,
            padding=False,
            max_length=CONFIG['max_seq_length'],
            return_tensors=None,
        )

    # Every pre-existing column (including "text") is removed from the result.
    original_columns = dataset.column_names
    encoded_dataset = dataset.map(
        _encode_batch,
        batched=True,
        batch_size=100,
        remove_columns=original_columns,
        desc=f"Tokenizing {split_name}",
    )

    print(f"‚úÖ {split_name} dataset pre-tokenized: {len(encoded_dataset)} samples")
    return encoded_dataset

# Tokenize the three splits, build the collator, then rebind the dataset
# names to the tokenized versions.
print("üìä Pre-tokenizing datasets to prevent training errors...")
train_dataset_tokenized = pre_tokenize_dataset(train_dataset, tokenizer, "train")
eval_dataset_tokenized = pre_tokenize_dataset(eval_dataset, tokenizer, "eval")
test_dataset_tokenized = pre_tokenize_dataset(test_dataset, tokenizer, "test")

# Causal-LM collator (mlm=False); padded lengths are rounded up to a
# multiple of 8.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
    pad_to_multiple_of=8,
)

print(
    "‚úÖ Pre-tokenization complete!",
    f"  Train: {len(train_dataset_tokenized)} samples",
    f"  Eval: {len(eval_dataset_tokenized)} samples",
    f"  Test: {len(test_dataset_tokenized)} samples",
    f"  Data collator: {type(data_collator).__name__}",
    sep="\n",
)

# From here on the plain dataset names refer to the tokenized data; the
# original "text" column is no longer available through them.
train_dataset, eval_dataset, test_dataset = (
    train_dataset_tokenized,
    eval_dataset_tokenized,
    test_dataset_tokenized,
)

print("üéØ Datasets ready for tensor-safe training!")

## 9. Start Training

In [None]:
# Start a Weights & Biases run only when an API key is present in the
# environment; the import stays inside the branch so wandb is optional.
if os.getenv("WANDB_API_KEY"):
    import wandb

    wandb.init(
        project="bitcoin-price-prediction-unsloth",
        name=training_args.run_name,
        config=CONFIG,
        tags=["unsloth", "qwen2.5", "bitcoin", "rtx3090"],
    )
    print("Weights & Biases initialized for Unsloth training")

# Banner summarising the run that is about to start.
print(
    "\nüöÄ Starting Unsloth accelerated training...",
    f"   Model: {CONFIG['model_name']}",
    f"   Dataset: {CONFIG['dataset_name']}",
    f"   Output: {CONFIG['output_dir']}",
    f"   Epochs: {CONFIG['num_train_epochs']}",
    f"   LoRA rank: {CONFIG['lora_r']}",
    f"   Precision: {'BFloat16' if CONFIG['bf16'] else 'Float16'}",
    f"   Sample outputs every: {CONFIG['show_sample_every_n_steps']} steps",
    sep="\n",
)

# Release cached GPU memory before the training loop allocates its own.
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Record the wall-clock start; later cells use it to compute the duration.
training_start_time = datetime.now()
print(f"\n‚è∞ Training started at: {training_start_time.strftime('%Y-%m-%d %H:%M:%S')}")
# NOTE(review): the estimate below is (optimizer steps // 60), i.e. it
# presumes roughly one step per second -- confirm before relying on it.
print(f"üéØ Expected completion: ~{CONFIG['num_train_epochs'] * len(train_dataset) // (CONFIG['per_device_train_batch_size'] * CONFIG['gradient_accumulation_steps']) // 60} minutes")
print("-" * 80)

In [None]:
# Train the model, then report and persist the headline training metrics.
#
# Only the training call itself is wrapped in the try block, so a problem
# while printing or writing the metrics file is not misreported as
# "Training failed".
try:
    train_result = trainer.train()
except Exception as e:
    print(f"‚ùå Training failed with error: {e}")
    raise  # re-raise the active exception unchanged

training_end_time = datetime.now()
training_duration = training_end_time - training_start_time

print("-" * 80)
print(f"‚úÖ Training completed!")
print(f"   Duration: {training_duration}")
print(f"   Final train loss: {train_result.training_loss:.4f}")
print(f"   Total steps: {train_result.global_step:,}")

# Save training metrics
metrics = {
    "training_loss": train_result.training_loss,
    "global_step": train_result.global_step,
    "training_duration": str(training_duration),
    "start_time": training_start_time.isoformat(),
    "end_time": training_end_time.isoformat()
}

# Make sure the target directory exists before writing into it.
os.makedirs(CONFIG['output_dir'], exist_ok=True)
with open(f"{CONFIG['output_dir']}/training_metrics.json", "w") as f:
    json.dump(metrics, f, indent=2)

## 10. Model Evaluation

In [None]:
# Run the trainer's evaluation loop, print every reported metric with four
# decimals, and store the raw results as JSON in the output directory.
print("üìä Evaluating model...")

eval_results = trainer.evaluate()
print("\nEvaluation Results:")
for metric_name, metric_value in eval_results.items():
    print(f"  {metric_name}: {metric_value:.4f}")

# Persist the metrics next to the other training artifacts.
with open(f"{CONFIG['output_dir']}/eval_results.json", "w") as f:
    json.dump(eval_results, f, indent=2)

print(f"\nEvaluation results saved to {CONFIG['output_dir']}/eval_results.json")

## 11. Model Testing and Inference

In [None]:
# Test inference on a few samples
def test_inference(model, tokenizer, test_samples, max_new_tokens=200):
    """Test model inference on samples"""
    model.eval()
    results = []
    
    for i, sample in enumerate(test_samples[:3]):  # Test first 3 samples
        print(f"\n--- Test Sample {i+1} ---")
        
        # Prepare input (without assistant response)
        messages = [
            {"role": "system", "content": sample["instruction"]},
            {"role": "user", "content": sample["input"]}
        ]
        
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        
        # Tokenize and generate
        inputs = tokenizer(text, return_tensors="pt").to(model.device)
        
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                temperature=0.1,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id
            )
        
        # Decode response
        response = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
        
        print(f"Expected: {sample['output']}")
        print(f"Generated: {response.strip()}")
        
        results.append({
            "sample_id": i,
            "expected": sample["output"],
            "generated": response.strip(),
            "input_length": len(sample["input"]),
        })
    
    return results

# Rebuild instruction / input / output triples from the ChatML-formatted test
# records so they can be fed to test_inference().
#
# `test_dataset` may already be the pre-tokenized version, whose original
# columns (including "text") were removed. In that case the text is recovered
# by decoding `input_ids` with special tokens kept, so the ChatML markers
# used for parsing below are still present.


def _chatml_turn(prompt_text, role):
    """Return the stripped content of the first *role* turn, or None if absent."""
    header = f'<|im_start|>{role}\n'
    start = prompt_text.find(header)
    if start == -1:
        return None
    start += len(header)
    end = prompt_text.find('<|im_end|>', start)
    if end == -1:
        # Unterminated turn: take everything up to the end of the text.
        end = len(prompt_text)
    return prompt_text[start:end].strip()


original_test_samples = []
for i in range(min(5, len(test_dataset))):
    record = test_dataset[i]
    if 'text' in record:
        text = record['text']
    else:
        text = tokenizer.decode(record['input_ids'], skip_special_tokens=False)

    # Exactly one assistant header is expected; records without it (e.g.
    # sequences truncated before the answer) or with several are skipped.
    parts = text.split('<|im_start|>assistant\n')
    if len(parts) != 2:
        continue

    input_part = parts[0]
    output_part = parts[1].replace('<|im_end|>', '').strip()

    instruction = _chatml_turn(input_part, 'system')
    user_input = _chatml_turn(input_part, 'user')
    if instruction is None or user_input is None:
        # A missing turn cannot be reconstructed reliably; skip the record
        # instead of slicing from a bogus offset.
        continue

    original_test_samples.append({
        "instruction": instruction,
        "input": user_input,
        "output": output_part
    })

print(f"Testing inference on {len(original_test_samples)} samples...")
test_results = test_inference(model, tokenizer, original_test_samples)

## 12. Save and Push Model

In [None]:
# Save the trained model in several formats (LoRA adapter, merged weights,
# GGUF), then store the training configuration and list the written files.
print("üíæ Saving Unsloth model...")

# Save LoRA adapter and tokenizer at the root of the output directory
model.save_pretrained(CONFIG['output_dir'])
tokenizer.save_pretrained(CONFIG['output_dir'])

# Save in multiple formats for flexibility
print("Saving in different formats...")

# 1. Save as standard LoRA adapter
model.save_pretrained(f"{CONFIG['output_dir']}/lora_adapter")

# 2. Save as merged model (optional - takes more space but easier to use).
# Unsloth's save_pretrained_merged is used instead of merge_and_unload():
# the latter folds the adapter into the live model object, which would
# affect the GGUF export below and any later attempt to push or reuse the
# LoRA adapter from `model`.
try:
    print("Saving merged model (this may take a moment)...")
    model.save_pretrained_merged(
        f"{CONFIG['output_dir']}/merged_model",
        tokenizer,
        save_method="merged_16bit",
    )
    tokenizer.save_pretrained(f"{CONFIG['output_dir']}/merged_model")
    print("‚úÖ Merged model saved successfully")
except Exception as e:
    # Best effort: the adapter saved above remains usable without the merge.
    print(f"‚ö†Ô∏è Could not save merged model: {e}")

# 3. Save as GGUF for inference (Unsloth feature)
try:
    print("Saving as GGUF for efficient inference...")
    model.save_pretrained_gguf(
        f"{CONFIG['output_dir']}/gguf",
        tokenizer,
        quantization_method="q4_k_m"  # 4-bit quantization
    )
    print("‚úÖ GGUF model saved successfully")
except Exception as e:
    # Best effort as well; GGUF export is optional.
    print(f"‚ö†Ô∏è Could not save GGUF model: {e}")

print(f"‚úÖ Unsloth model saved to: {CONFIG['output_dir']}")

# Save configuration
config_file = f"{CONFIG['output_dir']}/training_config.json"
with open(config_file, "w") as f:
    # Coerce the precision flags to plain bools so json can serialize them.
    config_copy = CONFIG.copy()
    config_copy['bf16'] = bool(CONFIG['bf16'])
    config_copy['fp16'] = bool(CONFIG['fp16'])
    json.dump(config_copy, f, indent=2)

print(f"‚úÖ Configuration saved to: {config_file}")

# List all saved files (paths relative to the output directory)
import os
saved_files = []
for root, dirs, files in os.walk(CONFIG['output_dir']):
    for file in files:
        saved_files.append(os.path.relpath(os.path.join(root, file), CONFIG['output_dir']))

print(f"\nüìÅ Files saved ({len(saved_files)} total):")
for f in sorted(saved_files)[:10]:  # Show first 10 files
    print(f"  ‚Ä¢ {f}")
if len(saved_files) > 10:
    print(f"  ... and {len(saved_files) - 10} more files")

In [None]:
# Push the model and tokenizer to the Hugging Face Hub (when a hub model id
# is configured) and write a model card (README.md) into the local output
# directory.
#
# The usage snippets embedded in the card mirror the generation code used in
# this notebook: the chat template is applied with a generation prompt, the
# resulting tensor is passed as `input_ids`, and only the newly generated
# tokens are decoded. The base model name and the sample-output interval are
# taken from CONFIG instead of being hard-coded.
if CONFIG.get('hub_model_id'):
    try:
        print(f"üöÄ Pushing Unsloth model to Hugging Face Hub: {CONFIG['hub_model_id']}")
        
        # Push the LoRA adapter
        model.push_to_hub(CONFIG['hub_model_id'], token=True)
        tokenizer.push_to_hub(CONFIG['hub_model_id'], token=True)
        
        # Create enhanced model card for Unsloth
        model_card = f"""---
license: apache-2.0
base_model: {CONFIG['model_name']}
library_name: unsloth
tags:
- bitcoin
- cryptocurrency
- trading
- price-prediction
- qwen2.5
- unsloth
- lora
- peft
- rtx3090-optimized
datasets:
- {CONFIG['dataset_name']}
metrics:
- loss
model-index:
- name: {CONFIG['hub_model_id']}
  results:
  - task:
      type: text-generation
      name: Bitcoin Price Prediction
    metrics:
    - type: loss
      value: {train_result.training_loss:.4f}
      name: training_loss
---

# Bitcoin Price Prediction - Qwen2.5 4B Unsloth LoRA

This model is a fine-tuned version of {CONFIG['model_name']} for Bitcoin price prediction and trading decisions, optimized using **Unsloth** for 2x faster training on RTX 3090.

## üöÄ Unsloth Optimizations

- **2x faster training** compared to standard methods
- **RTX 3090 optimized** with efficient memory usage
- **BFloat16 precision** for better numerical stability
- **Optimized gradient checkpointing** for memory efficiency

## Model Details

- **Base Model**: {CONFIG['model_name']}
- **Fine-tuning Method**: Unsloth LoRA (Low-Rank Adaptation)
- **Task**: Bitcoin trading decision and 10-day price forecasting
- **Dataset**: {CONFIG['dataset_name']}
- **Training Samples**: {len(train_dataset):,}

## Training Configuration

- **LoRA Rank**: {CONFIG['lora_r']} (optimized for Unsloth)
- **LoRA Alpha**: {CONFIG['lora_alpha']}
- **Learning Rate**: {CONFIG['learning_rate']}
- **Epochs**: {CONFIG['num_train_epochs']}
- **Batch Size**: {CONFIG['per_device_train_batch_size']} (effective: {CONFIG['per_device_train_batch_size'] * CONFIG['gradient_accumulation_steps']})
- **Precision**: {'BFloat16' if CONFIG['bf16'] else 'Float16'}

## Usage with Unsloth

```python
from unsloth import FastLanguageModel

# Load model and tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="{CONFIG['hub_model_id']}",
    max_seq_length=2048,
    dtype=None,  # Auto-detect
    load_in_4bit=True,
)

# Enable fast inference
FastLanguageModel.for_inference(model)

# Prepare input
messages = [
    {{"role": "system", "content": "Your instruction here..."}},
    {{"role": "user", "content": "Your trading context here..."}}
]

input_ids = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
).to(model.device)

# Generate prediction
outputs = model.generate(input_ids=input_ids, max_new_tokens=200, do_sample=True, temperature=0.1)
response = tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)
print(response)
```

## Standard Usage (without Unsloth)

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Load base model and tokenizer
base_model = AutoModelForCausalLM.from_pretrained("{CONFIG['model_name']}")
tokenizer = AutoTokenizer.from_pretrained("{CONFIG['model_name']}")

# Load LoRA adapter
model = PeftModel.from_pretrained(base_model, "{CONFIG['hub_model_id']}")

# Generate as usual...
```

## Training Results

- **Final Training Loss**: {train_result.training_loss:.4f}
- **Training Steps**: {train_result.global_step:,}
- **Training Duration**: {str(training_end_time - training_start_time)}
- **Speed**: ~2x faster than standard fine-tuning
- **GPU**: RTX 3090 (24GB VRAM)

## Performance Features

- ‚úÖ **Real-time output monitoring** every {CONFIG['show_sample_every_n_steps']} training steps
- ‚úÖ **Memory optimized** for RTX 3090
- ‚úÖ **GGUF export** for efficient deployment
- ‚úÖ **Merged model** option for easier inference

## Disclaimer

This model is for educational and research purposes only. It should not be used for actual financial trading decisions. 
Cryptocurrency trading involves significant risks and can result in substantial losses.

---

*Trained with [Unsloth](https://github.com/unslothai/unsloth) for 2x faster fine-tuning*
"""
        
        # Save model card locally.
        # NOTE(review): this file is written after the push calls above and
        # nothing in this cell uploads it -- confirm whether the card is
        # meant to be published with the model.
        with open(f"{CONFIG['output_dir']}/README.md", "w") as f:
            f.write(model_card)
        
        print(f"‚úÖ Unsloth model pushed to Hub: https://huggingface.co/{CONFIG['hub_model_id']}")
        
    except Exception as e:
        # Best effort: local artifacts remain available when the push fails.
        print(f"‚ùå Failed to push to Hub: {e}")
        print("   Model saved locally only")
else:
    print("‚ÑπÔ∏è Hub model ID not specified, skipping push to Hub")

## 13. Training Summary and Analysis

In [None]:
# Display a comprehensive training summary, save it as a text file in the
# output directory, and close the Weights & Biases run if one was started.
print("=" * 80)
print("üéØ TRAINING SUMMARY")
print("=" * 80)

# Count parameters once instead of scanning the model twice inside the
# f-string below.
total_params = model.num_parameters()
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

# The tokenization step reads CONFIG['max_seq_length']; accept either key so
# the summary does not fail when only one of them is configured.
max_length_setting = CONFIG.get('max_length', CONFIG.get('max_seq_length', 'N/A'))

summary = f"""
MODEL INFORMATION:
  Base Model: {CONFIG['model_name']}
  Output Directory: {CONFIG['output_dir']}
  Hub Model ID: {CONFIG.get('hub_model_id', 'Not specified')}

DATASET:
  Source: {CONFIG['dataset_name']}
  Total Samples: {len(formatted_dataset):,}
  Train Samples: {len(train_dataset):,}
  Eval Samples: {len(eval_dataset):,}
  Test Samples: {len(test_dataset):,}

TRAINING CONFIGURATION:
  LoRA Rank: {CONFIG['lora_r']}
  LoRA Alpha: {CONFIG['lora_alpha']}
  Learning Rate: {CONFIG['learning_rate']}
  Epochs: {CONFIG['num_train_epochs']}
  Batch Size: {CONFIG['per_device_train_batch_size']} (effective: {CONFIG['per_device_train_batch_size'] * CONFIG['gradient_accumulation_steps']})
  Max Length: {max_length_setting}

TRAINING RESULTS:
  Final Loss: {train_result.training_loss:.4f}
  Total Steps: {train_result.global_step:,}
  Duration: {training_end_time - training_start_time}
  Start Time: {training_start_time.strftime('%Y-%m-%d %H:%M:%S')}
  End Time: {training_end_time.strftime('%Y-%m-%d %H:%M:%S')}

MODEL PARAMETERS:
  Total Parameters: {total_params:,}
  Trainable Parameters: {trainable_params:,}
  Trainable Percentage: {100 * trainable_params / total_params:.2f}%

HARDWARE:
  Device: {model.device}
  GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}
  Memory Used: {torch.cuda.memory_allocated()/1024**3:.2f} GB

FILES CREATED:
  ‚Ä¢ {CONFIG['output_dir']}/pytorch_model.bin
  ‚Ä¢ {CONFIG['output_dir']}/adapter_config.json
  ‚Ä¢ {CONFIG['output_dir']}/tokenizer_config.json
  ‚Ä¢ {CONFIG['output_dir']}/training_metrics.json
  ‚Ä¢ {CONFIG['output_dir']}/eval_results.json
  ‚Ä¢ {CONFIG['output_dir']}/training_config.json
  ‚Ä¢ {CONFIG['output_dir']}/README.md
"""

print(summary)

# Save summary
with open(f"{CONFIG['output_dir']}/training_summary.txt", "w") as f:
    f.write(summary)

print(f"\n‚úÖ Training summary saved to: {CONFIG['output_dir']}/training_summary.txt")
print("\nüéâ Training completed successfully!")

# `wandb` is only imported by the training-start cell when the API key is
# set, which is the same condition checked here.
if os.getenv("WANDB_API_KEY"):
    wandb.finish()
    print("üìä Weights & Biases session closed")

## 14. Quick Model Test

In [None]:
# Smoke test: run one hand-written prompt through the trained model.
def quick_test():
    """Generate one answer for a fixed sample prompt and try to parse it as JSON.

    Uses the module-level ``model`` and ``tokenizer``. The model is switched
    to Unsloth inference mode first and ``model.train()`` is called at the
    end. Prints the timing, the raw response and, when a JSON object is
    found in the response, its keys and values.
    """
    print("üß™ Quick Unsloth Model Test")
    print("-" * 50)
    
    # Enable fast inference mode
    FastLanguageModel.for_inference(model)
    
    # System message: analysis framework plus the required output format.
    test_instruction = """CONTEXT DATE: 2024-01-15

ANALYSIS FRAMEWORK:
‚Ä¢ Technical Analysis: Use price trends, volatility, and momentum indicators
‚Ä¢ Macro Analysis: Consider gold/oil prices for broader market context
‚Ä¢ News Analysis: Integrate comprehensive daily news summaries for market catalysts

OUTPUT FORMAT (JSON ONLY):
Return a single JSON object with EXACTLY these keys:
{"action":"BUY|SELL|HOLD","confidence":<int 1-99>,"stop_loss":<price 2dp>,"take_profit":<price 2dp>,"forecast_10d":[<10 prices 2dp>]}
No extra text, no explanations, just the JSON."""
    
    # User message: a hand-written daily market context.
    test_input = """Daily Context ‚Äî 2024-01-15

[Technical Price Analysis]
- Current Price: $42,350.50
- 60-Day Range: $38,500.00 ‚Üí $45,200.00
- 1D Return: 2.3%
- 7D Return: 5.8%
- 30D Return: 12.4%
- Volatility (14d): 3.2%
- Avg Daily Change (14d): 650.30
- Drawdown from Max: -6.3%

[Price History (Last 60 Days USD)]
[41200.50, 41850.75, 42100.25, 42350.50, 42600.80, 43150.25, 42950.75, 42350.50, 42700.25, 43050.50]

[Macro & Commodities Context]
- Gold Price: $2,025.50
- Crude Oil Price: $72.35

[Market Context]
- Bitcoin dominates crypto market as leading digital asset
- Price influenced by adoption, regulation, and macro factors

Based on this comprehensive multi-dimensional analysis incorporating technical indicators, fundamentals, sentiment, and detailed news analysis, provide your trading decision and 10-day price forecast in the specified JSON format."""
    
    chat = [
        {"role": "system", "content": test_instruction},
        {"role": "user", "content": test_input},
    ]
    
    # Tokenize through the chat template; the generation prompt flag makes
    # the model answer as the assistant.
    prompt_ids = tokenizer.apply_chat_template(
        chat,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(model.device)
    
    print("Input prepared, generating response with Unsloth...")
    
    generation_kwargs = dict(
        max_new_tokens=300,
        do_sample=True,
        temperature=0.1,
        top_p=0.9,
        use_cache=True,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    
    # Time only the generate() call.
    started_at = datetime.now()
    with torch.no_grad():
        generated = model.generate(input_ids=prompt_ids, **generation_kwargs)
    elapsed = datetime.now() - started_at
    
    # Decode only the tokens produced after the prompt.
    prompt_length = prompt_ids.shape[-1]
    response = tokenizer.decode(generated[0][prompt_length:], skip_special_tokens=True)
    
    print(f"\n‚ö° Generation completed in: {elapsed.total_seconds():.2f} seconds")
    print("\nüì§ Unsloth Model Response:")
    print(response.strip())
    
    # Look for a JSON object in the response and pretty-print its fields.
    try:
        import re
        found = re.search(r'\{.*\}', response.strip(), re.DOTALL)
        if not found:
            print("\n‚ö†Ô∏è No JSON found in response")
        else:
            parsed = json.loads(found.group())
            print("\n‚úÖ JSON parsing successful:")
            for field, field_value in parsed.items():
                if field != "forecast_10d":
                    print(f"  {field}: {field_value}")
                else:
                    # Only the first three forecast entries are shown.
                    print(f"  {field}: {field_value[:3]}... (10 prices total)")
    except Exception as e:
        print(f"\n‚ùå JSON parsing failed: {e}")
    
    # Switch back to training mode if needed
    model.train()

print("üöÄ Testing Unsloth trained model...")
quick_test()

print("\n" + "="*80)
print("üéâ UNSLOTH TRAINING COMPLETE!")
print("="*80)
print(f"‚úÖ Model saved to: {CONFIG['output_dir']}")
print(f"‚úÖ Training optimized with Unsloth (2x faster)")
print(f"‚úÖ RTX 3090 memory optimized")
print(f"‚úÖ Real-time outputs every {CONFIG['show_sample_every_n_steps']} steps")
print(f"‚úÖ Multiple export formats available")
if CONFIG.get('hub_model_id'):
    print(f"‚úÖ Model pushed to: https://huggingface.co/{CONFIG['hub_model_id']}")
print("="*80)