# 🚀 Bitcoin News Effects Training with Unsloth

Training **Qwen2.5-4B-Instruct** to analyze Bitcoin news effects using **Unsloth** for RTX 3090 optimization.

## Key Features:
- ✅ **News Effects Analysis**: Train on individual news items and their Bitcoin price impacts
- ✅ **Unsloth Acceleration**: 2x faster training optimized for RTX 3090
- ✅ **Real-time Monitoring**: See model outputs every 50 steps during training
- ✅ **Memory Optimized**: Efficient training for 24GB VRAM
- ✅ **Multiple Export Formats**: LoRA, merged, GGUF for deployment

## 📊 Dataset Overview

This training uses your **Bitcoin News Effects Dataset** which contains:
- **Individual news items** from daily Bitcoin coverage
- **News analysis** with sentiment, impact direction, magnitude
- **Structured outputs** for price effect prediction
- **Market context** with daily recommendations and probabilities

**Training Task**: Given a news item, predict its effect on Bitcoin price with structured JSON output.

In [None]:
# üîß Install Unsloth and dependencies for RTX 3090 training
import subprocess
import sys
import os

def install_unsloth():
    """Install Unsloth and its training dependencies with pip.

    Each requirement group is installed by a separate ``pip install`` run that
    uses the interpreter executing this code (``sys.executable -m pip``), so
    the packages land in the environment the notebook kernel is using. A
    failing command is reported with pip's stderr and the remaining commands
    are still attempted.

    Returns:
        bool: ``True`` if every pip command exited with status 0, ``False``
        otherwise.
    """
    print("‚ö° Installing Unsloth for RTX 3090...")

    # Requirements are kept as explicit argument lists. The Unsloth entry is a
    # single PEP 508 direct reference ("name[extra] @ url"); splitting it on
    # whitespace would hand pip three unrelated arguments.
    requirement_groups = [
        ["unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"],
        ["--no-deps", "trl", "peft", "accelerate", "bitsandbytes"],
        ["datasets", "transformers", "torch", "torchvision", "torchaudio"],
        ["wandb", "tensorboard"],
    ]

    all_ok = True
    for requirements in requirement_groups:
        cmd = [sys.executable, "-m", "pip", "install", *requirements]
        print(f"Running: {' '.join(cmd)}")
        result = subprocess.run(cmd, capture_output=True, text=True)
        if result.returncode != 0:
            all_ok = False
            print(f"‚ö†Ô∏è Warning: {result.stderr}")
        else:
            print("‚úÖ Installed successfully")
    return all_ok

# Run the installation when the cell executes.
# NOTE(review): the completion message below is printed unconditionally; it
# does not reflect whether any pip command reported a failure.
install_unsloth()
print("üéâ Unsloth installation complete!")

In [None]:
# üìö Import required libraries
import torch
import json
import pandas as pd
from datetime import datetime
from datasets import Dataset, load_dataset
from transformers import TrainingArguments, DataCollatorForLanguageModeling
from trl import SFTTrainer
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
import gc

# Report the PyTorch build and, when a CUDA device is visible, the name and
# total memory (in GiB) of device 0.
print(f"üî• PyTorch version: {torch.__version__}")
print(f"üéØ CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"üöÄ GPU: {torch.cuda.get_device_name(0)}")
    print(f"üíæ GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")

print("‚úÖ All libraries imported successfully!")

In [None]:
# Bitcoin News Effects training configuration.
# Builds the CONFIG dict read by the later cells, prints it, and exposes
# max_seq_length as a module-level shortcut.
print("‚öôÔ∏è Setting up Bitcoin News Effects training configuration...")

# Unsloth's capability probe is the single source for the precision flags, so
# it is imported before CONFIG is built instead of patching the dict afterwards.
from unsloth import is_bfloat16_supported

CONFIG = {
    # Model settings
    # NOTE(review): confirm this repository id exists on the Hub before running.
    "model_name": "unsloth/Qwen2.5-4B-Instruct",
    "max_seq_length": 2048,
    "dtype": None,  # Auto-detect
    "load_in_4bit": True,

    # Dataset settings
    "dataset_name": "tahamajs/bitcoin-news-effects-dataset",  # Update with your dataset
    "dataset_split": "train",

    # LoRA settings
    "lora_rank": 16,
    "lora_alpha": 16,
    "lora_dropout": 0,
    "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],

    # Training settings
    "output_dir": "./bitcoin_news_effects_model",
    "run_name": f"bitcoin-news-effects-{datetime.now().strftime('%Y%m%d-%H%M%S')}",
    "num_train_epochs": 2,
    # NOTE(review): the trainer cell hard-codes per_device_train_batch_size=1
    # and gradient_accumulation_steps=8, so the two values below are only
    # reported here and are not what TrainingArguments receives.
    "per_device_train_batch_size": 2,
    "gradient_accumulation_steps": 4,
    "learning_rate": 2e-4,
    "weight_decay": 0.01,
    "lr_scheduler_type": "linear",
    "warmup_steps": 5,

    # Optimization settings: exactly one of fp16 / bf16 is enabled.
    "fp16": not is_bfloat16_supported(),
    "bf16": is_bfloat16_supported(),
    "gradient_checkpointing": True,

    # Logging
    "logging_steps": 10,
    "save_steps": 100,
    "show_sample_every_n_steps": 50,

    # Hub settings
    "push_to_hub": False,  # Set to True if you want to push to HuggingFace
    "hub_model_id": "tahamajs/bitcoin-news-effects-qwen2.5-4b",  # Update with your username
}

print("‚úÖ Configuration loaded:")
for key, value in CONFIG.items():
    print(f"  {key}: {value}")

# Set max_seq_length for easy access
max_seq_length = CONFIG["max_seq_length"]

print(f"\nüéØ Training optimized for RTX 3090:")
print(f"  Precision: {'BFloat16' if CONFIG['bf16'] else 'Float16'}")
print(f"  Batch size: {CONFIG['per_device_train_batch_size']}")
print(f"  Sequence length: {CONFIG['max_seq_length']}")
print("‚úÖ Configuration ready!")

In [None]:
# This cell will be moved to the proper location after model setup

In [None]:
# Load the base model with Unsloth, attach LoRA adapters, configure the
# tokenizer, and print parameter statistics. Defines the module-level names
# `model` and `tokenizer` used by every later cell.
print("üöÄ Loading Qwen2.5-4B-Instruct with Unsloth acceleration...")

# Load model and tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=CONFIG["model_name"],
    max_seq_length=CONFIG["max_seq_length"],
    dtype=CONFIG["dtype"],
    load_in_4bit=CONFIG["load_in_4bit"],
)

print(f"‚úÖ Model loaded: {CONFIG['model_name']}")
print(f"üìè Max sequence length: {CONFIG['max_seq_length']}")
print(f"üíæ Model dtype: {model.dtype}")
print(f"üèÉ Unsloth acceleration: Active")

# Setup LoRA for efficient fine-tuning
print("\n‚ö° Adding LoRA adapters...")
model = FastLanguageModel.get_peft_model(
    model,
    r=CONFIG["lora_rank"],
    target_modules=CONFIG["target_modules"],
    lora_alpha=CONFIG["lora_alpha"],
    lora_dropout=CONFIG["lora_dropout"],
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
)

print(f"‚úÖ LoRA configured:")
print(f"   Rank: {CONFIG['lora_rank']}")
print(f"   Alpha: {CONFIG['lora_alpha']}")
print(f"   Dropout: {CONFIG['lora_dropout']}")
print(f"   Target modules: {len(CONFIG['target_modules'])} layers")

# Setup chat template for proper formatting.
# Any failure here is caught and only reported, so the tokenizer keeps
# whatever template it was loaded with.
# NOTE(review): confirm that "qwen" is a template name accepted by the
# installed unsloth version; if it is not, this step silently falls back.
# NOTE(review): the training texts built later use "### Instruction / Input /
# Response" markers rather than this chat template.
print("\nüìù Setting up chat template...")
try:
    tokenizer = get_chat_template(
        tokenizer,
        chat_template="qwen",
    )
    print("‚úÖ Qwen chat template configured")
except Exception as e:
    print(f"‚ö†Ô∏è Chat template setup failed: {e}")
    print("üìù Using default tokenizer configuration")

# Ensure tokenizer has proper settings: reuse EOS as the padding token only
# when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    print("‚úÖ Pad token set to EOS token")

print("üéâ Unsloth model setup complete!")

# Display model info: element counts over all parameters and over those with
# requires_grad set.
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"\nüìä Model Statistics:")
print(f"   Total parameters: {total_params:,}")
print(f"   Trainable parameters: {trainable_params:,}")
print(f"   Trainable percentage: {100 * trainable_params / total_params:.2f}%")

if torch.cuda.is_available():
    print(f"   GPU memory allocated: {torch.cuda.memory_allocated()/1024**3:.2f} GB")

In [None]:
# Placeholder cell: it only prints status messages. The dataset itself is
# loaded in the formatting cell so that loading and formatting run in order.
print(" Dataset will be loaded in the formatting section...")
print("‚úÖ Proceeding to dataset formatting...")

In [None]:
# Prompt formatting for supervised fine-tuning
print("üîß Setting up dataset formatting...")

def formatting_prompts_func(examples):
    """Render instruction/input/output records as instruction-style prompts.

    ``examples`` is either one record (its ``"instruction"`` value is a string)
    or a batch whose ``"instruction"``, ``"input"`` and ``"output"`` values are
    equally long sequences. Each field is converted to ``str`` and stripped
    (``None`` becomes ``""``); records with any empty field are reported and
    skipped.

    Returns:
        dict: ``{"text": [...]}`` with one prompt per kept record.

    Raises:
        ValueError: if the three fields differ in length, or if no record
            produced a prompt.
    """
    if isinstance(examples.get("instruction"), str):
        # A single record: wrap each field so the loop below handles one row.
        instructions, inputs, outputs = (
            [examples["instruction"]],
            [examples["input"]],
            [examples["output"]],
        )
    else:
        instructions, inputs, outputs = (
            examples["instruction"],
            examples["input"],
            examples["output"],
        )

    sizes = (len(instructions), len(inputs), len(outputs))
    if len(set(sizes)) != 1:
        raise ValueError(f"Length mismatch: {sizes[0]} != {sizes[1]} != {sizes[2]}")

    def _clean(value):
        # None is treated as missing; everything else is stringified.
        return "" if value is None else str(value).strip()

    formatted_texts = []
    for i, row in enumerate(zip(instructions, inputs, outputs)):
        try:
            instruction, input_text, output_text = (_clean(field) for field in row)

            if not (instruction and input_text and output_text):
                print(f"‚ö†Ô∏è Skipping empty sample {i}")
                continue

            # Plain marker-based layout (no chat template).
            formatted_texts.append(
                f"### Instruction:\n{instruction}\n\n"
                f"### Input:\n{input_text}\n\n"
                f"### Response:\n{output_text}"
            )
        except Exception as e:
            # Best effort: one bad record must not abort the whole batch.
            print(f"‚ùå Error formatting sample {i}: {e}")
            continue

    if not formatted_texts:
        raise ValueError("‚ùå No valid formatted texts produced")

    print(f"‚úÖ Successfully formatted {len(formatted_texts)} samples")
    return {"text": formatted_texts}

# Load the training data, trying three sources in order: the Hugging Face
# dataset named in CONFIG, a local JSON export, and finally a one-record
# synthetic sample. Each fallback runs only when the previous step raised.
print("üîß Loading and processing Bitcoin News Effects Dataset...")

# Load dataset with comprehensive error handling
dataset = None
try:
    # Try loading from HuggingFace first
    print(f"üìÅ Loading dataset: {CONFIG['dataset_name']}")
    dataset = load_dataset(CONFIG["dataset_name"], split=CONFIG["dataset_split"])
    print(f"‚úÖ Loaded from HuggingFace: {len(dataset)} samples")
    
except Exception as hf_error:
    print(f"‚ö†Ô∏è HuggingFace load failed: {hf_error}")
    print("üîÑ Trying local dataset...")
    
    # Fallback to local files
    try:
        import glob
        local_files = glob.glob("../bitcoin_news_effects_dataset_*.json")
        if not local_files:
            # NOTE(review): machine-specific absolute path; it only matches on
            # the original author's workstation.
            local_files = glob.glob("/Users/tahamajs/Documents/uni/LLM/Files/Final Project/bitcoin_news_effects_dataset_*.json")
        
        if local_files:
            # max() on path strings picks the lexicographically greatest name;
            # that is the newest file only if the file names sort
            # chronologically.
            latest_file = max(local_files)
            print(f"üìÅ Using local file: {latest_file}")
            
            df = pd.read_json(latest_file)
            dataset = Dataset.from_pandas(df)
            print(f"‚úÖ Loaded local dataset: {len(dataset)} samples")
        else:
            raise FileNotFoundError("No local dataset files found")
            
    except Exception as local_error:
        print(f"‚ùå Local load failed: {local_error}")
        print("üí° Creating sample dataset for testing...")
        
        # Create minimal test dataset.
        # NOTE(review): after this branch the rest of the notebook proceeds
        # with a single synthetic record; a real training run should stop
        # here instead.
        sample_data = [
            {
                "instruction": "Analyze Bitcoin news and predict price impact with JSON output.",
                "input": "Bitcoin ETFs see massive $500M inflows as institutional adoption grows.",
                "output": '{"sentiment": "bullish", "price_direction": "up", "impact_strength": "high", "confidence": 0.85}'
            }
        ]
        dataset = Dataset.from_list(sample_data)
        print(f"‚úÖ Created test dataset: {len(dataset)} samples")

# Defensive check; every branch above either assigns `dataset` or raises.
if dataset is None:
    raise ValueError("‚ùå Failed to load any dataset")

print(f"\nüìä Dataset Analysis:")
print(f"  Total samples: {len(dataset)}")
print(f"  Features: {list(dataset.features.keys())}")

# Comprehensive data cleaning and validation.
# Keeps only records whose instruction, input and output are all non-empty
# after str()/strip(), then rebuilds `dataset` from the kept records (so any
# other columns are dropped).
print("üßπ Cleaning and validating dataset...")

clean_data = []
for i, sample in enumerate(dataset):
    try:
        # Extract and validate fields (missing keys default to "")
        instruction = sample.get("instruction", "")
        input_text = sample.get("input", "")
        output_text = sample.get("output", "")
        
        # Clean and validate: any falsy value (None, "", 0, ...) becomes ""
        instruction = str(instruction).strip() if instruction else ""
        input_text = str(input_text).strip() if input_text else ""
        output_text = str(output_text).strip() if output_text else ""
        
        # Only keep samples with all required content
        if instruction and input_text and output_text:
            clean_data.append({
                "instruction": instruction,
                "input": input_text,
                "output": output_text
            })
        else:
            print(f"‚ö†Ô∏è Skipping invalid sample {i}")
            
    except Exception as e:
        # A record that cannot be read is reported and skipped.
        print(f"‚ùå Error cleaning sample {i}: {e}")
        continue

print(f"‚úÖ Cleaned dataset: {len(clean_data)}/{len(dataset)} valid samples")

if len(clean_data) == 0:
    raise ValueError("‚ùå No valid samples after cleaning")

# Create clean dataset
dataset = Dataset.from_list(clean_data)
print(f"‚úÖ Dataset reconstruction complete: {len(dataset)} samples")

# Format dataset with careful batch processing.
# Produces `formatted_dataset` with a single "text" column. Dataset.map is
# tried first; if it raises, every record is formatted individually instead.
print("üéØ Formatting dataset for training...")

try:
    # Try batch formatting first.
    # With batch_size=1 the formatting function is called once per record, so
    # its per-call status line is printed for every sample.
    formatted_dataset = dataset.map(
        formatting_prompts_func,
        batched=True,
        batch_size=1,  # Process one at a time to avoid issues
        remove_columns=dataset.column_names,
        desc="Formatting samples"
    )
    print(f"‚úÖ Batch formatting successful: {len(formatted_dataset)} samples")
    
except Exception as batch_error:
    print(f"‚ö†Ô∏è Batch formatting failed: {batch_error}")
    print("üîÑ Using individual sample processing...")
    
    # Fallback to individual processing
    formatted_texts = []
    for i, sample in enumerate(dataset):
        try:
            result = formatting_prompts_func(sample)
            if "text" in result and result["text"]:
                # Ensure we get the actual string, not a list
                text = result["text"]
                if isinstance(text, list):
                    text = text[0] if text else ""
                if isinstance(text, str) and text.strip():
                    formatted_texts.append(text)
        except Exception as sample_error:
            # The formatting function raises when a record yields no text;
            # such records are reported and left out.
            print(f"‚ö†Ô∏è Failed to format sample {i}: {sample_error}")
    
    if formatted_texts:
        formatted_dataset = Dataset.from_dict({"text": formatted_texts})
        print(f"‚úÖ Individual formatting successful: {len(formatted_dataset)} samples")
    else:
        raise ValueError("‚ùå No samples could be formatted")

# Final validation to prevent tensor errors.
# Spot-checks only the first (up to) five formatted records: each must have a
# non-empty string under "text" that the tokenizer can encode. All problems
# are collected first and then reported together before raising.
print("üîç Final validation...")
validation_errors = []

for i in range(min(5, len(formatted_dataset))):
    sample = formatted_dataset[i]
    
    # Check structure
    if "text" not in sample:
        validation_errors.append(f"Sample {i}: missing 'text' key")
        continue
    
    text = sample["text"]
    
    # Check type
    if not isinstance(text, str):
        validation_errors.append(f"Sample {i}: text is {type(text)}, expected str")
        continue
    
    # Check content
    if not text.strip():
        validation_errors.append(f"Sample {i}: empty text")
        continue
    
    # Test tokenization
    try:
        test_tokens = tokenizer(text, truncation=True, max_length=CONFIG["max_seq_length"])
        if len(test_tokens["input_ids"]) == 0:
            validation_errors.append(f"Sample {i}: tokenization produced empty result")
    except Exception as token_error:
        validation_errors.append(f"Sample {i}: tokenization failed - {token_error}")

if validation_errors:
    print(f"‚ùå Validation errors found:")
    for error in validation_errors:
        print(f"  {error}")
    raise ValueError("Dataset validation failed")

print(f"‚úÖ Dataset validation passed!")
print(f"üìä Final dataset: {len(formatted_dataset)} samples ready for training")

# Show sample: the first record, cut to 300 characters plus "..." when longer.
if len(formatted_dataset) > 0:
    sample_text = formatted_dataset[0]["text"]
    print(f"\nüìù Sample formatted text (first 300 chars):")
    print(sample_text[:300] + "..." if len(sample_text) > 300 else sample_text)

print("‚úÖ Dataset formatting complete and validated!")

In [None]:
# üìä Custom callback to monitor training outputs every 50 steps
from transformers import TrainerCallback
import random

class NewsEffectsOutputCallback(TrainerCallback):
    """Print a sample prediction next to its reference during training.

    Every ``show_every_n_steps`` steps the callback picks one of the first (up
    to 10) formatted samples, rebuilds the prompt without the reference
    answer, generates a response with the model being trained and prints it
    beside the expected output. When both texts contain a JSON object, a
    per-key comparison is printed as well.

    Any failure inside the callback is reported and swallowed so that
    monitoring can never abort the training run.
    """

    def __init__(self, model, tokenizer, dataset, show_every_n_steps=50):
        """Store the collaborators and choose which samples to monitor.

        Args:
            model: Model being trained; used for generation.
            tokenizer: Tokenizer matching ``model``.
            dataset: Indexable collection whose items expose the formatted
                prompt under the ``"text"`` key.
            show_every_n_steps: Interval, in trainer steps, between sample
                outputs. Values <= 0 disable the output.
        """
        self.model = model
        self.tokenizer = tokenizer
        self.dataset = dataset
        self.show_every_n_steps = show_every_n_steps

        # Monitor at most the first 10 samples.
        self.sample_indices = list(range(min(10, len(dataset))))
        if self.sample_indices:
            print(f"üéØ Monitoring callback ready with {len(self.sample_indices)} samples")
        else:
            print("‚ö†Ô∏è No samples available for monitoring")

    def on_step_end(self, args, state, control, **kwargs):
        """Generate and print one monitored sample when the step is due."""
        due = (
            self.show_every_n_steps > 0  # also avoids a modulo-by-zero
            and state.global_step > 0
            and state.global_step % self.show_every_n_steps == 0
        )
        if not due or not self.sample_indices:
            return

        print(f"\n" + "="*80)
        print(f"üìä TRAINING STEP {state.global_step} - News Effects Sample Output")
        print(f"‚è∞ Time: {datetime.now().strftime('%H:%M:%S')}")
        print("="*80)

        try:
            sample_idx = random.choice(self.sample_indices)

            try:
                sample_text = self.dataset[sample_idx]["text"]
            except (KeyError, TypeError, IndexError):
                print("‚ö†Ô∏è Unexpected dataset format for monitoring")
                return

            parsed = self._split_sample(sample_text)
            if parsed is None:
                return
            instruction_part, input_text, expected_output = parsed

            # Same layout as the training text, minus the reference answer.
            prompt = f"### Instruction:\n{instruction_part}\n\n### Input:\n{input_text}\n\n### Response:\n"

            generated_text, generation_time = self._generate(prompt)

            print(f"üì∞ Input (truncated):")
            print(f"   {input_text[:150]}{'...' if len(input_text) > 150 else ''}")

            print(f"\nüéØ Expected:")
            print(f"   {expected_output[:100]}{'...' if len(expected_output) > 100 else ''}")

            print(f"\nü§ñ Generated ({generation_time.total_seconds():.2f}s):")
            print(f"   {generated_text[:200]}{'...' if len(generated_text) > 200 else ''}")

            self._print_comparison(expected_output, generated_text)

            print("="*80)

        except Exception as callback_error:
            # Monitoring is best effort; report and let training continue.
            print(f"‚ùå Callback error: {callback_error}")
            print("="*80)
        finally:
            # Release cached GPU memory used by generation.
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

    @staticmethod
    def _split_sample(sample_text):
        """Split a formatted sample into (instruction, input, expected output).

        Returns ``None`` after printing the reason when a marker is missing.
        Only the first occurrence of each marker is used as a separator, so
        marker-like text inside the input or response is kept intact.
        """
        parts = sample_text.split("### Input:", 1)
        if len(parts) < 2:
            print("‚ö†Ô∏è Could not parse sample for monitoring")
            return None

        instruction_part = parts[0].replace("### Instruction:", "").strip()

        response_split = parts[1].split("### Response:", 1)
        if len(response_split) < 2:
            print("‚ö†Ô∏è Could not parse sample response")
            return None

        return instruction_part, response_split[0].strip(), response_split[1].strip()

    def _generate(self, prompt):
        """Generate a completion for ``prompt``.

        Returns:
            tuple: ``(generated_text, elapsed)`` where ``elapsed`` is a
            ``timedelta`` covering the ``generate`` call.
        """
        inputs = self.tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=1500  # Leave room for generation
        ).to(self.model.device)

        start_time = datetime.now()
        try:
            FastLanguageModel.for_inference(self.model)
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=200,
                    do_sample=True,
                    temperature=0.1,
                    top_p=0.9,
                    pad_token_id=self.tokenizer.eos_token_id,
                    eos_token_id=self.tokenizer.eos_token_id
                )
            generation_time = datetime.now() - start_time
        finally:
            # Always hand the model back in training mode, even when
            # generation fails; otherwise the remaining steps would run with
            # the inference configuration.
            FastLanguageModel.for_training(self.model)
            self.model.train()

        # Decode only the newly generated tokens, not the prompt.
        generated_text = self.tokenizer.decode(
            outputs[0][inputs.input_ids.shape[1]:],
            skip_special_tokens=True
        ).strip()
        return generated_text, generation_time

    @staticmethod
    def _print_comparison(expected_output, generated_text):
        """Print a per-key comparison when both texts contain a JSON object."""
        try:
            import re

            def _first_json(text):
                # Greedy match from the first "{" to the last "}".
                found = re.search(r'\{.*\}', text, re.DOTALL)
                return json.loads(found.group()) if found else None

            expected_json = _first_json(expected_output)
            generated_json = _first_json(generated_text)

            if expected_json and generated_json:
                print(f"\nüìä Comparison:")
                for key in ['sentiment', 'price_direction', 'impact_strength', 'confidence']:
                    exp_val = expected_json.get(key, 'N/A')
                    gen_val = generated_json.get(key, 'N/A')
                    match = "‚úÖ" if exp_val == gen_val else "‚ùå"
                    print(f"   {key}: {exp_val} ‚Üí {gen_val} {match}")

        except Exception as parse_error:
            print(f"‚ö†Ô∏è JSON parsing failed: {parse_error}")

print("üìä News effects monitoring callback defined")
print("   Will be initialized after dataset is ready")

In [None]:
# Pre-tokenize dataset and configure LM data collator to avoid 'text' nesting errors
print("Pre-tokenizing formatted dataset...")

def tokenize_function(batch):
    """Tokenize formatted training texts, ending each one with the EOS token.

    Args:
        batch: Mapping with a ``"text"`` entry that is either a list of
            strings (batched ``Dataset.map``) or a single string.

    Returns:
        The tokenizer output for the texts, truncated to
        ``CONFIG["max_seq_length"]`` and left unpadded (padding is the
        collator's job).
    """
    texts = batch["text"] if isinstance(batch.get("text"), list) else [batch["text"]]

    # Sanitize any problematic unicode
    def _safe(s):
        s = str(s)
        try:
            return s.encode("utf-8", "replace").decode("utf-8")
        except Exception:
            return str(s)

    # The dataset is tokenized here rather than by the trainer, so the
    # end-of-sequence marker has to be appended explicitly; without it the
    # model is never shown where a response stops. Texts that already end
    # with the marker are left alone.
    eos_token = tokenizer.eos_token or ""
    texts = [_safe(t) for t in texts]
    if eos_token:
        texts = [t if t.endswith(eos_token) else t + eos_token for t in texts]

    return tokenizer(
        texts,
        max_length=CONFIG["max_seq_length"],
        truncation=True,
        padding=False,
        return_attention_mask=True,
    )

# Tokenize in batches. Produces `tokenized_dataset` / `train_tokenized`
# (token columns only) and `lm_collator`, all consumed by the trainer cell.
tokenized_dataset = formatted_dataset.map(
    tokenize_function,
    batched=True,
    desc="Tokenizing dataset",
)

# Ensure raw text is not used by the collator
if "text" in tokenized_dataset.column_names:
    tokenized_dataset = tokenized_dataset.remove_columns(["text"])

print("Tokenization complete. Columns:", tokenized_dataset.column_names)

# Use full tokenized dataset for training (no validation split is held out)
train_tokenized = tokenized_dataset

# Configure collator for Causal LM (mlm=False disables masked-LM masking).
# NOTE(review): no prompt masking is configured here, so the loss presumably
# covers the instruction/input tokens as well as the response; confirm that is
# intended.
from transformers import DataCollatorForLanguageModeling
lm_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)
print("Causal LM data collator configured")

In [None]:
# Configure Unsloth SFT Trainer for RTX 3090 - Ultra Safe Mode.
# This part creates the monitoring callback and the TrainingArguments.
print("Setting up Unsloth SFT Trainer in ULTRA SAFE MODE...")

# Initialize the monitoring callback now that we have model, tokenizer, and dataset
print("Initializing monitoring callback...")
output_callback = NewsEffectsOutputCallback(
    model=model,
    tokenizer=tokenizer,
    dataset=formatted_dataset,  # keep original formatted text for readable monitoring
    show_every_n_steps=CONFIG['show_sample_every_n_steps']
)

# Ultra-conservative training arguments for maximum stability
print("Setting up ULTRA SAFE training arguments...")
training_args = TrainingArguments(
    # Basic settings
    output_dir=CONFIG["output_dir"],
    run_name=CONFIG["run_name"],
    num_train_epochs=CONFIG["num_train_epochs"],
    
    # ULTRA CONSERVATIVE batch settings.
    # NOTE(review): these literals replace CONFIG["per_device_train_batch_size"]
    # (2) and CONFIG["gradient_accumulation_steps"] (4); the effective batch
    # stays at 8, but the values printed by the configuration cell no longer
    # describe the actual run.
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    
    # Learning rate settings
    learning_rate=CONFIG["learning_rate"],
    weight_decay=CONFIG["weight_decay"],
    lr_scheduler_type=CONFIG["lr_scheduler_type"],
    warmup_steps=CONFIG["warmup_steps"],
    
    # ULTRA SAFE memory optimization
    fp16=CONFIG["fp16"],
    bf16=CONFIG["bf16"],
    gradient_checkpointing=CONFIG["gradient_checkpointing"],
    dataloader_num_workers=0,
    
    # ULTRA SAFE data handling
    remove_unused_columns=False,
    dataloader_pin_memory=False,
    group_by_length=False,
    
    # Logging and saving (only the most recent checkpoint is kept)
    logging_steps=CONFIG["logging_steps"],
    save_steps=CONFIG["save_steps"],
    save_total_limit=1,
    
    # Disable complex features that could cause issues
    eval_strategy="no",
    save_strategy="steps",
    prediction_loss_only=True,
    
    # Hub settings: hard-coded off here; CONFIG["push_to_hub"] is only read by
    # the saving cell.
    push_to_hub=False,
    
    # ULTRA SAFE misc settings
    seed=42,
    data_seed=42,
    report_to="none",
    ignore_data_skip=True,
    disable_tqdm=False,
)

print("Ultra-safe training arguments configured")

# FINAL pre-training validation.
# Sanity check only: it tokenizes the first two raw formatted texts, once
# individually and once as a padded batch, and checks the resulting tensor
# shapes. The trainer itself is given the pre-tokenized `train_tokenized`
# dataset, so this does not replay the trainer's data path.
print("FINAL pre-training validation...")

# Test exactly what the trainer will do
print("Testing EXACT trainer data processing...")
try:
    # Get samples in the exact format the trainer uses
    sample_texts = [formatted_dataset[i]["text"] for i in range(min(2, len(formatted_dataset)))]
    
    print(f"Testing {len(sample_texts)} samples:")
    for i, text in enumerate(sample_texts):
        s = str(text)
        try:
            s = s.encode("utf-8", "replace").decode("utf-8")
        except Exception:
            pass
        print(f"   Sample {i}: type={type(text)}, len={len(s)}")
        
        # Test individual tokenization
        individual_tokens = tokenizer(
            s,
            truncation=True,
            max_length=CONFIG["max_seq_length"],
            return_tensors="pt"
        )
        print(f"   Sample {i} individual: OK {individual_tokens['input_ids'].shape}")
        del individual_tokens
    
    # Test batch tokenization (the critical operation)
    print("Testing critical batch tokenization...")
    batch_tokens = tokenizer(
        sample_texts,
        truncation=True,
        padding=True,
        max_length=CONFIG["max_seq_length"],
        return_tensors="pt"
    )
    
    print("CRITICAL TEST PASSED!")
    print(f"   Batch shape: {batch_tokens['input_ids'].shape}")
    print(f"   Batch dtype: {batch_tokens['input_ids'].dtype}")
    print(f"   Min token: {batch_tokens['input_ids'].min()}")
    print(f"   Max token: {batch_tokens['input_ids'].max()}")
    
    # Validate tensor properties: ids and mask must both be 2-D and congruent.
    # NOTE(review): assert statements are skipped when Python runs with -O.
    assert batch_tokens['input_ids'].dim() == 2
    assert batch_tokens['attention_mask'].dim() == 2
    assert batch_tokens['input_ids'].shape == batch_tokens['attention_mask'].shape
    
    print("All tensor validations passed!")
    
    # Clean up
    del batch_tokens, sample_texts
    torch.cuda.empty_cache() if torch.cuda.is_available() else None
    
except Exception as critical_error:
    # Report, then re-raise so the notebook stops before training starts.
    print(f"CRITICAL ERROR: Final validation failed: {critical_error}")
    print("This means the trainer WILL fail")
    raise critical_error

# Create ULTRA SAFE SFT Trainer that uses tokenized dataset and LM collator.
# NOTE(review): tokenizer=, dataset_text_field=, max_seq_length=,
# dataset_num_proc= and packing= are passed directly to SFTTrainer; whether
# the constructor accepts them depends on the installed trl/unsloth versions -
# confirm against the pinned versions.
print("Initializing ULTRA SAFE SFT Trainer...")
try:
    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=train_tokenized,
        dataset_text_field=None,
        max_seq_length=CONFIG["max_seq_length"],
        dataset_num_proc=1,
        packing=False,
        data_collator=lm_collator,
        args=training_args,
        callbacks=[output_callback],
    )
    
    print("ULTRA SAFE SFT Trainer initialized successfully!")
    
    # Validate trainer state
    print("Trainer validation:")
    print(f"   Dataset size: {len(trainer.train_dataset)}")
    print(f"   Model device: {trainer.model.device}")
    print(f"   Tokenizer pad token: {trainer.tokenizer.pad_token}")
    
except Exception as trainer_error:
    # Report, then re-raise so later cells do not run without a trainer.
    print(f"ULTRA SAFE trainer initialization FAILED: {trainer_error}")
    raise trainer_error

# Summary. The batch figures below are literals that mirror the values
# hard-coded in TrainingArguments (1 and 8); keep them in sync by hand.
print("")
print("ULTRA SAFE TRAINER SUMMARY:")
print(f"   Dataset: {len(train_tokenized)} samples")
print(f"   Epochs: {CONFIG['num_train_epochs']}")
print("   Batch size: 1 (ultra-conservative)")
print(f"   Effective batch: {1 * 8} (with accumulation)")
print(f"   Precision: {'BF16' if CONFIG['bf16'] else 'FP16'}")
print("   Safety mode: MAXIMUM")
print("   Tensor errors: Mitigated by pre-tokenization")
print("Ready for ULTRA SAFE training!")

In [None]:
# Start Unsloth Training with Real-time News Effects Monitoring.
# Runs trainer.train(), reports duration and loss on success, and prints
# diagnostics before re-raising on failure.
print("STARTING BITCOIN NEWS EFFECTS TRAINING WITH UNSLOTH!")
print("="*80)
print("Task: Training model to analyze Bitcoin news effects")
print("Model: Qwen2.5-4B-Instruct with LoRA")
print("Acceleration: Unsloth (2x faster)")
print("GPU: RTX 3090 optimized")
print(f"Dataset (tokenized): {len(train_tokenized)} samples")
print(f"Monitoring: News effects outputs every {CONFIG['show_sample_every_n_steps']} steps")
print("="*80)

# Record start time
training_start_time = datetime.now()
print(f"Training started at: {training_start_time.strftime('%H:%M:%S')}")

# Start training with enhanced error handling
try:
    print("\nStarting training...")
    trainer_stats = trainer.train()
    
    # Training completed successfully
    training_end_time = datetime.now()
    training_duration = training_end_time - training_start_time
    
    print(f"\n" + "="*80)
    print("BITCOIN NEWS EFFECTS TRAINING COMPLETED!")
    print("="*80)
    print(f"Total training time: {training_duration}")
    print(f"Final train loss: {trainer_stats.training_loss:.4f}")
    print(f"Training speed: {trainer_stats.metrics.get('train_samples_per_second', 'N/A')} samples/sec")
    print("Unsloth acceleration: ~2x faster than standard training")
    print("Task completed: Bitcoin news effects analysis model")
    
except KeyboardInterrupt:
    # A manual interrupt is not re-raised: execution continues with the status
    # lines at the end of this cell, and `trainer_stats` stays undefined.
    print("\nTraining interrupted by user")
    print(f"Training duration: {datetime.now() - training_start_time}")
    
except Exception as e:
    print(f"\nTraining error: {e}")
    print(f"Training duration: {datetime.now() - training_start_time}")
    
    # Detailed error analysis
    import traceback
    print("\nError details:")
    print(f"   - Error type: {type(e).__name__}")
    print(f"   - Error message: {str(e)}")
    
    # Check for tensor errors by matching substrings of the lower-cased
    # message; this is a heuristic hint, not a reliable classification.
    error_str = str(e).lower()
    if ("too many dimensions" in error_str) or ("str" in error_str and "tensor" in error_str):
        print("\nDetected tensor formatting error!")
        print("   This suggests the dataset still contains nested structures or wrong data types")
        print("   - Verify all features are tokenized (input_ids, attention_mask)")
        print("   - Ensure no 'text' column remains in train_tokenized")
        print("   - Re-run the tokenization cell")
    
    print("\nFull traceback:")
    traceback.print_exc()
    raise

# NOTE(review): these lines also run after a KeyboardInterrupt, when training
# did not finish.
print("\nTraining statistics saved to trainer object")
print("Ready for model saving and testing!")

In [None]:
# Save Bitcoin News Effects Model in Multiple Formats.
# Writes LoRA adapters, a merged 16-bit model and (best effort) a GGUF export
# below CONFIG["output_dir"], then optionally pushes the merged model to the
# Hugging Face Hub.
print("üíæ Saving Bitcoin News Effects Model...")
print("="*50)

# Save LoRA adapters
print("1Ô∏è‚É£ Saving LoRA adapters...")
lora_dir = CONFIG["output_dir"] + "/lora_adapters"
model.save_pretrained(lora_dir)
tokenizer.save_pretrained(lora_dir)
print(f"‚úÖ LoRA adapters saved to: {lora_dir}")

# Save merged model (16-bit)
print("\n2Ô∏è‚É£ Saving merged model (16-bit)...")
merged_dir = CONFIG["output_dir"] + "/merged_model"
model.save_pretrained_merged(merged_dir, tokenizer, save_method="merged_16bit")
print(f"‚úÖ Merged 16-bit model saved to: {merged_dir}")

# Save GGUF format for deployment; a failure is reported but does not stop
# the cell.
print("\n3Ô∏è‚É£ Saving GGUF format for deployment...")
gguf_dir = CONFIG["output_dir"] + "/gguf_model"
try:
    model.save_pretrained_gguf(gguf_dir, tokenizer, quantization_method="q4_k_m")
    print(f"‚úÖ GGUF model saved to: {gguf_dir}")
except Exception as e:
    print(f"‚ö†Ô∏è GGUF save failed: {e}")
    print("üí° GGUF format requires additional dependencies")

# Push to HuggingFace Hub (only when enabled in CONFIG and a repo id is set)
if CONFIG["push_to_hub"] and CONFIG.get("hub_model_id"):
    print(f"\n4Ô∏è‚É£ Pushing to HuggingFace Hub...")
    try:
        # Push the merged 16-bit model (not the bare LoRA adapters)
        model.push_to_hub_merged(
            CONFIG["hub_model_id"],
            tokenizer,
            save_method="merged_16bit",
            commit_message=f"Bitcoin news effects analysis model - {datetime.now().strftime('%Y-%m-%d %H:%M')}"
        )
        print(f"‚úÖ Model pushed to: https://huggingface.co/{CONFIG['hub_model_id']}")
    except Exception as e:
        print(f"‚ö†Ô∏è Hub push failed: {e}")

# NOTE(review): the success line below is printed even when the GGUF export or
# the Hub push reported a failure above.
print(f"\nüéâ All model formats saved successfully!")
print(f"üìÅ Base directory: {CONFIG['output_dir']}")
print(f"üéØ Model purpose: Bitcoin news effects analysis")
print(f"‚ö° Training method: Unsloth + LoRA")

In [None]:
# üß™ Test Bitcoin News Effects Model with Real Sample
def _extract_json_object(text):
    """Return the first JSON object that can be decoded from *text*.

    Every ``{`` is tried in order as a possible start of an object, and
    anything following the decoded object is ignored. This keeps trailing
    text that happens to contain braces from breaking the parse.

    Returns:
        The decoded object, or ``None`` when *text* contains no ``{``.

    Raises:
        json.JSONDecodeError: *text* contains ``{`` but no position decodes;
            the error from the first candidate is raised.
    """
    decoder = json.JSONDecoder()
    first_error = None
    start = text.find("{")
    while start != -1:
        try:
            candidate, _ = decoder.raw_decode(text[start:])
        except json.JSONDecodeError as err:
            if first_error is None:
                first_error = err
        else:
            return candidate
        start = text.find("{", start + 1)
    if first_error is not None:
        raise first_error
    return None


def test_news_effects_model():
    """Run one sample news item through the model and print the analysis.

    Uses the notebook-level ``model`` and ``tokenizer``. The function builds
    a system + user chat prompt, generates up to 250 new tokens, prints the
    decoded continuation and the generation time, then tries to parse a JSON
    object out of the response and prints its fields.

    ``model.train()`` is always called on the way out, even when generation
    raises, so the model is not left in inference mode.

    Returns:
        None. All results are printed.
    """
    print("üß™ Testing Bitcoin News Effects Model")
    print("-" * 50)

    # Sample Bitcoin news for testing
    test_instruction = """Analyze Bitcoin news and predict price impact. Return JSON with this exact structure:

{
  "sentiment": "bullish|neutral|bearish",
  "price_direction": "up|sideways|down",
  "impact_strength": "high|medium|low", 
  "timeframe": "immediate|short_term|medium_term",
  "confidence": 0.75,
  "key_reason": "Brief explanation of main factor"
}"""

    test_input = """News Title: Bitcoin ETFs See Record $1.2B Inflows as Price Breaks $65,000

News Summary: Bitcoin spot ETFs experienced unprecedented institutional demand with $1.2 billion in net inflows over the past week, driving Bitcoin price above $65,000 for the first time since November 2021. BlackRock's IBIT led with $800M inflows while Fidelity's FBTC added $400M. The surge coincides with growing corporate adoption and favorable regulatory signals from the SEC.

Impact Tags: institutional_adoption, etf_flows, price_breakout, regulatory_clarity

Market Context:
Bull 75% | Base 20% | Bear 5%

Daily Recommendations:
Short-term: Strong Buy
Long-term: Buy"""

    messages = [
        {"role": "system", "content": test_instruction},
        {"role": "user", "content": test_input}
    ]

    try:
        # Enable fast inference mode
        FastLanguageModel.for_inference(model)

        # Tokenize the chat prompt and move it to the model's device
        inputs = tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt"
        ).to(model.device)

        print("üì∞ Analyzing Bitcoin news...")

        # Generate news effects analysis
        start_time = datetime.now()
        with torch.no_grad():
            outputs = model.generate(
                input_ids=inputs,
                max_new_tokens=250,
                do_sample=True,
                temperature=0.1,
                top_p=0.9,
                use_cache=True,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id
            )

        generation_time = datetime.now() - start_time
        # Slice off the prompt tokens so only the generated part is decoded
        response = tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True)

        print(f"\n‚ö° Analysis completed in: {generation_time.total_seconds():.2f} seconds")
        print("\nüì§ Bitcoin News Effects Analysis:")
        print(response.strip())

        # Try to parse JSON. Only the decode step is guarded, so an error
        # elsewhere is never misreported as a JSON parsing failure.
        try:
            parsed = _extract_json_object(response.strip())
        except json.JSONDecodeError as e:
            print(f"\n‚ùå JSON parsing failed: {e}")
        else:
            if parsed is None:
                print("\n‚ö†Ô∏è No JSON found in response")
            else:
                # The model may emit confidence as a string or null; fall
                # back to printing it verbatim instead of crashing on ':.2f'.
                confidence = parsed.get('confidence', 0)
                try:
                    confidence_text = f"{float(confidence):.2f}"
                except (TypeError, ValueError):
                    confidence_text = str(confidence)

                print("\n‚úÖ News Effects Analysis (Parsed):")
                print(f"  üìä Sentiment: {parsed.get('sentiment', 'N/A')}")
                print(f"  üìà Price Direction: {parsed.get('price_direction', 'N/A')}")
                print(f"  üí™ Impact Strength: {parsed.get('impact_strength', 'N/A')}")
                print(f"  ‚è∞ Timeframe: {parsed.get('timeframe', 'N/A')}")
                print(f"  üéØ Confidence: {confidence_text}")
                print(f"  üí° Key Reason: {parsed.get('key_reason', 'N/A')}")
    finally:
        # Switch back to training mode, also when generation failed part-way.
        # NOTE(review): Unsloth also offers FastLanguageModel.for_training();
        # confirm whether model.train() alone fully undoes for_inference().
        model.train()

# Run the single-sample smoke test defined above, then print a closing summary.
print("üöÄ Testing Bitcoin News Effects Model...")
test_news_effects_model()

print("\n" + "="*80)
print("üéâ BITCOIN NEWS EFFECTS TRAINING COMPLETE!")
print("="*80)
print(f"‚úÖ Model saved to: {CONFIG['output_dir']}")
print("‚úÖ Task: Bitcoin news effects analysis")
print("‚úÖ Training optimized with Unsloth (2x faster)")
print("‚úÖ RTX 3090 memory optimized")
print(f"‚úÖ Real-time monitoring every {CONFIG['show_sample_every_n_steps']} steps")
print("‚úÖ Multiple export formats available")
# Only advertise the Hub URL when an upload was actually requested. This is
# the same condition that guards the Hub push in the saving cell; checking
# hub_model_id alone would print a URL even with push_to_hub disabled.
if CONFIG.get('push_to_hub') and CONFIG.get('hub_model_id'):
    print(f"‚úÖ Model available at: https://huggingface.co/{CONFIG['hub_model_id']}")
print("="*80)
print("üéØ Model ready for Bitcoin news effects analysis!")