# Bitcoin Investment Advisory Training Dataset Creator

This notebook builds structured input-output pairs from the comprehensive Bitcoin investment advisory dataset, so that they can be used to train AI models.

## 1. Setup and Dependencies

In [1]:
import json
import pandas as pd
import numpy as np
from datetime import datetime
from typing import Dict, List, Any
import os
from datasets import Dataset
from huggingface_hub import HfApi, create_repo
import warnings
warnings.filterwarnings('ignore')

# Install required packages if not available
try:
    from datasets import Dataset
except ImportError:
    !pip install datasets
    from datasets import Dataset

try:
    from huggingface_hub import HfApi
except ImportError:
    !pip install huggingface_hub
    from huggingface_hub import HfApi

## 2. Load and Analyze Dataset

In [2]:
# Load the comprehensive dataset
# Path of the JSON export that holds the metadata, statistics and samples.
dataset_path = 'bitcoin_investment_advisory_ULTRA_COMPREHENSIVE_20250907_232548.json'

# The whole file is parsed into memory; later cells read it through `raw_data`.
with open(dataset_path, 'r', encoding='utf-8') as f:
    raw_data = json.load(f)

# Header line 1-2: descriptive metadata of the dataset.
dataset_metadata = raw_data['dataset_metadata']
print(f"Dataset Name: {dataset_metadata['name']}")
print(f"Version: {dataset_metadata['version']}")

# Header line 3-4: sample count and covered date range from the processing statistics.
processing_stats = raw_data['processing_statistics']
print(f"Total Samples: {processing_stats['execution_summary']['total_advisory_samples']}")
coverage_dates = processing_stats['data_coverage_analysis']['date_range']
print(f"Date Range: {coverage_dates['earliest_date']} to {coverage_dates['latest_date']}")

Dataset Name: Ultra-Comprehensive Bitcoin Investment Advisory Dataset
Version: 2.0.ultra-complete
Total Samples: 2437
Date Range: 2018-01-01 to 2024-12-31


## 3. Define Training Data Structure

We'll create comprehensive input-output pairs that include all relevant context for training an investment advisory AI.

In [7]:
def format_market_data(market_data: Dict) -> str:
    """Format market data into a readable string for input.

    Args:
        market_data: Mapping that may contain 'current_price', 'price_range'
            (with 'min' / 'max'), 'next_10_day_prices' and
            'next_60_day_prices'. Missing keys and explicit ``None`` values
            (JSON ``null``) are both treated as "not available".

    Returns:
        A "MARKET DATA:" block with one line per field, stripped of leading
        and trailing whitespace. Prices in the two lists are rendered with
        two decimals and a leading '$'.

    NOTE(review): the 'next_*_day_prices' lists are written into the prompt
    verbatim. If they hold prices observed after the sample date, the model
    input contains look-ahead information - confirm this is intended.
    """

    def _price_list(key: str) -> str:
        # `or []` covers both a missing key and a null value in the JSON source.
        prices = market_data.get(key) or []
        return ', '.join(f'${p:.2f}' for p in prices)

    # A null 'price_range' would otherwise fail on the chained .get() calls.
    price_range = market_data.get('price_range') or {}

    lines = [
        "MARKET DATA:",
        f"- Current Price: ${market_data.get('current_price', 'N/A')}",
        f"- Price Range: Min: ${price_range.get('min', 'N/A')}, Max: ${price_range.get('max', 'N/A')}",
        f"- Next 10-Day Price Trend: {_price_list('next_10_day_prices')}",
        f"- Next 60-Day Price Outlook: {_price_list('next_60_day_prices')}",
    ]
    # strip() keeps the historical behaviour of dropping the trailing space
    # left behind when the 60-day list is empty.
    return "\n".join(lines).strip()

def format_news_analysis(news_analysis: Dict) -> str:
    """Render the news section of the prompt.

    Args:
        news_analysis: Mapping with optional 'total_news_items',
            'market_sentiment' (overall sentiment plus bull/bear
            probabilities) and 'high_impact_news' (list of item dicts).

    Returns:
        A "NEWS ANALYSIS:" header followed by one numbered entry per
        high-impact item, with surrounding whitespace stripped.
    """
    mood = news_analysis.get('market_sentiment', {})
    headlines = news_analysis.get('high_impact_news', [])

    # Collect fragments and join once at the end instead of growing a string.
    chunks = [
        "\nNEWS ANALYSIS:\n"
        f"- Total News Items: {news_analysis.get('total_news_items', 0)}\n"
        f"- Market Sentiment: {mood.get('overall_sentiment', 'Neutral')}\n"
        f"- Bull Probability: {mood.get('bull_probability', 0):.1%}\n"
        f"- Bear Probability: {mood.get('bear_probability', 0):.1%}\n"
        f"- High Impact News Count: {len(headlines)}\n"
        "\nKEY NEWS ITEMS:\n"
    ]

    # Every high-impact item is listed; each entry is preceded by a blank line.
    for position, item in enumerate(headlines, 1):
        chunks.append(
            f"\n{position}. {item.get('title', 'No title')}\n"
            f"   Summary: {item.get('summary', 'No summary')}\n"
            f"   Direction: {item.get('direction', 'neutral').upper()}\n"
            f"   Impact: {item.get('magnitude', 'medium').upper()}\n"
            f"   Confidence: {item.get('confidence', 0):.0%}\n"
            f"   Impact Tags: {', '.join(item.get('impact_tags', []))}\n"
        )

    return "".join(chunks).strip()

def format_daily_analysis(daily_analysis: Dict) -> str:
    """Render the daily-analysis section of the prompt.

    Args:
        daily_analysis: Mapping with optional 'summary',
            'aggregated_effects', 'key_events' and 'price_drivers'.

    Returns:
        A "DAILY MARKET ANALYSIS:" header, followed by numbered
        "KEY EVENTS" and "PRICE DRIVERS" lists when those are non-empty,
        with surrounding whitespace stripped.
    """
    events = daily_analysis.get('key_events', [])
    drivers = daily_analysis.get('price_drivers', [])

    pieces = [
        "\nDAILY MARKET ANALYSIS:\n",
        f"- Market Summary: {daily_analysis.get('summary', 'No summary available')}\n",
        f"- Aggregated Effects: {daily_analysis.get('aggregated_effects', 'No effects data')}\n",
        f"- Key Events: {len(events)} events identified\n",
        f"- Price Drivers: {len(drivers)} factors analyzed\n",
    ]

    # Both optional sections share the same layout: heading, then a numbered list.
    for heading, entries in (("KEY EVENTS", events), ("PRICE DRIVERS", drivers)):
        if not entries:
            continue
        pieces.append(f"\n{heading}:\n")
        pieces.extend(f"{number}. {entry}\n" for number, entry in enumerate(entries, 1))

    return "".join(pieces).strip()

def create_comprehensive_input(sample: Dict) -> str:
    """Assemble the full model prompt for one sample.

    The prompt consists of an opening instruction that names the sample's
    date, the formatted market, news and daily-analysis sections, and a
    closing task description, each separated by a blank line.

    Args:
        sample: Mapping with optional 'date', 'market_data',
            'news_analysis' and 'daily_analysis' entries.

    Returns:
        The assembled prompt with surrounding whitespace stripped.
    """
    advisory_date = sample.get('date', 'Unknown')
    market_section = sample.get('market_data', {})
    news_section = sample.get('news_analysis', {})
    daily_section = sample.get('daily_analysis', {})

    # Fixed list of deliverables requested from the model.
    task_lines = (
        "TASK:",
        "Provide a detailed, institutional-grade Bitcoin investment advisory that includes:",
        "1. Executive Summary & Market Overview",
        "2. Investment Recommendation (Short/Medium/Long-term)",
        "3. Risk Assessment & Management",
        "4. Price Targets & Scenarios",
        "5. Trading Strategy & Execution",
        "6. Technical and Fundamental Analysis",
        "7. Portfolio Integration Advice",
    )

    # Sections are evaluated in prompt order and joined by one blank line each.
    sections = [
        f"You are an elite institutional Bitcoin investment advisor. Please provide a comprehensive investment advisory based on the following market intelligence for {advisory_date}.",
        format_market_data(market_section),
        format_news_analysis(news_section),
        format_daily_analysis(daily_section),
        "\n".join(task_lines),
        "Format your response as a professional investment advisory suitable for institutional clients.",
    ]

    return "\n\n".join(sections).strip()

def extract_advisory_output(sample: Dict) -> str:
    """Extract the advisory output from a sample.

    Args:
        sample: Mapping that may contain an 'advisory_output' mapping with
            an 'advisory_text' entry.

    Returns:
        The advisory text, or an empty string when 'advisory_output' or
        'advisory_text' is missing, ``None`` (JSON ``null``) or empty.
        Always returning a string keeps callers that call ``.strip()`` on
        the result from failing on null entries.
    """

    # `or` covers both an absent key and an explicit null in the source JSON.
    advisory_output = sample.get('advisory_output') or {}
    return advisory_output.get('advisory_text') or ''

print("Training data formatting functions defined successfully!")

Training data formatting functions defined successfully!


## 4. Create Training Dataset

In [8]:
def create_training_samples(raw_data: Dict) -> List[Dict]:
    """Turn the raw dataset into a list of training records.

    Each entry of ``raw_data['comprehensive_samples']`` is converted into a
    prompt/response pair plus bookkeeping fields. Entries whose stripped
    prompt or stripped response is shorter than 100 characters are dropped.
    Progress is printed every 100 entries.

    Args:
        raw_data: Parsed dataset; only 'comprehensive_samples' is read.

    Returns:
        One dict per kept sample with the keys 'date', 'input', 'output',
        'instruction', 'sample_index', 'input_length', 'output_length',
        'has_news_data', 'has_price_data' and 'quality_score'.
    """
    source_samples = raw_data.get('comprehensive_samples', [])
    total = len(source_samples)
    kept: List[Dict] = []

    print(f"Processing {total} samples...")

    for position, record in enumerate(source_samples):
        if position % 100 == 0:
            print(f"Processed {position}/{total} samples")

        prompt = create_comprehensive_input(record)
        advisory = extract_advisory_output(record)

        # Keep only pairs where both sides carry at least 100 characters.
        long_enough = len(prompt.strip()) >= 100 and len(advisory.strip()) >= 100
        if not long_enough:
            continue

        kept.append({
            'date': record.get('date', ''),
            'input': prompt,
            'output': advisory,
            'instruction': 'You are an elite institutional Bitcoin investment advisor. Provide comprehensive investment advisory based on the given market intelligence.',

            # Bookkeeping: the enumerate position is the fallback index.
            'sample_index': record.get('sample_index', position),
            'input_length': len(prompt),
            'output_length': len(advisory),
            'has_news_data': len(record.get('news_analysis', {}).get('high_impact_news', [])) > 0,
            'has_price_data': len(record.get('market_data', {}).get('next_10_day_prices', [])) > 0,

            # Completeness-based score of the source sample.
            'quality_score': calculate_quality_score(record),
        })

    print(f"Created {len(kept)} training samples")
    return kept

def calculate_quality_score(sample: Dict) -> float:
    """Calculate a quality score for the sample based on data completeness.

    The score is the sum of the weights of all satisfied checks:

    * 0.3 - 'market_data.next_10_day_prices' is non-empty
    * 0.2 - 'market_data.next_60_day_prices' is non-empty
    * 0.2 - 'news_analysis.high_impact_news' is non-empty
    * 0.1 - 'news_analysis.total_news_items' is greater than 0
    * 0.1 - 'advisory_output.advisory_length_chars' is greater than 1000
    * 0.1 - 'advisory_output.generation_successful' is truthy

    Weights are summed as integer tenths and divided once at the end.
    Adding the float literals step by step yields values such as
    0.7999999999999999 and 0.8999999999999999, which fail ``>= 0.8`` /
    ``>= 0.9`` threshold comparisons applied to this score.

    Args:
        sample: One raw sample mapping.

    Returns:
        A score between 0.0 and 1.0 in steps of 0.1.
    """

    market_data = sample.get('market_data', {})
    news_analysis = sample.get('news_analysis', {})
    advisory = sample.get('advisory_output', {})

    # (check passed?, weight in tenths)
    checks = [
        # Market data completeness
        (len(market_data.get('next_10_day_prices', [])) > 0, 3),
        (len(market_data.get('next_60_day_prices', [])) > 0, 2),
        # News data completeness
        (len(news_analysis.get('high_impact_news', [])) > 0, 2),
        (news_analysis.get('total_news_items', 0) > 0, 1),
        # Advisory completeness
        (advisory.get('advisory_length_chars', 0) > 1000, 1),
        (bool(advisory.get('generation_successful', False)), 1),
    ]

    tenths = sum(weight for passed, weight in checks if passed)

    # Single division keeps the result equal to the corresponding float literal.
    return min(tenths / 10, 1.0)

# Build the training records from the raw dataset loaded earlier
training_samples = create_training_samples(raw_data)

# Gather each reported column once, then print the aggregate figures
input_lengths = [s['input_length'] for s in training_samples]
output_lengths = [s['output_length'] for s in training_samples]
quality_scores = [s['quality_score'] for s in training_samples]
news_flags = [s['has_news_data'] for s in training_samples]
price_flags = [s['has_price_data'] for s in training_samples]

print("\nTraining Dataset Summary:")
print(f"Total samples: {len(training_samples)}")
print(f"Average input length: {np.mean(input_lengths):.0f} characters")
print(f"Average output length: {np.mean(output_lengths):.0f} characters")
print(f"Average quality score: {np.mean(quality_scores):.2f}")
print(f"Samples with news data: {sum(news_flags)}")
print(f"Samples with price data: {sum(price_flags)}")

Processing 2437 samples...
Processed 0/2437 samples
Processed 100/2437 samples
Processed 200/2437 samples
Processed 300/2437 samples
Processed 400/2437 samples
Processed 500/2437 samples
Processed 600/2437 samples
Processed 700/2437 samples
Processed 800/2437 samples
Processed 900/2437 samples
Processed 1000/2437 samples
Processed 1100/2437 samples
Processed 1200/2437 samples
Processed 1300/2437 samples
Processed 1400/2437 samples
Processed 1500/2437 samples
Processed 1600/2437 samples
Processed 1700/2437 samples
Processed 1800/2437 samples
Processed 1900/2437 samples
Processed 2000/2437 samples
Processed 2100/2437 samples
Processed 2200/2437 samples
Processed 2300/2437 samples
Processed 2400/2437 samples
Created 2437 training samples

Training Dataset Summary:
Total samples: 2437
Average input length: 4865 characters
Average output length: 8643 characters
Average quality score: 0.90
Samples with news data: 2419
Samples with price data: 2437


## 5. Create Different Training Formats

In [9]:
def create_instruction_format(samples: List[Dict]) -> List[Dict]:
    """Project each training record onto the instruction-tuning fields.

    Args:
        samples: Training records containing at least 'instruction',
            'input', 'output', 'date' and 'quality_score'.

    Returns:
        A new list of dicts holding exactly those five keys, in that order.
    """
    wanted_keys = ('instruction', 'input', 'output', 'date', 'quality_score')
    return [{key: sample[key] for key in wanted_keys} for sample in samples]

def create_chat_format(samples: List[Dict]) -> List[Dict]:
    """Convert training records into system/user/assistant conversations.

    Args:
        samples: Training records containing at least 'input', 'output',
            'date' and 'quality_score'.

    Returns:
        One dict per record with a three-turn 'messages' list (fixed system
        prompt, the record's input as user turn, its output as assistant
        turn) plus the record's 'date' and 'quality_score'.
    """
    system_prompt = 'You are an elite institutional Bitcoin investment advisor with deep expertise in cryptocurrency markets, technical analysis, and portfolio management. You provide comprehensive, data-driven investment advice to institutional clients.'

    def _as_conversation(sample: Dict) -> Dict:
        # Fresh dicts are built for every record so results share no state.
        turns = [
            {'role': 'system', 'content': system_prompt},
            {'role': 'user', 'content': sample['input']},
            {'role': 'assistant', 'content': sample['output']},
        ]
        return {
            'messages': turns,
            'date': sample['date'],
            'quality_score': sample['quality_score'],
        }

    return [_as_conversation(sample) for sample in samples]

def create_alpaca_format(samples: List[Dict]) -> List[Dict]:
    """Reduce each training record to the three Alpaca fields.

    Args:
        samples: Training records containing at least 'instruction',
            'input' and 'output'.

    Returns:
        A new list of dicts with the keys 'instruction', 'input', 'output'.
    """

    def _to_alpaca(sample: Dict) -> Dict:
        return {
            'instruction': sample['instruction'],
            'input': sample['input'],
            'output': sample['output'],
        }

    return list(map(_to_alpaca, samples))

# Derive the three export formats from the same training records
instruction_format = create_instruction_format(training_samples)
chat_format = create_chat_format(training_samples)
alpaca_format = create_alpaca_format(training_samples)

# Report how many records each format holds
print("Created training data in multiple formats:")
for format_label, converted in (
    ("Instruction", instruction_format),
    ("Chat", chat_format),
    ("Alpaca", alpaca_format),
):
    print(f"- {format_label} format: {len(converted)} samples")

Created training data in multiple formats:
- Instruction format: 2437 samples
- Chat format: 2437 samples
- Alpaca format: 2437 samples


## 6. Save Training Datasets

In [10]:
# Directory that receives every exported file (created on demand)
output_dir = "bitcoin_training_datasets"
os.makedirs(output_dir, exist_ok=True)

# One timestamp shared by all files written in this run
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Format name -> records to export
formats_and_data = {
    'instruction': instruction_format,
    'chat': chat_format,
    'alpaca': alpaca_format,
    'comprehensive': training_samples
}

# Maps "<format>_json" / "<format>_jsonl" to the path that was written
saved_files = {}

for format_name, data in formats_and_data.items():
    path_stem = f"{output_dir}/bitcoin_investment_training_{format_name}_{timestamp}"

    # Every format is written as one indented JSON document
    json_filename = path_stem + ".json"
    with open(json_filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

    # The chat and instruction formats are additionally written as JSON Lines
    if format_name in ('chat', 'instruction'):
        jsonl_filename = path_stem + ".jsonl"
        with open(jsonl_filename, 'w', encoding='utf-8') as f:
            for sample in data:
                line = json.dumps(sample, ensure_ascii=False)
                f.write(line + '\n')
        saved_files[f'{format_name}_jsonl'] = jsonl_filename

    saved_files[f'{format_name}_json'] = json_filename
    print(f"Saved {format_name} format: {json_filename}")

print(f"\nAll training datasets saved in directory: {output_dir}")

Saved instruction format: bitcoin_training_datasets/bitcoin_investment_training_instruction_20250908_072751.json
Saved chat format: bitcoin_training_datasets/bitcoin_investment_training_chat_20250908_072751.json
Saved alpaca format: bitcoin_training_datasets/bitcoin_investment_training_alpaca_20250908_072751.json
Saved comprehensive format: bitcoin_training_datasets/bitcoin_investment_training_comprehensive_20250908_072751.json

All training datasets saved in directory: bitcoin_training_datasets


## 7. Create Dataset Statistics and Quality Report

In [None]:
def create_dataset_report(samples: List[Dict]) -> Dict:
    """Create a comprehensive dataset quality report.

    All numeric values are returned as native Python ``int`` / ``float``
    objects. pandas reductions return numpy scalars, and ``numpy.int64`` is
    not JSON serializable: with ``json.dump(..., default=str)`` such values
    are written as strings instead of numbers.

    Args:
        samples: Training records with the columns 'date', 'input_length',
            'output_length', 'quality_score', 'has_news_data' and
            'has_price_data'.

    Returns:
        A nested dict with the sections 'total_samples', 'date_range',
        'length_statistics', 'data_quality' and 'quality_distribution'.

    Raises:
        KeyError: If a required column is missing (this includes an empty
            ``samples`` list, which produces a frame without columns).
    """

    def _native(value: Any) -> Any:
        # numpy scalars expose .item(); plain Python values pass through.
        return value.item() if hasattr(value, 'item') else value

    df = pd.DataFrame(samples)
    quality = df['quality_score']

    def _length_stats(column: str) -> Dict:
        # Mean / median / min / max of one length column.
        series = df[column]
        return {
            'mean': _native(series.mean()),
            'median': _native(series.median()),
            'min': _native(series.min()),
            'max': _native(series.max())
        }

    def _count(mask: Any) -> int:
        # Number of True entries in a boolean Series.
        return int(mask.sum())

    report = {
        'total_samples': len(samples),
        'date_range': {
            'earliest': _native(df['date'].min()),
            'latest': _native(df['date'].max()),
            'unique_dates': int(df['date'].nunique())
        },
        'length_statistics': {
            'input_length': _length_stats('input_length'),
            'output_length': _length_stats('output_length')
        },
        'data_quality': {
            'average_quality_score': _native(quality.mean()),
            'high_quality_samples': _count(quality >= 0.7),
            'samples_with_news': _count(df['has_news_data']),
            'samples_with_prices': _count(df['has_price_data'])
        },
        'quality_distribution': {
            'excellent': _count(quality >= 0.9),
            'good': _count((quality >= 0.7) & (quality < 0.9)),
            'fair': _count((quality >= 0.5) & (quality < 0.7)),
            'poor': _count(quality < 0.5)
        }
    }

    return report

# Build the report from the training records
dataset_report = create_dataset_report(training_samples)

# Persist it next to the exported datasets; non-serializable values fall back to str()
report_filename = f"{output_dir}/bitcoin_training_dataset_report_{timestamp}.json"
with open(report_filename, 'w', encoding='utf-8') as f:
    json.dump(dataset_report, f, indent=2, default=str)

# Display report, one section of the nested dict at a time
print("\n=== DATASET QUALITY REPORT ===")
print(f"Total Samples: {dataset_report['total_samples']:,}")

report_dates = dataset_report['date_range']
print(f"Date Range: {report_dates['earliest']} to {report_dates['latest']}")
print(f"Unique Dates: {report_dates['unique_dates']:,}")

print("\nLength Statistics:")
input_stats = dataset_report['length_statistics']['input_length']
print(f"  Input - Mean: {input_stats['mean']:.0f}, Max: {input_stats['max']:,}")
output_stats = dataset_report['length_statistics']['output_length']
print(f"  Output - Mean: {output_stats['mean']:.0f}, Max: {output_stats['max']:,}")

print("\nData Quality:")
report_quality = dataset_report['data_quality']
print(f"  Average Quality Score: {report_quality['average_quality_score']:.2f}")
print(f"  High Quality Samples (≥0.7): {report_quality['high_quality_samples']:,}")
print(f"  Samples with News Data: {report_quality['samples_with_news']:,}")
print(f"  Samples with Price Data: {report_quality['samples_with_prices']:,}")

print("\nQuality Distribution:")
report_distribution = dataset_report['quality_distribution']
print(f"  Excellent (≥0.9): {report_distribution['excellent']:,}")
print(f"  Good (0.7-0.9): {report_distribution['good']:,}")
print(f"  Fair (0.5-0.7): {report_distribution['fair']:,}")
print(f"  Poor (<0.5): {report_distribution['poor']:,}")

print(f"\nReport saved to: {report_filename}")

## 8. Sample Training Examples

In [None]:
# Preview a few records so the exported formats can be checked by eye
def _preview(text, limit):
    """Return the first `limit` characters of `text` followed by an ellipsis."""
    return text[:limit] + "..."

print("\n=== SAMPLE TRAINING EXAMPLES ===")

# First record whose quality score is at least 0.8 (skipped when there is none)
high_quality_samples = [s for s in training_samples if s['quality_score'] >= 0.8]
if high_quality_samples:
    sample = high_quality_samples[0]

    print(f"\n--- HIGH QUALITY SAMPLE (Quality Score: {sample['quality_score']:.2f}) ---")
    print(f"Date: {sample['date']}")
    print(f"Input Length: {sample['input_length']:,} characters")
    print(f"Output Length: {sample['output_length']:,} characters")
    print("\nINPUT (truncated):")
    print(_preview(sample['input'], 500))
    print("\nOUTPUT (truncated):")
    print(_preview(sample['output'], 500))

# First record of the instruction format
print("\n\n--- INSTRUCTION FORMAT EXAMPLE ---")
instruction_sample = instruction_format[0]
print(f"Instruction: {instruction_sample['instruction']}")
print(f"Input (truncated): {_preview(instruction_sample['input'], 200)}")
print(f"Output (truncated): {_preview(instruction_sample['output'], 200)}")

# First record of the chat format: system, user and assistant turns in order
print("\n\n--- CHAT FORMAT EXAMPLE ---")
chat_sample = chat_format[0]
system_turn, user_turn, assistant_turn = chat_sample['messages'][:3]
print(f"System: {system_turn['content']}")
print(f"User (truncated): {_preview(user_turn['content'], 200)}")
print(f"Assistant (truncated): {_preview(assistant_turn['content'], 200)}")

## 9. Upload to Hugging Face Hub (Optional)

**Note**: This section requires a Hugging Face account. Log in first (for example with `huggingface-cli login`), then set `HF_USERNAME` and `UPLOAD_TO_HF = True` in the cell below.

In [None]:
# Configuration for Hugging Face upload
# This cell is opt-in: nothing is sent to the Hub unless UPLOAD_TO_HF is True.
# It relies on `formats_and_data`, `dataset_report` and `output_dir` defined by
# earlier cells of this notebook.
HF_REPO_NAME = "bitcoin-investment-advisory-training-dataset"  # Change this to your desired repo name
HF_USERNAME = "your-username"  # Change this to your Hugging Face username
UPLOAD_TO_HF = False  # Set to True to upload

if UPLOAD_TO_HF:
    try:
        # Initialize Hugging Face API
        api = HfApi()

        # Create repository
        repo_id = f"{HF_USERNAME}/{HF_REPO_NAME}"

        # A failure here is only printed; the upload below is still attempted.
        # NOTE(review): exist_ok=True is passed, so an exception at this point is
        # presumably something other than "already exists" (e.g. auth) - confirm
        # whether the message below should be reworded.
        try:
            create_repo(repo_id, repo_type="dataset", exist_ok=True)
            print(f"Created repository: {repo_id}")
        except Exception as e:
            print(f"Repository might already exist: {e}")

        # Convert to Hugging Face Dataset format
        hf_datasets = {}

        for format_name, data in formats_and_data.items():
            if format_name == 'comprehensive':
                continue  # Skip comprehensive format for HF upload

            hf_dataset = Dataset.from_list(data)
            hf_datasets[format_name] = hf_dataset

            # Upload each format as a separate configuration
            hf_dataset.push_to_hub(
                repo_id,
                config_name=format_name,
                commit_message=f"Upload {format_name} format training data"
            )

            print(f"Uploaded {format_name} format to Hugging Face Hub")

        # Create and upload README
        # The text is an f-string: single braces are filled from `dataset_report`,
        # `repo_id` and `HF_USERNAME`; doubled braces render as literal braces.
        # NOTE(review): push_to_hub may itself write README.md metadata describing
        # the configs; uploading this README afterwards could replace it - verify
        # that load_dataset(repo_id, "<config>") still works after the upload.
        readme_content = f"""
# Bitcoin Investment Advisory Training Dataset

This dataset contains comprehensive Bitcoin investment advisory training data for fine-tuning language models.

## Dataset Description

- **Total Samples**: {dataset_report['total_samples']:,}
- **Date Range**: {dataset_report['date_range']['earliest']} to {dataset_report['date_range']['latest']}
- **Average Quality Score**: {dataset_report['data_quality']['average_quality_score']:.2f}

## Formats Available

- **instruction**: Standard instruction-tuning format
- **chat**: Conversational format with system/user/assistant roles
- **alpaca**: Alpaca-style format for fine-tuning

## Usage

```python
from datasets import load_dataset

# Load instruction format
dataset = load_dataset("{repo_id}", "instruction")

# Load chat format
dataset = load_dataset("{repo_id}", "chat")
```

## License

Research and Educational Use Only

## Citation

If you use this dataset, please cite:

```
@dataset{{bitcoin_investment_advisory_dataset,
  title={{Bitcoin Investment Advisory Training Dataset}},
  author={{{HF_USERNAME}}},
  year={{2025}},
  url={{https://huggingface.co/datasets/{repo_id}}}
}}
```
"""

        # Save README locally
        readme_path = f"{output_dir}/README.md"
        with open(readme_path, 'w', encoding='utf-8') as f:
            f.write(readme_content)

        # Upload README
        api.upload_file(
            path_or_fileobj=readme_path,
            path_in_repo="README.md",
            repo_id=repo_id,
            repo_type="dataset"
        )

        print(f"\n✅ Successfully uploaded dataset to Hugging Face Hub: https://huggingface.co/datasets/{repo_id}")

    # Any failure in the steps above is reported here and does not raise further.
    except Exception as e:
        print(f"❌ Error uploading to Hugging Face: {e}")
        print("Make sure you are logged in with 'huggingface-cli login' and have the correct permissions.")

else:
    print("Hugging Face upload disabled. Set UPLOAD_TO_HF = True and configure HF_USERNAME to upload.")
    print(f"\nTo upload later, you can use the saved files in: {output_dir}")

## 10. Create Training Instructions and Guidelines

In [None]:
# Create comprehensive training instructions
# Plain, JSON-serializable guidance that is written next to the datasets.
# Only `total_samples` and `formats_available` are computed from notebook state
# (`training_samples`, `formats_and_data`); everything else is static text.
training_instructions = {
    "dataset_overview": {
        "name": "Bitcoin Investment Advisory Training Dataset",
        "purpose": "Train AI models to provide institutional-grade Bitcoin investment advice",
        "total_samples": len(training_samples),
        "formats_available": list(formats_and_data.keys()),
        "quality_score_range": [0.0, 1.0],
        "recommended_min_quality": 0.7
    },
    "training_recommendations": {
        "model_types": [
            "Large Language Models (LLMs) like GPT, LLaMA, Mistral",
            "Instruction-tuned models",
            "Chat-based models"
        ],
        "training_approaches": [
            "Fine-tuning on instruction format",
            "Supervised fine-tuning (SFT)",
            "Parameter-Efficient Fine-Tuning (PEFT) with LoRA"
        ],
        "hyperparameters": {
            "learning_rate": "1e-5 to 5e-5",
            "batch_size": "4-16 depending on GPU memory",
            "epochs": "3-5 for fine-tuning",
            "max_sequence_length": "4096-8192 tokens",
            "warmup_steps": "10% of total steps"
        },
        "data_splitting": {
            "train": "80% (chronologically earlier dates)",
            "validation": "10% (middle dates)",
            "test": "10% (most recent dates)"
        }
    },
    "evaluation_metrics": [
        "BLEU score for text generation quality",
        "ROUGE scores for summarization quality",
        "Human evaluation for investment advice quality",
        "Perplexity for language modeling performance",
        "Custom metrics for investment recommendation accuracy"
    ],
    "usage_guidelines": {
        "data_preprocessing": [
            "Filter samples by quality score (recommended: ≥0.7)",
            "Tokenize inputs and outputs appropriately",
            "Handle long sequences with truncation or chunking",
            "Apply consistent formatting across samples"
        ],
        "training_best_practices": [
            "Use gradient accumulation for large effective batch sizes",
            "Implement early stopping based on validation loss",
            "Monitor for overfitting on specific dates or patterns",
            "Use mixed precision training to optimize memory usage"
        ],
        "ethical_considerations": [
            "This dataset is for research and educational purposes only",
            "Models trained on this data should include financial disclaimers",
            "Investment advice generated should be clearly marked as AI-generated",
            "Consider regulatory compliance in deployment contexts"
        ]
    },
    # Illustrative snippet stored as text; this notebook never executes it.
    # The backslashes in the snippet's f-string are doubled so the saved text
    # contains the two-character escape \n. A single backslash would be
    # interpreted by this outer string and put raw line breaks inside the
    # snippet's quoted f-string, making the saved code a syntax error.
    # NOTE(review): the snippet passes no labels / data collator to Trainer and
    # reads a 'validation' split - confirm it runs as-is before publishing.
    "sample_training_code": {
        "huggingface_transformers": """
# Example training with Hugging Face Transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from datasets import load_dataset

# Load dataset
dataset = load_dataset('path/to/dataset', 'instruction')

# Load model and tokenizer
model_name = 'microsoft/DialoGPT-medium'  # or your preferred base model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Set padding token
tokenizer.pad_token = tokenizer.eos_token

# Tokenization function
def tokenize_function(examples):
    # Combine instruction, input, and output
    texts = [f"{inst}\\n{inp}\\n{out}" for inst, inp, out in 
             zip(examples['instruction'], examples['input'], examples['output'])]
    return tokenizer(texts, truncation=True, padding=True, max_length=2048)

# Apply tokenization
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Training arguments
training_args = TrainingArguments(
    output_dir='./bitcoin-advisor-model',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-5,
    warmup_steps=100,
    logging_steps=50,
    save_steps=500,
    eval_steps=500,
    evaluation_strategy='steps',
    fp16=True,
)

# Create trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
)

# Train
trainer.train()
"""
    }
}

# Write the guidance dict next to the exported datasets
instructions_path = f"{output_dir}/training_instructions_{timestamp}.json"
with open(instructions_path, 'w', encoding='utf-8') as f:
    json.dump(training_instructions, f, indent=2, ensure_ascii=False)

# Short recap of the most relevant recommendations
print("\n=== TRAINING INSTRUCTIONS CREATED ===")
print(f"Saved to: {instructions_path}")
print("\nKey Recommendations:")
instructions_overview = training_instructions['dataset_overview']
print(f"- Use samples with quality score ≥ {instructions_overview['recommended_min_quality']}")
print(f"- Available formats: {', '.join(instructions_overview['formats_available'])}")
recommended_hyperparameters = training_instructions['training_recommendations']['hyperparameters']
print(f"- Recommended learning rate: {recommended_hyperparameters['learning_rate']}")
print(f"- Suggested epochs: {recommended_hyperparameters['epochs']}")

# Closing banner with the run totals; the "+ 2" accounts for the report and
# the instructions file, which are not tracked in `saved_files`.
banner_rule = "=" * 50
print("\n" + banner_rule)
print("DATASET CREATION COMPLETE!")
print(banner_rule)
print(f"📁 Output directory: {output_dir}")
print(f"📊 Total samples: {len(training_samples):,}")
print(f"📈 High quality samples: {dataset_report['data_quality']['high_quality_samples']:,}")
print(f"📋 Formats created: {len(formats_and_data)}")
print(f"📝 Files saved: {len(saved_files) + 2}")
print("\nReady for training! 🚀")