# 1. Setup and Dependencies

Install required libraries and setup API keys for Gemini Flash analysis.

In [20]:
# Install required packages
!pip install -q pandas numpy datasets huggingface_hub
!pip install -q tqdm python-dateutil

import pandas as pd
import numpy as np
import json
import glob
import os
from datetime import datetime, timedelta
from typing import List, Dict, Any, Optional
from tqdm.notebook import tqdm
from datasets import Dataset
from huggingface_hub import login

print("✅ All required libraries imported successfully!")


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
✅ All required libraries imported successfully!
✅ All required libraries imported successfully!


In [21]:
# HuggingFace token for dataset upload
HF_TOKEN = "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"

# Login to HuggingFace (only needed for dataset upload)
try:
    login(token=HF_TOKEN, add_to_git_credential=False)
    print("✅ HuggingFace login successful!")
except Exception as e:
    print(f"❌ HuggingFace login failed: {e}")

print("✅ Setup complete - using existing analyzed data, no API calls needed!")

✅ HuggingFace login successful!
✅ Setup complete - using existing analyzed data, no API calls needed!


# 2. Load Individual News Items

Load ALL individual news items (not just daily summaries) from your existing analyzed dataset to create training samples for each news article.

In [22]:
# Load all individual news items from your existing dataset
print("📰 Loading individual news items from outputs_btc_effects...")

# Path to your existing news data
news_data_path = "/Users/tahamajs/Documents/uni/LLM/Files/Final Project/outputs_btc_effects/per_date/*.json"
all_news_files = glob.glob(news_data_path)

if not all_news_files:
    print("❌ No news files found. Please check the path.")
    print("Expected path:", news_data_path)
else:
    print(f"📁 Found {len(all_news_files)} news files")

# Load ALL individual news items (not just daily summaries)
all_individual_news = []

for file_path in tqdm(all_news_files, desc="Loading individual news items"):
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
            
        # Extract date from filename
        date_str = os.path.basename(file_path).split('.')[0]
        
        # Get daily view context for this date
        daily_context = {}
        if 'daily_view' in data and data['daily_view']:
            daily_view = data['daily_view']
            daily_context = {
                'scenario_probs': daily_view.get('scenario_probs', {}),
                'recommendation_short_term': daily_view.get('recommendation_short_term', {}),
                'recommendation_long_term': daily_view.get('recommendation_long_term', {})
            }
        
        # Process ALL long-term news items for this date
        long_term_items = data.get('long_term', [])
        for item in long_term_items:
            news_item = {
                'date': date_str,
                'title': item.get('title', ''),
                'summary': item.get('summary', ''),
                'direction': item.get('direction', ''),
                'magnitude': item.get('magnitude', ''),
                'confidence': item.get('confidence', 0),
                'impact_horizon_months': item.get('impact_horizon_months', 0),
                'impact_horizon_days': None,  # Long-term items use months
                'rationale': item.get('rationale', ''),
                'timeframe_type': 'long_term',
                'url': item.get('url', ''),
                'id': item.get('id', ''),
                'impact_tags': item.get('impact_tags', []),
                # Add daily context
                **daily_context
            }
            all_individual_news.append(news_item)
        
        # Process ALL short-term news items for this date  
        short_term_items = data.get('short_term', [])
        for item in short_term_items:
            news_item = {
                'date': date_str,
                'title': item.get('title', ''),
                'summary': item.get('summary', ''),
                'direction': item.get('direction', ''),
                'magnitude': item.get('magnitude', ''),
                'confidence': item.get('confidence', 0),
                'impact_horizon_months': None,  # Short-term items use days
                'impact_horizon_days': item.get('impact_horizon_days', 0),
                'rationale': item.get('rationale', ''),
                'timeframe_type': 'short_term',
                'url': item.get('url', ''),
                'id': item.get('id', ''),
                'impact_tags': item.get('impact_tags', []),
                # Add daily context
                **daily_context
            }
            all_individual_news.append(news_item)
            
    except Exception as e:
        print(f"⚠️ Error loading {file_path}: {e}")
        continue

# Convert to DataFrame for easier handling
df_individual_news = pd.DataFrame(all_individual_news)

if not df_individual_news.empty:
    df_individual_news['date'] = pd.to_datetime(df_individual_news['date'])
    df_individual_news = df_individual_news.sort_values(['date', 'confidence'], ascending=[True, False]).reset_index(drop=True)
    
    print(f"✅ Loaded {len(df_individual_news)} individual news items")
    print(f"📅 Date range: {df_individual_news['date'].min().date()} to {df_individual_news['date'].max().date()}")
    
    # Show statistics by date
    daily_counts = df_individual_news.groupby('date').size()
    print(f"📊 Average news items per day: {daily_counts.mean():.1f}")
    print(f"📊 Max news items in a day: {daily_counts.max()}")
    print(f"📊 Min news items in a day: {daily_counts.min()}")
    
    # Show timeframe distribution
    timeframe_counts = df_individual_news['timeframe_type'].value_counts()
    print(f"\\n📈 Timeframe distribution:")
    for timeframe, count in timeframe_counts.items():
        print(f"   {timeframe}: {count} items ({count/len(df_individual_news)*100:.1f}%)")
    
    # Display sample news item
    print(f"\\n📰 Sample individual news item:")
    sample_idx = len(df_individual_news) // 2
    sample = df_individual_news.iloc[sample_idx]
    print(f"Date: {sample['date'].date()}")
    print(f"Title: {sample['title']}")
    print(f"Direction: {sample['direction']} | Magnitude: {sample['magnitude']} | Confidence: {sample['confidence']:.2f}")
    print(f"Timeframe: {sample['timeframe_type']}")
    print(f"Summary: {sample['summary'][:150]}...")
    
else:
    print("❌ No individual news items loaded")

📰 Loading individual news items from outputs_btc_effects...
📁 Found 2437 news files


Loading individual news items:   0%|          | 0/2437 [00:00<?, ?it/s]

✅ Loaded 49171 individual news items
📅 Date range: 2018-01-01 to 2024-12-31
📊 Average news items per day: 20.2
📊 Max news items in a day: 33
📊 Min news items in a day: 2
\n📈 Timeframe distribution:
   short_term: 24764 items (50.4%)
   long_term: 24407 items (49.6%)
\n📰 Sample individual news item:
Date: 2021-07-16
Title: PayPal ups its weekly cryptocurrency buy limit to $100,000
Direction: bullish | Magnitude: medium | Confidence: 0.70
Timeframe: short_term
Summary: PayPal has increased its weekly cryptocurrency buy limit to $100,000, signaling continued expansion and confidence in the digital asset space. This mo...


# 3. Create Training Data Generator

Build a system to convert individual analyzed news items into focused training samples for Bitcoin price prediction models.

In [29]:
class BitcoinTrainingDataCreator:
    """Create training samples directly from existing analyzed Bitcoin data"""
    
    def __init__(self):
        pass
    
    def extract_sentiment_from_probs(self, scenario_probs: Dict[str, float]) -> str:
        """Extract sentiment from scenario probabilities"""
        bull = scenario_probs.get('bull', 0)
        bear = scenario_probs.get('bear', 0)
        base = scenario_probs.get('base', 0)
        
        if bull > bear and bull > base:
            return "bullish"
        elif bear > bull and bear > base:
            return "bearish"
        else:
            return "neutral"
    
    def extract_direction_from_recommendation(self, short_rec: Dict, long_rec: Dict) -> str:
        """Extract price direction from recommendations"""
        short_action = short_rec.get('action', '').lower()
        long_action = long_rec.get('action', '').lower()
        
        if 'buy' in short_action or 'buy' in long_action:
            return "up"
        elif 'sell' in short_action or 'sell' in long_action:
            return "down"
        else:
            return "sideways"
    
    def extract_strength_from_analysis(self, long_term_items: List, short_term_items: List) -> str:
        """Extract impact strength from analysis items"""
        high_impact_count = 0
        total_items = len(long_term_items) + len(short_term_items)
        
        for item in long_term_items + short_term_items:
            if item.get('magnitude', '').lower() == 'high':
                high_impact_count += 1
        
        if total_items == 0:
            return "low"
        
        high_ratio = high_impact_count / total_items
        if high_ratio > 0.6:
            return "high"
        elif high_ratio > 0.3:
            return "medium"
        else:
            return "low"
    
    def extract_timeframe_from_analysis(self, long_term_items: List, short_term_items: List) -> str:
        """Extract primary timeframe from analysis"""
        if len(short_term_items) > len(long_term_items):
            return "short_term"
        elif len(long_term_items) > 0:
            return "medium_term"
        else:
            return "immediate"
    
    def calculate_confidence(self, scenario_probs: Dict, recommendations: Dict) -> float:
        """Calculate confidence based on probabilities and recommendations"""
        # Get the highest probability
        max_prob = max(scenario_probs.values()) if scenario_probs else 0.33
        
        # Get recommendation confidence
        rec_conf = recommendations.get('probability', 0.5) if recommendations else 0.5
        
        # Average them
        return (max_prob + rec_conf) / 2
    
    def extract_key_reason(self, long_term_items: List, short_term_items: List, summary: str) -> str:
        """Extract key reason from analysis"""
        # Try to get from highest confidence item
        all_items = long_term_items + short_term_items
        
        if all_items:
            # Sort by confidence and get top item
            sorted_items = sorted(all_items, key=lambda x: x.get('confidence', 0), reverse=True)
            top_item = sorted_items[0]
            
            reason = top_item.get('rationale', '') or top_item.get('title', '')
            if reason:
                return reason[:100]  # Limit length
        
        # Fallback to summary snippet
        if summary:
            return summary[:100]
        
        return "Market conditions analysis"
    
    def create_training_sample_from_individual_news(self, news_item: Dict[str, Any]) -> Dict[str, str]:
        """Create training sample from individual analyzed news item"""
        
        # Extract individual news item data
        title = news_item.get('title', '')
        summary = news_item.get('summary', '')
        direction = news_item.get('direction', '')
        magnitude = news_item.get('magnitude', '')
        confidence = news_item.get('confidence', 0)
        rationale = news_item.get('rationale', '')
        timeframe_type = news_item.get('timeframe_type', '')
        impact_tags = news_item.get('impact_tags', [])
        
        # Extract daily market context
        scenario_probs = news_item.get('scenario_probs', {})
        short_rec = news_item.get('recommendation_short_term', {})
        long_rec = news_item.get('recommendation_long_term', {})
        
        # Map direction to price direction
        if direction.lower() == 'bullish':
            price_direction = "up"
        elif direction.lower() == 'bearish':
            price_direction = "down"
        else:
            price_direction = "sideways"
        
        # Map magnitude to impact strength
        impact_strength = magnitude.lower() if magnitude.lower() in ['high', 'medium', 'low'] else 'medium'
        
        # Determine timeframe
        if timeframe_type == 'short_term':
            timeframe = "short_term"
        elif timeframe_type == 'long_term':
            timeframe = "medium_term"
        else:
            timeframe = "immediate"
        
        # Extract sentiment from direction
        sentiment = direction.lower() if direction.lower() in ['bullish', 'bearish'] else 'neutral'
        
        # Create Bitcoin effects analysis from individual news item
        effects_analysis = {
            "sentiment": sentiment,
            "price_direction": price_direction,
            "impact_strength": impact_strength,
            "timeframe": timeframe,
            "confidence": float(confidence),
            "key_reason": rationale[:100] if rationale else title[:100]
        }
        
        # Create simple, clear instruction with JSON structure
        instruction = """Analyze Bitcoin news and predict price impact. Return JSON with this exact structure:

{
  "sentiment": "bullish|neutral|bearish",
  "price_direction": "up|sideways|down",
  "impact_strength": "high|medium|low", 
  "timeframe": "immediate|short_term|medium_term",
  "confidence": 0.75,
  "key_reason": "Brief explanation of main factor"
}"""
        
        # Create concise input context with individual news
        input_context = f"""News Title: {title}

News Summary: {summary}

Impact Tags: {', '.join(impact_tags) if impact_tags else 'None'}

Market Context:
Bull {scenario_probs.get('bull', 0):.0%} | Base {scenario_probs.get('base', 0):.0%} | Bear {scenario_probs.get('bear', 0):.0%}

Daily Recommendations:
Short-term: {short_rec.get('action', 'Hold')}
Long-term: {long_rec.get('action', 'Hold')}"""
        
        # Create expected output
        output = json.dumps(effects_analysis, ensure_ascii=False)
        
        return {
            'instruction': instruction,
            'input': input_context,
            'output': output
        }

# Initialize the training data creator
data_creator = BitcoinTrainingDataCreator()
print("✅ Bitcoin Training Data Creator initialized - no API needed!")

✅ Bitcoin Training Data Creator initialized - no API needed!


# 4. Process Individual News Items and Create Training Samples

Process each individual news item from your existing analysis to create focused training samples for Bitcoin price prediction.

In [30]:
# Create training dataset from ALL individual news items
print("🚀 Creating Bitcoin Training Dataset from Individual News Items")
print("=" * 60)

# Use all individual news items (each news article gets its own training sample)
if not df_individual_news.empty:
    print(f"📊 Processing {len(df_individual_news)} individual news items")
    print(f"📅 From {df_individual_news['date'].min().date()} to {df_individual_news['date'].max().date()}")
    
    # Show daily breakdown
    daily_counts = df_individual_news.groupby('date').size()
    print(f"📊 Creating training samples from ~{daily_counts.mean():.0f} news items per day")
else:
    print("❌ No individual news data available for processing")

# Process each individual news item to create training samples
training_samples = []
successful_samples = 0

if not df_individual_news.empty:
    for idx, news_item in tqdm(df_individual_news.iterrows(), total=len(df_individual_news), desc="Creating training samples from individual news"):
        try:
            # Create training sample from individual news item
            training_sample = data_creator.create_training_sample_from_individual_news(news_item.to_dict())
            training_samples.append(training_sample)
            successful_samples += 1
            
            # Show progress every 1000 samples
            if (idx + 1) % 1000 == 0:
                print(f"   ✅ Created {successful_samples} training samples so far...")
            
        except Exception as e:
            print(f"   ⚠️ Error creating sample for news item {idx}: {e}")
            continue

print(f"\\n📈 Processing Complete!")
print(f"   ✅ Successfully created: {successful_samples} training samples")
print(f"   📊 Total samples: {len(training_samples)}")

# Convert to DataFrame for easier handling
if training_samples:
    df_training = pd.DataFrame(training_samples)
    
    # Add metadata
    df_training['sample_id'] = range(len(df_training))
    df_training['created_at'] = datetime.now().isoformat()
    
    print(f"\\n📋 Dataset Statistics:")
    print(f"   Total samples: {len(df_training)}")
    print(f"   Avg instruction length: {df_training['instruction'].str.len().mean():.0f} chars")
    print(f"   Avg input length: {df_training['input'].str.len().mean():.0f} chars")
    print(f"   Avg output length: {df_training['output'].str.len().mean():.0f} chars")
    
    # Analyze distribution
    try:
        outputs = []
        for output_str in df_training['output'][:100]:  # Sample first 100 for analysis
            output_dict = json.loads(output_str)
            outputs.append(output_dict)
        
        if outputs:
            sentiments = [o.get('sentiment', 'unknown') for o in outputs]
            directions = [o.get('price_direction', 'unknown') for o in outputs]
            strengths = [o.get('impact_strength', 'unknown') for o in outputs]
            
            print(f"\\n📊 Sample Distribution (first 100 items):")
            print(f"   Sentiments: {dict(pd.Series(sentiments).value_counts())}")
            print(f"   Directions: {dict(pd.Series(directions).value_counts())}")
            print(f"   Strengths: {dict(pd.Series(strengths).value_counts())}")
    except:
        pass
    
    # Display sample
    print(f"\\n📄 Sample Training Entry:")
    sample_idx = len(df_training) // 2
    sample = df_training.iloc[sample_idx]
    
    print("\\nInstruction:")
    print(sample['instruction'])
    
    print("\\nInput:")
    print(sample['input'])
    
    print("\\nOutput:")
    print(sample['output'])
    
else:
    print("❌ No training samples generated")
    df_training = pd.DataFrame()

🚀 Creating Bitcoin Training Dataset from Individual News Items
📊 Processing 49171 individual news items
📅 From 2018-01-01 to 2024-12-31
📊 Creating training samples from ~20 news items per day


Creating training samples from individual news:   0%|          | 0/49171 [00:00<?, ?it/s]

   ✅ Created 1000 training samples so far...
   ✅ Created 2000 training samples so far...
   ✅ Created 3000 training samples so far...
   ✅ Created 4000 training samples so far...
   ✅ Created 5000 training samples so far...
   ✅ Created 6000 training samples so far...
   ✅ Created 7000 training samples so far...
   ✅ Created 8000 training samples so far...
   ✅ Created 9000 training samples so far...
   ✅ Created 10000 training samples so far...
   ✅ Created 11000 training samples so far...
   ✅ Created 12000 training samples so far...
   ✅ Created 13000 training samples so far...
   ✅ Created 14000 training samples so far...
   ✅ Created 15000 training samples so far...
   ✅ Created 16000 training samples so far...
   ✅ Created 17000 training samples so far...
   ✅ Created 18000 training samples so far...
   ✅ Created 19000 training samples so far...
   ✅ Created 20000 training samples so far...
   ✅ Created 21000 training samples so far...
   ✅ Created 22000 training samples so far.

# 5. Create and Upload Individual News Training Dataset

Convert ALL individual news training samples into a HuggingFace dataset for Bitcoin price prediction model training.

In [31]:
# Create HuggingFace Dataset and Upload
if not df_training.empty:
    print("📦 Creating HuggingFace Dataset...")
    
    # Prepare dataset
    dataset_dict = {
        'instruction': df_training['instruction'].tolist(),
        'input': df_training['input'].tolist(),
        'output': df_training['output'].tolist(),
        'sample_id': df_training['sample_id'].tolist(),
        'created_at': df_training['created_at'].tolist()
    }
    
    # Create HuggingFace Dataset
    hf_dataset = Dataset.from_dict(dataset_dict)
    
    print(f"✅ Dataset created with {len(hf_dataset)} samples")
    
    # Upload to HuggingFace Hub
    repo_name = "bitcoin-individual-news-dataset"
    repo_id = f"tahamajs/{repo_name}"
    
    try:
        print(f"🚀 Uploading dataset to {repo_id}...")
        
        hf_dataset.push_to_hub(
            repo_id,
            commit_message=f"Bitcoin individual news training dataset with {len(hf_dataset)} samples from individual news items",
        )
        
        print(f"\\n🎉 Dataset Successfully Uploaded!")
        print(f"🔗 Dataset URL: https://huggingface.co/datasets/{repo_id}")
        print(f"📊 Total Samples: {len(hf_dataset)}")
        
        # Dataset statistics
        avg_instruction_length = sum(len(x) for x in dataset_dict['instruction']) / len(dataset_dict['instruction'])
        avg_input_length = sum(len(x) for x in dataset_dict['input']) / len(dataset_dict['input'])
        avg_output_length = sum(len(x) for x in dataset_dict['output']) / len(dataset_dict['output'])
        
        print(f"\\n📏 Dataset Statistics:")
        print(f"   Average instruction length: {avg_instruction_length:.0f} characters")
        print(f"   Average input length: {avg_input_length:.0f} characters")
        print(f"   Average output length: {avg_output_length:.0f} characters")
        print(f"   Total dataset size: {avg_instruction_length + avg_input_length + avg_output_length:.0f} avg chars per sample")
        
    except Exception as e:
        print(f"❌ Upload failed: {e}")
        print("💡 Saving dataset locally instead...")
        
        # Save locally as backup
        df_training.to_json(f"bitcoin_news_effects_dataset_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json", 
                           orient='records', indent=2)
        print("💾 Dataset saved locally as JSON file")

else:
    print("❌ No dataset to upload - training samples list is empty")

📦 Creating HuggingFace Dataset...
✅ Dataset created with 49171 samples
🚀 Uploading dataset to tahamajs/bitcoin-individual-news-dataset...
✅ Dataset created with 49171 samples
🚀 Uploading dataset to tahamajs/bitcoin-individual-news-dataset...


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/50 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  73%|#######3  | 12.2MB / 16.7MB            

\n🎉 Dataset Successfully Uploaded!
🔗 Dataset URL: https://huggingface.co/datasets/tahamajs/bitcoin-individual-news-dataset
📊 Total Samples: 49171
\n📏 Dataset Statistics:
   Average instruction length: 338 characters
   Average input length: 623 characters
   Average output length: 243 characters
   Total dataset size: 1204 avg chars per sample


# 6. Sample Analysis and Outputs

Display detailed examples of the Bitcoin news effects analysis to understand the quality and format of the generated dataset.

In [26]:
# Display comprehensive sample outputs
if not df_training.empty:
    print("🔍 COMPREHENSIVE SAMPLE ANALYSIS")
    print("=" * 70)
    
    # Select a representative sample
    sample_idx = len(df_training) // 2
    sample = df_training.iloc[sample_idx]
    
    print("\\n📋 INSTRUCTION:")
    print("-" * 50)
    print(sample['instruction'])
    
    print("\\n📊 INPUT (News Context):")
    print("-" * 50)
    input_lines = sample['input'].split('\\n')
    for line in input_lines[:30]:  # Show first 30 lines
        print(line)
    if len(input_lines) > 30:
        print(f"... ({len(input_lines) - 30} more lines)")
    
    print("\\n🎯 OUTPUT (Bitcoin Effects Analysis):")
    print("-" * 50)
    try:
        output_dict = json.loads(sample['output'])
        print(json.dumps(output_dict, indent=2, ensure_ascii=False)[:1000])  # First 1000 chars
        if len(sample['output']) > 1000:
            print("\\n... (output truncated for display)")
            
        # Extract key insights (using correct field names)
        print("\\n💡 KEY INSIGHTS FROM THIS ANALYSIS:")
        print("-" * 40)
        print(f"Sentiment: {output_dict.get('sentiment', 'N/A')}")
        print(f"Price Direction: {output_dict.get('price_direction', 'N/A')}")
        print(f"Impact Strength: {output_dict.get('impact_strength', 'N/A')}")
        print(f"Timeframe: {output_dict.get('timeframe', 'N/A')}")
        print(f"Confidence: {output_dict.get('confidence', 0):.2f}")
        print(f"Key Reason: {output_dict.get('key_reason', 'N/A')}")
            
    except json.JSONDecodeError:
        print("Raw output (JSON parsing failed):")
        print(sample['output'][:500] + "..." if len(sample['output']) > 500 else sample['output'])
    
    print("\\n📈 DATASET OVERVIEW:")
    print("=" * 50)
    print(f"Total samples: {len(df_training)}")
    
    # Analyze output patterns (using correct field names)
    sentiments = []
    directions = []
    strengths = []
    timeframes = []
    
    for _, row in df_training.iterrows():
        try:
            output_dict = json.loads(row['output'])
            sentiments.append(output_dict.get('sentiment', 'unknown'))
            directions.append(output_dict.get('price_direction', 'unknown'))
            strengths.append(output_dict.get('impact_strength', 'unknown'))
            timeframes.append(output_dict.get('timeframe', 'unknown'))
        except:
            continue
    
    if sentiments:
        print(f"\\nSentiment Distribution:")
        sentiment_counts = pd.Series(sentiments).value_counts()
        for sentiment, count in sentiment_counts.items():
            print(f"  {sentiment}: {count} ({count/len(sentiments)*100:.1f}%)")
        
        print(f"\\nPrice Direction Distribution:")
        direction_counts = pd.Series(directions).value_counts()
        for direction, count in direction_counts.items():
            print(f"  {direction}: {count} ({count/len(directions)*100:.1f}%)")
            
        print(f"\\nImpact Strength Distribution:")
        strength_counts = pd.Series(strengths).value_counts()
        for strength, count in strength_counts.items():
            print(f"  {strength}: {count} ({count/len(strengths)*100:.1f}%)")
            
        print(f"\\nTimeframe Distribution:")
        timeframe_counts = pd.Series(timeframes).value_counts()
        for timeframe, count in timeframe_counts.items():
            print(f"  {timeframe}: {count} ({count/len(timeframes)*100:.1f}%)")
    
else:
    print("❌ No training samples available for analysis")

🔍 COMPREHENSIVE SAMPLE ANALYSIS
\n📋 INSTRUCTION:
--------------------------------------------------
Analyze Bitcoin news and predict price impact. Return JSON with this exact structure:

{
  "sentiment": "bullish|neutral|bearish",
  "price_direction": "up|sideways|down",
  "impact_strength": "high|medium|low", 
  "timeframe": "immediate|short_term|medium_term",
  "confidence": 0.75,
  "key_reason": "Brief explanation of main factor"
}
\n📊 INPUT (News Context):
--------------------------------------------------
News Title: PayPal ups its weekly cryptocurrency buy limit to $100,000

News Summary: PayPal has increased its weekly cryptocurrency buy limit to $100,000, signaling continued expansion and confidence in the digital asset space. This move, following similar expansions by Square and the introduction of crypto checkout options, makes it easier for retail investors to engage with cryptocurrencies. Higher buy limits can increase overall demand.

Impact Tags: adoption, liquidity

Mark

# Bitcoin News Effects Analysis Dataset

This notebook creates a specialized dataset that analyzes news summaries and their predicted effects on Bitcoin prices. It uses:

1. **Existing comprehensive news data** from your previous dataset
2. **Gemini Flash LLM** to analyze news summaries and predict Bitcoin effects
3. **Structured output format** for effect prediction training

**Dataset Purpose:**
- Train models to predict Bitcoin price effects from news summaries
- Analyze sentiment, market impact, and price drivers
- Generate actionable trading insights from news analysis

**Output Format:**
- Input: News summary + market context
- Output: Structured Bitcoin effect analysis (direction, magnitude, timeframe, confidence)

In [27]:
print("🚀 INDIVIDUAL BITCOIN NEWS TRAINING DATASET - READY!")
print("=" * 60)
print()
print("📋 WHAT THIS CREATES:")
print("• Training sample for EACH individual news item (~100 per day)")
print("• Massive dataset with detailed news-to-price-impact predictions")
print("• Each news article gets its own training example")
print()
print("📊 DATASET SCALE:")
print("• ~100 news items per day")
print("• Thousands of days of data")
print("• Potentially 100,000+ training samples")
print()
print("🎯 TRAINING FORMAT PER NEWS ITEM:")
print("• INSTRUCTION: 'Analyze Bitcoin news and predict price impact'")
print("• INPUT: Individual news title + summary + tags + market context")  
print("• OUTPUT: JSON with sentiment, direction, strength, timeframe, confidence, reason")
print()
print("💡 PERFECT FOR:")
print("• Training models on granular news analysis")
print("• Learning specific news patterns and their Bitcoin impacts")
print("• Building highly detailed prediction systems")
print()
print("⚡ Execute all cells to generate comprehensive individual news dataset!")

🚀 INDIVIDUAL BITCOIN NEWS TRAINING DATASET - READY!

📋 WHAT THIS CREATES:
• Training sample for EACH individual news item (~100 per day)
• Massive dataset with detailed news-to-price-impact predictions
• Each news article gets its own training example

📊 DATASET SCALE:
• ~100 news items per day
• Thousands of days of data
• Potentially 100,000+ training samples

🎯 TRAINING FORMAT PER NEWS ITEM:
• INSTRUCTION: 'Analyze Bitcoin news and predict price impact'
• INPUT: Individual news title + summary + tags + market context
• OUTPUT: JSON with sentiment, direction, strength, timeframe, confidence, reason

💡 PERFECT FOR:
• Training models on granular news analysis
• Learning specific news patterns and their Bitcoin impacts
• Building highly detailed prediction systems

⚡ Execute all cells to generate comprehensive individual news dataset!
