# Create Bitcoin Price Prediction Dataset with Local News Summaries

This notebook creates a custom dataset for fine-tuning a language model to predict Bitcoin prices using only local data sources. It combines:
1.  **Daily News Summaries**: Generated from your local Bitcoin news analysis files.
2.  **Historical Price Data**: From Yahoo Finance.
3.  **Additional Market Data**: Gold and Oil prices for macro context.

The final dataset will be structured for an instruction-based fine-tuning task without any external dataset dependencies.

In [79]:
!pip install -q datasets pandas numpy yfinance tqdm

8374.68s - pydevd: Sending message related to process being replaced timed-out after 5 seconds



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [80]:
import pandas as pd
import numpy as np
import json
import glob
import os
from datetime import timedelta
from tqdm.notebook import tqdm
import yfinance as yf
from datasets import Dataset, load_dataset
from IPython.display import display

In [81]:
from huggingface_hub import login

# IMPORTANT: Replace "YOUR_HF_TOKEN" with your actual Hugging Face access token.
# For better security, it's recommended to use environment variables or secrets management.
hf_token = "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" 

if not hf_token or hf_token == "YOUR_HF_TOKEN":
    print("❌ Please replace 'YOUR_HF_TOKEN' with your actual Hugging Face access token and re-run the cell.")
else:
    try:
        login(token=hf_token, add_to_git_credential=False)
        print("✅ Successfully logged in.")
    except Exception as e:
        print(f"❌ Login failed: {e}")


✅ Successfully logged in.


In [82]:
# --- 1. Load Historical Price Data ---
print("Downloading price data for Bitcoin...")
try:
    data = yf.download('BTC-USD', start='2018-01-01', end='2024-06-01', progress=False)
    
    # Debug: Check the raw structure
    print("Raw data columns:", data.columns.tolist())
    
    # Handle MultiIndex columns from yfinance
    if isinstance(data.columns, pd.MultiIndex):
        # Flatten MultiIndex columns - take the first level (metric names like 'Close', 'Open', etc.)
        data.columns = [col[0] for col in data.columns]
        print("Flattened columns:", data.columns.tolist())
    
    # Create a clean price dataframe with a single 'btc_price' column
    if 'Close' in data.columns:
        df_prices = pd.DataFrame({'btc_price': data['Close']})
    elif 'Adj Close' in data.columns:
        df_prices = pd.DataFrame({'btc_price': data['Adj Close']})
    else:
        # Fallback to the first numeric column
        num_cols = [c for c in data.columns if data[c].dtype.kind in 'biufc']
        if num_cols:
            df_prices = pd.DataFrame({'btc_price': data[num_cols[0]]})
        else:
            df_prices = pd.DataFrame(columns=['btc_price'])
    
    df_prices.index = pd.to_datetime(df_prices.index, utc=True)
    
    # Remove any duplicate columns
    df_prices = df_prices.loc[:, ~df_prices.columns.duplicated()]
    
    print("✅ Bitcoin price data loaded.")
    print("Final price columns:", df_prices.columns.tolist())
    print(f"Data shape: {df_prices.shape}")
    
except Exception as e:
    print(f"❌ Could not load price data. Error: {e}")
    df_prices = pd.DataFrame(columns=['btc_price'])

# Try to load additional market data like in the other notebooks
print("\nLoading additional market data...")
try:
    print("Downloading Gold and Oil data...")
    # Download separately to avoid issues
    gold_data = yf.download('GC=F', start='2018-01-01', end='2024-06-01', progress=False)
    oil_data = yf.download('CL=F', start='2018-01-01', end='2024-06-01', progress=False)
    
    # Handle gold data
    if not gold_data.empty:
        if isinstance(gold_data.columns, pd.MultiIndex):
            gold_data.columns = [col[0] for col in gold_data.columns]
        df_gold = pd.DataFrame({'gold_price': gold_data.get('Close', gold_data.iloc[:, 0])})
        df_gold.index = pd.to_datetime(df_gold.index, utc=True)
        df_gold = df_gold.loc[:, ~df_gold.columns.duplicated()]
    else:
        df_gold = pd.DataFrame(columns=['gold_price'])
    
    # Handle oil data
    if not oil_data.empty:
        if isinstance(oil_data.columns, pd.MultiIndex):
            oil_data.columns = [col[0] for col in oil_data.columns]
        df_oil = pd.DataFrame({'oil_price': oil_data.get('Close', oil_data.iloc[:, 0])})
        df_oil.index = pd.to_datetime(df_oil.index, utc=True)
        df_oil = df_oil.loc[:, ~df_oil.columns.duplicated()]
    else:
        df_oil = pd.DataFrame(columns=['oil_price'])
    
    print("✅ Gold and Oil data loaded.")
    
except Exception as e:
    print(f"⚠️  Could not load commodity data: {e}")
    df_gold = pd.DataFrame(columns=['gold_price'])
    df_oil = pd.DataFrame(columns=['oil_price'])

# Skip loading external LLM metrics dataset - using only local data
print("\nSkipping external LLM-Augmented dataset (using local data only)...")
df_llm_metrics = pd.DataFrame()
print("✅ Using local Bitcoin news and price data only.")

print("\n--- Bitcoin Price Data ---")
display(df_prices.head(2))

Downloading price data for Bitcoin...


  data = yf.download('BTC-USD', start='2018-01-01', end='2024-06-01', progress=False)
  gold_data = yf.download('GC=F', start='2018-01-01', end='2024-06-01', progress=False)


Raw data columns: [('Close', 'BTC-USD'), ('High', 'BTC-USD'), ('Low', 'BTC-USD'), ('Open', 'BTC-USD'), ('Volume', 'BTC-USD')]
Flattened columns: ['Close', 'High', 'Low', 'Open', 'Volume']
✅ Bitcoin price data loaded.
Final price columns: ['btc_price']
Data shape: (2343, 1)

Loading additional market data...
Downloading Gold and Oil data...
✅ Gold and Oil data loaded.

Skipping external LLM-Augmented dataset (using local data only)...
✅ Using local Bitcoin news and price data only.

--- Bitcoin Price Data ---


  oil_data = yf.download('CL=F', start='2018-01-01', end='2024-06-01', progress=False)


Unnamed: 0_level_0,btc_price
Date,Unnamed: 1_level_1
2018-01-01 00:00:00+00:00,13657.200195
2018-01-02 00:00:00+00:00,14982.099609


In [83]:
# --- 3. Load Complete News Data ---
print("Loading complete daily news data...")
summaries_path = "/Users/tahamajs/Documents/uni/LLM/Files/Final Project/outputs_btc_effects/per_date/*.json"
all_summary_files = glob.glob(summaries_path)
summaries_list = []

# First, let's inspect the structure of a sample file
if all_summary_files:
    sample_file = all_summary_files[0]
    print(f"📁 Sample file structure from: {os.path.basename(sample_file)}")
    try:
        with open(sample_file, 'r', encoding='utf-8') as f:
            sample_data = json.load(f)
            print("Available keys:", list(sample_data.keys()))
            if 'daily_view' in sample_data:
                print("Daily view keys:", list(sample_data['daily_view'].keys()))
            print("-" * 50)
    except Exception as e:
        print(f"Could not read sample file: {e}")

for file_path in all_summary_files:
    if os.path.getsize(file_path) > 0:
        with open(file_path, 'r', encoding='utf-8') as f:
            try:
                data = json.load(f)
                # Extract the date from the filename (e.g., '2024-01-15.json')
                date_str = os.path.basename(file_path).split('.')[0]
                
                # Extract ALL available information from the JSON
                daily_info = {
                    'date': pd.to_datetime(date_str, utc=True),
                    # Basic summary (as before)
                    'summary': data.get('daily_view', {}).get('summary', 'No summary available.'),
                }
                
                # Add daily_view information if available
                daily_view = data.get('daily_view', {})
                if daily_view:
                    daily_info.update({
                        'daily_sentiment': daily_view.get('sentiment', 'neutral'),
                        'daily_key_events': daily_view.get('key_events', []),
                        'daily_market_impact': daily_view.get('market_impact', 'unknown'),
                        'daily_price_drivers': daily_view.get('price_drivers', []),
                        'daily_risk_factors': daily_view.get('risk_factors', []),
                        'daily_opportunities': daily_view.get('opportunities', []),
                    })
                
                # Add any other top-level keys from the JSON
                for key, value in data.items():
                    if key != 'daily_view' and isinstance(value, (str, int, float, list)):
                        daily_info[f'news_{key}'] = value
                    elif key != 'daily_view' and isinstance(value, dict):
                        # For nested dictionaries, flatten them with prefixes
                        for subkey, subvalue in value.items():
                            if isinstance(subvalue, (str, int, float, list)):
                                daily_info[f'news_{key}_{subkey}'] = subvalue
                
                # Add raw JSON as backup (for complex structures)
                daily_info['raw_news_data'] = json.dumps(data, ensure_ascii=False)
                
                summaries_list.append(daily_info)
                
            except json.JSONDecodeError:
                print(f"Warning: Could not decode JSON from {file_path}")

df_summaries = pd.DataFrame(summaries_list)
if not df_summaries.empty:
    df_summaries.set_index('date', inplace=True)
    df_summaries.sort_index(inplace=True)
    print(f"✅ Loaded {len(df_summaries)} complete news records.")
    print(f"📊 Available columns: {len(df_summaries.columns)}")
    print(f"📅 Date range: {df_summaries.index.min()} to {df_summaries.index.max()}")
    print(f"🔍 Column names: {list(df_summaries.columns)}")
    display(df_summaries.head(2))
else:
    print("❌ No summaries were loaded. Check the file path.")
    df_summaries = pd.DataFrame(columns=['summary'])

Loading complete daily news data...
📁 Sample file structure from: 2019-11-21.json📁 Sample file structure from: 2019-11-21.json
Available keys: ['date', 'long_term', 'short_term', 'daily_view']
Daily view keys: ['date', 'summary', 'scenario_probs', 'recommendation_short_term', 'recommendation_long_term', 'key_risks', 'watch_items']
--------------------------------------------------

Available keys: ['date', 'long_term', 'short_term', 'daily_view']
Daily view keys: ['date', 'summary', 'scenario_probs', 'recommendation_short_term', 'recommendation_long_term', 'key_risks', 'watch_items']
--------------------------------------------------
✅ Loaded 2437 complete news records.
📊 Available columns: 11
📅 Date range: 2018-01-01 00:00:00+00:00 to 2024-12-31 00:00:00+00:00
🔍 Column names: ['summary', 'daily_sentiment', 'daily_key_events', 'daily_market_impact', 'daily_price_drivers', 'daily_risk_factors', 'daily_opportunities', 'news_date', 'news_long_term', 'news_short_term', 'raw_news_data']
✅ L

Unnamed: 0_level_0,summary,daily_sentiment,daily_key_events,daily_market_impact,daily_price_drivers,daily_risk_factors,daily_opportunities,news_date,news_long_term,news_short_term,raw_news_data
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2018-01-01 00:00:00+00:00,The market sentiment for Bitcoin is decidedly ...,neutral,[],unknown,[],[],[],2018-01-01,"[{'pick_idx': 4, 'id': 'n0a76d5794f43', 'title...","[{'pick_idx': 1, 'id': 'n42d97c698aea', 'title...","{""date"": ""2018-01-01"", ""long_term"": [{""pick_id..."
2018-01-02 00:00:00+00:00,The market is showing mixed signals as 2018 be...,neutral,[],unknown,[],[],[],2018-01-02,"[{'pick_idx': 20, 'id': 'nb7a3ae68fdf7', 'titl...","[{'pick_idx': 7, 'id': 'nf6baa9c6d6a2', 'title...","{""date"": ""2018-01-02"", ""long_term"": [{""pick_id..."


In [84]:
# --- 4. Combine All Data Sources ---
print("Combining all data sources into a single DataFrame...")

# Helper function from the advanced notebooks
def get_value_for_day(df, day_str, col_name):
    if df.empty or day_str not in df.index:
        return np.nan
    try:
        value = df.loc[day_str, col_name]
        if not pd.api.types.is_scalar(value): 
            value = value.iloc[0] if hasattr(value, 'iloc') else value
        return value
    except (KeyError, IndexError, AttributeError):
        return np.nan

# Create a daily date range (use a narrower range where we know we have data)
daily_dates = pd.date_range(start='2018-01-01', end='2024-05-31', freq='D', tz='UTC')
final_data_list = []

# Debug counters
total_days = len(daily_dates)
days_with_no_price_data = 0
days_with_insufficient_history = 0
days_with_insufficient_future = 0
days_added = 0

for day in tqdm(daily_dates, desc="Aggregating Daily Data"):
    day_str = day.strftime('%Y-%m-%d')

    try:
        # Check if the price data exists and has the required column
        if df_prices.empty or 'btc_price' not in df_prices.columns:
            days_with_no_price_data += 1
            continue
            
        # Get historical prices (past 60 days) - FIX: Extract Series first, then convert to list
        past_prices_df = df_prices[(df_prices.index >= (day - timedelta(days=60))) & (df_prices.index < day)]
        if past_prices_df.empty:
            days_with_insufficient_history += 1
            continue
        past_60_day_prices = past_prices_df['btc_price'].dropna().tolist()  # Extract Series first
        
        if len(past_60_day_prices) < 30:  # Require at least 30 days of history
            days_with_insufficient_history += 1
            continue
        
        # Get future prices (next 10 days) - FIX: Extract Series first, then convert to list
        future_prices_df = df_prices[(df_prices.index > day) & (df_prices.index <= (day + timedelta(days=10)))]
        if future_prices_df.empty:
            days_with_insufficient_future += 1
            continue
        future_10_day_prices = future_prices_df['btc_price'].dropna().tolist()  # Extract Series first

        # Need a full 10-day target
        if len(future_10_day_prices) < 10:
            days_with_insufficient_future += 1
            continue

        # Get additional market data (like in the advanced notebooks)
        gold_price = get_value_for_day(df_gold, day_str, 'gold_price') if not df_gold.empty else np.nan
        oil_price = get_value_for_day(df_oil, day_str, 'oil_price') if not df_oil.empty else np.nan
        
        # Skip external LLM metrics - using only local data
        hash_rate = np.nan
        difficulty = np.nan
        transactions = np.nan
        unique_addresses = np.nan
        fng_index = np.nan
        cbbi_index = np.nan
        llm_sentiment = np.nan
        market_cap = np.nan
        total_bitcoins = np.nan
        estimated_tx_volume = np.nan

        # Get comprehensive news data for this date
        news_summary = get_value_for_day(df_summaries, day_str, 'summary')
        daily_sentiment = get_value_for_day(df_summaries, day_str, 'daily_sentiment')
        daily_key_events = get_value_for_day(df_summaries, day_str, 'daily_key_events')
        daily_market_impact = get_value_for_day(df_summaries, day_str, 'daily_market_impact')
        daily_price_drivers = get_value_for_day(df_summaries, day_str, 'daily_price_drivers')
        daily_risk_factors = get_value_for_day(df_summaries, day_str, 'daily_risk_factors')
        daily_opportunities = get_value_for_day(df_summaries, day_str, 'daily_opportunities')
        news_long_term = get_value_for_day(df_summaries, day_str, 'news_long_term')
        news_short_term = get_value_for_day(df_summaries, day_str, 'news_short_term')
        raw_news_data = get_value_for_day(df_summaries, day_str, 'raw_news_data')

        final_data_list.append({
            'date': day.date(),
            # Comprehensive news data
            'news_summary': news_summary,
            'daily_sentiment': daily_sentiment,
            'daily_key_events': daily_key_events,
            'daily_market_impact': daily_market_impact,
            'daily_price_drivers': daily_price_drivers,
            'daily_risk_factors': daily_risk_factors,
            'daily_opportunities': daily_opportunities,
            'news_long_term': news_long_term,
            'news_short_term': news_short_term,
            'raw_news_data': raw_news_data,
            # Price data
            'btc_price_history_60d': past_60_day_prices,
            'btc_price_target_10d': future_10_day_prices,
            # Additional market data
            'gold_close_price': gold_price,
            'oil_close_price': oil_price,
            # Note: Skipping external LLM metrics to avoid dependencies
            # Using only local Bitcoin news and price data
        })
        days_added += 1
        
    except Exception as e:
        print(f"Error processing data for {day_str}: {e}")
        continue

final_dataset = pd.DataFrame(final_data_list)

    # Fill missing values for available columns
if not final_dataset.empty:
    # Forward fill numeric columns that exist
    cols_to_fill_numeric = ['gold_close_price', 'oil_close_price']
    for col in cols_to_fill_numeric:
        if col in final_dataset.columns:
            final_dataset[col] = final_dataset[col].ffill()
    
    # Fill news text columns
    final_dataset['news_summary'].fillna('No news summary available.', inplace=True)
    final_dataset['daily_sentiment'].fillna('neutral', inplace=True)
    final_dataset['daily_market_impact'].fillna('unknown', inplace=True)
    final_dataset['daily_key_events'].fillna('[]', inplace=True)
    final_dataset['daily_price_drivers'].fillna('[]', inplace=True)
    final_dataset['daily_risk_factors'].fillna('[]', inplace=True)
    final_dataset['daily_opportunities'].fillna('[]', inplace=True)
    final_dataset['news_long_term'].fillna('[]', inplace=True)
    final_dataset['news_short_term'].fillna('[]', inplace=True)
    final_dataset['raw_news_data'].fillna('{}', inplace=True)
print(f"\n✅ Final combined dataset created with {len(final_dataset)} samples.")
print(f"\nDebug statistics:")
print(f"Total days processed: {total_days}")
print(f"Days with no price data: {days_with_no_price_data}")
print(f"Days with insufficient history (<30 days): {days_with_insufficient_history}")
print(f"Days with insufficient future (<10 days): {days_with_insufficient_future}")
print(f"Days successfully added to dataset: {days_added}")

if not final_dataset.empty:
    display(final_dataset.head())
    print(f"\nDataset columns: {final_dataset.columns.tolist()}")
    print(f"Sample price history length: {len(final_dataset.iloc[0]['btc_price_history_60d']) if not final_dataset.empty else 'N/A'}")
    print(f"Sample target length: {len(final_dataset.iloc[0]['btc_price_target_10d']) if not final_dataset.empty else 'N/A'}")
else:
    print("Dataset is empty. No samples were created.")

Combining all data sources into a single DataFrame...


Aggregating Daily Data:   0%|          | 0/2343 [00:00<?, ?it/s]


✅ Final combined dataset created with 2303 samples.

Debug statistics:
Total days processed: 2343
Days with no price data: 0
Days with insufficient history (<30 days): 30
Days with insufficient future (<10 days): 10
Days successfully added to dataset: 2303


Unnamed: 0,date,news_summary,daily_sentiment,daily_key_events,daily_market_impact,daily_price_drivers,daily_risk_factors,daily_opportunities,news_long_term,news_short_term,raw_news_data,btc_price_history_60d,btc_price_target_10d,gold_close_price,oil_close_price
0,2018-01-31,"The crypto market on January 31, 2018, was cha...",neutral,[],unknown,[],[],[],"[{'pick_idx': 11, 'id': 'n4d25fd07511b', 'titl...","[{'pick_idx': 5, 'id': 'nc285bbcb1a6b', 'title...","{""date"": ""2018-01-31"", ""long_term"": [{""pick_id...","[13657.2001953125, 14982.099609375, 15201.0, 1...","[9170.5400390625, 8830.75, 9174.91015625, 8277...",1339.0,64.730003
1,2018-02-01,Bitcoin experienced a significant downturn tod...,neutral,[],unknown,[],[],[],"[{'pick_idx': 8, 'id': 'n19dbb7010a58', 'title...","[{'pick_idx': 3, 'id': 'nc7631b9dff9a', 'title...","{""date"": ""2018-02-01"", ""long_term"": [{""pick_id...","[13657.2001953125, 14982.099609375, 15201.0, 1...","[8830.75, 9174.91015625, 8277.009765625, 6955....",1344.300049,65.800003
2,2018-02-02,The cryptocurrency market is experiencing a si...,neutral,[],unknown,[],[],[],"[{'pick_idx': 37, 'id': 'n05a6f3d453b5', 'titl...","[{'pick_idx': 4, 'id': 'ndc5191bddd44', 'title...","{""date"": ""2018-02-02"", ""long_term"": [{""pick_id...","[13657.2001953125, 14982.099609375, 15201.0, 1...","[9174.91015625, 8277.009765625, 6955.270019531...",1333.699951,65.449997
3,2018-02-03,The cryptocurrency market is experiencing sign...,neutral,[],unknown,[],[],[],"[{'pick_idx': 14, 'id': 'n153e90d94f5e', 'titl...","[{'pick_idx': 4, 'id': 'n0f56c708e25d', 'title...","{""date"": ""2018-02-03"", ""long_term"": [{""pick_id...","[13657.2001953125, 14982.099609375, 15201.0, 1...","[8277.009765625, 6955.27001953125, 7754.0, 762...",1333.699951,65.449997
4,2018-02-04,The crypto market is experiencing significant ...,neutral,[],unknown,[],[],[],"[{'pick_idx': 12, 'id': 'n21eb7aad55cf', 'titl...","[{'pick_idx': 13, 'id': 'ndcba5b9f608f', 'titl...","{""date"": ""2018-02-04"", ""long_term"": [{""pick_id...","[13657.2001953125, 14982.099609375, 15201.0, 1...","[6955.27001953125, 7754.0, 7621.2998046875, 82...",1333.699951,65.449997



Dataset columns: ['date', 'news_summary', 'daily_sentiment', 'daily_key_events', 'daily_market_impact', 'daily_price_drivers', 'daily_risk_factors', 'daily_opportunities', 'news_long_term', 'news_short_term', 'raw_news_data', 'btc_price_history_60d', 'btc_price_target_10d', 'gold_close_price', 'oil_close_price']
Sample price history length: 30
Sample target length: 10


In [85]:
# --- Debug Price Data Information ---
print("\nDebug Information:")
if 'df_prices' in locals() and not df_prices.empty:
    print(f"Price data date range: {df_prices.index.min()} to {df_prices.index.max()}")
    print(f"Price data column names: {df_prices.columns.tolist()}")
    print(f"Total price data points: {len(df_prices)}")
    
    # Handle duplicate columns by using iloc to access the first column
    if len(df_prices.columns) > 0:
        first_col_name = df_prices.columns[0]
        sample_prices = df_prices.iloc[:3, 0].tolist()  # Use iloc to avoid duplicate column issues
        print(f"Sample prices: {sample_prices}")
    
    # Check for data gaps
    date_diff = df_prices.index.to_series().diff().dropna()
    gaps = date_diff[date_diff > pd.Timedelta(days=1)]
    if not gaps.empty:
        print(f"Found {len(gaps)} gaps in price data. First 3 gaps:")
        for idx, gap in gaps.head(3).items():
            print(f"  Gap of {gap} after {idx}")
    
    # Check if we have duplicate columns
    if len(df_prices.columns) != len(df_prices.columns.unique()):
        print(f"⚠️  WARNING: Found duplicate columns! Unique columns: {df_prices.columns.unique()}")
        print("This needs to be fixed in the price loading cell.")
else:
    print("Price data is empty or not available.")


Debug Information:
Price data date range: 2018-01-01 00:00:00+00:00 to 2024-05-31 00:00:00+00:00
Price data column names: ['btc_price']
Total price data points: 2343
Sample prices: [13657.2001953125, 14982.099609375, 15201.0]


In [100]:
# --- 5. Create Advanced Instruction-Formatted Dataset ---
print("Starting transformation with advanced features...")

# Helper functions from the advanced notebooks
def _fmt_usd(x):
    if x is None or (isinstance(x, float) and pd.isna(x)):
        return "N/A"
    try:
        return f"${float(x):,.2f}"
    except Exception:
        return "N/A"

def _fmt_float(x, nd=2):
    if x is None or (isinstance(x, float) and pd.isna(x)):
        return "N/A"
    try:
        return f"{float(x):.{nd}f}"
    except Exception:
        return "N/A"

def _derive_price_features(prices):
    """Calculate technical indicators from price history"""
    if not isinstance(prices, (list, tuple)) or len(prices) == 0:
        return dict(
            last_close=np.nan, min60=np.nan, max60=np.nan,
            ret_1d_pct=np.nan, ret_7d_pct=np.nan, ret_30d_pct=np.nan,
            std14_pct=np.nan, avg_abs_change14=np.nan, drawdown_from_max_pct=np.nan
        )
    
    arr = np.array(prices, dtype=float)
    last_close = arr[-1]
    min60 = float(np.min(arr))
    max60 = float(np.max(arr))

    def _pct(a, b):
        try:
            return (float(a) / float(b) - 1.0) * 100.0
        except Exception:
            return np.nan

    ret_1d_pct  = _pct(arr[-1], arr[-2])  if arr.size >= 2  else np.nan
    ret_7d_pct  = _pct(arr[-1], arr[-8])  if arr.size >= 8  else np.nan
    ret_30d_pct = _pct(arr[-1], arr[-31]) if arr.size >= 31 else np.nan

    if arr.size >= 15:
        # percent returns over last 14 intervals
        rets = np.diff(arr[-15:]) / arr[-15:-1] * 100.0
        std14_pct = float(np.std(rets, ddof=1))
        avg_abs_change14 = float(np.mean(np.abs(np.diff(arr[-15:]))))
    else:
        std14_pct = np.nan
        avg_abs_change14 = np.nan

    drawdown_from_max_pct = _pct(arr[-1], max60)
    return dict(
        last_close=float(last_close), min60=min60, max60=max60,
        ret_1d_pct=float(ret_1d_pct) if not pd.isna(ret_1d_pct) else np.nan,
        ret_7d_pct=float(ret_7d_pct) if not pd.isna(ret_7d_pct) else np.nan,
        ret_30d_pct=float(ret_30d_pct) if not pd.isna(ret_30d_pct) else np.nan,
        std14_pct=float(std14_pct) if not pd.isna(std14_pct) else np.nan,
        avg_abs_change14=float(avg_abs_change14) if not pd.isna(avg_abs_change14) else np.nan,
        drawdown_from_max_pct=float(drawdown_from_max_pct) if not pd.isna(drawdown_from_max_pct) else np.nan
    )

def _label_action(last_close, future_10, std14_pct):
    """Generate trading action based on volatility-aware analysis"""
    if last_close is None or np.isnan(last_close) or future_10 is None or len(future_10) < 10:
        return "HOLD", 50
    day10 = float(future_10[-1])
    ret10_pct = (day10 / float(last_close) - 1.0) * 100.0
    vol_ref = float(std14_pct) if std14_pct is not None and not np.isnan(std14_pct) else 2.0
    thr = max(1.0, 0.35 * vol_ref)  # %
    if ret10_pct >= thr:
        action = "BUY"
    elif ret10_pct <= -thr:
        action = "SELL"
    else:
        action = "HOLD"
    conf = int(np.clip(10 + (abs(ret10_pct) / max(0.5, vol_ref)) * 40, 5, 95))
    return action, conf

def _risk_bands(last_close, avg_abs_change14):
    """Calculate stop-loss and take-profit levels"""
    if last_close is None or np.isnan(last_close):
        return np.nan, np.nan
    step = float(avg_abs_change14) if avg_abs_change14 is not None and not np.isnan(avg_abs_change14) else 0.02 * float(last_close)
    step = max(step, 0.005 * float(last_close))  # at least 0.5% of price
    sl = max(0.01, float(last_close) - 2.0 * step)
    tp = float(last_close) + 2.0 * step
    return sl, tp

llm_finetuning_data = []
rows_seen, rows_kept, rows_skipped = 0, 0, 0

for index, row in final_dataset.iterrows():
    rows_seen += 1
    
    # Ensure we have valid price data
    if not row['btc_price_history_60d'] or not row['btc_price_target_10d'] or len(row['btc_price_target_10d']) < 10:
        rows_skipped += 1
        continue

    # Calculate price features
    hist_prices = row['btc_price_history_60d']
    price_history_str = ", ".join([f"{p:.2f}" for p in hist_prices])
    feats = _derive_price_features(hist_prices)
    
    # Generate trading signals
    action, confidence = _label_action(feats['last_close'], row['btc_price_target_10d'], feats['std14_pct'])
    sl, tp = _risk_bands(feats['last_close'], feats['avg_abs_change14'])

    # Format additional data
    gold_price = _fmt_usd(row.get('gold_close_price'))
    oil_price = _fmt_usd(row.get('oil_close_price'))
    mcap_str = _fmt_usd(row.get('btc_market_cap'))
    hash_rate = _fmt_float(row.get('btc_hash_rate'))
    tx_count = _fmt_float(row.get('btc_transactions'), 0)
    uniq_addr = _fmt_float(row.get('btc_unique_addresses'), 0)
    fng_str = _fmt_float(row.get('fear_and_greed_index'))
    cbbi_str = _fmt_float(row.get('cbbi_index'))
    diff_str = _fmt_float(row.get('btc_difficulty'), 0)
    tot_supply = _fmt_float(row.get('btc_total_supply'), 0)
    est_tx_vol = _fmt_usd(row.get('btc_estimated_tx_volume_usd'))
    llm_sent = "N/A" if pd.isna(row.get('llm_sentiment_class')) else str(row.get('llm_sentiment_class'))

    # --- Advanced Instruction ---
    instruction = (
        "You are an expert quantitative crypto analyst. Your tasks:\n"
        "1) Analyze the context and decide an actionable stance for BTC-USD: BUY, SELL, or HOLD.\n"
        "2) Forecast the NEXT 10 daily CLOSING prices (USD).\n\n"
        f"CONTEXT DATE: {row['date']}\n\n"
        "ANALYSIS FRAMEWORK:\n"
        "• Technical Analysis: Use price trends, volatility, and momentum indicators\n"
        "• Macro Analysis: Consider gold/oil prices for broader market context\n"
        "• News Analysis: Integrate comprehensive daily news summaries for market catalysts\n\n"
        "OUTPUT FORMAT (JSON ONLY):\n"
        "Return a single JSON object with EXACTLY these keys:\n"
        "{\"action\":\"BUY|SELL|HOLD\",\"confidence\":<int 1-99>,"
        "\"stop_loss\":<price 2dp>,\"take_profit\":<price 2dp>,"
        "\"forecast_10d\":[<10 prices 2dp>]}\n"
        "No extra text, no explanations, just the JSON."
    )

    # --- Enhanced Input (using only local data) ---
    input_text = f"""
Daily Context — {row['date']}

[Technical Price Analysis]
- Current Price: {_fmt_usd(feats['last_close'])}
- 60-Day Range: {_fmt_usd(feats['min60'])} → {_fmt_usd(feats['max60'])}
- 1D Return: {_fmt_float(feats['ret_1d_pct'])}%
- 7D Return: {_fmt_float(feats['ret_7d_pct'])}%
- 30D Return: {_fmt_float(feats['ret_30d_pct'])}%
- Volatility (14d): {_fmt_float(feats['std14_pct'])}%
- Avg Daily Change (14d): {_fmt_float(feats['avg_abs_change14'])}
- Drawdown from Max: {_fmt_float(feats['drawdown_from_max_pct'])}%

[Price History (Last 60 Days USD)]
[{price_history_str}]

[Macro & Commodities Context]
- Gold Price: {gold_price}
- Crude Oil Price: {oil_price}

[Market Context]
- Bitcoin dominates crypto market as leading digital asset
- Price influenced by adoption, regulation, and macro factors

[Comprehensive News Analysis]
Summary: {row['news_summary']}

Sentiment: {row.get('daily_sentiment', 'neutral')}
Market Impact: {row.get('daily_market_impact', 'unknown')}

Key Events: {', '.join(row.get('daily_key_events', [])) if isinstance(row.get('daily_key_events'), list) and row.get('daily_key_events') else 'No key events'}

Price Drivers: {', '.join(row.get('daily_price_drivers', [])) if isinstance(row.get('daily_price_drivers'), list) and row.get('daily_price_drivers') else 'No specific drivers'}

Risk Factors: {', '.join(row.get('daily_risk_factors', [])) if isinstance(row.get('daily_risk_factors'), list) and row.get('daily_risk_factors') else 'No major risks'}

Opportunities: {', '.join(row.get('daily_opportunities', [])) if isinstance(row.get('daily_opportunities'), list) and row.get('daily_opportunities') else 'No specific opportunities'}

Short-term News: {str(row.get('news_short_term', 'No short-term news'))[:200] + '...' if len(str(row.get('news_short_term', ''))) > 200 else str(row.get('news_short_term', 'No short-term news'))}

Long-term News: {str(row.get('news_long_term', 'No long-term news'))[:200] + '...' if len(str(row.get('news_long_term', ''))) > 200 else str(row.get('news_long_term', 'No long-term news'))}

Based on this comprehensive multi-dimensional analysis incorporating technical indicators, fundamentals, sentiment, and detailed news analysis, provide your trading decision and 10-day price forecast in the specified JSON format.
""".strip()

    # --- Enhanced Output ---
    forecast_str = ", ".join([f"{p:.2f}" for p in row['btc_price_target_10d'][:10]])
    output_text = (
        '{'
        f'"action":"{action}",'
        f'"confidence":{confidence},'
        f'"stop_loss":{sl:.2f},'
        f'"take_profit":{tp:.2f},'
        f'"forecast_10d":[{forecast_str}]'
        '}'
    )

    llm_finetuning_data.append({
        'instruction': instruction.strip(),
        'input': input_text,
        'output': output_text
    })
    rows_kept += 1

# Convert to Hugging Face Dataset
df_finetuning = pd.DataFrame(llm_finetuning_data)
hf_finetuning_dataset = Dataset.from_pandas(df_finetuning)

print(f"\n✅ Advanced dataset transformation complete!")
print(f"   Rows processed: {rows_seen}")
print(f"   Rows kept: {rows_kept}")
print(f"   Rows skipped: {rows_skipped}")
print(f"   Final dataset size: {len(hf_finetuning_dataset)} samples")

if not df_finetuning.empty:
    print("\nSample entry:")
    display(df_finetuning.head(1))
    print("Instruction:", df_finetuning.iloc[0]['instruction'][:200] + "...")
    print("Input:", df_finetuning.iloc[0]['input'][:300] + "...")
    print("Output:", df_finetuning.iloc[0]['output'])

Starting transformation with advanced features...

✅ Advanced dataset transformation complete!
   Rows processed: 2303
   Rows kept: 2303
   Rows skipped: 0
   Final dataset size: 2303 samples

Sample entry:

✅ Advanced dataset transformation complete!
   Rows processed: 2303
   Rows kept: 2303
   Rows skipped: 0
   Final dataset size: 2303 samples

Sample entry:


Unnamed: 0,instruction,input,output
0,You are an expert quantitative crypto analyst....,Daily Context — 2018-01-31\n\n[Technical Price...,"{""action"":""SELL"",""confidence"":95,""stop_loss"":9..."


Instruction: You are an expert quantitative crypto analyst. Your tasks:
1) Analyze the context and decide an actionable stance for BTC-USD: BUY, SELL, or HOLD.
2) Forecast the NEXT 10 daily CLOSING prices (USD).

...
Input: Daily Context — 2018-01-31

[Technical Price Analysis]
- Current Price: $10,106.30
- 60-Day Range: $10,106.30 → $17,527.00
- 1D Return: -10.54%
- 7D Return: -7.01%
- 30D Return: N/A%
- Volatility (14d): 5.76%
- Avg Daily Change (14d): 501.23
- Drawdown from Max: -42.34%

[Price History (Last 60 Days...
Output: {"action":"SELL","confidence":95,"stop_loss":9103.84,"take_profit":11108.76,"forecast_10d":[9170.54, 8830.75, 9174.91, 8277.01, 6955.27, 7754.00, 7621.30, 8265.59, 8736.98, 8621.90]}


In [87]:
# --- 6. Save Enhanced Dataset to Hugging Face Hub ---
if not df_finetuning.empty:
    print("Saving enhanced dataset to Hugging Face Hub...")
    
    # Define repository name
    repo_id = "tahamajs/bitcoin-prediction-dataset-with-local-news-summaries"
    
    try:
        # Push to Hugging Face Hub
        hf_finetuning_dataset.push_to_hub(
            repo_id,
            commit_message="Bitcoin prediction dataset with local news summaries and technical indicators (no external dependencies)"
        )
        
        print(f"✅ Dataset successfully uploaded!")
        print(f"🔗 View at: https://huggingface.co/datasets/{repo_id}")
        print(f"📊 Total samples: {len(hf_finetuning_dataset)}")
        
    except Exception as e:
        print(f"❌ Error uploading to Hugging Face: {e}")
        print("💡 Make sure you're logged in with a valid token.")
        
else:
    print("❌ No data to save - dataset is empty")

Saving enhanced dataset to Hugging Face Hub...


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   1%|          | 16.7kB / 1.86MB            

✅ Dataset successfully uploaded!
🔗 View at: https://huggingface.co/datasets/tahamajs/bitcoin-prediction-dataset-with-local-news-summaries
📊 Total samples: 2303


In [88]:
# --- Print One Complete Sample ---
print("="*80)
print("COMPLETE SAMPLE FROM THE BITCOIN PREDICTION DATASET")
print("="*80)

if not df_finetuning.empty:
    # Get the first sample
    sample = df_finetuning.iloc[0]
    
    print("\n🎯 INSTRUCTION:")
    print("-" * 50)
    print(sample['instruction'])
    
    print("\n📊 INPUT:")
    print("-" * 50) 
    print(sample['input'])
    
    print("\n🎲 OUTPUT:")
    print("-" * 50)
    print(sample['output'])
    
    print("\n" + "="*80)
    print(f"This is sample 1 of {len(df_finetuning)} total samples in the dataset")
    print("="*80)
else:
    print("❌ No data available in df_finetuning")

COMPLETE SAMPLE FROM THE BITCOIN PREDICTION DATASET

🎯 INSTRUCTION:
--------------------------------------------------
You are an expert quantitative crypto analyst. Your tasks:
1) Analyze the context and decide an actionable stance for BTC-USD: BUY, SELL, or HOLD.
2) Forecast the NEXT 10 daily CLOSING prices (USD).

CONTEXT DATE: 2018-01-31

ANALYSIS FRAMEWORK:
• Technical Analysis: Use price trends, volatility, and momentum indicators
• Macro Analysis: Consider gold/oil prices for broader market context
• News Analysis: Integrate comprehensive daily news summaries for market catalysts

OUTPUT FORMAT (JSON ONLY):
Return a single JSON object with EXACTLY these keys:
{"action":"BUY|SELL|HOLD","confidence":<int 1-99>,"stop_loss":<price 2dp>,"take_profit":<price 2dp>,"forecast_10d":[<10 prices 2dp>]}
No extra text, no explanations, just the JSON.

📊 INPUT:
--------------------------------------------------
Daily Context — 2018-01-31

[Technical Price Analysis]
- Current Price: $10,106.30


In [89]:
# --- Print Condensed Sample with Key Sections ---
print("="*80)
print("CONDENSED SAMPLE FROM BITCOIN PREDICTION DATASET")
print("="*80)

if not df_finetuning.empty:
    sample = df_finetuning.iloc[0]
    
    print("\n🎯 INSTRUCTION (First 300 characters):")
    print("-" * 50)
    print(sample['instruction'] + "...")
    
    print(f"\n📊 INPUT SECTIONS:")
    print("-" * 50)
    input_lines = sample['input'].split('\n')
    
    # Show key sections
    current_section = ""
    for line in input_lines:
        if line.startswith('[') and line.endswith(']'):
            current_section = line
            print(f"\n{current_section}")
            continue
        elif line.strip() and current_section:
            # Show first few lines of each section
            if current_section == "[Technical Price Analysis]":
                if line.startswith('- Current Price:') or line.startswith('- 60-Day Range:'):
                    print(f"  {line}")
            elif current_section == "[Price History (Last 60 Days USD)]":
                if line.startswith('['):
                    prices = line[1:-1].split(', ')
                    print(f"  [First 5 prices: {', '.join(prices[:5])}, ... Last 5: {', '.join(prices[-5:])}]")
            elif current_section == "[On-Chain & Market Fundamentals]":
                if line.startswith('- Market Cap:') or line.startswith('- Hash Rate:'):
                    print(f"  {line}")
            elif current_section == "[News Summary]":
                print(f"  {line[:100]}{'...' if len(line) > 100 else ''}")
                break
    
    print(f"\n🎲 OUTPUT:")
    print("-" * 50)
    print(sample['output'])
    
    print(f"\n📈 DATASET STATS:")
    print("-" * 50)
    print(f"• Total samples: {len(df_finetuning)}")
    print(f"• Avg instruction length: {df_finetuning['instruction'].str.len().mean():.0f} chars")
    print(f"• Avg input length: {df_finetuning['input'].str.len().mean():.0f} chars") 
    print(f"• Avg output length: {df_finetuning['output'].str.len().mean():.0f} chars")
    
    print("\n" + "="*80)
else:
    print("❌ No data available in df_finetuning")

CONDENSED SAMPLE FROM BITCOIN PREDICTION DATASET

🎯 INSTRUCTION (First 300 characters):
--------------------------------------------------
You are an expert quantitative crypto analyst. Your tasks:
1) Analyze the context and decide an actionable stance for BTC-USD: BUY, SELL, or HOLD.
2) Forecast the NEXT 10 daily CLOSING prices (USD).

CONTEXT DATE: 2018-01-31

ANALYSIS FRAMEWORK:
• Technical Analysis: Use price trends, volatility, and momentum indicators
• Macro Analysis: Consider gold/oil prices for broader market context
• News Analysis: Integrate comprehensive daily news summaries for market catalysts

OUTPUT FORMAT (JSON ONLY):
Return a single JSON object with EXACTLY these keys:
{"action":"BUY|SELL|HOLD","confidence":<int 1-99>,"stop_loss":<price 2dp>,"take_profit":<price 2dp>,"forecast_10d":[<10 prices 2dp>]}
No extra text, no explanations, just the JSON....

📊 INPUT SECTIONS:
--------------------------------------------------

[Technical Price Analysis]
  - Current Price: $10,

In [90]:
# --- Analyze News Summary Coverage ---
print("="*80)
print("NEWS SUMMARY COVERAGE ANALYSIS")
print("="*80)

print("\n📰 SUMMARY STATISTICS:")
print("-" * 50)

# Check original summaries loaded
print(f"• Total news summary files found: {len(all_summary_files)}")
print(f"• Total summaries loaded into df_summaries: {len(df_summaries)}")
print(f"• Summary date range: {df_summaries.index.min()} to {df_summaries.index.max()}")

# Check final dataset coverage
print(f"\n• Total samples in final dataset: {len(final_dataset)}")
print(f"• Final dataset date range: {final_dataset['date'].min()} to {final_dataset['date'].max()}")

# Check how many samples have news summaries vs missing ones
samples_with_news = final_dataset['news_summary'].notna().sum()
samples_missing_news = final_dataset['news_summary'].isna().sum()
samples_with_default_text = (final_dataset['news_summary'] == 'No news summary available.').sum()

print(f"\n📊 NEWS COVERAGE IN FINAL DATASET:")
print("-" * 50)
print(f"• Samples with actual news summaries: {samples_with_news - samples_with_default_text}")
print(f"• Samples with default 'No news summary available': {samples_with_default_text}")
print(f"• Samples with completely missing news: {samples_missing_news}")
print(f"• News coverage percentage: {((samples_with_news - samples_with_default_text) / len(final_dataset) * 100):.1f}%")

# Show some sample dates that have news vs don't have news
print(f"\n📅 SAMPLE DATES WITH NEWS:")
print("-" * 50)
dates_with_news = final_dataset[
    (final_dataset['news_summary'].notna()) & 
    (final_dataset['news_summary'] != 'No news summary available.')
]['date'].head(5)
for date in dates_with_news:
    print(f"  ✅ {date}")

print(f"\n📅 SAMPLE DATES WITHOUT NEWS:")
print("-" * 50)
dates_without_news = final_dataset[
    (final_dataset['news_summary'].isna()) | 
    (final_dataset['news_summary'] == 'No news summary available.')
]['date'].head(5)
for date in dates_without_news:
    print(f"  ❌ {date}")

# Check if there's a pattern in missing dates
print(f"\n🔍 MISSING NEWS PATTERN ANALYSIS:")
print("-" * 50)
missing_news_df = final_dataset[
    (final_dataset['news_summary'].isna()) | 
    (final_dataset['news_summary'] == 'No news summary available.')
]

if not missing_news_df.empty:
    print(f"• Earliest missing date: {missing_news_df['date'].min()}")
    print(f"• Latest missing date: {missing_news_df['date'].max()}")
    
    # Check if missing dates are mostly early dates (before news collection started)
    missing_dates = pd.to_datetime(missing_news_df['date'])
    early_missing = (missing_dates < pd.to_datetime('2023-01-01')).sum()
    recent_missing = (missing_dates >= pd.to_datetime('2023-01-01')).sum()
    
    print(f"• Missing dates before 2023: {early_missing}")
    print(f"• Missing dates from 2023 onwards: {recent_missing}")
else:
    print("• No missing news summaries found!")

print("\n" + "="*80)

NEWS SUMMARY COVERAGE ANALYSIS

📰 SUMMARY STATISTICS:
--------------------------------------------------
• Total news summary files found: 2437
• Total summaries loaded into df_summaries: 2437
• Summary date range: 2018-01-01 00:00:00+00:00 to 2024-12-31 00:00:00+00:00

• Total samples in final dataset: 2303
• Final dataset date range: 2018-01-31 to 2024-05-21

📊 NEWS COVERAGE IN FINAL DATASET:
--------------------------------------------------
• Samples with actual news summaries: 2193
• Samples with default 'No news summary available': 110
• Samples with completely missing news: 0
• News coverage percentage: 95.2%

📅 SAMPLE DATES WITH NEWS:
--------------------------------------------------
  ✅ 2018-01-31
  ✅ 2018-02-01
  ✅ 2018-02-02
  ✅ 2018-02-03
  ✅ 2018-02-04

📅 SAMPLE DATES WITHOUT NEWS:
--------------------------------------------------
  ❌ 2018-02-12
  ❌ 2018-02-28
  ❌ 2018-03-01
  ❌ 2018-03-10
  ❌ 2018-03-11

🔍 MISSING NEWS PATTERN ANALYSIS:
---------------------------------

In [91]:
# --- Analyze News Summary Completeness Per Date ---
print("="*80)
print("PER-DATE NEWS SUMMARY COMPLETENESS ANALYSIS")
print("="*80)

print("\n🔍 CHECKING INDIVIDUAL DATE COVERAGE:")
print("-" * 50)

# Check what news summaries are available vs what's in the final dataset
available_news_dates = set(df_summaries.index.date)
final_dataset_dates = set(pd.to_datetime(final_dataset['date']).dt.date)

print(f"• Dates with available news summaries: {len(available_news_dates)}")
print(f"• Dates in final dataset: {len(final_dataset_dates)}")

# Find overlap and missing dates
overlap_dates = available_news_dates.intersection(final_dataset_dates)
news_available_but_not_used = available_news_dates - final_dataset_dates
news_missing_for_dataset_dates = final_dataset_dates - available_news_dates

print(f"• Dates with both news and price data: {len(overlap_dates)}")
print(f"• Dates with news but not in final dataset: {len(news_available_but_not_used)}")
print(f"• Dates in dataset but missing news: {len(news_missing_for_dataset_dates)}")

# Detailed analysis for each date in final dataset
print(f"\n📊 DETAILED PER-DATE ANALYSIS:")
print("-" * 50)

dates_with_complete_news = 0
dates_with_partial_news = 0
dates_with_no_news = 0

# Sample some dates to check individual completeness
sample_dates = final_dataset.head(10)  # Check first 10 dates as examples

for idx, row in sample_dates.iterrows():
    date_obj = pd.to_datetime(row['date']).date()
    news_summary = row['news_summary']
    
    # Check if this date has news available in our summaries
    date_str = date_obj.strftime('%Y-%m-%d')
    has_news_file = date_obj in available_news_dates
    
    if pd.isna(news_summary) or news_summary == 'No news summary available.':
        status = "❌ NO NEWS"
        dates_with_no_news += 1
    elif has_news_file and len(news_summary) > 50:  # Assuming substantial content
        status = "✅ COMPLETE"
        dates_with_complete_news += 1
    else:
        status = "⚠️  PARTIAL"
        dates_with_partial_news += 1
    
    print(f"  {date_obj} | {status} | News file exists: {has_news_file} | Summary length: {len(str(news_summary)) if news_summary else 0}")

# Check if news summaries contain all available information for each date
print(f"\n🔬 SAMPLE NEWS CONTENT ANALYSIS:")
print("-" * 50)

# Pick a specific date and check what's in the original file vs what's in the dataset
if not df_summaries.empty and not final_dataset.empty:
    # Get a date that exists in both
    common_date = list(overlap_dates)[0] if overlap_dates else None
    
    if common_date:
        # Get original summary from file
        original_summary = df_summaries.loc[pd.to_datetime(common_date, utc=True), 'summary']
        
        # Get summary from final dataset
        dataset_row = final_dataset[final_dataset['date'] == common_date]
        if not dataset_row.empty:
            dataset_summary = dataset_row.iloc[0]['news_summary']
            
            print(f"Sample Date: {common_date}")
            print(f"Original summary length: {len(str(original_summary))}")
            print(f"Dataset summary length: {len(str(dataset_summary))}")
            print(f"Summaries match: {str(original_summary) == str(dataset_summary)}")
            
            if len(str(original_summary)) > 100:
                print(f"\nOriginal preview: {str(original_summary)[:200]}...")
                print(f"Dataset preview:  {str(dataset_summary)[:200]}...")

# Summary statistics
total_checked = len(sample_dates)
print(f"\n📈 SUMMARY (Sample of {total_checked} dates):")
print("-" * 50)
print(f"• Dates with complete news: {dates_with_complete_news}")
print(f"• Dates with partial news: {dates_with_partial_news}")
print(f"• Dates with no news: {dates_with_no_news}")

if total_checked > 0:
    print(f"• Complete coverage rate: {(dates_with_complete_news/total_checked)*100:.1f}%")

print("\n" + "="*80)

PER-DATE NEWS SUMMARY COMPLETENESS ANALYSIS

🔍 CHECKING INDIVIDUAL DATE COVERAGE:
--------------------------------------------------
• Dates with available news summaries: 2437
• Dates in final dataset: 2303
• Dates with both news and price data: 2193
• Dates with news but not in final dataset: 244
• Dates in dataset but missing news: 110

📊 DETAILED PER-DATE ANALYSIS:
--------------------------------------------------
  2018-01-31 | ✅ COMPLETE | News file exists: True | Summary length: 626
  2018-02-01 | ✅ COMPLETE | News file exists: True | Summary length: 554
  2018-02-02 | ✅ COMPLETE | News file exists: True | Summary length: 758
  2018-02-03 | ✅ COMPLETE | News file exists: True | Summary length: 506
  2018-02-04 | ✅ COMPLETE | News file exists: True | Summary length: 458
  2018-02-05 | ✅ COMPLETE | News file exists: True | Summary length: 537
  2018-02-06 | ✅ COMPLETE | News file exists: True | Summary length: 495
  2018-02-07 | ✅ COMPLETE | News file exists: True | Summary lengt

In [92]:
# --- Show Enhanced Dataset Sample with Comprehensive News ---
print("="*80)
print("ENHANCED DATASET WITH COMPREHENSIVE NEWS INFORMATION")
print("="*80)

if not df_finetuning.empty:
    sample = df_finetuning.iloc[0]
    
    print("\n🎯 ENHANCED INPUT PREVIEW:")
    print("-" * 50)
    
    # Extract just the news section to see the enhancement
    input_text = sample['input']
    lines = input_text.split('\n')
    
    # Find the news section
    in_news_section = False
    news_lines = []
    
    for line in lines:
        if '[Comprehensive News Analysis]' in line:
            in_news_section = True
        elif in_news_section and line.startswith('[') and line.endswith(']') and 'News' not in line:
            break  # End of news section
        
        if in_news_section:
            news_lines.append(line)
    
    # Show the enhanced news section
    for line in news_lines[:15]:  # Show first 15 lines
        print(line)
    
    if len(news_lines) > 15:
        print("... (truncated for display)")
    
    print(f"\n📊 DATASET ENHANCEMENTS:")
    print("-" * 50)
    print(f"• Total samples: {len(df_finetuning)}")
    print(f"• Columns in raw dataset: {len(final_dataset.columns)}")
    
    # Count news-related columns
    news_cols = [col for col in final_dataset.columns if 'news' in col.lower() or 'daily' in col.lower()]
    print(f"• News-related columns: {len(news_cols)}")
    print(f"  - {news_cols}")
    
    # Check data richness
    sample_row = final_dataset.iloc[0]
    has_sentiment = pd.notna(sample_row.get('daily_sentiment'))
    has_events = pd.notna(sample_row.get('daily_key_events')) and str(sample_row.get('daily_key_events')) != '[]'
    has_drivers = pd.notna(sample_row.get('daily_price_drivers')) and str(sample_row.get('daily_price_drivers')) != '[]'
    
    print(f"\n🔍 SAMPLE DATA RICHNESS:")
    print(f"• Has sentiment data: {has_sentiment}")
    print(f"• Has key events: {has_events}")
    print(f"• Has price drivers: {has_drivers}")
    print(f"• Has long-term news: {pd.notna(sample_row.get('news_long_term'))}")
    print(f"• Has short-term news: {pd.notna(sample_row.get('news_short_term'))}")
    
    print("\n" + "="*80)
else:
    print("❌ No data available in df_finetuning")

ENHANCED DATASET WITH COMPREHENSIVE NEWS INFORMATION

🎯 ENHANCED INPUT PREVIEW:
--------------------------------------------------
[Comprehensive News Analysis]
Summary: The crypto market on January 31, 2018, was characterized by continued volatility and regulatory concerns. Facebook's announcement to ban crypto ads was a significant bearish signal, restricting marketing channels. While South Korea's finance minister clarified no ban was planned, the uncovering of illegal trades signaled ongoing regulatory scrutiny. Bitcoin remained below $10,000, indicating persistent bearish momentum, further pressured by news of the Bitfinex subpoena by the CFTC and the Coincheck hack aftermath. Despite some short-term price recoveries, the overall sentiment remained cautious due to these headwinds.

Sentiment: neutral
Market Impact: unknown

Key Events: No key events

Price Drivers: No specific drivers

Risk Factors: No major risks

Opportunities: No specific opportunities

Short-term News: [{'pick

  has_events = pd.notna(sample_row.get('daily_key_events')) and str(sample_row.get('daily_key_events')) != '[]'
  has_drivers = pd.notna(sample_row.get('daily_price_drivers')) and str(sample_row.get('daily_price_drivers')) != '[]'


In [93]:
# --- Check Current News Section Structure ---
if not df_finetuning.empty:
    sample = df_finetuning.iloc[0]
    input_text = sample['input']
    
    # Find and print the news section
    lines = input_text.split('\n')
    in_news_section = False
    news_section_lines = []
    
    for line in lines:
        if '[Comprehensive News Analysis]' in line:
            in_news_section = True
        elif in_news_section and line.startswith('[') and line.endswith(']') and 'News' not in line:
            break
        
        if in_news_section:
            news_section_lines.append(line)
    
    print("Current News Section Structure:")
    print("="*50)
    for line in news_section_lines:
        print(repr(line))  # Use repr to see exact formatting

Current News Section Structure:
'[Comprehensive News Analysis]'
"Summary: The crypto market on January 31, 2018, was characterized by continued volatility and regulatory concerns. Facebook's announcement to ban crypto ads was a significant bearish signal, restricting marketing channels. While South Korea's finance minister clarified no ban was planned, the uncovering of illegal trades signaled ongoing regulatory scrutiny. Bitcoin remained below $10,000, indicating persistent bearish momentum, further pressured by news of the Bitfinex subpoena by the CFTC and the Coincheck hack aftermath. Despite some short-term price recoveries, the overall sentiment remained cautious due to these headwinds."
''
'Sentiment: neutral'
'Market Impact: unknown'
''
'Key Events: No key events'
''
'Price Drivers: No specific drivers'
''
'Risk Factors: No major risks'
''
'Opportunities: No specific opportunities'
''
"Short-term News: [{'pick_idx': 5, 'id': 'nc285bbcb1a6b', 'title': 'Bitcoin and Ethereum Price 

In [108]:
# --- 5. Create ENHANCED Instruction-Formatted Dataset with ALL News Info ---
print("Creating ENHANCED instruction-formatted dataset with comprehensive news...")

# Helper functions (same as before)
def _fmt_usd(x):
    if x is None or (isinstance(x, float) and pd.isna(x)):
        return "N/A"
    try:
        return f"${float(x):,.2f}"
    except Exception:
        return "N/A"

def _fmt_float(x, nd=2):
    if x is None or (isinstance(x, float) and pd.isna(x)):
        return "N/A"
    try:
        return f"{float(x):.{nd}f}"
    except Exception:
        return "N/A"

def _derive_price_features(prices):
    """Calculate technical indicators from price history"""
    if not isinstance(prices, (list, tuple)) or len(prices) == 0:
        return dict(
            last_close=np.nan, min60=np.nan, max60=np.nan,
            ret_1d_pct=np.nan, ret_7d_pct=np.nan, ret_30d_pct=np.nan,
            std14_pct=np.nan, avg_abs_change14=np.nan, drawdown_from_max_pct=np.nan
        )
    
    arr = np.array(prices, dtype=float)
    last_close = arr[-1]
    min60 = float(np.min(arr))
    max60 = float(np.max(arr))

    def _pct(a, b):
        try:
            return (float(a) / float(b) - 1.0) * 100.0
        except Exception:
            return np.nan

    ret_1d_pct  = _pct(arr[-1], arr[-2])  if arr.size >= 2  else np.nan
    ret_7d_pct  = _pct(arr[-1], arr[-8])  if arr.size >= 8  else np.nan
    ret_30d_pct = _pct(arr[-1], arr[-31]) if arr.size >= 31 else np.nan

    if arr.size >= 15:
        rets = np.diff(arr[-15:]) / arr[-15:-1] * 100.0
        std14_pct = float(np.std(rets, ddof=1))
        avg_abs_change14 = float(np.mean(np.abs(np.diff(arr[-15:]))))
    else:
        std14_pct = np.nan
        avg_abs_change14 = np.nan

    drawdown_from_max_pct = _pct(arr[-1], max60)
    return dict(
        last_close=float(last_close), min60=min60, max60=max60,
        ret_1d_pct=float(ret_1d_pct) if not pd.isna(ret_1d_pct) else np.nan,
        ret_7d_pct=float(ret_7d_pct) if not pd.isna(ret_7d_pct) else np.nan,
        ret_30d_pct=float(ret_30d_pct) if not pd.isna(ret_30d_pct) else np.nan,
        std14_pct=float(std14_pct) if not pd.isna(std14_pct) else np.nan,
        avg_abs_change14=float(avg_abs_change14) if not pd.isna(avg_abs_change14) else np.nan,
        drawdown_from_max_pct=float(drawdown_from_max_pct) if not pd.isna(drawdown_from_max_pct) else np.nan
    )

def _label_action(last_close, future_10, std14_pct):
    """Generate trading action based on volatility-aware analysis"""
    if last_close is None or np.isnan(last_close) or future_10 is None or len(future_10) < 10:
        return "HOLD", 50
    day10 = float(future_10[-1])
    ret10_pct = (day10 / float(last_close) - 1.0) * 100.0
    vol_ref = float(std14_pct) if std14_pct is not None and not np.isnan(std14_pct) else 2.0
    thr = max(1.0, 0.35 * vol_ref)
    if ret10_pct >= thr:
        action = "BUY"
    elif ret10_pct <= -thr:
        action = "SELL"
    else:
        action = "HOLD"
    conf = min(99, max(1, int(50 + 15 * abs(ret10_pct) / max(1, thr))))
    return action, conf

def _stop_take(action, last_close, std14_pct):
    """Calculate stop loss and take profit based on volatility"""
    if std14_pct is None or np.isnan(std14_pct):
        std14_pct = 2.0
    vol_ref = float(std14_pct)
    if action == "BUY":
        sl = last_close * (1 - 0.015 - 0.005 * vol_ref)
        tp = last_close * (1 + 0.025 + 0.01 * vol_ref)
    elif action == "SELL":
        sl = last_close * (1 + 0.015 + 0.005 * vol_ref)
        tp = last_close * (1 - 0.025 - 0.01 * vol_ref)
    else:  # HOLD
        sl = last_close * 0.95
        tp = last_close * 1.05
    return float(sl), float(tp)

# Create enhanced dataset with complete news information
enhanced_finetuning_data = []
rows_seen = 0
rows_kept = 0
rows_skipped = 0

for idx, row in final_dataset.iterrows():
    rows_seen += 1
    
    # Calculate price features
    hist_prices = eval(row['btc_price_history_60d']) if isinstance(row['btc_price_history_60d'], str) else row['btc_price_history_60d']
    future_prices = eval(row['btc_price_target_10d']) if isinstance(row['btc_price_target_10d'], str) else row['btc_price_target_10d']
    
    feats = _derive_price_features(hist_prices)
    current_price = _fmt_usd(feats['last_close'])
    
    # Generate trading signals
    action, confidence = _label_action(feats['last_close'], future_prices, feats['std14_pct'])
    sl, tp = _stop_take(action, feats['last_close'], feats['std14_pct'])
    
    # Format price history
    price_history_str = ", ".join([f"{p:.2f}" for p in hist_prices[-10:]])  # Show last 10 days
    
    # Format commodity data
    gold_price = _fmt_usd(row.get('gold_price', np.nan))
    oil_price = _fmt_usd(row.get('oil_price', np.nan))
    
    # Enhanced Instruction
    instruction = (
        f"CONTEXT DATE: {row['date']}\n\n"
        "ANALYSIS FRAMEWORK:\n"
        "• Technical Analysis: Use price trends, volatility, and momentum indicators\n"
        "• Macro Analysis: Consider gold/oil prices for broader market context\n"
        "• News Analysis: Integrate comprehensive daily news summaries for market catalysts\n\n"
        "OUTPUT FORMAT (JSON ONLY):\n"
        "Return a single JSON object with EXACTLY these keys:\n"
        "{\"action\":\"BUY|SELL|HOLD\",\"confidence\":<int 1-99>,"
        "\"stop_loss\":<price 2dp>,\"take_profit\":<price 2dp>,"
        "\"forecast_10d\":[<10 prices 2dp>]}\n"
        "No extra text, no explanations, just the JSON."
    )

    # Check if there's meaningful news data
    has_real_news = (
        (row.get('news_summary') and 
         row.get('news_summary') not in ['No news summary available.', '']) or
        (row.get('daily_sentiment') and 
         row.get('daily_sentiment') not in ['neutral', 'No sentiment data available.', '']) or
        (row.get('daily_market_impact') and 
         row.get('daily_market_impact') not in ['unknown', 'No market impact assessment available.', '']) or
        (row.get('daily_key_events') and 
         len(row.get('daily_key_events', [])) > 0) or
        (row.get('daily_price_drivers') and 
         len(row.get('daily_price_drivers', [])) > 0) or
        (row.get('daily_risk_factors') and 
         len(row.get('daily_risk_factors', [])) > 0) or
        (row.get('daily_opportunities') and 
         len(row.get('daily_opportunities', [])) > 0) or
        (row.get('news_short_term') and 
         row.get('news_short_term') not in ['No short-term news available.', '']) or
        (row.get('news_long_term') and 
         row.get('news_long_term') not in ['No long-term news available.', ''])
    )
    
    # Build news section only if there's real data
    news_section = ""
    if has_real_news:
        news_section = f"""

[Comprehensive News & Market Analysis]

Daily News Summary:
{row.get('news_summary', 'No news summary available.')}

Market Sentiment: {row.get('daily_sentiment', 'No sentiment data available.')}
Market Impact: {row.get('daily_market_impact', 'No market impact assessment available.')}

Short-term News Analysis:
{str(row.get('news_short_term', 'No short-term news available.'))}

Long-term News Analysis:
{str(row.get('news_long_term', 'No long-term news available.'))}
"""

    # ENHANCED INPUT with conditional comprehensive news
    input_text = f"""
Daily Context — {row['date']}

[Technical Price Analysis]
- Current Price: {_fmt_usd(feats['last_close'])}
- 60-Day Range: {_fmt_usd(feats['min60'])} → {_fmt_usd(feats['max60'])}
- 1D Return: {_fmt_float(feats['ret_1d_pct'])}%
- 7D Return: {_fmt_float(feats['ret_7d_pct'])}%
- 30D Return: {_fmt_float(feats['ret_30d_pct'])}%
- Volatility (14d): {_fmt_float(feats['std14_pct'])}%
- Avg Daily Change (14d): {_fmt_float(feats['avg_abs_change14'])}
- Drawdown from Max: {_fmt_float(feats['drawdown_from_max_pct'])}%

[Price History (Last 60 Days USD)]
[{price_history_str}]

[Macro & Commodities Context]
- Gold Price: {gold_price}
- Crude Oil Price: {oil_price}

[Market Context]
- Bitcoin dominates crypto market as leading digital asset
- Price influenced by adoption, regulation, and macro factors{news_section}

Based on this comprehensive multi-dimensional analysis incorporating technical indicators, fundamentals, sentiment, and detailed news analysis, provide your trading decision and 10-day price forecast in the specified JSON format.
""".strip()

    # Enhanced Output
    forecast_str = ", ".join([f"{p:.2f}" for p in row['btc_price_target_10d'][:10]])
    output_text = (
        '{'
        f'"action":"{action}",'
        f'"confidence":{confidence},'
        f'"stop_loss":{sl:.2f},'
        f'"take_profit":{tp:.2f},'
        f'"forecast_10d":[{forecast_str}]'
        '}'
    )

    enhanced_finetuning_data.append({
        'instruction': instruction.strip(),
        'input': input_text,
        'output': output_text
    })
    rows_kept += 1

# Convert to Hugging Face Dataset
df_enhanced_finetuning = pd.DataFrame(enhanced_finetuning_data)
hf_enhanced_dataset = Dataset.from_pandas(df_enhanced_finetuning)

print(f"\n✅ ENHANCED dataset transformation complete!")
print(f"   Rows processed: {rows_seen}")
print(f"   Rows kept: {rows_kept}")
print(f"   Rows skipped: {rows_skipped}")
print(f"   Final enhanced dataset size: {len(hf_enhanced_dataset)} samples")

if not df_enhanced_finetuning.empty:
    print("\nEnhanced sample preview:")
    sample_input = df_enhanced_finetuning.iloc[0]['input']
    
    # Show just the news section
    lines = sample_input.split('\n')
    news_start = False
    news_lines = []
    
    for line in lines:
        if '[Comprehensive News & Market Analysis]' in line:
            news_start = True
        elif news_start and line.startswith('[') and ']' in line:
            break
        if news_start:
            news_lines.append(line)
    
    if news_lines:
        print("Enhanced News Section:")
        for line in news_lines[:15]:  # Show first 15 lines
            print(f"  {line}")
        if len(news_lines) > 15:
            print(f"  ... ({len(news_lines)-15} more lines)")
    else:
        print("No news section (only technical analysis)")
        
    print(f"\nSample input length: {len(sample_input):,} chars")
    print(f"Sample output: {df_enhanced_finetuning.iloc[0]['output']}")

Creating ENHANCED instruction-formatted dataset with comprehensive news...

✅ ENHANCED dataset transformation complete!
   Rows processed: 2303
   Rows kept: 2303
   Rows skipped: 0
   Final enhanced dataset size: 2303 samples

Enhanced sample preview:
Enhanced News Section:
  [Comprehensive News & Market Analysis]
  
  Daily News Summary:
  The crypto market on January 31, 2018, was characterized by continued volatility and regulatory concerns. Facebook's announcement to ban crypto ads was a significant bearish signal, restricting marketing channels. While South Korea's finance minister clarified no ban was planned, the uncovering of illegal trades signaled ongoing regulatory scrutiny. Bitcoin remained below $10,000, indicating persistent bearish momentum, further pressured by news of the Bitfinex subpoena by the CFTC and the Coincheck hack aftermath. Despite some short-term price recoveries, the overall sentiment remained cautious due to these headwinds.
  
  Market Sentiment: neutra

In [109]:
# --- DEBUG: Check news data patterns ---
print("🔍 Examining news data patterns...")

# Sample a few rows to see various news patterns
sample_indices = [0, 100, 500, 1000, 1500, 2000, 2300]
news_patterns = []

for i in sample_indices:
    if i < len(final_dataset):
        row = final_dataset.iloc[i]
        
        # Check each news field
        news_summary = row.get('news_summary', '')
        daily_sentiment = row.get('daily_sentiment', '')
        daily_market_impact = row.get('daily_market_impact', '')
        daily_key_events = row.get('daily_key_events', [])
        daily_price_drivers = row.get('daily_price_drivers', [])
        
        # Determine if this has real news
        has_real_news = (
            (news_summary and news_summary not in ['No news summary available.', '']) and
            (daily_sentiment and daily_sentiment not in ['neutral', 'No sentiment data available.', '']) and
            (daily_market_impact and daily_market_impact not in ['unknown', 'No market impact assessment available.', '']) and
            (daily_key_events and len(daily_key_events) > 0) and
            (daily_price_drivers and len(daily_price_drivers) > 0)
        )
        
        pattern = {
            'date': row['date'],
            'has_real_news': has_real_news,
            'summary_length': len(str(news_summary)),
            'sentiment': daily_sentiment,
            'impact': daily_market_impact,
            'events_count': len(daily_key_events) if daily_key_events else 0,
            'drivers_count': len(daily_price_drivers) if daily_price_drivers else 0
        }
        news_patterns.append(pattern)

print("\n📊 NEWS DATA ANALYSIS:")
for pattern in news_patterns:
    print(f"Date: {pattern['date']} | Real News: {pattern['has_real_news']} | "
          f"Summary: {pattern['summary_length']} chars | "
          f"Sentiment: {pattern['sentiment']} | "
          f"Impact: {pattern['impact']} | "
          f"Events: {pattern['events_count']} | "
          f"Drivers: {pattern['drivers_count']}")

# Count overall patterns
total_with_real_news = sum(1 for p in news_patterns if p['has_real_news'])
print(f"\n📈 SUMMARY: {total_with_real_news}/{len(news_patterns)} samples have COMPLETE real news data")
print(f"📉 Samples with PLACEHOLDER data: {len(news_patterns) - total_with_real_news}/{len(news_patterns)}")

# Check specific placeholder patterns
placeholder_count = 0
for idx, row in final_dataset.iterrows():
    if (row.get('daily_sentiment') == 'neutral' and
        row.get('daily_market_impact') == 'unknown' and
        not row.get('daily_key_events') and
        not row.get('daily_price_drivers')):
        placeholder_count += 1

print(f"🎭 Rows with CLASSIC placeholder pattern (neutral + unknown + no events/drivers): {placeholder_count}/{len(final_dataset)}")

🔍 Examining news data patterns...

📊 NEWS DATA ANALYSIS:
Date: 2018-01-31 | Real News: False | Summary: 626 chars | Sentiment: neutral | Impact: unknown | Events: 0 | Drivers: 0
Date: 2018-05-11 | Real News: False | Summary: 529 chars | Sentiment: neutral | Impact: unknown | Events: 0 | Drivers: 0
Date: 2019-06-15 | Real News: False | Summary: 633 chars | Sentiment: neutral | Impact: unknown | Events: 0 | Drivers: 0
Date: 2020-10-27 | Real News: False | Summary: 462 chars | Sentiment: neutral | Impact: unknown | Events: 0 | Drivers: 0
Date: 2022-03-11 | Real News: False | Summary: 545 chars | Sentiment: neutral | Impact: unknown | Events: 0 | Drivers: 0
Date: 2023-07-24 | Real News: False | Summary: 606 chars | Sentiment: neutral | Impact: unknown | Events: 0 | Drivers: 0
Date: 2024-05-19 | Real News: False | Summary: 461 chars | Sentiment: neutral | Impact: unknown | Events: 0 | Drivers: 0

📈 SUMMARY: 0/7 samples have COMPLETE real news data
📉 Samples with PLACEHOLDER data: 7/7
🎭 Rows

In [110]:
# --- 6. Save ENHANCED Dataset to Hugging Face Hub ---
if not df_enhanced_finetuning.empty:
    print("Saving ENHANCED dataset with comprehensive news to Hugging Face Hub...")
    
    # Define repository name for enhanced dataset
    repo_id = "tahamajs/bitcoin-enhanced-prediction-dataset-with-local-comprehensive-news"
    
    try:
        # Push to Hugging Face Hub
        hf_enhanced_dataset.push_to_hub(
            repo_id,
            commit_message="Enhanced Bitcoin prediction dataset with comprehensive local news summaries, short-term and long-term analysis, and technical indicators (no external dependencies)"
        )
        
        print(f"✅ Enhanced dataset successfully uploaded!")
        print(f"🔗 View at: https://huggingface.co/datasets/{repo_id}")
        print(f"📊 Total samples: {len(hf_enhanced_dataset)}")
        
        # Compare with previous dataset
        if 'df_finetuning' in locals():
            print(f"\n📈 Enhancement Summary:")
            print(f"   Previous dataset: {len(df_finetuning)} samples")
            print(f"   Enhanced dataset: {len(df_enhanced_finetuning)} samples")
            
            # Check input length improvement
            prev_avg_len = df_finetuning['input'].str.len().mean()
            enhanced_avg_len = df_enhanced_finetuning['input'].str.len().mean()
            improvement = ((enhanced_avg_len - prev_avg_len) / prev_avg_len) * 100
            
            print(f"   Avg input length improvement: {improvement:.1f}%")
            print(f"   Previous avg: {prev_avg_len:.0f} chars")
            print(f"   Enhanced avg: {enhanced_avg_len:.0f} chars")
        
    except Exception as e:
        print(f"❌ Error uploading enhanced dataset to Hugging Face: {e}")
        print("💡 Make sure you're logged in with a valid token.")
        
else:
    print("❌ No enhanced data to save - dataset is empty")

Saving ENHANCED dataset with comprehensive news to Hugging Face Hub...


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   2%|2         |  526kB / 22.8MB            

✅ Enhanced dataset successfully uploaded!
🔗 View at: https://huggingface.co/datasets/tahamajs/bitcoin-enhanced-prediction-dataset-with-local-comprehensive-news
📊 Total samples: 2303

📈 Enhancement Summary:
   Previous dataset: 2303 samples
   Enhanced dataset: 2303 samples
   Avg input length improvement: 814.3%
   Previous avg: 2507 chars
   Enhanced avg: 22925 chars
   Avg input length improvement: 814.3%
   Previous avg: 2507 chars
   Enhanced avg: 22925 chars
