# Pre-compute ML Results for Quick View

This notebook pre-computes ML model-comparison results (XGBoost vs. LSTM) for every symbol in `SYMBOLS` at each frequency in `FREQUENCIES`.

Run this overnight to populate the cache for instant Quick View demos.

In [None]:
import sys
from pathlib import Path

# Locate the repository root. The notebook may be launched either from the
# repo root itself or from its notebooks/ subdirectory.
repo_root = Path.cwd()
if repo_root.name == 'notebooks':
    repo_root = repo_root.parent

# Put src/ and apps/ at the front of the import path (apps/ ends up first,
# src/ second). Any existing occurrence is removed before re-inserting, so
# re-running this cell keeps the same precedence without piling up duplicate
# sys.path entries.
for _subdir in ('src', 'apps'):
    _entry = str(repo_root / _subdir)
    while _entry in sys.path:
        sys.path.remove(_entry)
    sys.path.insert(0, _entry)

print(f"Repo root: {repo_root}")
print(f"Python path: {sys.path[:3]}")

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
from tqdm import tqdm

from data.commodities import load_commodities_data, COMMODITIES_CONFIG
from data.ml_features import create_ml_features_with_transparency
from models.commodity_direction import compare_models
from utils.ml_cache import save_ml_results

# Reaching this line means every import in this cell resolved.
print("✅ Imports successful")

## Configuration

In [None]:
# Symbols to pre-compute
SYMBOLS = ["GLD", "SLV", "GDX"]  # Add more as needed

# Frequencies to test
FREQUENCIES = ["Daily", "Weekly", "Monthly"]

# Model configurations (optimized for fast training).
# Sizes are counted in rows of the respective frequency (trading days,
# weeks, or months), as the per-line comments indicate.
CONFIGS = {
    "Daily": {
        "train_size": 252,  # 1 year
        "test_size": 5,     # 1 week
        "seq_len": 60,      # 3 months
        "max_splits": 50,   # Limit for speed
    },
    "Weekly": {
        "train_size": 52,   # 1 year
        "test_size": 4,     # 1 month
        "seq_len": 20,      # 5 months
        "max_splits": 50,
    },
    "Monthly": {
        "train_size": 36,   # 3 years
        "test_size": 3,     # 3 months
        "seq_len": 12,      # 1 year
        "max_splits": 50,
    },
}

# Fail fast on a frequency that has no settings: otherwise the problem only
# shows up as a per-run KeyError in the middle of the long batch.
_missing_configs = [name for name in FREQUENCIES if name not in CONFIGS]
if _missing_configs:
    raise ValueError(f"No entry in CONFIGS for frequencies: {_missing_configs}")

_total_runs = len(SYMBOLS) * len(FREQUENCIES)
print(f"Will pre-compute: {len(SYMBOLS)} symbols × {len(FREQUENCIES)} frequencies = {_total_runs} total")

## Load Data

In [None]:
# Fetch the full commodities table once; the pre-compute loop further down
# selects one column of it per symbol.
df = load_commodities_data()

first_date, last_date = df.index.min(), df.index.max()
column_names = df.columns.tolist()

print(f"Loaded data: {df.shape}")
print(f"Date range: {first_date} to {last_date}")
print(f"Columns: {column_names}")

## Pre-compute Results

This will take a while! Estimated:
- Daily: ~30 sec per symbol
- Weekly: ~20 sec per symbol  
- Monthly: ~15 sec per symbol

**Total: ~5-10 minutes for 3 symbols × 3 frequencies**

In [None]:
def _resample_prices(series, freq):
    """Return `series` resampled for the given frequency label.

    "Weekly" keeps the last value of each week ending on Friday, "Monthly"
    the last value of each month-end bin; any other label returns the
    series unchanged.
    """
    if freq == "Weekly":
        return series.resample('W-FRI').last().dropna()
    if freq == "Monthly":
        # Newer pandas spells month-end as 'ME' and deprecates the 'M' alias;
        # older releases reject 'ME', so fall back to the legacy alias there.
        try:
            monthly = series.resample('ME')
        except ValueError:
            monthly = series.resample('M')
        return monthly.last().dropna()
    return series


# One dict per attempted (symbol, frequency) run; always carries 'symbol',
# 'freq' and 'status', plus status-specific details.
results_log = []

for symbol in SYMBOLS:
    if symbol not in df.columns:
        print(f"⚠️ Skipping {symbol} (not in data)")
        continue

    # The display name is cosmetic: fall back to the ticker so that a symbol
    # missing from COMMODITIES_CONFIG cannot abort the whole batch.
    try:
        display_name = COMMODITIES_CONFIG[symbol]['name']
    except (KeyError, TypeError):
        display_name = symbol

    print(f"\n{'='*60}")
    print(f"Processing {symbol} ({display_name})")
    print(f"{'='*60}")

    price_series = df[symbol].dropna()

    for freq in FREQUENCIES:
        print(f"\n📊 {freq} frequency...")

        try:
            # Resample if needed
            price_resampled = _resample_prices(price_series, freq)

            print(f"  Data points: {len(price_resampled)}")

            # Create features
            features_df, metadata = create_ml_features_with_transparency(price_resampled, symbol=symbol)
            print(f"  Features: {features_df.shape[1]}, Rows: {len(features_df)}")

            # Get config for this frequency
            config = CONFIGS[freq]

            # Check data sufficiency: one training window plus one LSTM
            # sequence plus one test window must fit in the feature rows.
            min_required = config['train_size'] + config['seq_len'] + config['test_size']
            if len(features_df) < min_required:
                print(f"  ❌ Insufficient data: {len(features_df)} < {min_required}")
                results_log.append({
                    'symbol': symbol,
                    'freq': freq,
                    'status': 'INSUFFICIENT_DATA',
                    'rows': len(features_df),
                    'required': min_required,
                })
                continue

            # Run comparison
            start_time = datetime.now()

            results = compare_models(
                features_df,
                initial_train_days=config['train_size'],
                test_days=config['test_size'],
                max_splits=config['max_splits'],
                xgb_params={'n_estimators': 100, 'max_depth': 3, 'learning_rate': 0.1},
                lstm_params={
                    'sequence_length': config['seq_len'],
                    'hidden_units': 50,
                    'dropout_rate': 0.2,
                    'epochs': 20,
                },
                verbose=False,
            )

            elapsed = (datetime.now() - start_time).total_seconds()

            # Save to cache. The config is copied so the stored metadata does
            # not alias the shared CONFIGS entry.
            cache_metadata = {
                'symbol': symbol,
                'freq': freq,
                'date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                'config': dict(config),
                'elapsed_seconds': elapsed,
            }

            save_ml_results(symbol, freq, "compare", results, cache_metadata)

            # Log results
            xgb_acc = results['xgboost']['overall_metrics']['accuracy']
            lstm_acc = results['lstm']['overall_metrics']['accuracy']
            winner = results['winner']

            print(f"  ✅ Complete in {elapsed:.0f}s")
            print(f"     XGBoost: {xgb_acc:.1%}, LSTM: {lstm_acc:.1%}, Winner: {winner.upper()}")

            results_log.append({
                'symbol': symbol,
                'freq': freq,
                'status': 'SUCCESS',
                'xgb_accuracy': xgb_acc,
                'lstm_accuracy': lstm_acc,
                'winner': winner,
                'elapsed_seconds': elapsed,
            })

        except Exception as e:
            # Deliberate batch boundary: one failing run must not stop the
            # remaining ones. The exception type is logged because str(e)
            # alone is often uninformative (e.g. for a KeyError).
            print(f"  ❌ Error: {e}")
            results_log.append({
                'symbol': symbol,
                'freq': freq,
                'status': 'ERROR',
                'error': str(e),
                'error_type': type(e).__name__,
            })

print(f"\n{'='*60}")
print("✅ Pre-computation complete!")
print(f"{'='*60}")

## Summary

In [None]:
# Tabulate the run log: one row per entry appended to results_log.
summary_df = pd.DataFrame(results_log)
# Bare expression as the last line of the cell so the notebook renders the table.
summary_df

In [None]:
# Success rate
total_count = len(summary_df)

# An empty run log (e.g. every symbol was skipped) yields a frame without a
# 'status' column, so only count statuses when the column actually exists.
if 'status' in summary_df.columns:
    status_counts = summary_df['status'].value_counts().to_dict()
else:
    status_counts = {}

success_count = status_counts.get('SUCCESS', 0)
# Guard the percentage against division by zero when nothing was run.
success_pct = success_count / total_count * 100 if total_count else 0.0

print(f"\n📊 Summary:")
print(f"  Total: {total_count}")
print(f"  Success: {success_count} ({success_pct:.0f}%)")
print(f"  Errors: {status_counts.get('ERROR', 0)}")
print(f"  Insufficient data: {status_counts.get('INSUFFICIENT_DATA', 0)}")

if success_count > 0:
    # Only successful runs carry a timing, so average over those rows alone.
    succeeded = summary_df['status'] == 'SUCCESS'
    avg_time = summary_df.loc[succeeded, 'elapsed_seconds'].mean()
    print(f"\n⏱️ Average time: {avg_time:.0f} seconds")

In [None]:
# List cached files
from utils.ml_cache import list_cached_results

BYTES_PER_MB = 1024 * 1024

cached = list_cached_results()
print(f"\n💾 Cached results ({len(cached)} files):")
# Loop names are distinct from `symbol` / `freq` so the values left over from
# the pre-compute loop are not overwritten by this listing.
# NOTE(review): each `path` must expose .stat() (presumably a pathlib.Path) —
# confirm against list_cached_results.
for cached_symbol, cached_freq, model_type, path in cached:
    size_mb = path.stat().st_size / BYTES_PER_MB
    print(f"  {cached_symbol:5} | {cached_freq:7} | {model_type:10} | {size_mb:.2f} MB")

## Done!

Results are now cached in `outputs/ml_results/`.

Use **Quick View mode** in Streamlit for instant access!