---
## Step 1: Setup Environment

In [None]:
# Check GPU availability
import subprocess
try:
    gpu_info = subprocess.check_output(['nvidia-smi'], encoding='utf-8')
    print("‚úÖ GPU DETECTED:")
    print(gpu_info)
except:
    print("‚ö†Ô∏è NO GPU DETECTED - Training will be slower")
    print("   Enable GPU: Runtime > Change runtime type > GPU")

In [None]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

print("‚úÖ Google Drive mounted at /content/drive")

In [None]:
# Set working directory (adjust path to your folder)
import os
WORKSPACE_DIR = '/content/drive/MyDrive/quantum-ai-trader_v1.1'

# If folder doesn't exist, clone from GitHub
if not os.path.exists(WORKSPACE_DIR):
    print("üì• Cloning repository...")
    !git clone https://github.com/alexpayne556-collab/quantum-ai-trader_v1.1.git $WORKSPACE_DIR
    print("‚úÖ Repository cloned")
else:
    print("‚úÖ Workspace found")

os.chdir(WORKSPACE_DIR)
print(f"Current directory: {os.getcwd()}")

In [None]:
# Install ML requirements
print("üì¶ Installing ML packages...")
!pip install -q -r requirements_ml.txt
print("‚úÖ Packages installed")

# Verify GPU support
import xgboost as xgb
import lightgbm as lgb
print(f"\n‚úÖ XGBoost version: {xgb.__version__}")
print(f"‚úÖ LightGBM version: {lgb.__version__}")
print(f"‚úÖ XGBoost GPU support: {xgb.build_info()['USE_CUDA']}")

---
## Step 2: Load Training Data

In [None]:
import sys
sys.path.insert(0, WORKSPACE_DIR)

from src.ml.dataset_loader import DatasetLoader
import pandas as pd
import numpy as np

print("üìÇ Loading training dataset...")

# Option 1: Load from existing CSV
loader = DatasetLoader(data_dir='data/training')

# Try to load pre-built dataset
try:
    dataset = loader.load_from_csv('data/training/training_dataset.csv')
    print("‚úÖ Loaded existing dataset")
except FileNotFoundError:
    print("‚ö†Ô∏è No existing dataset found")
    print("üì• Building dataset from scratch...")
    
    # Option 2: Build from scratch (simplified)
    tickers = ['NVDA', 'TSLA', 'AAPL', 'MSFT', 'GOOGL', 'AMZN', 'META', 'NFLX',
               'AMD', 'INTC', 'ORCL', 'CRM', 'ADBE', 'PYPL', 'SQ', 'SHOP']
    
    dataset = loader.download_and_build_dataset(
        tickers=tickers,
        period='2y',
        min_samples_per_ticker=100
    )
    
    # Save for future use
    loader.save_dataset(dataset, 'data/training/training_dataset.csv')

# Validate dataset
is_valid, message = loader.validate_dataset(dataset)
if not is_valid:
    raise ValueError(f"Dataset validation failed: {message}")

print("\n‚úÖ DATASET READY")
print(f"   Samples: {len(dataset['X']):,}")
print(f"   Features: {dataset['X'].shape[1]}")
print(f"   Tickers: {dataset['tickers'].nunique()}")
print(f"   Label distribution: {dict(dataset['y'].value_counts())}")

---
## Step 3: Initialize Trident Trainer

In [None]:
from src.ml.train_trident import TridenTrainer

print("üîß Initializing Trident Trainer...")

trainer = TridenTrainer(
    use_gpu=True,                    # Enable GPU acceleration
    optimize_hyperparams=True,       # Run Optuna optimization
    n_trials=50,                     # 50 trials per model (150 total per cluster)
    cv_folds=5,                      # 5-fold cross-validation
    n_clusters=5,                    # 5 ticker clusters
    random_state=42
)

print("‚úÖ Trainer initialized")
print(f"   GPU enabled: {trainer.use_gpu}")
print(f"   Optimization trials: {trainer.n_trials} per model")
print(f"   CV folds: {trainer.cv_folds}")
print(f"   Clusters: {trainer.n_clusters}")

---
## Step 4: Train Trident Ensemble

‚è±Ô∏è **Expected time:** 2.5-5 hours on Colab Pro GPU

**What happens:**
1. Cluster tickers into 5 groups (K-Means)
2. For each cluster, train 3 models:
   - XGBoost (pure tabular)
   - LightGBM (speed + microstructure)
   - CatBoost (categorical + robust)
3. Optuna optimization (50 trials √ó 3 models √ó 5 clusters = 750 trials)
4. PurgedKFold CV (5 folds, 1% embargo)
5. SHAP feature importance
6. Save 15 models + reports

In [None]:
import time
from datetime import datetime

print("="*60)
print("üöÄ STARTING TRIDENT TRAINING")
print("="*60)
print(f"Start time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"Expected duration: 2.5-5 hours\n")

start_time = time.time()

# TRAIN
results = trainer.train(
    X=dataset['X'],
    y=dataset['y'],
    tickers=dataset['tickers'],
    ticker_features=dataset['ticker_features']
)

elapsed = time.time() - start_time
hours, remainder = divmod(elapsed, 3600)
minutes, seconds = divmod(remainder, 60)

print("\n" + "="*60)
print("‚úÖ TRAINING COMPLETE")
print("="*60)
print(f"End time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"Duration: {int(hours)}h {int(minutes)}m {int(seconds)}s")
print(f"\nModels trained: {len(results['models'])}")
print(f"Clusters created: {results['n_clusters']}")

---
## Step 5: Review Results

In [None]:
# Display cluster assignments
print("\nüìä CLUSTER ASSIGNMENTS")
print("="*60)

cluster_df = pd.DataFrame([
    {'Cluster': k, 'Name': v['name'], 'Tickers': ', '.join(v['tickers'][:5]) + '...'}
    for k, v in results['clusters'].items()
])
print(cluster_df.to_string(index=False))

# Display CV accuracies
print("\nüìà CROSS-VALIDATION RESULTS")
print("="*60)

for cluster_id in range(results['n_clusters']):
    print(f"\nCluster {cluster_id}: {results['clusters'][cluster_id]['name']}")
    for model_name in ['xgb', 'lgb', 'cat']:
        key = f'cluster_{cluster_id}_{model_name}'
        if key in results['models']:
            model_info = results['models'][key]
            print(f"   {model_name.upper()}: {model_info['cv_accuracy']:.1%}")

In [None]:
# Display SHAP feature importance (top 10 global)
print("\nüîç TOP 10 GLOBAL FEATURES (SHAP)")
print("="*60)

if 'shap_importances' in results:
    # Average SHAP values across all models
    all_importances = {}
    for cluster_id in range(results['n_clusters']):
        key = f'cluster_{cluster_id}'
        if key in results['shap_importances']:
            for feat, val in results['shap_importances'][key].items():
                if feat not in all_importances:
                    all_importances[feat] = []
                all_importances[feat].append(val)
    
    # Average and sort
    avg_importances = {k: np.mean(v) for k, v in all_importances.items()}
    top_10 = sorted(avg_importances.items(), key=lambda x: x[1], reverse=True)[:10]
    
    for i, (feat, importance) in enumerate(top_10, 1):
        print(f"{i:2d}. {feat:30s} {importance:.4f}")
else:
    print("‚ö†Ô∏è SHAP importances not computed")

---
## Step 6: Save Models to Google Drive

In [None]:
# Save models
output_dir = '/content/drive/MyDrive/trident_models'

print(f"üíæ Saving models to {output_dir}...")

trainer.save_models(
    output_dir=output_dir,
    results=results
)

print("\n‚úÖ MODELS SAVED")
print(f"   Location: {output_dir}")
print(f"   Files:")
print(f"      - 15 model files (cluster_X_{{xgb,lgb,cat}}.*)")
print(f"      - cluster_assignments.json")
print(f"      - training_report.md")
print(f"      - ticker_features.csv")

---
## Step 7: Quick Inference Test

In [None]:
from src.ml.inference_engine import TridenInference

print("üß™ Testing inference engine...")

# Initialize inference
engine = TridenInference(model_dir=output_dir)

# Get a sample from dataset
sample_idx = 0
sample_ticker = dataset['tickers'].iloc[sample_idx]
sample_features = dataset['X'].iloc[sample_idx]

# Predict
prediction = engine.predict(
    ticker=sample_ticker,
    features=sample_features
)

print("\n‚úÖ INFERENCE TEST")
print(f"   Ticker: {prediction['ticker']}")
print(f"   Signal: {prediction['signal']}")
print(f"   Confidence: {prediction['confidence']:.1f}%")
print(f"   Probability: {prediction['probability']:.3f}")
print(f"   Cluster ID: {prediction['cluster_id']}")
print(f"   Model votes: {prediction['model_votes']}")
print(f"   Timestamp: {prediction['timestamp']}")

---
## Step 8: View Training Report

In [None]:
# Display training report
report_path = f"{output_dir}/training_report.md"

if os.path.exists(report_path):
    with open(report_path, 'r') as f:
        report = f.read()
    
    from IPython.display import Markdown
    display(Markdown(report))
else:
    print("‚ö†Ô∏è Training report not found")

---
## üéâ TRAINING COMPLETE

**Next Steps:**

1. **Download models** from Google Drive to your local workspace
2. **Run backtest** using `src/ml/backtest_trident.py`
3. **Analyze SHAP** using `notebooks/SHAP_ANALYSIS.ipynb`
4. **Test in production** using inference_engine.py
5. **Build Portfolio Tracker** (Day 7)
6. **Build Watchlist Engine** (Day 8)
7. **Build Ultimate Companion** (Week 2)

**Expected Performance:**
- Baseline: 71.1% WR
- After Trident: 75-80% WR ‚ú®
- Sharpe Ratio: 2.5-3.5
- Max Drawdown: -10% to -15%

**Ready to make 15%/day sustainable!** üöÄ