# NBA DFS Walk-Forward Backtest - Google Colab

This notebook runs per-player model training on Google Colab with Google Drive persistence.

## Setup Requirements
1. Upload your data to Google Drive: `MyDrive/nba_dfs/data/inputs/`
2. Run all cells in order
3. Results are saved to: `MyDrive/nba_dfs/data/outputs/`

## Estimated Time
- Free Colab: ~21 min per slate
- Colab Pro: ~10 min per slate
- Colab Pro+: ~5 min per slate

## 1. Mount Google Drive

In [None]:
# Mount Google Drive so the project folder under MyDrive is reachable from
# the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

import os

# On a first run the project folder does not exist yet (it is created in
# step 3), so only switch into it when it is present instead of crashing
# with FileNotFoundError.
project_dir = '/content/drive/MyDrive/nba_dfs'
if os.path.isdir(project_dir):
    os.chdir(project_dir)
else:
    print(f"Project directory not found yet: {project_dir} (it is created in step 3)")
print(f"Working directory: {os.getcwd()}")

## 2. Install Dependencies

In [None]:
# Install the third-party packages used by the backtest via a shell escape.
# -q keeps pip's output quiet; xgboost is pinned to 1.7.6, the rest are unpinned.
!pip install -q xgboost==1.7.6 pyarrow fastparquet pyyaml python-dotenv joblib scipy tqdm

## 3. Setup Project Structure

First time only: upload your project files (the `src/` and `config/` folders) to Drive, or clone them from GitHub.

In [None]:
import sys
from pathlib import Path

# Root of the project on the mounted Drive; later cells build paths from it.
project_root = Path('/content/drive/MyDrive/nba_dfs')

# Remember whether this is a fresh project before any directory is created,
# so the right message can be shown afterwards.
is_new_project = not project_root.exists()
if is_new_project:
    print("Creating project structure...")

# Always make sure the data folders exist: mkdir with exist_ok=True is a
# no-op when they are already there, and this also repairs a project folder
# that exists but is missing data/inputs or data/outputs.
for data_subdir in ('inputs', 'outputs'):
    (project_root / 'data' / data_subdir).mkdir(parents=True, exist_ok=True)

if is_new_project:
    print("Project structure created. Please upload your src/ and config/ folders.")
else:
    print(f"Project exists at {project_root}")

# Put the project root at the front of sys.path so `src` is importable, but
# avoid stacking a duplicate entry every time this cell is re-run.
project_root_str = str(project_root)
if sys.path[:1] != [project_root_str]:
    sys.path.insert(0, project_root_str)
print(f"Python path: {sys.path[0]}")

## 4. Verify Data

In [None]:
# Report, for each expected input folder, how many parquet files it holds
# (or that the folder is missing).
data_dir = project_root / 'data' / 'inputs'
expected_subdirs = ('box_scores', 'dfs_salaries', 'betting_odds', 'schedule')

print("Data directories:")
for folder_name in expected_subdirs:
    folder = data_dir / folder_name
    if not folder.exists():
        print(f"  {folder_name}: NOT FOUND")
        continue
    # Only top-level *.parquet entries are counted (glob, not rglob).
    parquet_count = sum(1 for _ in folder.glob('*.parquet'))
    print(f"  {folder_name}: {parquet_count} files")

## 5. Check System Resources

In [None]:
import psutil
import multiprocessing

# Gather the runtime's CPU count and total RAM (converted from bytes to GiB).
BYTES_PER_GB = 1024**3
LOW_RAM_THRESHOLD_GB = 12

cpu_count = multiprocessing.cpu_count()
total_ram_bytes = psutil.virtual_memory().total
ram_gb = total_ram_bytes / BYTES_PER_GB

print("\n".join([
    f"CPU Cores: {cpu_count}",
    f"RAM: {ram_gb:.1f} GB",
    f"Recommended n_jobs: {cpu_count}",
]))

# Warn when the machine has less than 12 GB of RAM in total.
if not ram_gb >= LOW_RAM_THRESHOLD_GB:
    print("WARNING: Low RAM detected. Consider reducing n_jobs or processing fewer players.")

## 6. Configure Backtest Parameters

In [None]:
# Backtest window, given as YYYYMMDD strings.
TRAIN_START = '20241001'
TRAIN_END = '20241130'
TEST_START = '20241201'
TEST_END = '20241215'

# Model and feature settings handed to the backtest later on.
MODEL_TYPE = 'xgboost'
FEATURE_CONFIG = 'default_features'
MIN_PLAYER_GAMES = 10
RECALIBRATE_DAYS = 7

# -1 is reported below as "all cores"; set a positive integer to cap it.
N_JOBS = -1

# Absolute paths on Drive for the database file and the output folder.
DB_PATH = str(project_root / 'nba_dfs.db')
OUTPUT_DIR = str(project_root / 'data' / 'outputs')

# Only label the job count as "all cores" when it really is -1, so the
# summary stays truthful after N_JOBS is edited.
jobs_label = f"{N_JOBS} (all cores)" if N_JOBS == -1 else str(N_JOBS)

print("Configuration:")
print(f"  Training: {TRAIN_START} to {TRAIN_END}")
print(f"  Testing: {TEST_START} to {TEST_END}")
print(f"  Model: {MODEL_TYPE}")
print(f"  Features: {FEATURE_CONFIG}")
print(f"  Parallel jobs: {jobs_label}")
print(f"  Output: {OUTPUT_DIR}")

## 7. Import Dependencies

In [None]:
import logging
import pandas as pd
from datetime import datetime

from src.walk_forward_backtest import WalkForwardBacktest

# Configure root logging at INFO with timestamped records.
# force=True removes any handlers already attached to the root logger before
# applying this configuration; without it basicConfig silently does nothing
# when handlers exist, which is commonly the case in notebook runtimes, and
# INFO messages would not be shown.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    force=True,
)

print("Imports successful")

## 8. Initialize Backtest

In [None]:
# Hyperparameters handed to WalkForwardBacktest as `model_params`.
model_params = dict(
    max_depth=6,
    learning_rate=0.05,
    n_estimators=200,
    min_child_weight=5,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)

# Collect every constructor argument in one mapping, then build the backtest
# from it. Values come from the configuration cell above.
backtest_kwargs = {
    'db_path': DB_PATH,
    'train_start': TRAIN_START,
    'train_end': TRAIN_END,
    'test_start': TEST_START,
    'test_end': TEST_END,
    'model_type': MODEL_TYPE,
    'model_params': model_params,
    'feature_config': FEATURE_CONFIG,
    'output_dir': OUTPUT_DIR,
    'per_player_models': True,
    'min_player_games': MIN_PLAYER_GAMES,
    'recalibrate_days': RECALIBRATE_DAYS,
    'save_models': True,
    'save_predictions': True,
    'n_jobs': N_JOBS,
}
backtest = WalkForwardBacktest(**backtest_kwargs)

print("Backtest initialized")

print("Backtest initialized")

## 9. Run Backtest

This will take 5-21 minutes per slate depending on your Colab tier.

In [None]:
# Run the backtest and report wall-clock start, end and elapsed time.
separator = "=" * 80

start_time = datetime.now()
print(f"Starting backtest at {start_time:%Y-%m-%d %H:%M:%S}")
print(separator)

results = backtest.run()

end_time = datetime.now()
elapsed = end_time - start_time
print(separator)
print(f"Backtest completed at {end_time:%Y-%m-%d %H:%M:%S}")
print(f"Total time: {elapsed}")

## 10. View Results Summary

In [None]:
# Print the headline metrics stored in `results`. Each row pairs a format
# template with the `results` key that fills it; rows are printed one at a
# time so output appears in the same order as before.
rule = "=" * 80

overview_rows = [
    ("Slates processed: {}", 'num_slates'),
    ("Date range: {}", 'date_range'),
    ("Total players evaluated: {:.0f}", 'total_players_evaluated'),
]
model_rows = [
    ("  Mean MAPE: {:.2f}%", 'model_mean_mape'),
    ("  Median MAPE: {:.2f}%", 'model_median_mape'),
    ("  Mean RMSE: {:.2f}", 'model_mean_rmse'),
    ("  Mean MAE: {:.2f}", 'model_mean_mae'),
    ("  Mean Correlation: {:.3f}", 'model_mean_correlation'),
]
benchmark_rows = [
    ("  Mean MAPE: {:.2f}%", 'benchmark_mean_mape'),
    ("  Improvement: {:+.2f}%", 'mape_improvement'),
]

print("\n" + rule)
print("BACKTEST RESULTS SUMMARY")
print(rule)
for template, key in overview_rows:
    print(template.format(results[key]))
print()
print("Model Performance:")
for template, key in model_rows:
    print(template.format(results[key]))
print()
print("Benchmark Performance:")
for template, key in benchmark_rows:
    print(template.format(results[key]))
print(rule)

## 11. View Daily Results

In [None]:
# Show the per-date results table restricted to the key metric columns.
daily_df = results['daily_results']
daily_columns = [
    'date',
    'num_players',
    'model_mape',
    'model_rmse',
    'model_corr',
    'benchmark_mape',
    'mean_actual',
    'mean_projected',
]
display(daily_df[daily_columns])

## 12. Visualize Results

In [None]:
import matplotlib.pyplot as plt

# 2x2 dashboard: MAPE over time, correlation over time, players per slate,
# and an actual-vs-projected scatter of every prediction.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Top-left: model vs benchmark MAPE per date.
axes[0, 0].plot(daily_df['date'], daily_df['model_mape'], marker='o', label='Model')
axes[0, 0].plot(daily_df['date'], daily_df['benchmark_mape'], marker='s', label='Benchmark')
axes[0, 0].set_title('MAPE by Date')
axes[0, 0].set_xlabel('Date')
axes[0, 0].set_ylabel('MAPE (%)')
axes[0, 0].legend()
axes[0, 0].tick_params(axis='x', rotation=45)

# Top-right: model correlation per date.
axes[0, 1].plot(daily_df['date'], daily_df['model_corr'], marker='o', color='green')
axes[0, 1].set_title('Correlation by Date')
axes[0, 1].set_xlabel('Date')
axes[0, 1].set_ylabel('Correlation')
axes[0, 1].tick_params(axis='x', rotation=45)

# Bottom-left: number of players evaluated on each slate, by row position.
axes[1, 0].bar(range(len(daily_df)), daily_df['num_players'])
axes[1, 0].set_title('Players Evaluated per Slate')
axes[1, 0].set_xlabel('Slate Index')
axes[1, 0].set_ylabel('Number of Players')

# Bottom-right: scatter of all predictions, skipped when there are none.
all_preds = results['all_predictions']
if not all_preds.empty:
    # Stretch the y = x reference line over the whole data range instead of a
    # fixed 0-70 span, so it still covers scores above 70 or below 0. The
    # original 0-70 span is kept as the minimum extent. pandas skips NaN in
    # min()/max(); if a result is NaN the built-in min/max keep 0 / 70.
    fpts = all_preds[['actual_fpts', 'projected_fpts']]
    line_lo = min(0, fpts.min().min())
    line_hi = max(70, fpts.max().max())

    axes[1, 1].scatter(all_preds['actual_fpts'], all_preds['projected_fpts'], alpha=0.5)
    axes[1, 1].plot([line_lo, line_hi], [line_lo, line_hi], 'r--', label='Perfect Prediction')
    axes[1, 1].set_title('Actual vs Projected Fantasy Points')
    axes[1, 1].set_xlabel('Actual FPts')
    axes[1, 1].set_ylabel('Projected FPts')
    axes[1, 1].legend()

plt.tight_layout()
plt.show()

## 13. Save Results to Drive

In [None]:
# Print where this run's artifacts were written. The report path is only
# shown when the results include one.
if 'report_path' in results:
    print(f"Report saved to: {results['report_path']}")

print(f"\nAll outputs saved to: {backtest.run_output_dir}")
print(f"  - Predictions: {backtest.run_predictions_dir}")
print(f"  - Training inputs: {backtest.run_inputs_dir}")

# rglob('*') yields directories as well as files; keep only regular files so
# the "Total output files" figure is not inflated by folder entries.
output_files = [
    entry for entry in Path(backtest.run_output_dir).rglob('*') if entry.is_file()
]
print(f"\nTotal output files: {len(output_files)}")

## 14. Export Summary CSV

In [None]:
# Write the daily results table to a CSV named after the backtest date range
# and trigger a browser download of it.
date_range_tag = results['date_range'].replace(' to ', '_')
summary_path = project_root.joinpath('data', 'outputs', f"summary_{date_range_tag}.csv")
daily_df.to_csv(summary_path, index=False)
print(f"Summary CSV saved to: {summary_path}")

from google.colab import files
files.download(str(summary_path))

## Optional: Download All Results

In [None]:
import shutil
from pathlib import Path

# Imported here as well so this optional cell works even when the CSV export
# cell (which also imports `files`) was skipped.
from google.colab import files

# Zip the whole run output folder into data/outputs. make_archive takes the
# archive name without extension and returns the path of the file it wrote;
# using that return value avoids rebuilding the name with with_suffix(''),
# which would truncate a timestamp that happens to contain a dot.
archive_base = project_root / 'data' / 'outputs' / f"backtest_results_{backtest.run_timestamp}"
archive_path = Path(
    shutil.make_archive(
        str(archive_base),
        'zip',
        backtest.run_output_dir
    )
)

print(f"Results archived to: {archive_path}")
print(f"Size: {archive_path.stat().st_size / (1024**2):.1f} MB")

files.download(str(archive_path))