## 🔧 1. Environment Setup

In [None]:
# Check GPU availability: print the PyTorch build and, when CUDA is usable,
# the name, CUDA version and memory of device 0.
import torch

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"CUDA version: {torch.version.cuda}")
    # total_memory is divided by 1e9 so the figure is shown in (decimal) GB
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
else:
    print("⚠️ WARNING: GPU not available! Training will be very slow.")
    print("Go to Runtime → Change runtime type → Hardware accelerator → GPU")

In [None]:
# Mount Google Drive and make sure the project folder exists there.
import os

from google.colab import drive

drive.mount('/content/drive')

# Project directory on Drive; later cells build their Drive paths from it.
project_dir = '/content/drive/MyDrive/CurriTail_GAN'
os.makedirs(project_dir, exist_ok=True)  # no error if it already exists
print(f"Project directory: {project_dir}")

## 📦 2. Install Dependencies

In [None]:
%%capture
# Install required packages (silent mode).
# The %%capture cell magic on the first line swallows this cell's output, so pip
# messages are hidden; remove it temporarily when debugging a failed install.
!pip install yfinance statsmodels seaborn tqdm scipy scikit-learn matplotlib pandas numpy

## 📂 3. Upload Code Files

**Option A: Upload from local machine** (recommended for first run)
- Run the cell below and upload all `.py` files from your local project

**Option B: Clone from GitHub** (if you have a repo)
- Uncomment and modify the git clone command

In [None]:
# Option A: Upload files manually
from google.colab import files

# Entries shown in the upload prompt, in display order.
files_to_upload = [
    "config.py",
    "utils.py",
    "models.py",
    "metrics.py",
    "baselines.py",
    "statistical_tests.py",
    "portfolio.py",
    "plotting.py",
    "main_experiment.py",
    "(optional) test_suite.py, failure_analysis.py",
]

print("📤 Upload ALL .py files from your project:")
for entry in files_to_upload:
    print(f"   - {entry}")
print("\nSelect multiple files at once!")

# Opens the browser file picker; `uploaded` is reported by its length below.
uploaded = files.upload()
print(f"\n✅ Uploaded {len(uploaded)} files")

In [None]:
# Option B: Clone from GitHub (uncomment if using)
# !git clone https://github.com/YOUR_USERNAME/curtail_gan.git
# %cd curtail_gan

In [None]:
# Verify all required files are present
import os

required_files = [
    'config.py', 'utils.py', 'models.py', 'metrics.py', 
    'baselines.py', 'statistical_tests.py', 'portfolio.py',
    'plotting.py', 'main_experiment.py'
]

missing = [f for f in required_files if not os.path.exists(f)]

if missing:
    print("‚ùå Missing files:")
    for f in missing:
        print(f"   - {f}")
else:
    print("‚úÖ All required files present!")
    print("\nFiles in current directory:")
    !ls -lh *.py

## ⚙️ 4. Configure Experiment

Choose your configuration:
- **Quick Test** (2-3 hours): 5 seeds, 100 epochs, 1 dataset
- **Medium Run** (8-10 hours): 10 seeds, 200 epochs, 2 datasets  
- **Full Publication** (16-20 hours): 10 seeds, 200 epochs, 3 datasets

In [None]:
# ============================================
# CONFIGURATION SELECTOR
# ============================================

# Choose one: 'quick', 'medium', 'full'
RUN_MODE = 'quick'  # ← CHANGE THIS

# Settings per run mode. 'description' carries the runtime estimate in
# parentheses; the summary printed below extracts it from there.
configs = {
    'quick': {
        'seeds': list(range(42, 47)),  # 5 seeds
        'epochs': 100,
        'datasets': ['Synthetic'],
        'batch_size': 256,
        'description': 'Quick test (2-3 hours)'
    },
    'medium': {
        'seeds': list(range(42, 52)),  # 10 seeds
        'epochs': 200,
        'datasets': ['Synthetic', 'SPX'],
        'batch_size': 256,
        'description': 'Medium run (8-10 hours)'
    },
    'full': {
        'seeds': list(range(42, 52)),  # 10 seeds
        'epochs': 200,
        'datasets': ['Synthetic', 'SPX', 'BTC'],
        'batch_size': 256,
        'description': 'Full publication (16-20 hours)'
    }
}


def select_config(mode):
    """Return the settings dict for *mode*.

    Raises:
        ValueError: if *mode* is not a key of ``configs`` (e.g. a typo in
            RUN_MODE); the message lists the valid choices.
    """
    try:
        return configs[mode]
    except KeyError:
        raise ValueError(
            f"Unknown RUN_MODE {mode!r}; choose one of {sorted(configs)}"
        ) from None


cfg = select_config(RUN_MODE)

print(f"📊 Selected Configuration: {cfg['description']}")
print(f"   Seeds: {len(cfg['seeds'])} ({cfg['seeds'][0]} to {cfg['seeds'][-1]})")
print(f"   Epochs: {cfg['epochs']}")
print(f"   Datasets: {cfg['datasets']}")
print(f"   Batch size: {cfg['batch_size']}")
# The estimate is the text between the parentheses of the description
print(f"\n⏱️ Estimated runtime: {cfg['description'].split('(')[1].strip(')')}")

In [None]:
# Modify config.py with selected parameters.
# Rewrites seeds / epochs / datasets / batch_size inside config.py with the
# values of `cfg`, then (re)loads the module and prints what it now reports.
import importlib
import re
import sys

# Read config.py
with open('config.py', 'r', encoding='utf-8') as f:
    config_content = f.read()

# (pattern, replacement) pairs.
# - The seeds pattern accepts both `list(range(a, b))` and a list literal,
#   because this cell itself writes a list literal: without the second
#   alternative a re-run with another RUN_MODE would leave the seeds unchanged.
# - `\b` keeps `epochs` / `batch_size` from matching the tail of longer field
#   names (anything ending in `_epochs`, for instance).
replacements = [
    (r'self\.seeds = (?:list\(range\(\d+, \d+\)\)|\[[^\]]*\])',
     f'self.seeds = {cfg["seeds"]}'),
    (r'\bepochs: int = \d+',
     f'epochs: int = {cfg["epochs"]}'),
    (r'self\.datasets = \[.*?\]',
     f'self.datasets = {cfg["datasets"]}'),
    (r'\bbatch_size: int = \d+',
     f'batch_size: int = {cfg["batch_size"]}')
]

for pattern, replacement in replacements:
    # A callable replacement is inserted verbatim (no backslash/group expansion).
    config_content, n_hits = re.subn(
        pattern, lambda _match, text=replacement: text, config_content
    )
    if n_hits == 0:
        # Do not fail silently: the setting keeps whatever value config.py had.
        print(f"⚠️ Pattern not found in config.py (setting left unchanged): {pattern}")

# Save modified config
with open('config.py', 'w', encoding='utf-8') as f:
    f.write(config_content)

print("✅ Configuration updated!")

# Verify changes. If `config` was imported earlier in this session, a plain
# import would return the cached (stale) module, so reload it instead.
# Modules that already did `from config import CONFIG` keep the old object;
# restart the runtime if the mode is changed after other project imports.
if 'config' in sys.modules:
    CONFIG = importlib.reload(sys.modules['config']).CONFIG
else:
    from config import CONFIG
print(f"\nVerification:")
print(f"  Seeds: {CONFIG.seeds[:3]}...{CONFIG.seeds[-1]} ({len(CONFIG.seeds)} total)")
print(f"  Epochs: {CONFIG.epochs}")
print(f"  Datasets: {CONFIG.datasets}")
print(f"  Batch size: {CONFIG.batch_size}")
print(f"  Device: {CONFIG.device}")

## 🚀 5. Run Experiments

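**⚠️ Important Notes:**
- Colab sessions have a maximum lifetime (about 12 hours on the free tier) and disconnect much sooner when left idle
- For long runs, keep the browser tab active or use Colab Pro
- Results are read from the local `outputs/`, `figures/` and `saved_models/` directories; run step 7 to copy them to Google Drive
- Progress bars show real-time status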

In [None]:
# Run main experiment
import sys
from datetime import datetime

print("="*80)
print(f"üöÄ Starting CurriTail-GAN Experiments")
print(f"üìÖ Start time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print("="*80)
print()

# Redirect outputs to Google Drive
import os
os.makedirs(f"{project_dir}/outputs", exist_ok=True)
os.makedirs(f"{project_dir}/figures", exist_ok=True)
os.makedirs(f"{project_dir}/saved_models", exist_ok=True)

# Run main experiment
%run main_experiment.py

print()
print("="*80)
print(f"‚úÖ Experiments Complete!")
print(f"üìÖ End time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print("="*80)

## 📊 6. View Results

In [None]:
# Display summary statistics from the most recent outputs/summary_*.csv
import glob
import os

import pandas as pd


def rank_models_by_tail_kl(all_results):
    """Return mean tail KL per model, best (lowest) first.

    Args:
        all_results: DataFrame with 'dataset', 'model' and 'tail_kl' columns
            (one row per run).

    Returns:
        Series indexed by model: tail_kl is first averaged within each
        (dataset, model) pair, then across datasets, so every dataset weighs
        equally regardless of how many rows it has. Sorted ascending.
    """
    per_dataset = all_results.groupby(['dataset', 'model'])['tail_kl'].mean()
    # Grouping by 'dataset' alone would drop 'model' and make this step fail.
    return per_dataset.groupby(level='model').mean().sort_values()


# Find latest results file (lexicographic order of the file names)
result_files = sorted(glob.glob('outputs/summary_*.csv'))
if result_files:
    latest_summary = result_files[-1]
    print(f"📄 Loading: {latest_summary}\n")

    df_summary = pd.read_csv(latest_summary, index_col=[0, 1])
    print("=" * 80)
    print("SUMMARY RESULTS (Mean ± Std across seeds)")
    print("=" * 80)
    print(df_summary)

    # Highlight best performers
    print("\n" + "=" * 80)
    print("📈 BEST PERFORMERS (by Tail KL Divergence - lower is better)")
    print("=" * 80)

    # The per-seed file is looked up under the summary's name with the
    # 'summary' prefix swapped; only the file name is rewritten, not the folder.
    folder, file_name = os.path.split(latest_summary)
    all_seeds_path = os.path.join(
        folder, file_name.replace('summary', 'results_all_seeds', 1)
    )
    if os.path.exists(all_seeds_path):
        all_results = pd.read_csv(all_seeds_path)
        best_by_dataset = rank_models_by_tail_kl(all_results)
        print(best_by_dataset)
    else:
        print(f"❌ Per-seed results file not found: {all_seeds_path}")
else:
    print("❌ No results found yet. Run experiments first.")

In [None]:
# Display generated figures (every PNG under figures/, in name order)
import glob
import os  # imported here so the cell also works right after a runtime restart

from IPython.display import Image, display

figure_files = sorted(glob.glob('figures/*.png'))

if figure_files:
    print(f"📊 Generated {len(figure_files)} figures:\n")

    for fig_path in figure_files:
        # Banner with the file name, then the image itself
        print(f"\n{'='*80}")
        print(f"📈 {os.path.basename(fig_path)}")
        print(f"{'='*80}")
        display(Image(filename=fig_path, width=800))
else:
    print("❌ No figures found yet.")

## 📊 6b. Analyze Saved Models (Skip Retraining)

If you already have trained models saved as `.pth` files, you can skip retraining and just run statistical analysis on the saved checkpoints. This is **much faster** (~2-3 minutes vs hours of training)!

In [None]:
# Upload the analyze_saved_models.py script
from google.colab import files

print("📤 Upload analyze_saved_models.py:")
uploaded = files.upload()

# The upload result is checked by file name
if 'analyze_saved_models.py' in uploaded:
    print("✅ analyze_saved_models.py uploaded successfully!")
else:
    print("❌ Please upload analyze_saved_models.py")

In [None]:
# Run analysis on saved model checkpoints
from analyze_saved_models import main

print("🔍 Analyzing saved model checkpoints...")
print("="*80)

# Analyze models from Google Drive
results, df_all, df_summary = main(
    models_dir=f"{project_dir}/saved_models",  # Where your .pth files are
    datasets=['SPX'],  # Or ['Synthetic', 'SPX', 'BTC']
    generate_plots=True
)

# A falsy `results` is reported as "no models found"
if results:
    print("\n✅ Analysis complete!")
    print(f"\nResults saved to: {project_dir}/outputs/")
    print(f"Figures saved to: {project_dir}/figures/")
else:
    print("\n❌ No models found. Make sure .pth files are in:")
    print(f"   {project_dir}/saved_models/")

In [None]:
# View the statistical comparison results for the 'SPX' dataset
# (`results` comes from the analysis cell above).
if results and 'SPX' in results:
    print("="*80)
    print("📊 STATISTICAL COMPARISONS vs CurriTail")
    print("="*80)

    # Each section is printed only when the analysis produced it
    if results['SPX']['statistical_comparisons'] is not None:
        print(results['SPX']['statistical_comparisons'])

    print("\n" + "="*80)
    print("📈 VALIDATION AGAINST PAPER")
    print("="*80)

    if results['SPX']['validation'] is not None:
        print(results['SPX']['validation'])

## 💾 7. Download Results

In [None]:
# Copy all outputs to Google Drive for persistence
!cp -r outputs/ "{project_dir}/"
!cp -r figures/ "{project_dir}/"
!cp -r saved_models/ "{project_dir}/"
!cp -r data_cache/ "{project_dir}/" 2>/dev/null || true

print(f"‚úÖ All results saved to Google Drive:")
print(f"   {project_dir}/outputs/")
print(f"   {project_dir}/figures/")
print(f"   {project_dir}/saved_models/")
print("\nüí° Access files from Google Drive even after Colab disconnects!")

In [None]:
# Create downloadable zip file
from datetime import datetime

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
zip_name = f"CurriTail_Results_{timestamp}.zip"

!zip -r {zip_name} outputs/ figures/ saved_models/ -q

print(f"üì¶ Created: {zip_name}")
print(f"   Size: ", end="")
!du -h {zip_name}

print("\n‚¨áÔ∏è Downloading...")
from google.colab import files
files.download(zip_name)

## 🧪 8. Optional: Run Tests & Failure Analysis

In [None]:
# Run unit tests (if test_suite.py was uploaded)
import os

if os.path.exists('test_suite.py'):
    print("üß™ Running unit tests...\n")
    %run test_suite.py
else:
    print("‚ÑπÔ∏è test_suite.py not found. Skipping tests.")

In [None]:
# Run failure analysis (if failure_analysis.py was uploaded)
if os.path.exists('failure_analysis.py'):
    print("üîç Running failure analysis...\n")
    %run failure_analysis.py
else:
    print("‚ÑπÔ∏è failure_analysis.py not found. Skipping analysis.")

## 🔧 Troubleshooting

### Common Issues:

**1. "RuntimeError: CUDA out of memory"**
- Reduce batch size: Set `batch_size = 128` in config
- Restart runtime and clear GPU memory

**2. "Colab disconnected after X hours"**
- Use Colab Pro for longer sessions
- Results are saved to Drive - you can resume
- Keep browser tab active

**3. "Slow training speed"**
- Verify GPU is enabled (Cell 1)
- Check batch size (larger = faster on GPU)
- Use 'quick' mode first

**4. "ModuleNotFoundError"**
- Re-run installation cell
- Ensure all .py files uploaded

### Performance Tips:
- **T4 GPU** (free): ~3-4 hours for quick mode
- **A100 GPU** (Pro+): ~1-2 hours for quick mode  
- **V100 GPU** (Pro): ~2-3 hours for quick mode

### Support:
- Check logs in Drive: `{project_dir}/outputs/metadata_*.json`
- Review error messages carefully
- Ensure GPU is properly allocated

## 🧹 9. Cleanup (Optional)

Free up Colab storage after downloading results

In [None]:
# Remove local files (results are in Google Drive)
!rm -rf outputs/ figures/ saved_models/ data_cache/ *.zip
print("‚úÖ Cleanup complete. Results remain in Google Drive.")

---

## 📚 Citation

If you use this code in your research, please cite:

```bibtex
@article{curritailgan2024,
  title={CurriTail-GAN: Curriculum Learning for Tail Generation in Financial Returns},
  author={Your Name},
  journal={Your Journal},
  year={2024}
}
```

---

**🎓 Publication-Ready Experimental Suite**  
**⚡ Optimized for Google Colab GPU**  
**📊 Complete Statistical Analysis Pipeline**

---