In [None]:
# ==============================================================================
# PHASE 0: SETUP & GPU CHECK
# ==============================================================================

# Report CUDA visibility up front so a CPU-only runtime is noticed before
# any expensive cell is executed.
import torch

print(f"üñ•Ô∏è GPU Available: {torch.cuda.is_available()}")

if torch.cuda.is_available():
    # Device 0 is the only device inspected; total_memory is reported in
    # decimal gigabytes (bytes / 1e9).
    print(
        f"   GPU: {torch.cuda.get_device_name(0)}",
        f"   Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB",
        sep="\n",
    )

# Install dependencies
!pip install -q yfinance ta-lib-bin torch torchvision pillow matplotlib seaborn optuna scikit-learn lightgbm xgboost pandas numpy plotly mplfinance

import numpy as np
import pandas as pd
import yfinance as yf
import talib
import matplotlib.pyplot as plt
import mplfinance as mpf
from PIL import Image
import io
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# PyTorch for CNN
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

# Sklearn for ensemble
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import accuracy_score, classification_report

# Reached only if every import in this cell succeeded (a failing import raises
# and stops the cell before this line).
print("\n‚úÖ All packages installed!")

---
# 📊 PHASE 1: DATA COLLECTION & MULTI-TIMEFRAME ANALYSIS

In [None]:
# ==============================================================================
# PHASE 1: MULTI-TIMEFRAME DATA COLLECTION
# ==============================================================================

# Symbols used for training, one sector per row so the universe stays
# deliberately diverse.
TICKERS = [
    # Indices
    'SPY', 'QQQ', 'IWM',
    # Tech giants
    'AAPL', 'MSFT', 'GOOGL', 'AMZN', 'META',
    # Semiconductors
    'NVDA', 'AMD', 'TSM',
    # Auto
    'TSLA', 'F', 'GM',
    # Finance
    'JPM', 'BAC', 'GS',
    # Energy
    'XOM', 'CVX',
    # Healthcare
    'JNJ', 'UNH', 'PFE',
]

# Daily history runs from a fixed start date up to the day the cell is run.
START_DATE = '2020-01-01'
END_DATE = f"{datetime.now():%Y-%m-%d}"

print(
    f"üì• Downloading data for {len(TICKERS)} tickers...",
    f"   Period: {START_DATE} to {END_DATE}",
    sep="\n",
)

# Download multiple timeframes.
# data[timeframe][ticker] -> OHLCV DataFrame.
# NOTE(review): the '4h' bucket is declared here but nothing in this cell
# fills it — confirm whether a later cell populates it.
data = {
    '1d': {},
    '1h': {},
    '4h': {}
}


def _download_flat(ticker, **download_kwargs):
    """Download one ticker with yfinance and return a frame with flat columns.

    Returns an empty DataFrame when yfinance hands back nothing, so callers
    can rely on `.empty` instead of also checking for None.
    """
    frame = yf.download(ticker, progress=False, auto_adjust=True, **download_kwargs)
    if frame is None:
        return pd.DataFrame()
    # Single-ticker downloads may come back with a (field, ticker) MultiIndex;
    # keep only the field level so columns are plain 'Open', 'Close', ...
    if isinstance(frame.columns, pd.MultiIndex):
        frame.columns = frame.columns.get_level_values(0)
    return frame


for ticker in TICKERS:
    try:
        # Daily data (START_DATE .. END_DATE)
        df_1d = _download_flat(ticker, start=START_DATE, end=END_DATE)
        if df_1d.empty:
            # A failed download can yield an empty frame instead of raising;
            # do not register it as a usable ticker.
            print(f"  ‚úó {ticker}: no daily data returned")
            continue
        data['1d'][ticker] = df_1d

        # Hourly data (60 days)
        df_1h = _download_flat(ticker, period='60d', interval='1h')
        data['1h'][ticker] = df_1h

        print(f"  ‚úì {ticker}: 1D={len(df_1d)} rows, 1H={len(df_1h)} rows")
    except Exception as e:
        print(f"  ‚úó {ticker}: {e}")

print(f"\n‚úÖ Data collected: {sum(len(frame) for frame in data['1d'].values())} total 1D rows")

---
# 🖼️ PHASE 2: VISUAL PATTERN ANALYSIS (CNN)

**Approach:** Convert candlestick charts to images, train CNN to recognize patterns
- Input: 224x224 RGB images of 30-day candlestick charts
- Output: 3-class prediction (BUY/HOLD/SELL)
- Architecture: ResNet18 (pretrained on ImageNet, fine-tuned)
- Data augmentation: Random crops, flips (simulates different chart styles)

In [None]:
# ==============================================================================
# PHASE 2A: CHART IMAGE GENERATION
# ==============================================================================

def create_chart_image(df, window_size=30):
    """
    Generate candlestick chart image for CNN training.

    Args:
        df: OHLCV DataFrame accepted by ``mpf.plot``; only the last
            ``window_size`` rows are drawn.
        window_size: Number of trailing rows to render.

    Returns:
        PIL Image (224x224 RGB).

    Raises:
        ValueError: If the selected window contains no rows.
    """
    # Take last window_size rows
    df_window = df.iloc[-window_size:].copy()
    if len(df_window) == 0:
        raise ValueError("create_chart_image: no rows to plot")

    # Create candlestick chart with mplfinance
    fig, _axes = mpf.plot(
        df_window,
        type='candle',
        style='yahoo',
        volume=True,
        returnfig=True,
        figsize=(4, 4),
        tight_layout=True
    )

    try:
        # Render to an in-memory PNG and decode it back as a PIL image.
        with io.BytesIO() as buf:
            fig.savefig(buf, format='png', dpi=56, bbox_inches='tight')
            buf.seek(0)
            # convert() fully decodes the PNG into a new image, so the result
            # stays valid after the buffer is closed.
            img = Image.open(buf).convert('RGB')
    finally:
        # Always release the figure: this function is called in a long loop,
        # and a figure left open on an error path would accumulate in memory.
        plt.close(fig)

    # bbox_inches='tight' changes the pixel size, so force the final shape.
    return img.resize((224, 224), Image.LANCZOS)

# Generate chart images for training
print("üñºÔ∏è GENERATING CHART IMAGES...")
print("   This will take a few minutes...\n")

chart_dataset = []
WINDOW_SIZE = 30
HORIZON = 5  # Predict 5 days ahead
THRESHOLD = 0.03  # 3% threshold from Colab optimization
IMAGE_STRIDE = 5  # Keep one sample every 5 rows (reduce dataset size)

for ticker in list(data['1d'].keys())[:5]:  # Start with 5 tickers for speed
    df = data['1d'][ticker]
    close = df['Close']
    n_images = 0
    n_failed = 0

    # Rolling window. The chart covers rows [i-WINDOW_SIZE, i-1]; the label is
    # the return from row i to row i+HORIZON.
    for i in range(WINDOW_SIZE, len(df) - HORIZON):
        # Only every IMAGE_STRIDE-th row becomes a sample; skip the rest before
        # doing any work for them.
        if i % IMAGE_STRIDE != 0:
            continue

        # Calculate forward return (label)
        future_price = close.iloc[i + HORIZON]
        current_price = close.iloc[i]
        forward_return = (future_price - current_price) / current_price

        # A missing or zero price gives NaN/inf, which would otherwise fall
        # through both comparisons below and be mislabelled as HOLD.
        if not np.isfinite(forward_return):
            continue

        # Create label: 0=SELL, 1=HOLD, 2=BUY
        if forward_return > THRESHOLD:
            label = 2  # BUY
        elif forward_return < -THRESHOLD:
            label = 0  # SELL
        else:
            label = 1  # HOLD

        window_df = df.iloc[i-WINDOW_SIZE:i]
        try:
            img = create_chart_image(window_df, WINDOW_SIZE)
        except Exception as exc:
            # Best effort: one bad window should not abort the run, but it must
            # not vanish silently either. Report the first failure per ticker.
            n_failed += 1
            if n_failed == 1:
                print(f"    {ticker}: chart rendering failed at {df.index[i]}: {exc!r}")
            continue

        chart_dataset.append({
            'image': img,
            'label': label,
            'ticker': ticker,
            'date': df.index[i]
        })
        n_images += 1

    print(f"  ‚úì {ticker}: {n_images} images")
    if n_failed:
        print(f"    {ticker}: {n_failed} windows skipped due to rendering errors")

print(f"\n‚úÖ Generated {len(chart_dataset)} chart images")

# Show class distribution
labels = [x['label'] for x in chart_dataset]
print(f"\nüìä Class Distribution:")
if labels:
    for class_id, class_name in enumerate(('SELL', 'HOLD', 'BUY')):
        class_count = labels.count(class_id)
        print(f"   {class_name} ({class_id}): {class_count} ({class_count/len(labels)*100:.1f}%)")
else:
    # Avoid a ZeroDivisionError when no data was downloaded or rendered.
    print("   (no samples generated)")

In [None]:
# ==============================================================================
# PHASE 2B: CNN DATASET & DATALOADER
# ==============================================================================

class ChartDataset(Dataset):
    """Map-style dataset over a sequence of chart samples.

    Each element of ``data`` must support ``['image']`` and ``['label']``
    lookups. ``transform``, when truthy, is applied to the image only.
    """

    def __init__(self, data, transform=None):
        self.transform = transform
        self.data = data

    def __len__(self):
        """Number of samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for position ``idx``."""
        sample = self.data[idx]
        picture, target = sample['image'], sample['label']

        # Without a transform, hand the stored image back untouched.
        if not self.transform:
            return picture, target

        return self.transform(picture), target

# Data augmentation & normalization
# No horizontal flip on purpose: mirroring a candlestick chart reverses its
# time axis (a rally becomes a sell-off) while the BUY/HOLD/SELL label stays
# the same, so it would add label noise instead of useful variety.
transform_train = transforms.Compose([
    transforms.ColorJitter(brightness=0.2, contrast=0.2),  # Vary brightness/contrast
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # ImageNet stats
])

# Evaluation uses the same tensor conversion and normalization, without
# any random augmentation.
transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Train/test split (80/20, chronological).
# chart_dataset is built ticker by ticker, so slicing it directly would put
# (roughly) the last ticker in the test set and let the training set contain
# dates that come after test samples. Sort by sample date first so every test
# sample is at least as recent as every training sample.
# NOTE: labels look several rows ahead, so the last few training samples still
# have labels that depend on prices inside the test period.
chart_dataset_by_date = sorted(chart_dataset, key=lambda sample: sample['date'])

split_idx = int(len(chart_dataset_by_date) * 0.8)
train_data = chart_dataset_by_date[:split_idx]
test_data = chart_dataset_by_date[split_idx:]

if not train_data or not test_data:
    # Fail with a clear message rather than an opaque sampler error below.
    raise ValueError(
        f"Not enough chart samples to split: {len(train_data)} train / {len(test_data)} test"
    )

train_dataset = ChartDataset(train_data, transform=transform_train)
test_dataset = ChartDataset(test_data, transform=transform_test)

# Only the training loader shuffles; evaluation order stays chronological.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=2)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=2)

print(f"üì¶ DataLoaders created:")
print(f"   Train: {len(train_dataset)} samples, {len(train_loader)} batches")
print(f"   Test: {len(test_dataset)} samples, {len(test_loader)} batches")

In [None]:
# ==============================================================================
# PHASE 2C: CNN MODEL (ResNet18 Fine-Tuned)
# ==============================================================================

from torchvision import models

# Load pretrained ResNet18 (ImageNet weights).
# `pretrained=True` is deprecated in recent torchvision in favour of the
# `weights=` enum; IMAGENET1K_V1 is the checkpoint `pretrained=True` selected.
# The fallback keeps the cell working on torchvision versions without the enum.
if hasattr(models, 'ResNet18_Weights'):
    model_cnn = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
else:
    model_cnn = models.resnet18(pretrained=True)

# Replace final layer for 3-class prediction
num_features = model_cnn.fc.in_features
model_cnn.fc = nn.Linear(num_features, 3)  # 3 classes: SELL, HOLD, BUY

# Move to GPU when one is available, otherwise stay on CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_cnn = model_cnn.to(device)

# Loss & optimizer. Every parameter is trainable (no layers are frozen);
# the learning rate is halved every 5 scheduler steps.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model_cnn.parameters(), lr=0.001, weight_decay=1e-4)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)

print(f"üß† CNN Model initialized on {device}")
print(f"   Parameters: {sum(p.numel() for p in model_cnn.parameters()):,}")

In [None]:
# ==============================================================================
# PHASE 2D: TRAIN CNN
# ==============================================================================

def train_cnn(model, train_loader, test_loader, criterion, optimizer, scheduler,
              num_epochs=15, patience=None, save_path='best_cnn_model.pth'):
    """
    Train a classifier, evaluate it after every epoch and checkpoint the best one.

    Args:
        model: Module to train; batches are moved to the device of its first
            parameter (CPU if it has no parameters).
        train_loader: Iterable of ``(images, labels)`` training batches.
        test_loader: Iterable of ``(images, labels)`` evaluation batches.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer stepped once per training batch.
        scheduler: LR scheduler stepped once per epoch.
        num_epochs: Maximum number of epochs.
        patience: If set, stop once accuracy has not improved for this many
            consecutive epochs. ``None`` (default) always runs ``num_epochs``.
        save_path: File the best ``state_dict`` is written to.

    Returns:
        ``(train_losses, test_accs, best_acc)``: per-epoch mean training loss,
        per-epoch accuracy in percent, and the best accuracy seen.

    Note:
        The checkpoint is selected on ``test_loader``, so ``best_acc`` is an
        optimistic estimate. The model is left with its last-epoch weights;
        load ``save_path`` to get the best ones.
    """
    # Use the model's own device instead of relying on a notebook global.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    best_acc = 0.0
    epochs_without_improvement = 0
    train_losses = []
    test_accs = []

    for epoch in range(num_epochs):
        # Training
        model.train()
        running_loss = 0.0
        num_batches = 0  # counted manually so loaders without __len__ also work

        for images, labels in train_loader:
            images, labels = images.to(run_device), labels.to(run_device)

            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            num_batches += 1

        # An empty loader has no meaningful loss; report NaN rather than crash.
        avg_loss = running_loss / num_batches if num_batches else float('nan')
        train_losses.append(avg_loss)

        # Validation
        model.eval()
        correct = 0
        total = 0

        with torch.no_grad():
            for images, labels in test_loader:
                images, labels = images.to(run_device), labels.to(run_device)
                outputs = model(images)
                predicted = outputs.argmax(dim=1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        # Guard against an empty evaluation set (would divide by zero).
        test_acc = 100 * correct / total if total else 0.0
        test_accs.append(test_acc)

        print(f"Epoch [{epoch+1}/{num_epochs}] Loss: {avg_loss:.4f}, Test Acc: {test_acc:.2f}%")

        # Save best model
        if test_acc > best_acc:
            best_acc = test_acc
            epochs_without_improvement = 0
            torch.save(model.state_dict(), save_path)
            print(f"  ‚úÖ Best model saved (acc={best_acc:.2f}%)")
        else:
            epochs_without_improvement += 1

        scheduler.step()

        # Optional early stopping
        if patience is not None and epochs_without_improvement >= patience:
            print(f"  Early stopping at epoch {epoch+1} (no improvement for {patience} epochs)")
            break

    return train_losses, test_accs, best_acc

# Kick off CNN training with the model, loaders, loss, optimizer and
# scheduler built in the earlier cells, then report the best accuracy.
print("\nüöÄ TRAINING CNN...")
print("".ljust(70, "="))

train_losses, test_accs, best_acc = train_cnn(
    model=model_cnn,
    train_loader=train_loader,
    test_loader=test_loader,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    num_epochs=15,
)

print(f"\nüèÜ BEST CNN ACCURACY: {best_acc:.2f}%")

---
# 🔢 PHASE 3: NUMERICAL PATTERN ANALYSIS (HistGradientBoosting)

**Approach:** Traditional technical indicators + advanced features
- **From Perplexity Research:**
  - Triple barrier labeling (ATR-based dynamic thresholds)
  - Market regime features (HMM, volatility clustering)
  - Cross-asset features (SPY correlation, VIX divergence)
  - Multi-timeframe indicators (1H, 4H, 1D alignment)
- **Optimized Features:** Top 15 from Colab training
- **Model:** HistGradientBoostingClassifier with optimized hyperparameters

In [None]:
# Continue in next cell...