# CPB v2: All-in-One Colab Training

**一体化训练系统 - 所有代码集成在 Colab 中**

优点：
- 无需每次重复设置环境
- 可直接在 Colab 中编辑代码
- 环境在同一运行时会话内只需配置一次（运行时断开或重置后需重新运行 Cell 1）
- 快速迭代开发

执行步骤：
1. 运行 Cell 1: 环境设置 (仅需一次)
2. 运行 Cell 2-5: 加载内置模块
3. 运行 Cell 6-8: 配置、下载数据并执行训练
4. 修改参数后，只需重新运行相关 Cell

In [None]:
# ============================================================================
# SETUP CELL (only needs to run once per runtime)
# ============================================================================

import os
import sys
import subprocess

# Install dependencies (only needed on the first run).
# NOTE(review): stderr is redirected to /dev/null, so a failed install is not
# reported and the success message below is printed unconditionally.
print('Installing dependencies...')
!pip install -q torch pandas numpy scikit-learn requests ta-lib pandas-ta huggingface-hub tqdm 2>/dev/null
print('✓ Dependencies installed')

# Report whether PyTorch can see a GPU and, if so, its name and total memory.
import torch
print(f'\nGPU Available: {torch.cuda.is_available()}')
if torch.cuda.is_available():
    print(f'GPU: {torch.cuda.get_device_name(0)}')
    # total_memory is divided by 1e9 to print gigabytes.
    print(f'Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB')

In [None]:
# ============================================================================
# MODULE 1: Data Collector (内置代码)
# 可以直接在这里编辑！
# ============================================================================

import requests
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import time
import logging

logger = logging.getLogger(__name__)

class BinanceDataCollector:
    """Download and validate OHLCV candles from the Binance REST API.

    Class attributes:
        BASE_URL: Root URL of the REST API that is queried.
        MAX_CANDLES: Largest ``limit`` value sent with a single request.
        MAX_RETRIES: Consecutive failed requests tolerated before giving up.
        OHLCV_COLUMNS: Numeric columns every returned frame carries.
    """

    BASE_URL = "https://api.binance.com/api/v3"
    MAX_CANDLES = 1000
    MAX_RETRIES = 3
    OHLCV_COLUMNS = ['open', 'high', 'low', 'close', 'volume']

    def __init__(self):
        # One session is shared by all page requests of this collector.
        self.session = requests.Session()

    def get_klines(self, symbol, interval="15m", limit=3000, days=90):
        """Download candles for ``symbol``, paging forward from ``days`` ago.

        Requests start at ``now - days`` and walk forward in pages of at most
        ``MAX_CANDLES`` until ``limit`` candles are collected, the API returns
        an empty page, or the current time is reached.

        Args:
            symbol: Trading pair sent as the ``symbol`` query parameter.
            interval: Candle interval sent as the ``interval`` parameter.
            limit: Maximum number of candles to collect in total.
            days: Size of the look-back window in days (default 90).

        Returns:
            A DataFrame with columns ``timestamp, open, high, low, close,
            volume`` sorted by ``timestamp``; an empty DataFrame when nothing
            was downloaded. If requests keep failing, whatever was collected
            before the failure is returned.
        """
        print(f'Downloading {symbol} {interval}...')

        all_klines = []
        # time.time() is epoch-based and therefore timezone independent;
        # calling .timestamp() on a naive utcnow() value would interpret it
        # as local time and shift the window on non-UTC machines.
        end_time = int(time.time() * 1000)
        start_time = end_time - int(timedelta(days=days).total_seconds() * 1000)
        current_start = start_time

        retry_count = 0

        while current_start < end_time and len(all_klines) < limit:
            try:
                params = {
                    "symbol": symbol,
                    "interval": interval,
                    "startTime": current_start,
                    "limit": min(self.MAX_CANDLES, limit - len(all_klines))
                }

                response = self.session.get(
                    f"{self.BASE_URL}/klines",
                    params=params,
                    timeout=10
                )
                response.raise_for_status()

                klines = response.json()
                if not klines:
                    break

                all_klines.extend(klines)
                # Resume one millisecond after the last candle's open time so
                # the next page does not repeat it.
                current_start = int(klines[-1][0]) + 1
                retry_count = 0
                time.sleep(0.1)

            except Exception as e:
                # Best-effort download: back off exponentially, then give up
                # and keep whatever has been collected so far.
                retry_count += 1
                if retry_count >= self.MAX_RETRIES:
                    print(f'Error: {e}')
                    break
                time.sleep(2 ** retry_count)

        # Convert the raw rows to a DataFrame.
        if not all_klines:
            return pd.DataFrame()

        df = pd.DataFrame(all_klines, columns=[
            'timestamp', 'open', 'high', 'low', 'close', 'volume',
            'close_time', 'quote_volume', 'trades', 'taker_buy_base', 'taker_buy_quote', 'ignore'
        ])

        df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms')
        df[self.OHLCV_COLUMNS] = df[self.OHLCV_COLUMNS].astype(float)

        # Keep exactly one row per candle open time, even if a repeated row
        # carries different values.
        df = (
            df[['timestamp'] + self.OHLCV_COLUMNS]
            .drop_duplicates(subset='timestamp', keep='last')
            .sort_values('timestamp')
            .reset_index(drop=True)
        )

        print(f'✓ Downloaded {len(df)} candles')
        return df

    @staticmethod
    def validate(df):
        """Return True when ``df`` is usable for training.

        A frame is rejected when it is ``None``, has fewer than 100 rows,
        lacks any OHLCV column, or contains a null in an OHLCV column.
        """
        required = BinanceDataCollector.OHLCV_COLUMNS
        if df is None or len(df) < 100:
            return False
        if any(col not in df.columns for col in required):
            return False
        return not bool(df[required].isnull().any().any())

print('✓ BinanceDataCollector 已加载')

In [None]:
# ============================================================================
# MODULE 2: Feature Engineer (内置代码)
# 可以直接在这里编辑！
# ============================================================================

class FeatureEngineer:
    """Add technical-indicator columns to a copy of an OHLCV DataFrame.

    The input must provide ``open``, ``high``, ``low``, ``close`` and
    ``volume`` columns; the caller's frame is never modified.
    """

    def __init__(self, df):
        # Work on a copy so the caller's frame stays untouched.
        self.df = df.copy()

    def calculate_all(self):
        """Compute every indicator and return the enriched frame.

        Adds, in this order: SMA/EMA (10, 20, 50, 100, 200), RSI (14, 21),
        MACD (line, signal, histogram), Bollinger bands (upper, lower,
        width), ATR(14), OBV, and percentage changes of close and volume.

        Returns:
            The stored frame with all indicator columns. Missing values
            (rolling warm-up rows) and infinite values are replaced by 0.
        """
        df = self.df
        close = df['close']

        # Moving averages
        for period in [10, 20, 50, 100, 200]:
            df[f'sma_{period}'] = close.rolling(period).mean()
            df[f'ema_{period}'] = close.ewm(span=period).mean()

        # RSI from rolling means of gains and losses
        delta = close.diff()
        for period in [14, 21]:
            gain = delta.where(delta > 0, 0).rolling(period).mean()
            loss = -delta.where(delta < 0, 0).rolling(period).mean()
            rs = gain / loss
            df[f'rsi_{period}'] = 100 - (100 / (1 + rs))

        # MACD
        ema12 = close.ewm(span=12).mean()
        ema26 = close.ewm(span=26).mean()
        df['macd'] = ema12 - ema26
        df['macd_signal'] = df['macd'].ewm(span=9).mean()
        df['macd_hist'] = df['macd'] - df['macd_signal']

        # Bollinger Bands (20-period mean +/- 2 standard deviations)
        sma20 = close.rolling(20).mean()
        std20 = close.rolling(20).std()
        df['bb_upper'] = sma20 + (std20 * 2)
        df['bb_lower'] = sma20 - (std20 * 2)
        df['bb_width'] = df['bb_upper'] - df['bb_lower']

        # ATR: rolling mean of the true range
        prev_close = close.shift()
        tr = pd.concat([
            df['high'] - df['low'],
            (df['high'] - prev_close).abs(),
            (df['low'] - prev_close).abs()
        ], axis=1).max(axis=1)
        df['atr_14'] = tr.rolling(14).mean()

        # OBV
        df['obv'] = (np.sign(close.diff()) * df['volume']).fillna(0).cumsum()

        # Percentage changes
        df['price_change'] = close.pct_change() * 100
        df['volume_change'] = df['volume'].pct_change() * 100

        # A zero denominator (e.g. a candle with zero volume) makes
        # pct_change return +/-inf, which fillna() does not touch. Map those
        # to NaN first so the result contains only finite numbers.
        numeric_cols = df.select_dtypes(include='number').columns
        df[numeric_cols] = df[numeric_cols].replace([np.inf, -np.inf], np.nan)

        self.df = df.fillna(0)
        return self.df

    def get_features(self):
        """Return the names of all columns except timestamp and raw OHLCV."""
        exclude = ['timestamp', 'open', 'high', 'low', 'close', 'volume']
        return [col for col in self.df.columns if col not in exclude]

print('✓ FeatureEngineer 已加载')

In [None]:
# ============================================================================
# MODULE 3: Preprocessing (内置代码)
# 可以直接在这里编辑！
# ============================================================================

from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

class DataPreprocessor:
    """Turn a feature DataFrame into scaled, windowed arrays for training."""

    def __init__(self, df, lookback=60):
        """Store a copy of ``df`` and the window length ``lookback``."""
        self.df = df.copy()
        self.lookback = lookback
        self.scaler = MinMaxScaler((0, 1))
        self.pca = None
        # Populated by prepare(); create_sequences() requires them.
        self.features = None
        self.feature_cols = None

    def prepare(self, feature_cols, n_components=30):
        """Select, optionally reduce, and scale the feature columns.

        Rows containing NaN are dropped. When more than ``n_components``
        columns are selected they are projected with PCA and renamed
        ``pc_0 .. pc_{n-1}``. The result is min-max scaled to [0, 1].

        NOTE(review): PCA and the scaler are fitted on every row, i.e.
        before any train/validation/test split happens.

        Args:
            feature_cols: Column names to use as model inputs.
            n_components: Maximum number of features kept.

        Returns:
            ``(feature_data, feature_cols)``: the scaled 2-D array and the
            names of its columns.
        """
        # Remove NaN rows
        self.df = self.df.dropna()

        # Feature selection
        feature_data = self.df[feature_cols].copy()

        if len(feature_cols) > n_components:
            self.pca = PCA(n_components=n_components)
            feature_data = self.pca.fit_transform(feature_data)
            feature_cols = [f'pc_{i}' for i in range(n_components)]
            feature_data = pd.DataFrame(feature_data, columns=feature_cols, index=self.df.index)

        # Normalisation
        feature_data = self.scaler.fit_transform(feature_data)

        self.features = feature_data
        self.feature_cols = feature_cols

        return feature_data, feature_cols

    def create_sequences(self):
        """Build sliding windows over the prepared features.

        Each sample holds ``lookback`` consecutive rows; its target is
        column 0 of the row that immediately follows the window.

        Returns:
            ``(X, y)`` with shapes ``(n, lookback, n_features)`` and
            ``(n, 1)``. When there are not enough rows for a single window,
            ``n`` is 0 but both arrays keep these shapes.

        Raises:
            RuntimeError: If ``prepare()`` has not been called yet.
        """
        if getattr(self, 'features', None) is None:
            raise RuntimeError('call prepare() before create_sequences()')

        data = np.asarray(self.features)
        lookback = self.lookback

        if len(data) <= lookback:
            # Keep the trailing dimensions so callers can still inspect
            # X.shape instead of receiving a shapeless empty array.
            empty_X = np.empty((0, lookback) + data.shape[1:], dtype=data.dtype)
            return empty_X, np.empty((0, 1), dtype=data.dtype)

        X = np.stack([data[i - lookback:i] for i in range(lookback, len(data))])
        y = data[lookback:, 0].reshape(-1, 1)
        return X, y

    def split_data(self, X, y, train_ratio=0.7, val_ratio=0.15):
        """Split samples chronologically into train / validation / test.

        Args:
            X: Input samples, ordered in time.
            y: Targets aligned with ``X``.
            train_ratio: Leading share of samples used for training.
            val_ratio: Share that follows and is used for validation; the
                remainder becomes the test set.

        Returns:
            A dict with keys ``X_train, y_train, X_val, y_val, X_test,
            y_test``.
        """
        n = len(X)
        train_idx = int(n * train_ratio)
        val_idx = int(n * (train_ratio + val_ratio))

        return {
            'X_train': X[:train_idx], 'y_train': y[:train_idx],
            'X_val': X[train_idx:val_idx], 'y_val': y[train_idx:val_idx],
            'X_test': X[val_idx:], 'y_test': y[val_idx:]
        }

print('✓ DataPreprocessor 已加载')

In [None]:
# ============================================================================
# MODULE 4: LSTM Model (内置代码)
# 可以直接在这里编辑！
# ============================================================================

import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

class LSTMModel(nn.Module):
    """Two stacked bidirectional LSTMs followed by a small dense head.

    Input is ``(batch, seq_len, input_size)`` (the LSTMs use
    ``batch_first=True``); output is ``(batch, 1)``.
    """

    def __init__(self, input_size=30, lstm_units=(96, 64), dense_units=32, dropout=0.2):
        """Build the layers.

        Args:
            input_size: Number of features per time step.
            lstm_units: Hidden sizes of the first and second LSTM.
            dense_units: Width of the hidden dense layer.
            dropout: Dropout probability applied between the two LSTMs.
        """
        super().__init__()

        first_units, second_units = lstm_units[0], lstm_units[1]

        # nn.LSTM's own ``dropout`` argument only acts between stacked layers
        # of one module; with the default num_layers=1 it does nothing and
        # triggers a UserWarning. Dropout between the two LSTMs is therefore
        # applied with an explicit module (it has no parameters, so the
        # state_dict keys are unchanged).
        self.lstm1 = nn.LSTM(input_size, first_units, batch_first=True, bidirectional=True)
        self.lstm2 = nn.LSTM(first_units * 2, second_units, batch_first=True, bidirectional=True)
        self.lstm_dropout = nn.Dropout(dropout)

        # Bidirectional output concatenates both directions.
        lstm_output = second_units * 2
        self.dense1 = nn.Linear(lstm_output, dense_units)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.1)
        self.dense2 = nn.Linear(dense_units, 1)

    def forward(self, x):
        """Return one prediction per sequence, read from the last time step."""
        lstm1_out, _ = self.lstm1(x)
        lstm1_out = self.lstm_dropout(lstm1_out)
        lstm2_out, _ = self.lstm2(lstm1_out)
        last_out = lstm2_out[:, -1, :]

        dense_out = self.dense1(last_out)
        dense_out = self.relu(dense_out)
        dense_out = self.dropout(dense_out)
        output = self.dense2(dense_out)

        return output

    def count_params(self):
        """Return the number of trainable parameters."""
        return sum(p.numel() for p in self.parameters() if p.requires_grad)

class Trainer:
    """Fit a model with Adam + MSE loss and early stopping on validation loss.

    Attributes set by ``__init__``: ``model`` (moved to ``device``),
    ``device``, and the per-epoch histories ``train_losses`` /
    ``val_losses``.
    """

    def __init__(self, model, device='cuda'):
        self.model = model.to(device)
        self.device = device
        self.train_losses = []
        self.val_losses = []

    def _train_epoch(self, loader, optimizer, criterion):
        """Run one optimisation pass and return the mean batch loss."""
        self.model.train()
        total = 0.0
        for X_batch, y_batch in loader:
            optimizer.zero_grad()
            output = self.model(X_batch)
            loss = criterion(output, y_batch)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
            optimizer.step()
            total += loss.item()
        return total / len(loader)

    def _eval_epoch(self, loader, criterion):
        """Return the mean batch loss over ``loader`` without gradients."""
        self.model.eval()
        total = 0.0
        with torch.no_grad():
            for X_batch, y_batch in loader:
                total += criterion(self.model(X_batch), y_batch).item()
        return total / len(loader)

    def train(self, X_train, y_train, X_val, y_val, epochs=50, batch_size=32, lr=0.001,
              patience=15, min_delta=0.0001):
        """Train the model and leave it holding the best validation weights.

        Args:
            X_train, y_train: Training inputs and targets (array-like).
            X_val, y_val: Validation inputs and targets (array-like).
            epochs: Maximum number of epochs.
            batch_size: Mini-batch size for both loaders.
            lr: Adam learning rate.
            patience: Epochs without sufficient improvement before stopping.
            min_delta: Minimum decrease of the validation loss that counts
                as an improvement.

        Returns:
            ``{'best_val_loss': float, 'epochs': int}`` where ``epochs`` is
            the number of epochs actually run.

        Raises:
            ValueError: If the training or validation set is empty.
        """
        X_train_t = torch.as_tensor(X_train, dtype=torch.float32).to(self.device)
        y_train_t = torch.as_tensor(y_train, dtype=torch.float32).to(self.device)
        X_val_t = torch.as_tensor(X_val, dtype=torch.float32).to(self.device)
        y_val_t = torch.as_tensor(y_val, dtype=torch.float32).to(self.device)

        # An empty loader would otherwise surface as a ZeroDivisionError when
        # the mean loss is computed.
        if len(X_train_t) == 0 or len(X_val_t) == 0:
            raise ValueError('training and validation sets must both be non-empty')

        # shuffle=False keeps the samples in their original order.
        train_loader = DataLoader(
            TensorDataset(X_train_t, y_train_t), batch_size=batch_size, shuffle=False
        )
        val_loader = DataLoader(
            TensorDataset(X_val_t, y_val_t), batch_size=batch_size, shuffle=False
        )

        optimizer = optim.Adam(self.model.parameters(), lr=lr)
        criterion = nn.MSELoss()

        best_val_loss = float('inf')
        best_weights = None
        patience_counter = 0
        epochs_run = 0

        for epoch in range(epochs):
            epochs_run = epoch + 1

            train_loss = self._train_epoch(train_loader, optimizer, criterion)
            val_loss = self._eval_epoch(val_loader, criterion)

            self.train_losses.append(train_loss)
            self.val_losses.append(val_loss)

            if (epoch + 1) % 10 == 0:
                print(f'Epoch {epoch+1}/{epochs}: Train={train_loss:.6f}, Val={val_loss:.6f}')

            if val_loss < best_val_loss - min_delta:
                best_val_loss = val_loss
                patience_counter = 0
                # state_dict() values alias the live parameters, so a shallow
                # dict copy would keep changing as training continues; clone
                # every tensor to take a real snapshot.
                best_weights = {
                    name: tensor.detach().clone()
                    for name, tensor in self.model.state_dict().items()
                }
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    print(f'Early stopping at epoch {epoch+1}')
                    break

        # Restore the best snapshot whether training stopped early or ran
        # through every epoch.
        if best_weights is not None:
            self.model.load_state_dict(best_weights)

        return {'best_val_loss': float(best_val_loss), 'epochs': epochs_run}

print('✓ LSTMModel 和 Trainer 已加载')

In [None]:
# ============================================================================
# TRAINING EXECUTION
# ============================================================================

import json

# Run configuration shared by the download and training cells.
CONFIG = dict(
    coins=['BTCUSDT', 'ETHUSDT', 'SOLUSDT'],  # edit the list of symbols here
    timeframes=['15m', '1h'],
    epochs=50,
    batch_size=32,
    learning_rate=0.001,
    lookback=60,
    n_features=30,
)

config_dump = json.dumps(CONFIG, indent=2)
print(f'Config: {config_dump}')

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f'Device: {device}')

In [None]:
# ============================================================================
# STEP 1: Download Data
# ============================================================================

# Download every configured symbol/timeframe pair and keep the frames that
# pass validation, grouped as all_data[symbol][timeframe].
collector = BinanceDataCollector()
all_data = {}

print('\n=== DATA DOWNLOAD ===')
for coin in CONFIG['coins']:
    frames = {}
    for timeframe in CONFIG['timeframes']:
        try:
            candles = collector.get_klines(coin, timeframe, limit=3000)
            passed = BinanceDataCollector.validate(candles)
        except Exception as e:
            print(f'  ✗ {coin} {timeframe}: {e}')
            continue

        if not passed:
            print(f'  ✗ {coin} {timeframe} validation failed')
            continue

        frames[timeframe] = candles

    # Symbols without a single usable timeframe are left out entirely.
    if not frames:
        continue

    all_data[coin] = frames
    print(f'✓ {coin}: {len(frames)} timeframes')

dataset_total = sum(len(per_coin) for per_coin in all_data.values())
print(f'\nTotal: {dataset_total} datasets')

In [None]:
# ============================================================================
# STEP 2: Train Models
# ============================================================================

# Train one model per downloaded symbol/timeframe pair and collect the
# training summaries in results['<symbol>_<timeframe>'].
results = {}

print('\n=== TRAINING ===')

for coin, frames in all_data.items():
    for timeframe, df in frames.items():
        print(f'\n--- {coin} {timeframe} ---')

        try:
            # Feature engineering
            fe = FeatureEngineer(df)
            df_features = fe.calculate_all()
            feature_cols = fe.get_features()

            # Preprocessing: scale, window, then split chronologically
            prep = DataPreprocessor(df_features, lookback=CONFIG['lookback'])
            features, feature_cols = prep.prepare(feature_cols, CONFIG['n_features'])
            X, y = prep.create_sequences()
            data = prep.split_data(X, y)

            # Model sized to the number of prepared feature columns
            model = LSTMModel(input_size=features.shape[-1])
            print(f'Model params: {model.count_params():,}')

            # Training
            trainer = Trainer(model, device=device)
            train_kwargs = dict(
                epochs=CONFIG['epochs'],
                batch_size=CONFIG['batch_size'],
                lr=CONFIG['learning_rate'],
            )
            history = trainer.train(
                data['X_train'], data['y_train'],
                data['X_val'], data['y_val'],
                **train_kwargs
            )

            run_key = f'{coin}_{timeframe}'
            results[run_key] = history
            print(f'✓ Best Val Loss: {history["best_val_loss"]:.6f}')

        except Exception as e:
            # A failure for one pair is reported and the loop moves on.
            print(f'✗ Error: {e}')

print('\n=== SUMMARY ===')
print(f'Trained: {len(results)} models')
for run_name, summary in results.items():
    print(f'  {run_name}: loss={summary["best_val_loss"]:.6f}')

## 编辑代码指南

### 如何在 Colab 中修改代码

无需重新设置环境，直接编辑和运行：

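#### 1. 修改配置 (Cell 6)
只修改需要调整的值，其余键必须保留（下载和训练 Cell 会读取全部键，缺少任何一个都会报 `KeyError`）：
```python
CONFIG = {
    'coins': ['BTCUSDT', 'ETHUSDT'],  # 改这里
    'timeframes': ['15m', '1h'],
    'epochs': 100,  # 改这里
    'batch_size': 16,  # 改这里
    'learning_rate': 0.001,
    'lookback': 60,
    'n_features': 30
}
```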

#### 2. 修改模块代码
- Cell 2: 编辑 `BinanceDataCollector`
- Cell 3: 编辑 `FeatureEngineer` 添加更多指标
- Cell 4: 编辑 `DataPreprocessor` 调整预处理
- Cell 5: 编辑 `LSTMModel` / `Trainer` 改变模型架构或训练逻辑

#### 3. 修改后运行
1. 修改对应 Cell
2. **Ctrl+Enter** 运行该 Cell
3. 如果修改了 CONFIG，运行 Cell 6-8
4. 无需重新运行 Cell 1 (仅需一次)

### 环境在当前运行时会话内保留

- 首次运行 Cell 1 安装依赖
- 后续只需修改和运行相关 Cell
- GPU 会话最长约保留 12 小时（空闲过久会提前断开，具体时长取决于 Colab 套餐）
- 只要运行时未断开或重启，已加载的模块会一直存在于内存中；运行时重置后需从 Cell 1 重新运行

### 快速迭代工作流

```
第一次运行:           后续迭代:
1. Cell 1 (setup)     1. 修改对应 Cell
2. Cell 2-5 (load)    2. Ctrl+Enter 运行
3. Cell 6-8 (train)   3. 查看结果
                      4. 重复
```