# Cryptocurrency 24-Hour Return Prediction

LightGBM pipeline for predicting 24-hour forward returns across 355 cryptocurrencies, built for the **Avenir HKU Web3 Quant Competition**.

**Methodology**: Feature engineering (RSI, MACD, Bollinger Bands, ATR, OBV, rolling stats, lagged returns) followed by LightGBM with TimeSeriesSplit cross-validation (5 folds, train on past, validate on future).

## Setup and Configuration

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import matplotlib.pyplot as plt
import os
from tqdm import tqdm
import warnings
import gc
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error

# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# --- Configuration ---
# Directory that load_all_crypto_data scans for per-symbol `.parquet` files.
TRAIN_DATA_PATH = "/kaggle/input/avenir-hku-web/kline_data/train_data"
# CSV whose `id` column the predictions are merged against for the submission.
SUBMISSION_ID_PATH = "/kaggle/input/avenir-hku-web/submission_id.csv"
# Destination of the generated submission CSV.
OUTPUT_PATH = "/kaggle/working/submission.csv"
# Number of TimeSeriesSplit folds; one LightGBM model is trained per fold.
N_SPLITS = 5

## Data Loading

In [None]:
def load_all_crypto_data(data_path):
    """Load every ``.parquet`` file in a directory into one long DataFrame.

    Each file is read with ``pd.read_parquet`` and tagged with a ``symbol``
    column taken from the file name without its ``.parquet`` extension.
    Files that cannot be read are reported on stdout and skipped (best
    effort), so one corrupt file does not abort the whole load.

    Args:
        data_path: Directory containing one parquet file per symbol.

    Returns:
        A DataFrame with all readable files concatenated, ``timestamp``
        converted from epoch milliseconds to datetime, sorted by
        ``symbol`` then ``timestamp`` with a fresh integer index. An empty
        DataFrame is returned when no file could be loaded.
    """
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # progress bar and any error messages reproducible between runs.
    all_files = sorted(f for f in os.listdir(data_path) if f.endswith('.parquet'))
    df_list = []
    print(f"Loading {len(all_files)} files...")
    for file in tqdm(all_files):
        # splitext strips only the trailing extension, so a symbol that
        # itself contains a dot is kept whole (split('.')[0] truncated it).
        symbol = os.path.splitext(file)[0]
        file_path = os.path.join(data_path, file)
        try:
            df = pd.read_parquet(file_path)
        except Exception as e:
            # Deliberately broad: any unreadable file is logged and skipped.
            print(f"Could not read file {file}: {e}")
            continue
        df['symbol'] = symbol
        df_list.append(df)
    if not df_list:
        return pd.DataFrame()
    full_df = pd.concat(df_list, ignore_index=True)
    # Raw timestamps are treated as epoch milliseconds.
    full_df['timestamp'] = pd.to_datetime(full_df['timestamp'], unit='ms')
    full_df = full_df.sort_values(by=['symbol', 'timestamp']).reset_index(drop=True)
    return full_df

## Feature Engineering

In [None]:
def feature_engineering_for_symbol(df):
    """Return a timestamp-sorted copy of one symbol's bars with features added.

    Added columns, in order: ``rsi_14``, ``macd``, ``macd_signal``,
    ``roc_12``, ``upper_band``, ``lower_band``, ``atr_14``, ``bb_width``,
    ``obv``, rolling mean/std of close and rolling mean of volume for
    windows 10/30/60, the one-bar ``return`` and its lags 1 through 5.
    The input frame is not modified.
    """
    data = df.sort_values('timestamp').copy()
    close = data['close']

    # Momentum indicators.
    data['rsi_14'] = calculate_rsi(data, 14)
    macd_line, macd_signal = calculate_macd(data)
    data['macd'] = macd_line
    data['macd_signal'] = macd_signal
    data['roc_12'] = (close.diff(12) / close.shift(12)) * 100

    # Volatility indicators; band width is normalised by the 20-bar mean.
    upper, lower = calculate_bollinger_bands(data, 20)
    data['upper_band'] = upper
    data['lower_band'] = lower
    data['atr_14'] = calculate_atr(data, 14)
    data['bb_width'] = (upper - lower) / close.rolling(20).mean()

    # Volume flow.
    data['obv'] = calculate_obv(data)

    # Rolling statistics of price and volume.
    for window in (10, 30, 60):
        close_window = close.rolling(window=window)
        data[f'rolling_mean_{window}'] = close_window.mean()
        data[f'rolling_std_{window}'] = close_window.std()
        data[f'rolling_vol_mean_{window}'] = data['volume'].rolling(window=window).mean()

    # One-bar return and its recent history.
    data['return'] = close.pct_change()
    for lag in range(1, 6):
        data[f'return_lag_{lag}'] = data['return'].shift(lag)
    return data

def calculate_rsi(data, period=14):
    """Relative Strength Index of ``data['close']``.

    Gains and losses are smoothed with an exponentially weighted mean using
    ``alpha = 1 / period`` and ``adjust=False`` (Wilder smoothing). Rows
    where both smoothed gain and smoothed loss are zero yield NaN.
    """
    change = data['close'].diff(1)
    # Anything that is not a strict gain (including the leading NaN) counts
    # as 0 gain; likewise for losses, which are then flipped to be positive.
    ups = change.mask(~(change > 0), 0)
    downs = -change.mask(~(change < 0), 0)

    smoothing = {'alpha': 1 / period, 'adjust': False}
    mean_up = ups.ewm(**smoothing).mean()
    mean_down = downs.ewm(**smoothing).mean()

    relative_strength = mean_up / mean_down
    return 100 - (100 / (1 + relative_strength))

def calculate_macd(data, fast_period=12, slow_period=26, signal_period=9):
    """Return ``(macd_line, signal_line)`` computed from ``data['close']``.

    The MACD line is the fast EMA minus the slow EMA of the close; the
    signal line is an EMA of the MACD line. All EMAs use ``adjust=False``.
    """
    def _ema(series, span):
        return series.ewm(span=span, adjust=False).mean()

    close = data['close']
    macd_line = _ema(close, fast_period) - _ema(close, slow_period)
    return macd_line, _ema(macd_line, signal_period)

def calculate_bollinger_bands(data, window=20, std_dev=2):
    """Return ``(upper_band, lower_band)`` around the rolling mean of close.

    The bands sit ``std_dev`` rolling standard deviations above and below
    the ``window``-bar simple moving average; the first ``window - 1`` rows
    are NaN.
    """
    close_window = data['close'].rolling(window=window)
    middle = close_window.mean()
    offset = close_window.std() * std_dev
    return middle + offset, middle - offset

def calculate_atr(data, window=14):
    """Average True Range from the ``high``, ``low`` and ``close`` columns.

    The true range is the largest of high-low, |high - previous close| and
    |low - previous close|. It is NaN whenever any component is NaN (so the
    first row is NaN), then smoothed with an EMA of span ``window``.
    """
    prior_close = data['close'].shift(1)
    bar_range = data['high'] - data['low']
    gap_from_high = (data['high'] - prior_close).abs()
    gap_from_low = (data['low'] - prior_close).abs()

    candidates = pd.concat([bar_range, gap_from_high, gap_from_low], axis=1)
    true_range = candidates.max(axis=1, skipna=False)
    return true_range.ewm(span=window, adjust=False).mean()

def calculate_obv(data):
    """On-Balance Volume: running sum of volume signed by the close change.

    Volume is added when the close rises, subtracted when it falls and
    ignored when it is unchanged. Rows where the signed volume is NaN
    (including the first row, which has no previous close) contribute 0.

    Returns:
        A float64 Series aligned with ``data``.
    """
    direction = np.sign(data['close'].diff())
    # Accumulate in float64: with float32 inputs the running total loses
    # whole increments once it exceeds 2**24, freezing the indicator.
    signed_volume = direction * data['volume'].astype('float64')
    return signed_volume.fillna(0).cumsum()

## Training with TimeSeriesSplit Cross-Validation

Train a LightGBM ensemble using 5-fold time-series cross-validation. Each fold trains on past data and validates on the subsequent temporal block, preventing data leakage. The final model averages predictions across all 5 fold models.

In [None]:
print("STAGE 1: TRAINING")

# Defined before the data check so later cells can rely on these names
# (e.g. `if fold_models:`) even when no data could be loaded.
fold_models = []
fold_scores = []
last_fold_val = {}  # Last fold's validation data, reused by the analysis cells
future_periods = 96  # Target horizon in bars (24 hours, assuming 15-minute bars)
rename_dict = {'open_price': 'open', 'high_price': 'high', 'low_price': 'low', 'close_price': 'close'}
numeric_cols = ['open', 'high', 'low', 'close', 'volume', 'amount', 'count', 'buy_volume', 'buy_amount']

df_full = load_all_crypto_data(TRAIN_DATA_PATH)
if not df_full.empty:
    df_full.rename(columns=rename_dict, inplace=True)
    for col in numeric_cols:
        if col in df_full.columns:
            df_full[col] = pd.to_numeric(df_full[col], errors='coerce').astype('float32')

    # Build features one symbol at a time. An explicit loop is used instead
    # of groupby.apply, which is deprecated for functions that need the
    # grouping column ('symbol') inside each group.
    df_features = pd.concat(
        [feature_engineering_for_symbol(group) for _, group in df_full.groupby('symbol', sort=True)],
        ignore_index=True,
    )
    del df_full; gc.collect()

    # Target: close `future_periods` bars ahead relative to the current close.
    df_features['target'] = df_features.groupby('symbol')['close'].shift(-future_periods) / df_features['close'] - 1
    df_features.replace([np.inf, -np.inf], np.nan, inplace=True)
    df_train = df_features.dropna(subset=['target'])
    features = [col for col in df_train.columns if col not in ['timestamp', 'symbol', 'target', 'open', 'high', 'low', 'close', 'volume', 'amount']]

    # Sort by timestamp to ensure temporal ordering for the time-series split
    df_train = df_train.sort_values('timestamp').reset_index(drop=True)
    X = df_train[features]
    y = df_train['target']

    # Split on unique timestamps rather than raw rows so that all symbols at
    # a given timestamp land in the same fold, and leave a gap of
    # `future_periods` timestamps between train and validation: each label
    # looks that far ahead, so without the gap the last training labels
    # would be computed from prices inside the validation block.
    unique_ts = np.sort(df_train['timestamp'].unique())
    row_ts = df_train['timestamp'].values
    tscv = TimeSeriesSplit(n_splits=N_SPLITS, gap=future_periods)

    print(f"Training with {N_SPLITS}-fold TimeSeriesSplit...")
    for fold, (train_ts_idx, val_ts_idx) in enumerate(tscv.split(unique_ts)):
        # Rows are sorted by timestamp, so each fold is a contiguous row range.
        train_end = np.searchsorted(row_ts, unique_ts[train_ts_idx[-1]], side='right')
        val_start = np.searchsorted(row_ts, unique_ts[val_ts_idx[0]], side='left')
        val_end = np.searchsorted(row_ts, unique_ts[val_ts_idx[-1]], side='right')
        train_idx = np.arange(train_end)
        val_idx = np.arange(val_start, val_end)

        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        lgbm = lgb.LGBMRegressor(
            objective='regression_l1', n_estimators=1000, learning_rate=0.05,
            num_leaves=31, max_depth=8, subsample=0.8, colsample_bytree=0.8,
            random_state=42, n_jobs=-1
        )
        # Early stopping monitors L1 error on this fold's validation block.
        lgbm.fit(X_train, y_train,
                 eval_set=[(X_val, y_val)], eval_metric='l1',
                 callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False)])

        val_pred = lgbm.predict(X_val)
        mae = mean_absolute_error(y_val, val_pred)
        fold_scores.append(mae)
        fold_models.append(lgbm)
        print(f"  Fold {fold + 1}/{N_SPLITS} — Validation MAE: {mae:.6f}")

        # Overwritten every fold; after the loop it holds the last fold's
        # validation data for the baseline comparison and L/S simulation.
        last_fold_val = {
            'X_val': X_val, 'y_val': y_val, 'y_pred': val_pred,
            'timestamps': df_train.iloc[val_idx]['timestamp'].values,
            'symbols': df_train.iloc[val_idx]['symbol'].values,
        }

    mean_mae = np.mean(fold_scores)
    std_mae = np.std(fold_scores)
    print(f"\nCV Results: MAE = {mean_mae:.6f} +/- {std_mae:.6f}")
    print("Model training completed successfully!")
    del X_train, y_train, X_val, y_val; gc.collect()
else:
    print("Execution stopped because no data was loaded.")

## SHAP Feature Importance

Use TreeExplainer to identify which features drive the model's predictions. This provides interpretable, per-feature contribution analysis beyond simple split-based importance.

In [None]:
import shap

# Explain the last fold's model on a reproducible subsample (at most 5000
# rows) of that fold's validation features.
model = fold_models[-1]
val_features = last_fold_val['X_val']
sample_size = min(5000, len(val_features))
X_test_sample = val_features.sample(n=sample_size, random_state=42)

explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test_sample)

# Summary plot of the 15 most influential features, saved and then shown.
shap.summary_plot(shap_values, X_test_sample, max_display=15, show=False)
plt.tight_layout()
plt.savefig("shap_importance.png", dpi=150, bbox_inches="tight")
plt.show()

# Rank features by mean absolute SHAP value across the sampled rows.
mean_abs_shap = np.abs(shap_values).mean(axis=0)
shap_importance = pd.DataFrame(
    {'feature': X_test_sample.columns, 'mean_abs_shap': mean_abs_shap}
).sort_values('mean_abs_shap', ascending=False)
print("Top 10 features by mean |SHAP|:")
print(shap_importance.head(10).to_string(index=False))

## Naive Baseline Comparison

Compare the model's MAE against two simple baselines to quantify how much value the model adds:
- **Zero baseline**: Always predict 0 return (MAE = mean of |actual returns|)
- **Persistence baseline**: Predict the most recent lagged single-bar return (the `return_lag_1` feature), with missing values treated as 0

In [None]:
# Ground truth and model output for the last fold's validation block.
y_actual = last_fold_val['y_val']
y_pred = last_fold_val['y_pred']


def _pct_improvement(baseline_mae, candidate_mae):
    # Relative MAE reduction of the candidate versus the baseline, in percent.
    return (baseline_mae - candidate_mae) / baseline_mae * 100


# MAE of the model on the last fold.
model_mae = mean_absolute_error(y_actual, y_pred)

# Zero baseline: predicting 0 everywhere makes the MAE the mean |actual|.
zero_mae = np.abs(y_actual).mean()

# Persistence baseline: the return_lag_1 feature (NaN treated as 0) is used
# directly as the prediction.
persistence_pred = last_fold_val['X_val']['return_lag_1'].fillna(0).to_numpy()
persistence_mae = mean_absolute_error(y_actual, persistence_pred)

zero_improvement = _pct_improvement(zero_mae, model_mae)
persistence_improvement = _pct_improvement(persistence_mae, model_mae)

rule = "=" * 55
print(rule)
print("BASELINE COMPARISON (Last Fold Validation Set)")
print(rule)
print(f"  Model MAE:        {model_mae:.6f}")
print(f"  Zero Baseline:    {zero_mae:.6f}  (predict 0)")
print(f"  Persistence:      {persistence_mae:.6f}  (predict last return)")
print("-" * 55)
print(f"  vs Zero:          {zero_improvement:+.2f}% improvement")
print(f"  vs Persistence:   {persistence_improvement:+.2f}% improvement")
print(rule)

## Long/Short Strategy Simulation

Test whether the model's predictions have **economic value** beyond statistical accuracy. For each time step in the validation set:
- Rank all 355 assets by predicted return
- Go **long** the top decile, **short** the bottom decile (equal weight)
- Compute portfolio returns, cumulative performance, Sharpe ratio, and max drawdown

In [None]:
# Build a DataFrame with predictions and actuals per timestamp/symbol
ls_df = pd.DataFrame({
    'timestamp': last_fold_val['timestamps'],
    'symbol': last_fold_val['symbols'],
    'y_actual': last_fold_val['y_val'].values,
    'y_pred': last_fold_val['y_pred'],
})

# For each timestamp, rank predictions and compute the long/short spread:
# mean actual return of the top decile minus that of the bottom decile.
ls_returns = []
for ts, group in ls_df.groupby('timestamp'):
    if len(group) < 10:
        continue  # Too few assets to form deciles
    n_decile = max(1, len(group) // 10)
    ranked = group.sort_values('y_pred')
    short_leg = ranked.head(n_decile)['y_actual'].mean()
    long_leg = ranked.tail(n_decile)['y_actual'].mean()
    ls_return = long_leg - short_leg  # L/S spread
    ls_returns.append({'timestamp': ts, 'ls_return': ls_return})

# Explicit columns keep the frame well-formed even when no timestamp qualified.
ls_all = pd.DataFrame(ls_returns, columns=['timestamp', 'ls_return'])
ls_all = ls_all.sort_values('timestamp').reset_index(drop=True)

if ls_all.empty:
    print("No timestamp had at least 10 assets; skipping long/short simulation.")
else:
    # Every ls_return is a 24-hour holding-period return (the target looks
    # 96 bars ahead). Compounding one per bar would count each day's return
    # 96 times, so only non-overlapping rebalances, spaced exactly 24 hours
    # apart starting from the first timestamp, enter the performance stats.
    holding_seconds = 24 * 60 * 60
    elapsed_seconds = (ls_all['timestamp'] - ls_all['timestamp'].iloc[0]).dt.total_seconds()
    ls_series = ls_all[elapsed_seconds % holding_seconds == 0].reset_index(drop=True).copy()
    ls_series['cumulative'] = (1 + ls_series['ls_return']).cumprod()

    # Performance metrics
    total_return = ls_series['cumulative'].iloc[-1] - 1
    n_periods = len(ls_series)
    # Annualize with one 24-hour holding period per day
    periods_per_year = 365
    annualized_return = (1 + total_return) ** (periods_per_year / max(n_periods, 1)) - 1
    annualized_vol = ls_series['ls_return'].std() * np.sqrt(periods_per_year)
    # std() is NaN with a single period; the comparison is then False -> 0
    sharpe = annualized_return / annualized_vol if annualized_vol > 0 else 0
    running_max = ls_series['cumulative'].cummax()
    drawdown = (ls_series['cumulative'] - running_max) / running_max
    max_drawdown = drawdown.min()

    print("=" * 55)
    print("LONG/SHORT STRATEGY (Top vs Bottom Decile)")
    print("=" * 55)
    print(f"  Periods:             {n_periods}")
    print(f"  Cumulative Return:   {total_return:.4%}")
    print(f"  Annualized Sharpe:   {sharpe:.2f}")
    print(f"  Max Drawdown:        {max_drawdown:.4%}")
    print("=" * 55)

    # Plot cumulative return
    fig, ax = plt.subplots(figsize=(12, 5))
    ax.plot(ls_series['timestamp'], ls_series['cumulative'], linewidth=1.2)
    ax.set_title('Long/Short Portfolio: Cumulative Return (Top vs Bottom Decile)')
    ax.set_xlabel('Time')
    ax.set_ylabel('Cumulative Return (1 = start)')
    ax.axhline(y=1, color='gray', linestyle='--', alpha=0.5)
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig("ls_cumulative_return.png", dpi=150, bbox_inches="tight")
    plt.show()

# Free the training frames before the prediction stage reloads the data.
del X, y, df_train, df_features; gc.collect()

## Prediction and Submission

Generate predictions by averaging across all 5 fold models. Process symbols one at a time to stay within Kaggle memory limits.

In [None]:
if fold_models:
    print("STAGE 2: PREDICTION (Memory-Safe Symbol by Symbol)")

    print("Step 1 (Pred): Loading all data for prediction...")
    df_pred_base = load_all_crypto_data(TRAIN_DATA_PATH)
    # Same column renaming and float32 casting as in the training stage.
    df_pred_base.rename(columns=rename_dict, inplace=True)
    for col in numeric_cols:
        if col in df_pred_base.columns:
            df_pred_base[col] = pd.to_numeric(df_pred_base[col], errors='coerce').astype('float32')

    all_predictions = []
    symbols = df_pred_base['symbol'].unique()

    print(f"Step 2 (Pred): Processing {len(symbols)} symbols one by one...")
    # groupby hands over each symbol's rows in one pass, instead of
    # re-scanning the whole frame with a boolean mask for every symbol.
    for symbol, df_symbol in tqdm(df_pred_base.groupby('symbol', sort=False), total=len(symbols)):
        df_symbol_features = feature_engineering_for_symbol(df_symbol)
        # Mirror the training preprocessing: infinities become NaN and NaN
        # is left for LightGBM's native missing-value handling. Filling with
        # 0 here would feed the models values they never saw for those rows.
        X_pred = df_symbol_features[features].replace([np.inf, -np.inf], np.nan)
        # Average predictions across all fold models
        preds = np.mean([model.predict(X_pred) for model in fold_models], axis=0)
        result_df = pd.DataFrame({
            'timestamp': df_symbol_features['timestamp'],
            'symbol': df_symbol_features['symbol'],
            'predict_return': preds
        })
        all_predictions.append(result_df)

    print("Optimizing memory before final concatenation...")
    del df_pred_base
    gc.collect()

    print("Step 3 (Pred): Concatenating all predictions...")
    final_pred_df = pd.concat(all_predictions, ignore_index=True)
    del all_predictions
    gc.collect()

    print("Step 4 (Pred): Formatting submission file...")
    # Submission ids have the form "<YYYY-mm-dd HH:MM:SS>_<symbol>".
    final_pred_df['timestamp'] = final_pred_df['timestamp'].dt.strftime('%Y-%m-%d %H:%M:%S')
    final_pred_df['id'] = final_pred_df['timestamp'] + "_" + final_pred_df['symbol']

    df_submission_id = pd.read_csv(SUBMISSION_ID_PATH)
    final_submission = pd.merge(df_submission_id, final_pred_df[['id', 'predict_return']], on='id', how='left')
    # Ids without a prediction default to 0. Assign the result back: an
    # inplace fillna on a selected column is not guaranteed to update the
    # parent frame.
    final_submission['predict_return'] = final_submission['predict_return'].fillna(0)

    final_submission.to_csv(OUTPUT_PATH, index=False)

    print(f"Submission file successfully generated at: {OUTPUT_PATH}")
    print("File preview:")
    print(final_submission.head())
    print(f"File shape: {final_submission.shape}")