# In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import os
from tqdm import tqdm
import warnings
import gc
from sklearn.model_selection import train_test_split

# Suppress every warning for the rest of the session. Note that this also
# hides pandas deprecation and chained-assignment warnings.
warnings.filterwarnings('ignore')

# --- Configuration ---
# Directory scanned for the per-symbol ``.parquet`` k-line files.
TRAIN_DATA_PATH = "/kaggle/input/avenir-hku-web/kline_data/train_data"
# CSV whose ``id`` column the predictions are merged against.
SUBMISSION_ID_PATH = "/kaggle/input/avenir-hku-web/submission_id.csv"
# Destination of the generated submission CSV.
OUTPUT_PATH = "/kaggle/working/submission.csv"

# --- Data loading and feature engineering functions ---
def load_all_crypto_data(data_path):
    """Read every ``.parquet`` file in *data_path* into one long DataFrame.

    Each file contributes its rows tagged with a ``symbol`` column taken from
    the part of the file name before the first dot. Files that cannot be read
    are reported on stdout and skipped.

    Args:
        data_path: Directory containing the parquet files.

    Returns:
        A DataFrame sorted by ``symbol`` then ``timestamp`` with a fresh
        integer index, where ``timestamp`` has been converted from epoch
        milliseconds to datetimes. An empty DataFrame is returned when no
        file could be loaded.
    """
    parquet_names = [name for name in os.listdir(data_path) if name.endswith('.parquet')]
    print(f"Loading {len(parquet_names)} files...")

    frames = []
    for name in tqdm(parquet_names):
        try:
            frame = pd.read_parquet(os.path.join(data_path, name))
            frame['symbol'] = name.split('.')[0]
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole load.
            print(f"Could not read file {name}: {e}")
        else:
            frames.append(frame)

    if not frames:
        return pd.DataFrame()

    combined = pd.concat(frames, ignore_index=True)
    combined['timestamp'] = pd.to_datetime(combined['timestamp'], unit='ms')
    return combined.sort_values(by=['symbol', 'timestamp']).reset_index(drop=True)

def feature_engineering_for_symbol(df):
    """Return a timestamp-sorted copy of *df* with indicator columns appended.

    The input is expected to hold the rows of a single symbol and to provide
    ``timestamp``, ``close``, ``high``, ``low`` and ``volume`` columns.

    Columns are appended in this order (the order is relied upon by callers
    that derive a feature list from the frame): ``rsi_14``, ``macd``,
    ``macd_signal``, ``roc_12``, ``upper_band``, ``lower_band``, ``atr_14``,
    ``bb_width``, ``obv``, then ``rolling_mean_w`` / ``rolling_std_w`` /
    ``rolling_vol_mean_w`` for w in 10, 30, 60, then ``return`` and
    ``return_lag_1`` .. ``return_lag_5``.

    Args:
        df: Price/volume rows for one symbol; not modified.

    Returns:
        The full enriched DataFrame (all original columns are kept).
    """
    data = df.copy().sort_values('timestamp')
    close = data['close']

    # Momentum indicators.
    data['rsi_14'] = calculate_rsi(data, 14)
    macd_line, macd_signal = calculate_macd(data)
    data['macd'] = macd_line
    data['macd_signal'] = macd_signal
    data['roc_12'] = (close.diff(12) / close.shift(12)) * 100

    # Volatility indicators.
    upper_band, lower_band = calculate_bollinger_bands(data, 20)
    data['upper_band'] = upper_band
    data['lower_band'] = lower_band
    data['atr_14'] = calculate_atr(data, 14)
    band_gap = data['upper_band'] - data['lower_band']
    data['bb_width'] = band_gap / close.rolling(20).mean()

    # Volume indicator.
    data['obv'] = calculate_obv(data)

    # Rolling statistics of price and volume.
    for w in (10, 30, 60):
        data[f'rolling_mean_{w}'] = close.rolling(window=w).mean()
        data[f'rolling_std_{w}'] = close.rolling(window=w).std()
        data[f'rolling_vol_mean_{w}'] = data['volume'].rolling(window=w).mean()

    # One-step return and its first five lags.
    data['return'] = close.pct_change()
    for lag in range(1, 6):
        data[f'return_lag_{lag}'] = data['return'].shift(lag)

    return data

def calculate_rsi(data, window=14):
    """Return a simple-rolling-mean RSI of ``data['close']``.

    Gains are the positive one-step changes of ``close`` and losses the
    magnitudes of the negative ones; every other change, including the
    undefined first one, counts as zero. Both are averaged with a rolling
    mean over *window* rows.

    Args:
        data: DataFrame with a ``close`` column.
        window: Number of rows in each rolling average.

    Returns:
        Series equal to ``100 - 100 / (1 + avg_gain / avg_loss)``. The first
        ``window - 1`` rows are NaN, as is any row where both averages are
        zero (0 / 0).
    """
    change = data['close'].diff(1)
    upward = change.where(change > 0, 0)
    downward = -change.where(change < 0, 0)

    avg_gain = upward.rolling(window=window).mean()
    avg_loss = downward.rolling(window=window).mean()

    relative_strength = avg_gain / avg_loss
    return 100 - (100 / (1 + relative_strength))

def calculate_macd(data, fast_period=12, slow_period=26, signal_period=9):
    """Return the MACD line and its signal line for ``data['close']``.

    All averages are exponential moving averages computed with
    ``ewm(span=..., adjust=False)``.

    Args:
        data: DataFrame with a ``close`` column.
        fast_period: Span of the fast EMA.
        slow_period: Span of the slow EMA.
        signal_period: Span of the EMA applied to the MACD line.

    Returns:
        Tuple ``(macd_line, signal_line)`` where ``macd_line`` is the fast
        EMA minus the slow EMA and ``signal_line`` is the EMA of
        ``macd_line``.
    """
    def _ema(series, span):
        return series.ewm(span=span, adjust=False).mean()

    close = data['close']
    macd_line = _ema(close, fast_period) - _ema(close, slow_period)
    return macd_line, _ema(macd_line, signal_period)

def calculate_bollinger_bands(data, window=20, std_dev=2):
    """Return the upper and lower Bollinger bands of ``data['close']``.

    Args:
        data: DataFrame with a ``close`` column.
        window: Number of rows in the rolling mean and rolling standard
            deviation.
        std_dev: Multiplier applied to the rolling standard deviation.

    Returns:
        Tuple ``(upper_band, lower_band)``: the rolling mean plus and minus
        ``std_dev`` rolling standard deviations. The first ``window - 1``
        rows are NaN.
    """
    rolling_close = data['close'].rolling(window=window)
    middle = rolling_close.mean()
    offset = rolling_close.std() * std_dev
    return middle + offset, middle - offset

def calculate_atr(data, window=14):
    """Return an exponentially smoothed average true range.

    The true range of a row is the largest of ``high - low``,
    ``|high - previous close|`` and ``|low - previous close|``. A row whose
    previous close is missing (the first row) has a NaN true range because
    the maximum is taken with ``skipna=False``.

    Args:
        data: DataFrame with ``high``, ``low`` and ``close`` columns.
        window: Span of the exponential moving average applied to the true
            range (``adjust=False``).

    Returns:
        Series holding the smoothed true range.
    """
    prev_close = data['close'].shift(1)
    candidates = pd.concat(
        [
            data['high'] - data['low'],
            (data['high'] - prev_close).abs(),
            (data['low'] - prev_close).abs(),
        ],
        axis=1,
    )
    true_range = candidates.max(axis=1, skipna=False)
    return true_range.ewm(span=window, adjust=False).mean()

def calculate_obv(data):
    """Return the on-balance volume of *data*.

    Each row's ``volume`` is signed by the direction of the one-step change
    in ``close`` (+1 up, -1 down, 0 unchanged). The first row, whose change
    is undefined, contributes 0. The signed volumes are then accumulated.

    Args:
        data: DataFrame with ``close`` and ``volume`` columns.

    Returns:
        Series with the running total of signed volume.
    """
    direction = np.sign(data['close'].diff())
    signed_volume = direction * data['volume']
    return signed_volume.fillna(0).cumsum()

# ==============================================================================
# ======================== STAGE 1: TRAINING THE MODEL =========================
# ==============================================================================

def _standardize_price_columns(df, rename_dict, numeric_cols):
    """Rename raw price columns and coerce numeric columns to float32, in place.

    Args:
        df: Frame returned by ``load_all_crypto_data``; modified in place.
        rename_dict: Mapping of raw column names to the names used downstream.
        numeric_cols: Column names to coerce; names absent from *df* are
            skipped. Values that cannot be parsed become NaN.

    Returns:
        The same *df* object, for convenience.
    """
    df.rename(columns=rename_dict, inplace=True)
    for col in numeric_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce').astype('float32')
    return df


print("STAGE 1: TRAINING")
df_full = load_all_crypto_data(TRAIN_DATA_PATH)
if not df_full.empty:
    rename_dict = {'open_price': 'open', 'high_price': 'high', 'low_price': 'low', 'close_price': 'close'}
    numeric_cols = ['open', 'high', 'low', 'close', 'volume', 'amount', 'count', 'buy_volume', 'buy_amount']
    _standardize_price_columns(df_full, rename_dict, numeric_cols)

    # Build features one symbol at a time. An explicit loop + concat is used
    # instead of ``groupby(...).apply(...)`` because pandas 2.2 deprecates
    # passing the grouping column to the applied function, and the ``symbol``
    # column must survive for the target computation below.
    df_features = pd.concat(
        [feature_engineering_for_symbol(group) for _, group in df_full.groupby('symbol')],
        ignore_index=True,
    )
    del df_full
    gc.collect()

    # Target: relative change of ``close`` ``future_periods`` rows ahead,
    # computed within each symbol so values never cross symbol boundaries.
    future_periods = 96
    df_features['target'] = (
        df_features.groupby('symbol')['close'].shift(-future_periods) / df_features['close'] - 1
    )
    df_features.replace([np.inf, -np.inf], np.nan, inplace=True)
    df_train = df_features.dropna(subset=['target'])

    # Every column except identifiers, the target and the raw price/volume
    # columns listed here is used as a model input.
    features = [col for col in df_train.columns if col not in ['timestamp', 'symbol', 'target', 'open', 'high', 'low', 'close', 'volume', 'amount']]
    X = df_train[features]
    y = df_train['target']
    del df_train, df_features
    gc.collect()

    # NOTE(review): this is a random row-wise split. Because each target looks
    # ``future_periods`` rows ahead, neighbouring rows of a symbol have heavily
    # overlapping targets, so the validation score used for early stopping is
    # likely optimistic. Consider a chronological split - confirm before
    # changing, as it alters the fitted model.
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    del X, y
    gc.collect()

    print("Step 5: Training the LightGBM model...")
    lgbm = lgb.LGBMRegressor(
        objective='regression_l1', n_estimators=1000, learning_rate=0.05,
        num_leaves=31, max_depth=8, subsample=0.8, colsample_bytree=0.8,
        random_state=42, n_jobs=-1
    )
    lgbm.fit(X_train, y_train,
             eval_set=[(X_val, y_val)], eval_metric='l1',
             callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False)])
    print("Model training completed successfully!")
    del X_train, X_val, y_train, y_val
    gc.collect()

    # ==============================================================================
    # ================= STAGE 2: GENERATING SUBMISSION FILE ========================
    # ==============================================================================
    print("\nSTAGE 2: PREDICTION (Memory-Safe Symbol by Symbol)")

    # The raw data is reloaded from disk because the training copy was
    # released above to keep peak memory down.
    print("Step 1 (Pred): Loading all data for prediction...")
    df_pred_base = load_all_crypto_data(TRAIN_DATA_PATH)
    _standardize_price_columns(df_pred_base, rename_dict, numeric_cols)

    all_predictions = []
    symbols = df_pred_base['symbol'].unique()

    print(f"Step 2 (Pred): Processing {len(symbols)} symbols one by one...")
    # Iterate the groups directly rather than boolean-masking the whole frame
    # once per symbol.
    for symbol, df_symbol in tqdm(df_pred_base.groupby('symbol'), total=len(symbols)):
        df_symbol_features = feature_engineering_for_symbol(df_symbol)
        # Mirror the training-time preprocessing exactly: +/-inf becomes NaN
        # and NaN is left for LightGBM's native missing-value handling. The
        # model never saw zero-filled warm-up rows or infinities in training.
        X_pred = df_symbol_features[features].replace([np.inf, -np.inf], np.nan)
        predictions = lgbm.predict(X_pred)
        result_df = pd.DataFrame({
            'timestamp': df_symbol_features['timestamp'],
            'symbol': df_symbol_features['symbol'],
            'predict_return': predictions
        })
        all_predictions.append(result_df)

    # ==================== Pre-concatenation Memory Optimization ====================
    # Delete the large raw dataset before merging - no longer needed after prediction
    print("Optimizing memory before final concatenation...")
    del df_pred_base
    gc.collect()
    # ==============================================================================

    print("Step 3 (Pred): Concatenating all predictions...")
    final_pred_df = pd.concat(all_predictions, ignore_index=True)

    # Free prediction list memory
    del all_predictions
    gc.collect()

    print("Step 4 (Pred): Formatting submission file...")
    # Submission ids are built as "<YYYY-MM-DD HH:MM:SS>_<symbol>".
    final_pred_df['timestamp'] = final_pred_df['timestamp'].dt.strftime('%Y-%m-%d %H:%M:%S')
    final_pred_df['id'] = final_pred_df['timestamp'] + "_" + final_pred_df['symbol']

    df_submission_id = pd.read_csv(SUBMISSION_ID_PATH)
    final_submission = pd.merge(df_submission_id, final_pred_df[['id', 'predict_return']], on='id', how='left')

    # Report ids that found no prediction instead of hiding them behind the
    # default value; a large count usually means an id-format mismatch.
    missing_count = int(final_submission['predict_return'].isna().sum())
    if missing_count:
        print(f"Warning: {missing_count} submission ids had no prediction and were filled with 0.")
    # Assign the result back: a chained ``df[col].fillna(..., inplace=True)``
    # does not update the frame under pandas Copy-on-Write.
    final_submission['predict_return'] = final_submission['predict_return'].fillna(0)

    final_submission.to_csv(OUTPUT_PATH, index=False)

    print(f"Submission file successfully generated at: {OUTPUT_PATH}")
    print("File preview:")
    print(final_submission.head())
    print(f"File shape: {final_submission.shape}")

else:
    print("Execution stopped because no data was loaded.")