In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.model_selection import TimeSeriesSplit, GroupKFold
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss

## Loading data

In [3]:
x_train = pd.read_csv('x_train.csv', index_col='ID')
y_train = pd.read_csv('y_train.csv', index_col='ID')
train = pd.concat([x_train, y_train], axis=1)
test = pd.read_csv('x_test.csv', index_col='ID')
# Ensure proper time ordering within each stock
train = train.sort_values(['STOCK', 'DATE']).reset_index()
test  = test.sort_values(['STOCK', 'DATE']).reset_index()
train # already sorted by stock and date

Unnamed: 0,ID,DATE,STOCK,INDUSTRY,INDUSTRY_GROUP,SECTOR,SUB_INDUSTRY,RET_1,VOLUME_1,RET_2,...,VOLUME_16,RET_17,VOLUME_17,RET_18,VOLUME_18,RET_19,VOLUME_19,RET_20,VOLUME_20,RET
0,2377,1,0,37,12,5,94,-0.005967,0.136699,0.009031,...,-0.493354,-0.007660,-0.585497,-0.001063,-0.351363,0.005127,-0.324675,-0.019275,-0.291751,False
1,5198,4,0,37,12,5,94,0.001348,-0.269520,0.011100,...,-0.313575,0.007867,0.071338,0.007733,-0.405243,-0.003276,-0.424336,-0.010489,-0.050591,False
2,8017,5,0,37,12,5,94,-0.014405,0.192655,0.003614,...,-0.367499,-0.005843,-0.405562,0.002930,-0.315935,0.010462,-0.474957,-0.003541,-0.260130,True
3,20826,11,0,37,12,5,94,0.008938,0.430916,0.002662,...,0.023598,0.011266,0.079711,0.019038,-0.230167,-0.000287,-0.312123,0.008682,-0.226628,True
4,33843,21,0,37,12,5,94,-0.006523,-0.060371,-0.007632,...,-0.337686,-0.007224,-0.161117,-0.001461,-0.095494,0.012667,0.471895,-0.038752,1.532045,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
418590,391556,206,5716,50,17,7,114,0.029552,-0.075091,-0.001428,...,0.076480,0.017026,0.170516,0.002276,-0.106224,-0.034597,0.123750,-0.015676,-0.228186,True
418591,394490,208,5716,50,17,7,114,0.008316,0.028099,-0.006688,...,0.090287,0.002887,-0.050408,0.008736,-0.159294,0.027350,-0.022922,0.008186,-0.080569,True
418592,400150,210,5716,50,17,7,114,-0.004633,-0.173518,0.001687,...,0.695373,-0.015320,-0.149467,-0.035810,-0.262389,0.000896,-0.172450,0.008586,-0.482171,False
418593,403129,211,5716,50,17,7,114,0.010883,0.172313,0.008844,...,-0.045512,-0.008823,-0.026153,-0.011428,-0.142636,0.011253,-0.224195,0.000609,-0.341878,True


# Feature Engineering

In [7]:
class FeatureEngineer:
    """
    Reusable feature engineering pipeline for stock prediction
    """
    def __init__(self, eps=1e-6):
        self.eps = eps
        
    def fit(self, df):
        """
        Fit any parameters needed (e.g., for normalization)
        For this dataset, we don't need to fit anything since
        features are computed per-stock or per-date
        """
        # No fitting needed for these features
        # But this method allows future extensions
        return self
    
    def transform(self, df):
        """
        Apply feature engineering to dataframe
        """
        df = df.copy()  # Don't modify original
        
        print("="*80)
        print("FEATURE ENGINEERING")
        print("="*80)
        
        # =========================================================
        # STEP 1: TIME-SERIES FEATURES
        # =========================================================
        print("\n[Step 1] Creating time-series features...")
        
        grouped = df.groupby('STOCK', group_keys=False)
        
        # Lag features
        for lag in [1, 2, 3, 5, 10, 20]:
            df[f'RET_1_lag_{lag}'] = grouped['RET_1'].shift(lag)
        
        # Rolling statistics
        for window in [5, 10, 20, 60]:
            df[f'RET_1_roll_std_{window}'] = grouped['RET_1'].transform(
                lambda x: x.rolling(window, min_periods=max(1, window//2)).std()
            )
            df[f'RET_1_roll_mean_{window}'] = grouped['RET_1'].transform(
                lambda x: x.rolling(window, min_periods=max(1, window//2)).mean()
            )
        
        # Derived features
        df['RET_1_vol_adj'] = df['RET_1'] / (df['RET_1_roll_std_20'] + self.eps)
        df['RET_1_dist_mean_20'] = df['RET_1'] - df['RET_1_roll_mean_20']
                
        # =========================================================
        # STEP 2: TECHNICAL INDICATORS
        # =========================================================
        print("\n[Step 2] Creating technical indicators...")
        
        # RSI
        for period in [14, 28]:
            df[f'RSI_{period}'] = grouped['RET_1'].transform(
                lambda x: self._calculate_rsi(x, period)
            )
        
        # ADL
        df['_mfm'] = np.sign(df['RET_1'])
        df['_mfv'] = df['_mfm'] * df['VOLUME_1'].fillna(0)
        df['ADL'] = grouped['_mfv'].cumsum()
        df['ADL_zscore'] = grouped['ADL'].transform(
            lambda x: (x - x.mean()) / (x.std() + self.eps)
        )
        df['ADL_momentum'] = grouped['ADL'].diff(5)
        df.drop(['_mfm', '_mfv'], axis=1, inplace=True)
                
        # =========================================================
        # STEP 3: TURNOVER FEATURES
        # =========================================================
        print("\n[Step 3] Creating turnover features...")
        
        volume_cols = [f'VOLUME_{i}' for i in range(1, 21)]
        volume_data = df[volume_cols].values
        
        df['VOLUME_1_log'] = np.sign(df['VOLUME_1']) * np.log1p(np.abs(df['VOLUME_1']))
        
        df['turnover_mean_5'] = np.nanmean(volume_data[:, :5], axis=1)
        df['turnover_mean_20'] = np.nanmean(volume_data, axis=1)
        df['turnover_std_5'] = np.nanstd(volume_data[:, :5], axis=1)
        df['turnover_std_20'] = np.nanstd(volume_data, axis=1)
        df['turnover_cv_5'] = df['turnover_std_5'] / (np.abs(df['turnover_mean_5']) + self.eps)
        df['turnover_cv_20'] = df['turnover_std_20'] / (np.abs(df['turnover_mean_20']) + self.eps)
        df['turnover_ratio'] = df['VOLUME_1'] / (df['turnover_mean_20'] + self.eps)
        
        df['volume_momentum'] = grouped['VOLUME_1'].transform(
            lambda x: x.rolling(5, min_periods=1).mean() - x.rolling(20, min_periods=1).mean()
        )
        df['volume_acceleration'] = grouped['volume_momentum'].diff(1)
        df['volume_ret_interaction'] = df['VOLUME_1_log'] * df['RET_1']
                
        # =========================================================
        # STEP 4: LIQUIDITY FEATURES
        # =========================================================
        print("\n[Step 4] Creating liquidity features...")
        
        df['has_volume_data'] = (~df[volume_cols].isna().all(axis=1)).astype(np.int8)
        df['volume_nonmissing_frac'] = (1 - df[volume_cols].isna().mean(axis=1))
        df['volume_nonmissing_frac'] = grouped['volume_nonmissing_frac'].transform('mean')
        df['low_liquidity_flag'] = (df['volume_nonmissing_frac'] < 0.5).astype(np.int8)
        df['RET_1_lowliq_interaction'] = df['RET_1'] * df['low_liquidity_flag']
                
        # =========================================================
        # STEP 5: VOLATILITY REGIME FEATURES
        # =========================================================
        print("\n[Step 5] Creating volatility regime features...")
        
        df['vol_regime'] = df.groupby('DATE')['RET_1_roll_std_20'].transform(
            lambda x: pd.qcut(x, q=3, labels=False, duplicates='drop') if x.nunique() > 2 else 1
        )
        df['vol_regime'] = df['vol_regime'].fillna(1).astype(int)
        df['high_vol_regime'] = (df['vol_regime'] == 2).astype(np.int8)
        df['low_vol_regime'] = (df['vol_regime'] == 0).astype(np.int8)
        df['RET_1_regime_adj'] = df['RET_1'] * (1 + df['vol_regime'] * 0.1)
                
        # =========================================================
        # STEP 6: ADVANCED MOMENTUM FEATURES
        # =========================================================
        print("\n[Step 6] Creating advanced momentum features...")
        
        df['reversal_magnitude'] = np.abs(df['RET_1'] - df['RET_1_roll_mean_20'])
        df['trend_consistency'] = grouped['RET_1'].transform(
            lambda x: (x.rolling(10, min_periods=1).mean() > 0).rolling(10, min_periods=1).mean()
        )
        df['momentum_strength'] = df['RET_1_roll_mean_20'] / (df['RET_1_roll_std_20'] + self.eps)
                
        # =========================================================
        # STEP 7: CROSS-SECTIONAL FEATURES
        # =========================================================
        print("\n[Step 7] Creating cross-sectional features...")
        
        date_grouped = df.groupby('DATE')
        
        # Percentile ranks
        for col in ['RET_1', 'RET_2', 'RET_5', 'RET_10', 'RET_20']:
            if col in df.columns:
                df[f'{col}_pctrank'] = date_grouped[col].rank(pct=True)
        
        for col in ['VOLUME_1', 'VOLUME_5', 'VOLUME_10']:
            if col in df.columns:
                df[f'{col}_pctrank'] = date_grouped[col].rank(pct=True)
        
        # Z-scores
        for col in ['RET_1', 'RET_5', 'RET_10', 'RET_20']:
            if col in df.columns:
                df[f'{col}_cs_zscore'] = date_grouped[col].transform(
                    lambda x: (x - x.mean()) / (x.std() + self.eps)
                )
        
        # Sector-relative
        if 'SECTOR' in df.columns:
            sector_date_grouped = df.groupby(['DATE', 'SECTOR'])
            
            for col in ['RET_1', 'RET_5', 'RET_10']:
                if col in df.columns:
                    df[f'{col}_sector_pctrank'] = sector_date_grouped[col].rank(pct=True)
                    sector_med = sector_date_grouped[col].transform('median')
                    df[f'{col}_vs_sector_med'] = df[col] - sector_med
            
            if 'RET_1_roll_mean_20' in df.columns:
                df['sector_strength_rank'] = sector_date_grouped['RET_1_roll_mean_20'].rank(pct=True)
        
        # Composite momentum
        ret_1_5 = [f'RET_{i}' for i in range(1, 6) if f'RET_{i}' in df.columns]
        ret_6_20 = [f'RET_{i}' for i in range(6, 21) if f'RET_{i}' in df.columns]
        
        if ret_1_5:
            df['momentum_1_5'] = df[ret_1_5].mean(axis=1)
            df['momentum_1_5_pctrank'] = date_grouped['momentum_1_5'].rank(pct=True)
        
        if ret_6_20:
            df['momentum_6_20'] = df[ret_6_20].mean(axis=1)
            df['momentum_6_20_pctrank'] = date_grouped['momentum_6_20'].rank(pct=True)
        
        if ret_1_5:
            df['vol_5d'] = df[ret_1_5].std(axis=1)
            df['vol_5d_pctrank'] = date_grouped['vol_5d'].rank(pct=True)
            df['ret_vol_ratio'] = df.get('momentum_1_5', 0) / (df['vol_5d'] + self.eps)
            df['ret_vol_ratio_pctrank'] = date_grouped['ret_vol_ratio'].rank(pct=True)
                
        print("\n" + "="*80)
        print("✅ FEATURE ENGINEERING COMPLETE!")
        print("="*80)
        
        return df
    
    def fit_transform(self, df):
        """Fit and transform in one step"""
        return self.fit(df).transform(df)
    
    def _calculate_rsi(self, series, period=14):
        """Helper: Calculate RSI"""
        delta = series.diff()
        gain = (delta.where(delta > 0, 0)).rolling(window=period, min_periods=period//2).mean()
        loss = (-delta.where(delta < 0, 0)).rolling(window=period, min_periods=period//2).mean()
        rs = gain / (loss + self.eps)
        rsi = 100 - (100 / (1 + rs))
        return rsi


In [8]:

fe = FeatureEngineer(eps=1e-6)

train['_dataset'] = 'train'
test['_dataset'] = 'test'
df_combined = pd.concat([train, test], axis=0, ignore_index=True)

df_engineered = fe.fit_transform(df_combined)

train_eng = df_engineered[df_engineered['_dataset'] == 'train'].drop('_dataset', axis=1).reset_index(drop=True)
test_eng = df_engineered[df_engineered['_dataset'] == 'test'].drop('_dataset', axis=1).reset_index(drop=True)

FEATURE ENGINEERING

[Step 1] Creating time-series features...

[Step 2] Creating technical indicators...

[Step 3] Creating turnover features...

[Step 4] Creating liquidity features...

[Step 5] Creating volatility regime features...

[Step 6] Creating advanced momentum features...

[Step 7] Creating cross-sectional features...

✅ FEATURE ENGINEERING COMPLETE!


In [9]:
train_eng

Unnamed: 0,ID,DATE,STOCK,INDUSTRY,INDUSTRY_GROUP,SECTOR,SUB_INDUSTRY,RET_1,VOLUME_1,RET_2,...,RET_10_vs_sector_med,sector_strength_rank,momentum_1_5,momentum_1_5_pctrank,momentum_6_20,momentum_6_20_pctrank,vol_5d,vol_5d_pctrank,ret_vol_ratio,ret_vol_ratio_pctrank
0,2377,1,0,37,12,5,94,-0.005967,0.136699,0.009031,...,-0.007893,,0.005347,0.366182,0.001468,0.790854,0.007284,0.137540,0.733974,0.654023
1,5198,4,0,37,12,5,94,0.001348,-0.269520,0.011100,...,-0.002215,,0.002730,0.623980,-0.001430,0.145087,0.011573,0.278822,0.235824,0.676481
2,8017,5,0,37,12,5,94,-0.014405,0.192655,0.003614,...,0.009023,,0.000613,0.670762,0.001354,0.747280,0.009360,0.074763,0.065499,0.709372
3,20826,11,0,37,12,5,94,0.008938,0.430916,0.002662,...,0.011519,,0.005347,0.761224,0.004605,0.902721,0.006656,0.112585,0.803225,0.945918
4,33843,21,0,37,12,5,94,-0.006523,-0.060371,-0.007632,...,-0.006162,,-0.002009,0.451809,-0.005748,0.187352,0.006528,0.148461,-0.307673,0.354075
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
418590,391556,206,5716,50,17,7,114,0.029552,-0.075091,-0.001428,...,0.009727,0.045840,0.009303,0.751933,0.007490,0.799016,0.013405,0.323612,0.693922,0.837316
418591,394490,208,5716,50,17,7,114,0.008316,0.028099,-0.006688,...,-0.010926,0.038285,0.004816,0.774029,0.004637,0.570893,0.009304,0.334697,0.517537,0.861963
418592,400150,210,5716,50,17,7,114,-0.004633,-0.173518,0.001687,...,0.001495,0.016897,0.004498,0.830601,0.003175,0.719166,0.009700,0.417350,0.463626,0.868511
418593,403129,211,5716,50,17,7,114,0.010883,0.172313,0.008844,...,-0.003665,0.015748,0.008220,0.767827,-0.001501,0.115580,0.004216,0.048327,1.949107,0.984792


# Pre-modeling analysis

In [11]:
categorical_cols = ['DATE', 'STOCK', 'INDUSTRY', 'INDUSTRY_GROUP', 'SECTOR', 'SUB_INDUSTRY']
exclude_cols = ['ID', 'RET'] + categorical_cols

# Get all numeric features
feature_cols = [col for col in train_eng.columns if col not in exclude_cols]
print(f"\nTotal features: {len(feature_cols)}")

# ----- 1.1: Correlation with target -----
print("\n" + "-"*80)
print("TOP 30 FEATURES BY CORRELATION WITH TARGET")
print("-"*80)
target_corr = train_eng[feature_cols + ['RET']].corr()['RET'].drop('RET').abs().sort_values(ascending=False)
print(target_corr.head(30))

# Save for later use
top_50_by_corr = target_corr.head(50).index.tolist()



Total features: 113

--------------------------------------------------------------------------------
TOP 30 FEATURES BY CORRELATION WITH TARGET
--------------------------------------------------------------------------------
volume_nonmissing_frac    0.028041
vol_5d_pctrank            0.025208
VOLUME_1_pctrank          0.023099
RET_1_sector_pctrank      0.022385
RET_1_pctrank             0.022052
RET_1_roll_std_60         0.021728
RET_1_vs_sector_med       0.019945
RET_1_cs_zscore           0.019352
RET_1_vol_adj             0.019279
RET_1_roll_std_20         0.017856
RSI_14                    0.016891
RET_1                     0.016845
vol_regime                0.016397
RET_1_regime_adj          0.016211
momentum_1_5_pctrank      0.016190
VOLUME_1_log              0.015627
RET_1_roll_std_10         0.015253
RET_1_dist_mean_20        0.014709
low_vol_regime            0.014335
RET_17                    0.014077
high_vol_regime           0.013305
RET_16                    0.012448
RET

In [12]:
print("\n" + "-"*80)
print("MULTICOLLINEARITY CHECK (Highly correlated feature pairs)")
print("-"*80)

# Compute correlation matrix
corr_matrix = train_eng[feature_cols].corr().abs()

# Find highly correlated pairs (>0.95)
high_corr_pairs = []
for i in range(len(corr_matrix.columns)):
    for j in range(i+1, len(corr_matrix.columns)):
        if corr_matrix.iloc[i, j] > 0.95:
            high_corr_pairs.append({
                'feature_1': corr_matrix.columns[i],
                'feature_2': corr_matrix.columns[j],
                'correlation': corr_matrix.iloc[i, j]
            })

if high_corr_pairs:
    high_corr_df = pd.DataFrame(high_corr_pairs).sort_values('correlation', ascending=False)
    print(f"\nFound {len(high_corr_pairs)} highly correlated pairs (r > 0.95):")
    print(high_corr_df.head(20).to_string(index=False))
    
    # Features to potentially remove (keep one from each pair)
    redundant_features = set(high_corr_df['feature_2'].tolist())
    print(f"\n⚠️  Consider removing {len(redundant_features)} redundant features")
else:
    print("✓ No severe multicollinearity detected (all correlations < 0.95)")



--------------------------------------------------------------------------------
MULTICOLLINEARITY CHECK (Highly correlated feature pairs)
--------------------------------------------------------------------------------

Found 3 highly correlated pairs (r > 0.95):
         feature_1          feature_2  correlation
             RET_1   RET_1_regime_adj     0.998498
             RET_1 RET_1_dist_mean_20     0.963680
RET_1_dist_mean_20   RET_1_regime_adj     0.962721

⚠️  Consider removing 2 redundant features


In [13]:
print("\n" + "-"*80)
print("LOW VARIANCE FEATURES")
print("-"*80)

feature_variance = train_eng[feature_cols].var().sort_values()
low_var_features = feature_variance[feature_variance < 0.001].index.tolist()

if low_var_features:
    print(f"Found {len(low_var_features)} features with very low variance:")
    print(feature_variance[feature_variance < 0.001])
    print("\n⚠️  Consider removing these features")
else:
    print("✓ All features have sufficient variance")


--------------------------------------------------------------------------------
LOW VARIANCE FEATURES
--------------------------------------------------------------------------------
Found 34 features with very low variance:
RET_1_lowliq_interaction    0.000012
RET_1_roll_mean_60          0.000027
RET_1_roll_mean_20          0.000069
RET_1_roll_mean_10          0.000122
RET_1_roll_std_60           0.000171
momentum_1_5                0.000180
momentum_6_20               0.000215
RET_1_roll_mean_5           0.000230
RET_1_roll_std_20           0.000246
RET_1_roll_std_10           0.000308
RET_1_roll_std_5            0.000394
vol_5d                      0.000474
reversal_magnitude          0.000555
RET_5_vs_sector_med         0.000746
RET_1_vs_sector_med         0.000747
RET_10_vs_sector_med        0.000784
RET_16                      0.000872
RET_1_dist_mean_20          0.000901
RET_14                      0.000923
RET_11                      0.000944
RET_19                      0.000

In [15]:
print("="*80)
print("FEATURE CLEANUP")
print("="*80)

# Remove multicollinear features (definite removes)
redundant_features = ['RET_1_regime_adj', 'RET_1_dist_mean_20']

# DON'T remove low variance features - they're fine for returns!
# low_var_features = []  # Leave this empty

# Combine removals
features_to_remove = redundant_features
features_to_use = [f for f in feature_cols if f not in features_to_remove]

print(f"\nOriginal features: {len(feature_cols)}")
print(f"Removing: {len(features_to_remove)}")
print(f"Final features: {len(features_to_use)}")

FEATURE CLEANUP

Original features: 113
Removing: 2
Final features: 111


In [19]:
print("\n" + "="*80)
print("DATA PREPARATION")
print("="*80)

categorical_cols = ['DATE', 'STOCK', 'INDUSTRY', 'INDUSTRY_GROUP', 'SECTOR', 'SUB_INDUSTRY']
exclude_cols = ['ID', 'RET'] + categorical_cols

X_train = train_eng[features_to_use].copy()
y_train = train_eng['RET'].astype(int)
groups_train = train_eng['DATE'].values

X_test = test_eng[features_to_use].copy()

print("\nChecking for problematic columns...")

# Check for all-NaN columns
all_nan_cols = X_train.columns[X_train.isna().all()].tolist()
if all_nan_cols:
    print(f"⚠️  Found {len(all_nan_cols)} all-NaN columns:")
    for col in all_nan_cols:
        print(f"  ✗ {col}")
    X_train = X_train.drop(columns=all_nan_cols)
    X_test = X_test.drop(columns=all_nan_cols)

# Check for non-numeric columns
non_numeric_cols = X_train.select_dtypes(exclude=[np.number]).columns.tolist()
if non_numeric_cols:
    print(f"⚠️  Found {len(non_numeric_cols)} non-numeric columns:")
    for col in non_numeric_cols:
        print(f"  ✗ {col}")
    X_train = X_train.drop(columns=non_numeric_cols)
    X_test = X_test.drop(columns=non_numeric_cols)

# Check for columns with >95% NaN
high_nan_cols = X_train.columns[X_train.isna().mean() > 0.95].tolist()
if high_nan_cols:
    print(f"⚠️  Found {len(high_nan_cols)} columns with >95% NaN:")
    for col in high_nan_cols:
        print(f"  ✗ {col} ({X_train[col].isna().mean()*100:.1f}% NaN)")
    X_train = X_train.drop(columns=high_nan_cols)
    X_test = X_test.drop(columns=high_nan_cols)

imputer = SimpleImputer(strategy='median')

X_train_imputed = pd.DataFrame(
    imputer.fit_transform(X_train),
    columns=X_train.columns,  # Now column count matches!
    index=X_train.index
)

X_test_imputed = pd.DataFrame(
    imputer.transform(X_test),
    columns=X_test.columns,
    index=X_test.index
)

scaler = StandardScaler()

X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train_imputed),
    columns=X_train_imputed.columns,
    index=X_train_imputed.index
)

X_test_scaled = pd.DataFrame(
    scaler.transform(X_test_imputed),
    columns=X_test_imputed.columns,
    index=X_test_imputed.index
)



DATA PREPARATION

Checking for problematic columns...
⚠️  Found 3 all-NaN columns:
  ✗ volume_directional_flow
  ✗ avg_volume_on_extreme
  ✗ turnover_ret_rank_corr


In [22]:
print("\n" + "="*80)
print("BASELINE MODEL TRAINING")
print("="*80)


tscv = TimeSeriesSplit(n_splits=5)

baseline_model = xgb.XGBClassifier(
    n_estimators=200,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.7,
    random_state=42,
    eval_metric='logloss',
    tree_method='hist'
)

cv_results = []
feature_importance_list = []

print("\nRunning 5-fold time series cross-validation...\n")

for fold, (train_idx, val_idx) in enumerate(tscv.split(X_train_scaled), 1):
    X_tr, X_val = X_train_scaled.iloc[train_idx], X_train_scaled.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
    
    baseline_model.fit(X_tr, y_tr, verbose=False)
    
    y_pred = baseline_model.predict(X_val)
    y_prob = baseline_model.predict_proba(X_val)[:, 1]
    
    acc = accuracy_score(y_val, y_pred)
    auc = roc_auc_score(y_val, y_prob)
    logloss = log_loss(y_val, y_prob)
    
    cv_results.append({
        'fold': fold,
        'accuracy': acc,
        'auc': auc,
        'logloss': logloss
    })
    
    feature_importance_list.append(baseline_model.feature_importances_)
    
    print(f"Fold {fold}: Acc={acc:.4f}, AUC={auc:.4f}, LogLoss={logloss:.4f}")

cv_df = pd.DataFrame(cv_results)
print("\n" + "-"*80)
print(f"Mean Accuracy: {cv_df['accuracy'].mean():.4f} ± {cv_df['accuracy'].std():.4f}")
print(f"Mean AUC:      {cv_df['auc'].mean():.4f} ± {cv_df['auc'].std():.4f}")
print(f"Mean LogLoss:  {cv_df['logloss'].mean():.4f} ± {cv_df['logloss'].std():.4f}")
print("-"*80)



BASELINE MODEL TRAINING

Running 5-fold time series cross-validation...

Fold 1: Acc=0.5131, AUC=0.5195, LogLoss=0.6929
Fold 2: Acc=0.5432, AUC=0.5588, LogLoss=0.6876
Fold 3: Acc=0.5351, AUC=0.5483, LogLoss=0.6887
Fold 4: Acc=0.5372, AUC=0.5512, LogLoss=0.6884
Fold 5: Acc=0.5401, AUC=0.5606, LogLoss=0.6871

--------------------------------------------------------------------------------
Mean Accuracy: 0.5337 ± 0.0119
Mean AUC:      0.5477 ± 0.0166
Mean LogLoss:  0.6889 ± 0.0023
--------------------------------------------------------------------------------


In [23]:
print("\n" + "="*80)
print("FEATURE IMPORTANCE ANALYSIS")
print("="*80)

avg_importance = np.mean(feature_importance_list, axis=0)
target_corr_aligned = target_corr.reindex(X_train_scaled.columns).fillna(0).abs()

importance_df = pd.DataFrame({
    'feature': X_train_scaled.columns,
    'importance': avg_importance,
    'importance_std': np.std(feature_importance_list, axis=0),
    'target_corr': target_corr_aligned.values
}).sort_values('importance', ascending=False)

print("\nTOP 30 FEATURES BY MODEL IMPORTANCE:")
print(importance_df.head(30).to_string(index=False))

# Categorize features
def categorize_feature(feat):
    if 'RSI' in feat or 'ADL' in feat:
        return 'Technical'
    elif 'turnover' in feat or 'volume' in feat.lower():
        return 'Turnover/Volume'
    elif 'regime' in feat or 'vol_' in feat:
        return 'Volatility'
    elif 'momentum' in feat or 'reversal' in feat:
        return 'Momentum'
    elif 'pctrank' in feat or 'zscore' in feat or 'sector' in feat:
        return 'Cross-sectional'
    elif 'lag' in feat or 'roll' in feat:
        return 'Time-series'
    else:
        return 'Other'

importance_df['category'] = importance_df['feature'].apply(categorize_feature)
category_importance = importance_df.groupby('category')['importance'].agg(['sum', 'mean', 'count']).sort_values('sum', ascending=False)

print("\n" + "-"*80)
print("IMPORTANCE BY FEATURE CATEGORY")
print("-"*80)
print(category_importance)


FEATURE IMPORTANCE ANALYSIS

TOP 30 FEATURES BY MODEL IMPORTANCE:
               feature  importance  importance_std  target_corr
volume_nonmissing_frac    0.016696        0.003750     0.028041
       RET_1_cs_zscore    0.013104        0.001275     0.019352
                RET_17    0.012707        0.002100     0.014077
                vol_5d    0.012294        0.001637     0.011810
                 RET_1    0.012036        0.002027     0.016845
          VOLUME_1_log    0.011889        0.001202     0.015627
        vol_5d_pctrank    0.011460        0.001133     0.025208
              VOLUME_1    0.011387        0.000551     0.004757
  momentum_1_5_pctrank    0.011328        0.000879     0.016190
         momentum_6_20    0.011185        0.000341     0.005491
         RET_1_vol_adj    0.011160        0.000298     0.019279
   RET_1_vs_sector_med    0.011057        0.001146     0.019945
          momentum_1_5    0.010931        0.000283     0.011357
      VOLUME_1_pctrank    0.010824   

In [24]:
# ============================================================================
# TEST REDUCED FEATURE SETS
# ============================================================================
print("\n" + "="*80)
print("TESTING FEATURE SELECTION")
print("="*80)

# Get top features by importance
top_50_features = importance_df.head(50)['feature'].tolist()
top_30_features = importance_df.head(30)['feature'].tolist()

# Compare performance
from sklearn.model_selection import TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=5)

def quick_eval(features, name):
    X_subset = X_train_scaled[features]
    cv_acc = []
    
    for train_idx, val_idx in tscv.split(X_subset):
        X_tr, X_val = X_subset.iloc[train_idx], X_subset.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
        
        model = xgb.XGBClassifier(
            n_estimators=200, max_depth=5, learning_rate=0.05,
            subsample=0.8, colsample_bytree=0.7, random_state=42
        )
        model.fit(X_tr, y_tr, verbose=False)
        cv_acc.append(accuracy_score(y_val, model.predict(X_val)))
    
    mean_acc = np.mean(cv_acc)
    print(f"{name:.<50} {mean_acc:.4f} ({mean_acc*100:.2f}%)")
    return mean_acc

print("\nComparing feature sets:\n")
acc_all = quick_eval(X_train_scaled.columns.tolist(), f"All features ({len(X_train_scaled.columns)})")
acc_50 = quick_eval(top_50_features, "Top 50 features")
acc_30 = quick_eval(top_30_features, "Top 30 features")

print(f"\n{'='*80}")
if acc_50 > acc_all:
    print(f"✅ Top 50 features BETTER by {(acc_50-acc_all)*100:.2f}pp!")
elif acc_30 > acc_all:
    print(f"✅ Top 30 features BETTER by {(acc_30-acc_all)*100:.2f}pp!")
else:
    print(f"✅ All features optimal")
print(f"{'='*80}")



TESTING FEATURE SELECTION

Comparing feature sets:

All features (108)................................ 0.5337 (53.37%)
Top 50 features................................... 0.5340 (53.40%)
Top 30 features................................... 0.5338 (53.38%)

✅ Top 50 features BETTER by 0.03pp!


In [25]:
# ============================================================================
# HYPERPARAMETER TUNING
# ============================================================================
print("\n" + "="*80)
print("HYPERPARAMETER TUNING")
print("="*80)

from sklearn.model_selection import RandomizedSearchCV

# Define search space
param_distributions = {
    'n_estimators': [200, 300, 500],
    'max_depth': [4, 5, 6, 7],
    'learning_rate': [0.03, 0.05, 0.07, 0.1],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.6, 0.7, 0.8],
    'min_child_weight': [1, 3, 5],
    'gamma': [0, 0.1, 0.2],
    'reg_alpha': [0, 0.01, 0.1],
    'reg_lambda': [1, 1.5, 2]
}

# Use best feature set (e.g., top 50)
X_train_best = X_train_scaled[top_50_features]

# Random search with time series CV
random_search = RandomizedSearchCV(
    xgb.XGBClassifier(random_state=42, eval_metric='logloss'),
    param_distributions=param_distributions,
    n_iter=20,  # Try 20 random combinations
    scoring='roc_auc',
    cv=TimeSeriesSplit(n_splits=3),  # Faster with 3 folds
    verbose=1,
    n_jobs=-1,
    random_state=42
)

print("\nRunning random search (this may take 5-10 minutes)...")
random_search.fit(X_train_best, y_train)

print(f"\n{'='*80}")
print(f"Best parameters found:")
print(f"{'='*80}")
for param, value in random_search.best_params_.items():
    print(f"  {param:.<30} {value}")

print(f"\nBest CV AUC: {random_search.best_score_:.4f}")
print(f"{'='*80}")

# Save best model
best_model = random_search.best_estimator_



HYPERPARAMETER TUNING

Running random search (this may take 5-10 minutes)...
Fitting 3 folds for each of 20 candidates, totalling 60 fits

Best parameters found:
  subsample..................... 0.9
  reg_lambda.................... 1
  reg_alpha..................... 0.1
  n_estimators.................. 500
  min_child_weight.............. 1
  max_depth..................... 7
  learning_rate................. 0.07
  gamma......................... 0
  colsample_bytree.............. 0.6

Best CV AUC: 0.5654


In [29]:
X_test_best = X_test_scaled[top_50_features]

# Predict
test_preds = best_model.predict(X_test_best)

# Create submission
submission = pd.DataFrame({
    'ID': test_eng['ID'],
    'RET': test_preds
})

# Save
submission.to_csv('submission_tuned.csv', index=False)

In [35]:
# ============================================================================
# ENSEMBLE: XGBoost + LightGBM + CatBoost
# ============================================================================
print("\n" + "="*80)
print("ENSEMBLE MODEL")
print("="*80)


# Use best features
X_train_best = X_train_scaled[top_50_features]
X_test_best = X_test_scaled[top_50_features]

# Train 3 models
print("\nTraining ensemble (XGBoost + LightGBM + CatBoost)...")

# Model 1: XGBoost
xgb_model = xgb.XGBClassifier(**random_search.best_params_, random_state=42)
xgb_model.fit(X_train_best, y_train, verbose=False)

# Model 2: LightGBM
lgb_model = lgb.LGBMClassifier(
    n_estimators=300, max_depth=6, learning_rate=0.05,
    subsample=0.8, colsample_bytree=0.7, random_state=42, verbose=-1
)
lgb_model.fit(X_train_best, y_train)

# Model 3: CatBoost
cat_model = CatBoostClassifier(
    iterations=300, depth=6, learning_rate=0.05,
    random_state=42, verbose=0
)
cat_model.fit(X_train_best, y_train)

# Predict on training data
xgb_train_probs = xgb_model.predict_proba(X_train_best)[:, 1]
lgb_train_probs = lgb_model.predict_proba(X_train_best)[:, 1]
cat_train_probs = cat_model.predict_proba(X_train_best)[:, 1]

# Ensemble on training data
ensemble_train_probs = (xgb_train_probs + lgb_train_probs + cat_train_probs) / 3
ensemble_train_preds = (ensemble_train_probs > 0.5).astype(int)

# Training metrics
train_acc = accuracy_score(y_train, ensemble_train_preds)
train_auc = roc_auc_score(y_train, ensemble_train_probs)
train_logloss = log_loss(y_train, ensemble_train_probs)

print("-"*80)
print("TRAINING SET PERFORMANCE")
print("-"*80)
print(f"Accuracy:  {train_acc:.4f} ({train_acc*100:.2f}%)")
print(f"AUC:       {train_auc:.4f}")
print(f"LogLoss:   {train_logloss:.4f}")
print("-"*80)


ENSEMBLE MODEL

Training ensemble (XGBoost + LightGBM + CatBoost)...
--------------------------------------------------------------------------------
TRAINING SET PERFORMANCE
--------------------------------------------------------------------------------
Accuracy:  0.6634 (66.34%)
AUC:       0.7309
LogLoss:   0.6568
--------------------------------------------------------------------------------


In [36]:
print("\n[Step 2] Running 5-fold time series cross-validation...\n")

tscv = TimeSeriesSplit(n_splits=5)
cv_results = []

for fold, (train_idx, val_idx) in enumerate(tscv.split(X_train_best), 1):
    X_tr, X_val = X_train_best.iloc[train_idx], X_train_best.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
    
    # XGBoost
    xgb_fold = xgb.XGBClassifier(**random_search.best_params_, random_state=42)
    xgb_fold.fit(X_tr, y_tr, verbose=False)
    xgb_val_probs = xgb_fold.predict_proba(X_val)[:, 1]
    
    # LightGBM
    lgb_fold = lgb.LGBMClassifier(
        n_estimators=300, max_depth=6, learning_rate=0.05,
        subsample=0.8, colsample_bytree=0.7, random_state=42, verbose=-1
    )
    lgb_fold.fit(X_tr, y_tr)
    lgb_val_probs = lgb_fold.predict_proba(X_val)[:, 1]
    
    # CatBoost
    cat_fold = CatBoostClassifier(
        iterations=300, depth=6, learning_rate=0.05,
        random_state=42, verbose=0
    )
    cat_fold.fit(X_tr, y_tr)
    cat_val_probs = cat_fold.predict_proba(X_val)[:, 1]
    
    # Ensemble
    ensemble_val_probs = (xgb_val_probs + lgb_val_probs + cat_val_probs) / 3
    ensemble_val_preds = (ensemble_val_probs > 0.5).astype(int)
    
    # Metrics
    acc = accuracy_score(y_val, ensemble_val_preds)
    auc = roc_auc_score(y_val, ensemble_val_probs)
    logloss = log_loss(y_val, ensemble_val_probs)
    
    cv_results.append({
        'fold': fold,
        'accuracy': acc,
        'auc': auc,
        'logloss': logloss
    })
    
    print(f"Fold {fold}: Acc={acc:.4f}, AUC={auc:.4f}, LogLoss={logloss:.4f}")

# CV Summary
cv_df = pd.DataFrame(cv_results)
cv_acc_mean = cv_df['accuracy'].mean()
cv_auc_mean = cv_df['auc'].mean()
cv_logloss_mean = cv_df['logloss'].mean()

print("\n" + "-"*80)
print("CROSS-VALIDATION PERFORMANCE")
print("-"*80)
print(f"Mean Accuracy: {cv_acc_mean:.4f} ± {cv_df['accuracy'].std():.4f}")
print(f"Mean AUC:      {cv_auc_mean:.4f} ± {cv_df['auc'].std():.4f}")
print(f"Mean LogLoss:  {cv_logloss_mean:.4f} ± {cv_df['logloss'].std():.4f}")
print("-"*80)


[Step 2] Running 5-fold time series cross-validation...

Fold 1: Acc=0.5281, AUC=0.5407, LogLoss=0.6906
Fold 2: Acc=0.5553, AUC=0.5791, LogLoss=0.6831
Fold 3: Acc=0.5412, AUC=0.5599, LogLoss=0.6864
Fold 4: Acc=0.5462, AUC=0.5666, LogLoss=0.6857
Fold 5: Acc=0.5507, AUC=0.5747, LogLoss=0.6839

--------------------------------------------------------------------------------
CROSS-VALIDATION PERFORMANCE
--------------------------------------------------------------------------------
Mean Accuracy: 0.5443 ± 0.0104
Mean AUC:      0.5642 ± 0.0151
Mean LogLoss:  0.6859 ± 0.0029
--------------------------------------------------------------------------------


In [37]:
print("\n" + "="*80)
print("OVERFITTING ANALYSIS")
print("="*80)
print(f"Train Accuracy:  {train_acc:.4f}")
print(f"CV Accuracy:     {cv_acc_mean:.4f}")
print(f"Difference:      {(train_acc - cv_acc_mean):.4f} ({(train_acc - cv_acc_mean)*100:+.2f}pp)")

if train_acc - cv_acc_mean > 0.03:
    print("\n⚠️  WARNING: Significant overfitting detected (>3pp gap)")
    print("   Consider: simpler model, fewer features, more regularization")
elif train_acc - cv_acc_mean > 0.01:
    print("\n⚠️  Mild overfitting detected (1-3pp gap)")
    print("   Model might underperform on test set")
else:
    print("\n✓ Good generalization (gap <1pp)")

print("="*80)



OVERFITTING ANALYSIS
Train Accuracy:  0.6634
CV Accuracy:     0.5443
Difference:      0.1191 (+11.91pp)

   Consider: simpler model, fewer features, more regularization


In [38]:

xgb_probs = xgb_model.predict_proba(X_test_best)[:, 1]
lgb_probs = lgb_model.predict_proba(X_test_best)[:, 1]
cat_probs = cat_model.predict_proba(X_test_best)[:, 1]

ensemble_probs = (xgb_probs + lgb_probs + cat_probs) / 3
ensemble_preds = (ensemble_probs > 0.5).astype(int)

# Create submission
submission = pd.DataFrame({
    'ID': test_eng['ID'],
    'RET': ensemble_preds
})

submission.to_csv('submission_ensemble.csv', index=False)


[Step 3] Making predictions on test set...


Significant overfitting, with submission score of 0.5006650544135429