In [1]:
import os
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

INPUT_DIR = '/kaggle/input/hull-tactical-market-prediction'
WORK_DIR = '/kaggle/working'

def log(msg):
    print(f'[LOG] {msg}')

In [2]:
train_path = os.path.join(INPUT_DIR, 'train.csv')
test_path  = os.path.join(INPUT_DIR, 'test.csv')

train = pd.read_csv(train_path).sort_values('date_id').reset_index(drop=True)
test  = pd.read_csv(test_path).sort_values('date_id').reset_index(drop=True)

log(f'train shape={train.shape}, test shape={test.shape}')
has_is_scored = 'is_scored' in test.columns
log(f'is_scored in test? {has_is_scored}')

[LOG] train shape=(8990, 98), test shape=(10, 99)
[LOG] is_scored in test? True


In [3]:
TARGET = 'forward_returns'
TARGETS = ['forward_returns','risk_free_rate','market_forward_excess_returns']

# Strict intersection of non-target columns present in both train and test
feature_cols = [c for c in train.columns if c in test.columns and c not in TARGETS]
log(f'Feature count: {len(feature_cols)}')

# Keep only rows with target for training
train2 = train.dropna(subset=[TARGET]).reset_index(drop=True)

X_all = train2[feature_cols]
y_all = train2[TARGET]
log(f'X_all: {X_all.shape}, y_all: {y_all.shape}')

[LOG] Feature count: 95
[LOG] X_all: (8990, 95), y_all: (8990,)


In [4]:
ridge_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler(with_mean=False)),
    ('model', Ridge(alpha=1.0, random_state=RANDOM_STATE))
])

en_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler(with_mean=False)),
    ('model', ElasticNet(alpha=1e-4, l1_ratio=0.1, random_state=RANDOM_STATE, max_iter=10000))
])

rf_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('model', RandomForestRegressor(n_estimators=300, max_depth=8, random_state=RANDOM_STATE, n_jobs=-1))
])

#  95% train, 5% holdout
n = len(train2)
cut = max(1, int(n*0.95))
Xtr, ytr = train2.iloc[:cut][feature_cols], train2.iloc[:cut][TARGET]
Xho, yho = train2.iloc[cut:][feature_cols], train2.iloc[cut:][TARGET]
log(f'Split shapes -> train={Xtr.shape}, holdout={Xho.shape}')


ridge_pipe.fit(Xtr, ytr)
en_pipe.fit(Xtr, ytr)
rf_pipe.fit(Xtr, ytr)

def rmse(y, p): 
    return np.sqrt(mean_squared_error(y, p))
def score_model(m, Xv, yv):
    p = m.predict(Xv)
    return rmse(yv, p), mean_absolute_error(yv, p)

scores = { 
    'ridge': score_model(ridge_pipe, Xho, yho), 
    'elasticnet': score_model(en_pipe, Xho, yho),
    'rf': score_model(rf_pipe, Xho, yho)
}
log(f'Holdout RMSE/MAE: {scores}')

best_model = ridge_pipe if scores['ridge'][0] <= scores['elasticnet'][0] else en_pipe
chosen = 'ridge' if best_model is ridge_pipe else 'elasticnet'
log(f'Chosen model: {chosen}')

[LOG] Split shapes -> train=(8540, 95), holdout=(450, 95)
[LOG] Holdout RMSE/MAE: {'ridge': (0.009322029894664532, 0.006558694667787884), 'elasticnet': (0.009271981228999667, 0.0065077495709852115), 'rf': (0.009203966168133421, 0.006498452952488181)}
[LOG] Chosen model: elasticnet


In [5]:
Xtest = test[feature_cols]

pred_main = best_model.predict(Xtest)

other = en_pipe if best_model is ridge_pipe else ridge_pipe
pred_other = other.predict(Xtest)
pred = 0.6*pred_main + 0.4*pred_other

submission = pd.DataFrame({
    'date_id': test['date_id'].values,
    'forward_returns': pred
})

if has_is_scored:
    mask = test['is_scored'].values.astype(bool)
    submission = submission[mask]

submission.to_parquet('submission.parquet', index=False)

print(submission.head())
print('Working dir files:', os.listdir(WORK_DIR))


   date_id  forward_returns
0     8980         0.002101
1     8981         0.002599
2     8982         0.002401
3     8983         0.001971
4     8984         0.003098
Working dir files: ['submission.parquet', '__notebook__.ipynb']
