In [1]:
import os
import numpy as np
import pandas as pd
import polars as pl

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

import kaggle_evaluation.default_inference_server

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

INPUT_DIR = '/kaggle/input/hull-tactical-market-prediction'
TARGET = 'forward_returns'
TARGETS = ['forward_returns','risk_free_rate','market_forward_excess_returns']

def log(msg):
    print(f'[LOG] {msg}')

In [2]:
train_path = os.path.join(INPUT_DIR, 'train.csv')
test_path  = os.path.join(INPUT_DIR, 'test.csv')

train = pd.read_csv(train_path).sort_values('date_id').reset_index(drop=True)
test  = pd.read_csv(test_path).sort_values('date_id').reset_index(drop=True)

log(f'train shape={train.shape}, test shape={test.shape}')
has_is_scored = 'is_scored' in test.columns
log(f'is_scored in test? {has_is_scored}')

[LOG] train shape=(8990, 98), test shape=(10, 99)
[LOG] is_scored in test? True


In [3]:
common_feats = [c for c in train.columns if c in test.columns and c not in TARGETS]
feature_cols = common_feats.copy()
log(f'Feature count: {len(feature_cols)}')

train2 = train.dropna(subset=[TARGET]).reset_index(drop=True)
X = train2[feature_cols]
y = train2[TARGET]
log(f'X train shape: {X.shape}')

[LOG] Feature count: 95
[LOG] X train shape: (8990, 95)


In [4]:
ridge_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler(with_mean=False)),
    ('model', Ridge(alpha=1.0, random_state=RANDOM_STATE))
])

en_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler(with_mean=False)),
    ('model', ElasticNet(alpha=1e-4, l1_ratio=0.1, random_state=RANDOM_STATE, max_iter=10000))
])

rf_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('model', RandomForestRegressor(n_estimators=300, max_depth=8, random_state=RANDOM_STATE, n_jobs=-1))
])

ridge_pipe.fit(X, y)
en_pipe.fit(X, y)
rf_pipe.fit(X, y)

#  time split (95/5)
n = len(train2)
cut = max(1, int(n*0.95))
Xtr, ytr = train2.iloc[:cut][feature_cols], train2.iloc[:cut][TARGET]
Xho, yho = train2.iloc[cut:][feature_cols], train2.iloc[cut:][TARGET]

def rmse(y, p): 
    return np.sqrt(mean_squared_error(y, p))
def eval_model(m, Xv, yv):
    p = m.predict(Xv)
    return rmse(yv, p), mean_absolute_error(yv, p)

scores = {}
for name, m in [('ridge', ridge_pipe), ('elasticnet', en_pipe), ('rf', rf_pipe)]:
    r, a = eval_model(m, Xho, yho)
    scores[name] = (r, a)
log(f'Holdout RMSE/MAE: {scores}')

best_model = ridge_pipe if scores['ridge'][0] <= scores['elasticnet'][0] else en_pipe
chosen = 'ridge' if best_model is ridge_pipe else 'elasticnet'
log(f'Chosen model: {chosen}')

[LOG] Holdout RMSE/MAE: {'ridge': (0.009138127076919871, 0.006478569295320791), 'elasticnet': (0.009148017646183906, 0.006474974665647242), 'rf': (0.008933626798736154, 0.006399406331208841)}
[LOG] Chosen model: ridge


In [5]:
trained_cols = feature_cols.copy()

def predict(test: pl.DataFrame):
    batch = test.to_pandas()

    Xb = batch.reindex(columns=trained_cols, fill_value=np.nan)

    preds = best_model.predict(Xb)
    return pl.DataFrame({'forward_returns': preds})


In [6]:
inference_server = kaggle_evaluation.default_inference_server.DefaultInferenceServer(predict)

if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    inference_server.serve()
else:
    inference_server.run_local_gateway((INPUT_DIR,))
