In [1]:
import os
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

INPUT_DIR = '/kaggle/input/hull-tactical-market-prediction'
WORK_DIR = '/kaggle/working'

def log(msg):
    print(f'[LOG] {msg}')

In [2]:
train_path = os.path.join(INPUT_DIR, 'train.csv') if os.path.exists(INPUT_DIR) else 'train.csv'
test_path  = os.path.join(INPUT_DIR, 'test.csv')  if os.path.exists(INPUT_DIR) else 'test.csv'

train = pd.read_csv(train_path).sort_values('date_id').reset_index(drop=True)
test  = pd.read_csv(test_path).sort_values('date_id').reset_index(drop=True)

log(f'train shape={train.shape}, test shape={test.shape}')
has_is_scored = 'is_scored' in test.columns
log(f'is_scored in test? {has_is_scored}')
train.head()

[LOG] train shape=(8990, 98), test shape=(10, 99)
[LOG] is_scored in test? True


  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,date_id,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,V3,V4,V5,V6,V7,V8,V9,forward_returns,risk_free_rate,market_forward_excess_returns
0,0,0,0,0,1,1,0,0,0,1,...,,,,,,,,-0.002421,0.000301,-0.003038
1,1,0,0,0,1,1,0,0,0,1,...,,,,,,,,-0.008495,0.000303,-0.009114
2,2,0,0,0,1,0,0,0,0,1,...,,,,,,,,-0.009624,0.000301,-0.010243
3,3,0,0,0,1,0,0,0,0,0,...,,,,,,,,0.004662,0.000299,0.004046
4,4,0,0,0,1,0,0,0,0,0,...,,,,,,,,-0.011686,0.000299,-0.012301


In [3]:
# Mark sets and combine
train['__is_train__'] = 1
test['__is_train__']  = 0
combo = pd.concat([train, test], ignore_index=True, sort=False).sort_values('date_id')

# Historical forward_returns (past-only)
combo['fr_hist_source'] = np.where(combo['__is_train__']==1, combo['forward_returns'], np.nan)
combo['fr_hist'] = combo['fr_hist_source'].ffill()

# Lags of forward_returns 1..5
for lag in range(1, 6):
    combo[f'forward_returns_lag{lag}'] = combo['fr_hist'].shift(lag)

# Historical market_forward_excess_returns (past-only)
combo['mkt_ex_hist_source'] = np.where(combo['__is_train__']==1, combo['market_forward_excess_returns'], np.nan)
combo['mkt_ex_hist'] = combo['mkt_ex_hist_source'].ffill()
for lag in range(1, 6):
    combo[f'market_forward_excess_returns_lag{lag}'] = combo['mkt_ex_hist'].shift(lag)

# Rolling stats (past-only)
combo['fr_roll5_mean']  = combo['fr_hist'].rolling(5).mean()
combo['fr_roll5_std']   = combo['fr_hist'].rolling(5).std()
combo['fr_roll20_mean'] = combo['fr_hist'].rolling(20).mean()
combo['fr_roll20_std']  = combo['fr_hist'].rolling(20).std()

# Split back and drop helpers
drop_helpers = ['__is_train__','fr_hist_source','fr_hist','mkt_ex_hist_source','mkt_ex_hist']
train2 = combo[combo['__is_train__']==1].drop(columns=drop_helpers)
test2  = combo[combo['__is_train__']==0].drop(columns=drop_helpers)

log(f'After features: train2={train2.shape}, test2={test2.shape}')

[LOG] After features: train2=(8990, 116), test2=(10, 116)


In [4]:
TARGET = 'forward_returns'
TARGETS = ['forward_returns','risk_free_rate','market_forward_excess_returns']

# Strict intersection
all_cols = [c for c in train2.columns if c not in TARGETS]
feature_cols = [c for c in all_cols if c in test2.columns]
log(f'Initial intersection feature count: {len(feature_cols)}')

# Fallback to raw common features if intersection is empty
if len(feature_cols) == 0:
    base_train = pd.read_csv(train_path).sort_values('date_id').reset_index(drop=True)
    base_test  = pd.read_csv(test_path).sort_values('date_id').reset_index(drop=True)
    feature_cols = [c for c in base_train.columns if (c in base_test.columns) and (c not in TARGETS)]
    train2 = base_train.copy()
    test2  = base_test.copy()
    log(f'Fallback to raw common features. Count: {len(feature_cols)}')

# Drop rows only if target is missing; do NOT drop by feature NaNs
train2 = train2.dropna(subset=[TARGET]).reset_index(drop=True)
log(f'train2 after dropping only missing target: {train2.shape}')

# Optional: trim long window features if dataset modest
if train2.shape[0] < 3000:
    reduced = [c for c in feature_cols if 'roll20' not in c]
    if len(reduced) != len(feature_cols):
        feature_cols = reduced
        log(f'Removed 20-day rolling features. New feature count: {len(feature_cols)}')

assert len(feature_cols) > 0, 'No features available after fallback.'


[LOG] Initial intersection feature count: 113
[LOG] train2 after dropping only missing target: (8990, 116)


In [5]:
def time_split(df, train_frac=0.95, valid_frac=0.03):
    n = len(df)
    tr_end = max(1, int(n*train_frac))
    va_end = max(tr_end+1, int(n*(train_frac+valid_frac)))
    if va_end >= n:
        va_end = n-1 if n > 2 else n
    return df.iloc[:tr_end], df.iloc[tr_end:va_end], df.iloc[va_end:]

train_df, valid_df, hold_df = time_split(train2, 0.95, 0.03)
log(f'Splits: train={train_df.shape}, valid={valid_df.shape}, holdout={hold_df.shape}')

Xtr, ytr = train_df[feature_cols], train_df[TARGET]
Xva, yva = valid_df[feature_cols], valid_df[TARGET]
Xho, yho = hold_df[feature_cols],  hold_df[TARGET]
log(f'X shapes -> train={Xtr.shape}, valid={Xva.shape}, holdout={Xho.shape}')
assert Xtr.shape[0] > 0 and Xtr.shape[1] > 0, 'Training set empty or no features.'

[LOG] Splits: train=(8540, 116), valid=(270, 116), holdout=(180, 116)
[LOG] X shapes -> train=(8540, 113), valid=(270, 113), holdout=(180, 113)


In [6]:
ridge_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler(with_mean=False)),
    ('model', Ridge(alpha=1.0, random_state=RANDOM_STATE))
])
rf_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('model', RandomForestRegressor(n_estimators=400, max_depth=8, random_state=RANDOM_STATE, n_jobs=-1))
])
en_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler(with_mean=False)),
    ('model', ElasticNet(alpha=1e-4, l1_ratio=0.1, random_state=RANDOM_STATE, max_iter=10000))
])

ridge_pipe.fit(Xtr, ytr); rf_pipe.fit(Xtr, ytr); en_pipe.fit(Xtr, ytr)

def rmse(y, p): 
    return np.sqrt(mean_squared_error(y, p))
def eval_model(model, X, y, split):
    p = model.predict(X)
    return {'split': split, 'rmse': rmse(y, p), 'mae': mean_absolute_error(y, p),
            'sharpe_like': (np.mean(p)/np.std(p)) if np.std(p) > 0 else 0.0}

rows = []
for name, model in [('ridge', ridge_pipe), ('rf', rf_pipe), ('elasticnet', en_pipe)]:
    rows.append({'model': name, **eval_model(model, Xtr, ytr, 'train')})
    rows.append({'model': name, **eval_model(model, Xva, yva, 'valid')})
    rows.append({'model': name, **eval_model(model, Xho, yho, 'holdout')})

scores_df = pd.DataFrame(rows)
scores_df.to_csv('model_scores.csv', index=False)
scores_df

Unnamed: 0,model,split,rmse,mae,sharpe_like
0,ridge,train,0.000353,5.2e-05,0.042335
1,ridge,valid,0.000137,0.00013,0.154588
2,ridge,holdout,0.000205,0.000197,0.067966
3,rf,train,0.005774,0.004438,0.063774
4,rf,valid,0.004982,0.003922,0.230176
5,rf,holdout,0.007904,0.00548,0.004359
6,elasticnet,train,0.000359,7.4e-05,0.042572
7,elasticnet,valid,0.000123,0.000109,0.152077
8,elasticnet,holdout,0.000131,0.00011,0.059047


In [7]:
Xtest = test2[feature_cols]

pred_ridge = ridge_pipe.predict(Xtest)
pred_en    = en_pipe.predict(Xtest)

pred = 0.6*pred_en + 0.4*pred_ridge

submission = pd.DataFrame({
    'date_id': test2['date_id'].values,
    'forward_returns': pred
})

submission.to_parquet('submission.parquet', index=False)

submission.head()

Unnamed: 0,date_id,forward_returns
0,8980,-0.005789
1,8981,-0.007262
2,8982,-0.007278
3,8983,0.008455
4,8984,-0.002738


In [8]:
print('Working dir files:', os.listdir(WORK_DIR))
print(pd.read_csv('model_scores.csv').head())
import pandas as pd
print(pd.read_parquet('submission.parquet').head())

Working dir files: ['submission.parquet', '__notebook__.ipynb', 'model_scores.csv']
   model    split      rmse       mae  sharpe_like
0  ridge    train  0.000353  0.000052     0.042335
1  ridge    valid  0.000137  0.000130     0.154588
2  ridge  holdout  0.000205  0.000197     0.067966
3     rf    train  0.005774  0.004438     0.063774
4     rf    valid  0.004982  0.003922     0.230176
   date_id  forward_returns
0     8980        -0.005789
1     8981        -0.007262
2     8982        -0.007278
3     8983         0.008455
4     8984        -0.002738
