In [1]:
# Fetch the project repository and switch the working directory into the checkout;
# the relative paths used later in this notebook (e.g. "data/...") are presumably
# meant to resolve inside it.
!git clone https://github.com/aapetukhov/Commodities-Trading.git
%cd Commodities-Trading

Cloning into 'Commodities-Trading'...
remote: Enumerating objects: 73, done.[K
remote: Counting objects: 100% (73/73), done.[K
remote: Compressing objects: 100% (57/57), done.[K
remote: Total 73 (delta 16), reused 69 (delta 12), pack-reused 0 (from 0)[K
Receiving objects: 100% (73/73), 9.87 MiB | 20.10 MiB/s, done.
Resolving deltas: 100% (16/16), done.
/kaggle/working/Commodities-Trading


In [2]:
# Bring the checkout up to date with the remote (a no-op right after a fresh clone).
!git pull

Already up to date.


In [None]:
import gc
import os
from pathlib import Path
from typing import Tuple

import numpy as np
import pandas as pd
from catboost import CatBoostRanker, Pool
from catboost.utils import get_gpu_device_count

In [None]:
# Global seed, reused for NumPy here and for the model further down.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Input files, all located in the repository's data directory.
DATA_DIR = Path("data")
TRAIN_PATH = DATA_DIR.joinpath("train.csv")
TRAIN_LABELS_PATH = DATA_DIR.joinpath("train_labels.csv")
TARGET_PAIRS_PATH = DATA_DIR.joinpath("target_pairs.csv")
TEST_PATH = DATA_DIR.joinpath("test.csv")

## Метрики из соревнования

In [None]:
# Sentinel value that stands for a missing target in a solution frame.
SOLUTION_NULL_FILLER = -999999


def rank_correlation_sharpe_ratio(merged_df: pd.DataFrame) -> float:
    """Sharpe ratio (mean / population std) of the per-row rank correlations.

    For every row, the ``prediction_<k>`` values are rank-correlated with the
    ``target_<k>`` values over the targets that are non-null in that row.
    Predictions are paired with targets *by name*, so the result does not
    depend on the order of the columns in ``merged_df``.

    Args:
        merged_df: One row per day, holding ``target_*`` columns and the
            corresponding ``prediction_*`` columns.

    Returns:
        ``mean / std(ddof=0)`` of the per-row correlations. Rows whose targets
        or predictions have zero variance (or whose correlation is NaN) are
        skipped. Returns ``0.0`` when ``merged_df`` has no rows or when the
        standard deviation is zero or undefined.

    Raises:
        ValueError: If a row has no non-null target, or if a non-null target
            has no matching ``prediction_*`` column.
    """
    target_cols = [col for col in merged_df.columns if col.startswith('target_')]
    available_predictions = {col for col in merged_df.columns if col.startswith('prediction_')}
    # target_<k> -> prediction_<k>; guarantees both vectors are built in the same order.
    prediction_for = {col: 'prediction_' + col[len('target_'):] for col in target_cols}

    # DataFrame.apply on a frame without rows does not return a Series of scores,
    # so the "nothing to score" case is handled explicitly.
    if len(merged_df) == 0:
        return 0.0
    if not target_cols:
        raise ValueError('No non-null target values found')

    def _compute_rank_correlation(row):
        non_null_targets = [col for col in target_cols if not pd.isnull(row[col])]
        if not non_null_targets:
            raise ValueError('No non-null target values found')
        matching_predictions = [prediction_for[col] for col in non_null_targets]
        missing = [col for col in matching_predictions if col not in available_predictions]
        if missing:
            raise ValueError(f'Missing prediction columns: {missing}')
        # Rows of a mixed-dtype frame arrive as object Series; cast for numeric stats.
        targets = row[non_null_targets].astype(float)
        predictions = row[matching_predictions].astype(float)
        if targets.std(ddof=0) == 0 or predictions.std(ddof=0) == 0:
            # Zero variance on a given day (rare): correlation is undefined, skip the day.
            return np.nan
        return np.corrcoef(
            predictions.rank(method='average'),
            targets.rank(method='average')
        )[0, 1]

    daily_rank_corrs = merged_df.apply(_compute_rank_correlation, axis=1).dropna()
    std_dev = daily_rank_corrs.std(ddof=0)
    if std_dev == 0 or np.isnan(std_dev):
        return 0.0
    sharpe_ratio = daily_rank_corrs.mean() / std_dev
    return float(sharpe_ratio)

def score_like_competition(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str) -> float:
    """Score a submission against a solution with the competition-style metric.

    Both frames must contain ``row_id_column_name`` plus the same ``target_*``
    columns in the same order. The inputs are not modified.

    Args:
        solution: Ground-truth targets; entries equal to ``SOLUTION_NULL_FILLER``
            are treated as missing.
        submission: Predicted values, using the same column names as ``solution``.
        row_id_column_name: Name of the id column removed from both frames
            before scoring.

    Returns:
        The value of ``rank_correlation_sharpe_ratio`` on the combined frame.

    Raises:
        KeyError: If ``row_id_column_name`` is absent from either frame.
        AssertionError: If the two frames do not have identical columns.
    """
    solution = solution.drop(columns=[row_id_column_name])
    submission = submission.drop(columns=[row_id_column_name])
    # Compare as lists: an element-wise Index comparison raises an unrelated
    # error when the two frames have a different number of columns.
    assert list(solution.columns) == list(submission.columns), \
        'solution and submission must have identical columns'
    submission = submission.rename(columns={col: col.replace('target_', 'prediction_') for col in submission.columns})
    # Use NaN rather than None as the replacement value: depending on the pandas
    # version, value=None either forward-fills the matches or turns the columns
    # into object dtype.
    solution = solution.replace(SOLUTION_NULL_FILLER, np.nan)
    return rank_correlation_sharpe_ratio(pd.concat([solution, submission], axis='columns'))

In [None]:
# Number of trailing rows removed from train.csv and train_labels.csv
# (presumably the most recent days, kept aside as a hold-out — confirm).
N_HOLDOUT_ROWS = 90


def _drop_last_rows(df: pd.DataFrame, n_rows: int) -> pd.DataFrame:
    """Return ``df`` without its last ``n_rows`` rows.

    Unlike ``df[:-n_rows]``, ``n_rows == 0`` keeps every row instead of
    returning an empty frame.
    """
    return df.iloc[: max(len(df) - n_rows, 0)]


train = _drop_last_rows(pd.read_csv(TRAIN_PATH), N_HOLDOUT_ROWS)
train_labels = _drop_last_rows(pd.read_csv(TRAIN_LABELS_PATH), N_HOLDOUT_ROWS)
target_pairs = pd.read_csv(TARGET_PAIRS_PATH)

print(train.shape, train_labels.shape, target_pairs.shape)
print("Example features:", train.columns[:10].tolist())
print("Example labels:", train_labels.columns[:10].tolist())
print(target_pairs.head())

Loading CSVs ...
(1827, 558) (1827, 425) (424, 3)
Example features: ['date_id', 'LME_AH_Close', 'LME_CA_Close', 'LME_PB_Close', 'LME_ZS_Close', 'JPX_Gold_Mini_Futures_Open', 'JPX_Gold_Rolling-Spot_Futures_Open', 'JPX_Gold_Standard_Futures_Open', 'JPX_Platinum_Mini_Futures_Open', 'JPX_Platinum_Standard_Futures_Open']
Example labels: ['date_id', 'target_0', 'target_1', 'target_2', 'target_3', 'target_4', 'target_5', 'target_6', 'target_7', 'target_8']
     target  lag                                            pair
0  target_0    1                           US_Stock_VT_adj_close
1  target_1    1            LME_PB_Close - US_Stock_VT_adj_close
2  target_2    1                     LME_CA_Close - LME_ZS_Close
3  target_3    1                     LME_AH_Close - LME_ZS_Close
4  target_4    1  LME_AH_Close - JPX_Gold_Standard_Futures_Close


In [None]:
# Reshape the wide label table (one column per target) into long format:
# one row per (date_id, target column), with missing labels discarded.
label_cols = [name for name in train_labels.columns if name.startswith("target_")]
labels_long = train_labels.melt(
    id_vars="date_id",
    value_vars=label_cols,
    var_name="target_col",
    value_name="y",
)
labels_long = labels_long.dropna(subset=["y"]).reset_index(drop=True)
# Integer id parsed from the "target_<n>" column name.
labels_long = labels_long.assign(
    target_id=lambda frame: frame["target_col"].str.replace("target_", "", regex=False).astype(int)
)

In [None]:
# Attach the per-target metadata from target_pairs to every label row.
meta = target_pairs.rename(columns={"target": "target_col"})

# Remove metadata columns left over from an earlier run of this cell. Without this,
# re-running the cell merges them a second time, producing suffixed duplicates
# (e.g. `lag_x`, `pair_y`) and a KeyError on "pair" below.
meta_value_cols = [col for col in meta.columns if col != "target_col"]
labels_long = (
    labels_long
    .drop(columns=meta_value_cols, errors="ignore")
    # many_to_one: raise instead of silently duplicating label rows if a target
    # appears more than once in the metadata.
    .merge(meta, on="target_col", how="left", validate="many_to_one")
)

# Categorical views of the pair description and the target id.
labels_long["pair"] = labels_long["pair"].astype("category")
labels_long["target_id_str"] = labels_long["target_id"].astype("category")

In [None]:
# Join the feature row of each date onto every label row of that date.
train_feats = train.copy()
Xy = pd.merge(labels_long, train_feats, on="date_id", how="left")

Для CatBoost важно, чтобы объекты одной группы шли в данных подряд, поэтому сортируем строки по id группы (`date_id`).

In [None]:
# Chronological split over distinct dates: the first 80% of date_ids go to train,
# the remaining 20% to validation.
unique_dates = np.sort(Xy["date_id"].unique())
split_idx = int(0.8 * len(unique_dates))
train_dates, val_dates = set(unique_dates[:split_idx]), set(unique_dates[split_idx:])

Xy["is_val"] = Xy["date_id"].isin(val_dates)
val_mask = Xy["is_val"]

# IMPORTANT: both splits are sorted by "date_id", which is used as the group id later on.
train_df = Xy.loc[~val_mask].copy().sort_values(by="date_id")
val_df = Xy.loc[val_mask].copy().sort_values(by="date_id")

print("Train rows:", len(train_df), "Val rows:", len(val_df))
print("Train days:", train_df['date_id'].nunique(), "Val days:", val_df['date_id'].nunique())

Train rows: 554433 Val rows: 138528
Train days: 1461 Val days: 366


In [None]:
# Labels and group ids (one group per date_id) for both splits.
y_tr, y_va = train_df["y"].values, val_df["y"].values
group_tr, group_va = train_df["date_id"].values, val_df["date_id"].values

# Model inputs: target metadata first, then every column of `train` except the date key.
feature_cols_market = [name for name in train.columns if name != "date_id"]
feature_cols = ["lag", "pair", "target_id_str", *feature_cols_market]

X_tr, X_va = train_df[feature_cols], val_df[feature_cols]

# Positions of the categorical columns within feature_cols.
cat_features = ["pair", "target_id_str"]
cat_feature_indices = list(map(feature_cols.index, cat_features))

## Делаем пулы

In [None]:
def _make_pool(features, labels, groups):
    """Build a CatBoost Pool for one split, using the shared categorical feature indices."""
    return Pool(
        data=features,
        label=labels,
        group_id=groups,
        cat_features=cat_feature_indices,
    )


train_pool = _make_pool(X_tr, y_tr, group_tr)
val_pool = _make_pool(X_va, y_va, group_va)

## Учим ранкер

In [None]:
# Train on GPU when CatBoost reports a usable device (e.g. on Kaggle); otherwise fall
# back to CPU so the notebook still runs on machines without CUDA.
TASK_TYPE = "GPU" if get_gpu_device_count() > 0 else "CPU"

params = dict(
    loss_function="PairLogit",
    iterations=800,
    learning_rate=0.05,
    depth=8,
    l2_leaf_reg=3.0,
    random_seed=RANDOM_STATE,
    # eval_metric is not set, so CatBoost's default is used; "NDCG" is an alternative to try.
    early_stopping_rounds=200,
    verbose=10,
    task_type=TASK_TYPE,
)

model = CatBoostRanker(**params)

In [14]:
# Fit the ranker on the training pool, using the validation pool as the evaluation set.
# use_best_model=True is passed so that, per CatBoost's documented behaviour, the fitted
# model is cut back to the iteration with the best evaluation score (confirm for the
# installed version).
model.fit(train_pool, eval_set=val_pool, use_best_model=True)

Groupwise loss function. OneHotMaxSize set to 10
0:	learn: 0.6927017	test: 0.6933075	best: 0.6933075 (0)	total: 5.73s	remaining: 1h 16m 20s
10:	learn: 0.6930866	test: 0.6931595	best: 0.6931595 (10)	total: 28.6s	remaining: 34m 11s
20:	learn: 0.6928861	test: 0.6931739	best: 0.6931279 (14)	total: 52s	remaining: 32m 8s
30:	learn: 0.6925822	test: 0.6931716	best: 0.6931279 (14)	total: 1m 14s	remaining: 30m 58s
40:	learn: 0.6923206	test: 0.6932206	best: 0.6931279 (14)	total: 1m 38s	remaining: 30m 18s
50:	learn: 0.6921264	test: 0.6932481	best: 0.6931279 (14)	total: 2m 1s	remaining: 29m 40s
60:	learn: 0.6919653	test: 0.6932959	best: 0.6931279 (14)	total: 2m 24s	remaining: 29m 10s
70:	learn: 0.6917660	test: 0.6932986	best: 0.6931279 (14)	total: 2m 47s	remaining: 28m 40s
80:	learn: 0.6915816	test: 0.6933097	best: 0.6931279 (14)	total: 3m 11s	remaining: 28m 15s
90:	learn: 0.6914179	test: 0.6933164	best: 0.6931279 (14)	total: 3m 34s	remaining: 27m 49s
100:	learn: 0.6912592	test: 0.6933231	best: 0.6

<catboost.core.CatBoostRanker at 0x7d171a933810>

## Валидация

In [None]:
# Score the validation split with the competition-style metric.
val_df = val_df.copy()
# val_pool was built from val_df, so predictions come back in val_df's row order.
val_df["pred"] = model.predict(val_pool)


def _pivot_to_wide(long_df: pd.DataFrame, value_col: str) -> pd.DataFrame:
    """Reshape long rows into one row per date_id with one column per target_col value."""
    return (
        long_df
        .pivot(index="date_id", columns="target_col", values=value_col)
        .reset_index()
        .rename_axis(None, axis=1)
    )


# Both frames are pivoted from the same rows, so they share the same columns and row order.
pred_wide = _pivot_to_wide(val_df, "pred")
sol_wide = _pivot_to_wide(val_df, "y")

val_score = score_like_competition(sol_wide, pred_wide, row_id_column_name="date_id")
print(f"Sharpe: {val_score:.5f}")

Validation competition-like score (Sharpe of daily rank corr): 0.08092


In [None]:
# Feature importances computed by the model on the training pool, highest first.
importances = model.get_feature_importance(train_pool, type="FeatureImportance")
fi = pd.DataFrame({"feature": feature_cols, "importance": importances})
fi = fi.sort_values("importance", ascending=False)

print("Top 25 features:")
print(fi.head(25))

Top 25 features:
                                     feature  importance
1                                       pair    0.000042
2                              target_id_str    0.000026
0                                        lag    0.000020
530                                FX_GBPAUD    0.000010
5                               LME_PB_Close    0.000009
554                                FX_ZAREUR    0.000008
451                  US_Stock_EOG_adj_volume    0.000005
17        JPX_Platinum_Standard_Futures_High    0.000005
105                    US_Stock_RIO_adj_open    0.000005
512                  US_Stock_VWO_adj_volume    0.000005
191                    US_Stock_MPC_adj_high    0.000005
179                    US_Stock_HES_adj_high    0.000004
376                   US_Stock_KGC_adj_close    0.000004
166                    US_Stock_EOG_adj_high    0.000003
378                   US_Stock_LQD_adj_close    0.000003
507                  US_Stock_VGK_adj_volume    0.000003
360           

In [None]:
# Persist the trained ranker to MODEL_PATH (a path relative to the current working
# directory) and report where it was written.
MODEL_PATH = Path("catboost_ranker_pairlogit.cbm")
model.save_model(MODEL_PATH)
print("Saved model to", MODEL_PATH)

Saved model to catboost_ranker_pairlogit.cbm


In [None]:
# Drop the references to both CatBoost pools (no later cell visible here uses them)
# and trigger a garbage-collection pass.
del train_pool, val_pool
gc.collect()

0