In [1]:
!mkdir oof models

mkdir: cannot create directory ‘oof’: File exists
mkdir: cannot create directory ‘models’: File exists


In [2]:
import random
import numpy as np
import os

import pandas as pd
import polars as pl

from pathlib import Path

import warnings
warnings.filterwarnings("ignore")

import lightgbm as lgb
import xgboost as xgb
from catboost import Pool, CatBoostRegressor

import pickle
import gc


# import scipy as sp
# from glob import glob
# import joblib
# import itertools
# from tqdm.auto import tqdm
# import torch
# from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, GroupKFold
# from sklearn.metrics import log_loss, roc_auc_score, matthews_corrcoef, f1_score
# from sklearn.metrics import mean_squared_error, r2_score
# from sklearn.preprocessing import LabelEncoder

In [3]:
class CFG:
    VER = 1
    DATA_PATH = Path("/kaggle/input/mitsui-commodity-prediction-challenge")
    SEED = 42
    N_SPLIT = 5
    METHOD_LIST = ["lightgbm", "xgboost", "catboost"]
    MODEL_DATA_PATH = Path("./models")
    OOF_DATA_PATH = Path("./oof")

    # USE_GPU = torch.cuda.is_available()
    # metric = 'rmse'
    # metric_maximize_flag = False

    
    num_boost_round = 2500
    early_stopping_round = 10
    verbose = 50

    # https://lightgbm.readthedocs.io/en/latest/Parameters.html
    regression_lgb_params = {
        "objective": "regression",
        "metric": "rmse",
        "learning_rate": 0.005, # default = 0.1
        "num_leaves": 6, # default = 31
        "seed": SEED      
    }

    regression_xgb_params = {
        "objective": "reg:squarederror",
        "eval_metric": "rmse",
        'learning_rate': 0.005, 
        'max_depth': 4,
        'random_state': SEED,
    }

    regression_cat_params = {
        'loss_function': 'RMSE',
        'learning_rate': 0.005, 
        'iterations': num_boost_round, 
        'depth': 4, 
        'random_seed': SEED,
    }
    
    
    PREFIX = f"seed{SEED}_ver{VER}"

In [4]:
def seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)

seed(CFG.SEED)

### The competition's metric
- is a variant of the Sharpe ratio, computed by dividing the mean Spearman rank correlation between the predictions and targets by the standard deviation. 

In [5]:


SOLUTION_NULL_FILLER = -999999


def rank_correlation_sharpe_ratio(merged_df: pd.DataFrame) -> float:
    """
    Calculates the rank correlation between predictions and target values,
    and returns its Sharpe ratio (mean / standard deviation).

    :param merged_df: DataFrame containing prediction columns (starting with 'prediction_')
                      and target columns (starting with 'target_')
    :return: Sharpe ratio of the rank correlation
    :raises ZeroDivisionError: If the standard deviation is zero
    """
    prediction_cols = [col for col in merged_df.columns if col.startswith('prediction_')]
    target_cols = [col for col in merged_df.columns if col.startswith('target_')]

    def _compute_rank_correlation(row):
        non_null_targets = [col for col in target_cols if not pd.isnull(row[col])]
        matching_predictions = [col for col in prediction_cols if col.replace('prediction', 'target') in non_null_targets]
        if not non_null_targets:
            raise ValueError('No non-null target values found')
        if row[non_null_targets].std(ddof=0) == 0 or row[matching_predictions].std(ddof=0) == 0:
            raise ZeroDivisionError('Denominator is zero, unable to compute rank correlation.')
        return np.corrcoef(row[matching_predictions].rank(method='average'), row[non_null_targets].rank(method='average'))[0, 1]

    daily_rank_corrs = merged_df.apply(_compute_rank_correlation, axis=1)
    std_dev = daily_rank_corrs.std(ddof=0)
    if std_dev == 0:
        raise ZeroDivisionError('Denominator is zero, unable to compute Sharpe ratio.')
    sharpe_ratio = daily_rank_corrs.mean() / std_dev
    return float(sharpe_ratio)


def score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str) -> float:
    """
    Calculates the rank correlation between predictions and target values,
    and returns its Sharpe ratio (mean / standard deviation).
    """
    del solution[row_id_column_name]
    del submission[row_id_column_name]
    assert all(solution.columns == submission.columns)

    submission = submission.rename(columns={col: col.replace('target_', 'prediction_') for col in submission.columns})

    # Not all securities trade on all dates, but solution files cannot contain nulls.
    # The filler value allows us to handle trading halts, holidays, & delistings.
    solution = solution.replace(SOLUTION_NULL_FILLER, None)
    return rank_correlation_sharpe_ratio(pd.concat([solution, submission], axis='columns'))

### difine model training

In [6]:
def lightgbm_training(x_train: pd.DataFrame, y_train: pd.DataFrame, 
                      x_valid: pd.DataFrame, y_valid: pd.DataFrame):
    
    lgb_train = lgb.Dataset(x_train, y_train)
    lgb_valid = lgb.Dataset(x_valid, y_valid)

    # https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.train.html
    model = lgb.train(
        params = CFG.regression_lgb_params,
        train_set = lgb_train,
        valid_sets = [lgb_train, lgb_valid],
        num_boost_round = CFG.num_boost_round,
        callbacks = [
            lgb.early_stopping(stopping_rounds = CFG.early_stopping_round, verbose = CFG.verbose),
            lgb.log_evaluation(period = CFG.verbose),
        ]
    )
    
    # Predict validation
    valid_pred = model.predict(x_valid)
    
    return model, valid_pred

# https://xgboost.readthedocs.io/en/stable/python/python_api.html
def xgboost_training(x_train: pd.DataFrame, y_train: pd.DataFrame,
                    x_valid: pd.DataFrame, y_valid: pd.DataFrame):
    
    xgb_train = xgb.DMatrix(data = x_train, label = y_train)
    xgb_valid = xgb.DMatrix(data = x_valid, label = y_valid)
    
    model = xgb.train(
        params = CFG.regression_xgb_params,
        dtrain = xgb_train,
        num_boost_round = CFG.num_boost_round,
        evals = [(xgb_train, "train"), (xgb_valid, "eval")],
        early_stopping_rounds = CFG.early_stopping_round,
        verbose_eval = CFG.verbose,
    )
    
    # Predict validation
    valid_pred = model.predict(xgb.DMatrix(x_valid))
    return model, valid_pred

def catboost_training(x_train: pd.DataFrame, y_train: pd.DataFrame,
                      x_valid: pd.DataFrame, y_valid: pd.DataFrame):
    # initialize Pool
    cat_train = Pool(data = x_train, label = y_train)
    cat_valid = Pool(data = x_valid, label = y_valid)
    # specify the training parameters
    model = CatBoostRegressor(**CFG.regression_cat_params)
    # train the model
    model.fit(
        cat_train,
        eval_set = [cat_valid],
        early_stopping_rounds = CFG.early_stopping_round,
        verbose = CFG.verbose,
        use_best_model = True,
    )
    valid_pred = model.predict(x_valid)
    return model, valid_pred

def gradient_boosting_model_cv_training(method: str, train_df: pd.DataFrame, features: list,
                                        target_cols: list):
    # Create a numpy array to store out of folds predictions
    oof_predictions = np.zeros(len(train_df))
    oof_fold = np.zeros(len(train_df))
    for fold in range(CFG.N_SPLIT):
        print("-" * 50)
        print(f"{method} training fold {fold + 1}")
        x_train = train_df[train_df["cv_flag"]!=fold + 1][features]
        y_train = train_df[train_df["cv_flag"]!=fold + 1]["target"]
        
        
        valid_df = train_df[train_df["cv_flag"] == fold + 1].copy()
        x_valid = valid_df[features]
        y_valid = valid_df['target']
        
        if method == "lightgbm":
            model, valid_pred = lightgbm_training(x_train, y_train, x_valid, y_valid)
            ## 2. feature importance
            importance_df = pd.DataFrame(model.feature_importance(), index = features,
                                        columns = ["feature_importance"]).reset_index()
            importance_df.to_csv(CFG.MODEL_DATA_PATH/f'{method}_fold{fold + 1}_{CFG.PREFIX}_importance.csv', index=False)
            
        if method == "xgboost":
            model, valid_pred = xgboost_training(x_train, y_train, x_valid, y_valid)
            
        if method == "catboost":
            model, valid_pred = catboost_training(x_train, y_train, x_valid, y_valid)
            
        
        # saving bast model
        pickle.dump(model, open(CFG.MODEL_DATA_PATH/f"{method}_fold{fold + 1}_{CFG.PREFIX}.pkl", "wb"))
        
        # Add to out of folds array
        oof_predictions[train_df["cv_flag"] == fold + 1] = valid_pred
        del x_train, y_train, valid_df, x_valid, y_valid, valid_pred, model
        gc.collect()
        
    train_df["pred"] = oof_predictions
    
    # creating a dataframe to store out of folds predictions
    np.save(CFG.OOF_DATA_PATH/f"oof_{method}_{CFG.PREFIX}", oof_predictions)
            
        

            
            

# loading data

In [7]:
train_df = pl.read_csv(CFG.DATA_PATH/"train.csv").to_pandas()
train_df

Unnamed: 0,date_id,LME_AH_Close,LME_CA_Close,LME_PB_Close,LME_ZS_Close,JPX_Gold_Mini_Futures_Open,JPX_Gold_Rolling-Spot_Futures_Open,JPX_Gold_Standard_Futures_Open,JPX_Platinum_Mini_Futures_Open,JPX_Platinum_Standard_Futures_Open,...,FX_GBPCAD,FX_CADCHF,FX_NZDCAD,FX_NZDCHF,FX_ZAREUR,FX_NOKGBP,FX_NOKCHF,FX_ZARCHF,FX_NOKJPY,FX_ZARGBP
0,0,2264.5,7205.0,2570.0,3349.0,,,,,,...,1.699987,0.776874,0.888115,0.689954,0.066653,0.090582,0.119630,0.078135,13.822740,0.059163
1,1,2228.0,7147.0,2579.0,3327.0,,,,,,...,1.695279,0.778682,0.889488,0.692628,0.067354,0.091297,0.120520,0.079066,13.888146,0.059895
2,2,2250.0,7188.5,2587.0,3362.0,4684.0,4691.0,4684.0,3363.0,3367.0,...,1.692724,0.780186,0.894004,0.697490,0.067394,0.091478,0.120809,0.079287,13.983675,0.060037
3,3,2202.5,7121.0,2540.0,3354.0,4728.0,4737.0,4729.0,3430.0,3426.0,...,1.683111,0.785329,0.889439,0.698502,0.067639,0.091558,0.121021,0.079285,14.035571,0.059983
4,4,2175.0,7125.0,2604.0,3386.0,,,,,,...,1.684816,0.787264,0.891042,0.701485,0.067443,0.091266,0.121055,0.078925,14.013760,0.059503
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1912,1912,2450.0,9523.5,1961.5,2676.5,15086.0,15440.0,15085.0,4461.5,4467.0,...,1.864661,0.598318,0.827529,0.495125,0.049224,0.072574,0.080968,0.046175,14.058107,0.041388
1913,1913,2471.5,9519.5,1980.5,2710.5,15165.0,15509.0,15162.0,4495.0,4490.0,...,1.863539,0.594400,0.824390,0.490018,0.049409,0.072828,0.080671,0.046113,14.082236,0.041630
1914,1914,2471.5,9533.5,1974.0,2693.0,15040.0,15477.0,15044.0,4544.5,4555.0,...,1.860067,0.595250,0.822392,0.489529,0.049095,0.073232,0.081083,0.045901,14.126606,0.041457
1915,1915,2456.0,9500.5,1970.0,2697.5,15420.0,15752.0,15420.0,4670.0,4685.0,...,1.859624,0.597780,0.817224,0.488520,0.049205,0.073018,0.081170,0.045987,14.095322,0.041368


In [8]:
train_df = pl.read_csv(CFG.DATA_PATH/"train.csv").to_pandas()
train_df["cv_flag"] = pd.qcut(train_df.index, CFG.N_SPLIT, labels = False) + 1

train_labels_df = pl.read_csv(CFG.DATA_PATH /"train_labels.csv").to_pandas()

original_features = list(train_df.columns[1:]) # except "date_id"

target_cols = list(train_labels_df.columns[1:]) # except "date_id"

# combining train_df + targets
training_df = []
for i, target_col in enumerate(target_cols):
    temp_train_df = train_df.copy()
    temp_train_df["target_id"] = i
    
    # if i == 2:
    #      break

    y = train_labels_df[target_col].values # date_id順のターゲット値
    temp_train_df["target"] = y
    #print(np.isinf(y).sum())
    #print((np.abs(y) > 1e10).sum())
    mask = ~(np.isnan(y) | np.isinf(y) | (np.abs(y) > 1e10))
    training_df.append(temp_train_df[mask].copy())
training_df = pd.concat(training_df).reset_index(drop = True) # 元のidxは保持しない

### MODEL TRAINING

In [9]:
for method in CFG.METHOD_LIST:
    gradient_boosting_model_cv_training(method, training_df.copy(),
                                        original_features + ['target_id'], target_cols)

--------------------------------------------------
lightgbm training fold 1
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.130148 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 141011
[LightGBM] [Info] Number of data points in the train set: 580900, number of used features: 554
[LightGBM] [Info] Start training from score -0.000043
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[1]	training's rmse: 0.0331328	valid_1's rmse: 0.0255751
--------------------------------------------------
lightgbm training fold 2
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.160089 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 142221
[LightGBM] [Info] Number of data points in the train set: 580627, number of used features: 559
[LightGBM] [Info] Start training from score -0.000037
T

### feature_importance

In [10]:
pd.read_csv("/kaggle/working/models/lightgbm_fold3_seed42_ver1_importance.csv")

Unnamed: 0,index,feature_importance
0,LME_AH_Close,0
1,LME_CA_Close,0
2,LME_PB_Close,0
3,LME_ZS_Close,0
4,JPX_Gold_Mini_Futures_Open,0
...,...,...
554,FX_ZARCHF,0
555,FX_NOKJPY,0
556,FX_ZARGBP,0
557,cv_flag,0


## output / variable image

In [11]:
y

array([ 0.02730989,  0.02094043,  0.00170587, ..., -0.12768791,
       -0.01218726,         nan])

In [12]:
# /tmp/ipykernel_36/1471824027.py:23: RuntimeWarning: invalid value encountered in greater
#  mask = ~(np.isnan(y) | np.isinf(y) | (np.abs(y) > 1e10))

np.isnan(y).sum() # 189
np.isinf(y).sum() # 0
(np.abs(y) > 1e10).sum() # 0

0

In [13]:
mask = ~(np.isnan(y) | np.isinf(y) | (np.abs(y) > 1e10))
mask

array([ True,  True,  True, ...,  True,  True, False])