In [2]:
%%time

from IPython.display import display_html, clear_output, Markdown
from gc import collect
import copy, pandas as pd, numpy as np, joblib, ctypes
from os import system, getpid, walk
from psutil import Process
from copy import deepcopy
libc = ctypes.CDLL("libc.so.6")

from tqdm.notebook import tqdm
from pprint import pprint 
from colorama import Fore, Style, init

import lightgbm
from itertools import combinations 
from sklearn.model_selection import KFold

collect()

CPU times: user 1.15 s, sys: 150 ms, total: 1.3 s
Wall time: 1.72 s


30

In [4]:
%%time 

from sklearn.model_selection import (RepeatedStratifiedKFold as RSKF,
                                    StratifiedKFold as SKF,
                                    KFold,
                                    RepeatedKFold as RKF,
                                    cross_val_score)
from lightgbm import log_evaluation, early_stopping, LGBMRegressor as LGBMR
from xgboost import XGBRegressor as XGBR
from catboost import CatBoostRegressor as CBR
from sklearn.ensemble import HistGradientBoostingRegressor as HGBR
from sklearn.metrics import mean_absolute_error as mae, make_scorer

collect()

CPU times: user 404 ms, sys: 64.5 ms, total: 468 ms
Wall time: 937 ms


164

In [6]:
%%time 

def GetMemUsage():
    """
    defines the memory usage across the kernel.
    """
    
    pid = getpid()
    py = Process(pid)
    memory_use = py.memory_info()[0] / 2. * 30
    return f"RAM memory GB usage = {memory_use :.4f}"

from sklearn import set_config
set_config(transform_output = "pandas")

CPU times: user 77 µs, sys: 19 µs, total: 96 µs
Wall time: 102 µs


#### Configuration Class

In [7]:
%%time 

class CFG:
    """
    Configuration class for parameters and CV strategy for tuning and trainig
    use CAPITAL LETTERS when filling in parameters.
    """
    
    # Data preparation:
    version_nb = 5
    test_req = "N"
    test_frac = 0.01
    load_tr_data = "N"
    gpu_switch = "OFF"
    state = 42
    target = 'target'
    
    path = f'/kaggle/input/optiver-memoryreducedatasets/'
    test_path = f'/kaggle/input/optiver-trading-at-the-close/example_test_files/test.csv'
    df_choice = f'XTrIntCmpNewFtre.parquet'
    mdl_path = f'/kaggle/working/BaselineML/'
    inf_path = f'/kaggle/input/optiverbaselinemodels/'
    
    # Model Training:
    methods = ['LGBMR', 'CBR', 'HGBR']
    ML = "N"
    n_splits = 5
    n_repeats = 1
    nbrnd_erly_stp = 100
    mdlcv_mthd = 'KF'
    
    # Ensemble
    ensemble_req = "N"
    enscv_mthd = "KF"
    metric_obj = 'minimize'
    ntrials = 10 if test_req == "Y" else 200
    ens_weights = [0.54, 0.44, 0.02]
    
    # Inference:
    inference_req = "Y"
    
    # Global variables for plotting: 
    grid_specs = {'visible' : True, 'which' : 'both', 'linestyle' : '--',
                 'color' : 'lightgrey', 'linewidth' : 0.75}
    title_specs = {"fontsize" : 9, 'fontweight' : 'bold', 'color' : 'tab:blue'}
    
    print("Configuration done!")
    collect()
    print(GetMemUsage())

Configuration done!
RAM memory GB usage = 4458577920.0000
CPU times: user 300 ms, sys: 478 µs, total: 300 ms
Wall time: 300 ms


#### Cross validation 


In [11]:
%%time 

all_cv = {"KF" : KFold(n_splits = CFG.n_splits, shuffle = True, random_state = CFG.state),
         'RKF' : RKF(n_splits = CFG.n_splits, n_repeats = CFG.n_repeats ,random_state = CFG.state),
         'RSKF' : RSKF(n_splits = CFG.n_splits, n_repeats = CFG.n_repeats, random_state = CFG.state),
         'SKF' : SKF(n_splits = CFG.n_splits, shuffle = True, random_state = CFG.state)}

# Defining the metric
def ScoreMetric(y_true, y_pred) -> float:
    """
    Calculates the metric for the competition 
    y_true = ground truth np.array
    y_pred = predictions
    return = metric value (float)
    """
    return mae(y_true, y_pred)

# Custom scorer for cross_val_predict
myscorer = make_scorer(ScoreMetric, greater_is_better = False, needs_proba = False)

collect()
print(GetMemUsage())

RAM memory GB usage = 4458577920.0000
CPU times: user 164 ms, sys: 401 µs, total: 164 ms
Wall time: 162 ms


#### Conversion and Adjustment

In [12]:
%%time 

def goto_conversion(listOfOdds, total = 1, eps = 1e-6, isAmericanOdds = False):
    
    """
    Function for converting odds to probabilities
    
    The function first checks if the odds are in American format 
    and if so converts them to decimal odds if needed. Then computes the 
    probabilities based on the inverse of the odds. After this calculates the
    standard error (SE) for each probability. Ends with adjusting probabilities
    by stepping back based on the SE to ensure the sum is equal to the `total`.
    
    returns adjusted probabilities.
    =====================================================
    Parameters
    _____________________________________________________
    listOfOdds: list
        a list of odds that is either in American or decimal format
    total: float
        total sum of probabilities
    eps: float
        small value to prevent division by 0
    isAmericanOdds: bool
        A boolean flag indicating whether the input odds are in American format
    
    """
    
    # Converting American odds to Decimal odds 
    if isAmericanOdds:
        for i in range(len(listOfOdds)):
            currOdds = listOfOdds[i]
            isNegativeAmericanOdds = currOdds < 0
            if isNegativeAmericanOdds:
                currDecimalOdds = 1 + (100/(currOdds*-1))
            else:
                # Is non-negative 'merican odds
                currDecimalOdds = 1 + (currOdds/100)
            listOfOdds[i] = currDecimalOdds
        
    # Error catchers 
    if len(listOfOdds) < 2:
        raise ValueError('len(listOfOdds) must be >= 2')
    if any(x < 1 for x in listOfOdds):
        raise ValueError("All odds must be >= 1, set isAmericanOdds parameter to True if using American Odds.")
    
    # Computation:
    # init probabilities using inverse odds
    listOfProbabilities = [1/x for x in listOfOdds]
    
    # compute the standard error (SE) for each proabilitiy
    listOfSe = [pow((x-x**2)/x, 0.5) for x in listOfProbabilities]
    
    # compute how many steps for SE the probabilityies should step back by
    step = (sum(listOfProbabilities) - total)/sum(listOfSe)
    
    outputListOfProbabilities = [min(max(x - (y*step), eps), 1) for x,y in zip(listOfProbabilities, listOfSe)]
    
    return outputListOfProbabilities

def zero_sum(listOfProces, listOfVolumes):
    """
    Adjusts the list of prices and volumes to achieve a zero-sum condition.
    -----------------------------------------------------------------------
    Parameters
    ----------
    listOfPrices: list of the prices
    listOfVolumes: list of volumes corresponding to the prices.
    
    =======================================================================
    Computes Standard Error (SE) for each price based on the volumes.
    Then adjusts the prices by scaling them using the standard errors to achieve
    a zero-sum condition.
    
    returns adjusted prices
    
    """
    # compute standard errors assuming standard deviation is same for all stocks 
    listOfSe = [x**0.5 for x in listOfVolumes]
    step = sum(listOfPrices)/sum(listofSe)
    outputListOfPrices = [x - (y*step) for x,y in zip(listOfPrices,listOfSe)]
    return outputListOfPrices

collect()

CPU times: user 274 ms, sys: 0 ns, total: 274 ms
Wall time: 273 ms


0

### Load and Prepare training 


In [None]:
%%time 

if (CFG.load_tr_data == "Y" or CFG.ML == "Y") and CFG.test_req == 'Y':
    if isinstance(CFG.test_frac, float):
        X = pd.read_parquet(CFG.path + CFG.df_choice).sample(frac = CFG.test_frac)
    else:
        X = pd.read_parquet(CFG.path + CFG.df_choice).sample(n = CFG.test_frac)
    
    y = pd.read_parquet(CFG.path + f"Ytrain.parquet").loc[X.index].squeeze()
    print(f"Sampled train shapes for coding tests = {X.shape} {y.shape}")
    
    X.index, y.index = range(len(X)), range(len(y))
    
    print(X.columns)
    
elif CFG.load_tr_data == "Y" or CFG.ML == "Y":
    X = pd.read_parquet(CFG.path + CFG.df_choice)
    y = pd.read_parquet(CFG.path + "Ytrain.parquet").squeeze()
    
    print(f"Train shapes for code testing = {X.shape} {y.shape}")
    
elif CFG.load_tr_data != "Y" or CFG.inference_req == "Y":
    print(f"Train data is not required since inferring from the model")
    
collect()
libc.malloc_trim(0)

Print(GetMemUsage())

### Initialize model configurations

In [None]:
%%time 

# Initializing Models 

if CFG.ML == "Y":
    Mdl_Master = \
    {
        "CBR" : CBR(**{'task_type' : "GPU" if CFG.gpu_switch == "ON" else "CPU",
                      'objective' : "MAE",
                      'eval_metric' : 'MAE',
                      'bagging_temperature' : 0.5,
                      'colsample_bylevel' : 0.7,
                      'iterations' : 500,
                      'learning_rate' : 0.065,
                      'od_wait' : 25,
                      'max_depth' : 7,
                       '12_leaf_reg' : 1.5,
                       'min_data_in_leaf' : 1000,
                       'random_strength' : 0.65,
                       'verbose' : 0,
                       'use_best_model' : True,
                      }),
        'LGBMR' : LGBMR(**{'device' : 'gpu' if CFG.gpu_switch == "ON" else "cpu",
                           'objective' : 'regression_l1',
                           'boosting_type' : 'gbdt',
                           'random_state' : CFG.state,
                           'colsample_bytree' : 0.7,
                           'subsample' : 0.65,
                           'learning_rate' : 0.065,
                           'max_depth' : 6,
                           'n_estimators' : 500,
                           'num_leaves' : 150,
                           'reg_alpha' : 0.01,
                           'reg_lambda' : 3.25,
                           'verbose' : -1,
            
                        }),
        'XGBR' : XGBR(**{'tree_method' : 'gpu_hist' if CFG.gpu_switch == 'ON' else 'hist',
                         'objective' : 'reg:absoluteerror',
                         'random_state' : CFG.state,
                         'colsample_bytree' : 0.7,
                         'learning_rate' : 0.07,
                         'max_depth' : 6,
                         'n_estimators' : 500,
                         'reg_alpha' : 0.025,
                         'reg_lambda' : 1.75,
                         'min_child_weight' : 1000,
                         'early_stopping_rounds' : CFG.nbrnd_erly_stp
                        }),
        "HGBR" : HGBR( loss = 'squared_error',
                     learning_rate = 0.075,
                     early_stopping = True,
                     max_iter = 200,
                     max_depth = 6,
                     min_samples_leaf = 1500,
                     12_regularization = 1.75,
                     scoring = myscorer,
                     random_state = CFG.state)
    }
    
collect()
print(GetMemUsage())

In [None]:
%%time 

if CFG.ML == "Y":
    # Initialize the models from configuration class:
    methods = CFG.methods
    
    # Initialize a folder to store the trained and fitted models
    system('mkdir BaselineML')
    
    # Init the model path for storage 
    model_path = CFG.mdl_path
    
    # Initializing the CV object:
    cv = all_cv[CFG.mdlcv_mthd]
    
    # Initializing score dataframe 
    Scores = pd.Dataframe(index = range(CFG.n_splits * CFG.n_repeats),
                         columns = methods).fillna(0).astype(np.float32)
    
    FtreImp = pd.DataFrame(index = X.columns, columns = [methods]).fillna(0)
    
collect()
libc.malloc_trim(0)
print(GetMemUsage())

In [None]:
if CFG.ML == "Y":
    print("="*25, "Trainig", "-" 25)
    
    # Initializing CV splitting 
    for fold_nb, (train_idx, val_idx) in tddm(enumerate(cv.split(X,y)),
                                             f"{CFG.mdlcv_mthd} CV {CFG.n_splits} * {CFG.n_repeats}"):
        # creating the CV folds
        X_train = X.iloc[train_idx]
        X_val = X.iloc[val_idx]
        y_train = y.iloc[train_idx]
        y_val = y.iloc[val_idx]
        
        print(f"-------------- Fold {fold_nb} ------------------")
        
        # Fitting the models 
        for method in methods:
            model = Mdl_Master[method]
            if method == "LGBMR":
                model.fit(X_train, y_train,
                         eval_set = [(X_val, y_val)],
                         verbose = 0,
                         eval_metric = 'mae',
                         callbacks = [log_evaluation(0,),
                                     early_stopping(CFG.nbrnd_erly_stp, verbose = False)])
            elif method == "XGBR":
                model.fit(X_train, y_train,
                         eval_set = [(X_val,y_val)],
                         verbose = 0,
                         eval_metric = 'mae')
            elif method == "CBR":
                model.fit(X_train, y_train,
                         eval_set = [(X_val, y_val)],
                         verbose = 0,
                         early_stopping_rounds = CFG.nbrnd_erly_stp)
            else:
                model.fit(X_train, y_train)
                
            joblib.dump(model, CFG.mdl_oath + f"{method}v{CFG.version_nb}Fold{fold_nb}.model")
            
            # Creating OOF scores
            score = ScoreMetric(y_val, model,predict(X_val))
            scores.at[fold_nb, method] = score
            num_space = 6 - len(method)
            print(f"----- {method} {' ' * num_space} OOF = {score:.5f} --------")
            
            del num_space, score
            
            # Collecting Feature importance 
            try: 
                FtreImp[method] = FtreImp[method].values + (model.feature_importances_ / (CFG.n_splits * CFG.n_repeats))
            except:
                pass
            
            collect()
            
        del X_train, y_train, X_val, y_val
        collect()
            
    clear_output()
    print(" Mean OOF Scores across methods")
    display(Scores.mean())
    
    try: FtreImp.to_csv(CFG.mdl_path + f"FtreImp_V{CFG.version_nb}.csv")
    except: pass
    
collect()
libc.malloc_trim(0)
print(GetMemUsage())

In [None]:
%%time 

def MakeFeature(df: pd.DataFrame, prices: list) -> pd.DataFrame:
    """
    The function creates new features using the prices columns. 
    
    ------------------------------------------------------------
    Parameters
    ----------
    df: pd.DataFrame
    prices: prices columns for transformation
    
    -----------------------------------------------------------
    Returns
    --------
    df: pd.DataFrame
        DataFrame with addtional columns after feature engineering
    """
    
    features = ['overall_medvol', "first5min_medvol", "last5min_medvol",
                'seconds_in_bucket', 'imbalance_buy_sell_flag',
                'imbalance_size', 'matched_size', 'bid_size', 'ask_size',
                'reference_price','far_price', 'near_price', 'ask_price', 'bid_price', 'wap',
                'imb_s1', 'imb_s2']
    df['imb_s1'] = df.eval('(bid_size - ask_size) / (bid_size + ask_size)').astype(np.float32)
    df['imb_s2'] = df.eval('(imbalance_size - matched_size) / (matched_size + imbalance_sizes)').astype(np.float32)
    
    for i,a in enumerate(prices):
        for j,b in enumerate(prices):
            if i > j:
                df[f'{a}_{b}_imb'] = df.eval(f"({a} - {b}) / ({a} + {b})")
                features.append(f'{a}_{b}_imb')
    for i,a in enumerate(prices):
        for j,b in enumerate(prices):
            for k,c enumerate(prices):
                 if i>j and j>k:
                    max_ = df[[a,b,c]].max(axis=1);
                    min_ = df[[a,b,c]].min(axis=1);
                    mid_ = df[[a,b,c]].sum(axis=1)-min_-max_

                    df[f'{a}_{b}_{c}_imb2'] = ((max_-mid_)/(mid_-min_)).astype(np.float32);
                    features.append(f'{a}_{b}_{c}_imb2')
    
    return df[features]

collect()

In [None]:
%%time 

# Creating the test envir

if CFG.inference_req == "Y":
    try:
        del X,y
    except:
        pass
    
    prices = ['reference_price', 'far_price','near_price',
             'bid_price','ask_price','wap']
    
    # Making the test envir for inference
    
    import optiver2023
    try:
        env = optiver2023.make_env()
        iter_test = env.iter_test()
        print("Curating the ingerence environment")
    except:
        pass
    
    # collating list of models 
    
    models = []
    
    # loading the models for inference 
    
    if CFG.ML != "Y":
        model_path = CFG.inf_path
        print(f"Loading models from the input data for the kernel -V{CFG.version_nb}")
    elif CFG.ML == "Y":
        model_path = CFG.mdl_path
        print("Loading models from the working directory for the kernel")
        
    model_label = []
    for _, _, filename in walk(model_path):
        model_label.extend(filename)
    
    models = []
    for filename in model_label:
        models.append(joblib.load(model_path+f"{filename}"))
        
    model_label = [m.replace(r".model","") for m in model_label]
    
    model_dict = {l:m for l,m in zip(model_label, models)}
    print("Trained Models")
    print(np.array(model_label))

collect()
libc.malloc_trim(0)
print(GetMemUsage())