In [1]:
import pandas as pd 
import numpy as np 

In [6]:
import sys 
import pickle 
from pprint import pprint, pformat 
import json 

from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.metrics import mean_absolute_error
from lightgbm import LGBMRegressor
import lightgbm as lgb
import matplotlib.pyplot as plt 
import seaborn as sns 
from timeit import default_timer as timer 
from IPython.display import clear_output 
import optuna 

from optuna.visualization import (plot_optimization_history, plot_param_importances, plot_parallel_coordinate)
sns.set_style("dark")

from itertools import combinations 
import gc 
import plotly.express as px
import joblib

### Data Loading

In [7]:
train = pd.read_csv("/kaggle/input/optiver-trading-at-the-close/train.csv")

reveal_targets = pd.read_csv("/kaggle/input/optiver-trading-at-the-close/example_test_files/revealed_targets.csv")

test = pd.read_csv("/kaggle/input/optiver-trading-at-the-close/example_test_files/test.csv")

sample_submission = pd.read_csv("/kaggle/input/optiver-trading-at-the-close/example_test_files/sample_submission.csv")

In [8]:
train

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,time_id,row_id
0,0,0,0,3180602.69,1,0.999812,13380276.64,,,0.999812,60651.50,1.000026,8493.03,1.000000,-3.029704,0,0_0_0
1,1,0,0,166603.91,-1,0.999896,1642214.25,,,0.999896,3233.04,1.000660,20605.09,1.000000,-5.519986,0,0_0_1
2,2,0,0,302879.87,-1,0.999561,1819368.03,,,0.999403,37956.00,1.000298,18995.00,1.000000,-8.389950,0,0_0_2
3,3,0,0,11917682.27,-1,1.000171,18389745.62,,,0.999999,2324.90,1.000214,479032.40,1.000000,-4.010200,0,0_0_3
4,4,0,0,447549.96,-1,0.999532,17860614.95,,,0.999394,16485.54,1.000016,434.10,1.000000,-7.349849,0,0_0_4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5237975,195,480,540,2440722.89,-1,1.000317,28280361.74,0.999734,0.999734,1.000317,32257.04,1.000434,319862.40,1.000328,2.310276,26454,480_540_195
5237976,196,480,540,349510.47,-1,1.000643,9187699.11,1.000129,1.000386,1.000643,205108.40,1.000900,93393.07,1.000819,-8.220077,26454,480_540_196
5237977,197,480,540,0.00,0,0.995789,12725436.10,0.995789,0.995789,0.995789,16790.66,0.995883,180038.32,0.995797,1.169443,26454,480_540_197
5237978,198,480,540,1000898.84,1,0.999210,94773271.05,0.999210,0.999210,0.998970,125631.72,0.999210,669893.00,0.999008,-1.540184,26454,480_540_198


In [9]:
median_vol = train.groupby('stock_id')['bid_size'].median() + train.groupby('stock_id')['ask_size'].median()

In [12]:
def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum() / 1024**2
    print(f"Memory usage of dataframe is {start_mem:.2f} MB")
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float32)
    end_mem = df.memory_usage().sum() / 1024**2
    print(f"Memory usage after optimization is: {end_mem:.2f} MB")
    decrease = 100 * (start_mem - end_mem) / start_mem
    print(f"Decreased by {decrease:.2f}%")

    return df

In [17]:
def feat_eng(df):
    
    cols = [c for c in df.columns if c not in ['row_id','time_id']]
    df = df[cols]
    
    df['imbalance_buy_flag'] = np.where(df['imbalance_buy_sell_flag'] == 1, 1, 0)
    df['imbalance_sell_flag'] = np.where(df['imbalance_buy_sell_flag'] == 1, 1, 0)
    df['bid_plus_ask_sizes'] = df['bid_size'] + train['ask_size']
    df['median_vol'] = df['stock_id'].map(median_vol.to_dict())
    df['high_volume'] = np.where(df['bid_plus_ask_sizes'] > df['median_vol'], 1,0)
    df['imbalance_ratio'] = df['imbalance_size'] / df['matched_size']
    
    df['imb_s1'] = df.eval('(bid_size-ask_size)/(bid_size+ask_size)')
    df['imb_s2'] = df.eval('(imbalance_size-matched_size)/(matched_size+imbalance_size)')
    
    df['ask_x_size'] = df.eval('(ask_size*ask_price)')
    df['bid_x_size'] = df.eval('(imbalance_size-matched_size)/(matched_size+imbalance_size)')
    
    df['ask_minus_bid'] = df['ask_x_size'] - df['bid_x_size']
    
    df['bid_size_over_ask_size'] = df['bid_size'].div(df['ask_size'])
    df['bid_price_over_ask_price'] = df['bid_price'].div(df['ask_price'])
    
    prices = ['reference_price', 'far_price', 'near_price', 'ask_price', 'bid_price', 'wap']
    
    for c in combinations(prices, 2):
        
        df[f"{c[0]}_minus_{c[1]}"] = (df[f"{c[0]}"] - df[f'{c[1]}'] ).astype(np.float32)
        df[f'{c[0]}_times_{c[1]}'] = (df[f'{c[0]}'] * df[f'{c[1]}'] ).astype(np.float32)
        df[f'{c[0]}_{c[1]}_imb'] = df.eval(f'({c[0]} - {c[1]} ) / ({c[0]} + {c[1]})')
        
    for c in combinations(prices, 3):
        max_ = df[list(c)].max(axis = 1)
        min_ = df[list(c)].min(axis = 1)
        mid_ = df[list(c)].sum(axis = 1) - min_ - max_
        
        df[f'{c[0]}_{c[1]}_{c[2]}_imb2'] = (max_ - mid_)/(mid_ - min_)
    df.drop(columns=['date_id'], inplace = True)
    df = reduce_mem_usage(df)
    gc.collect()
    
    return df

In [18]:
X = feat_eng(train.drop(columns='target'))
y = train['target']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['imbalance_buy_flag'] = np.where(df['imbalance_buy_sell_flag'] == 1, 1, 0)


Memory usage of dataframe is 3037.16 MB
Memory usage after optimization is: 1168.91 MB
Decreased by 61.51%


### Hyperparameter Optimization 

Evaluation via Cross validation with `TimeSeriesSplit`

In [19]:
def cross_validation(model, X, y, cv):
    scores = np.zeros(cv.n_splits)
    
    for i, (train_index, test_index) in enumerate(cv.split(X)):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, shuffle = False, test_size = 0.1)
        start = timer()
        model.fit(X_train, y_train, eval_set=[(X_val, y_val)], callbacks=[lgb.early_stopping(50, verbose=False)])
        end = timer()
        y_pred = model.predict(X_test)
        scores[i] = mean_absolute_error(y_pred, y_test)
        
    return scores

### Evaluation with `train_test_split`

In [20]:
def evaluate_simple(model, X, y, cv):
    X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = False, test_size = 0.2)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, shuffle = False, test_size = 0.1)
    start = timer()
    model.fit(X_train, y_train, eval_set = [(X_val, y_val)], callbacks=[lgb.early_stopping(50,verbose=False)])
    end = timer()
    score = mean_absolute_error(y_pred, y_test)
    return score

In [21]:
def run_optimization(objective, n_trials=100, n_jobs=1):
    """Run the given objective with Optuna and return the study resuts"""
    optuna.logging.set_verbosity(optuna.Warning)
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=n_trials, n_jobs = n_jobs, show_progress_bar = True)
    with open("best_params.json", "w") as f:
        json.dump(study.best_params, f)
    return study

In [23]:
def get_objective_function(evaluations='simple', cv=None, logging_levels='info'):
    """Returns the objective function for optuna"""
    if evaluation == "simple":
        eval_function = evaluate_simple
    else:
        eval_function = cross_validate
    
    def optimize_lgbm(trial):
        """Optimizes a LGBRegressor with cross validation """
        #num_leaves should be smaller than 2*(max_depth)
        max_depth = trial.suggest_int("max_depth", 6, 9)
        num_leaves = trial.suggest_int("num_leaves", 32, int((2**max_depth) * 0.90))
        
        param_space = {
            "boosting" : "gbdt",
            "objective" : trail.suggest_categorical("objective", ['mae']),
            "random_state" : trail.suggest_categorical("random_state", [0]),
            "n_estimators" : trail.suggest_categorical("n_estimators", [600]),
            "reg_alpha" : trail.suggest_float("reg_alpha", 1e-3, 1.0, log=True),
            "reg_lambda" : trial.suggest_float("reg_lambda", 1e-3, 1.0, log=True),
            "learning_rate" : trial.suggest_float("learning_rate", 1e-2, 2e-1, log =True),
            "num_leaves" : num_leaves,
            "max_depth" : max_depth
        }
        model = LGBMRegressor(**param_specs)
        scores = eval_function(model, X, y, cv=cv)
        return scores.mean()
    return optimize_lgbm

In [None]:
m = lgb.LGBMRegressor(objective='mae', n_estimators=600, random_state=0)
m.fit(X,y)