# Analyse Data

## Install Package

## Load package

In [2]:
import numpy as np
import pandas as pd
from glob import glob

from numba import njit
from multiprocessing import Pool, cpu_count

import gc
import pathlib
from tqdm.auto import tqdm
import json
import time
import requests as re
from datetime import datetime
from dateutil.relativedelta import relativedelta, FR

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
from sklearn import model_selection
import joblib
import lightgbm as lgb

# visualize
import matplotlib.pyplot as plt
import matplotlib.style as style
from matplotlib_venn import venn2, venn3
import seaborn as sns
from matplotlib import pyplot
from matplotlib.ticker import ScalarFormatter
sns.set_context("talk")
style.use('seaborn-colorblind')

import warnings
warnings.simplefilter('ignore')

pd.get_option("display.max_columns")

20

In [24]:
pathData = "~/Dev/data/Kaggle/Optiver/"

DEBUG = True
MODE = 'INFERENCE'
MODEL_DIR = '../models'

In [25]:
class CFG:
    INPUT_DIR = "~/Dev/data/Kaggle/Optiver/"
    OUTPUT_DIR = "~/Dev/data/Kaggle/Optiver/"

In [26]:
# Logging is always nice for your experiment:)
def init_logger(log_file='train.log'):
    from logging import getLogger, INFO, FileHandler,  Formatter,  StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=log_file)
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

logger = init_logger(log_file=f'{CFG.OUTPUT_DIR}/baseline.log')
logger.info(f'Start Logging...')

FileNotFoundError: [Errno 2] No such file or directory: '/home/gregory/Dev/kaggle/jupyter/~/Dev/data/Kaggle/Optiver/baseline.log'

In [21]:
pathData + "book_train.parquet/*"

'/home/gregory/Dev/data/Kaggle/Optiver/book_train.parquet/*'

In [22]:
train_targets = pd.read_csv(pathData + "train.csv")
train_targets['row_id'] = train_targets['stock_id'].astype(str) + '-' + train_targets['time_id'].astype(str)
train_targets = train_targets[['row_id','target']].set_index("row_id")
train_files = glob(pathData + "book_train.parquet/*")

In [9]:
book = pd.read_parquet(train_files[0], engine="pyarrow").to_numpy(dtype=np.float32)
book.shape

(1101256, 10)

In [10]:
train_book_stocks = os.listdir(os.path.join(CFG.INPUT_DIR, 'book_train.parquet'))

if DEBUG:
    logger.info('Debug mode: using 3 stocks only')
    train_book_stocks = train_book_stocks[:3]

logger.info('{:,} train book stocks: {}'.format(len(train_book_stocks), train_book_stocks))

Debug mode: using 3 stocks only
3 train book stocks: ['stock_id=115', 'stock_id=124', 'stock_id=100']


In [11]:
column_names = [
    "time_id",           # 0
    "seconds_in_bucket", # 1
    "bid_price1",        # 2
    "ask_price1",        # 3
    "bid_price2",        # 4
    "ask_price2",        # 5
    "bid_size1",         # 6
    "ask_size1",         # 7
    "bid_size2",         # 8
    "ask_size2"          # 9
]

In [12]:
@njit
def fill_array(book_data, filled_data):
    filled_data[0] = book_data[0]
    last_read_idx = 0
    for row_idx in range(1, 600):
        # print(row_idx, last_read_idx, int(book_data[last_read_idx + 1][1]), int(book_data[last_read_idx + 1][1]) == row_idx)
        if int(book_data[last_read_idx + 1][1]) == row_idx:
            last_read_idx += 1
        filled_data[row_idx] = book_data[last_read_idx]
        filled_data[row_idx][1] = row_idx

In [13]:
@njit
def calculate_features(filled_data):
    filled_data = filled_data.transpose()
    
    trade_vols1 = filled_data[6] + filled_data[7]
    trade_vols2 = filled_data[8] + filled_data[9]
    trade_diffs1 = filled_data[7] - filled_data[6]
    trade_diffs2 = filled_data[9] - filled_data[8]
    
    spreads1 = (filled_data[2] / filled_data[3]) - 1
    spreads2 = (filled_data[4] / filled_data[5]) - 1
    
    waps1 = (filled_data[2] * filled_data[7] + filled_data[3] * filled_data[6]) / (filled_data[6] + filled_data[7])
    waps2 = (filled_data[4] * filled_data[9] + filled_data[5] * filled_data[8]) / (filled_data[8] + filled_data[9])
    
    logs1 = np.diff(np.log(waps1))
    logs2 = np.diff(np.log(waps2))
    
    return [
        waps1.mean(), 
        waps2.mean(),
        waps1[300:].mean(),
        waps2[300:].mean(),
        waps1.std(),
        waps2.std(),
        waps1[300:].std(),
        waps2[300:].std(),
        logs1.mean(),
        logs2.mean(),
        logs1[300:].mean(),
        logs2[300:].mean(),
        logs1.std(), # Essentially volatility1
        logs2.std(), # Essentially volatility2
        trade_vols1.mean(),
        trade_vols2.mean(),
        trade_vols1[300:].mean(),
        trade_vols2[300:].mean(),
        trade_diffs1.mean(),
        trade_diffs2.mean(),
        trade_diffs1[300:].mean(),
        trade_diffs2[300:].mean(),
        int(filled_data[0][0])
    ]

In [14]:
@njit
def process_groups(dataset, stock_id):
    ret_lis = []
    last_split_pos = 0
    filled_data = np.zeros((600, 10), dtype=np.float32)
    for split_pos in np.nonzero(np.diff(dataset[:,0]))[0]:
        data_split = dataset[last_split_pos:split_pos]
        fill_array(data_split, filled_data)
        features = calculate_features(filled_data)
        ret_lis.append(features + [stock_id])
        last_split_pos = split_pos
    data_split = dataset[last_split_pos:]
    fill_array(data_split, filled_data)
    features = calculate_features(filled_data)
    ret_lis.append(features + [stock_id])
    return ret_lis

In [15]:
feature_columns = [
    "wap1", "wap2", "wap1l", "wap2l", "wap1_std", "wap2_std", "wap1l_std", "wap2l_std", "log1", "log2", "log1l", "log2l", "vol1", "vol2",
    "volume1", "volume2", "volume1l", "volume2l", "diff1", "diff2", "diff1l", "diff2l", "time_id", "stock_id"
]

In [16]:
def process_single_stock(file_path):
    book = pd.read_parquet(file_path, engine="pyarrow").to_numpy(dtype=np.float32)
    group_features = process_groups(book, int(file_path.split('=')[1]))
    return group_features

In [17]:
def preprocess_data():
    worker_pool = Pool(processes=None)
    full_feature_list_matrix = worker_pool.map(process_single_stock, train_files)
    worker_pool.close()
    worker_pool.join()
    return_feature_list = []
    for feature_list in full_feature_list_matrix:
        return_feature_list += feature_list
        
    return pd.DataFrame(return_feature_list, columns=feature_columns)

In [18]:
%timeit preprocess_data()

29.2 s ± 1.73 s per loop (mean ± std. dev. of 7 runs, 1 loop each)
