In [1]:
from IPython.core.display import display, HTML
from scipy.special import gamma
import pandas as pd
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob
import os
import gc

from joblib import Parallel, delayed

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.metrics import r2_score

import matplotlib.pyplot as plt 
import seaborn as sns
import numpy.matlib


path_submissions = '/'

target_name = 'target'
scores_folds = {}

In [2]:
epsilon = np.finfo(np.float32).eps

In [3]:
def num_positive(series):
    return (series > 0).sum()

def num_negative(series):
    return (series < 0).sum()

def pos_neg_ratio(series):
    if num_negative(series) == 0:
        return num_positive(series)
    else:
        return num_positive(series) / num_negative(series)

In [4]:
def realized_quarticity(series):
    return np.sum(series**4)*series.shape[0]/3

def realized_quadpower_quarticity(series):
    series = abs(series.rolling(window=4).apply(np.product, raw=True))
    return (np.sum(series) * series.shape[0] * (np.pi**2))/4

from scipy.special import gamma
def realized_tripower_quarticity(series):
    series = series ** (4/3)
    series = abs(series).rolling(window=3).apply(np.prod, raw=True)
    return series.shape[0]*0.25*((gamma(1/2)**3)/(gamma(7/6)**3))*np.sum(series)

In [5]:
def realized_1(series):
    return np.sqrt(np.sum(series**4)/(6*np.sum(series**2)))

def realized_2(series):
    return np.sqrt(((np.pi**2)*np.sum(abs(series.rolling(window=4).apply(np.product, raw=True))))/(8*np.sum(series**2)))

def realized_3(series):
    numerator = (gamma(1/2)**3)*np.sum((abs(series)**(4/3)).rolling(window=3).apply(np.prod))
    denominator = 8 * (gamma(7/6)**3)*np.sum(series**2)
    return np.sqrt(numerator/denominator)

In [6]:
# data directory
data_dir = '../input/optiver-realized-volatility-prediction/'

# Function to calculate first WAP
def calc_wap1(df):
    wap = (df['bid_price1'] * df['ask_size1'] + df['ask_price1'] * df['bid_size1']) / (df['bid_size1'] + df['ask_size1'])
    return wap

# Function to calculate second WAP
def calc_wap2(df):
    wap = (df['bid_price2'] * df['ask_size2'] + df['ask_price2'] * df['bid_size2']) / (df['bid_size2'] + df['ask_size2'])
    return wap

def calc_wap3(df):
    wap = (df['bid_price1'] * df['bid_size1'] + df['ask_price1'] * df['ask_size1']) / (df['bid_size1'] + df['ask_size1'])
    return wap

def calc_wap4(df):
    wap = (df['bid_price2'] * df['bid_size2'] + df['ask_price2'] * df['ask_size2']) / (df['bid_size2'] + df['ask_size2'])
    return wap

# Function to calculate the log of the return
# Remember that logb(x / y) = logb(x) - logb(y)
def log_return(series):
    return np.log(series).diff()

# Calculate the realized volatility
def realized_volatility(series):
    return np.sqrt(np.sum(series**2))

# Function to count unique elements of a series
def count_unique(series):
    return len(np.unique(series))

# Function to read our base train and test set
def read_train_test():
    train = pd.read_csv('../input/optiver-realized-volatility-prediction/train.csv')
    test = pd.read_csv('../input/optiver-realized-volatility-prediction/test.csv')
    # Create a key to merge with book and trade data
    train['row_id'] = train['stock_id'].astype(str) + '-' + train['time_id'].astype(str)
    test['row_id'] = test['stock_id'].astype(str) + '-' + test['time_id'].astype(str)
    print(f'Our training set has {train.shape[0]} rows')
    return train, test

# Function to preprocess book data (for each stock id)
def book_preprocessor(file_path, forward_fill = False):
    df = pd.read_parquet(file_path)
    
    if forward_fill:
        df = df.set_index(['time_id', 'seconds_in_bucket'])
        df = df.reindex(pd.MultiIndex.from_product([df_book.index.levels[0], np.arange(0, 600)], names=['time_id', 'seconds_in_bucket']), method='ffill')
        df.reset_index(inplace=True)
    df['idle_sum'] = df.groupby('time_id')['seconds_in_bucket'].transform(lambda x: x.diff())
    df['idle_sum'].fillna(0, inplace = True)
    
    df['total_spread'] = (df['ask_size1'] * df['ask_price1']) - (df['bid_size1'] * df['bid_size2'])
    #df['market_depth'] = df.groupby([''])
    
    #df['ask_diff'] = df.groupby('time_id')['ask_size1'].transform(lambda x: x.diff()).fillna(0)
    #df['bid_diff'] = df.groupby('time_id')['bid_size1'].transform(lambda x: x.diff()).fillna(0)
    df['ask_gradient'] = df['ask_size1'].diff().fillna(0) / df['idle_sum'].shift(-1)
    df['bid_gradient'] = df['bid_size1'].diff().fillna(0) / df['idle_sum'].shift(-1)
    df['ask_gradient'].fillna(0, inplace = True)
    df['bid_gradient'].fillna(0, inplace = True)
    # Calculate Wap
    df['wap1'] = calc_wap1(df)
    df['wap2'] = calc_wap2(df)
    #df['wap3'] = calc_wap3(df)
    #df['wap4'] = calc_wap4(df)
    # Calculate log returns
    df['log_return1'] = df.groupby(['time_id'])['wap1'].apply(log_return)
    df['log_return2'] = df.groupby(['time_id'])['wap2'].apply(log_return)
    #df['log_return3'] = df.groupby(['time_id'])['wap3'].apply(log_return)
    #df['log_return4'] = df.groupby(['time_id'])['wap4'].apply(log_return)
    # Calculate wap balance
    df['wap_balance'] = abs(df['wap1'] - df['wap2'])
    df['wap_ratio'] = abs(df['wap1'] / df['wap2'])
    # Calculate spread
    df['price_spread'] = (df['ask_price1'] - df['bid_price1']) / ((df['ask_price1'] + df['bid_price1']) / 2)
    df['price_spread2'] = (df['ask_price2'] - df['bid_price2']) / ((df['ask_price2'] + df['bid_price2']) / 2)
    df['bid_spread'] = df['bid_price1'] - df['bid_price2']
    df['ask_spread'] = df['ask_price1'] - df['ask_price2']
    df["bid_ask_spread"] = abs(df['bid_spread'] - df['ask_spread'])
    df['total_volume'] = (df['ask_size1'] + df['ask_size2']) + (df['bid_size1'] + df['bid_size2'])
    df['volume_imbalance'] = abs((df['ask_size1'] + df['ask_size2']) - (df['bid_size1'] + df['bid_size2']))
    
    # Dict for aggregations
    create_feature_dict = {
        #'wap1': [np.sum, np.std],
        #'wap2': [np.sum, np.std],
        #'ask_gradient':[np.var],
        #'bid_gradient':[np.var],
        'idle_sum':[np.mean, np.sum],
        'wap_ratio':[np.mean, np.var],
        'log_return1': [realized_volatility],
        'log_return2': [realized_volatility],
        'wap_balance': [np.mean],
        'wap_ratio': [np.mean],
        'price_spread':[np.mean, np.var],
        'price_spread2':[np.mean],
        'bid_spread':[np.mean],
        'ask_spread':[np.mean],
        'total_volume':[np.sum],
        'volume_imbalance':[np.sum, np.var],
        "bid_ask_spread":[np.sum],
    }
    create_feature_dict_time = {
        'log_return1': [realized_volatility, realized_1],
        'log_return2': [realized_volatility],
        'idle_sum': [np.sum, np.mean],
    }
    
    # Function to get group stats for different windows (seconds in bucket)
    def get_stats_window(fe_dict,seconds_in_bucket, add_suffix = False):
        # Group by the window
        df_feature = df[df['seconds_in_bucket'] >= seconds_in_bucket].groupby(['time_id']).agg(fe_dict).reset_index()
        # Rename columns joining suffix
        df_feature.columns = ['_'.join(col) for col in df_feature.columns]
        # Add a suffix to differentiate windows
        if add_suffix:
            df_feature = df_feature.add_suffix('_' + str(seconds_in_bucket))
        return df_feature
    
    # Get the stats for different windows
    df_feature = get_stats_window(create_feature_dict,seconds_in_bucket = 0, add_suffix = False)
    df_feature_500 = get_stats_window(create_feature_dict_time,seconds_in_bucket = 500, add_suffix = True)
    df_feature_400 = get_stats_window(create_feature_dict_time,seconds_in_bucket = 400, add_suffix = True)
    df_feature_300 = get_stats_window(create_feature_dict_time,seconds_in_bucket = 300, add_suffix = True)
    df_feature_200 = get_stats_window(create_feature_dict_time,seconds_in_bucket = 200, add_suffix = True)
    df_feature_100 = get_stats_window(create_feature_dict_time,seconds_in_bucket = 100, add_suffix = True)

    # Merge all
    df_feature = df_feature.merge(df_feature_500, how = 'left', left_on = 'time_id_', right_on = 'time_id__500')
    df_feature = df_feature.merge(df_feature_400, how = 'left', left_on = 'time_id_', right_on = 'time_id__400')
    df_feature = df_feature.merge(df_feature_300, how = 'left', left_on = 'time_id_', right_on = 'time_id__300')
    df_feature = df_feature.merge(df_feature_200, how = 'left', left_on = 'time_id_', right_on = 'time_id__200')
    df_feature = df_feature.merge(df_feature_100, how = 'left', left_on = 'time_id_', right_on = 'time_id__100')
    # Drop unnecesary time_ids
    df_feature.drop(['time_id__500','time_id__400', 'time_id__300', 'time_id__200','time_id__100'], axis = 1, inplace = True)
    
    
    # Create row_id so we can merge
    stock_id = file_path.split('=')[1]
    df_feature['row_id'] = df_feature['time_id_'].apply(lambda x: f'{stock_id}-{x}')
    df_feature.drop(['time_id_'], axis = 1, inplace = True)
    return df_feature

# Function to preprocess trade data (for each stock id)
def trade_preprocessor(file_path):
    df = pd.read_parquet(file_path)
    #df_feature['rq_est1'] = df.groupby('time_id')['log_return'].agg(realized_1).reset_index().iloc[:, -1]
    df['log_return'] = df.groupby('time_id')['price'].apply(log_return)
    df['amount']=df['price']*df['size']
    # Dict for aggregations
    create_feature_dict = {
        'log_return':[realized_volatility, realized_quadpower_quarticity],
        #'seconds_in_bucket':[count_unique],
        'size':[np.sum],
        'order_count':[np.mean],
        'amount':[np.sum, np.var],
    }
    create_feature_dict_time = {
        'log_return':[realized_volatility, realized_1],
        #'seconds_in_bucket':[count_unique],
        'size':[np.sum],
        'order_count':[np.mean],
    }
    # Function to get group stats for different windows (seconds in bucket)
    def get_stats_window(fe_dict,seconds_in_bucket, add_suffix = False):
        # Group by the window
        df_feature = df[df['seconds_in_bucket'] >= seconds_in_bucket].groupby(['time_id']).agg(fe_dict).reset_index()
        # Rename columns joining suffix
        df_feature.columns = ['_'.join(col) for col in df_feature.columns]
        # Add a suffix to differentiate windows
        if add_suffix:
            df_feature = df_feature.add_suffix('_' + str(seconds_in_bucket))
        return df_feature
    

    # Get the stats for different windows
    df_feature = get_stats_window(create_feature_dict,seconds_in_bucket = 0, add_suffix = False)
    df_feature_500 = get_stats_window(create_feature_dict_time,seconds_in_bucket = 500, add_suffix = True)
    df_feature_400 = get_stats_window(create_feature_dict_time,seconds_in_bucket = 400, add_suffix = True)
    df_feature_300 = get_stats_window(create_feature_dict_time,seconds_in_bucket = 300, add_suffix = True)
    df_feature_200 = get_stats_window(create_feature_dict_time,seconds_in_bucket = 200, add_suffix = True)
    df_feature_100 = get_stats_window(create_feature_dict_time,seconds_in_bucket = 100, add_suffix = True)
    
        #market_breadth
    df_feature['<=300s_price'] = (df[df.seconds_in_bucket <= 300].groupby('time_id')['amount'].sum() / df[df.seconds_in_bucket <= 300].groupby('time_id')['size'].sum()).reset_index().iloc[:, -1]
    df_feature['>300s_price'] = (df[df.seconds_in_bucket > 300].groupby('time_id')['amount'].sum()/ df[df.seconds_in_bucket > 300].groupby('time_id')['size'].sum()).reset_index().iloc[:, -1]
    df_feature["advance_decline"] = df_feature['>300s_price'] - df_feature['<=300s_price']
    df_feature["advance_decline"] = df_feature["advance_decline"].apply(np.sign)
    df_feature.drop(['<=300s_price','>300s_price'], axis= 1, inplace = True)
    
    def tendency(price, vol):    
        df_diff = np.diff(price)
        val = (df_diff/price[1:])*100
        power = np.sum(val*vol[1:])
        return(power)
    
    lis = []
    for n_time_id in df['time_id'].unique():
        df_id = df[df['time_id'] == n_time_id]        
        tendencyV = tendency(df_id['price'].values, df_id['size'].values)      
        f_max = np.sum(df_id['price'].values > np.mean(df_id['price'].values))
        f_min = np.sum(df_id['price'].values < np.mean(df_id['price'].values))
        df_max =  np.sum(np.diff(df_id['price'].values) > 0)
        df_min =  np.sum(np.diff(df_id['price'].values) < 0)
        # new
        abs_diff = np.median(np.abs( df_id['price'].values - np.mean(df_id['price'].values)))        
        energy = np.mean(df_id['price'].values**2)
        iqr_p = np.percentile(df_id['price'].values,75) - np.percentile(df_id['price'].values,25)
        
        # vol vars
        
        abs_diff_v = np.median(np.abs( df_id['size'].values - np.mean(df_id['size'].values)))        
        energy_v = np.sum(df_id['size'].values**2)
        iqr_p_v = np.percentile(df_id['size'].values,75) - np.percentile(df_id['size'].values,25)
        
        lis.append({'time_id':n_time_id,'tendency':tendencyV,'f_max':f_max,'f_min':f_min,'df_max':df_max,'df_min':df_min,
                   'abs_diff':abs_diff,'energy':energy,'iqr_p':iqr_p,'abs_diff_v':abs_diff_v,'energy_v':energy_v,'iqr_p_v':iqr_p_v})
    
    df_lr = pd.DataFrame(lis)
        
   
    df_feature = df_feature.merge(df_lr, how = 'left', left_on = 'time_id_', right_on = 'time_id')
    
    # Merge all
    df_feature = df_feature.merge(df_feature_500, how = 'left', left_on = 'time_id_', right_on = 'time_id__500')
    df_feature = df_feature.merge(df_feature_400, how = 'left', left_on = 'time_id_', right_on = 'time_id__400')
    df_feature = df_feature.merge(df_feature_300, how = 'left', left_on = 'time_id_', right_on = 'time_id__300')
    df_feature = df_feature.merge(df_feature_200, how = 'left', left_on = 'time_id_', right_on = 'time_id__200')
    df_feature = df_feature.merge(df_feature_100, how = 'left', left_on = 'time_id_', right_on = 'time_id__100')
    # Drop unnecesary time_ids
    df_feature.drop(['time_id__500','time_id__400', 'time_id__300', 'time_id__200','time_id','time_id__100'], axis = 1, inplace = True)
    
        # Create row_id so we can merge
    df_feature['rq_est1'] = df.groupby('time_id')['log_return'].agg(realized_1).reset_index().iloc[:, -1]
    #df_feature['rq_est3'] = df.groupby('time_id')['log_return'].agg(realized_3).reset_index().iloc[:, -1]
    
    
    df_feature = df_feature.add_prefix('trade_')
    stock_id = file_path.split('=')[1]
    df_feature['row_id'] = df_feature['trade_time_id_'].apply(lambda x:f'{stock_id}-{x}')
    df_feature.drop(['trade_time_id_'], axis = 1, inplace = True)
    return df_feature

# Function to get group stats for the stock_id and time_id
def get_time_stock(df):
    vol_cols = ['log_return1_realized_volatility',
                'log_return2_realized_volatility',
                'log_return1_realized_volatility_400',
                'log_return2_realized_volatility_400', 
                'log_return1_realized_volatility_300',
                'log_return2_realized_volatility_300',
                'log_return1_realized_volatility_200',
                'log_return2_realized_volatility_200', 
                'log_return1_realized_volatility_100',
                'log_return2_realized_volatility_100', 
                'trade_log_return_realized_volatility',
                'trade_log_return_realized_volatility_400',
                'trade_log_return_realized_volatility_300',
                'trade_log_return_realized_volatility_200',
                'trade_log_return_realized_volatility_100']


    # Group by the stock id
    df_stock_id = df.groupby(['stock_id'])[vol_cols].agg(['mean']).reset_index()
    # Rename columns joining suffix
    df_stock_id.columns = ['_'.join(col) for col in df_stock_id.columns]
    df_stock_id = df_stock_id.add_suffix('_' + 'stock')

    # Group by the stock id
    df_time_id = df.groupby(['time_id'])[vol_cols].agg(['mean']).reset_index()
    # Rename columns joining suffix
    df_time_id.columns = ['_'.join(col) for col in df_time_id.columns]
    df_time_id = df_time_id.add_suffix('_' + 'time')
    
    # Merge with original dataframe
    df = df.merge(df_stock_id, how = 'left', left_on = ['stock_id'], right_on = ['stock_id__stock'])
    df = df.merge(df_time_id, how = 'left', left_on = ['time_id'], right_on = ['time_id__time'])
    df.drop(['stock_id__stock', 'time_id__time'], axis = 1, inplace = True)
    return df
    
# Funtion to make preprocessing function in parallel (for each stock id)
def preprocessor(list_stock_ids, is_train = True):
    
    # Parrallel for loop
    def for_joblib(stock_id):
        # Train
        if is_train:
            file_path_book = data_dir + "book_train.parquet/stock_id=" + str(stock_id)
            file_path_trade = data_dir + "trade_train.parquet/stock_id=" + str(stock_id)
        # Test
        else:
            file_path_book = data_dir + "book_test.parquet/stock_id=" + str(stock_id)
            file_path_trade = data_dir + "trade_test.parquet/stock_id=" + str(stock_id)
    
        # Preprocess book and trade data and merge them
        df_tmp = pd.merge(book_preprocessor(file_path_book), trade_preprocessor(file_path_trade), on = 'row_id', how = 'left')
        
        # Return the merge dataframe
        return df_tmp
    
    # Use parallel api to call paralle for loop
    df = Parallel(n_jobs = -1, verbose = 1)(delayed(for_joblib)(stock_id) for stock_id in list_stock_ids)
    # Concatenate all the dataframes that return from Parallel
    df = pd.concat(df, ignore_index = True)
    return df

# Function to calculate the root mean squared percentage error
def rmspe(y_true, y_pred):
    return np.sqrt(np.mean(np.square((y_true - y_pred) / y_true)))

# Function to early stop with root mean squared percentage error
def feval_rmspe(y_pred, lgb_train):
    y_true = lgb_train.get_label()
    return 'RMSPE', rmspe(y_true, y_pred), False

In [7]:
# Read train and test
train, test = read_train_test()

# Get unique stock ids 
train_stock_ids = train['stock_id'].unique()
# Preprocess them using Parallel and our single stock id functions
train_ = preprocessor(train_stock_ids, is_train = True)
train = train.merge(train_, on = ['row_id'], how = 'left')

# Get unique stock ids 
test_stock_ids = test['stock_id'].unique()
# Preprocess them using Parallel and our single stock id functions
test_ = preprocessor(test_stock_ids, is_train = False)
test = test.merge(test_, on = ['row_id'], how = 'left')

# Get group stats of time_id and stock_id
train = get_time_stock(train)
test = get_time_stock(test)


Our training set has 428932 rows


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 19.7min
[Parallel(n_jobs=-1)]: Done 112 out of 112 | elapsed: 50.4min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.3s finished


In [8]:
train = train[(train.stock_id != 31) & (train.time_id != 25504)]
train = train[(train.stock_id != 31) & (train.time_id != 27174)]
train = train[(train.stock_id != 31) & (train.time_id != 1544)]
train = train[(train.stock_id != 81) & (train.time_id != 28319)]
train = train[(train.stock_id != 27) & (train.time_id != 20551)]

In [9]:
colNames = [col for col in list(train.columns)
            if col not in {"stock_id", "time_id", "target", "row_id"}]
len(colNames)

109

In [10]:
cluster_features = [
    "time_id",
    "stock_id",
    "row_id",
    "idle_sum_mean_500",
    "idle_sum_mean_400",
    "idle_sum_mean_300",
    "idle_sum_mean_200",
    "idle_sum_mean_100",
    "idle_sum_mean",
    "idle_sum_sum",
    "bid_ask_spread_sum",
    "ask_spread_mean",
    "bid_spread_mean",
    "price_spread_var",
    "total_volume_sum",
    "wap_ratio_mean",
    
    #"trade_log_return_realized_volatility_500",
    #"trade_log_return_realized_volatility_400",
    "trade_log_return_realized_volatility_300",
    #"trade_log_return_realized_volatility_200",
    #"trade_log_return_realized_volatility_100",
    
    "trade_size_sum_500",
    "trade_size_sum_400",
    "trade_size_sum_300",
    "trade_size_sum_200",
    "trade_size_sum_100",
    
    "trade_order_count_mean_500",
    "trade_order_count_mean_400",
    "trade_order_count_mean_300",
    "trade_order_count_mean_200",
    "trade_order_count_mean_100",
    
    "trade_energy_v",
    "trade_energy",
    "trade_f_min",
    "trade_f_max",
    "trade_tendency",
    "trade_advance_decline",
    "trade_order_count_mean",
    "trade_abs_diff",
    "trade_df_max",
    "trade_iqr_p_v",
    #"trade_rq_est1",
    
    "log_return1_realized_volatility_500",
    "log_return1_realized_volatility_400",
    "log_return1_realized_volatility_300",
    "log_return1_realized_volatility_200",
    "log_return1_realized_volatility_100",
    
    "log_return2_realized_volatility_500",
    "log_return2_realized_volatility_400",
    "log_return2_realized_volatility_300",
    "log_return2_realized_volatility_200",
    "log_return2_realized_volatility_100"
]

In [11]:
from sklearn.cluster import OPTICS
# making agg features

train_p = pd.read_csv('../input/optiver-realized-volatility-prediction/train.csv')
train_p = train_p.pivot(index='time_id', columns='stock_id', values='target')

corr = train_p.corr()

ids = corr.index

kmeans = OPTICS(min_samples = 2, max_eps = 0.75).fit(corr.values)
#KMeans(n_clusters=3, random_state=0).fit(corr.values)
#print(kmeans.labels_)

l = []
for n in range(np.max(kmeans.labels_) + 1):
    l.append ( [ (x-1) for x in ( (ids+1)*(kmeans.labels_ == n)) if x > 0] )
    

mat = []
matTest = []

n = 0
for ind in l:
    if (len(ind) < 3):
        continue
    newDf = train.loc[train['stock_id'].isin(ind) , cluster_features]
    newDf = newDf.groupby(['time_id']).agg(np.nanmean)
    newDf.loc[:,'stock_id'] = str(n)+'c1'
    mat.append ( newDf )
    
    newDf = test.loc[test['stock_id'].isin(ind) , cluster_features]    
    newDf = newDf.groupby(['time_id']).agg(np.nanmean)
    newDf.loc[:,'stock_id'] = str(n)+'c1'
    matTest.append ( newDf )
    
    n+=1
    
mat1 = pd.concat(mat).reset_index()
#mat1.drop(columns=['target'],inplace=True)

mat2 = pd.concat(matTest).reset_index()

In [12]:
mat2 = pd.concat([mat2,mat1.loc[mat1.time_id==5]])
mat1 = mat1.pivot(index='time_id', columns='stock_id')
mat1.columns = ["_".join(x) for x in mat1.columns.ravel()]
mat1.reset_index(inplace=True)

mat2 = mat2.pivot(index='time_id', columns='stock_id')
mat2.columns = ["_".join(x) for x in mat2.columns.ravel()]
mat2.reset_index(inplace=True)

  This is separate from the ipykernel package so we can avoid doing imports until
  import sys


In [13]:
train = pd.merge(train,mat1,how='left',on='time_id')
test = pd.merge(test,mat2,how='left',on='time_id')

In [14]:
train['adr'] = train.groupby('time_id')['trade_advance_decline'].transform(pos_neg_ratio)
test['adr'] = test.groupby('time_id')['trade_advance_decline'].transform(pos_neg_ratio)

In [15]:
import gc
del mat1,mat2, kmeans, matTest, mat
gc.collect()

20

In [16]:
from numpy.random import seed
seed(42)
import tensorflow as tf
tf.random.set_seed(42)
from tensorflow import keras
from tensorflow.keras import backend as K
from tensorflow.keras import regularizers

def root_mean_squared_per_error(y_true, y_pred):
         return K.sqrt(K.mean(K.square( (y_true - y_pred)/ y_true )))
    
es = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=20, verbose=0,
    mode='min',restore_best_weights=True)

plateau = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss', factor=0.2, patience=7, verbose=0,
    mode='min')

In [17]:
gc.collect()

42

In [18]:
features = [col for col in train.columns if col not in {"time_id", "target", "row_id", "fold", "lgb1", "lgb2", "nn", 'stock_id'}]

In [19]:
hidden_units = (128,64,32)
stock_embedding_size = 8
weight_decay = 1e-4

def base_model():
    
    # Each instance will consist of two inputs: a single user id, and a single movie id
    stock_id_input = keras.Input(shape=(1,), name='stock_id')
    num_input = keras.Input(shape=(train[features].shape[1],), name='num_data')


    #embedding, flatenning and concatenating
    stock_embedded = keras.layers.Embedding(max(cat_data)+1, stock_embedding_size, 
                                           input_length=1, name='stock_embedding')(stock_id_input)
    stock_flattened = keras.layers.Flatten()(stock_embedded)
    out = keras.layers.Concatenate()([stock_flattened, num_input])
    

    out = keras.layers.Dense(128, activation='swish',
                             kernel_initializer = 'he_normal')(out)
    out = keras.layers.Dense(64, activation='swish',
                             kernel_initializer = 'he_normal')(out)
    out = keras.layers.Dense(32, activation='swish',
                             kernel_initializer = 'he_normal')(out)
        

    #out = keras.layers.Concatenate()([out, num_input])

    # A single output: our predicted rating
    out = keras.layers.Dense(1, activation='linear', name='prediction')(out)
    
    model = keras.Model(
    inputs = [stock_id_input, num_input],
    outputs = out,
    )
    
    return model

In [20]:
K.clear_session()
gc.collect()

20

In [21]:
from sklearn.model_selection import GroupKFold
counter = 0
test_predictions_nn = np.zeros(test.shape[0])
max_target = np.max(train['target'])*0.95
min_target = np.min(train['target'])*0.9
kfold = GroupKFold(n_splits = 5)
for fold, (train_ind, val_ind) in enumerate(kfold.split(train, groups=train.time_id)):
    num_data = train.loc[train_ind, features]
    cat_data = train.loc[train_ind, 'stock_id']
    y_train = train.loc[train_ind, 'target']
    
    num_data_val = train.loc[val_ind, features]
    cat_data_val = train.loc[val_ind, 'stock_id']
    y_val = train.loc[val_ind, 'target']
    
    scaler = MinMaxScaler(feature_range=(-1, 1)) 
    num_data = scaler.fit_transform(num_data.to_numpy())
    num_data_val = scaler.transform(num_data_val.to_numpy())
    num_data = np.where(np.isnan(num_data), np.nanmean(num_data, axis=0), num_data)
    num_data_val = np.where(np.isnan(num_data_val), np.nanmean(num_data_val, axis=0), num_data_val)
    model = base_model()
    model.compile(
        keras.optimizers.Adam(learning_rate=0.006),
        loss=root_mean_squared_per_error
    )
    
    
    model.fit([cat_data, num_data], 
              y_train,               
              batch_size=1024,
              epochs=500,
              validation_data=([cat_data_val, num_data_val], y_val),
              callbacks=[es, plateau],
              validation_batch_size=len(y_val),
              shuffle=True,
             verbose = 0)
    model.fit([cat_data, num_data], y_train, batch_size = 512, epochs = 3)
    preds = model.predict([cat_data_val, num_data_val]).reshape(1,-1)[0].clip(min_target, max_target)
    print(f"min predicted_value: {np.min(preds)}")
    score = round(rmspe(y_true = y_val, y_pred = preds),5)
    print('Fold {} {}: {}'.format(counter, fold, score))
    num_data_test = scaler.transform(test[features].to_numpy())
    #print(num_data_test)
    num_data_test = np.where(np.isnan(num_data_test), np.nanmean(num_data, axis=0), num_data_test)
    #print(num_data_test)
    num_data_test = tf.convert_to_tensor(num_data_test, dtype=tf.float32)
    cat_test = tf.convert_to_tensor(test['stock_id'].to_numpy(), dtype = tf.int8)
    test_predictions_nn += model.predict([test['stock_id'], num_data_test]).reshape(1,-1)[0].clip(min_target, max_target) /5
    del model, cat_test, num_data_test
    K.clear_session()
    gc.collect()

Epoch 1/3
Epoch 2/3
Epoch 3/3
min predicted_value: 0.00019194929336663336
Fold 0 0: 0.22441
Epoch 1/3
Epoch 2/3
Epoch 3/3
min predicted_value: 0.00019194929336663336
Fold 0 1: 0.21866
Epoch 1/3
Epoch 2/3
Epoch 3/3
min predicted_value: 0.00019194929336663336
Fold 0 2: 0.23796
Epoch 1/3
Epoch 2/3
Epoch 3/3
min predicted_value: 0.00019194929336663336
Fold 0 3: 0.21993
Epoch 1/3
Epoch 2/3
Epoch 3/3
min predicted_value: 0.00019194929336663336
Fold 0 4: 0.2242


In [22]:
test["row_id"] = test["stock_id"].astype(str) + "-" + test["time_id"].astype(str) 
test[target_name] = test_predictions_nn


display(test[['row_id', target_name]].head(3))
test[['row_id', target_name]].to_csv('submission.csv',index = False)
#test[['row_id', target_name]].to_csv('submission.csv',index = False)
#kmeans N=5 [0.2101, 0.21399, 0.20923, 0.21398, 0.21175]

Unnamed: 0,row_id,target
0,0-4,0.002393
1,0-32,0.003024
2,0-34,0.003024
