# Overview

# Preprocessing
### Imports

In [1]:
import numpy as np
import pandas as pd
import jpx_tokyo_market_prediction

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OrdinalEncoder

import tensorflow as tf
import tensorflow.keras.layers as L
import tensorflow.keras.models as M
import tensorflow.keras.backend as K
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, EarlyStopping

import warnings
warnings.filterwarnings("ignore")

In [2]:
def concat_df(df1, df2):
    df1 = pd.concat([df1, df2],
                    ignore_index=True, sort=False
                    ).drop_duplicates(["RowId"], keep="first")
    return df1

In [3]:
# For now... use pandas.... 
#If too heavy use Cudf

# For now... just fix the floats
dtypes = {
'SecuritiesCode' :     np.int16,
'Open'           :     np.float16,
'High'           :     np.float16,
'Low'            :     np.float16,
'Close'          :     np.float16,
'Volume'         :       np.int8,
'AdjustmentFactor':    np.float16,
'ExpectedDividend':    np.float16,
'Target'        :      np.float16
}
path = "../input/jpx-tokyo-stock-exchange-prediction/"
df_prices = pd.read_csv(f"{path}train_files/stock_prices.csv",dtype = dtypes)
prices = pd.read_csv(f"{path}supplemental_files/stock_prices.csv",dtype = dtypes)
df_prices = concat_df(df_prices, prices)
prices = pd.read_csv(f"{path}train_files/secondary_stock_prices.csv",dtype = dtypes)
df_prices = concat_df(df_prices, prices)
prices = pd.read_csv(f"{path}supplemental_files/secondary_stock_prices.csv",dtype = dtypes)
df_prices = concat_df(df_prices, prices)
#df_prices = df_prices[df_prices.Date<"2021-11-01"]
df_prices.info(show_counts=True)

prices = df_prices

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5271570 entries, 0 to 5271569
Data columns (total 12 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   RowId             5271570 non-null  object 
 1   Date              5271570 non-null  object 
 2   SecuritiesCode    5271570 non-null  int16  
 3   Open              5160744 non-null  float16
 4   High              5160744 non-null  float16
 5   Low               5160744 non-null  float16
 6   Close             5160744 non-null  float16
 7   Volume            5271570 non-null  int8   
 8   AdjustmentFactor  5271570 non-null  float16
 9   ExpectedDividend  41373 non-null    float16
 10  SupervisionFlag   5271570 non-null  bool   
 11  Target            5270524 non-null  float16
dtypes: bool(1), float16(7), int16(1), int8(1), object(2)
memory usage: 211.1+ MB


### Handle Nulls

In [4]:
prices = prices.drop("ExpectedDividend", axis=1)
prices = prices.dropna()
prices.isnull().sum()

RowId               0
Date                0
SecuritiesCode      0
Open                0
High                0
Low                 0
Close               0
Volume              0
AdjustmentFactor    0
SupervisionFlag     0
Target              0
dtype: int64

### Cross Validation Split

In [5]:
def setup_cv(df, splits=5):
    df['fold'] = -1
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)
    num_bins = int(np.floor(1 + np.log2(len(df))))
    df.loc[:, "bins"] = pd.cut(
        df["Target"], bins=num_bins, labels=False
    )

    kf = StratifiedKFold(n_splits=splits)
    for f, (t_, v_) in enumerate(kf.split(X=df, y=df.bins.values)):
            df.loc[v_, 'fold'] = f

    df = df.drop("bins", axis=1)
    return df

In [6]:
prices = setup_cv(prices)

## Feature Engeneering

In [7]:
from datetime import datetime
import time

 # auxiliary function, from datetime to timestamp
totimestamp = lambda s: np.int32(time.mktime(datetime.strptime(s, "%Y-%m-%d").timetuple()))

In [8]:
# define function to compute log returns
# def log_return(series, periods=1):
#     return np.log(series).diff(periods=periods)


# # def fill_the_gaps(df):
# #     new_df = pd.DataFrame(columns= df.columns)
# #     for i in range(len(df['SecuritiesCode'].unique())):
# #         new_df = new_df.append(df[df['SecuritiesCode'] == i].reindex(range(df[df['SecuritiesCode'] == i].index[0],df[df['SecuritiesCode'] == i].index[-1]+60,60),method='pad'))
# #     new_df.fillna(method = 'pad', inplace = True) 
# #     return new_df


# def rsiFunc(prices, n=14):
#     deltas = np.diff(prices)
#     seed = deltas[:n+1]
#     up = seed[seed>=0].sum()/n
#     down = -seed[seed<0].sum()/n
#     rs = up/down
#     rsi = np.zeros_like(prices)
#     rsi[:n] = 100. - 100./(1.+rs)

#     for i in range(n, len(prices)):
#         delta = deltas[i-1] # cause the diff is 1 shorter

#         if delta>0:
#             upval = delta
#             downval = 0.
#         else:
#             upval = 0.
#             downval = -delta

#         up = (up*(n-1) + upval)/n
#         down = (down*(n-1) + downval)/n

#         rs = up/down
#         rsi[i] = 100. - 100./(1.+rs)

#     return rsi



def get_features(df):

#     df['log_return_5'] = log_return(df['Close'],periods=5)
#     df['log_return'] = log_return(df['Close'],periods=1)

    upper_shadow = lambda asset: asset.High - np.maximum(asset.Close,asset.Open)
    lower_shadow = lambda asset: np.minimum(asset.Close,asset.Open)- asset.Low

    df['upper_shadow'] = upper_shadow(df)
    df['lower_shadow'] = lower_shadow(df)
    
#     df['EMA_21'] = df['Close'].ewm(span=21).mean()
    
#     df['EMA_55'] = df['Close'].ewm(span=55).mean()
    
#     df['EMA_315'] = df['Close'].ewm(span=315).mean()
    
#     df['EMA_825'] = df['Close'].ewm(span=825).mean()
    
#     window = 7
    
#     no_of_std = 2
    
#     df[f'EMA_{window}'] = df['Close'].ewm(span=window).mean()
    
#     df[f'EMA_{window}_std'] = df['Close'].rolling(window=window).std()
    
#     df[f'EMA_{window}_BB_high'] = df[f'EMA_{window}'] + no_of_std * df[f'EMA_{window}_std']
    
#     df[f'MA_{window}MA_BB_low'] = df[f'EMA_{window}'] - no_of_std * df[f'EMA_{window}_std']
    
#     window = 5
    
#     df[f'EMA_{window}'] = df['Close'].ewm(span=window).mean()
    
#     df[f'EMA_{window}_std'] = df['Close'].rolling(window=window).std()
    
#     df[f'EMA_{window}_BB_high'] = df[f'EMA_{window}'] + no_of_std * df[f'EMA_{window}_std']
    
#     df[f'MA_{window}MA_BB_low'] = df[f'EMA_{window}'] - no_of_std * df[f'EMA_{window}_std']
    
#     df['MACD'] = df['EMA_7'] - df['EMA_5']
    
    
#     df['rsi_5'] = rsiFunc(df['Close'].values, 5)
    
#     df['rsi_7'] = rsiFunc(df['Close'].values, 7)
    
#     df['rsi_21'] = rsiFunc(df['Close'].values, 21)
    
    
    
    df['VWAP'] = (df['Close'] * df['Volume'])/ df['Volume']




    
    
    #df = pd.concat([df, pd.get_dummies(df['Asset_ID'], prefix= 'Asset_')], axis=1)
    
    df[[ 'upper_shadow', 'lower_shadow','VWAP']].astype('float16')
    
    return df

In [9]:
prices = get_features(prices)

prices = prices.replace((np.inf, -np.inf, np.nan), 0).reset_index(drop=True)

prices = prices.fillna(0)

In [10]:

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [11]:
scaled_features = ['Open', 'High', 'Low', 'Close', 'Volume', 'upper_shadow', 'lower_shadow','VWAP']
features = ["SecuritiesCode",'Open', 'High', 'Low', 'Close', 'Volume', 'upper_shadow', 'lower_shadow','VWAP']

In [12]:
prices[scaled_features] = scaler.fit_transform(prices[scaled_features])

### Ordinal Encode Securities Code

In [13]:
enc = OrdinalEncoder()
prices["SecuritiesCode"] = enc.fit_transform(prices[["SecuritiesCode"]])

# Train Models

### Deep Neural Network

In [14]:
codes = list(prices.SecuritiesCode.unique())
codes_size = len(codes)

def dense_block(x, units, act='swish', dr=0.2):
    x = L.Dropout(dr)(x)
    x = L.BatchNormalization()(x)
    x = L.Dense(units, activation=act)(x)
    return x

def get_dnn(dense_blocks):
    prices_in = L.Input(shape=(len(scaled_features),), name='input_prices')
    x_prices = L.BatchNormalization()(prices_in)
    x_prices = L.Dense(64, activation='swish')(x_prices)
    
    security_code_input = L.Input(shape=(1,), name='input_security_code')
    x_id = L.Embedding(codes_size, 32, input_length=1)(security_code_input)
    x_id = L.Reshape((-1, ))(x_id)
    x_id = L.Dense(32, activation='swish')(x_id)

    x = L.Concatenate(axis=1)([x_id, x_prices])
    
    for units in dense_blocks:
        x = dense_block(x, units)
    
    output = L.Dense(1)(x)
    
    model = M.Model([prices_in, security_code_input], 
                    [output])

    model.compile(optimizer=tf.optimizers.Adam(lr=0.001),
                  loss='mse', metrics=['mse'])
    
    return model
    
def train_dnn(prices, folds):
    models = list()
    
    for f in range(folds):
        X_train_prices = prices[prices.fold != f][scaled_features]
        X_train_id = prices[prices.fold != f][["SecuritiesCode"]]
        y_train = prices[prices.fold != f][["Target"]]
        X_valid_prices = prices[prices.fold == f][scaled_features]
        X_valid_id = prices[prices.fold == f][["SecuritiesCode"]]
        y_valid = prices[prices.fold == f][["Target"]]

        model = get_dnn([128, 64, 32])
        model.fit([X_train_prices, X_train_id], y_train,
                   validation_data=([X_valid_prices, X_valid_id], y_valid),
                   batch_size=128, epochs=10, verbose=1)

        oof_preds = model.predict([X_valid_prices, X_valid_id])
        oof_score = np.sqrt(mean_squared_error(y_valid, oof_preds))
        print(oof_score)
        models.append(model)
        # break for speed of training, feel free to train all folds
    
    return models

### Run - prints rmse for each fold

In [15]:
dnn_models = train_dnn(prices, 2)

2022-07-03 17:44:44.859384: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-03 17:44:44.977617: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-03 17:44:44.978749: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-03 17:44:44.981133: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
0.027163700149534026
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
0.05292783843103313


In [16]:
# Save model
n = 0
for model in dnn_models:
    n += 1
    model.save_weights(f'dnn_{n}.tf')

In [17]:
import gc
del model
gc.collect()

3332

In [18]:
# Define a simple sequential model
model = get_dnn([128, 64, 32])
# Display the model's architecture
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_security_code (InputLayer [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 1, 32)        141312      input_security_code[0][0]        
__________________________________________________________________________________________________
input_prices (InputLayer)       [(None, 8)]          0                                            
__________________________________________________________________________________________________
reshape_2 (Reshape)             (None, 32)           0           embedding_2[0][0]                
____________________________________________________________________________________________

In [19]:
models = list()
for i in range(1,3):
    model.load_weights(f'./dnn_{i}.tf')
    models.append(model)

# Make Predictions & Submit

In [20]:
env = jpx_tokyo_market_prediction.make_env()
iter_test = env.iter_test()

for (prices, options, financials, trades, secondary_prices, sample_prediction) in iter_test:
    
    prices = get_features(prices)
    prices = prices.replace((np.inf, -np.inf, np.nan), 0).reset_index(drop=True)
    prices = prices.fillna(0)
    prices[scaled_features] = scaler.fit_transform(prices[scaled_features])
    prices["SecuritiesCode"] = enc.fit_transform(prices[["SecuritiesCode"]])
    X_test = prices[features]    
    X_test_prices = prices[scaled_features]
    X_test_id = prices[["SecuritiesCode"]]
    dnn_preds = list()
    for model in models:
        dnn_preds.append(model.predict([X_test_prices, X_test_id]))    
    sample_prediction["Prediction"] = sum(dnn_preds)/len(dnn_preds)   
    sample_prediction = sample_prediction.sort_values(by = "Prediction", ascending=False)
    sample_prediction.Rank = np.arange(0,2000)
    sample_prediction = sample_prediction.sort_values(by = "SecuritiesCode", ascending=True)
    sample_prediction.drop(["Prediction"],axis=1)
    submission = sample_prediction[["Date","SecuritiesCode","Rank"]]
    env.predict(submission)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.


In [21]:
pd.read_csv("./submission.csv")

Unnamed: 0,Date,SecuritiesCode,Rank
0,2021-12-06,1301,777
1,2021-12-06,1332,240
2,2021-12-06,1333,1004
3,2021-12-06,1375,519
4,2021-12-06,1376,287
...,...,...,...
3995,2021-12-07,9990,343
3996,2021-12-07,9991,819
3997,2021-12-07,9993,1315
3998,2021-12-07,9994,1551
