## Start

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
from tqdm import tqdm
import lightgbm as lgb
import gc
import multiprocessing
from IPython.display import display
import random

import sklearn

# Seed NumPy's and Python's global random generators with the same fixed value.
np.random.seed(0)
random.seed(0)

# Input data files are available in the read-only "../input/" directory.
# Running this cell prints the full path of every file found under the input directory.

import os
for root_dir, _, file_names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root_dir, file_name) for file_name in file_names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/jpx-tokyo-stock-exchange-prediction/stock_list.csv
/kaggle/input/jpx-tokyo-stock-exchange-prediction/example_test_files/sample_submission.csv
/kaggle/input/jpx-tokyo-stock-exchange-prediction/example_test_files/options.csv
/kaggle/input/jpx-tokyo-stock-exchange-prediction/example_test_files/financials.csv
/kaggle/input/jpx-tokyo-stock-exchange-prediction/example_test_files/secondary_stock_prices.csv
/kaggle/input/jpx-tokyo-stock-exchange-prediction/example_test_files/trades.csv
/kaggle/input/jpx-tokyo-stock-exchange-prediction/example_test_files/stock_prices.csv
/kaggle/input/jpx-tokyo-stock-exchange-prediction/jpx_tokyo_market_prediction/competition.cpython-37m-x86_64-linux-gnu.so
/kaggle/input/jpx-tokyo-stock-exchange-prediction/jpx_tokyo_market_prediction/__init__.py
/kaggle/input/jpx-tokyo-stock-exchange-prediction/data_specifications/stock_fin_spec.csv
/kaggle/input/jpx-tokyo-stock-exchange-prediction/data_specifications/trades_spec.csv
/kaggle/input/jpx-tokyo-stock-

In [2]:
# Read the main training prices and the supplemental prices, then stack them
# (training rows first, supplemental rows after).
df_prices = pd.read_csv(
    "../input/jpx-tokyo-stock-exchange-prediction/train_files/stock_prices.csv",
    parse_dates=['Date'],
)
df_sprices = pd.read_csv(
    "../input/jpx-tokyo-stock-exchange-prediction/supplemental_files/stock_prices.csv",
    parse_dates=['Date'],
)
df_prices = pd.concat((df_prices, df_sprices))

# Keep only rows that have an Open price, renumber the index, and discard RowId.
has_open = df_prices['Open'].notna()
df_prices = df_prices.loc[has_open].reset_index(drop=True).drop(columns=['RowId'])


def f(df):
    """Cumulative product accumulated from the last element back to the first."""
    return df[::-1].cumprod()[::-1]


# For every security, each row gets the product of its own AdjustmentFactor
# and all the factors on later rows of that security.
df_prices['CumAdjustmentFactor'] = (
    df_prices.groupby('SecuritiesCode')['AdjustmentFactor'].transform(f)
)

# Prices are multiplied by the cumulative factor, volume is divided by it.
price_cols = ['Open', 'High', 'Low', 'Close']
df_prices[price_cols] = df_prices[price_cols].mul(df_prices['CumAdjustmentFactor'], axis=0)
df_prices['Volume'] = df_prices['Volume'] / df_prices['CumAdjustmentFactor']

# TODO: treating a missing ExpectedDividend as 0 is a placeholder choice; revisit later.
df_prices = df_prices.fillna({'ExpectedDividend': 0})
df_prices

Unnamed: 0,Date,SecuritiesCode,Open,High,Low,Close,Volume,AdjustmentFactor,ExpectedDividend,SupervisionFlag,Target,CumAdjustmentFactor
0,2017-01-04,1301,2734.0,2755.0,2730.0,2742.0,31400.0,1.0,0.0,False,0.000730,1.0
1,2017-01-04,1332,568.0,576.0,563.0,571.0,2798500.0,1.0,0.0,False,0.012324,1.0
2,2017-01-04,1333,3150.0,3210.0,3140.0,3210.0,270800.0,1.0,0.0,False,0.006154,1.0
3,2017-01-04,1376,1510.0,1550.0,1510.0,1550.0,11300.0,1.0,0.0,False,0.011053,1.0
4,2017-01-04,1377,3270.0,3350.0,3270.0,3330.0,150800.0,1.0,0.0,False,0.003026,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
2593981,2022-06-24,9990,576.0,576.0,563.0,564.0,24200.0,1.0,0.0,False,0.027073,1.0
2593982,2022-06-24,9991,810.0,815.0,804.0,815.0,8700.0,1.0,0.0,False,0.001220,1.0
2593983,2022-06-24,9993,1548.0,1548.0,1497.0,1497.0,12600.0,1.0,0.0,False,0.001329,1.0
2593984,2022-06-24,9994,2507.0,2527.0,2498.0,2527.0,7300.0,1.0,0.0,False,0.003185,1.0


## Features and Train

In [3]:
class Stock:
    '''
    Per-security store of prices, volumes, dates, targets and model features.

    Built once from the historical rows of a single security and then extended
    one row at a time with `add_row`.

    Input rows are indexed positionally:
    0 = date, 1 = SecuritiesCode, 2..5 = open/high/low/close, 6 = volume,
    7 = AdjustmentFactor, 8 = ExpectedDividend. Later columns are ignored.

    Feature matrix columns (18, float32):
    0 AdjustmentFactor, 1 ExpectedDividend, 2..8 log return at lags 0..6,
    9..11 log volume change at lags 0..2, 12..14 log(high/low) at lags 0..2,
    15 5-step std of log returns, 16 log change of that std, 17 7-step RSI.
    '''

    # Positions of the columns of `self.feat` that `add_row` reads back.
    _COL_LOG_RET = 2
    _COL_LOG_CHG_VOL = 9
    _COL_DAILY_CHG = 12
    _COL_STD5 = 15

    def __init__(self, feat):
        '''
        maintain SecuritiesCode, prices, vol, date, target and features for future calculations

        Parameters
        ----------
        feat : 2-D array (one row per day, oldest first) laid out as described
            in the class docstring. Prices and volume are used as given.

        Raises
        ------
        ValueError
            If there are too few rows to produce even one complete feature row
            (the 7-step RSI on returns needs at least 8 rows).
        '''
        n_rows = feat.shape[0]

        self.SecuritiesCode = int(feat[0][1])
        openn = feat[:,2].astype(np.float32)
        high = feat[:,3].astype(np.float32)
        low = feat[:,4].astype(np.float32)
        close = feat[:,5].astype(np.float32)
        # One representative price per row: the mean of open/high/low/close.
        self.price = (openn+high+low+close)/4

        self.vol = feat[:,6].astype(np.float32)
        AdjustmentFactor = feat[:,7].astype(np.float32)
        ExpectedDividend = feat[:,8].astype(np.float32)

        # returns
        log_ret = np.log(self.price[1:]/self.price[:-1])

        # volume
        log_chg_vol = np.log(self.vol[1:]/self.vol[:-1])

        # daily volatility
        daily_chg_log = np.log(high/low)

        # volatility
        std5 = self.give_std(5, log_ret)
        std5_chg = np.log(std5[1:]/(std5[:-1]+1e-16) + 1e-16)

        # others
        rsi7 = self.give_rsi(7, log_ret)

        # Number of rows for which every feature (including all lags) exists.
        N = int(min(std5_chg.shape[0], rsi7.shape[0]))
        if N < 1:
            raise ValueError(
                'Stock needs at least 8 rows to build features, got %d' % n_rows
            )

        def window(series, lag=0):
            # Last N values of `series`, ending `lag` steps before its final
            # element. Explicit non-negative bounds avoid the `-0` slicing
            # pitfall of expressions like series[-N-lag:-lag].
            stop = len(series) - lag
            return series[stop - N:stop]

        self.feat = np.stack((window(AdjustmentFactor),
                              window(ExpectedDividend),
                              window(log_ret),
                              window(log_ret, 1),
                              window(log_ret, 2),
                              window(log_ret, 3),
                              window(log_ret, 4),
                              window(log_ret, 5),
                              window(log_ret, 6),
                              window(log_chg_vol),
                              window(log_chg_vol, 1),
                              window(log_chg_vol, 2),
                              window(daily_chg_log),
                              window(daily_chg_log, 1),
                              window(daily_chg_log, 2),
                              window(std5),
                              window(std5_chg),
                              window(rsi7),
                             ), axis=1)

        # target[i] is the log return two steps after feature row i, so it lines
        # up with self.feat[:-2]. With fewer than 3 feature rows there are no
        # targets; the old `log_ret[-N+2:]` returned the whole series for N == 2.
        n_targets = max(N - 2, 0)
        self.target = log_ret[len(log_ret) - n_targets:]

        self.Date = feat[n_rows - N:,0]

    def give_std(self, wsize, log_ret):
        '''
        Rolling standard deviation (np.std) of `log_ret` over windows of `wsize`
        consecutive values; one float32 output per complete window.
        '''
        return np.array([np.std(log_ret[i+1-wsize:i+1]) for i in range(wsize-1,len(log_ret))], dtype=np.float32)

    def give_rsi(self, rs_wsize, log_ret):
        '''
        Rolling RSI-style indicator on `log_ret`.

        For each complete window of `rs_wsize` values, rs is the mean positive
        return divided by the mean absolute negative return (plus 1e-16 to avoid
        division by zero); the result is 100 - 100 / (1 + rs).
        '''
        gain = np.where(log_ret>0, log_ret, 0.0)
        loss = np.where(log_ret<0, -log_ret, 0.0)
        rs = np.array([np.mean(gain[i+1-rs_wsize:i+1]) / (np.mean(loss[i+1-rs_wsize:i+1])+1e-16) for i in range(rs_wsize-1,len(gain))], dtype=np.float32)
        return 100 - 100/(1+ rs)

    def add_row(self, feat):
        '''
        updates all data structures in the class with new data
        and return the feature row added

        Parameters
        ----------
        feat : 1-D row with the same positional layout as the rows given to
            `__init__`, holding raw (unadjusted) prices and volume.

        Returns
        -------
        1-D float32 array with the 18 features of the new row.

        Notes
        -----
        `self.target` is not extended here. At least 6 feature rows must already
        be stored, because the lagged returns are read back from `self.feat`.
        '''
        AdjustmentFactor = feat[7]
        ExpectedDividend = feat[8]

        # The new row is scaled by its own AdjustmentFactor together with the
        # stored history. The historical series is expected to be adjusted by a
        # reverse cumulative product that includes each row's own factor;
        # leaving the new row unscaled produced a spurious jump in the return
        # and volume-change features on (and right after) an adjustment day.
        price = (feat[2] + feat[3] + feat[4] + feat[5]) / 4 * AdjustmentFactor
        vol = feat[6] / AdjustmentFactor

        self.price = np.append(self.price * AdjustmentFactor, price)
        self.vol   = np.append(self.vol / AdjustmentFactor, vol)

        self.Date = np.append(self.Date, feat[0])

        log_ret = np.log(self.price[-1]/self.price[-2])
        log_chg_vol = np.log(self.vol[-1]/self.vol[-2])
        daily_chg_log = np.log(feat[3] / feat[4])
        # 5-step window: the four most recent stored returns plus the new one.
        std5 = np.std(np.append(self.feat[-4:,self._COL_LOG_RET], log_ret))
        std5_chg = np.log(std5/(self.feat[-1,self._COL_STD5]+1e-16) + 1e-16)

        # others
        # 7-step window: the six most recent stored returns plus the new one.
        rsi7 = self.give_rsi(7, np.append(self.feat[-6:,self._COL_LOG_RET], log_ret)).item()

        feat_row = np.array([AdjustmentFactor,
                             ExpectedDividend,
                             log_ret,
                             self.feat[-1,self._COL_LOG_RET],
                             self.feat[-2,self._COL_LOG_RET],
                             self.feat[-3,self._COL_LOG_RET],
                             self.feat[-4,self._COL_LOG_RET],
                             self.feat[-5,self._COL_LOG_RET],
                             self.feat[-6,self._COL_LOG_RET],
                             log_chg_vol,
                             self.feat[-1,self._COL_LOG_CHG_VOL],
                             self.feat[-2,self._COL_LOG_CHG_VOL],
                             daily_chg_log,
                             self.feat[-1,self._COL_DAILY_CHG],
                             self.feat[-2,self._COL_DAILY_CHG],
                             std5,
                             std5_chg,
                             rsi7
                            ], dtype=np.float32).reshape(1,-1)

        self.feat = np.concatenate((self.feat, feat_row))
        return feat_row.reshape(-1)

In [4]:
# Build one Stock per security and assemble the pooled train/validation arrays.
#
# The per-security pieces are collected in lists and concatenated once at the
# end: growing the arrays with np.concatenate inside the loop re-copies all the
# accumulated data on every iteration, and filtering df_prices with a boolean
# mask per security re-scans the whole frame each time.
stocks = []

X_train_parts, X_valid_parts = [], []
y_train_parts, y_valid_parts = [], []

# groupby iterates the securities in ascending SecuritiesCode order and keeps
# the original row order inside each group.
for _, df_stock in tqdm(df_prices.groupby('SecuritiesCode')):
    stock = Stock(df_stock.values)

    stocks.append(stock)

    # The last two feature rows have no target yet (the target looks two steps ahead).
    X = stock.feat[:-2]
    y = stock.target

    # Per-security chronological split: first 90% for training, the rest for validation.
    split = int(len(y) * 0.90)
    X_train_parts.append(X[:split,:])
    X_valid_parts.append(X[split:,:])
    y_train_parts.append(y[:split])
    y_valid_parts.append(y[split:])

X_train = np.concatenate(X_train_parts)
X_valid = np.concatenate(X_valid_parts)
y_train = np.concatenate(y_train_parts)
y_valid = np.concatenate(y_valid_parts)

100%|██████████| 2000/2000 [03:47<00:00,  8.77it/s]


In [5]:
# GBDT regressor: learning rate 0.02, at most 256 trees; training stops early
# once the validation score has not improved for 10 rounds.
model1 = lgb.LGBMRegressor(
    boosting_type='gbdt',
    n_estimators=256,
    learning_rate=0.02,
)
model1.fit(
    X_train,
    y_train,
    eval_set=[(X_valid, y_valid)],
    callbacks=[lgb.early_stopping(10)],
)

Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[28]	valid_0's l2: 0.000367225


LGBMRegressor(learning_rate=0.02, n_estimators=256)

In [6]:
# GBDT regressor: learning rate 0.1, at most 100 trees; training stops early
# once the validation score has not improved for 10 rounds.
model2 = lgb.LGBMRegressor(
    boosting_type='gbdt',
    n_estimators=100,
    learning_rate=0.1,
)
model2.fit(
    X_train,
    y_train,
    eval_set=[(X_valid, y_valid)],
    callbacks=[lgb.early_stopping(10)],
)

Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[6]	valid_0's l2: 0.000367223


LGBMRegressor()

In [7]:
# GBDT regressor: learning rate 0.01, at most 100 trees; training stops early
# once the validation score has not improved for 10 rounds.
model3 = lgb.LGBMRegressor(
    boosting_type='gbdt',
    n_estimators=100,
    learning_rate=0.01,
)
model3.fit(
    X_train,
    y_train,
    eval_set=[(X_valid, y_valid)],
    callbacks=[lgb.early_stopping(10)],
)

Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[49]	valid_0's l2: 0.000367222


LGBMRegressor(learning_rate=0.01)

In [8]:
# GBDT regressor: learning rate 0.01, at most 100 trees, with reg_alpha and
# reg_lambda both set to 10; training stops early once the validation score
# has not improved for 10 rounds.
model4 = lgb.LGBMRegressor(
    boosting_type='gbdt',
    n_estimators=100,
    learning_rate=0.01,
    reg_alpha=10.0,
    reg_lambda=10.0,
)
model4.fit(
    X_train,
    y_train,
    eval_set=[(X_valid, y_valid)],
    callbacks=[lgb.early_stopping(10)],
)

Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[68]	valid_0's l2: 0.000367157


LGBMRegressor(learning_rate=0.01, reg_alpha=10.0, reg_lambda=10.0)

In [9]:
# GOSS regressor: learning rate 0.1, at most 100 trees; training stops early
# once the validation score has not improved for 10 rounds.
model5 = lgb.LGBMRegressor(
    boosting_type='goss',
    n_estimators=100,
    learning_rate=0.1,
)
model5.fit(
    X_train,
    y_train,
    eval_set=[(X_valid, y_valid)],
    callbacks=[lgb.early_stopping(10)],
)

Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[6]	valid_0's l2: 0.000367223


LGBMRegressor(boosting_type='goss')

In [10]:
# GOSS regressor: learning rate 0.1, at most 100 trees, with reg_alpha and
# reg_lambda both set to 10; training stops early once the validation score
# has not improved for 10 rounds.
model6 = lgb.LGBMRegressor(
    boosting_type='goss',
    n_estimators=100,
    learning_rate=0.1,
    reg_alpha=10.0,
    reg_lambda=10.0,
)
model6.fit(
    X_train,
    y_train,
    eval_set=[(X_valid, y_valid)],
    callbacks=[lgb.early_stopping(10)],
)

Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[8]	valid_0's l2: 0.000367158


LGBMRegressor(boosting_type='goss', reg_alpha=10.0, reg_lambda=10.0)

In [11]:
# GOSS regressor: learning rate 0.1, at most 100 trees, with reg_alpha and
# reg_lambda both set to 5; training stops early once the validation score
# has not improved for 10 rounds.
model7 = lgb.LGBMRegressor(
    boosting_type='goss',
    n_estimators=100,
    learning_rate=0.1,
    reg_alpha=5.0,
    reg_lambda=5.0,
)
model7.fit(
    X_train,
    y_train,
    eval_set=[(X_valid, y_valid)],
    callbacks=[lgb.early_stopping(10)],
)

Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[5]	valid_0's l2: 0.000367182


LGBMRegressor(boosting_type='goss', reg_alpha=5.0, reg_lambda=5.0)

In [12]:
# GOSS regressor: learning rate 0.02, at most 100 trees; training stops early
# once the validation score has not improved for 10 rounds.
model8 = lgb.LGBMRegressor(
    boosting_type='goss',
    n_estimators=100,
    learning_rate=0.02,
)
model8.fit(
    X_train,
    y_train,
    eval_set=[(X_valid, y_valid)],
    callbacks=[lgb.early_stopping(10)],
)

Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[28]	valid_0's l2: 0.000367225


LGBMRegressor(boosting_type='goss', learning_rate=0.02)

## Submission

In [13]:
# scode_dict: security code -> position of its Stock object inside `stocks`.
scode_dict = {stock_obj.SecuritiesCode: pos for pos, stock_obj in enumerate(stocks)}

# all_codes: every security code as a plain int, in ascending order.
all_codes = sorted(int(stock_obj.SecuritiesCode) for stock_obj in stocks)

In [14]:
import jpx_tokyo_market_prediction
env = jpx_tokyo_market_prediction.make_env()   # initialize the environment
iter_test = env.iter_test()    # an iterator which loops over the test files

# The ensemble is the plain average of these eight fitted regressors.
models = (model1, model2, model3, model4, model5, model6, model7, model8)

for (prices, options, financials, trades, secondary_prices, sample_prediction) in iter_test:
    #----------------code here----------------#
    # Same row cleaning as for the training frame.
    prices = prices[prices['Open'].notna()].reset_index(drop=True)
    prices = prices.drop(columns=['RowId'])
    prices['ExpectedDividend'] = prices['ExpectedDividend'].fillna(0)

    # Update each security's rolling state and collect its new feature row.
    feat_rows = []
    scodes = []
    for raw_row in prices.values:
        scode = int(raw_row[1])
        if scode not in scode_dict:
            # No history for this code, so no features can be built for it.
            continue
        scodes.append(scode)
        feat_rows.append(stocks[scode_dict[scode]].add_row(raw_row))

    if feat_rows:
        feat_rows = np.array(feat_rows, dtype=np.float32)
        pred = sum(model.predict(feat_rows) for model in models) / len(models)
        mpred = np.median(pred)
    else:
        # Nothing usable was received: every security gets the same neutral score.
        pred = np.array([], dtype=np.float32)
        mpred = 0.0

    # Look predictions up by code instead of walking two lists in lockstep.
    # The positional walk raised IndexError when the last codes were missing
    # from `prices`, and silently mis-assigned scores if rows were not in
    # ascending code order. Securities without a prediction get the median.
    pred_by_code = dict(zip(scodes, pred))
    pred_submission = [pred_by_code.get(code, mpred) for code in all_codes]

    # Highest prediction -> rank 0.
    rank = np.array(pred_submission).argsort()[::-1].argsort()
    # NOTE(review): this assumes sample_prediction has one row per known
    # security, in ascending SecuritiesCode order (same as all_codes) - confirm.
    assert len(rank) == len(sample_prediction)
    #----------------------------------------#
    sample_prediction['Rank'] = rank  # make your predictions here
    env.predict(sample_prediction)   # register your predictions

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
