In [1]:
# import libraries
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)
import os
from os.path import isfile, isdir, join
import numpy as np
import pandas as pd
from datetime import datetime, date
from dateutil.relativedelta import relativedelta
from bs4 import BeautifulSoup
import re
from IPython.display import display
from zipfile import ZipFile
import pickle
import unicodedata
import pytz
from joblib import Parallel, delayed
import shutil
import difflib
import random
import math
from shutil import copyfile
import itertools
import time
from tqdm import tqdm
import collections
from collections import deque
import gc

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import ndcg_score, accuracy_score

import lightgbm as lgbm
import optuna
from optuna import Trial, visualization

import matplotlib as mpl
from matplotlib import pyplot as plt

import jpx_tokyo_market_prediction

from utility_script import *

# Show every column and never truncate cell contents when frames are displayed.
for _display_option in ('display.max_columns', 'display.max_colwidth'):
    pd.set_option(_display_option, None)

In [2]:
# Notebook parameters: every tunable constant used by the cells below.

# sampling / cross-validation settings
SEED = 0
N_STOCK_SAMPLE = 2000
N_DAY_SAMPLE = 300
N_FOLD = 5

# root folder of the competition data; every input file is read relative to it
JPX_PATH = '../input/jpx-tokyo-stock-exchange-prediction'

# sorted codes of the securities flagged Universe0 == 1, capped at N_STOCK_SAMPLE
STOCK_LIST = (
    pd.read_csv(f'{JPX_PATH}/stock_list.csv')
    .loc[lambda x: x.Universe0==1]
    .SecuritiesCode.sort_values().unique().tolist()[:N_STOCK_SAMPLE]
)

# look-back horizons, expressed in number of rows (trading days) per label
LAGS = {'1d':1, '3d':3, '1w':5, '1m':20, '3m':20*3, '6m':20*6, '12m':20*12}
# LAGS = {'3d':3, '1w':5}
MAX_DAYS_LAG = max(LAGS.values())
# number of days kept in the rolling data window
WIN_SIZE = 500

# risk model settings
N_COMP = 4          # number of principal components
N_DAY_PCA = 120     # days used to fit the PCA
N_DAY_BETA = 60     # days used for the factor loadings / basic volatilities
WEIGHT_SMOOTH = 0.5 # weight of the average variance in the basic covariance

In [3]:
class JPXData:
    """Rolling window over the raw data frames plus the features derived from it.

    Every call to `push_forward` adds one day of data. Until `window_size`
    days are held the window only grows; afterwards the oldest day is dropped
    for each new one and features / risk models are recomputed on the window.
    Accumulated features are periodically pickled under ./features.
    """
    def __init__(self, window_size, df_names):
        """Create an empty window and reset the output folders.

        Args:
            window_size: number of days kept in the rolling window.
            df_names: names of the data frames, in the same order in which
                `push_forward` receives the frames.
        """
        self.size = 0                      # days currently held (stops growing at window_size)
        self.window_size = window_size
        self.df_names = df_names
        self.num_df = len(df_names)
        self.data = {df_name : pd.DataFrame() for df_name in df_names}
        # per frame: number of rows contributed by each day in the window,
        # used to drop exactly the oldest day's rows when the window shifts
        self.row_counts = {df_name : [] for df_name in df_names}
        self.dates = []
        self.first_date, self.last_date = None, None
        self.features = []                 # feature frames not yet archived
        self.risk_models = {}
        self.curr_features = None
        self.curr_risk_models = None
        self.n_day_hist = 0                # number of days appended so far
        self.archive_map = pd.DataFrame()  # Date -> n_day_hist suffix of the archive file
        self.init_folders()

    def init_folders(self):
        """Recreate empty ./features and ./risk_models folders."""
        for folder in ('./features', './risk_models'):
            shutil.rmtree(path=folder, ignore_errors=True)
        for folder in ('./features', './risk_models'):
            # exist_ok: rmtree above ignores errors, so the folder may still be there
            os.makedirs(folder, exist_ok=True)

    def append_data(self):
        """Add the current features to the pending history."""
        self.features.append(self.curr_features)
        # Risk models are intentionally not accumulated here (only the latest
        # one is kept in self.curr_risk_models).
        self.n_day_hist += 1

    def archive_data(self):
        """Pickle the pending features and record which dates went into the file.

        The archive map is built from the dates of the archived feature
        frames. It used to be built from `self.risk_models`, which is never
        populated, so the map stayed empty.
        """
        save_pkl(self.features, f'./features/features_{self.n_day_hist}')
        archived_dates = [
            feat.Date.iloc[0]
            for feat in self.features
            if feat is not None and len(feat) > 0
        ]
        self.archive_map = pd.concat([self.archive_map, pd.DataFrame({'Date':archived_dates}).assign(n_day_hist=self.n_day_hist)])
        self.clear_hist()

    def clear_hist(self):
        """Drop the pending (already archived) history."""
        self.features = []
        self.risk_models = {}

    def save_archive_map(self):
        """Pickle the archive map."""
        save_pkl(self.archive_map, 'archive_map')

    def push_forward(self, new_data, append, last):
        """Advance the window by one day.

        Args:
            new_data: sequence of data frames for the new day, in the same
                order as `df_names`; the first one must have a `Date` column
                with at least one row.
            append: when truthy, the freshly computed features are added to
                the pending history.
            last: when truthy, the pending history is archived even if the
                20-day batch is not complete.
        """
        # assign names to new data assuming the same as df_names
        new_data = dict(zip(self.df_names, new_data))
        new_date = new_data[self.df_names[0]].Date.iloc[0]
        # case when no enough data
        if self.size < self.window_size:
            for df_name in self.df_names:
                self.data[df_name] = pd.concat([self.data[df_name], new_data[df_name]]).reset_index(drop=True)
                self.row_counts[df_name] = self.row_counts[df_name] + [new_data[df_name].shape[0]]
            self.dates = self.dates + [new_date]
            self.size += 1
        # general case (shift by 1 day)
        else:
            for df_name in self.df_names:
                # skip the rows of the oldest day, then add the new day
                self.data[df_name] = pd.concat([self.data[df_name].iloc[self.row_counts[df_name][0]:], new_data[df_name]]).reset_index(drop=True)
                self.row_counts[df_name] = self.row_counts[df_name][1:] + [new_data[df_name].shape[0]]
            self.dates = self.dates[1:] + [new_date]
        # update date range
        self.first_date, self.last_date = self.dates[0], self.dates[-1]
        # generate features & risk models
        if self.size == self.window_size:
            self.curr_features = get_features(self.data)
            self.curr_risk_models = get_cov_matrix(self.data)
            if append:
                self.append_data()
                # archive in batches of 20 days, and once more at the very end
                if (self.n_day_hist%20 == 0 and self.n_day_hist > 0) or last:
                    self.archive_data()
        log(f'Pushed to latest date: {self.last_date}')

In [4]:
def standard_dist(s, lag):
    """Distance of the latest value of `s` from the mean of its last
    LAGS[lag] observations, measured in standard deviations of that window."""
    window = s.tail(LAGS[lag])
    latest = s.iloc[-1]
    deviation = latest - window.mean()
    return deviation / window.std()

def ma_pctg_ch(s, lag):
    """Relative change of the latest value of `s` versus the mean of its
    last LAGS[lag] observations."""
    window_mean = s.tail(LAGS[lag]).mean()
    ratio = s.iloc[-1] / window_mean
    return ratio - 1

def sharpe(s, lag):
    """Mean divided by standard deviation over the last LAGS[lag] values of `s`.

    Returns 0 when the standard deviation is not strictly positive
    (which includes the NaN case).
    """
    window = s.tail(LAGS[lag])
    # guard clause: a zero or NaN std would make the ratio meaningless
    if not window.std() > 0:
        return 0
    return window.mean() / window.std()

In [5]:
'''
Risk Model calculation
'''
def cal_sigma_pca(df, n_comp, n_day_pca, n_day_beta):
    """Factor-model covariance matrix built from principal components.

    Args:
        df: long-format frame with `Date`, `SecuritiesCode` and `ret` columns.
        n_comp: number of principal components used as factors.
        n_day_pca: number of most recent dates used to fit the PCA.
        n_day_beta: number of most recent dates used to estimate the factor
            loadings and the factor covariance.

    Returns:
        float32 array of shape (n_stock, n_stock) equal to B @ Omega @ B.T,
        with stocks in the column order of the pivoted frame.
    """
    # transform data to Date X Stock (missing returns are treated as 0)
    returns = df.pivot(index='Date', columns='SecuritiesCode', values='ret').fillna(0).tail(n_day_pca)
    # find the first k PCs
    pca = PCA(n_components=n_comp).fit(returns)
    pc = pca.transform(returns)
    # calculate factor loadings on the most recent n_day_beta dates
    pc = pc[-n_day_beta:]
    returns = returns.tail(n_day_beta)
    # One least-squares solve (no intercept) for all stocks at once: every
    # stock shares the same design matrix `pc`, so this replaces one task per
    # stock, each inverting the same X.T @ X.
    coef, _, _, _ = np.linalg.lstsq(pc, returns.to_numpy(), rcond=None)
    B = coef.T  # (n_stock, n_comp)
    # calculate factor covariance; atleast_2d keeps n_comp == 1 working,
    # where np.cov returns a 0-d array
    Omega = np.atleast_2d(np.cov(pc, rowvar=False))
    # calculate final covariance matrix
    sigma_pca = B @ Omega @ B.T
    # reduce size
    return sigma_pca.astype(np.float32)

def cal_sigma_basic(df, weight_smooth, n_day_hist):
    """Diagonal covariance matrix blending each stock's variance with the
    squared cross-sectional average volatility.

    Args:
        df: long-format frame with `Date`, `SecuritiesCode` and `ret` columns.
        weight_smooth: weight given to the common (average) variance; the
            stock-specific variance gets 1 - weight_smooth.
        n_day_hist: number of most recent dates used.

    Returns:
        float32 diagonal array of shape (n_stock, n_stock).
    """
    returns_wide = (
        df.pivot(index='Date', columns='SecuritiesCode', values='ret')
        .fillna(0)
        .tail(n_day_hist)
    )
    n_stock = returns_wide.shape[1]
    stock_vol = returns_wide.std()
    common_var = stock_vol.mean()**2
    stock_var = stock_vol**2
    common_part = weight_smooth * common_var * np.eye(n_stock)
    specific_part = (1 - weight_smooth) * np.diag(stock_var.to_numpy())
    # reduce size
    return (common_part + specific_part).astype(np.float32)

def get_cov_matrix(data):
    """Compute both risk models from the price frame of `data`.

    `data` must hold exactly five frames with the price frame first; only the
    first is used. A `ret` column (per-security percentage change of Close)
    is written onto that frame in place.

    Returns:
        dict with keys 'sigma_pca' and 'sigma_basic'.
    """
    (prices, _sec_prices, _fins, _opts, _trades) = tuple(data.values())
    close_by_stock = prices.groupby('SecuritiesCode').Close
    prices['ret'] = close_by_stock.pct_change()
    risk_models = {}
    risk_models['sigma_pca'] = cal_sigma_pca(prices, N_COMP, N_DAY_PCA, N_DAY_BETA)
    risk_models['sigma_basic'] = cal_sigma_basic(prices, WEIGHT_SMOOTH, N_DAY_BETA)
    return risk_models

In [6]:
%%time

def get_features(data):
    """Build the feature table for the most recent date held in `data`.

    Args:
        data: dict holding exactly five frames, in this order: stock prices,
            secondary stock prices, financials, options, trades. Options and
            trades are currently unused. The price frames are modified in
            place (helper columns are added and, for the main price frame,
            forward-filled).

    Returns:
        DataFrame with one row per security present on the last date of the
        price frame: `RowId`, `Date`, `SecuritiesCode` followed by float32
        feature columns. Infinite values are replaced by NaN and NaNs are
        filled with the cross-sectional column mean.
    """
    df_prices, df_sec_prices, df_fins, df_opts, df_trades = tuple(data.values())

    # base table: explicit copy so the column assignments below do not write
    # onto a chained selection of df_prices
    features = df_prices.loc[lambda x: x.Date==x.Date.iloc[-1]][['RowId','Date','SecuritiesCode']].copy()

    # ------------------------------------------------------------------
    # Major stock prices features
    # ------------------------------------------------------------------
    # precalculate new columns
    df_prices['zero_trade'] = (df_prices.Volume==0).astype(int)
    # forward-fill every column after the first three, per security
    cols = df_prices.columns.tolist()[3:]
    df_prices[cols] = df_prices.groupby('SecuritiesCode')[cols].ffill()
    df_prices['ret'] = df_prices.groupby('SecuritiesCode').Close.pct_change()
    # market return = equal-weighted mean return per date
    ret_mkt = df_prices.groupby('Date').ret.mean()
    var_mkt = (ret_mkt**2).tail(LAGS['12m']).sum()
    df_prices['ret_mkt'] = df_prices.Date.map(ret_mkt)
    df_prices['spread'] = df_prices['High'] - df_prices['Low']
    df_prices['div_ratio'] = df_prices['ExpectedDividend'].fillna(0) / df_prices['Close']
    df_prices['dollar_traded'] = np.log(df_prices.Volume * (df_prices.Open + df_prices.Close)/2 + 1)
    df_prices['RS_sqrt_vol'] = np.sqrt(np.log(df_prices['High']/df_prices['Close'])*np.log(df_prices['High']/df_prices['Open']) + np.log(df_prices['Low']/df_prices['Close'])*np.log(df_prices['Low']/df_prices['Open']))
    # zero trade indicator
    features['zero_trade'] = features.SecuritiesCode.map(df_prices.groupby('SecuritiesCode').zero_trade.last())
    # previous day return
    features['ret'] = features.SecuritiesCode.map(df_prices.groupby('SecuritiesCode').ret.last())
    # Change in Close price
    for lag in ['3d','1w']:
        features[f'price_ma_pctg_ch_{lag}'] = features.SecuritiesCode.map(df_prices.groupby('SecuritiesCode').Close.apply(lambda s: ma_pctg_ch(s, lag)))
    for lag in ['1m','3m','6m','12m']:
        features[f'price_standard_dist_{lag}'] = features.SecuritiesCode.map(df_prices.groupby('SecuritiesCode').Close.apply(lambda s: standard_dist(s, lag)))
    # Change in volume
    for lag in ['3d','1w']:
        features[f'volume_ma_pctg_ch_{lag}'] = features.SecuritiesCode.map(df_prices.groupby('SecuritiesCode').Volume.apply(lambda s: ma_pctg_ch(s, lag)))
    for lag in ['1m','3m','6m','12m']:
        features[f'volume_standard_dist_{lag}'] = features.SecuritiesCode.map(df_prices.groupby('SecuritiesCode').Volume.apply(lambda s: standard_dist(s, lag)))
    # daily spread
    for lag in ['3d','1w']:
        features[f'spread_ma_pctg_ch_{lag}'] = features.SecuritiesCode.map(df_prices.groupby('SecuritiesCode').spread.apply(lambda s: ma_pctg_ch(s, lag)))
    for lag in ['1m']:
        features[f'spread_standard_dist_{lag}'] = features.SecuritiesCode.map(df_prices.groupby('SecuritiesCode').spread.apply(lambda s: standard_dist(s, lag)))
    # volatility
    for lag in ['1w','1m','3m','12m']:
        features[f'volatility_{lag}'] = features.SecuritiesCode.map(df_prices.groupby('SecuritiesCode').ret.apply(lambda s: s.tail(LAGS[lag]).std()))
    # change in volatility
    features['volatility_diff'] = features['volatility_1w'] - features['volatility_1m']
    # market return and volatility (same scalar for every row)
    for lag in ['3d','1w','1m','3m']:
        features[f'ret_mkt_{lag}'] = ret_mkt.tail(LAGS[lag]).sum()
        features[f'vol_mkt_{lag}'] = ret_mkt.tail(LAGS[lag]).std()
    # beta: sum of (stock return * market return) over sum of squared market
    # returns, both over the last 12m of rows (no demeaning)
    df_prices['beta'] = df_prices.SecuritiesCode.map(df_prices.groupby('SecuritiesCode').apply(lambda df: (df.set_index('Date').ret * ret_mkt).tail(LAGS['12m']).sum() / var_mkt))
    features['beta'] = features.SecuritiesCode.map(df_prices.groupby('SecuritiesCode')['beta'].last())
    # excess return
    df_prices['exret'] = df_prices['ret'] - df_prices['beta'] * df_prices['ret_mkt']
    for lag in ['3d','1w','1m','3m']:
        features[f'exret_{lag}'] = features.SecuritiesCode.map(df_prices.groupby('SecuritiesCode')['exret'].apply(lambda s: s.tail(LAGS[lag]).sum()))
    # supervision
    features['supervision'] = features.SecuritiesCode.map(df_prices.groupby('SecuritiesCode').SupervisionFlag.apply(lambda s: s.astype(int).iloc[-1]))
    # div ratio: latest expected dividend over the 1m average Close
    features['div_ratio'] = features.SecuritiesCode.map(df_prices.groupby('SecuritiesCode').apply(lambda df: df.ExpectedDividend.fillna(0).iloc[-1] / df.Close.tail(LAGS['1m']).mean()))
    # change in dollar value traded
    for lag in ['1w','1m']:
        features[f'dollar_standard_dist_{lag}'] = features.SecuritiesCode.map(df_prices.groupby('SecuritiesCode').dollar_traded.apply(lambda s: standard_dist(s, lag)))
    # RS_sqrt_vol
    features['RS_sqrt_vol'] = features.SecuritiesCode.map(df_prices.groupby('SecuritiesCode').RS_sqrt_vol.last())
    # sharpe
    for lag in ['1m','3m']:
        features[f'sharpe_{lag}'] = features.SecuritiesCode.map(df_prices.groupby('SecuritiesCode').ret.apply(lambda s: sharpe(s, lag)))

    # ------------------------------------------------------------------
    # Secondary stock prices features
    # (cross-sectional aggregates: one scalar broadcast to every row)
    # ------------------------------------------------------------------
    # precalculate new columns
    df_sec_prices['ret'] = df_sec_prices.groupby('SecuritiesCode').Close.pct_change()
    df_sec_prices['dollar_traded'] = np.log(df_sec_prices.Volume * (df_sec_prices.Open + df_sec_prices.Close)/2 + 1)
    # cross-sectional return & volatility
    for n in [1,3]:
        features[f'sec_cross_sect_ret_{n}'] = df_sec_prices.groupby('SecuritiesCode').ret.apply(lambda s: s.tail(n).sum()).mean()
        features[f'sec_cross_sect_vol_{n}'] = df_sec_prices.groupby('SecuritiesCode').ret.apply(lambda s: s.tail(n).sum()).std()
    # Change in volume
    for lag in ['3d','1w']:
        features[f'sec_volume_ma_pctg_ch_{lag}'] = df_sec_prices.groupby('SecuritiesCode').Volume.apply(lambda s: ma_pctg_ch(s, lag)).mean()
    # volatility
    for lag in ['1w','1m','3m']:
        features[f'sec_volatility_{lag}'] = df_sec_prices.groupby('SecuritiesCode').ret.apply(lambda s: s.tail(LAGS[lag]).std()).mean()
    # change in volatility
    features['sec_volatility_diff'] = features['sec_volatility_1w'] - features['sec_volatility_1m']
    # change in dollar value traded
    for lag in ['1w','1m']:
        features[f'sec_dollar_standard_dist_{lag}'] = df_sec_prices.groupby('SecuritiesCode').dollar_traded.apply(lambda s: standard_dist(s, lag)).mean()

    # ------------------------------------------------------------------
    # Time phase features (sin / cos encoding of the last row's date)
    # ------------------------------------------------------------------
    # day in week
    day_in_week_angle = (features.Date.dt.weekday / 5 * 2 * np.pi).iloc[-1]
    features['day_in_week_sin'] = np.sin(day_in_week_angle)
    features['day_in_week_cos'] = np.cos(day_in_week_angle)
    # day in month
    day_in_month_angle = ((features.Date.dt.day - 1) / 31 * 2 * np.pi).iloc[-1]
    features['day_in_month_sin'] = np.sin(day_in_month_angle)
    features['day_in_month_cos'] = np.cos(day_in_month_angle)
    # week in year: ISO week number read from the Timestamp itself, because
    # the Series.dt.week accessor is deprecated and gone in pandas 2.0
    week_in_year = features.Date.iloc[-1].isocalendar()[1]
    week_in_year_angle = (week_in_year - 1) / 52 * 2 * np.pi
    features['week_in_year_sin'] = np.sin(week_in_year_angle)
    features['week_in_year_cos'] = np.cos(week_in_year_angle)

    # ------------------------------------------------------------------
    # Financials features
    # ------------------------------------------------------------------
    # convert string to numbers ('－' marks a missing value)
    fin_cols = ['NetSales','OperatingProfit','OrdinaryProfit','Profit','EarningsPerShare','TotalAssets','Equity','EquityToAssetRatio','BookValuePerShare']
    df_fins[fin_cols] = df_fins[fin_cols].replace('－',np.nan).astype(float)
    # drop invalid rows
    df_fins = df_fins \
        .loc[lambda x: x.NetSales > 0] \
        .sort_values(['SecuritiesCode','Date']) \
        .reset_index(drop=True)
    # compute each quarter's YoY % change (change versus the previous record
    # of the same security and period type)
    # NOTE(review): the trailing ffill() runs over the whole frame, not per
    # security, so a leading NaN can be filled from the preceding row of a
    # different security -- confirm this is intended.
    yoy_changes = pd.concat([df_fins[['SecuritiesCode','Date','TypeOfCurrentPeriod']], df_fins.groupby(['SecuritiesCode','TypeOfCurrentPeriod'])[fin_cols].pct_change().ffill()], axis=1)
    # take last record as current feature values, one column set per period type
    feats_fins = pd.concat([yoy_changes \
                            .loc[lambda x: x.TypeOfCurrentPeriod==q] \
                            .groupby('SecuritiesCode') \
                            .last()[fin_cols] \
                            .set_axis([f'{x}_{q}' for x in fin_cols], axis=1)
                            for q in ['1Q','2Q','3Q','FY']],
                           axis=1)
    features = features.merge(feats_fins, how='left', on='SecuritiesCode')
    # num days since last announcement
    features['days_since_last_fin'] = (features.Date - features.SecuritiesCode.map(yoy_changes.groupby('SecuritiesCode').Date.last())) / np.timedelta64(1,'D')

    # ------------------------------------------------------------------
    # Post-processing
    # ------------------------------------------------------------------
    cols = [c for c in features.columns if c not in ['RowId','Date','SecuritiesCode']]
    features[cols] = features[cols].replace(np.inf, np.nan).replace(-np.inf, np.nan)
    features[cols] = features[cols].fillna(features[cols].mean())
    features[cols] = features[cols].astype(np.float32)

    return features

CPU times: user 5 µs, sys: 1e+03 ns, total: 6 µs
Wall time: 10.5 µs


In [7]:
%%time

# Build the rolling data object; df_names fixes the order in which the daily
# frames are handed to push_forward.
data = JPXData(window_size=WIN_SIZE, df_names=['df_prices', 'df_sec_prices', 'df_fins', 'df_opts', 'df_trades'])

# Replay both folders day by day (use ['train_files'] alone for a shorter run).
for folder in ['train_files', 'supplemental_files']:
    # load the five raw tables of this folder
    df_prices = pd.read_csv(f'{JPX_PATH}/{folder}/stock_prices.csv', parse_dates=['Date'])
    df_sec_prices = pd.read_csv(f'{JPX_PATH}/{folder}/secondary_stock_prices.csv', parse_dates=['Date'])
    df_fins = pd.read_csv(f'{JPX_PATH}/{folder}/financials.csv', parse_dates=['Date'])
    df_opts = pd.read_csv(f'{JPX_PATH}/{folder}/options.csv', parse_dates=['Date'])
    df_trades = pd.read_csv(f'{JPX_PATH}/{folder}/trades.csv', parse_dates=['Date'])

    # feed one date at a time, in chronological order of the price table
    date_list = df_prices.Date.sort_values().unique()
    n_dates = len(date_list)
    for i, curr_date in enumerate(date_list):
        # the final date of each folder forces an archive of pending features
        last = (i == n_dates - 1)
        data.push_forward(
            [df.loc[df.Date == curr_date] for df in (df_prices, df_sec_prices, df_fins, df_opts, df_trades)],
            append=True,
            last=last,
        )

    # drop the raw tables before loading the next folder
    del df_prices, df_sec_prices, df_fins, df_opts, df_trades
    gc.collect()

# persist the date -> archive index and the data object itself
data.save_archive_map()
save_pkl(data, 'data')

[2022-06-03 15:39:24] Pushed to latest date: 2017-01-04 00:00:00
[2022-06-03 15:39:24] Pushed to latest date: 2017-01-05 00:00:00
[2022-06-03 15:39:24] Pushed to latest date: 2017-01-06 00:00:00
[2022-06-03 15:39:24] Pushed to latest date: 2017-01-10 00:00:00
[2022-06-03 15:39:24] Pushed to latest date: 2017-01-11 00:00:00
[2022-06-03 15:39:24] Pushed to latest date: 2017-01-12 00:00:00
[2022-06-03 15:39:24] Pushed to latest date: 2017-01-13 00:00:00
[2022-06-03 15:39:24] Pushed to latest date: 2017-01-16 00:00:00
[2022-06-03 15:39:24] Pushed to latest date: 2017-01-17 00:00:00
[2022-06-03 15:39:24] Pushed to latest date: 2017-01-18 00:00:00
[2022-06-03 15:39:24] Pushed to latest date: 2017-01-19 00:00:00
[2022-06-03 15:39:24] Pushed to latest date: 2017-01-20 00:00:00
[2022-06-03 15:39:24] Pushed to latest date: 2017-01-23 00:00:00
[2022-06-03 15:39:24] Pushed to latest date: 2017-01-24 00:00:00
[2022-06-03 15:39:24] Pushed to latest date: 2017-01-25 00:00:00
[2022-06-03 15:39:25] Pus