In [None]:
!pip install ../input/talib-source/talib_binary-0.4.19-cp37-cp37m-manylinux1_x86_64.whl
import talib as ta

In [None]:
import os
import gc
import numpy as np
import pandas as pd
from tqdm import tqdm
import itertools

In [None]:
class Config:
    """Filesystem locations of the competition inputs used by this notebook."""

    # Root folder of the competition dataset; the other paths live under it.
    info_path = "../input/jpx-tokyo-stock-exchange-prediction"

    # Folder from which the main stock_prices.csv is read.
    train_path = "../input/jpx-tokyo-stock-exchange-prediction/train_files"
    # Folder from which the supplemental stock_prices.csv is read.
    supp_train_path = "../input/jpx-tokyo-stock-exchange-prediction/supplemental_files"

    # The two paths below are not referenced by the cells visible here.
    data_spec_path = "../input/jpx-tokyo-stock-exchange-prediction/data_specifications"
    exp_test_path = "../input/jpx-tokyo-stock-exchange-prediction/example_test_files"

In [None]:
%%time
o_train = pd.read_csv(os.path.join(Config.train_path, "stock_prices.csv"), parse_dates=["Date"])
o_train.drop(columns=["ExpectedDividend"], inplace=True)
o_train.dropna(axis=0, inplace=True)
o_train.reset_index(drop=True).isna().sum()
o_train.head()

In [None]:
%%time
s_train = pd.read_csv(os.path.join(Config.supp_train_path, "stock_prices.csv"), parse_dates=["Date"])
s_train.drop(columns=["ExpectedDividend"], inplace=True)
s_train.dropna(axis=0, inplace=True)
s_train.reset_index(drop=True).isna().sum()
s_train.head()

In [None]:
# Stack the main and supplemental histories into one frame with a fresh
# 0..n-1 index, then lower-case every column name.
prices = pd.concat([o_train, s_train], ignore_index=True)
prices = prices.rename(columns=str.lower)

In [None]:
# Distinct security codes, in order of first appearance in `prices`.
sec_codes = pd.unique(prices["securitiescode"])

In [None]:
def getFeatures(data:pd.DataFrame = None, test:bool = False):
    """Build per-security technical-indicator features from a price history.

    The frame is split by ``securitiescode``; each security's rows are sorted
    by ``date`` and turned into the feature columns ``f01`` .. ``f17``. The
    ``target``, ``rowid``, ``date`` and ``securitiescode`` columns are carried
    along unchanged.

    Parameters
    ----------
    data : pd.DataFrame
        Price rows. The columns read here are ``securitiescode``, ``date``,
        ``open``, ``high``, ``low``, ``close``, ``volume``, ``target`` and
        ``rowid``.
    test : bool, default False
        When False, every row containing a NaN is dropped (this removes the
        warm-up rows of the rolling indicators). When True, all rows are kept.

    Returns
    -------
    pd.DataFrame
        The per-security feature frames concatenated in order of each
        security's first appearance in ``data``, with a fresh 0..n-1 index.

    Raises
    ------
    TypeError
        If ``data`` is None.
    """
    if data is None:
        raise TypeError("getFeatures() requires a DataFrame of prices, got None")

    # Group once instead of filtering the whole frame once per security, and
    # take the securities from `data` itself rather than from a module-level
    # list, so codes absent from (or new in) `data` are handled correctly.
    by_code = data.groupby("securitiescode", sort=False)

    dfs = []
    for _, group in tqdm(by_code, total=by_code.ngroups):
        s_prices = group.sort_values(by=["date"]).reset_index(drop=True)
        features = pd.DataFrame(index=s_prices.index)

        # Intraday return, overnight gap and raw volume dynamics.
        features['f01'] = s_prices.close/s_prices.open-1
        features['f02'] = s_prices.open/s_prices.close.shift(1)-1
        features['f03'] = s_prices.volume.diff()
        features['f04'] = s_prices.volume

        # TA-Lib indicators, all with a 3-period window.
        features['f05'] = ta.ROC(s_prices.close, timeperiod=3)
        features['f06'] = ta.TRIX(s_prices.close, timeperiod=3)
        features['f07'] = ta.ADX(s_prices.high,s_prices.low,s_prices.close, timeperiod=3)
        features['f08'] = ta.WILLR(s_prices.high,s_prices.low,s_prices.close, timeperiod=3)

        # Rolling / exponentially weighted statistics.
        features['f09'] = s_prices.volume.rolling(14).std()
        features['f10'] = s_prices.volume/ s_prices.volume.rolling(14).mean()-1
        features['f11'] = s_prices.close/ s_prices.close.ewm(span=14).mean()-1

        # NOTE(review): f12, f14 and f17 use statistics of the security's whole
        # history (mean/std/quantiles over every row), so the value on a given
        # date depends on later rows as well — confirm this is intended.
        features['f12'] = (s_prices.close - s_prices.close.mean())/s_prices.close.std()
        features['f13'] = (s_prices.close - s_prices.close.rolling(window=3, min_periods=1).mean())/s_prices.close.rolling(window=3, min_periods=1).std()
        features['f14'] = pd.qcut(s_prices.volume,q=10,labels=False,duplicates='drop')
        # NOTE(review): the denominator is zero whenever high == low.
        features['f15'] = (s_prices.close - s_prices.open)/(s_prices.high-s_prices.low)
        features['f16'] = s_prices.open/s_prices.close.shift(3)-1
        features['f17'] = (s_prices.volume-s_prices.volume.mean())/s_prices.volume.std()

        # Pass-through columns used downstream as label and join keys.
        features['target'] = s_prices['target']
        features['rowid'] = s_prices['rowid']
        features['date'] = s_prices['date']
        features['securitiescode'] = s_prices['securitiescode']

        if not test:
            features = features.dropna()
        dfs.append(features)

    return pd.concat(dfs).reset_index(drop=True)

In [None]:
# Build the training feature table from the combined price history; with the
# default test=False, getFeatures drops every row that contains a NaN.
train_ti = getFeatures(prices)

In [None]:
# Preview the first rows of the feature table.
train_ti.head()

In [None]:
# Preview the last rows of the feature table.
train_ti.tail()

In [None]:
# LightGBM training parameters shared by every fold.
params_lgb = {
    'learning_rate': 0.001,
    "objective": "regression",
    # Was "'mse" (stray leading quote), which is not a valid metric name.
    "metric": "mse",
    'boosting_type': "dart",
    "device": "gpu",
    'verbosity': -1,
    'n_jobs': -1,
    'seed': 21,
    'num_leaves': 112,
    # NOTE(review): subsample is left as None while bagging_freq is 1 —
    # confirm whether a bagging fraction was meant to be set here.
    'subsample': None,
    'bagging_freq': 1,
    'n_estimators': 1000,
}

In [None]:
import lightgbm as lgb

# Model inputs: every column of train_ti except the label and the identifier
# columns, in their original order.
trainCols = train_ti.columns.drop(["target", "rowid", "date", "securitiescode"])

In [None]:
from sklearn.model_selection import TimeSeriesSplit

tscv = TimeSeriesSplit(n_splits=5, test_size=2)

# The rows of train_ti are not in chronological order (the feature table is
# assembled security by security), so splitting on row positions would give
# folds that are not time-ordered and validation sets of only 2 rows.
# Split on the sorted unique dates instead: each validation fold is then
# 2 consecutive dates, and every training fold contains only earlier dates.
unique_dates = train_ti["date"].drop_duplicates().sort_values().to_numpy()

models = []
for tr_pos, va_pos in tscv.split(unique_dates):
    tr_mask = train_ti["date"].isin(unique_dates[tr_pos])
    va_mask = train_ti["date"].isin(unique_dates[va_pos])

    X_tr, X_val = train_ti.loc[tr_mask, trainCols], train_ti.loc[va_mask, trainCols]
    y_tr, y_val = train_ti.loc[tr_mask, "target"], train_ti.loc[va_mask, "target"]

    train_ds = lgb.Dataset(X_tr.values, label=y_tr.values)
    valid_ds = lgb.Dataset(X_val.values, label=y_val.values, reference=train_ds)

    # Early stopping is requested through the callback API; the
    # `early_stopping_rounds=` keyword of lgb.train was removed in LightGBM 4.
    # NOTE(review): params_lgb selects boosting_type "dart"; LightGBM's
    # early-stopping callback reports that it is unavailable in dart mode —
    # confirm whether it has any effect here.
    model_lgb = lgb.train(
        params_lgb,
        train_ds,
        valid_sets=[valid_ds],
        callbacks=[lgb.early_stopping(100)],
    )

    models.append(model_lgb)

In [None]:
import jpx_tokyo_market_prediction
env = jpx_tokyo_market_prediction.make_env()
iter_test = env.iter_test()

# Running price history: starts as the training prices and grows by one batch
# of rows per iteration so the indicators can look back in time.
data = prices.copy()
for (pr, _, _, _, _, sample_prediction) in iter_test:

    pr.columns = pr.columns.str.lower()
    pr["date"] = pd.to_datetime(pr["date"])

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    # keep="last" lets a freshly served row override an older copy of the
    # same (security, date).
    data = (
        pd.concat([data, pr])
        .drop_duplicates(["securitiescode", "date"], keep="last")
        .sort_values(["securitiescode", "date"])
        .reset_index(drop=True)
    )
    data_f = getFeatures(data, test=True)

    # Keep the original row labels of sample_prediction in an "index" column
    # so the ranks can be written back in the right place.
    d = sample_prediction[["Date", "SecuritiesCode"]].reset_index()
    d["Date"] = pd.to_datetime(d["Date"])
    # Left join: every requested row survives even if no feature row matches,
    # so every row of the submission receives a rank.
    d = d.merge(
        data_f,
        how="left",
        left_on=["Date", "SecuritiesCode"],
        right_on=["date", "securitiescode"],
    )

    # Predictions are computed row by row, so one call per model over the
    # whole frame gives the same values as predicting security by security.
    d["Pred"] = np.mean([model.predict(d[trainCols]) for model in models], axis=0)

    # Rank 0 goes to the highest prediction; use the actual number of rows
    # instead of assuming exactly 2000 securities.
    d = d.sort_values(by="Pred", ascending=False).set_index("index")
    d["Rank"] = np.arange(len(d))
    d = d.sort_index()

    sample_prediction["Rank"] = d["Rank"]
    submission = sample_prediction[["Date", "SecuritiesCode", "Rank"]]

    env.predict(submission)