In [None]:
!pip --quiet install ../input/treelite/treelite-0.93-py3-none-manylinux2010_x86_64.whl
!pip --quiet install ../input/treelite/treelite_runtime-0.93-py3-none-manylinux2010_x86_64.whl


In [None]:
import pandas as pd
import numpy as np
import gc

from sklearn.preprocessing import MinMaxScaler, StandardScaler, QuantileTransformer
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
import seaborn as sns
import matplotlib.pyplot as plt
import optuna

import random
import pathlib
from tqdm import tqdm
from typing import List, NoReturn, Union, Tuple, Optional, Text, Generic, Callable, Dict

import lightgbm as lgbm
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
#plt.style.use('fivethirtyeight')
import xgboost as xgb
import sklearn
import random
import janestreet
import tensorflow as tf

# treelite
import treelite
import treelite_runtime

import warnings
warnings.filterwarnings('ignore')

In [None]:
##TREELITE

In [None]:
# Seed passed to the random number generators and to the fold splitter.
SEED = 1111
# Number of cross-validation folds.
NFOLD = 4

In [None]:
# Seed Python's `random`, NumPy and TensorFlow (in that order) with the same value.
for seed_everything in (random.seed, np.random.seed, tf.random.set_seed):
    seed_everything(SEED)

## Read the data in efficiently


In [None]:
# Directory from which load_data() reads the *.feather tables.
INPUT_DIR = '../input/janestreet-save-as-feather/'

In [None]:
#train = pd.read_csv("/kaggle/input/jane-street-market-prediction/train.csv")

# load data blitz fast!
def load_data(input_dir=INPUT_DIR):
    """Read the four Feather tables stored in ``input_dir``.

    Parameters
    ----------
    input_dir : str or os.PathLike
        Directory containing ``train.feather``, ``features.feather``,
        ``example_test.feather`` and ``example_sample_submission.feather``.
        A trailing slash is optional.

    Returns
    -------
    tuple
        ``(train, features, example_test, ss)`` as returned by
        ``pd.read_feather`` for the four files, in that order.
    """
    # Join with pathlib instead of string concatenation so that the directory
    # may be given with or without a trailing separator, or as a Path object.
    data_dir = pathlib.Path(input_dir)
    table_names = ('train', 'features', 'example_test', 'example_sample_submission')
    train, features, example_test, ss = (
        pd.read_feather(data_dir / f'{name}.feather') for name in table_names
    )
    return train, features, example_test, ss

train, features, example_test, ss = load_data(INPUT_DIR)

In [None]:
# The auxiliary tables are not needed below; drop them and run the garbage
# collector to release their memory.
del features
del example_test
del ss
gc.collect()
# Display the (rows, columns) of the training table.
train.shape

In [None]:
# Preview the first 50 rows of the training table as loaded.
train.head(50)

In [None]:
#train = train.drop(['feature_113','feature_89','feature_101'], 1)

In [None]:
# Keep only rows with date > 85 and a non-zero weight.
# Both conditions are applied before the index is reset, so the result has a
# contiguous 0..n-1 index and is a fresh frame rather than a slice of another
# one (adding the `action` column below then cannot hit SettingWithCopyWarning).
keep_rows = (train['date'] > 85) & (train['weight'] != 0)
train = train.loc[keep_rows].reset_index(drop=True)

#train.fillna(train.mean(),inplace=True)

# Binary label: 1 where `resp` is positive, 0 otherwise.
train['action'] = (train['resp'].values > 0).astype(int)


# Every column whose name contains "feature".
features = [c for c in train.columns if "feature" in c]


In [None]:
# Replace each missing value with the mean of its column.
# The fill is done in place on `train`.
column_means = train.mean()
train.fillna(column_means, inplace=True)

In [None]:
# Exclude feature_0 from the model inputs.
# The list is rebuilt (rather than using list.remove) so that this cell can be
# re-run: a second remove() would raise ValueError because the name is gone.
features = [name for name in features if name != 'feature_0']


In [None]:
#features.remove('feature_48')
#features.remove('feature_45')
#features.remove('feature_3')

In [None]:
# Number of model input columns left after excluding feature_0.
len(features)

In [None]:
# (rows, columns) of the training table after filtering and imputation.
train.shape

In [None]:
# Turn every response column into a 0/1 indicator: 1 where response * weight
# is positive, 0 otherwise.
# NOTE: the original continuous values of these columns are overwritten.
for resp_name in ('resp', 'resp_1', 'resp_2', 'resp_3', 'resp_4'):
    weighted_resp = train[resp_name].values * train['weight']
    train[resp_name] = (weighted_resp > 0).astype(int)

In [None]:

# Column means of every entry of `features` except the first one; the inference
# loop uses them to fill missing values in columns 1.. of each test row.
# NOTE(review): features[0] is skipped here and in the inference loop, so that
# column is never imputed — confirm this offset is intended.
f_mean = train[features[1:]].values.mean(axis=0)

# Response columns used as training targets.
# NOTE(review): resp_4 is binarized earlier but not listed here — confirm.
resp_cols = ['resp_1', 'resp_2', 'resp_3', 'resp']

#X_train = train.loc[:, train.columns.str.contains('feature')]


In [None]:
#features.extend(['cross_41_42_43', 'cross_1_2'])

In [None]:
# Re-check the number of feature columns right before building the training matrix.
len(features)

In [None]:
# Feature matrix: one column per entry of `features`.
X_train = train[features].values
#y_train = (train.loc[:, 'action'])

# Target matrix: one 0/1 column per entry of `resp_cols` (1 where the column is > 0).
target_columns = [(train[c] > 0).astype('int') for c in resp_cols]
y_train = np.column_stack(target_columns)
print(y_train.shape)

## `X_train` and `y_train` are NumPy arrays from here on

**Let us check the important features using logistic regression**


In [None]:
import random
from collections import Counter, defaultdict
from sklearn import model_selection

# ---- GroupKFold ----
class GroupKFold(object):
    """
    K-fold splitter that keeps all rows of one group in the same fold.

    The unique values of the group column are split with
    ``sklearn.model_selection.KFold`` (optionally shuffled); each fold then
    consists of the positions of the rows whose group value landed on the
    train or validation side. The layout mirrors sklearn splitters
    (``get_n_splits`` / ``split``), but ``split`` expects a DataFrame and the
    *name* of the group column.

    Parameters
    ----------
    n_splits : int, default 4
        Number of folds.
    shuffle : bool, default True
        Whether the unique group values are shuffled before splitting.
    random_state : int or None, default 42
        Seed for the shuffle. Ignored when ``shuffle`` is False.
    """

    def __init__(self, n_splits=4, shuffle=True, random_state=42):
        self.n_splits = n_splits
        self.shuffle = shuffle
        self.random_state = random_state

    def get_n_splits(self, X=None, y=None, group=None):
        """Return the number of folds; the arguments are accepted but not used."""
        return self.n_splits

    def split(self, X, y, group):
        """
        Yield ``(train_idx, val_idx)`` arrays of row positions for each fold.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing the group column.
        y : any
            Not used.
        group : str
            Name of the column in ``X`` holding the group of each row.
        """
        # Recent scikit-learn releases raise ValueError when KFold receives a
        # random_state together with shuffle=False, so only forward the seed
        # when shuffling is actually requested.
        seed = self.random_state if self.shuffle else None
        kf = model_selection.KFold(n_splits=self.n_splits, shuffle=self.shuffle, random_state=seed)
        group_values = X[group]
        unique_ids = group_values.unique()
        for tr_group_idx, va_group_idx in kf.split(unique_ids):
            # Map the split of unique group values back to row positions.
            tr_group, va_group = unique_ids[tr_group_idx], unique_ids[va_group_idx]
            train_idx = np.flatnonzero(group_values.isin(tr_group).values)
            val_idx = np.flatnonzero(group_values.isin(va_group).values)
            yield train_idx, val_idx



In [None]:
# modeling step
# LightGBM settings shared by every fold/target model.
params = {
    "num_leaves": 300,
    "max_bin": 450,
    "feature_fraction": 0.52,
    "bagging_fraction": 0.52,
    "objective": "binary",
    "learning_rate": 0.05,
    "boosting_type": "gbdt",
    "metric": "auc",
}

## should we split by k-fold as well?
# because the tree is definitely overfitting
# it just ekes out a good enough signal in the end

# Folds are built on the `date` column, so one date never appears on both the
# training and the validation side of a fold.
cv = GroupKFold(n_splits=NFOLD, shuffle=True, random_state=SEED)
group = 'date'
target = 'action'
# Allocated here but not filled in this cell.
oof = np.zeros(train.shape[0])
# Collects every trained booster, ordered by fold and then by target column.
models = []

## OUTER FOLD
fold_splits = cv.split(train, train[target], group)
for fold, (train_idx, val_idx) in tqdm(enumerate(fold_splits)):
    # Row positions from the splitter index straight into the NumPy matrices.
    xtr, xval = X_train[train_idx, :], X_train[val_idx, :]
    ytr, yval = y_train[train_idx, :], y_train[val_idx, :]

    ## ensemble by y_train: one binary model per target column
    for i in range(y_train.shape[1]):
        print('MODEL: ', f'{fold}+{i}')

        d_train = lgbm.Dataset(xtr, label=ytr[:, i])
        d_eval = lgbm.Dataset(xval, label=yval[:, i], reference=d_train)
        clf = lgbm.train(
            params,
            d_train,
            valid_sets=[d_train, d_eval],
            num_boost_round=1000,
            early_stopping_rounds=50,
            verbose_eval=50,
        )
        # Saved as text so that the Treelite cell can load it again by name.
        clf.save_model(f'model_fold_{fold}_{i}.txt')

        models.append(clf)

    ## there is technically a holdout set in y_val x_val now here...


In [None]:
## evaluate model's raw classification accuracy
# for i in range(y_train.shape[1]):
#     xtr,xval,ytr,yval = train_test_split(X_train ,y_train[:,i],test_size=0.2,stratify=y_train[:,i])
   
#     d_train = lgbm.Dataset(xtr,label=ytr)
#     d_eval = lgbm.Dataset(xval,label=yval,reference=d_train)
#     for model in models:
#         model.predict(d_eval);
    

## Logistic Regression as well


In [None]:
# log_models = []
# from sklearn.metrics import roc_auc_score
# for i in range(y_train.shape[1]):
#     print('model: ',i)
#     xtr,xval,ytr,yval = train_test_split(X_train ,y_train[:,i],test_size=0.2,stratify=y_train[:,i])

#     logreg = LogisticRegression(max_iter = 2000).fit(xtr, ytr);
#     print(logreg.score(xval, yval), roc_auc_score(yval, logreg.predict(xval)));
#     log_models.append(clf)

In [None]:
# Gain-based feature importance of `clf`, i.e. the booster assigned last by the
# training loop, limited to at most 130 features.
fig, ax = plt.subplots(figsize=(25, 50))
lgbm.plot_importance(
    clf,
    ax=ax,
    importance_type='gain',
    max_num_features=130,
)
plt.show()

In [None]:
## treelite post
# Compile every saved LightGBM model into its own shared library and open it
# with the Treelite runtime. `predictors` follows the same order as the saved
# model files: by fold, then by target column.
toolchain = 'gcc'
predictors = []
for fold in range(NFOLD):
    # load LGB with Treelite
    for i in range(y_train.shape[1]):

        model = treelite.Model.load(f'model_fold_{fold}_{i}.txt', model_format='lightgbm')

        # generate shared library
        # The file name carries both the fold and the target index: with only
        # the fold in the name, all target models of one fold were written to
        # the same .so file and overwrote each other.
        libpath = f'./mymodel_{fold}_{i}.so'
        model.export_lib(toolchain=toolchain, libpath=libpath,
                         params={'parallel_comp': 32}, verbose=True)

        # predictor from treelite
        predictor = treelite_runtime.Predictor(libpath, verbose=True)
        predictors.append(predictor)

## Prediction structure
The prediction loop stays the same regardless of the number of folds and target labels in the ensemble.

In [None]:
import janestreet

# Reduction applied to the averaged model output, and the decision threshold.
f = np.median
th = 0.500  ## don't adjust we have the median on pred

# NOTE(review): predictions below come from the in-memory LightGBM `models`;
# the compiled Treelite `predictors` are not used in this loop.
env = janestreet.make_env()
for (test_df, pred_df) in env.iter_test():
    if not test_df['weight'].item() > 0:
        # Rows without positive weight get action 0 without running the models.
        pred_df.action = 0
        env.predict(pred_df)
        continue

    x_tt = test_df.loc[:, features].values
    #x_tt=test_df[features].values
    # Fill missing values in columns 1.. with the stored training means
    # (column 0 is left untouched, matching how f_mean was computed).
    if np.isnan(x_tt[:, 1:].sum()):
        x_tt[:, 1:] = np.nan_to_num(x_tt[:, 1:]) + np.isnan(x_tt[:, 1:]) * f_mean

    #drop test_df
    ## log_models or normal models
    # Average the outputs of all boosters, reduce with `f`, then threshold.
    per_model_preds = [model.predict(x_tt) for model in models]
    pred = f(np.mean(per_model_preds, axis=0))
    pred_df.action = np.where(pred >= th, 1, 0).astype(int)
    env.predict(pred_df)

In [None]:
#preds = clf.predict(xtr)
#pred_labels = np.rint(preds)


    
#accuracy = sklearn.metrics.accuracy_score(ytr, pred_labels)
