In [51]:
%pwd
import json
import pickle

target_symbol = 'TQQQ'
cache_file = './TQQQ_stock_cache_feature.pkl'
with open(cache_file, 'rb') as fi:
    data_feature = pickle.load(fi)

cache_raw_file = './TQQQ_stock_cache.pkl'
with open(cache_raw_file, 'rb') as fi:
    data_raw = pickle.load(fi)

In [52]:
data_raw

{'TQQQ':             label_1       Open       High        Low      Close  Adj Close  \
 Date                                                                         
 2010-02-11      0.0   0.405858   0.433809   0.404560   0.431471   0.431471   
 2010-02-12      1.0   0.419729   0.436978   0.417288   0.433238   0.433238   
 2010-02-16      1.0   0.443420   0.451058   0.436459   0.450019   0.450019   
 2010-02-17      1.0   0.456046   0.457759   0.449188   0.457656   0.457656   
 2010-02-18      0.0   0.457188   0.469085   0.454435   0.466332   0.466332   
 ...             ...        ...        ...        ...        ...        ...   
 2022-09-30      1.0  20.160000  21.090000  19.280001  19.320000  19.320000   
 2022-10-03      1.0  19.700001  21.010000  19.360001  20.660000  20.660000   
 2022-10-04      0.0  21.900000  22.770000  21.889999  22.580000  22.580000   
 2022-10-05      2.0  21.670000  23.000000  21.010000  22.580000  22.580000   
 2022-10-06      2.0  22.299999  23.059999  

In [55]:
import numpy as np
import math
from prepareMLdata import prepareStockLabel
import pickle
import xgboost as xgb
from catboost import CatBoostClassifier, Pool, cv
import datetime
import pandas as pd
from hyperopt import tpe, hp, fmin, Trials
import copy
from mlStrategyLongshort import strategyEvaluate
import logging
from melog import infome
from mlPredict import dataSplit, acc_metric, error_metric, balanceLabel

def modelHyperOptTuning(target_symbol, train_dev_eval_set, data_raw, o_prediction_file, save_model = './xgb_model.json', tune_obj_reward = {'n_stake':1, 'cash_capital':1000}):
    '''
    Desc: tune ML model

    train_dev_eval_set: input train/dev/test data set

    '''
    tuning_report = {}

    #step-1: model training
    #prepare training data
    train_dev_eval_set['train'], is_binary = balanceLabel(train_dev_eval_set['train'])
    # train_dev_eval_set['dev'],is_binary = balanceLabel(train_dev_eval_set['dev'])

    feat_cols = list(filter(lambda x: x != 'label', train_dev_eval_set['train'].columns))
    label_cols = 'label'

    x_train = train_dev_eval_set['train'][feat_cols].to_numpy()
    y_train = train_dev_eval_set['train'][label_cols].to_numpy().astype(int)

    x_dev = train_dev_eval_set['dev'][feat_cols].to_numpy()
    y_dev = train_dev_eval_set['dev'][label_cols].to_numpy().astype(int)

    eval_set_df = train_dev_eval_set['dev']
    o_prediction_file = './opt_model_dev.csv'
    
#     is_binary = False
    if is_binary:
        loss_function = 'LogLoss'
        eval_metric = 'Accuracy'
    else:
        loss_function = 'MultiClass'
        eval_metric = 'TotalF1' # 'Accuracy' #F1
    
    base_catboost_cfg = {
        'iterations': 0,
        'loss_function' : loss_function,
        # 'cat_features' : [],
        'use_best_model': True,
        'verbose': False,
        'learning_rate': 0.01,
        'l2_leaf_reg' : 0,
    #     reg_alpha = 10,
        'depth': 0,
        'eval_metric' : eval_metric, #'auc', #'merror', #'mlogloss',

    }

    def model_cost(model_cfg):
        catboost_cfg = copy.deepcopy(base_catboost_cfg)
        catboost_cfg.update(
            {
            'iterations': model_cfg['iterations'],
            'learning_rate': model_cfg['learning_rate'],
            'l2_leaf_reg' : model_cfg['l2_leaf_reg'],
            'depth': model_cfg['depth'],
        }
        )
        # print(catboost_cfg)
        model_now = CatBoostClassifier(**catboost_cfg)
        model_now.fit(x_train, y_train, eval_set = [(x_dev, y_dev)])#, early_stopping_rounds=50, verbose=True)
        y_dev_pred = model_now.predict(x_dev)
#         dev_result = model_now.evals_result()
#         dev_acc = dev_result['validation_0'][eval_metric][0]
        dev_acc = acc_metric(y_dev_pred, y_dev)

        #model predict in dev
        feat_cols = list(filter(lambda x: x != 'label', eval_set_df.columns))
        label_cols = 'label' 
        #evaluate set
        x_eval = eval_set_df[feat_cols].to_numpy()
        y_eval = eval_set_df[label_cols].to_numpy().astype(int)
        y_eval_pred = model_now.predict(x_eval)
        y_eval_pred_df = pd.DataFrame(eval_set_df['label'])
        y_eval_pred_df['predict'] = y_eval_pred
        #Save prediction 
        o_cols = ['Open', 'High', 'Low','Close', 'Adj Close', 'Volume',]
        
        o_test = data_raw[target_symbol].loc[y_eval_pred_df.index][o_cols]
        o_test['openinterest'] = 0
        o_test = pd.merge(o_test, y_eval_pred_df['predict'], on='Date', how='inner')
        o_test.to_csv(o_prediction_file, index = True)
        reward_gain = strategyEvaluate(target_symbol, o_prediction_file, n_stake=1, cash_capital= 100, is_model_opt=True)

        return -reward_gain #dev_acc #

    search_space = {
        'iterations': hp.randint('iterations', 5, 100),
        'learning_rate': hp.uniform('learning_rate', 0.01, 1),
        'depth': hp.randint('depth', 2, 8),
        'l2_leaf_reg': hp.randint('l2_leaf_reg', 1, 10),
    }
    best_model = None
    best_model_val = None
    n_cycle = 0
    while(True):
        for ntry in range(3):
            opt_trials = Trials()
            best = fmin(
                fn = model_cost,
                space = search_space,
                algo = tpe.suggest,
                max_evals = 3,
                trials=opt_trials,        
            )
#             logging.info(infome(__file__, '*** XGB MODEL tuinig *** '))
            best_value = sorted(opt_trials.results, key = lambda v: v['loss'])
#             logging.info(infome(__file__, f'best = {best}, {best_value[0]} '))

            if best_model:
                if best_value[0]['loss'] < best_model_val['loss']:
                    best_model_val = best_value[0]
                    best_model = best
            else:
                best_model_val = best_value[0]
                best_model = best

            print(f'**** This is {ntry}-th try, {best}')
        if best_value[0]['loss'] < 0.0 or n_cycle >=3:
            break
        n_cycle += 1

    catboost_cfg = copy.deepcopy(base_catboost_cfg)
    catboost_cfg.update(
        {
        'iterations': best_model['iterations'],
        'learning_rate': best_model['learning_rate'],
        'l2_leaf_reg' : best_model['l2_leaf_reg'],
        'depth': best_model['depth'],
    }
    )
    print(catboost_cfg)
    model_now = CatBoostClassifier(**catboost_cfg)
    model_now.fit(x_train, y_train, eval_set = [(x_dev, y_dev)])#, early_stopping_rounds=50, verbose=True)
 
    model_now.save_model(save_model)
    
    # #report
    tuning_report['model'] = 'catboost'
    tuning_report['param'] = catboost_cfg

    #evaluate model
    model_now.load_model(save_model)
    y_train_pred = model_now.predict(x_train)
    acc = acc_metric(y_train_pred, y_train)
    print(f'train = {acc}')
    tuning_report['metric'] = {'train': acc}
    
    y_dev_pred = model_now.predict(x_dev)
    dev_acc = acc_metric(y_dev_pred, y_dev)
    print(f'dev = {dev_acc}')
    tuning_report['metric'].update({'dev': dev_acc})
 

    return tuning_report

modelHyperOptTuning('TQQQ', data_feature, data_raw, './catboost_dev.csv', save_model = './catboost_model.json', tune_obj_reward = {'n_stake':1, 'cash_capital':1000})

  0%|                                     | 0/3 [00:00<?, ?trial/s, best loss=?]

ERROR:hyperopt.fmin:job exception: catboost/libs/metrics/metric.cpp:6252: Eval metric should have a single value. Metric F1 provides a value for each class, thus it cannot be used as a single value to select best iteration or to detect overfitting. If you just want to look on the values of this metric use custom_metric parameter.



  0%|                                     | 0/3 [00:00<?, ?trial/s, best loss=?]


CatBoostError: catboost/libs/metrics/metric.cpp:6252: Eval metric should have a single value. Metric F1 provides a value for each class, thus it cannot be used as a single value to select best iteration or to detect overfitting. If you just want to look on the values of this metric use custom_metric parameter.

In [77]:
# print(feat_data2['price_mid'].head(3))
# print(feat_data2['price_mid'].shift(-1).head(3))
# a=feat_data2['price_mid'].shift(-1)-feat_data2['price_mid']
print(data_feat_hist['dev'].index)

DatetimeIndex(['2018-12-31', '2019-01-02', '2019-01-03', '2019-01-04',
               '2019-01-07', '2019-01-08', '2019-01-09', '2019-01-10',
               '2019-01-11', '2019-01-14',
               ...
               '2020-03-19', '2020-03-20', '2020-03-23', '2020-03-24',
               '2020-03-25', '2020-03-26', '2020-03-27', '2020-03-30',
               '2020-03-31', '2020-04-01'],
              dtype='datetime64[ns]', name='Date', length=316, freq=None)


In [97]:
import pickle
import time
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

def objective(x):
    return {
        'loss': x ** 2,
        'status': STATUS_OK,
        # -- store other results like this
        'eval_time': time.time(),
        'other_stuff': {'type': None, 'value': [0, 1, 2]},
        # -- attachments are handled differently
        'attachments':
            {'time_module': pickle.dumps(time.time)}
        }
trials = Trials()
best = fmin(objective,
    space=hp.uniform('x', -10, 10),
    algo=tpe.suggest,
    max_evals=100,
    trials=trials)
print(best)
print('****')
# print(trials.results)
d = sorted(trials.results, key = lambda v: v['loss'])
print(d[0])
# print(trials.trials[0])


100%|███| 100/100 [00:00<00:00, 501.22trial/s, best loss: 0.0026426603445583396]
{'x': 0.051406812238830174}
****
{'loss': 0.0026426603445583396, 'status': 'ok', 'eval_time': 1665208408.1580944, 'other_stuff': {'type': None, 'value': [0, 1, 2]}}


In [58]:
def error_metric(y1, y2):
    '''
    Calculate accuracy
    '''
    res = list(zip(y1, y2))
    correct = 0
    total = 0
    for v in res:
        correct += (v[0]-v[1]) * (v[0]-v[1])

    return math.sqrt(correct/float(len(res)))


def modelHyperOptTuningRegression(target_symbol, train_dev_eval_set, data_raw, o_prediction_file, save_model = './xgb_model_debug.json'):
    '''
    Desc: tune ML model

    train_dev_eval_set: input train/dev/test data set

    '''
    tuning_report = {}

    #step-1: model training
    #prepare training data
    # train_dev_eval_set['train'] = balanceLabel(train_dev_eval_set['train'])
    # train_dev_eval_set['dev'] = balanceLabel(train_dev_eval_set['dev'])

    feat_cols = list(filter(lambda x: x != 'label', train_dev_eval_set['train'].columns))
    label_cols = 'label'

    x_train = train_dev_eval_set['train'][feat_cols].to_numpy()
    y_train = train_dev_eval_set['train'][label_cols].to_numpy()#.astype(int)

    x_dev = train_dev_eval_set['dev'][feat_cols].to_numpy()
    y_dev = train_dev_eval_set['dev'][label_cols].to_numpy()#.astype(int)

    print('****DEV')
    # print(x_dev[:4,:])
    print(y_dev[:4])

    base_xgb_cfg = {
        'n_estimators': 0,
        'objective' : 'reg:squarederror',
        'booster' : 'gbtree', #'gbtree'
        'eta': 0.02,
        'reg_lambda' : 0,
    #     reg_alpha = 10,
        'max_depth': 0,
        'verbosity': 0,
        'eval_metric' : 'rmse',# 'rmse', #'mlogloss',
        'nthread': 3,
        'rate_drop':0.,
        'subsample': 1.0,

    }

    def model_cost(model_cfg):
        xgb_cfg = copy.deepcopy(base_xgb_cfg)
        xgb_cfg.update(
            {
            'n_estimators': model_cfg['n_estimators'],
            'eta': model_cfg['eta'],
            'reg_lambda' : model_cfg['reg_lambda'],
            'max_depth': model_cfg['max_depth'],
        }
        )
        # print(xgb_cfg)
        model_now = xgb.XGBRFRegressor(**xgb_cfg)
        model_now.fit(x_train, y_train, eval_set = [(x_dev, y_dev)])#, early_stopping_rounds=50, verbose=True)
        dev_result = model_now.evals_result()
        # y_dev_pred = model_now.predict(x_dev)
        # dev_acc = acc_metric(y_dev_pred, y_dev)
        v = dev_result['validation_0']['rmse'][0]
        return v

    search_space = {
        'n_estimators': hp.randint('n_estimators', 2, 100),
        'eta': hp.uniform('eta', 0.01, 1.0),
        'max_depth': hp.randint('max_depth', 2, 6),
        'reg_lambda': hp.randint('reg_lambda', 1, 20),
    }
    best = fmin(
        fn = model_cost,
        space = search_space,
        algo = tpe.suggest,
        max_evals = 20
    )
    print(best)

    xgb_cfg = copy.deepcopy(base_xgb_cfg)
    xgb_cfg.update(
        {
        'n_estimators': best['n_estimators'],
        'eta': best['eta'],
        'reg_lambda' : best['reg_lambda'],
        'max_depth': best['max_depth'],
    }
    )
    print(xgb_cfg)

    model_now = xgb.XGBRFRegressor(**xgb_cfg)
    model_now.fit(x_train, y_train, eval_set = [(x_dev, y_dev)])#, early_stopping_rounds=50, verbose=True)
    dev_result=model_now.evals_result()
    a = dev_result['validation_0']['rmse'][0]
    # print(a)
    model_now.save_model(save_model)
    
    # #report
    tuning_report['model'] = 'xgb'
    tuning_report['param'] = xgb_cfg

    #evaluate model
    model_now.load_model(save_model)
    y_train_pred = model_now.predict(x_train)
    acc = error_metric(y_train_pred, y_train)
    print(f'train = {acc}')
    tuning_report['metric'] = {'train': acc}
    print('*** TRAIN ***')
    print(list(zip(y_train_pred, y_train))[:10])
    print('*** TRAIN ***')
    
    y_dev_pred = model_now.predict(x_dev)
    dev_acc = error_metric(y_dev_pred, y_dev)
    print(f'dev = {dev_acc}')
    tuning_report['metric'].update({'dev': dev_acc})
    
    dev1 = list(zip(y_dev_pred, y_dev))
    print(dev1[:5])

    x_eval = train_dev_eval_set['eval'][feat_cols].to_numpy()
    y_eval = train_dev_eval_set['eval'][label_cols].to_numpy()#.astype(int)
    
    y_eval_pred = model_now.predict(x_eval)
    dev_acc = error_metric(y_eval_pred, y_eval)
    print(f'dev = {dev_acc}')
    tuning_report['metric'].update({'eval': dev_acc})


    return tuning_report,dev1,list(zip(y_train_pred, y_train)),list(zip(y_eval_pred, y_eval))     

In [59]:
from catboost import CatBoostClassifier, Pool, cv
import datetime
import pandas as pd
from hyperopt import tpe, hp, fmin
import copy
# data_feat_hist['dev'].head(10)
result = modelHyperOptTuningRegression('TQQQ', data_feat_hist, data, o_prediction_file=None, save_model = './xgb_model_debug.json')

****DEV
[9.20965037 9.11346969 8.74873104 9.23463199]
[0]	validation_0-rmse:4.89332                                                   
[0]	validation_0-rmse:5.92280                                                   
[0]	validation_0-rmse:5.40551                                                   
[0]	validation_0-rmse:4.94343                                                   
[0]	validation_0-rmse:5.62281                                                   
[0]	validation_0-rmse:5.83282                                                   
[0]	validation_0-rmse:5.68359                                                   
[0]	validation_0-rmse:5.79120                                                   
[0]	validation_0-rmse:5.04199                                                   
[0]	validation_0-rmse:5.05273                                                   
[0]	validation_0-rmse:5.50638                                                   
[0]	validation_0-rmse:6.17268                          



In [133]:
import logging

import pandas as pd

a=pd.DataFrame({'x':[1,2,3,2,4]})
# print(dir(a.index))
a.index.name='a'
b=a.shift(-1)
b.dropna(inplace=True)
# print(a)
print(b)
print(a.loc[b.index])
c=b.gt(a.loc[b.index])
set(b['x'])

     x
a     
0  2.0
1  3.0
2  2.0
3  4.0
   x
a   
0  1
1  2
2  3
3  2


{2.0, 3.0, 4.0}