In [None]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every bare expression in a cell, not only the last one.
# The cells below rely on this to show several shapes / previews at once.
InteractiveShell.ast_node_interactivity = "all"

In [None]:
import sklearn
# Show the installed scikit-learn version for reproducibility.
sklearn.__version__

In [None]:
import pandas as pd
import pickle as pkl

In [None]:
# Execute the project's helper notebooks so the names they define are usable here.
# NOTE(review): TR_CV_TT_split, get_multiple_ts_dataset, eval_class_metrics and
# get_target_1 (used further down) are presumably defined by these two notebooks — confirm.
%run utils.ipynb
%run targets.ipynb

In [None]:
# Name of the input CSV; later cells resolve it against ../data/.
# NOTE(review): the name looks like <pair>_<interval>_<start ms>_<end ms> — confirm.
# NOTE(review): `file` shadows the Python 2 builtin; kept because later cells reference it.
file = 'ETHBTC_1m_1519496760000_1549568220000.csv'

In [None]:
# Load the raw CSV from ../data/ and preview its size and first ten rows.
df = pd.read_csv('../data/{}'.format(file))
df.shape
df.head(n=10)

In [None]:
# Narrow the raw frame to the two timestamp columns plus the four price columns.
data = df.loc[
    :,
    ['OPEN_TIME', 'CLOSE_TIME', 'OPEN', 'HIGH', 'LOW', 'CLOSE']
]
data.shape
data.iloc[:5]

In [None]:
# Get Target
# targets.ipynb is executed again here even though it was already run near the top
# of the notebook.
%run targets.ipynb
# get_target_1 receives the same CSV path that was loaded into `df`.
# NOTE(review): presumably returns a frame holding CLOSE_TIME plus the target
# column, since the next cell merges on CLOSE_TIME — confirm.
target = get_target_1('../data/'+file)
target.shape
target.head()

In [None]:
# Attach the target to each price row by joining on CLOSE_TIME. The merge uses
# pandas' default inner join, so rows without a match on both sides are dropped.
# `data` is rebound in place: re-running this cell merges the target a second time.
data = data.merge(target, on='CLOSE_TIME')
data.shape
data.head(5)

In [None]:
# Partition the merged frame into TR, CV and TT parts.
# NOTE(review): 0.8 / 0 / 0.2 are presumably the train / cross-validation / test
# fractions, which would leave CV_split empty — confirm against TR_CV_TT_split.
TR_split, CV_split, TT_split = TR_CV_TT_split(data, 0.8, 0, 0.2)

TR_split.shape
CV_split.shape
TT_split.shape

TR_split.head()

In [None]:
from sklearn.preprocessing import StandardScaler

# Passed to get_multiple_ts_dataset as tr_win by data_transformer, and written
# into the run's file-name tag (window length, presumably in rows — confirm).
window = 60
# Module-level scaler shared by every data_transformer call; centring, scaling
# and copying are all requested explicitly.
scaler = StandardScaler(with_mean=True, with_std=True, copy=True)
def data_transformer(id, data, target, retrain=False):
    """Scale `data` with the shared scaler, then build the windowed dataset.

    Parameters
    ----------
    id, target
        Forwarded unchanged to get_multiple_ts_dataset.
    data
        Feature matrix; it is passed through the module-level `scaler`
        before being forwarded.
    retrain
        When equal to True, the module-level `scaler` is refitted on `data`
        before transforming. Otherwise the existing fit is reused.

    Returns
    -------
    Whatever get_multiple_ts_dataset returns for the scaled data, using the
    module-level `window` as tr_win (callers in this notebook unpack it as
    three values: ids, features, targets).
    """
    # Only `scaler` is rebound here; `window` is merely read, so it needs no
    # global declaration.
    global scaler

    if retrain == True:
        scaler = scaler.fit(data)

    scaled = scaler.transform(data)
    return get_multiple_ts_dataset(
        id,
        scaled,
        target,
        tr_win=window,
        tt_win=0,
        point_target=True,
    )

In [None]:
def _split_columns(frame):
    """Cut `frame` column-wise into (id, features, target) NumPy arrays.

    The first two columns form the id block, the last column is the target
    (kept two-dimensional by the `-1:` slice) and everything in between is
    the feature block.
    """
    return (
        frame.iloc[:, :2].values,
        frame.iloc[:, 2:-1].values,
        frame.iloc[:, -1:].values,
    )


# NOTE(review): given the column order built earlier, the id block is presumably
# OPEN_TIME / CLOSE_TIME and the features the four price columns — confirm that
# TR_CV_TT_split preserves column order.
TR_id, TR_data, TR_target = _split_columns(TR_split)
TT_id, TT_data, TT_target = _split_columns(TT_split)

TR_id.shape
TR_data.shape
TR_target.shape

TT_id.shape
TT_data.shape
TT_target.shape

In [None]:
# Execute the hyper-parameter search and model helper notebooks.
# NOTE(review): Hyperopt and RFC, used by the next cell, are presumably defined
# by these two notebooks — confirm.
%run hyperopt.ipynb
%run models.ipynb

In [None]:
# Search hyper-parameters for the RFC model on the training arrays. The raw
# (unscaled, unwindowed) arrays are passed together with data_transformer, so the
# search applies the scaling / windowing itself.
# NOTE(review): RFC is presumably a random-forest wrapper, and best_loss the
# cross-validated score of the best parameter set — confirm.
hyperopt = Hyperopt(RFC())
best_loss, best_params = hyperopt.run(
    TR_id,
    TR_data,
    TR_target,
    data_transformer=data_transformer,
    eval_set=dict(TT=(TT_id, TT_data, TT_target)),
    algo='grid',
    n_cv=3,
    max_evals=1,
)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit the scaler on the training rows only (retrain=True), then reuse that fit
# for the test rows so no test statistics leak into the scaling.
# NOTE: TR_id / TT_id are rebound to the ids returned by data_transformer.
# Re-running this cell would feed the already-transformed ids back in, so
# re-run the slicing cell first.
TR_id, TR_X, TR_y = data_transformer(TR_id, TR_data, TR_target, retrain=True)
TT_id, TT_X, TT_y = data_transformer(TT_id, TT_data, TT_target, retrain=False)

# Refit a forest on the full training set with the parameters found by the search.
model = RandomForestClassifier(**best_params)
model = model.fit(TR_X, TR_y)

# Column 1 of predict_proba is the probability of the second class in model.classes_.
TR_y_pred = model.predict_proba(TR_X)[:, 1]
TT_y_pred = model.predict_proba(TT_X)[:, 1]

# One list of metric names drives both the evaluation and the table columns.
# Each call receives its own copy in case the helper mutates its argument.
metric_names = ['auc', 'f1', 'acc', 'log_loss']
TR_metrics = eval_class_metrics(TR_y, TR_y_pred, metrics=list(metric_names))
TT_metrics = eval_class_metrics(TT_y, TT_y_pred, metrics=list(metric_names))

# Build the summary table in a single constructor call. DataFrame.append was
# deprecated in pandas 1.4 and removed in 2.0. The Series names become the row
# labels; `columns` fixes the column order.
res = pd.DataFrame(
    [pd.Series(TR_metrics, name='TR'), pd.Series(TT_metrics, name='TT')],
    columns=metric_names,
)

print(res)

In [None]:
%%javascript
// Ask the kernel to execute `nb_name = "<this notebook's file name>"`, so Python
// code in later cells can read the notebook name from `nb_name`.
// NOTE(review): relies on the front-end exposing a global `IPython.notebook`
// object and runs from the browser — confirm it has completed before the next
// cell reads `nb_name`.
IPython.notebook.kernel.execute('nb_name = "' + IPython.notebook.notebook_name + '"')

In [None]:
import time

# Each tag is a set of "key_value" pairs joined by "__".
# dict.items() replaces the Python-2-only dict.iteritems(), so this cell runs on
# both Python 2 and Python 3.
data_tag = '__'.join([str(x)+'_'+str(y) for x, y in {
    'intv':'1m', 
    'cols':'OHCL',
    'win':window
}.items()])

# NOTE(review): `nb_name` is injected by the %%javascript cell above — confirm it
# has run before this cell executes.
# NOTE(review): 'CV_accuracy' is filled from best_loss — confirm the search's
# loss really is an accuracy.
proc_tag = '__'.join([str(x)+'_'+str(y) for x, y in {
    'model': nb_name.replace('.ipynb', ''),
    'TT_accuracy':'{:.4f}'.format(TT_metrics['acc']),
    'CV_accuracy':'{:.4f}'.format(best_loss)
}.items()])

# Timestamp prefix (yymmdd_HHMMSS) that identifies this run's artefacts.
time_tag = time.strftime('%y%m%d_%H%M%S')

big_tag = time_tag+'_|_'+data_tag+'_|_'+proc_tag

data_dump = {
    'best_loss': best_loss,
    'best_params': best_params,
    'model': model,
    'TR_id': TR_id,
    'TR_y_pred': TR_y_pred,
    'TT_id': TT_id,
    'TT_y_pred': TT_y_pred,
    'results': res
}

f = open('../models/'+time_tag+'.pkl', 'wb')
pkl.dump(data_dump, f, -1)
f.close()

%notebook -e ../models/{big_tag}.ipynb