In [5]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.tseries.offsets import BDay
from hsmm_core.hmm import hmm_engine
from hsmm_core.observation_models import ExpIndMixDiracGauss
from hsmm_core.data_utils import load_data, TradingHours
from hsmm_core.data_utils import load_data, TradingHours
from hsmm_core.feature_spaces import hmm_features
from hsmm_core.hmm import hmm_calibration
from hsmm_core.data_utils import load_data, TradingHours
from hsmm_core.labelling import DataLabellingSimple
from hsmm_core.consts import ThresholdMethod, LabellingChoice
import pickle
from hsmm_core.consts import InitialisationMethod
import datetime as dt
plt.style.use('ggplot')
%matplotlib inline

In [37]:
def remove_nans(features_tuple, labels, idx=1):
    """Join features and labels column-wise, drop incomplete rows, and split them again.

    The four feature frames and the selected label column(s) are concatenated
    side by side so that dropping rows containing NaNs removes the same rows
    from both, keeping features and labels aligned.

    Parameters
    ----------
    features_tuple : sequence of DataFrames
        Only the first four elements are used.
    labels : DataFrame
        Must contain the columns 'ReturnTradedPrice', 'Duration', 'states',
        'TradedTime', 'TradedPrice' and 'ticker' (they are dropped); whatever
        columns remain are treated as label columns.
    idx : int, default 1
        Number of leading label columns joined to the features before the NaN
        rows are dropped.
        NOTE(review): for idx > 1 only the last joined label column is returned
        as labels and the other joined label columns stay inside the returned
        features frame -- confirm whether that is intended before using idx > 1.

    Returns
    -------
    (features_, labels_) : tuple of DataFrames
        NaN-free features and a single-column labels frame, in the (X, y)
        order that scikit-learn expects.
    """
    features_df = pd.concat([features_tuple[0], features_tuple[1], features_tuple[2],
                             features_tuple[3]], axis=1, sort=False)
    labels_only = labels.drop(columns=['ReturnTradedPrice', 'Duration', 'states', 'TradedTime',
                                       'TradedPrice', 'ticker'])
    # sort must be the boolean False: the string 'False' is truthy (older pandas would sort the
    # index) and is rejected outright by recent pandas versions.
    df_concat = pd.concat([features_df, labels_only.iloc[:, 0:idx]], axis=1, sort=False)
    df_x_nan = df_concat.dropna()  # drop every row that has a NaN in any feature or label column
    label_column_loc_ = df_x_nan.shape[1] - 1  # position of the (last) label column in the clean frame
    labels_ = df_x_nan.iloc[:, label_column_loc_:label_column_loc_ + 1]  # keep it as a one-column frame
    features_ = df_x_nan.drop(df_x_nan.columns[label_column_loc_], axis=1)  # everything except that column

    return features_, labels_  # X, y order


In [7]:

ticker = 'test_SYNT_2states'

# Root folder of the raw data, taken from the environment. Fail early with a clear message
# instead of letting os.path.join choke on None further down.
data_dir = os.getenv('FINANCE_DATA')
if data_dir is None:
    raise RuntimeError("The FINANCE_DATA environment variable must be set to the raw data folder")

# NOTE(review): machine-specific absolute paths -- consider deriving them from a configurable root.
features_path='/home/ak/Data/features_models/features/'
labels_path= '/home/ak/Data/features_models/labels'

# Labels for this ticker are written under a NON_DIRECTIONAL sub-folder.
ticker_labels_path = os.path.join(labels_path,ticker+'/NON_DIRECTIONAL')

# Make sure the output folders exist before anything is written to them.
ticker_data_path = os.path.join(data_dir, ticker)
if not os.path.exists(ticker_data_path):
    os.makedirs(ticker_data_path)

if not os.path.exists(ticker_labels_path):
    os.makedirs(ticker_labels_path)

#### paths ####
main_path = '/home/ak/Data/features_models/'

# models_path=os.path.join(main_path,'models')
# hmm_models_path = os.path.join(models_path,'hmm_models')
# features_ticker_path = os.path.join(features_path, ticker)
# predictions_path = os.path.join(main_path, 'predictions')



In [8]:

no_states = 2
sigmas = [0.05, 0.002] # fast and slow
# Duration is measured in seconds for now (to be revised), so lambda units are seconds^{-1}.
# NOTE(review): presumably these rates mean average durations of 35 s and 10 s -- confirm
# against the observation model.
lambdas = [1./35., 1./10.]
weights = [0.1, 0.6]

# Observation model with hand-picked parameters, used to generate the synthetic data.
obs_model = ExpIndMixDiracGauss(no_states)
obs_model.set_up_initials(priors={'sigmas': sigmas, 'lambdas': lambdas, 'weights': weights})

hmm_ = hmm_engine(obs_model, no_states)

# set up some priors: transition probability matrix and initial state distribution
tpm = np.array([[0.4, 0.6], [0.7, 0.3]])
pi = np.array([0.4, 0.6])
hmm_.set_up_initials(priors={'tpm': tpm, 'pi': pi})

# Consecutive business days for which synthetic data is generated.
# dt.datetime is used because the pd.datetime alias is deprecated/removed in newer pandas.
no_dates = 3
start_date = dt.datetime(2017, 6, 1)
dummy_dates = [start_date + BDay(i) for i in range(no_dates)]

no_points = 5000

# Fixed seed so that the sampled states and observations are reproducible.
rng = np.random.RandomState(1234)



In [9]:

# Silly hack: start 1 microsecond after 08:00:00 (the fourth argument of dt.time is microseconds)
# so that the initial timestamp is printed with a fractional part and does not break the parsing
# of Timestamps when loading.
morning_start = dt.time(8, 0, 0, 1)

initial_price = 100

# Every synthetic day is written into the same per-ticker folder.
file_path = os.path.join(data_dir, ticker)

for dd in dummy_dates:
    random_states = hmm_.sample_states(rng=rng, length=no_points)
    observation_points = obs_model.sample_data(no_points, rng=rng, state=random_states)
    # The first duration is always zero
    observation_points[0, 0] = 0.

    file_name = '.'.join([dd.strftime('%Y%m%d'), 'csv'])

    data_to_save = pd.DataFrame({'states': random_states,
                                 'Duration': observation_points[:, 0],
                                 'ReturnTradedPrice': observation_points[:, 1],
                                 })

    # Now calculate the traded prices and traded times in reverse order as to what would happen
    # with real data: times are the day's start plus the cumulative durations (in seconds).
    day_start = dt.datetime.combine(dd.date(), morning_start)
    data_to_save['TradedTime'] = data_to_save['Duration'].cumsum().apply(
        lambda dur: (day_start + dt.timedelta(seconds=dur)).time())

    # Prices are the initial price compounded by the sampled returns.
    data_to_save['TradedPrice'] = initial_price * (1. + data_to_save['ReturnTradedPrice']).cumprod()
    data_to_save.to_csv(os.path.join(file_path, file_name), index=False)

print("ok")


ok


### Feature Creation ###

In [10]:

n_hidden_states = no_states

# Calibration settings: one section for the observation model, one for the hidden chain
# (which reuses the pi / tpm priors defined for the synthetic model), plus the update tag.
init_params = dict(
    obs_model_params=dict(
        obs_model_name='ExpIndMixDiracGauss',
        em_init_method=InitialisationMethod.cluster,
    ),
    hidden_model_params=dict(
        no_hidden_states=n_hidden_states,
        pi=pi,
        tpm=tpm,
        em_init_method=InitialisationMethod.uniform,
    ),
    update_tag='tpsml',
)




In [11]:
trd_hours_filter = TradingHours.all_trading_day

# Load the ticker's data explicitly: `data` was previously only available as leftover kernel
# state, so this cell failed with a NameError after Restart & Run All.
data = load_data(ticker, which_trading_hours=trd_hours_filter)

hmm_calibration_engine = hmm_calibration(no_parallel_procs=None,
                                         init_params=init_params)

# One fitted HMM per date key.
trained_hmms = hmm_calibration_engine.hmm_fit_func(ticker, data, trd_hours_filter,
                                                   force_recalc=False)

# NOTE(review): `features` is overwritten on every iteration, so only the features of the
# last date survive the loop -- confirm whether they should be collected per date.
for date, date_hmm in trained_hmms.items():
    feature_engine = hmm_features(date_hmm)
    features = feature_engine.generate_features(data[date])

## Saving HMM models ##

In [13]:
# ###saving trained model hmms###
# seq_model = "_".join((str(ticker),str(n_hidden_states),'state',"trained","hmm","models", ".pickle"))
# print("saving the model:", seq_model)
# pickle.dump(init_params, open(os.path.join(models_path,seq_model), 'wb'))

In [14]:
# Dates of the trained HMM models, materialised as a list so the value is a list on
# Python 3 as well (where .keys() returns a view).
models_dates = list(trained_hmms.keys())
models_dates

['20170601', '20170602', '20170605']

In [15]:
from itertools import permutations, product
# Candidate rolling-window sizes and thresholds.
# NOTE(review): neither list is referenced by the other visible cells; presumably meant for a
# grid over the labelling parameters -- confirm.
windows =[2, 5, 10, 25, 100]
thresholds = [0.001, 0.005, 0.01, 0.05, 0.1]

## create labels ##

In [16]:
window = 10
threshold = 0.05


labelling_method_params = [{

    'labelling_method': LabellingChoice.price_move_in_window,
    'rolling_window': window,
    # Remove the line below if you do not want to restrict to price moves above a certain level.
    # The threshold shows up multiplied by 100 in the label column name (0.05 -> 5.0).
    'updown_threshold': threshold,
    'threshold_method': ThresholdMethod.arbitrary,
}]

# Apply every labelling configuration to the loaded data.
for label_init in labelling_method_params:
    print(label_init)
    labeller = DataLabellingSimple(label_init)
    labeller.label_training_data(data)

# Write one CSV per key of `data` into the ticker's labels folder.
for date_key in list(data.keys()):
    data[date_key].to_csv(os.path.join(ticker_labels_path, str(date_key) + '.csv'), index=False)


{'rolling_window': 10, 'labelling_method': <LabellingChoice.price_move_in_window: 'PrMov'>, 'updown_threshold': 0.05, 'threshold_method': <ThresholdMethod.arbitrary: 'arbitrary'>}


## Label file locations ##

In [18]:
# ticker = 'SYNT_2states'
# ticker = 'SYNT_4states'

features_path = os.path.join(main_path, 'features')
# NOTE(review): this rebinds ticker_labels_path from the .../NON_DIRECTIONAL folder to its
# parent, so re-running the label-writing cell afterwards would write into a different folder.
ticker_labels_path = os.path.join(labels_path, ticker)
# ticker_models_path = os.path.join(models_path, ticker)
# ticker_predictions_path = os.path.join(predictions_path, ticker)

ticker_features_path = os.path.join(features_path, ticker)

###

# List of entries in the ticker's labels folder. os.listdir returns them in arbitrary order,
# so sort to make positional access such as labels_list[0] deterministic.
labels_list = sorted(os.listdir(ticker_labels_path))

# features_list = os.listdir(ticker_features_path)

### Loading Labels###

In [34]:
# Folder made from the first entry of labels_list, joined onto the ticker's labels folder.
non_directional =os.path.join(ticker_labels_path, labels_list[0])
# no=1 # which file to load - this needs to correspond to a date
def simple_labels(dates_folder, no=1):
    """Load the first label column from one of the label CSV files in `dates_folder`.

    Parameters
    ----------
    dates_folder : str
        Folder holding the label CSV files.
    no : int, default 1
        Position of the file to load within the alphabetically sorted folder listing.

    Returns
    -------
    pandas.Series
        The first column left after dropping 'ReturnTradedPrice', 'Duration', 'states',
        'TradedTime', 'TradedPrice' and 'ticker' from the file.
    """
    # Sort the listing: os.listdir gives no ordering guarantee, so `no` would otherwise pick
    # an arbitrary file.
    label_dates = sorted(os.listdir(dates_folder))
    # Build the path from the folder that was actually listed, not from a notebook-level global.
    file_name = os.path.join(dates_folder, label_dates[no])
    raw_columns = ['ReturnTradedPrice', 'Duration', 'states', 'TradedTime', 'TradedPrice', 'ticker']
    labels = pd.read_csv(file_name).drop(columns=raw_columns).iloc[:, 0]
    return labels

In [36]:
#extract the dates only


0       1.0
1       1.0
2       1.0
3       1.0
4       1.0
5       1.0
6       1.0
7       1.0
8       1.0
9       1.0
10      1.0
11      0.0
12      0.0
13      0.0
14      0.0
15      0.0
16      0.0
17      0.0
18      0.0
19      0.0
20      0.0
21      0.0
22      0.0
23      0.0
24      0.0
25      0.0
26      1.0
27      1.0
28      1.0
29      1.0
       ... 
4961    1.0
4962    1.0
4963    0.0
4964    1.0
4965    1.0
4966    1.0
4967    1.0
4968    1.0
4969    1.0
4970    0.0
4971    0.0
4972    0.0
4973    1.0
4974    1.0
4975    0.0
4976    0.0
4977    1.0
4978    1.0
4979    0.0
4980    0.0
4981    0.0
4982    0.0
4983    0.0
4984    1.0
4985    1.0
4986    1.0
4987    1.0
4988    1.0
4989    1.0
4990    1.0
Name: label_PrMov__window_10__thres_arbitrary__5.0, Length: 4991, dtype: float64

In [22]:
data_dic = load_data(ticker, which_trading_hours=TradingHours.all_trading_day)
## clf fitting ##
for date, date_hmm in trained_hmms.items():
    feature_engine = hmm_features(date_hmm)
    features_load = feature_engine.generate_features(data_dic[date])
    # Keep the loaded labels instead of discarding the return value.
    # NOTE(review): the same file (no=1) is loaded for every date -- confirm whether the file
    # should be chosen to match `date`.
    labels_load = simple_labels(non_directional, no=1)
#     labels_load = data_cls.ticker_labels_csv(date=date)
#     features, labels_clean = remove_nans(features_load, labels_load)
#     x_std = sc.fit_transform(features.values.astype(np.float)) #fit & transform the features
#     X_train, X_test, y_train, y_test = train_test_split( \
#         x_std, labels_clean, test_size=0.05, random_state=1, stratify=labels_clean) #probably can get rid of this
#     models_cls = FitModels(X_train, y_train)
#     best_clfs = {'SVC': models_cls.svm_clf(kernel_choice="rbf"),
#                  'RIDGE_clf': models_cls.ridge_clf(),
#                  'GBOOST': models_cls.gradient_boost_clf(),
#                  'GP_clf': models_cls.gp_clf(),
#                  'RF_clf': models_cls.random_forest_clf(),
#                  }
#     # This is sequence for the name of the best classifiers.
#     seq_clf = "_".join((str(date),labels_clean.columns.values[0],"clfs", ".pickle"))
#     print("saving the classifiers:",seq_clf)
#     pickle.dump(best_clfs, open(os.path.join(ticker_models_path,seq_clf), 'wb'))
