In [4]:
import pandas as pd
import numpy as np
import os
from pandas.tseries.offsets import BDay
from hsmm_core.utils import mc_limiting_distribution, states_from_limit_dist
from hsmm_core.observation_models import ExpIndMixDiracGauss
from hsmm_core.feature_spaces import hmm_features
from hsmm_core.hsmm_runner import HmmCalibration

from hsmm_core.hmm import hmm_impl

from hsmm_core.data_utils import DataLoader, TradingHours
from hsmm_core.labelling import DataLabellingSimple
from hsmm_core.consts import ThresholdMethod, LabellingChoice
import pickle
from hsmm_core.consts import InitialisationMethod
import datetime as dt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier,  GradientBoostingClassifier
from sklearn.linear_model import RidgeClassifierCV
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
# Shared scikit-learn StandardScaler instance; it is not referenced again in the cells visible here.
sc = StandardScaler()


In [2]:
# --- Ticker and directory configuration ---
# `ticker` was only assigned by a commented-out line, so this cell raised a
# NameError on a fresh kernel. A value already defined upstream is kept as is;
# otherwise fall back to the synthetic ticker named in that commented-out line.
# NOTE(review): confirm 'SYNT_2states' is the intended default.
try:
    ticker
except NameError:
    ticker = 'SYNT_2states'  # testing a new synthetic ticker

# Main data directory, read from the FINANCE_DATA environment variable.
data_dir = os.getenv('FINANCE_DATA')
if not data_dir:
    # os.getenv returns None when the variable is unset; fail here with a clear
    # message rather than with an obscure error inside os.path.join below.
    raise RuntimeError("The FINANCE_DATA environment variable must point to the main data directory")

# #### paths ####
# Everything below lives under ~/Data/features_models/.
main_path = os.path.join(os.path.expanduser("~"), 'Data/features_models/')
features_path = os.path.join(main_path, 'features/')  # where features are saved
labels_path = os.path.join(main_path, 'labels')  # where labels are saved
ticker_labels_path = os.path.join(labels_path, ticker, 'NON_DIRECTIONAL')

models_path = os.path.join(main_path, 'models')
ticker_models_path = os.path.join(models_path, ticker)
hmm_models_path = os.path.join(models_path, 'hmm_models')  # only if we store the hmm models

# Create the per-ticker directories that are missing (data, labels, models).
for _required_dir in (os.path.join(data_dir, ticker), ticker_labels_path, ticker_models_path):
    if not os.path.exists(_required_dir):
        os.makedirs(_required_dir)

# File names of the labels already stored for this ticker.
labels_list = os.listdir(ticker_labels_path)


In [3]:
# Peek at one stored model file for this ticker. os.listdir returns entries in
# arbitrary order, so index 1 is not a stable pick, and it raises IndexError
# when the directory holds fewer than two entries.
os.listdir(ticker_models_path)[1]

'synt_model_20160623_label_PrMov__window_25__thres_arbitrary__10.0_clfs_.pickle'

In [13]:
# DataLoader built from the data directory and ticker. `data_cls` is not referenced
# again in the cells visible here; the trades data is loaded further down through a
# separate `data_loader` instance constructed with different keyword arguments.
data_cls = DataLoader(path_=data_dir, ticker=ticker)

In [14]:
## Get the available dates as a list: the file names found in the ticker's data
## directory, with their extension stripped.
data_files = os.listdir(os.path.join(data_dir, ticker))

# Skip hidden entries (e.g. '.DS_Store', '.ipynb_checkpoints'): splitext leaves such
# names untouched, and because '.' sorts before any digit they would otherwise end
# up as the first "date" once the list is sorted.
synt_dates = [os.path.splitext(x)[0] for x in data_files if not x.startswith('.')]

In [15]:
# Order the date strings in place, then take both ends of the list as the sample
# range. Lexicographic order matches chronological order for strings shaped like
# '20160601'.
synt_dates.sort()
start_date, end_date = synt_dates[0], synt_dates[-1]
(start_date, end_date)

('20160601', '20160719')

In [12]:
no_states = 2

# Keyword arguments for the DataLoader: trading-hours filter set to `only_mkt_hours`.
data_loader_init = dict(trading_hours_filter=TradingHours.only_mkt_hours)

# Settings handed to HmmCalibration as `init_params`.
hmm_init = dict(
    obs_model_name='CensoredExpIndMixDiracGauss',
    em_obs_init_method=InitialisationMethod.cluster,
    em_hidden_init_method=InitialisationMethod.uniform,
    no_hidden_states=no_states,
    update_tag='tpsml',
)

data_loader = DataLoader(**data_loader_init)
# The loader hash uniquely identifies how the data was loaded (a dollar clock may
# have been used, for instance); it is kept because the way the data is loaded
# affects the calibration of the hmm.
data_loader_hash = data_loader.data_loader_hash()

# Trades data for the ticker over the full range of available dates.
data = data_loader.load_trades_data(ticker, start_date=start_date, end_date=end_date)

# Calibrate the hmm models on all the loaded data (no forced recalculation,
# multiprocessing switched off).
hmm_calibration_engine = HmmCalibration(init_params=hmm_init)
hmm_calibration_engine.run_calibration_all_data(
    ticker,
    data,
    data_loader_hash,
    force_recalc=False,
    use_multiprocessing=False,
    n_processes=2,
)

# Create the hmm feature engine. Per the original note, the hmm model held by the
# engine is meant to be replaced whenever the model changes; that is not done in
# this cell.
features_engine = hmm_features()

## Create labels and store them with the data

In [13]:
window = 25
threshold = 0.1

# One settings dict per labelling configuration; each is passed to
# DataLabellingSimple in the loop that follows.
labelling_method_params = [
    dict(
        labelling_method=LabellingChoice.price_move_in_window,
        rolling_window=window,
        # Per the original note: checks a price move only above a certain level.
        # Remove this entry to label without a threshold (TODO confirm the labeller allows it).
        updown_threshold=threshold,  # this is multiplied by 100
        threshold_method=ThresholdMethod.arbitrary,
    ),
]

# Label the loaded data once per labelling configuration.
for label_init in labelling_method_params:
    # print(...) with a single argument gives the same output on Python 2 and 3,
    # whereas the bare `print x` statement is a SyntaxError on Python 3.
    print(label_init)
    labeller = DataLabellingSimple(label_init)
    # The return value is not captured, so the labels are presumably written into
    # `data` in place - TODO confirm against DataLabellingSimple.
    labeller.label_training_data(data)

# SAVE THE DATA WITH LABELS
# One CSV per entry of `data`, written to the ticker's labels directory as <key>.csv.
# .items() is used instead of .iteritems(): it yields the same (key, value) pairs on
# Python 2 and also exists on Python 3 dicts, where .iteritems() was removed.
for date, date_data in data.items():
    date_data.to_csv(os.path.join(ticker_labels_path, str(date) + '.csv'))

{'rolling_window': 25, 'labelling_method': 'PrMov', 'updown_threshold': 0.1, 'threshold_method': 'arbitrary'}


In [1]:
# Keys of the loaded `data` container. The NameError recorded below this cell comes
# from running it (execution count 1) before the cell that creates `data`; run the
# data-loading cell first.
dd_keys=data.keys()


NameError: name 'data' is not defined

## Create features for out-of-sample predictions