In [1]:
from hsmm_core.prediction_engines import *
from hsmm_core.data_utils import load_data, TradingHours
import numpy as np
import os
import pickle
from hsmm_core.consts import ThresholdMethod, LabellingChoice

In [59]:
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.grid_search import GridSearchCV
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier,  GradientBoostingClassifier
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import RidgeClassifierCV
from sklearn.model_selection import KFold, cross_val_score
from sklearn.cross_validation import KFold
from sklearn.multiclass import OneVsRestClassifier #support from multiclass
#####metrics
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import precision_recall_fscore_support
import matplotlib.pyplot as plt
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
import sklearn.linear_model as lm
%matplotlib inline

plt.style.use('ggplot')
%matplotlib inline

from sklearn.svm import SVC


sc = StandardScaler()

In [24]:
def fwd_dates(_dates_list, _key_date):
    #returns a list of dates that are forward from the key_date
    fwd_dates_list  = [i for i in _dates_list if i > _key_date]
    return fwd_dates_list

def remove_nans_labels(features_tuple, labels): 
    #function to clean up nans as I seem to use it a lot, so better to have one function
    #combines the features and labels and removes rows with nans across so we dont lost the ordering
    #returns features and labels
    features_df = pd.concat([features_tuple[0], features_tuple[1], features_tuple[2], \
                             features_tuple[3]], axis=1, sort=False)
    labels_only = labels.drop(columns=['ReturnTradedPrice','Duration', 'states', 'TradedTime',
                                         'TradedPrice', 'ticker'], axis=1)
    df_concat = pd.concat([features_df, labels_only.iloc[:, 0:1]], axis=1, sort='False')#only using 1st set of label
    df_x_nan = df_concat.dropna() #dropping all nans
    label_column_loc_ = df_x_nan.shape[1] - 1 #location of labels column in the clean df
    labels_ = df_x_nan.iloc[:, label_column_loc_:label_column_loc_ + 1]#keep pure labels
    features_ = df_x_nan.drop(df_x_nan.columns[label_column_loc_], axis=1) #keeping the features only
    
    return features_, labels_

def remove_nans_duration(features_tuple, labels): 
    #function to clean up nans as I seem to use it a lot, so better to have one function
    #combines the features and labels and removes rows with nans across so we dont lost the ordering
    #returns features and labels
    features_df = pd.concat([features_tuple[0], features_tuple[1], features_tuple[2], \
                             features_tuple[3]], axis=1, sort=False)
    labels_only = labels.drop(columns=['ReturnTradedPrice', 'states', 'TradedTime',
                                         'TradedPrice', 'ticker'], axis=1)
    df_concat = pd.concat([features_df, labels_only['Duration']], axis=1, sort='False')#only using 1st set of label
    df_x_nan = df_concat.dropna() #dropping all nans
    label_column_loc_ = df_x_nan.shape[1] - 1 #location of labels column in the clean df
    labels_ = df_x_nan.iloc[:, label_column_loc_:label_column_loc_ + 1]#keep pure labels
    features_ = df_x_nan.drop(df_x_nan.columns[label_column_loc_], axis=1) #keeping the features only
    
    return features_, labels_

def prec_recall_report(y_true_, y_predict_):
    #function to ge the sci-kit learn classification metrics into a pretty DF for csv!
    report = pd.DataFrame(list(precision_recall_fscore_support(y_true_, y_predict_)),
                          index=['Precision', 'Recall', 'F1-score', 'Support']).T
    # Now add the 'Avg/Total' row
    report.loc['Avg/Total', :] = precision_recall_fscore_support(y_true_, y_predict_, average='weighted')
    report.loc['Avg/Total', 'Support'] = report['Support'].sum()
    return report

In [11]:

#####OOP#####
class DataLoader(object):
    def __init__(self, path_, ticker):
        self.main_path = path_
        self.ticker = ticker
        self.labels_path = os.path.join(self.main_path, 'labels')
        self.features_path = os.path.join(self.main_path, 'features')
        self.ticker_labels_path = os.path.join(self.labels_path, self.ticker)
        self.ticker_features_path = os.path.join(self.features_path, self.ticker)

    def ticker_features(self, date):
        file_loc = os.path.join(self.ticker_features_path, str(date) + '.pickle')
        with open(file_loc, 'rb') as handle:
            ticker_features = pickle.load(handle)
        return ticker_features

    def ticker_labels_pickle(self, date):
        file_loc = os.path.join(self.ticker_labels_path, str(date) + '.pickle')
        with open(file_loc, 'rb') as handle:
            ticker_labels = pickle.load(handle)
        return ticker_labels

    def ticker_labels_csv(self, date):
        file_loc = os.path.join(self.ticker_labels_path, str(date) + '.csv')
        ticker_labels = pd.read_csv(file_loc)
        return ticker_labels

    @staticmethod
    def open_pickle_file(path, pickle_file):
        file_loc = os.path.join(path, pickle_file)
        pickle_to_file = pickle.load(open(file_loc, "rb"))
        return pickle_to_file

    @staticmethod
    def get_date_from_file(file_, numb_):
        return os.path.splitext(file_[numb_])[0]


class PriceIndicators(object):
    # a class to be expanded that uses features for base case -price only-indicators
    """"Requires:
    symbol - A stock symbol on which to form a strategy on.
    short_window - Lookback period for short moving average.
    long_window - Lookback period for long moving average."""

    def __init__(self, symbol, labels_df):
        self.symbol = symbol
        self.labels = labels_df

    def MACD(self, short_window=5, long_window=20):
        short_rolling_px = self.labels['TradedPrice'].rolling(window=short_window).mean()
        long_rolling_px = self.labels['TradedPrice'].rolling(window=long_window).mean()
        px_indx = long_rolling_px - short_rolling_px
        return px_indx


class FitModels(object):

    def __init__(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train
    # # Train a SVM classification model

    def ridge_clf(self, cv_folds=5):
        
        model_ridge_clf= RidgeClassifierCV(alphas = np.arange(0.1, 1000,0.1),\
                                           cv=KFold(cv_folds),normalize= True).fit(self.X_train,self.y_train.values.ravel())
        #check if class_weight should be used as 'balanced'

        return model_ridge_clf

    def svm_clf(self, kernel_choice):

        param_grid = dict(kernel=[str(kernel_choice)],
                          C=[1, 5, 10, 25, 50, 100],
                          gamma=[0.0001, 0.001, 0.01, 0.02, 0.05, 0.01])
        svc = svm.SVC(class_weight='balanced')
        clf = GridSearchCV(svc,param_grid)
        clf.fit(self.X_train, np.asanyarray(self.y_train).reshape(self.y_train.shape[0]))

        return clf

    def gradient_boost_clf(self, learning_rate=0.25):
        #this needs to be written properly- but this is somewhat optimised#
        GBR = GradientBoostingClassifier(n_estimators=3000, learning_rate=learning_rate,
                                           max_depth=4, max_features='sqrt',
                                           min_samples_leaf=15, min_samples_split=10)

        gb_boost_clf = GBR.fit(self.X_train, self.y_train)

        return gb_boost_clf
    
    def gp_clf(self):
        #The length parameter l controls the smoothness of the function and σf the vertical variation. 
        #For simplicity, we use the same length parameter l for all input dimensions (isotropic kernel)
        
        kernel = 1.0 * RBF([1.0]) #isotropic
        gpc_rbf_isotropic = GaussianProcessClassifier(kernel=kernel).fit(self.X_train, self.y_train)
        #hyperparameters are optimised by default
        return gpc_rbf_isotropic

    def random_forest_clf(self, no_est=100):
        rfc=RandomForestClassifier(n_estimators=no_est, max_depth=4,n_jobs=-1, warm_start=True)
        rfc.fit(X_train, y_train)
        
        return rfc


    def run_cv(self, clf_class, **kwargs):
        # Construct a kfolds object
        kf = KFold(len(self.y_train), n_folds=10, shuffle=True)
        y_pred = self.y_train.copy()

        # Iterate through folds
        for train_index, test_index in kf:
            X_train_local, X_test_local = self.X_train[train_index], self.X_train[test_index]
            y_train_local = self.y_train[train_index]
            # Initialize a classifier with key word arguments
            clf = clf_class(**kwargs)
            clf.fit(self.X_train, self.y_train)
            y_pred[test_index] = clf.predict(X_test_local)
        return y_pred


class PredictModels(FitModels):
    def __init__(self):
        pass

def no_nans(label):
    return np.sum(np.isnan(label))


def remove_last_element(arr):
    return arr[np.arange(arr.size - 1)]



In [12]:
ticker = 'SYNT_2states'
features_path='/home/ak/Documents/Data/features_models/features/'

####paths####
labels_path = '/home/ak/Documents/Data/features_models/labels'
main_path = '/home/ak/Documents/Data/features_models/'

models_path=os.path.join(main_path,'models')
hmm_models_path = os.path.join(models_path,'hmm_models')
features_ticker_path = os.path.join(features_path, ticker)
predictions_path = os.path.join(main_path, 'predictions')
# ticker = 'SYNT_4states'

features_path = os.path.join(main_path, 'features')
ticker_labels_path = os.path.join(labels_path, ticker)
ticker_models_path = os.path.join(models_path, ticker)
ticker_predictions_path = os.path.join(predictions_path, ticker)

ticker_features_path = os.path.join(features_path, ticker)

###

# list of files    
labels_list = os.listdir(ticker_labels_path)

features_list = os.listdir(ticker_features_path)

In [13]:
os.listdir(models_path)

['2_state_trained_hmm_params_.pickle',
 '2_state_trained_hmm_models_.pickle',
 'SYNT_2states',
 'SYNT_3states',
 'hmm_models',
 'SYNT_4states']

In [16]:
####
data_cls = DataLoader(path_=main_path, ticker=ticker)
idx = 1  # take first label-index for the data-frame of labels


In [17]:


n_hidden_states = 2

startprob = np.array([0.6, 0.4])

transmat = np.array([[0.2, 0.8], [0.6, 0.4]])


init_params = {
    "obs_model_params": {
                                'obs_model_name': 'ExpIndMixDiracGauss',
                                'em_init_method': InitialisationMethod.cluster

    },
    "hidden_model_params": {
                                'no_hidden_states': n_hidden_states,
                                'pi':startprob,
                                'tpm': transmat,
                                'em_init_method': InitialisationMethod.uniform
    },
    "update_tag": 'tpsml'
}


# start_dt = '20171002'

# end_dt = '20171003'

trading_hours_filter = TradingHours.only_mkt_hours

data_dic = load_data(ticker, no_of_days=2) #, start_date=start_dt, end_date=end_dt)
hmm_calibration_engine = hmm_calibration(no_parallel_procs=None,
                                         init_params=init_params)


trained_hmms = hmm_calibration_engine.hmm_fit_func(ticker, data_dic, trading_hours_filter,
                                                   force_recalc=False)

###saving hmm model params###
seq_params = "_".join((str(n_hidden_states),'state',"trained","hmm","params", ".pickle"))
print("saving the model params:",seq_params)
pickle.dump(init_params, open(os.path.join(models_path,seq_params), 'wb'))
###saving trained model hmms###
seq_model = "_".join((str(n_hidden_states),'state',"trained","hmm","models", ".pickle"))
print("saving the model params:",seq_params)
pickle.dump(init_params, open(os.path.join(models_path,seq_model), 'wb'))

models_dates=trained_hmms.keys()


Calibrating hmm on date 20171221
Number of points in data set is 4852, number of points with large price change 3301
('saving the model params:', '2_state_trained_hmm_params_.pickle')
('saving the model params:', '2_state_trained_hmm_params_.pickle')


In [18]:
# for date, date_hmm in trained_hmms.iteritems():
#     feature_engine = hmm_features(date_hmm)
#     features = feature_engine.generate_features(data_dic[date])

In [19]:
models_dates

['20171221', '20171017']

In [36]:
import time

start = time.time()

for date, date_hmm in trained_hmms.iteritems():
    newpath=os.path.join(features_ticker_path, str(date))
    if not os.path.exists(newpath):
        os.makedirs(newpath)

    feature_engine = hmm_features(date_hmm)
    features_load = feature_engine.generate_features(data_dic[date])
    labels_load = data_cls.ticker_labels_csv(date=date)
    features, duration = remove_nans_duration(features_load, labels_load)
#     x_std = sc.fit_transform(features.values.astype(np.float)) #fit & transform the features
#     X_train, X_test, y_train, y_test = train_test_split( \
#         x_std, labels_clean, test_size=0.05, random_state=1, stratify=labels_clean) #probably can get rid of this
# #     models_cls = FitModels(X_train, y_train)
# #     best_clfs = {'SVC': models_cls.svm_clf(kernel_choice="rbf"), 
# #                  'Ridge_clf': models_cls.ridge_clf(), 
#                  'GBOOST': models_cls.gradient_boost_clf(),
#                  'GP_clf': models_cls.gp_clf(),
#                  'RF_clf': models_cls.random_forest_clf(),
#                 }
            
#     # This is sequence for the name of the best classifiers.
#     seq_clf = "_".join((str(date),labels_clean.columns.values[0],"clfs", ".pickle"))
#     print("saving the classifiers:",seq_clf)
#     pickle.dump(best_clfs, open(os.path.join(ticker_models_path,seq_clf), 'wb'))


end = time.time()
print(end - start)            

4.85042381287


In [37]:
print(features.shape[0]==duration.shape[0])


True


In [65]:
features[:-1].shape

(4846, 15)

In [80]:
from sklearn.svm import SVR
import matplotlib.pyplot as plt
y=duration[1:].values
X=sc.fit_transform(features[:-1].values.astype(np.float))
clf = svm.SVR(C=1.0, epsilon=0.2)
# Fit regression model
svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1, epsilon=0.25)
svr_lin = SVR(kernel='linear', C=1e3,epsilon=0.25)
svr_poly = SVR(kernel='poly', C=1e3, degree=2, epsilon=0.25)
y_rbf = svr_rbf.fit(X, y).predict(X)
y_lin = svr_lin.fit(X, y).predict(X)
y_poly = svr_poly.fit(X, y).predict(X)
y_predict_vector=[y_rbf, y_lin, y_poly]
# #############################################################################
# Look at the results
# lw = 2
# # plt.scatter(y, y, color='darkorange', label='data')
# plt.scatter(y, y_rbf, color='navy', lw=lw, label='RBF model')
# # plt.plot(y, y_lin, color='c', lw=lw, label='Linear model')
# # plt.plot(y, y_poly, color='cornflowerblue', lw=lw, label='Polynomial model')
# plt.xlabel('data')
# plt.ylabel('target')

from sklearn.metrics import explained_variance_score, mean_squared_error, r2_score
def regression_metrics(y_predict, y_true):
    print ('R2 score:',r2_score(y_true, y_predict))
    print ('Explained_Variance Score:',explained_variance_score(y_true, y_predict))
          
for output in y_predict_vector:
    regression_metrics(y, output)
        

('R2 score:', -13.941455382705358)
('Explained_Variance Score:', -12.579151849446776)
('R2 score:', -53.21043008911015)
('Explained_Variance Score:', -47.786034019888774)
('R2 score:', -38.95998696599079)
('Explained_Variance Score:', -34.8434531899501)


In [75]:
from sklearn.model_selection import learning_curve

X_plot = np.linspace(0, 5, 100000)[:, None]
train_sizes, train_scores_svr, test_scores_svr = \
    learning_curve(clf, X[:300], y[:300], train_sizes=np.linspace(0.1, 1, 10),
                   scoring="neg_mean_squared_error", cv=10)
    
y_svr = clf.predict(X_plot)
svr_predict = time.time() - t0
print("SVR prediction for %d inputs in %.3f s"
      % (X_plot.shape[0], svr_predict))

NotFittedError: This SVR instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
learning_rates=[0.05, 0.1, 0.25, 0.75,1]
X_train_sub, X_validation_sub, y_train_sub, y_validation_sub = train_test_split(X_, y_, random_state=0)
for learning_rate in learning_rates:
    GBR = GradientBoostingClassifier(n_estimators=3000, learning_rate=learning_rate,
                                           max_depth=4, max_features='sqrt',
                                           min_samples_leaf=15, min_samples_split=10)
    GBR.fit(X_train_sub,y_train_sub)
    print ("accuracy_score(validation):{0:3f}".format(GBR.score(X_validation_sub, y_validation_sub)))

In [20]:
oos_data_dic =load_data(ticker, no_of_days=20)
hmm_fwd_dates= fwd_dates(_dates_list=oos_data_dic.keys(),_key_date=date) #create fwd out of sample dates

for fwd_date in hmm_fwd_dates: #OOS testing
    fwd_features = feature_engine.generate_features(oos_data_dic[fwd_date])
    fwd_labels = data_cls.ticker_labels_csv(date=fwd_date)
    features_fwd, labels_fwd= remove_nans(fwd_features, fwd_labels)    
    x_std_fwd = sc.fit_transform(features_fwd.values.astype(np.float)) #fit & transform the features
    y_true = labels_fwd
    CLFs=['SVC', 'GP_clf', 'GBOOST']
    for clf in CLFs:
        y_predict_clf = best_clfs[clf].predict(x_std_fwd)
        classif_rate= np.mean(y_predict_clf.ravel() == np.asanyarray(y_true).ravel())*100
        print("classification rate for %s %f"%(clf, classif_rate))
#         clf_report= prec_recall_report(y_true, y_predict_clf)
#         report_name = "_".join(( 'performance','report','ticker',str(fwd_date),'.csv'))
#         report_loc = os.path.join(ticker_predictions_path, report_name)
#         clf_report.to_csv(report_name)


classification rate for SVC 29.898403
classification rate for GP_clf 47.978437
classification rate for GBOOST 43.520630
classification rate for SVC 46.079440
classification rate for GP_clf 37.415106
classification rate for GBOOST 43.136448
classification rate for SVC 29.347602
classification rate for GP_clf 49.598683
classification rate for GBOOST 42.539617
classification rate for SVC 39.575608
classification rate for GP_clf 42.315616
classification rate for GBOOST 44.705398
classification rate for SVC 48.651986
classification rate for GP_clf 37.579749
classification rate for GBOOST 41.181313
classification rate for SVC 31.067961
classification rate for GP_clf 46.684569
classification rate for GBOOST 42.016112
classification rate for SVC 34.337598
classification rate for GP_clf 46.657037
classification rate for GBOOST 47.111019
classification rate for SVC 28.280124
classification rate for GP_clf 48.671473
classification rate for GBOOST 43.501545


In [None]:
features = feature_engine.generate_features(data_dic[date])
labels = data_cls.ticker_labels_csv(date=date)

In [None]:
X_test, y_test =remove_nans(features, labels)

In [None]:
X_test.shape

In [None]:
y_test.shape

In [None]:
oos_data_dic =load_data(ticker, no_of_days=20)

In [None]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline
model=best_clfs['KR']
model.probability = True



# fpr, tpr, _ = roc_curve(y_test, y_predict_probabilities)
# roc_auc = auc(fpr, tpr)

# plt.figure()
# plt.plot(fpr, tpr, color='darkorange',
#          lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
# plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
# plt.xlim([0.0, 1.0])
# plt.ylim([0.0, 1.05])
# plt.xlabel('False Positive Rate')
# plt.ylabel('True Positive Rate')
# plt.title('ROC Curve')
# plt.legend(loc="lower right")
# plt.show()

In [None]:
model=best_clfs['SVC']

In [None]:
model.probability

In [None]:
hmm_keys=trained_hmms.keys()
trained_hmms[hmm_keys[0]]

In [None]:
import pandas as pd 
import numpy as np 
from sklearn.datasets import load_svmlight_file, dump_svmlight_file
from MKLpy.regularization import normalization,rescale_01
from MKLpy.metrics.pairwise import HPK_kernel
from MKLpy.regularization import kernel_centering, kernel_normalization, tracenorm
from MKLpy.algorithms import EasyMKL,RMGD,RMKL,AverageMKL
import os
from sklearn.metrics.pairwise import rbf_kernel as RBF 


In [None]:
path = '/home/ak/Documents/Research/temp/mkl_example.dmp'

# os.listdir(path)


In [None]:
y_train_mkl=y_train.values.reshape(y_train.shape[0])
x_train_mkl=X_train
dump_svmlight_file(X_train, y_train_mkl, path) 

In [None]:
X,Y = load_svmlight_file(path)
X = X.toarray()	#Important! MKLpy require dense matrices!
X = rescale_01(X)
X = normalization(X) 

KL = [HPK_kernel(X,degree=d) for d in range(1,11)]
KL2 = [RBF(X, gamma=gamma) for gamma in [1., 10, 100.]] 

EasyMKL().fit(KL2,Y)