In [78]:
from hsmm_core.data_utils import DataLoader, TradingHours
from hsmm_core.feature_spaces import hmm_features
from hsmm_core.hsmm_runner import HmmCalibration
import time
from hsmm_core.consts import InitialisationMethod
from hsmm_core.data_utils import TradingHours, DataLoader
from hsmm_core.labelling import DataLabellingSimple
from hsmm_core.consts import ThresholdMethod, LabellingChoice
import pandas as pd
import numpy as np
import datetime as dt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier,  GradientBoostingClassifier
from sklearn.linear_model import RidgeClassifierCV
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.metrics import precision_recall_fscore_support
sc = StandardScaler()
import os
import pickle
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, recall_score, precision_score

##useful functions
def fwd_dates(_dates_list, _key_date):
    """Return the entries of ``_dates_list`` that compare strictly greater than ``_key_date``.

    The input order is preserved and an entry equal to ``_key_date`` is excluded.
    """
    later_dates = []
    for candidate in _dates_list:
        if candidate > _key_date:
            later_dates.append(candidate)
    return later_dates
def remove_nans(features_tuple, labels, idx=1):
    """Join features and labels column-wise and drop every row containing a NaN.

    Features and labels are concatenated first so that a row is removed from
    both at once and the two outputs stay aligned.

    :param features_tuple: sequence whose first four items are pandas objects
        to be concatenated column-wise into the feature matrix.
    :param labels: DataFrame holding the columns 'ReturnTradedPrice',
        'Duration', 'states', 'TradedTime' and 'TradedPrice' plus the label
        column(s); those five columns are discarded.
    :param idx: number of leading label columns to keep (default 1).
    :return: ``(features_, labels_)`` - the NaN-free feature columns and the
        NaN-free label column(s), sharing the same index.
    """
    features_df = pd.concat([features_tuple[0], features_tuple[1], features_tuple[2],
                             features_tuple[3]], axis=1, sort=False)
    labels_only = labels.drop(columns=['ReturnTradedPrice', 'Duration', 'states', 'TradedTime',
                                       'TradedPrice'])
    # ``sort`` must be the boolean False: the string 'False' is truthy (so it
    # would request sorting) and is rejected outright by newer pandas.
    df_concat = pd.concat([features_df, labels_only.iloc[:, 0:idx]], axis=1, sort=False)
    df_x_nan = df_concat.dropna()  # dropping all rows with any NaN
    # Split by position: the first n_features columns are features and the
    # rest are labels.  This keeps every requested label column out of the
    # features when idx > 1 and is unaffected by duplicate column names.
    n_features = features_df.shape[1]
    features_ = df_x_nan.iloc[:, :n_features]
    labels_ = df_x_nan.iloc[:, n_features:]
    return features_, labels_

def prec_recall_report(y_true, y_predict):
    """Arrange scikit-learn's precision/recall/F1/support figures as a DataFrame.

    The result of ``precision_recall_fscore_support`` is transposed so the
    metrics become the columns 'Precision', 'Recall', 'F1-score' and
    'Support'.  An extra 'Avg/Total' row carries the weighted averages, with
    its 'Support' cell overwritten by the sum of the 'Support' column.
    """
    metric_names = ['Precision', 'Recall', 'F1-score', 'Support']
    per_class = precision_recall_fscore_support(y_true, y_predict)
    report = pd.DataFrame(list(per_class), index=metric_names).T
    # Now add the 'Avg/Total' row
    weighted = precision_recall_fscore_support(y_true, y_predict, average='weighted')
    report.loc['Avg/Total', :] = weighted
    report.loc['Avg/Total', 'Support'] = report['Support'].sum()
    return report

class MarketFeatures(object):
    """Base-case, market-only indicators/features computed on a trades DataFrame.

    Requires a DataFrame with ``TradedPrice`` and ``Volume`` columns
    (``Duration`` as well for :meth:`ma_spread_duration`).  Every indicator
    method adds its column to ``self.df`` and returns ``self.df``.
    """

    def __init__(self, df):
        """Store the DataFrame the indicators are computed on."""
        self.df = df

    def load_data(self):
        """Placeholder; no data loading is implemented."""
        pass

    def _rolling_mean_spread(self, column, prefix, short_window, long_window):
        """Add ``long rolling mean - short rolling mean`` of ``column`` as ``<prefix>_<short>_<long>``."""
        short_mean = self.df[column].rolling(window=short_window).mean()
        long_mean = self.df[column].rolling(window=long_window).mean()
        name = "_".join((prefix, str(short_window), str(long_window)))
        self.df[name] = long_mean - short_mean
        return self.df

    def ma_spread(self, short_window=5, long_window=20):
        """Add the moving-average spread of ``TradedPrice``.

        The column ``px_indx_<short>_<long>`` holds the long-window rolling
        mean minus the short-window rolling mean; it can be used on its own
        or as an input for MACD.
        """
        return self._rolling_mean_spread('TradedPrice', 'px_indx', short_window, long_window)

    def ma_spread_duration(self, short_window=5, long_window=20):
        """Add the moving-average spread of ``Duration``.

        The column ``dur_indx_<short>_<long>`` holds the long-window rolling
        mean minus the short-window rolling mean.
        """
        return self._rolling_mean_spread('Duration', 'dur_indx', short_window, long_window)

    def obv_calc(self):
        """Add the on-balance volume indicator as column ``OBV``.

        OBV is the running total of volume signed by the direction of the
        price change: ``cumsum(sign(diff(TradedPrice)) * Volume)``.  The first
        row has no previous price, so it contributes 0.
        """
        # Sign the volume first and accumulate afterwards; accumulating the
        # signs and then multiplying by volume does not give OBV.
        direction = np.sign(self.df['TradedPrice'].diff()).fillna(0)
        self.df['OBV'] = (direction * self.df['Volume']).cumsum()
        return self.df

    def chaikin_mf(self, period=5):
        """Add the Chaikin money flow indicator as column ``CMF_<period>``.

        The money-flow multiplier is ``((P - low) - (high - P)) / (high - low)``
        where ``low``/``high`` are the expanding min/max of ``TradedPrice``
        (``period`` is passed as the minimum number of observations).  CMF is
        the ``period``-row rolling sum of multiplier * volume divided by the
        ``period``-row rolling sum of volume.
        """
        price = self.df['TradedPrice']
        volume = self.df['Volume']
        running_low = price.expanding(period).min()
        running_high = price.expanding(period).max()
        # NOTE(review): while running_high == running_low the division yields
        # NaN/inf; left as-is because the intended handling is not specified.
        mf_multiplier = ((price - running_low) - (running_high - price)) / (running_high - running_low)
        mf_volume = mf_multiplier * volume
        # Both numerator and denominator are rolling sums over the same
        # window; summing the numerator over the whole series would use
        # future rows in every ratio.
        self.df['CMF_' + str(period)] = mf_volume.rolling(period).sum() / volume.rolling(period).sum()
        return self.df



In [79]:
if __name__ == '__main__':

    ## locations

    # main directory referenced in all the code, read from the FINANCE_DATA environment variable
    data_dir = os.getenv('FINANCE_DATA')
    if data_dir is None:
        # os.getenv returns None for an unset variable and os.path.join would
        # then fail with an unrelated-looking error; fail early and clearly.
        raise EnvironmentError("FINANCE_DATA environment variable is not set")
    data_only_drive = '/mnt/usb-Seagate_Expansion_Desk_NA8XEHR6-0:0-part2'  # external data-only drive

    # central location for all the features/models/predictions
    # (main path where the features, models and labels sub-directories live)
    features_models = os.path.join(data_dir, 'features_models')

    # central location for all the labels
    labels = os.path.join(features_models, 'labels')
    # central location for all the features
    features = os.path.join(features_models, 'features')

    # location to save results
    model_save_loc = os.path.join(data_only_drive, 'Data', 'features_models', 'models')
    # from the features directory select all the symbols ending in .L (FTSE)
    symbols_ftse = [s for s in os.listdir(features) if s.endswith('.L')]
    main_path = os.path.join(data_dir, 'features_models')  # main directory (same as features_models)

    features_path = os.path.join(main_path, 'features')  # all the features - same as `features` above
    labels_path = os.path.join(main_path, 'labels')  # all the labels - same as `labels` above

    # target directory where all the models and output are saved (same as model_save_loc)
    model_paths = os.path.join(data_only_drive, 'Data', 'features_models', 'models')

    def symbol_fitted_models_path(symbol):
        """Return the directory holding all fitted single-kernel models of ``symbol``."""
        return os.path.join(model_paths, symbol, 'SINGLE_KERNEL')

    def symbol_list_fitted_dates(symbol):
        """Return the sorted entries of ``symbol``'s fitted-models directory."""
        return sorted(os.listdir(symbol_fitted_models_path(symbol)))

    def symbol_fitted_model_date_loc(file_path, model_date_no):
        """Return the path of the ``model_date_no``-th (sorted) entry of ``file_path``.

        The fitted model is stored in this sub-directory as a pickle.  The
        listing is taken from ``file_path`` itself, so the result no longer
        depends on whatever the global ``symbol`` happens to be.
        """
        return os.path.join(file_path, sorted(os.listdir(file_path))[model_date_no])

    def symbol_model_date_loc(model_date_path):
        """Return the path of the first entry listed in ``model_date_path``."""
        return os.path.join(model_date_path, os.listdir(model_date_path)[0])

    # test case ##

    # symbols to use as a starting point
    good_symbols = [
        'RDSa.L', 'PRU.L', 'III.L', 'REL.L', 'CNA.L', 'SHP.L', 'MKS.L',
        'CPI.L', 'ULVR.L', 'ECM.L', 'AV.L', 'GKN.L', 'TSCO.L', 'ITV.L',
        'BARC.L', 'CPG.L', 'AAL.L', 'LGEN.L', 'LAND.L', 'VOD.L', 'HSBA.L',
        'RSA.L', 'DMGOa.L', 'RR.L', 'DGE.L', 'BATS.L', 'MAB.L',
        'KGF.L', 'SPT.L', 'AZN.L'
    ]

    symbol = good_symbols[1]  # picking PRU as an example

    # test symbol path: the directory where all the fitted models of the symbol are, e.g.
    # '/mnt/usb-Seagate_Expansion_Desk_NA8XEHR6-0:0-part2/Data/features_models/models/PRU.L/SINGLE_KERNEL'
    test_path = symbol_fitted_models_path(symbol=symbol)

    # now take the sub-directory at position 5 of the sorted list of fitted-model dates
    test_fitted_model_date_loc = symbol_fitted_model_date_loc(test_path, 5)

    # the first item listed in that sub-directory is taken as the pickle file;
    # this is the entire path to load with pickle.load
    model_pickle = os.path.join(test_fitted_model_date_loc, os.listdir(test_fitted_model_date_loc)[0])

    # will take the input of a path (should be the models path, and a number and will produce a pickle file

    


In [80]:
test_fitted_model_date_loc

'/mnt/usb-Seagate_Expansion_Desk_NA8XEHR6-0:0-part2/Data/features_models/models/PRU.L/SINGLE_KERNEL/20170124'

In [81]:
model_pickle

'/mnt/usb-Seagate_Expansion_Desk_NA8XEHR6-0:0-part2/Data/features_models/models/PRU.L/SINGLE_KERNEL/20170124/PRU.L_20170124_label_PrMov__window_5__thres_arbitrary__0.1_clf_fitted_.pickle'

In [82]:
# Parenthesised single-argument prints give the same output on Python 2 and
# also parse on Python 3.
print(model_pickle)

# getting the fitted model date, as this will be used in fwd dates
model_date = test_fitted_model_date_loc.split("/")[-1]

print('Your symbol is:', symbol, 'and the model date is:' ,model_date)
symbol_labels_path = os.path.join(labels_path, symbol, 'NON_DIRECTIONAL')
print(symbol_labels_path)
symbol_features_path = os.path.join(features_path, symbol, 'MODEL_BASED')
print(symbol_features_path)

# get all the dates of the labels from the labels path - this may be a bit redundant in the end
# (the directory is listed once; each name is cut at its first '.')

labels_dates = sorted([label_file.split(".")[0] for label_file in os.listdir(symbol_labels_path)])

# this is the location of the out of sample features

oos_features_path = os.path.join(symbol_features_path, model_date)
# list of oos feature dates: the sixth '_'-separated token of each file name
# NOTE(review): under Python 2 the comprehension variable ``oos_date`` leaks
# into the enclosing scope; the comprehension is left untouched in case a
# later cell relies on that - confirm.
oos_dates_list = sorted([oos_date.split("_")[5] for oos_date in
                         sorted(os.listdir(oos_features_path))])  # list of oos features

# keep only the fwd dates i.e the oos dates
# NOTE(review): this rebinds the name ``fwd_dates`` from the helper function
# to its result, so running this cell a second time fails ('list' object is
# not callable).  The name is kept in case later cells read it - confirm and
# rename.

fwd_dates = fwd_dates(_dates_list=labels_dates, _key_date=model_date)






/mnt/usb-Seagate_Expansion_Desk_NA8XEHR6-0:0-part2/Data/features_models/models/PRU.L/SINGLE_KERNEL/20170124/PRU.L_20170124_label_PrMov__window_5__thres_arbitrary__0.1_clf_fitted_.pickle
('Your symbol is:', 'PRU.L', 'and the model date is:', '20170124')
/media/ak/WorkDrive/Data/features_models/labels/PRU.L/NON_DIRECTIONAL
/media/ak/WorkDrive/Data/features_models/features/PRU.L/MODEL_BASED


In [44]:
import fnmatch

In [45]:
# First entry returned by os.listdir for the out-of-sample features directory.
# os.listdir returns names in arbitrary order, so "first" is not a stable choice.
test=os.listdir(oos_features_path)[0]

In [46]:
model_pickle

'/mnt/usb-Seagate_Expansion_Desk_NA8XEHR6-0:0-part2/Data/features_models/models/PRU.L/SINGLE_KERNEL/20170124/PRU.L_20170124_label_PrMov__window_5__thres_arbitrary__0.1_clf_fitted_.pickle'

In [47]:
# Shell-style pattern matching any name that contains the date 20170125.
pat ='*20170125*'
# Single-argument print(...) behaves the same on Python 2 and parses on Python 3.
print(fnmatch.fnmatch(model_pickle, pat))

False


In [48]:
model_pickle

'/mnt/usb-Seagate_Expansion_Desk_NA8XEHR6-0:0-part2/Data/features_models/models/PRU.L/SINGLE_KERNEL/20170124/PRU.L_20170124_label_PrMov__window_5__thres_arbitrary__0.1_clf_fitted_.pickle'

In [49]:

# Print every out-of-sample feature file whose name matches ``pat``.
# fnmatch.filter applies the same matching as fnmatch.fnmatch to the whole listing.
for file_oos in fnmatch.filter(os.listdir(oos_features_path), pat):
    print(file_oos)

PRU.L_3_states_features_date:_20170125_now:_20181229_.pickle


In [50]:
# Assemble a feature file name by joining symbol, a fixed tag, ``oos_date`` and a fixed suffix with '_'.
# NOTE(review): ``oos_date`` is never assigned as a standalone variable in the visible cells -
# presumably it comes from earlier notebook state or a leaked Python 2 comprehension variable; confirm.
# NOTE(review): the 'now:_20181229_' part of the suffix is hard-coded.
oos_file = "_".join((symbol, '3_states_features_date:',oos_date,'now:_20181229_.pickle'))

In [51]:
# NOTE(review): the variable is named "precision" but the file is 'accuracy_dictionary.pickle' - confirm which metric is meant.
test_precision= os.path.join(test_path, 'accuracy_dictionary.pickle')
# Pickles are binary data: open in 'rb' and close the handle deterministically.
# NOTE: pickle.load executes arbitrary code from the file; only load trusted files.
with open(test_precision, 'rb') as precision_file:
    precision_dict = pickle.load(precision_file)

IOError: [Errno 2] No such file or directory: '/mnt/usb-Seagate_Expansion_Desk_NA8XEHR6-0:0-part2/Data/features_models/models/PRU.L/SINGLE_KERNEL/accuracy_dictionary.pickle'

In [52]:
test_recall= os.path.join(test_path, 'recall_dictionary.pickle')
# Pickles are binary data: open in 'rb' and close the handle deterministically.
# NOTE: pickle.load executes arbitrary code from the file; only load trusted files.
with open(test_recall, 'rb') as recall_file:
    recall_dict = pickle.load(recall_file)

IOError: [Errno 2] No such file or directory: '/mnt/usb-Seagate_Expansion_Desk_NA8XEHR6-0:0-part2/Data/features_models/models/PRU.L/SINGLE_KERNEL/recall_dictionary.pickle'

In [53]:
# Toy example: arithmetic mean of a dictionary's values.
G = {'E': 18.0, 'D': 17.0, 'C': 19.0, 'B': 15.0, 'A': 0}
d = sum(G.values()) / float(len(G))
print(d)

13.8


In [None]:
# Mean (d2) and median (d3) of the values stored in precision_dict.
d2=float(sum(precision_dict.values()))/len(precision_dict)
# Materialise the values as a list: on Python 3 ``dict.values()`` is a view
# that np.median cannot reduce; on Python 2 this is a no-op copy.
d3= float(np.median(list(precision_dict.values())))

In [None]:
# Mean (r2) and median (r3) of the values stored in recall_dict.
r2=float(sum(recall_dict.values()))/len(recall_dict)
# Materialise the values as a list: on Python 3 ``dict.values()`` is a view
# that np.median cannot reduce; on Python 2 this is a no-op copy.
r3= float(np.median(list(recall_dict.values())))

In [None]:
r3

In [None]:
import matplotlib.pyplot as plt
# Histogram of the values stored in precision_dict.  The values are passed as
# a list because a Python 3 dict view is not a sequence.
plt.hist(list(precision_dict.values()))
plt.show()

In [None]:
# Re-computes the fitted-model directory at position 5 under test_path
# (same call as in the __main__ cell above).
test_fitted_model_date_loc = symbol_fitted_model_date_loc(test_path, 5)

In [None]:
test_fitted_model_date_loc

In [57]:
# One directory per fitted-model date under test_path, in sorted order.
# The listing is read and sorted once instead of once per entry.
model_directories = [os.path.join(test_path, date_dir) for date_dir in sorted(os.listdir(test_path))]

In [67]:
# For every model directory, the path of the first entry os.listdir returns for it.
models_locations = [os.path.join(model_dir, os.listdir(model_dir)[0])
                    for model_dir in model_directories]
    

In [77]:
# Walk over every fitted-model pickle path; after the loop ``model_date`` and
# ``model_pickle`` hold the values of the last entry.
for model_idx, model_loc in enumerate(models_locations):
    # The date is the name of the directory containing the pickle.  Taking it
    # from the path structure avoids a hard-coded component index that only
    # holds for one particular mount point depth.
    model_date = os.path.basename(os.path.dirname(model_loc))
    model_pickle = model_loc

20170117
20170118
20170119
20170120
20170123
20170124
20170125
20170126
20170127
20170130
20170131
20170301
20170703
20170704
20170705
20170706
20170707
20170710
20170711
20170712
20170713
20170714
20170717
20170718
20170719
20170720
20170721
20170724
20170725
20170726
20170727
20170728
20170731
20170801
20170802
20170803
20170804
20170807
20170808
20170809
20170810
20170811
20170814
20170815
20170816
20170817
20170818
20170821
20170822
20170823
20170824
20170825
20170829
20170830
20170831
20180201
20180202
20180205
20180206
20180207
20180208
20180209
20180212
20180213
20180214
20180215
20180216
20180219
20180220
20180221
20180222
20180223
20180226
20180227
20180228
20180403
20180404
20180405
20180406
20180409
20180410
20180411
20180413
20180418
20180419
20180420


In [None]:
def model_loc_specific(model_no):
    """Return the pickle path stored at position ``model_no`` of ``models_locations``.

    The list built in the cells above is named ``models_locations`` (the
    previous ``model_locations`` is not defined in the visible cells), and
    its items are path strings, so indexing an item with ``[0]`` would only
    return the path's first character.
    """
    return models_locations[model_no]

In [None]:
model_no = 4
# The list built above is named ``models_locations``; ``model_locations`` is
# not defined in the visible cells.
models_locations[model_no]

In [None]:
# model_pickle = os.path.join(test_fitted_model_date_loc, os.listdir(test_fitted_model_date_loc)[0])

In [None]:
def model_pickle_to_svc(model_pickle):
    """Load a fitted-model pickle and return the best estimator of its 'SVC' entry.

    :param model_pickle: path of a pickle file holding a mapping whose 'SVC'
        value exposes a ``best_estimator_`` attribute.
    :return: that ``best_estimator_`` object.

    NOTE: ``pickle.load`` executes arbitrary code from the file; only use it
    on files you trust.
    """
    # The context manager closes the handle even if unpickling fails.
    with open(model_pickle, "rb") as pickle_file:
        pickle_to_file = pickle.load(pickle_file)
    return pickle_to_file['SVC'].best_estimator_

In [None]:
def model_date(fitted_model_date_loc):
    """Return the text after the last '/' of ``fitted_model_date_loc`` (the whole string if it has none)."""
    _, _, last_component = fitted_model_date_loc.rpartition("/")
    return last_component

In [None]:
main_path

In [27]:
# Directory on the data-only drive that holds the metrics pickles.
metrics_loc = os.path.join(data_only_drive, 'Data','features_models','metrics')

In [34]:
os.listdir(metrics_loc)

['PRU.L_20170124_results_metrics.pickle']

In [35]:
# Path of the first entry os.listdir returns for the metrics directory.
# os.listdir order is arbitrary, so this is only well-defined while the directory holds a single file.
metrics_file = os.path.join(metrics_loc,os.listdir(metrics_loc)[0])

In [37]:
# Load the metrics pickle; the context manager closes the file handle.
# NOTE: pickle.load executes arbitrary code from the file; only load trusted files.
with open(metrics_file, "rb") as metrics_handle:
    metrics_dict = pickle.load(metrics_handle)

In [38]:
# NOTE(review): the original name ``metrics_dicamaz`` is not defined anywhere in the
# visible cells; the recorded output (a list of metric names) matches the keys of
# ``metrics_dict`` - confirm.
metrics_dict.keys()


['recall', 'F1-score', 'accuracy']

In [21]:
metrics_dict_load

'/mnt/usb-Seagate_Expansion_Desk_NA8XEHR6-0:0-part2/Data/features_models/metrics/PRU.L_20170124_results_metrics.pickle'