In [4]:

import pandas as pd
import numpy as np
import datetime as dt
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_recall_fscore_support
sc = StandardScaler()
import os
import pickle
import fnmatch
import os
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, recall_score, precision_score
from collections import defaultdict

In [5]:
##useful functions

def fwd_dates(_dates_list, _key_date):
    """Return, in their original order, the entries of ``_dates_list`` that
    compare strictly greater than ``_key_date``."""
    later_dates = []
    for candidate in _dates_list:
        if candidate > _key_date:
            later_dates.append(candidate)
    return later_dates

def common_member(a, b):
    """Return the set of elements present in both ``a`` and ``b``.

    When the two iterables share nothing, the string
    ``"no common elements"`` is returned instead of an empty set, so callers
    must check the result type before iterating over it.
    """
    shared = set(a) & set(b)
    if not shared:
        return "no common elements"
    return shared


def remove_nans(features_tuple, labels, idx=1):
    """Join features and labels, drop every row containing a NaN, and split them again.

    Features and labels are concatenated column-wise before the NaN rows are
    dropped, so both outputs keep the same rows in the same order.

    Parameters
    ----------
    features_tuple : sequence of DataFrames
        The first four elements are concatenated side by side (fewer than four
        raises ``IndexError``).
    labels : DataFrame
        Must contain the columns 'ReturnTradedPrice', 'Duration', 'states',
        'TradedTime' and 'TradedPrice'; these are removed and the first
        ``idx`` remaining columns are treated as label candidates.
    idx : int, default 1
        Number of leading label columns joined to the features. Only the LAST
        column of the joined frame is returned as the label, so with
        ``idx > 1`` the earlier label columns stay inside the returned
        features.

    Returns
    -------
    (features_, labels_) : tuple of DataFrames
        NaN-free features and the single-column label frame.
    """
    non_label_columns = ['ReturnTradedPrice', 'Duration', 'states', 'TradedTime', 'TradedPrice']

    features_df = pd.concat([features_tuple[position] for position in range(4)],
                            axis=1, sort=False)
    labels_only = labels.drop(columns=non_label_columns)
    # `sort` must be the boolean False: the original string 'False' is truthy
    # and is rejected outright by recent pandas versions.
    df_concat = pd.concat([features_df, labels_only.iloc[:, 0:idx]], axis=1, sort=False)

    df_x_nan = df_concat.dropna()  # dropping all rows that contain a NaN
    label_column_loc_ = df_x_nan.shape[1] - 1  # the label is the last column of the clean frame
    labels_ = df_x_nan.iloc[:, label_column_loc_:label_column_loc_ + 1]  # keep it as a DataFrame
    features_ = df_x_nan.drop(df_x_nan.columns[label_column_loc_], axis=1)  # everything but the label
    return features_, labels_


def prec_recall_report(y_true, y_predict):
    """Arrange scikit-learn's precision/recall/F1/support figures in a DataFrame.

    One row per class plus an 'Avg/Total' row that holds the weighted averages
    and the summed support; handy for dumping the report to csv.
    """
    metric_names = ['Precision', 'Recall', 'F1-score', 'Support']

    per_class_scores = precision_recall_fscore_support(y_true, y_predict)
    report = pd.DataFrame(list(per_class_scores), index=metric_names).T

    # weighted averages go in first; their support slot is then overwritten
    # with the total support across classes
    weighted_scores = precision_recall_fscore_support(y_true, y_predict, average='weighted')
    report.loc['Avg/Total', :] = weighted_scores
    report.loc['Avg/Total', 'Support'] = report['Support'].sum()
    return report


class DataLoader(object):
    """Locate and load the per-ticker model-based features and non-directional labels.

    Directory layout, relative to ``path_main``::

        features_models/features/<ticker>/MODEL_BASED/<model date>/<feature pickles>
        features_models/labels/<ticker>/NON_DIRECTIONAL/<date>.csv

    NOTE: the loaders use ``pickle``, which can execute arbitrary code while
    loading; only point this class at files you trust.
    """

    def __init__(self, path_main, ticker):
        self.main_path = path_main
        self.ticker = ticker

        self.features_labels_path = os.path.join(self.main_path, 'features_models')
        self.features_path = os.path.join(self.features_labels_path, 'features')
        # collection of per symbol non directional labels
        self.labels_path = os.path.join(self.features_labels_path, 'labels', self.ticker, 'NON_DIRECTIONAL')
        self.symbol_features_path = os.path.join(self.features_labels_path, 'features', self.ticker, 'MODEL_BASED')
        # one folder per HMM model date; each folder holds that model's out-of-sample features
        self.hmm_dates_list = os.listdir(self.symbol_features_path)
        # The compute date is token 7 of an (arbitrary) feature file name.
        # NOTE(review): index [1] is used for both the folder and the file, so
        # this needs at least two entries at each level and depends on the
        # unspecified ordering of os.listdir - confirm that is intended.
        sample_model_dir = os.path.join(self.symbol_features_path, self.hmm_dates_list[1])
        self.compute_date = os.listdir(sample_model_dir)[1].split("_")[7]

    def ticker_features(self, model_date, date):
        """Load the features for ``date`` produced by the model fitted on ``model_date``.

        Raises ``ValueError`` when ``date`` is not strictly after ``model_date``
        (the requested features would be in-sample). Previously this case
        printed a message and then failed with ``UnboundLocalError``.
        """
        # need to make this a lot more flexible with number of states
        if not model_date < date:
            raise ValueError('Loading Feature Date which is in-sample. Change your Model Date')
        file_name = "_".join(
            (self.ticker, '3', 'states', 'features', 'date:', date, 'now:', self.compute_date, '.pickle'))
        file_loc = os.path.join(self.symbol_features_path, str(model_date), file_name)
        with open(file_loc, 'rb') as handle:
            ticker_features = pickle.load(handle)
        return ticker_features

    def ticker_labels_csv(self, date):
        """Read the labels csv for ``date``, using its first column as the index."""
        file_loc = os.path.join(self.labels_path, str(date) + '.csv')
        ticker_labels = pd.read_csv(file_loc, index_col=0)
        return ticker_labels

    @staticmethod
    def open_pickle_file(path, pickle_file):
        """Unpickle ``pickle_file`` located in ``path`` and return the object."""
        file_loc = os.path.join(path, pickle_file)
        # the context manager closes the handle even if unpickling fails
        with open(file_loc, "rb") as handle:
            pickle_to_file = pickle.load(handle)
        return pickle_to_file

    @staticmethod
    def get_date_from_file(file_, numb_):
        """Return element ``numb_`` of the sequence ``file_`` with its extension stripped."""
        return os.path.splitext(file_[numb_])[0]

class MarketFeatures(object):
    """Base-case, market-only indicators computed on a trades DataFrame.

    Requires a DataFrame with ``TradedPrice`` and ``Volume`` columns
    (plus ``Duration`` for :meth:`ma_spread_duration`).

    Every indicator method adds its result column to ``self.df`` and returns
    ``self.df``, so the frame handed to the constructor is modified. No
    temporary helper columns are left behind.
    """

    def __init__(self, df):
        self.df = df

    def load_data(self):
        """Placeholder; data is currently supplied through the constructor."""
        pass

    def _rolling_mean_spread(self, column, prefix, short_window, long_window):
        # shared by the price and duration spreads: long rolling mean minus short rolling mean
        short_rolling = self.df[column].rolling(window=short_window).mean()
        long_rolling = self.df[column].rolling(window=long_window).mean()
        spread_name = "_".join((prefix, str(short_window), str(long_window)))
        self.df[spread_name] = long_rolling - short_rolling
        return self.df

    def ma_spread(self, short_window=5, long_window=20):
        """Add ``px_indx_<short>_<long>``: long minus short rolling mean of TradedPrice.

        Can be used on its own or as an input for MACD.
        """
        return self._rolling_mean_spread('TradedPrice', 'px_indx', short_window, long_window)

    def ma_spread_duration(self, short_window=5, long_window=20):
        """Add ``dur_indx_<short>_<long>``: long minus short rolling mean of Duration."""
        return self._rolling_mean_spread('Duration', 'dur_indx', short_window, long_window)

    def obv_calc(self):
        """Add the ``OBV`` (on-balance-volume style) column.

        NOTE(review): the value is Volume multiplied by the running sum of the
        price-change signs, which is not the textbook OBV (running sum of the
        signed volume). The formula is intentionally left unchanged because
        already-fitted models may consume this column - confirm before
        changing it.
        """
        signed_volume = self.df['Volume'] * np.sign(self.df['TradedPrice'].diff()).cumsum()
        # Zero the second observation (row 0 stays NaN because of diff()).
        # Working on the standalone Series avoids the chained assignment
        # df[col].iat[...] = ..., which silently does nothing under pandas
        # copy-on-write.
        signed_volume.iat[1] = 0
        self.df['OBV'] = signed_volume
        return self.df

    def chaikin_mf(self, period=5):
        """Add the ``CMF_<period>`` (Chaikin-money-flow style) column.

        The money-flow multiplier uses the expanding min/max of TradedPrice
        (available once ``period`` observations exist) as the low/high.

        NOTE(review): the numerator is the sum of money-flow volume over the
        WHOLE series (a single scalar), while the denominator is a rolling
        ``period`` sum of Volume. A textbook CMF uses a rolling numerator too;
        left unchanged so existing feature values stay the same - confirm.
        """
        price = self.df['TradedPrice']
        running_low = price.expanding(period).min()
        running_high = price.expanding(period).max()
        # ((close - low) - (high - close)) / (high - low); NaN/inf where high == low
        mf_multiplier = (price - running_low - (running_high - price)) / (running_high - running_low)
        mf_volume = mf_multiplier * self.df['Volume']
        self.df['CMF_' + str(period)] = mf_volume.sum() / self.df["Volume"].rolling(period).sum()
        return self.df

In [6]:
# locations

# main directory referenced in all the code; it has to come from the environment
data_dir = os.getenv('FINANCE_DATA')
if data_dir is None:
    # fail with a clear message instead of an opaque os.path.join(None, ...) error
    raise RuntimeError("Environment variable FINANCE_DATA is not set; "
                       "point it at the main data directory before running this notebook.")
data_only_drive = '/mnt/usb-Seagate_Expansion_Desk_NA8XEHR6-0:0-part2'  # external data only drive

# central location for all the features/models/predictions
# (main path where all the sub-directories are: features, models, labels)
features_models = os.path.join(data_dir, 'features_models')
features_models_dod = os.path.join(data_only_drive, 'features_models')

labels = os.path.join(features_models, 'labels')  # label subdirectory
features = os.path.join(features_models, 'features')  # feature subdirectory

# Fitted models and results live on the data only drive. The same directory
# was historically bound to three names; all of them are kept as aliases.
model_save_loc = os.path.join(data_only_drive, 'Data', 'features_models', 'models')
model_loc = model_save_loc
model_paths = model_save_loc

metrics_loc = os.path.join(data_only_drive, 'Data', 'features_models', 'metrics')

# from the features directory select all the symbols that finish in .L (FTSE)
symbols_ftse = [s for s in os.listdir(features) if s.endswith('.L')]

# aliases of the directories above, kept because later cells use these names
main_path = features_models  # main directory
features_path = features  # all the features
labels_path = labels  # all the labels

In [119]:
# pick a symbol that works
symbol = 'ITV.L'

# label files for the symbol are named <date>.<extension>; keep the date part only
symbol_labels_path = os.path.join(labels_path, symbol, 'NON_DIRECTIONAL')
labels_dates = [file_name.split(".")[0] for file_name in os.listdir(symbol_labels_path)]

# Symbol-specific list of feature directories: each date-directory corresponds
# to an hmm model and contains all the features constructed out of sample.
symbol_features_dates_path = os.path.join(features, symbol, 'MODEL_BASED')
features_dates = os.listdir(symbol_features_dates_path)

# path where all the fitted models are, e.g.
# '/mnt/usb-Seagate_Expansion_Desk_NA8XEHR6-0:0-part2/Data/features_models/models/SPT.L/SINGLE_KERNEL'
symbol_model_path = os.path.join(model_paths, symbol, 'SINGLE_KERNEL')
model_dates = os.listdir(symbol_model_path)  # list the directory once and reuse the result
symbol_model_locations = [os.path.join(symbol_model_path, model_date) for model_date in model_dates]

# Dates common to labels, features and models. A plain set intersection yields
# an empty list when nothing overlaps; going through common_member() would
# instead produce the characters of its "no common elements" message.
common_dates = sorted(set(features_dates) & set(model_dates) & set(labels_dates))

In [120]:


# per-symbol directory holding the model-based features
syml_features_loc = os.path.join(features, symbol, 'MODEL_BASED')

# per-symbol directory holding the fitted single-kernel models
syml_models_loc = os.path.join(model_paths, symbol, 'SINGLE_KERNEL')

# per-symbol directory holding the non-directional labels
syml_labels_loc = os.path.join(labels_path, symbol, 'NON_DIRECTIONAL')


In [121]:
# lists of dates
# label files are named <date>.<extension>; list the directory once rather than once per file
labels_dates = [file_name.split(".")[0] for file_name in os.listdir(syml_labels_loc)]
features_dates = os.listdir(syml_features_loc)
model_dates = os.listdir(syml_models_loc)


In [122]:
# Dates present in features, models and labels. The set intersection gives an
# empty list when nothing overlaps, whereas common_member() returns the string
# "no common elements", which sorted(list(...)) would turn into single characters.
common_dates = sorted(set(features_dates) & set(model_dates) & set(labels_dates))


In [123]:
# symbol_compute_date: token 7 of a feature file name once it is split on "_".
# The file inspected is entry [1] of entry [1] of the features directory, in
# whatever order os.listdir returns them.
_sample_model_dir = os.path.join(syml_features_loc, os.listdir(syml_features_loc)[1])
_sample_feature_file = os.listdir(_sample_model_dir)[1]
symbol_compute_date = _sample_feature_file.split("_")[7]


In [124]:
from datetime import datetime
from datetime import timedelta

# model date - this date corresponds to the model which was used for out of sample
# (date_idx was used as an index into common_dates when inspecting one model
# directory; the visible cells below do not read it)
date_idx = 1
print(symbol)  # call syntax prints the same text on Python 2 and Python 3

# one-day time-delta, needed to step forward from the common (model) date
daysdelta = timedelta(days=1)

# result containers, filled as {model date: {out-of-sample date: score}}
accuracy_results = defaultdict(dict)
accuracy_models_results = defaultdict(dict)
recall_models_results = defaultdict(dict)
f1_models_results = defaultdict(dict)


ITV.L


In [125]:
# for common_date in common_dates:
#     #move the common date (which is the hmm date, and folder name) one day forward
#     common_day_start=datetime.strptime(common_date, '%Y%m%d') +daysdelta
#     #convert it back to a string (prob we can do in one go)
#     first_oos_day= common_day_start.strftime('%Y%m%d')
#     # getting the model
#     model_name ="_".join((symbol,common_date,'label_PrMov__window_5__thres_arbitrary__0.1_clf_fitted_.pickle'))
#     model_pickle=os.path.join(syml_models_loc,common_date, model_name)
#     pickle_to_file = pickle.load(open(model_pickle, "rb")) 


In [126]:

# now lets take all the model directories and locations##
# For every model date: load the fitted classifier, score it on the next few
# out-of-sample dates and collect accuracy / recall / F1 per (model date, OOS date).

N_FWD_DATES = 3  # number of out-of-sample dates scored per model
MIN_OOS_ROWS = 5  # a day is scored only when more than this many clean rows remain
MODEL_FILE_SUFFIX = 'label_PrMov__window_5__thres_arbitrary__0.1_clf_fitted_.pickle'
NON_FEATURE_COLUMNS = ['TradedPrice', 'Duration', 'TradedTime', 'ReturnTradedPrice', 'Volume']

# start from empty containers so that re-running this cell does not depend on
# (or mix with) results left over from an earlier run
accuracy_models_results = defaultdict(dict)
recall_models_results = defaultdict(dict)
f1_models_results = defaultdict(dict)

for common_date in common_dates:
    # move the common date (which is the hmm date, and folder name) one day forward
    common_day_start = datetime.strptime(common_date, '%Y%m%d') + daysdelta
    first_oos_day = common_day_start.strftime('%Y%m%d')

    # getting the model
    # NOTE: pickle can execute arbitrary code while loading - only load trusted files
    model_name = "_".join((symbol, common_date, MODEL_FILE_SUFFIX))
    model_pickle = os.path.join(syml_models_loc, common_date, model_name)
    with open(model_pickle, "rb") as model_handle:
        pickle_to_file = pickle.load(model_handle)  # load your model

    best_estimator = pickle_to_file['SVC']  # .best_estimator_
    # a single formatted string prints identically on Python 2 and Python 3
    print('Your symbol is: {} and the model date is: {}'.format(symbol, common_date))

    # the first few common dates strictly after first_oos_day
    fwd_dates_list = sorted(fwd_dates(common_dates, first_oos_day))[:N_FWD_DATES]

    for fwd_date in fwd_dates_list:
        # HMM-based features of the OOS day, produced by the model fitted on common_date
        feature_file = "_".join((symbol, '3_states_features_date:', fwd_date, 'now:',
                                 symbol_compute_date, '.pickle'))
        features_loc = os.path.join(syml_features_loc, common_date, feature_file)
        with open(features_loc, "rb") as features_handle:
            features_tuple = pickle.load(features_handle)
        features_df = pd.concat([features_tuple[0], features_tuple[1],
                                 features_tuple[2], features_tuple[3]], axis=1)

        # market data (with the label column) for the same day, plus market-based indicators
        market_data_oos = pd.read_csv(os.path.join(syml_labels_loc, '.'.join((fwd_date, 'csv'))),
                                      index_col=0)
        market_features = MarketFeatures(df=market_data_oos)
        market_features.obv_calc()
        market_features.chaikin_mf()
        market_features.ma_spread()
        df_w_market_features = market_features.ma_spread_duration()

        # fix the sort issue!!
        df_concat = pd.concat([features_df, df_w_market_features], axis=1).dropna()

        label_columns = df_concat.columns[df_concat.columns.str.contains(pat='label')]
        label_name = str(label_columns.values[0])
        df_final = df_concat.drop(columns=NON_FEATURE_COLUMNS + [label_name])

        if len(df_final) > MIN_OOS_ROWS:
            # NOTE(review): the scaler is re-fitted on each out-of-sample day rather
            # than reusing the scaling applied at training time - confirm intended.
            X = MinMaxScaler().fit_transform(df_final)

            y_labels = df_concat[label_columns].iloc[:, 0]
            y_predict = best_estimator.predict(X)

            fwd_key = datetime.strptime(fwd_date, '%Y%m%d').strftime('%Y%m%d')
            accuracy_models_results[common_date][fwd_key] = accuracy_score(y_labels, y_predict)
            recall_models_results[common_date][fwd_key] = recall_score(y_labels, y_predict)
            f1_models_results[common_date][fwd_key] = f1_score(y_true=y_labels, y_pred=y_predict)
        else:
            print('skipping')

# built once after the loop, so it also exists when common_dates is empty
fitted_models_results = {
    'accuracy': accuracy_models_results,
    'recall': recall_models_results,
    'F1-score': f1_models_results,
}

results_loc = os.path.join(metrics_loc, "_".join((symbol, "results_metrics.pickle")))

with open(results_loc, 'wb') as f:
    pickle.dump(fitted_models_results, f)
   


('Your symbol is:', 'ITV.L', 'and the model date is:', '20170712')
('Your symbol is:', 'ITV.L', 'and the model date is:', '20170713')
skipping
skipping
skipping
('Your symbol is:', 'ITV.L', 'and the model date is:', '20170714')
skipping
skipping
skipping
('Your symbol is:', 'ITV.L', 'and the model date is:', '20170717')
('Your symbol is:', 'ITV.L', 'and the model date is:', '20170718')
('Your symbol is:', 'ITV.L', 'and the model date is:', '20170719')
('Your symbol is:', 'ITV.L', 'and the model date is:', '20170720')
skipping
skipping
skipping
('Your symbol is:', 'ITV.L', 'and the model date is:', '20170721')
('Your symbol is:', 'ITV.L', 'and the model date is:', '20170724')
('Your symbol is:', 'ITV.L', 'and the model date is:', '20170725')
('Your symbol is:', 'ITV.L', 'and the model date is:', '20170726')
skipping
skipping
skipping
('Your symbol is:', 'ITV.L', 'and the model date is:', '20170727')
('Your symbol is:', 'ITV.L', 'and the model date is:', '20170728')
('Your symbol is:', '

('Your symbol is:', 'ITV.L', 'and the model date is:', '20180411')
skipping
skipping
skipping
('Your symbol is:', 'ITV.L', 'and the model date is:', '20180412')
skipping
skipping
skipping
('Your symbol is:', 'ITV.L', 'and the model date is:', '20180413')
('Your symbol is:', 'ITV.L', 'and the model date is:', '20180416')
('Your symbol is:', 'ITV.L', 'and the model date is:', '20180417')
skipping
skipping
('Your symbol is:', 'ITV.L', 'and the model date is:', '20180418')
('Your symbol is:', 'ITV.L', 'and the model date is:', '20180419')
('Your symbol is:', 'ITV.L', 'and the model date is:', '20180420')


In [128]:
# Display the collected results: metric name -> model date -> out-of-sample date -> score.
fitted_models_results

{'F1-score': defaultdict(dict,
             {'20170712': {'20170714': 0.1183431952662722,
               '20170717': 0.1081081081081081,
               '20170718': 0.07852193995381063},
              '20170717': {'20170719': 0.06462303231151616,
               '20170720': 0.12170087976539588,
               '20170721': 0.11692307692307694},
              '20170718': {'20170720': 0.13048780487804879,
               '20170721': 0.13009540329575023,
               '20170724': 0.20443349753694584},
              '20170719': {'20170721': 0.0, '20170724': 0.0, '20170725': 0.0},
              '20170721': {'20170724': 0.2295652173913043,
               '20170725': 0.19494584837545126,
               '20170726': 0.2911543038476795},
              '20170724': {'20170726': 0.34111675126903557,
               '20170727': 0.16562282533054976,
               '20170728': 0.15454545454545454},
              '20170725': {'20170727': 0.0, '20170728': 0.0, '20170731': 0.0},
              '20170727': {'20

In [129]:
# List the metric names under which results are stored.
fitted_models_results.keys()

['recall', 'F1-score', 'accuracy']

In [130]:
# Pull each metric's {model date: {out-of-sample date: score}} mapping out of the results.
accuracy_dict, recall_dict, f1_score_dict = (
    fitted_models_results[metric_name]
    for metric_name in ('accuracy', 'recall', 'F1-score')
)

In [131]:
# NaN-ignoring mean accuracy of each row (one row per out-of-sample date), then the mean of those.
pd.DataFrame(accuracy_dict).apply(np.nanmean, axis=1).to_frame().mean()

0    0.490305
dtype: float64

In [132]:
# NaN-ignoring mean accuracy of each column (one column per model date), then the mean of those.
pd.DataFrame(accuracy_dict).apply(np.nanmean, axis=0).to_frame().mean()

0    0.499611
dtype: float64