In [1]:

import pandas as pd
import numpy as np
import datetime as dt
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_recall_fscore_support
sc = StandardScaler()
import os
import pickle
import fnmatch
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, recall_score, precision_score

In [2]:
##useful functions

def fwd_dates(_dates_list, _key_date):
    # returns a list of dates that are forward from the key_date
    fwd_dates_list = [i for i in _dates_list if i > _key_date]
    return fwd_dates_list

def common_member(a, b): 
      
    a_set = set(a) 
    b_set = set(b) 
      
    # check length  
    if len(a_set.intersection(b_set)) > 0: 
        return(a_set.intersection(b_set))   
    else: 
        return("no common elements") 


def remove_nans(features_tuple, labels, idx=1):
    # not the cleanest but useful
    # function to clean up nans as I seem to use it a lot, so better to have one function
    # combines the features and labels and removes rows with nans across so we dont lose the ordering
    # returns features and labels
    features_df = pd.concat([features_tuple[0], features_tuple[1], features_tuple[2], \
                             features_tuple[3]], axis=1, sort=False)
    labels_only = labels.drop(columns=['ReturnTradedPrice', 'Duration', 'states', 'TradedTime',
                                       'TradedPrice'], axis=1)
    df_concat = pd.concat([features_df, labels_only.iloc[:, 0:idx]], axis=1, sort='False')
    # only using 1st set of labels- but we can re-write this a bit
    df_x_nan = df_concat.dropna()  # dropping all nans
    label_column_loc_ = df_x_nan.shape[1] - 1  # location of labels column in the clean df
    labels_ = df_x_nan.iloc[:, label_column_loc_:label_column_loc_ + 1]  # keep pure labels
    features_ = df_x_nan.drop(df_x_nan.columns[label_column_loc_], axis=1)  # keeping the features only
    return features_, labels_


def prec_recall_report(y_true, y_predict):
    # function to ge the sci-kit learn classification metrics into a pretty DF for csv!
    report = pd.DataFrame(list(precision_recall_fscore_support(y_true, y_predict)),
                          index=['Precision', 'Recall', 'F1-score', 'Support']).T
    # Now add the 'Avg/Total' row
    report.loc['Avg/Total', :] = precision_recall_fscore_support(y_true, y_predict, average='weighted')
    report.loc['Avg/Total', 'Support'] = report['Support'].sum()
    return report


class DataLoader(object):
    def __init__(self, path_main, ticker):
        self.main_path = path_main
        self.ticker = ticker

        self.features_labels_path = os.path.join(self.main_path, 'features_models')
        self.features_path = os.path.join(self.features_labels_path, 'features')
        # collection of per symbol non directional labels
        self.labels_path = os.path.join(self.features_labels_path, 'labels', self.ticker, 'NON_DIRECTIONAL')
        self.symbol_features_path = os.path.join(self.features_labels_path, 'features', self.ticker, 'MODEL_BASED')
        # list of all the model -oos hmm feature dates - each folder is a collection of oos feature dates
        self.hmm_dates_list = os.listdir(self.symbol_features_path)  # each folder are the OOS features from each HMM
        self.compute_date = os.listdir(os.path.join( \
            self.symbol_features_path, \
            os.listdir(self.symbol_features_path)[1]))[1].split("_")[7]

    def ticker_features(self, model_date, date):
        # need to make this a lot more flexible with number of states
        if model_date < date:
            file_name = "_".join(
                (self.ticker, '3', 'states', 'features', 'date:', date, 'now:', self.compute_date, '.pickle'))
            file_loc = os.path.join(self.symbol_features_path, str(model_date), file_name)
            with open(file_loc, 'rb') as handle:
                ticker_features = pickle.load(handle)
        else:
            print('Loading Feature Date which is in-sample. Change your Model Date')
        return ticker_features

    def ticker_labels_csv(self, date):
        file_loc = os.path.join(self.labels_path, str(date) + '.csv')
        ticker_labels = pd.read_csv(file_loc, index_col=0)
        return ticker_labels

    @staticmethod
    def open_pickle_file(path, pickle_file):
        file_loc = os.path.join(path, pickle_file)
        pickle_to_file = pickle.load(open(file_loc, "rb"))
        return pickle_to_file

    @staticmethod
    def get_date_from_file(file_, numb_):
        return os.path.splitext(file_[numb_])[0]

class MarketFeatures(object):
    # a class to be expanded that uses features for base case -market based only-indicators/features
    """"Requires:
    a dataframe that has TradedPrice And Volume columns
    symbol - A stock symbol on which to form a strategy on.
    short_window - Lookback period for short moving average.
    long_window - Lookback period for long moving average.
    """

    def __init__(self, df):
        #         self.ticker = ticker
        self.df = df

    def load_data(self):
        pass

    def ma_spread(self, short_window=5, long_window=20):
        # function that produces the MA spread, which can be used on its own or as an input for MACD
        short_rolling_px = self.df['TradedPrice'].rolling(window=short_window).mean()
        long_rolling_px = self.df['TradedPrice'].rolling(window=long_window).mean()
        px_name = "_".join(('px_indx', str(short_window), str(long_window)))
        self.df[px_name] = long_rolling_px - short_rolling_px
        return self.df

    def ma_spread_duration(self, short_window=5, long_window=20):
        # function that produces the MA spread, which can be used on its own or as an input for MACD
        short_rolling_px = self.df['Duration'].rolling(window=short_window).mean()
        long_rolling_px = self.df['Duration'].rolling(window=long_window).mean()
        dur_name = "_".join(('dur_indx', str(short_window), str(long_window)))
        self.df[dur_name] = long_rolling_px - short_rolling_px
        return self.df

    def obv_calc(self):
        # on balance volume indicator
        self.df['SignedVolume'] = self.df['Volume'] * np.sign(self.df['TradedPrice'].diff()).cumsum()
        self.df['SignedVolume'].iat[1] = 0
        self.df['OBV'] = self.df['SignedVolume']  # .cumsum()
        self.df = self.df.drop(columns=['SignedVolume'])
        return self.df

    def chaikin_mf(self, period=5):
        # Chaikin money flow indicator
        self.df["MF Multiplier"] = (self.df['TradedPrice'] - (self.df['TradedPrice'].expanding(period).min()) \
                                    - (self.df['TradedPrice'].expanding(period).max() \
                                       - self.df['TradedPrice'])) / (
                                           self.df['TradedPrice'].expanding(period).max() - self.df[ \
                                       'TradedPrice'].expanding(period).min())
        self.df["MF Volume"] = self.df['MF Multiplier'] * self.df['Volume']
        self.df['CMF_' + str(period)] = self.df['MF Volume'].sum() / self.df["Volume"].rolling(period).sum()
        self.df = self.df.drop(columns=['MF Multiplier', 'MF Volume'])
        return self.df

In [3]:

# main locations

data_dir = os.getenv('FINANCE_DATA')  # main directory referenced in all the code
# data_oos_loc= ('/media/ak/My Passport/Experiment Data')
# passport = ('/media/ak/My Passport')
# experimental_data = os.path.join(passport, 'Experiment Data')
features_loc = os.path.join(data_oos_loc,'features') #
labels_loc = os.path.join(data_oos_loc,'labels')
models_loc = os.path.join(data_oos_loc,'models')
features= os.listdir(features_loc)
# os.listdir(os.path.join(data_dir,'features_models','models'))


NameError: name 'data_oos_loc' is not defined

In [None]:
# pick a symbol that works
symbol ='MKS.L'

syml_features_loc = os.path.join(features_loc,symbol,'MODEL_BASED') #create the symbol feature locations
syml_models_loc = os.path.join(models_loc,symbol,'SINGLE_KERNEL') #create the symbol model locations
syml_labels_loc = os.path.join(labels_loc,symbol,'NON_DIRECTIONAL')# create the symbol labels location


In [64]:
# lists of dates
model_dates = os.listdir(syml_models_loc)
labels_dates = [os.listdir(syml_labels_loc)[idx].split(".")[0] for idx, _ in enumerate(os.listdir(syml_labels_loc))]
features_dates_dir = os.listdir(syml_features_loc)


In [68]:
common_date = list(common_member(labels_dates, oos_features_dates))
common_date_features_loc = os.path.join(syml_features_loc, common_date[2])
common_date_features = os.listdir(common_date_features_loc)
common_dates = [os.listdir(common_date_features)[idx] for ]

In [83]:
common_date_features

['MKS.L_3_states_features_date:_20180413_now:_20181225_.pickle',
 'MKS.L_3_states_features_date:_20180416_now:_20181225_.pickle',
 'MKS.L_3_states_features_date:_20180417_now:_20181225_.pickle',
 'MKS.L_3_states_features_date:_20180418_now:_20181225_.pickle',
 'MKS.L_3_states_features_date:_20180419_now:_20181225_.pickle',
 'MKS.L_3_states_features_date:_20180420_now:_20181225_.pickle']

In [82]:
#model date- this date corresponds to the model which was used for out of sample
date_idx = 1 
#contains the list of detailed features for the specific model date
features_dates_detail = os.path.join(syml_features_loc, common_date_features[date_idx]) 
oos_features_dates =[os.listdir(features_dates_detail)[idx].split("_")[5] for idx,_ in enumerate(os.listdir(features_dates_detail))]

OSError: [Errno 2] No such file or directory: '/media/ak/My Passport/Experiment Data/features/MKS.L/MODEL_BASED/MKS.L_3_states_features_date:_20180416_now:_20181225_.pickle'

['20170808',
 '20180202',
 '20170116',
 '20170117',
 '20170118',
 '20170119',
 '20170120',
 '20170123',
 '20170124',
 '20170125',
 '20170126',
 '20170127',
 '20170130',
 '20170131',
 '20170703',
 '20170704',
 '20170705',
 '20170706',
 '20170707',
 '20170710',
 '20170711',
 '20170712',
 '20170713',
 '20170714',
 '20170717',
 '20170718',
 '20170719',
 '20170720',
 '20170721',
 '20170724',
 '20170725',
 '20170726',
 '20170727',
 '20170728',
 '20170731',
 '20170801',
 '20170802',
 '20170803',
 '20170804',
 '20170807',
 '20170809',
 '20170810',
 '20170811',
 '20170814',
 '20170815',
 '20170816',
 '20170817',
 '20170818',
 '20170821',
 '20170822',
 '20170823',
 '20170824',
 '20170825',
 '20170829',
 '20170830',
 '20170831',
 '20170901',
 '20170904',
 '20170905',
 '20170906',
 '20170907',
 '20170908',
 '20170911',
 '20170912',
 '20170913',
 '20170914',
 '20170915',
 '20170918',
 '20170919',
 '20170920',
 '20170921',
 '20170922',
 '20170925',
 '20170926',
 '20170927',
 '20170928',
 '20170929',

In [37]:

def model_pickle_to_svc(model_pickle):

    pickle_to_file = pickle.load(open(model_pickle, "rb"))
    best_estimator = pickle_to_file['SVC'].best_estimator_

    return best_estimator


# test case ##



In [28]:
if __name__ == '__main__':


    
    # test symbol path, which essentially produces the path where all the fitted models are.
    # '/mnt/usb-Seagate_Expansion_Desk_NA8XEHR6-0:0-part2/Data/features_models/models/SPT.L/SINGLE_KERNEL'

    symbol_labels_path = syml_labels_loc
    
    # specific symbol features list of directories. so this has all the model-based directories of features
    # each date on this list corresponds to an hmm model, and each date-directory contains all the features 
    # constructed out of sample
    
    symbol_features_dates_path = syml_features_loc
    
    # we construct a list of all the hmm-model-date directories, each containing OOS features
    list_features_dates_dirs = os.listdir(symbol_features_dates_path)
    
    # go into each hmm-model-generated directory and get us the location of that directory
    def oos_date_specific_features_dir(date_dir_no): 
        return os.path.join(symbol_features_dates_path,list_features_dates_dirs[date_dir_no]) #basically 
    
    # this is an example
    oos_feature_file_locations_dict ={}
    oos_labels_file_locations_dict ={}
    for date_idx, date in enumerate(list_features_dates_dirs):

        # create a directory of all the feature locations        
        oos_feature_file_locations_dict[date] = [os.listdir(oos_date_specific_features_dir(date_idx))[loc_idx]
                                              for loc_idx,_ in 
                                               enumerate(os.listdir(oos_date_specific_features_dir(date_idx)))]
    
        # create a directory of all the label locations
        oos_labels_file_locations_dict[date]=[".".join((oos_feature_file_locations_dict[date][idx].split("_")[5],"csv"))
                                             for idx,_ in enumerate(oos_feature_file_locations_dict[date])]
        
       
        

    
    
    

In [59]:
#use the key of keys to load up the 
 # go through all the keys and basically get the out of sample features and labels
keys= oos_feature_file_locations_dict.keys()
#so you need to iterate over idxs's and over keys'
# for key_idx,_ in enumerate(keys):
# print 
key_idx=1 #one loop

# location= os.path.join(oos_date_specific_features_dir(date_idx),oos_feature_file_locations_dict[keys[key_idx]][1])
# test_features=pickle.load(open(location, "rb")) #second loop
# test_labels = pd.read_csv(oos_labels_file_locations_dict[keys[key_idx]][1])

In [None]:
datacls = DataLoader(path_main=data_dir, ticker=symbol)  # test symbol -create class
symbol_compute_date = datacls.compute_date
from collections import defaultdict
all_symbols_d = defaultdict(dict)
symbol_model_dates = dict()

In [None]:
print symbol
test_path = symbol_fitted_models_path(symbol=symbol)

# now lets take all the model directories and locations##
model_directories = [symbol_fitted_model_date_loc(test_path, idx) for idx, date in enumerate(os.listdir(test_path))]
models_locations = [os.path.join(model_directories[dir_idx], os.listdir(model_directories[dir_idx])[0]) for
                    dir_idx, model_dir in enumerate(model_directories)]
for model_idx, model_loc in enumerate(models_locations):
    model_date = model_loc.split("_")[6]
    print ('model date is:',model_date)
    model_pickle = model_loc

    print model_pickle
    pickle_to_file = pickle.load(open(model_pickle, "rb")) #load your model

    best_estimator = pickle_to_file['SVC'].best_estimator_

    print('Your symbol is:', symbol, 'and the model date is:' ,model_date)
    fwd_dates_list = [i for i in keys if i > model_date]
    # set up the dictionary for metrics #
    M = len(fwd_dates_list)
    T = 1
    T_2 = 4
    fitted_models_results = {
            'accuracy': np.empty((M,T)),
            'recall': np.empty((M,T)),
            'F1-score': np.empty((M,T)),
            'precision_recall_fscore_support': np.empty((M, T_2))
        }
    for key in fwd_dates_list:
        listA=oos_feature_file_locations_dict[key] # list of out of sample features
        listB=oos_labels_file_locations_dict[key] # list of out of sample labels
        for a, b in zip(listA, listB):
            if fwd_date > model_date:
                fwd_date=b.split(".")[0]
                features_date= a.split("_")[5]
                print('model date', model_date)
                print('features_date', features_date)
            else:
                print('tiny problem, model date is not behind!')
#             exists=os.path.isfile(os.path.join(symbol_features_dates_path,key,a))            
#             if exists:
#                 print('###-computing-###')
#                 features_tuple=pickle.load(open(os.path.join(symbol_features_dates_path,key,a), "rb"))
#                 market_data_oos= pd.read_csv(os.path.join(symbol_labels_path, b),index_col=0)
# #                 features_df = pd.concat([features_tuple[0], features_tuple[1],
# #                                      features_tuple[2], features_tuple[3]], axis=1, sort=False)
# #                 df_w_market_features = MarketFeatures(df=MarketFeatures(\
# #                     df=MarketFeatures(
#                         df=MarketFeatures(df=market_data_oos).obv_calc()).chaikin_mf()).ma_spread()).ma_spread_duration()

#                 df_concat = pd.concat([features_df, df_w_market_features], axis=1, sort='False').dropna()

#                 label_name = str(df_concat.columns[df_concat.columns.str.contains(pat='label')].values[0])

#                 df_final = df_concat.drop(columns=['TradedPrice', 'Duration', 'TradedTime', 'ReturnTradedPrice', \
#                                                    'Volume', label_name])
#                 if len(df_final)> 5:
#                     X = MinMaxScaler().fit_transform(df_final)

#                     y_labels = df_concat[df_concat.columns[df_concat.columns.str.contains(pat='label')]].iloc[:, 0]
#                     y_predict = best_estimator.predict(X)
#                     print accuracy_score(y_labels, y_predict)
#                 else:
#                     print('skipping...')

#     #                 results_loc = str(os.path.join(metrics_loc, "_".join((symbol,model_date,"results_metrics.pickle"))))
#     #                 fitted_models_results['accuracy'][fwd_date, :] = accuracy_score(y_labels, y_predict)
#     #                 fitted_models_results['recall'][fwd_date, :] = recall_score(y_true=y_labels, y_pred=y_predict)
#     #                 fitted_models_results['F1-score'][fwd_date, :] =f1_score(y_true= y_labels, y_pred=y_predict)
#     #                 fitted_models_results['precision_recall_fscore_support'][fwd_idx, :] = precision_recall_fscore_support(y_true=  y_labels, y_pred=y_predict, average='micro')

#     #                 with open(results_loc, 'wb') as f:
#     #                     pickle.dump(fitted_models_results, f)
#             else:
#                 print ('problem')

#             #             features_tuple = pickle.load(open(os.path.join(symbol_features_dates_path,a), "rb"))

In [None]:
os.path.join(symbol_features_dates_path,key,a)

In [None]:
fwd_date

In [None]:
    import os
    exists = os.path.isfile('/path/to/file')
    if exists:
        # Store configuration file values
    else:
        # Keep presets

In [None]:
fwd_dates_list

In [None]:
fwd_dates_list