In [81]:
%pylab inline
%matplotlib inline
#To import all shogun classes
from shogun import *

import pandas as pd
import numpy as np
import datetime as dt
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_recall_fscore_support
sc = StandardScaler()
import os
import pickle
import fnmatch
import os
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, recall_score, precision_score
from collections import defaultdict
from collections import defaultdict


Populating the interactive namespace from numpy and matplotlib


In [3]:
##useful functions

def fwd_dates(_dates_list, _key_date):
    # returns a list of dates that are forward from the key_date
    fwd_dates_list = [i for i in _dates_list if i > _key_date]
    return fwd_dates_list

def common_member(a, b): 
      
    a_set = set(a) 
    b_set = set(b) 
      
    # check length  
    if len(a_set.intersection(b_set)) > 0: 
        return(a_set.intersection(b_set))   
    else: 
        return("no common elements") 


def remove_nans(features_tuple, labels, idx=1):
    # not the cleanest but useful
    # function to clean up nans as I seem to use it a lot, so better to have one function
    # combines the features and labels and removes rows with nans across so we dont lose the ordering
    # returns features and labels
    features_df = pd.concat([features_tuple[0], features_tuple[1], features_tuple[2], \
                             features_tuple[3]], axis=1, sort=False)
    labels_only = labels.drop(columns=['ReturnTradedPrice', 'Duration', 'states', 'TradedTime',
                                       'TradedPrice'], axis=1)
    df_concat = pd.concat([features_df, labels_only.iloc[:, 0:idx]], axis=1, sort='False')
    # only using 1st set of labels- but we can re-write this a bit
    df_x_nan = df_concat.dropna()  # dropping all nans
    label_column_loc_ = df_x_nan.shape[1] - 1  # location of labels column in the clean df
    labels_ = df_x_nan.iloc[:, label_column_loc_:label_column_loc_ + 1]  # keep pure labels
    features_ = df_x_nan.drop(df_x_nan.columns[label_column_loc_], axis=1)  # keeping the features only
    return features_, labels_


def prec_recall_report(y_true, y_predict):
    # function to ge the sci-kit learn classification metrics into a pretty DF for csv!
    report = pd.DataFrame(list(precision_recall_fscore_support(y_true, y_predict)),
                          index=['Precision', 'Recall', 'F1-score', 'Support']).T
    # Now add the 'Avg/Total' row
    report.loc['Avg/Total', :] = precision_recall_fscore_support(y_true, y_predict, average='weighted')
    report.loc['Avg/Total', 'Support'] = report['Support'].sum()
    return report


class DataLoader(object):
    def __init__(self, path_main, ticker):
        self.main_path = path_main
        self.ticker = ticker

        self.features_labels_path = os.path.join(self.main_path, 'features_models')
        self.features_path = os.path.join(self.features_labels_path, 'features')
        # collection of per symbol non directional labels
        self.labels_path = os.path.join(self.features_labels_path, 'labels', self.ticker, 'NON_DIRECTIONAL')
        self.symbol_features_path = os.path.join(self.features_labels_path, 'features', self.ticker, 'MODEL_BASED')
        # list of all the model -oos hmm feature dates - each folder is a collection of oos feature dates
        self.hmm_dates_list = os.listdir(self.symbol_features_path)  # each folder are the OOS features from each HMM
        self.compute_date = os.listdir(os.path.join( \
            self.symbol_features_path, \
            os.listdir(self.symbol_features_path)[1]))[1].split("_")[7]

    def ticker_features(self, model_date, date):
        # need to make this a lot more flexible with number of states
        if model_date < date:
            file_name = "_".join(
                (self.ticker, '3', 'states', 'features', 'date:', date, 'now:', self.compute_date, '.pickle'))
            file_loc = os.path.join(self.symbol_features_path, str(model_date), file_name)
            with open(file_loc, 'rb') as handle:
                ticker_features = pickle.load(handle)
        else:
            print('Loading Feature Date which is in-sample. Change your Model Date')
        return ticker_features

    def ticker_labels_csv(self, date):
        file_loc = os.path.join(self.labels_path, str(date) + '.csv')
        ticker_labels = pd.read_csv(file_loc, index_col=0)
        return ticker_labels

    @staticmethod
    def open_pickle_file(path, pickle_file):
        file_loc = os.path.join(path, pickle_file)
        pickle_to_file = pickle.load(open(file_loc, "rb"))
        return pickle_to_file

    @staticmethod
    def get_date_from_file(file_, numb_):
        return os.path.splitext(file_[numb_])[0]

class MarketFeatures(object):
    # a class to be expanded that uses features for base case -market based only-indicators/features
    """"Requires:
    a dataframe that has TradedPrice And Volume columns
    symbol - A stock symbol on which to form a strategy on.
    short_window - Lookback period for short moving average.
    long_window - Lookback period for long moving average.
    """

    def __init__(self, df):
        #         self.ticker = ticker
        self.df = df

    def load_data(self):
        pass

    def ma_spread(self, short_window=5, long_window=20):
        # function that produces the MA spread, which can be used on its own or as an input for MACD
        short_rolling_px = self.df['TradedPrice'].rolling(window=short_window).mean()
        long_rolling_px = self.df['TradedPrice'].rolling(window=long_window).mean()
        px_name = "_".join(('px_indx', str(short_window), str(long_window)))
        self.df[px_name] = long_rolling_px - short_rolling_px
        return self.df

    def ma_spread_duration(self, short_window=5, long_window=20):
        # function that produces the MA spread, which can be used on its own or as an input for MACD
        short_rolling_px = self.df['Duration'].rolling(window=short_window).mean()
        long_rolling_px = self.df['Duration'].rolling(window=long_window).mean()
        dur_name = "_".join(('dur_indx', str(short_window), str(long_window)))
        self.df[dur_name] = long_rolling_px - short_rolling_px
        return self.df

    def obv_calc(self):
        # on balance volume indicator
        self.df['SignedVolume'] = self.df['Volume'] * np.sign(self.df['TradedPrice'].diff()).cumsum()
        self.df['SignedVolume'].iat[1] = 0
        self.df['OBV'] = self.df['SignedVolume']  # .cumsum()
        self.df = self.df.drop(columns=['SignedVolume'])
        return self.df

    def chaikin_mf(self, period=5):
        # Chaikin money flow indicator
        self.df["MF Multiplier"] = (self.df['TradedPrice'] - (self.df['TradedPrice'].expanding(period).min()) \
                                    - (self.df['TradedPrice'].expanding(period).max() \
                                       - self.df['TradedPrice'])) / (
                                           self.df['TradedPrice'].expanding(period).max() - self.df[ \
                                       'TradedPrice'].expanding(period).min())
        self.df["MF Volume"] = self.df['MF Multiplier'] * self.df['Volume']
        self.df['CMF_' + str(period)] = self.df['MF Volume'].sum() / self.df["Volume"].rolling(period).sum()
        self.df = self.df.drop(columns=['MF Multiplier', 'MF Volume'])
        return self.df

In [58]:
def featureCreation(idxKey, locDict):
    """ gives out clean features and labels for a given locDict and a idxKey """
    keys = list(locDict.keys())
    featuresIdxDirFileLoc = locDict[keys[idxKey]][0]
    labelsIdxDirFileLoc = locDict[keys[idxKey]][1]
    ''' read the features file'''
    featuresTupleFile = pkl.load(open(featuresIdxDirFileLoc, "rb"), encoding='latin1')
    dfFeatures = pd.concat([featuresTupleFile[0], featuresTupleFile[1], \
                            featuresTupleFile[2], featuresTupleFile[3]], axis=1, sort=False).fillna(0)
    ''' read the labels file'''
    labelsDf = pd.read_csv(labelsIdxDirFileLoc)
    ''' pop the labels out'''
    labels = labelsDf['label_PrMov__window_5__thres_arbitrary__0.1']
    '''dataframe of Features and Labels - X and Y'''
    dfXY = pd.concat([dfFeatures, labels], axis=1, sort='False').dropna()
    labelName = str(dfXY.columns[dfXY.columns.str.contains(pat='label')].values[0])
    ''' drop the labels from the features'''
    dfX = dfXY.drop(columns=[labelName])
    arrX = np.array(dfX)
    ''' feature normalisation'''
    # feature scaling in [0,1] - X = rescale_01(arrX)
    X = normalization(rescale_01(arrX))
    y = dfXY[dfXY.columns[dfXY.columns.str.contains(pat='label')]].iloc[:, 0]
    ''' returns features, labels'''
    return X, y

In [123]:
passport_fin_data_real_data = '/media/ak/My Passport/Data/FinDataReal'
passport_data_drive = '/media/ak/My Passport/Data/'
features_models = '/media/ak/WorkDrive/Data/features_models/'
experiment_data = '/media/ak/My Passport/Experiment Data/'
alternate_label_results = '/media/ak/My Passport/Experiment Data/Alt_Label_Results'

In [49]:
#The Symbols we can look at

In [47]:
symbols_ftse = [s for s in os.listdir(os.path.join(experiment_data,os.listdir(experiment_data)[-1])) if s.endswith('.L')]

## Labels and Symbols 

In [51]:
# Label Paths #
labelPaths = ['LabelsAlternateOne','LabelsAlternateTwo','LabelsAlternateThree','LabelsAlternateFour','LabelsAlternateFive']

passportData = '/media/ak/My Passport/Data/FinDataReal/'

### set symbol 

In [89]:
symbolIdx =1
symbolChoice = symbols_ftse[symbolIdx]

#### check that the alternate labels exist - if not move to the next symbol

In [102]:
def label_path_creation(symbol,passport_drive):
    labelPaths = ['LabelsAlternateOne','LabelsAlternateTwo','LabelsAlternateThree','LabelsAlternateFour','LabelsAlternateFive']
    # [x for b in a for x in b] # Works fine
    labelSymbolPaths = [os.path.join(passport_drive, label, symbol) for label in labelPaths for symbol in symbols]
    return labelSymbolPaths
def check_label_path_exists(symbol, passport_drive, label):
    label_path_to_check =  os.path.join(os.path.join(passport_drive, label, symbol))
    
    if os.path.exists(label_path_to_check) and len(os.listdir(label_path_to_check)) ==0:
        print('Directory exists but is empty') 
    elif os.path.exists(label_path_to_check) and len(os.listdir(label_path_to_check)) >0:
        print('Directory exists and is not empty!')
    return print('good to go')

### check that the path exists and is not empty

In [103]:

for label in labelPaths:
    print(check_label_path_exists(symbolChoice, passport_drive, label))

Directory exists and is not empty!
good to go
None
Directory exists and is not empty!
good to go
None
Directory exists and is not empty!
good to go
None
Directory exists and is not empty!
good to go
None
Directory exists and is not empty!
good to go
None


In [107]:
labelPaths = ['LabelsAlternateOne','LabelsAlternateTwo','LabelsAlternateThree','LabelsAlternateFour','LabelsAlternateFive']
symbols = os.listdir(passport_drive+'/Labels')
# [x for b in a for x in b] # Works fine
labelSymbolPaths = [os.path.join(passport_drive, label, symbol) for label in labelPaths for symbol in symbols]
# double list comprehenstion that creates a directory in the Passport file for each label/symbol
# labelSymbolPaths
def specific_Symbol_Labe_lPaths(labelSymbolPaths, symbolChoice):
    return [s for s in labelSymbolPaths if symbolChoice in s]

In [112]:
# count how many and the dates in each Path
alternateDateDates = dict()
for label in labelSymbolPaths:
    alternateDateDates[label] =   os.listdir(specific_Symbol_Labe_lPaths(labelSymbolPaths, symbolChoice)[0])

### Features work

In [168]:
idx =1
#file that has all the features from experiments
experiments_features_path  = os.path.join(experiment_data,os.listdir(experiment_data)[-1])
# for symbol with idx above, create the specific features path
## where all teh features are saved: experiments_features_path 

### which symbol are you using?

In [162]:
symbols_ftse[idx]

'AAL.L'

### this is the old models dates

In [None]:
symbol_feature_path = os.path.join(experiments_features_path,symbols_ftse[idx],'MODEL_BASED')
# the list dir above has all the models that were used to create features
symbol_feature_dates = os.listdir(symbol_feature_path)
### <--- this is the old hmm models dates

## and these are the label dates

In [186]:
altnerateLabelKeys  = list(alternateDateDates.keys())
label_dates = alternateDateDates[altnerateLabelKeys[0]]
# 0 here needs to be indexed by the key we want, and the key corresponds to a label 

['/media/ak/My Passport/Data/FinDataReal/LabelsAlternateOne/AAL.L',
 '/media/ak/My Passport/Data/FinDataReal/LabelsAlternateTwo/AAL.L',
 '/media/ak/My Passport/Data/FinDataReal/LabelsAlternateThree/AAL.L',
 '/media/ak/My Passport/Data/FinDataReal/LabelsAlternateFour/AAL.L',
 '/media/ak/My Passport/Data/FinDataReal/LabelsAlternateFive/AAL.L']

### features extraction

In [128]:
features_path = os.path.join(features_models, 'features')
symbol_features_path = os.path.join(features_path, symbolChoice, 'MODEL_BASED')

In [157]:
symbol_feature_idex =1  ## this is an HMM model date -or HMMModelDate in the other code base
## next create a path for the HMM model date
symbol_feature_files_paths = os.path.join(symbol_feature_path, symbol_feature_dates[symbol_feature_idex])
## get all the model date feature files
symbol_features_files_idex = os.listdir(symbol_feature_files_paths)

### extracting feature dates

In [160]:
symbolEachModelFeaturesDates = [file.split("_")[5] for file in symbol_features_files_idex] 
#this needs to be indexed by symbol_feature_idex or hmmDateIdx date in the loop

In [196]:
symbolAlternateLabelLocs = [x for x in altnerateLabelKeys if str(symbols_ftse[idx]) in x]


['/media/ak/My Passport/Data/FinDataReal/LabelsAlternateOne/AAL.L/20170127.csv',
 '/media/ak/My Passport/Data/FinDataReal/LabelsAlternateTwo/AAL.L/20170127.csv',
 '/media/ak/My Passport/Data/FinDataReal/LabelsAlternateThree/AAL.L/20170127.csv',
 '/media/ak/My Passport/Data/FinDataReal/LabelsAlternateFour/AAL.L/20170127.csv',
 '/media/ak/My Passport/Data/FinDataReal/LabelsAlternateFive/AAL.L/20170127.csv']

### get common dates

In [165]:
commonDates = list(set(symbolEachModelFeaturesDates) & set(label_dates))
##common dates based on the attributes

In [201]:


for commonDate in commonDates:
    print(commonDate)
    FeatureFileLoc = os.path.join(symbol_feature_files_paths,"".join((symbols_ftse[idx],'_3_states_features_',
 'date:_',commonDate, '_now:_',str('20181227'),'_.pickle')))
    alternateLabels  = [os.path.join(x, commonDate+'.csv') for x in symbolAlternateLabelLocs]
   

20170829
20170816
20170809
20170814
20170810
20170824
20170126
20170125
20170803
20170830
20170117
20170801
20170807
20170120
20170815
20170118
20170823
20170808
20170822
20170123
20170131
20170811
20170124
20170825
20170821
20170818
20170804
20170817
20170831
20170802
20170119
20170130
20170127


In [208]:
for i,_ in enumerate(alternateLabels):
    print(os.path.exists(alternateLabels[i]))

False
False
False
False
False


In [None]:
 #conditions= [os.path.exists(FeatureFileLoc), [os.path.exists(alternateLabels[i]) for i in [0,1,2,3,4]]

In [82]:
all_symbols_d = defaultdict(dict)
symbol_model_dates = dict()


In [131]:
symbol_model_dates = os.listdir(symbol_feature_path)

In [135]:
#os.listdir(os.path.join(symbol_feature_path,symbol_model_dates[1]))

In [141]:
testLoca = os.path.join('/media/ak/My Passport/Experiment Data/MKLExpPath', symbolChoice)
common_loc = os.path.join(testLoca,'CommonLocationsDicts.pkl' )
loc_dists_loc = os.path.join(testLoca,'LocDictsListCorrect.pkl' )

In [146]:
with open(loc_dists_loc, 'rb') as f:
    test=pickle.load(f)