In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_recall_fscore_support
sc = StandardScaler()
import os
import pickle
import fnmatch

In [9]:
##useful functions
def fwd_dates(_dates_list, _key_date):
    """Return the entries of ``_dates_list`` that are strictly after ``_key_date``.

    Elements are compared with ``>`` exactly as given and the input order is kept.
    """
    later_dates = []
    for candidate in _dates_list:
        if candidate > _key_date:
            later_dates.append(candidate)
    return later_dates
def remove_nans(features_tuple, labels, idx=1):
    """Align features with labels and drop every row that contains a NaN.

    The first four elements of ``features_tuple`` are concatenated column-wise,
    the first ``idx`` label columns are appended, and rows with any NaN are
    removed from the combined frame so features and labels keep the same rows
    in the same order.

    Parameters
    ----------
    features_tuple : sequence
        At least four pandas objects; elements 0-3 are used as feature blocks.
    labels : pandas.DataFrame
        Must contain the columns 'ReturnTradedPrice', 'Duration', 'states',
        'TradedTime' and 'TradedPrice' (they are dropped); the remaining
        columns are treated as label columns.
    idx : int, default 1
        Number of leading label columns to keep.

    Returns
    -------
    (features_, labels_) : tuple of pandas.DataFrame
        NaN-free feature columns and the ``idx`` NaN-free label columns.
    """
    features_df = pd.concat([features_tuple[0], features_tuple[1], features_tuple[2],
                             features_tuple[3]], axis=1, sort=False)
    labels_only = labels.drop(columns=['ReturnTradedPrice', 'Duration', 'states', 'TradedTime',
                                       'TradedPrice'])
    selected_labels = labels_only.iloc[:, 0:idx]
    # `sort` must be a real boolean: the string 'False' is truthy and recent
    # pandas versions reject non-boolean values outright.
    df_concat = pd.concat([features_df, selected_labels], axis=1, sort=False)
    df_x_nan = df_concat.dropna()  # dropping all rows with a NaN anywhere
    # Split positionally: feature columns come first, label columns after them.
    # This returns every selected label column (not just the last one) and
    # cannot drop a feature column that happens to share a label's name.
    n_feature_cols = features_df.shape[1]
    features_ = df_x_nan.iloc[:, :n_feature_cols]
    labels_ = df_x_nan.iloc[:, n_feature_cols:]
    return features_, labels_

In [12]:

class DataLoader(object):
    """Locate and load per-ticker feature pickles and label CSVs.

    Directory layout assumed by the path construction below::

        <path_main>/features_models/features/<ticker>/MODEL_BASED/<model_date>/<feature files>
        <path_main>/features_models/labels/<ticker>/NON_DIRECTIONAL/<date>.csv

    Parameters
    ----------
    path_main : str
        Root data directory that contains ``features_models``.
    ticker : str
        Symbol whose features and labels are loaded.

    Constructing an instance lists the ticker's MODEL_BASED directory, so the
    directory must exist.
    """

    def __init__(self, path_main, ticker):
        self.main_path = path_main
        self.ticker = ticker

        self.features_labels_path = os.path.join(self.main_path, 'features_models')
        self.features_path = os.path.join(self.features_labels_path, 'features')
        # collection of per symbol non directional labels
        self.labels_path = os.path.join(self.features_labels_path, 'labels', self.ticker, 'NON_DIRECTIONAL')
        self.symbol_features_path = os.path.join(self.features_labels_path, 'features', self.ticker, 'MODEL_BASED')
        # each folder holds the out-of-sample feature files produced by one HMM
        self.hmm_dates_list = os.listdir(self.symbol_features_path)
        # The compute date is the 8th '_'-separated token of a feature file name.
        # It is read from the second file of the second model folder.
        # NOTE(review): needs at least two folders with two files each, and
        # os.listdir order is arbitrary -- confirm every file shares one compute date.
        reference_folder = os.path.join(self.symbol_features_path, self.hmm_dates_list[1])
        self.compute_date = os.listdir(reference_folder)[1].split("_")[7]

    def ticker_features(self, model_date, date):
        """Load the pickled features for ``date`` produced by the model fitted on ``model_date``.

        Raises
        ------
        ValueError
            If ``model_date`` is not strictly earlier than ``date`` (the request
            would be in-sample). Previously this path printed a message and then
            failed with an unbound local variable.
        """
        # need to make this a lot more flexible with number of states
        if not model_date < date:
            raise ValueError('Loading Feature Date which is in-sample. Change your Model Date')
        file_name = "_".join((self.ticker, '3', 'states', 'features', 'date:', date, 'now:', self.compute_date, '.pickle'))
        file_loc = os.path.join(self.symbol_features_path, str(model_date), file_name)
        # NOTE: pickle.load executes arbitrary code; only use on trusted files.
        with open(file_loc, 'rb') as handle:
            return pickle.load(handle)

    def ticker_labels_csv(self, date):
        """Read ``<labels_path>/<date>.csv`` into a DataFrame, using the first column as index."""
        file_loc = os.path.join(self.labels_path, str(date) + '.csv')
        ticker_labels = pd.read_csv(file_loc, index_col=0)
        return ticker_labels

    @staticmethod
    def open_pickle_file(path, pickle_file):
        """Unpickle and return the object stored at ``path/pickle_file``."""
        file_loc = os.path.join(path, pickle_file)
        # context manager so the handle is closed even if unpickling fails
        with open(file_loc, "rb") as handle:
            return pickle.load(handle)

    @staticmethod
    def get_date_from_file(file_, numb_):
        """Return element ``numb_`` of the sequence ``file_`` with its extension removed."""
        return os.path.splitext(file_[numb_])[0]

In [10]:
def common_member(a, b):
    """Return the set of elements present in both ``a`` and ``b``.

    An empty set is returned when nothing is shared. The previous sentinel
    string "no common elements" was silently split into characters by callers
    that wrap the result in ``list()`` / ``sorted()``.
    """
    return set(a).intersection(b)

In [11]:


def prec_recall_report(y_true, y_predict):
    """Arrange scikit-learn's precision/recall/F1/support output as a DataFrame.

    Columns are 'Precision', 'Recall', 'F1-score' and 'Support'. The leading
    rows hold the per-class values returned by
    ``precision_recall_fscore_support``; a final 'Avg/Total' row holds the
    weighted averages, with its 'Support' cell set to the column sum.
    """
    metric_names = ['Precision', 'Recall', 'F1-score', 'Support']
    per_class_scores = precision_recall_fscore_support(y_true, y_predict)
    report = pd.DataFrame(list(per_class_scores), index=metric_names).T

    # Append the weighted-average row, then overwrite its support with the total.
    weighted_scores = precision_recall_fscore_support(y_true, y_predict, average='weighted')
    report.loc['Avg/Total', :] = weighted_scores
    report.loc['Avg/Total', 'Support'] = report['Support'].sum()
    return report

class MarketFeatures(object):
    """Market-based indicators (base-case features) computed from a trades frame.

    Requires a dataframe with ``TradedPrice`` and ``Volume`` columns
    (``Duration`` as well for :meth:`ma_spread_duration`).

    Every indicator method writes its result into ``self.df`` as a new column
    and returns ``self.df``; the frame passed to the constructor is therefore
    modified in place.
    """

    def __init__(self, df):
        self.df = df

    def load_data(self):
        """Placeholder; currently does nothing."""
        pass

    def _rolling_mean_spread(self, column, prefix, short_window, long_window):
        # Adds '<prefix>_<short>_<long>' = long rolling mean - short rolling mean of `column`.
        short_rolling = self.df[column].rolling(window=short_window).mean()
        long_rolling = self.df[column].rolling(window=long_window).mean()
        spread_name = "_".join((prefix, str(short_window), str(long_window)))
        self.df[spread_name] = long_rolling - short_rolling
        return self.df

    def ma_spread(self, short_window=5, long_window=20):
        """Add the moving-average spread of ``TradedPrice``.

        short_window - lookback period for the short moving average.
        long_window - lookback period for the long moving average.

        The new column ``px_indx_<short>_<long>`` is long MA minus short MA; it
        can be used on its own or as an input for MACD.
        """
        return self._rolling_mean_spread('TradedPrice', 'px_indx', short_window, long_window)

    def ma_spread_duration(self, short_window=5, long_window=20):
        """Add the moving-average spread of ``Duration`` as ``dur_indx_<short>_<long>``.

        Same construction as :meth:`ma_spread` (long MA minus short MA).
        """
        return self._rolling_mean_spread('Duration', 'dur_indx', short_window, long_window)

    def obv_calc(self):
        """Add the on-balance-volume indicator as column ``OBV``.

        OBV is the running sum of volume signed by the direction of the price
        change. The first row has no previous price, so it contributes 0.

        Fixes two defects of the earlier version: the cumulative sum was
        applied to the price-change sign instead of the signed volume, and the
        zeroing targeted the second row (through a chained assignment) rather
        than the first.
        """
        direction = np.sign(self.df['TradedPrice'].diff()).fillna(0)
        signed_volume = self.df['Volume'] * direction
        self.df['OBV'] = signed_volume.cumsum()
        return self.df

    def chaikin_mf(self, period=5):
        """Add the Chaikin money flow indicator as column ``CMF_<period>``.

        CMF is the ``period``-row sum of money-flow volume divided by the
        ``period``-row sum of volume. The earlier version divided the sum of
        the *entire* money-flow column (which includes future rows) by the
        rolling volume.
        """
        price = self.df['TradedPrice']
        volume = self.df['Volume']
        # High/low proxies: expanding extremes, available once `period` rows exist.
        # NOTE(review): an expanding (not rolling) window is kept from the
        # original code -- confirm that this is the intended high/low range.
        running_low = price.expanding(period).min()
        running_high = price.expanding(period).max()
        mf_multiplier = ((price - running_low) - (running_high - price)) / (running_high - running_low)
        mf_volume = mf_multiplier * volume
        self.df['CMF_' + str(period)] = mf_volume.rolling(period).sum() / volume.rolling(period).sum()
        return self.df

In [24]:
# Main data directory referenced in all the code; read from the FINANCE_DATA
# environment variable (None when the variable is not set).
data_dir = os.getenv('FINANCE_DATA')
# External data-only drive.
data_only_drive = '/mnt/usb-Seagate_Expansion_Desk_NA8XEHR6-0:0-part2'

# Location to save results.
model_save_loc = os.path.join(data_only_drive, 'Data', 'features_models', 'models')

In [14]:
def symbol_fitted_models_path(symbol):
    """Return the directory holding all fitted single-kernel models of ``symbol``.

    The path is built under the module-level ``model_paths`` root (on the
    data-only drive).
    """
    single_kernel_dir = os.path.join(model_paths, symbol, 'SINGLE_KERNEL')
    return single_kernel_dir
def symbol_list_fitted_dates(symbol):
    """Return the sorted entries of ``symbol``'s fitted-models directory."""
    fitted_dates = os.listdir(symbol_fitted_models_path(symbol))
    fitted_dates.sort()
    return fitted_dates

In [111]:

# symbol_fitted_models_path / symbol_list_fitted_dates are defined once in an
# earlier cell; the verbatim re-definitions that used to sit here (and the
# second copy of the function below) were redundant and have been dropped.
def symbol_fitted_model_date_loc(file_path, model_date_no, symbol=None):
    """Return ``file_path`` joined with the ``model_date_no``-th fitted-model date.

    Parameters
    ----------
    file_path : str
        Directory that contains one sub-directory per fitted-model date.
    model_date_no : int
        Position of the wanted date in the sorted list of dates.
    symbol : str, optional
        When given, the dates come from ``symbol_list_fitted_dates(symbol)``.
        When omitted, the sorted entries of ``file_path`` itself are used.
        This is the same list whenever ``file_path`` is that symbol's
        fitted-models directory, and it removes the dependency on an
        undefined global ``symbol``.
    """
    if symbol is None:
        fitted_dates = sorted(os.listdir(file_path))
    else:
        fitted_dates = symbol_list_fitted_dates(symbol)
    return os.path.join(file_path, str(fitted_dates[model_date_no]))

def symbol_labels_path(symbol):
    """Return the NON_DIRECTIONAL labels directory of ``symbol`` under the module-level ``labels`` root."""
    path_parts = (labels, symbol, 'NON_DIRECTIONAL')
    return os.path.join(*path_parts)

def symbol_labels_list(symbol):
    """List the entries of ``symbol``'s labels directory, in ``os.listdir`` order."""
    labels_dir = symbol_labels_path(symbol)
    return os.listdir(labels_dir)
    
def symbol_hmm_dates_list(symbol):
    """List the entries of ``symbol``'s MODEL_BASED features directory, in ``os.listdir`` order."""
    model_based_dir = os.path.join(features, symbol, 'MODEL_BASED')
    return os.listdir(model_based_dir)

def symbol_hmm_dates_path(symbol):
    """Return the MODEL_BASED features directory of ``symbol`` under the module-level ``features`` root."""
    path_parts = (features, symbol, 'MODEL_BASED')
    return os.path.join(*path_parts)
def symbol_features_oos_path(symbol, idx):
    """Return the path of the ``idx``-th entry inside ``symbol``'s MODEL_BASED directory.

    The entry is picked from the raw ``os.listdir`` result, so ``idx`` refers
    to whatever order the filesystem reports.
    """
    hmm_dates_dir = symbol_hmm_dates_path(symbol)
    chosen_entry = os.listdir(symbol_hmm_dates_path(symbol))[idx]
    return os.path.join(hmm_dates_dir, chosen_entry)

def fitted_model(idx):
    """Return the path of the ``idx``-th entry in the module-level ``ticker_fitted_models`` directory.

    The entry is picked from the raw ``os.listdir`` result (filesystem order).
    """
    chosen_entry = os.listdir(ticker_fitted_models)[idx]
    return os.path.join(ticker_fitted_models, chosen_entry)


In [17]:
def model_pickle_to_svc(model_pickle):
    """Load a pickled model dictionary and return its best SVC estimator.

    Parameters
    ----------
    model_pickle : str
        Path of a pickle file whose object has an ``'SVC'`` key; the value
        must expose a ``best_estimator_`` attribute.

    Returns
    -------
    The ``best_estimator_`` attribute of the ``'SVC'`` entry.
    """
    # Context manager so the file handle is closed (the bare open() leaked it).
    # NOTE: pickle.load executes arbitrary code; only use on trusted files.
    with open(model_pickle, "rb") as handle:
        pickle_to_file = pickle.load(handle)
    return pickle_to_file['SVC'].best_estimator_

In [68]:
#make list of symbols with models:

# Roots under the main data directory.
features_models = os.path.join(data_dir, 'features_models')
main_path = features_models  # main directory
features = os.path.join(features_models, 'features')
labels = os.path.join(features_models, 'labels')

# Roots on the data-only drive.
features_models_dod = os.path.join(data_only_drive, 'Data', 'features_models')
model_paths = os.path.join(features_models_dod, 'models')

In [69]:
# Entries of the model directory whose name ends with '.L'.
symbols_w_models = [s for s in os.listdir(model_save_loc) if s.endswith('.L')]
symbols_w_features = os.listdir(features)
# List the same `labels` root that the label helpers read from, instead of a
# machine-specific absolute path, so the symbol list and the later label
# look-ups cannot diverge.
symbols_w_labels = os.listdir(labels)


In [60]:
# Symbols that have both features and labels ...
features_and_labels = common_member(symbols_w_features, symbols_w_labels)
symbols_with_features_labels = list(features_and_labels)
# ... and, of those, the ones that also have fitted models.
models_features_labels = common_member(symbols_w_models, symbols_with_features_labels)
all_good_symbols = list(models_features_labels)
len(all_good_symbols)

49

In [75]:
# for idx, symbol in enumerate(all_good_symbols):
#     length = len(os.listdir(os.path.join(models, all_good_symbols[idx],'SINGLE_KERNEL')))
#     if  length <=0:
#         print symbol
#         print ("not ok", length)
#     else:
#         continue
#         print ("ok")

In [98]:
ticker = 'KGF.L'
ticker_fitted_models = symbol_fitted_models_path(ticker)
ticker_labels_path = symbol_labels_path(ticker)
# Label dates = file names up to the first '.'; the directory is listed once
# instead of once per element.
ticker_labels_list = [file_name.split(".")[0] for file_name in os.listdir(ticker_labels_path)]
ticker_features_list = symbol_hmm_dates_list(ticker)

In [102]:
#'/mnt/usb-Seagate_Expansion_Desk_NA8XEHR6-0:0-part2/Data/features_models/models/SPT.L/SINGLE_KERNEL'
# `symbol` was never defined in this notebook; use the `ticker` chosen above.
test_path = symbol_fitted_models_path(symbol=ticker)
# now lets take the exact sub-directory which corresponds to date # 5 in our list of fitted models
# TODO: this step was left unfinished (the cell ended in a stray `symb` fragment
# that raised NameError); add the sub-directory lookup here.

In [1]:
# Build the loader for the test symbol and remember the compute date it found.
datacls = DataLoader(data_dir, ticker)
symbol_compute_date = datacls.compute_date


NameError: name 'DataLoader' is not defined

In [175]:
# Label dates = label file names up to the first '.'. The original enumerated
# the characters of the path string instead of the directory entries.
labels_dates = [file_name.split(".")[0] for file_name in os.listdir(symbol_labels_path(ticker))]
# Maps each entry of ticker_features_list to the sorted dates that have both a
# label file and a feature file in that entry's folder.
common_dates_dict = {}
for date in ticker_features_list:
    specific_date_features_path = os.path.join(symbol_hmm_dates_path(ticker), str(date))
    # the feature date is the 6th '_'-separated token of each feature file name
    features_dates = [file_name.split("_")[5] for file_name in os.listdir(specific_date_features_path)]
    # plain set intersection: an empty overlap gives an empty list
    common_dates_dict[date] = sorted(set(labels_dates).intersection(features_dates))

#     for specific_idx, specific_date in enumerate(os.listdir(specific_date_features_path)):
#         print os.path.join(specific_date_features_path, specific_date)
#         features_date= specific_date.split("_")[5]
#         print os.path.join(labels)


In [209]:
# Materialise the keys so they can be indexed on Python 3 as well
# (dict.keys() is a non-subscriptable view there).
common_keys = list(common_dates_dict.keys())
# second common date of the second key
common_date = common_dates_dict[common_keys[1]][1]
labels_file = os.path.join(symbol_labels_path(ticker), ".".join((str(common_date), 'csv')))
data_df = pd.read_csv(labels_file)
# sanity check: the labels file should have more than 10 rows
if data_df.shape[0] > 10:
    print('legend')
else:
    print('problemo')
legend


In [161]:
# Label dates = label file names up to the first '.'; the directory is listed
# once instead of once per element.
labels_dates = [file_name.split(".")[0] for file_name in os.listdir(symbol_labels_path(ticker))]

In [212]:
# Unpickle every feature file in the folder; the loaded objects are discarded.
# NOTE: pickle.load executes arbitrary code; only use on trusted files.
for features_file in os.listdir(specific_date_features_path):
    file_loc = os.path.join(specific_date_features_path, features_file)
    # context manager so each handle is closed (the bare open() leaked them)
    with open(file_loc, "rb") as handle:
        pickle.load(handle)