In [1]:
from hsmm_core.data_utils import DataLoader, TradingHours
from hsmm_core.feature_spaces import hmm_features
from hsmm_core.hsmm_runner import HmmCalibration
import time
from hsmm_core.consts import InitialisationMethod
from hsmm_core.data_utils import TradingHours, DataLoader
from hsmm_core.labelling import DataLabellingSimple
from hsmm_core.consts import ThresholdMethod, LabellingChoice
import pandas as pd
import numpy as np
import datetime as dt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier,  GradientBoostingClassifier
from sklearn.linear_model import RidgeClassifierCV
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.metrics import precision_recall_fscore_support
sc = StandardScaler()
import os
import pickle
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, recall_score, precision_score

In [2]:
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, recall_score, precision_score

In [3]:
##useful functions
def fwd_dates(_dates_list, _key_date):
    # returns a list of dates that are forward from the key_date
    fwd_dates_list = [i for i in _dates_list if i > _key_date]
    return fwd_dates_list
def remove_nans(features_tuple, labels, idx=1):
    # not the cleanest but useful
    # function to clean up nans as I seem to use it a lot, so better to have one function
    # combines the features and labels and removes rows with nans across so we dont lose the ordering
    # returns features and labels
    features_df = pd.concat([features_tuple[0], features_tuple[1], features_tuple[2], \
                             features_tuple[3]], axis=1, sort=False)
    labels_only = labels.drop(columns=['ReturnTradedPrice', 'Duration', 'states', 'TradedTime',
                                       'TradedPrice'], axis=1)
    df_concat = pd.concat([features_df, labels_only.iloc[:, 0:idx]], axis=1, sort='False')
    # only using 1st set of labels- but we can re-write this a bit
    df_x_nan = df_concat.dropna()  # dropping all nans
    label_column_loc_ = df_x_nan.shape[1] - 1  # location of labels column in the clean df
    labels_ = df_x_nan.iloc[:, label_column_loc_:label_column_loc_ + 1]  # keep pure labels
    features_ = df_x_nan.drop(df_x_nan.columns[label_column_loc_], axis=1)  # keeping the features only
    return features_, labels_ 

def prec_recall_report(y_true, y_predict):
    # function to ge the sci-kit learn classification metrics into a pretty DF for csv!
    report = pd.DataFrame(list(precision_recall_fscore_support(y_true, y_predict)),
                          index=['Precision', 'Recall', 'F1-score', 'Support']).T
    # Now add the 'Avg/Total' row
    report.loc['Avg/Total', :] = precision_recall_fscore_support(y_true, y_predict, average='weighted')
    report.loc['Avg/Total', 'Support'] = report['Support'].sum()
    return report

class MarketFeatures(object):
    # a class to be expanded that uses features for base case -market based only-indicators/features
    """"Requires:
    a dataframe that has TradedPrice And Volume columns
    symbol - A stock symbol on which to form a strategy on.
    short_window - Lookback period for short moving average.
    long_window - Lookback period for long moving average.
    """

    def __init__(self, df):
        #         self.ticker = ticker
        self.df = df

    def load_data(self):
        pass

    def ma_spread(self, short_window=5, long_window=20):
        # function that produces the MA spread, which can be used on its own or as an input for MACD
        short_rolling_px = self.df['TradedPrice'].rolling(window=short_window).mean()
        long_rolling_px = self.df['TradedPrice'].rolling(window=long_window).mean()
        px_name = "_".join(('px_indx', str(short_window), str(long_window)))
        self.df[px_name] = long_rolling_px - short_rolling_px
        return self.df

    def ma_spread_duration(self, short_window=5, long_window=20):
        # function that produces the MA spread, which can be used on its own or as an input for MACD
        short_rolling_px = self.df['Duration'].rolling(window=short_window).mean()
        long_rolling_px = self.df['Duration'].rolling(window=long_window).mean()
        dur_name = "_".join(('dur_indx', str(short_window), str(long_window)))
        self.df[dur_name] = long_rolling_px - short_rolling_px
        return self.df

    def obv_calc(self):
        # on balance volume indicator
        self.df['SignedVolume'] = self.df['Volume'] * np.sign(self.df['TradedPrice'].diff()).cumsum()
        self.df['SignedVolume'].iat[1] = 0
        self.df['OBV'] = self.df['SignedVolume']  # .cumsum()
        self.df = self.df.drop(columns=['SignedVolume'])
        return self.df

    def chaikin_mf(self, period=5):
        # Chaikin money flow indicator
        self.df["MF Multiplier"] = (self.df['TradedPrice'] - (self.df['TradedPrice'].expanding(period).min()) \
                                    - (self.df['TradedPrice'].expanding(period).max() \
                                       - self.df['TradedPrice'])) / (
                                           self.df['TradedPrice'].expanding(period).max() - self.df[ \
                                       'TradedPrice'].expanding(period).min())
        self.df["MF Volume"] = self.df['MF Multiplier'] * self.df['Volume']
        self.df['CMF_' + str(period)] = self.df['MF Volume'].sum() / self.df["Volume"].rolling(period).sum()
        self.df = self.df.drop(columns=['MF Multiplier', 'MF Volume'])
        return self.df


## locations ##

In [4]:
# symbols to use as a starting point
good_symbols = [
    'RDSa.L', 'PRU.L', 'III.L', 'REL.L', 'CNA.L', 'SHP.L', 'MKS.L',
    'CPI.L', 'ULVR.L',  'ECM.L', 'AV.L', 'GKN.L', 'TSCO.L',  'ITV.L',  
    'BARC.L', 'CPG.L', 'AAL.L', 'LGEN.L', 'LAND.L', 'VOD.L', 'HSBA.L', 
    'RSA.L', 'DMGOa.L', 'RR.L', 'DGE.L', 'BATS.L','MAB.L',
    'KGF.L', 'SPT.L', 'AZN.L'
    ]
    

In [7]:
#simple functions
model_paths = os.path.join(data_only_drive,'Data','features_models','models') #same as above- new target directory, where all the models and output is saved
# using lambda to make a small function that just takes in the symbol and produces the relevant path of all fitted single kernel models
symbol_fitted_models_path = lambda symbol: os.path.join(model_paths, symbol, 'SINGLE_KERNEL') 
#provides a fitted list of above path
symbol_list_fitted_dates= sorted(os.listdir(symbol_fitted_models_path(symbol)))
#fitted model sub-directory- the fitted model is stored in this sub-directory as a pickle
symbol_fitted_model_date_loc = lambda file_path, model_date_no: os.path.join(file_path,\
                                                              symbol_list_fitted_dates[model_date_no])
# will take the input of a path (should be the models path, and a number and will produce a pickle file
symbol_model_date_loc = lambda model_date_path:os.path.join(model_date_path, os.listdir(model_date_path)[0])

NameError: name 'data_only_drive' is not defined

In [None]:
symbol= 'SPT.L' #this is a test symbol
#test symbol path, which essentially produces the path where all the fitted models are.
#'/mnt/usb-Seagate_Expansion_Desk_NA8XEHR6-0:0-part2/Data/features_models/models/SPT.L/SINGLE_KERNEL'
test_path= symbol_fitted_models_path(symbol=symbol)
# now lets take the exact sub-directory which corresponds to date # 5 in our list of fitted models
test_fitted_model_date_loc = symbol_fitted_model_date_loc(test_path, 5)
# now pick the first item in that list which is basically the pickle file
# this is the entire path you have to load with pickle.load
model_pickle = os.path.join(test_fitted_model_date,os.listdir(test_fitted_model_date)[0])


In [None]:
test_path

In [None]:
#again this is the pickle file that we get the location for. now it is written in the form of a function

model_loc= symbol_model_date_loc(test_fitted_model_date)

#getting the fitted model date, as this will be used in fwd dates
model_date= test_fitted_model_date.split("/")[-1]

model_pickle = os.path.join(test_fitted_model_date,os.listdir(test_fitted_model_date)[0])



In [None]:
pickle_to_file = pickle.load(open(model_loc, "rb"))

In [None]:
best_estimator = pickle_to_file['SVC'].best_estimator_

In [None]:
symbol_labels_path = os.path.join(labels_path, symbol, 'NON_DIRECTIONAL')
symbol_features_path = os.path.join(features_path, symbol, 'MODEL_BASED')

In [None]:
labels_dates = sorted([os.listdir(symbol_labels_path)[idx].split(".")[0]\
                for idx,_ in enumerate(os.listdir(symbol_labels_path))])

In [None]:
os.listdir(oos_features_path)[0]

In [None]:
fwd_dates =fwd_dates(_dates_list=labels_dates, _key_date=test_fitted_model_date.split("/")[-1])

In [8]:
oos_features_path = os.path.join(symbol_features_path, model_date )
oos_dates_list = sorted([oos_date.split("_")[5] for oos_date in
                                     sorted(os.listdir(oos_features_path))])  # list of oos features

NameError: name 'symbol_features_path' is not defined

In [6]:
oos

NameError: name 'oos_features_path' is not defined

In [None]:
oos_file= "_".join(('SPT.L_3_states_features_date:',fwd_date,'now:_20181230_.pickle'))

In [None]:
for fwd_date in fwd_dates:
    fwd_file= "_".join(('SPT.L_3_states_features_date:',fwd_date,'now:_20181230_.pickle'))
    

In [None]:
features_date = os.listdir(oos_features_path)[0].split("_")[5]
features_tuple= os.listdir(oos_features_path)[0]
labels_oos= pd.read_csv(os.path.join(labels_path, symbol,'NON_DIRECTIONAL', features_date+'.csv'), index_col=0)

In [None]:
features_tuple

In [None]:
features_loc = os.path.join(symbol_features_path,model_date,features_tuple)
features_tuple= pickle.load(open(features_loc, "rb"))

In [None]:
features_df = pd.concat([features_tuple[0], features_tuple[1],\
                         features_tuple[2], features_tuple[3]], axis=1, sort=False)

In [None]:
df_w_market_features = MarketFeatures(df=MarketFeatures( \
                            df=MarketFeatures(
                                df=MarketFeatures(df=labels_oos).obv_calc()).chaikin_mf()).ma_spread()).ma_spread_duration()


In [None]:
df_concat = pd.concat([features_df, df_w_market_features], axis=1, sort='False').dropna()

                        # ok start putting in the magic
                        # y_duration = np.asanyarray(df_concat['Duration'].shift(window).dropna())
                        # y_price = np.asanyarray(df_concat['TradedPrice'].shift(window).dropna())

                        # drop things we dont need: traded price, duration, traded time, labels etc!
label_name = str(df_concat.columns[df_concat.columns.str.contains(pat='label')].values[0])

df_final = df_concat.drop(columns=['TradedPrice', 'Duration', 'TradedTime', 'ReturnTradedPrice', \
                                   'Volume', label_name])

In [None]:
from sklearn.preprocessing import MinMaxScaler
X= MinMaxScaler().fit_transform(df_final)

y_labels = df_concat[df_concat.columns[df_concat.columns.str.contains(pat='label')]].iloc[:, 0]


In [None]:
y_predict =best_estimator.predict(X)

In [None]:
# _fitted_model_results['svm_test_accuracy'][_idx, :] = accuracy_score(y_test, y_predict)
# _fitted_model_results['svm_test_recall'][_idx, :] = recall_score(y_true=y_test, y_pred=y_predict)
# _fitted_model_results['svm_train_accuracy'][_idx, :] = accuracy_score(y_train, y_predict_train)
# _fitted_model_results['svm_train_recall'][_idx, :] = recall_score(y_true=y_train, y_pred=y_predict_train)
# _fitted_model_results['svm_test_F1'][_idx, :] = f1_score(y_true=y_test, y_pred=y_predict)

In [None]:
print accuracy_score(y_labels, y_predict)
print recall_score(y_true=y_labels, y_pred=y_predict)

In [None]:
f1_score(y_true=y_labels, y_pred=y_predict)