In [1]:
import pickle
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_recall_fscore_support
from sklearn.model_selection import KFold, cross_val_score
from sklearn.multiclass import OneVsRestClassifier #support from multiclass
import time
from sklearn.svm import SVC



class DataLoader(object):
    def __init__(self, path_main, ticker):
        self.main_path = path_main
        self.ticker = ticker

        self.features_labels_path = os.path.join(self.main_path, 'features_models')
        self.features_path = os.path.join(self.features_labels_path, 'features')
        # collection of per symbol non directional labels
        self.labels_path = os.path.join(self.features_labels_path, 'labels', self.ticker, 'NON_DIRECTIONAL')
        self.symbol_features_path = os.path.join(self.features_labels_path, 'features', self.ticker, 'MODEL_BASED')
        # list of all the model -oos hmm feature dates - each folder is a collection of oos feature dates
        self.hmm_dates_list = os.listdir(self.symbol_features_path) #each folder are the OOS features from each HMM
        self.compute_date= os.listdir(os.path.join( \
                                                   self.symbol_features_path, \
                                                   os.listdir(self.symbol_features_path)[1]))[1].split("_")[7]

    def ticker_features(self, model_date, date):
        # need to make this a lot more flexible with number of states
        if model_date < date:
            file_name = "_".join((self.ticker, '3', 'states', 'features', 'date:', date, 'now:', self.compute_date, '.pickle'))
            file_loc = os.path.join(self.symbol_features_path, str(model_date), file_name)
            with open(file_loc, 'rb') as handle:
                ticker_features = pickle.load(handle)
        else:
            print('Loading Feature Date which is in-sample. Change your Model Date')
        return ticker_features

    def ticker_labels_csv(self, date):
        file_loc = os.path.join(self.labels_path, str(date) + '.csv')
        ticker_labels = pd.read_csv(file_loc, index_col=0)
        return ticker_labels

    @staticmethod
    def open_pickle_file(path, pickle_file):
        file_loc = os.path.join(path, pickle_file)
        pickle_to_file = pickle.load(open(file_loc, "rb"))
        return pickle_to_file

    @staticmethod
    def get_date_from_file(file_, numb_):
        return os.path.splitext(file_[numb_])[0]


class MarketFeatures(object):
    # a class to be expanded that uses features for base case -market based only-indicators/features
    """"Requires:
    a dataframe that has TradedPrice And Volume columns
    symbol - A stock symbol on which to form a strategy on.
    short_window - Lookback period for short moving average.
    long_window - Lookback period for long moving average.
    """

    def __init__(self, df):
        #         self.ticker = ticker
        self.df = df

    def load_data(self):
        pass

    def ma_spread(self, short_window=5, long_window=20):
        # function that produces the MA spread, which can be used on its own or as an input for MACD
        short_rolling_px = self.df['TradedPrice'].rolling(window=short_window).mean()
        long_rolling_px = self.df['TradedPrice'].rolling(window=long_window).mean()
        px_name = "_".join(('px_indx', str(short_window), str(long_window)))
        self.df[px_name] = long_rolling_px - short_rolling_px
        return self.df
    
    def ma_spread_duration(self, short_window=5, long_window=20):
        # function that produces the MA spread, which can be used on its own or as an input for MACD
        short_rolling_px = self.df['Duration'].rolling(window=short_window).mean()
        long_rolling_px = self.df['Duration'].rolling(window=long_window).mean()
        dur_name = "_".join(('dur_indx', str(short_window), str(long_window)))
        self.df[dur_name] = long_rolling_px - short_rolling_px
        return self.df

    def obv_calc(self):
        # on balance volume indicator
        self.df['SignedVolume'] = self.df['Volume'] * np.sign(self.df['TradedPrice'].diff()).cumsum()
        self.df['SignedVolume'].iat[1] = 0
        self.df['OBV'] = self.df['SignedVolume']  # .cumsum()
        self.df = self.df.drop(columns=['SignedVolume'])
        return self.df

    def chaikin_mf(self, period=5):
        # Chaikin money flow indicator
        self.df["MF Multiplier"] = (self.df['TradedPrice'] - (self.df['TradedPrice'].expanding(period).min()) \
                                    - (self.df['TradedPrice'].expanding(period).max() \
                                       - self.df['TradedPrice'])) / (
                                           self.df['TradedPrice'].expanding(period).max() - self.df[ \
                                            'TradedPrice'].expanding(period).min())
        self.df["MF Volume"] = self.df['MF Multiplier'] * self.df['Volume']
        self.df['CMF_' + str(period)] = self.df['MF Volume'].sum() / self.df["Volume"].rolling(period).sum()
        self.df = self.df.drop(columns=['MF Multiplier', 'MF Volume'])
        return self.df


class FitModels(object):

    def __init__(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train

    # # Train a SVM classification model

    def best_kernel_ridge(self, kernel_choice):
        kr_clf = OneVsRestClassifier(GridSearchCV(KernelRidge(kernel=str(kernel_choice)), cv=10,
                                                  param_grid={"alpha": [1e0, 0.1, 1e-2, 1e-3],
                                                              "gamma": np.logspace(-2, 2, 5)})).fit(self.X_train,
                                                                                                    self.y_train)

        return kr_clf

    def best_svm_clf(self, kernel_choice):
        param_grid = dict(kernel=[str(kernel_choice)],
                          C=[1, 5, 10, 25, 50, 100],
                          gamma=[0.0001, 0.001, 0.01, 0.02, 0.05, 0.01])

        clf = OneVsRestClassifier(
            GridSearchCV(SVC(class_weight='balanced'), param_grid, verbose=1, n_jobs=-1, cv=10)) \
            .fit(self.X_train, self.y_train)
        return clf

    def best_gradient_boost_clf(self):
        # this needs to be written properly- just a baseline placeholder here!
        GBR = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                        max_depth=4, max_features='sqrt',
                                        min_samples_leaf=15, min_samples_split=10, loss='huber',
                                        random_state=5)

        gb_boost_clf = OneVsRestClassifier(GBR).fit(self.X_train, self.y_train)

        return gb_boost_clf

    def best_MKL_clf(self):
        pass

    def best_knn_clf(self):
        pass

    def best_random_forest_clf(self):
        pass

    def run_cv(self, clf_class, **kwargs):
        # Construct a kfolds object
        kf = KFold(len(self.y_train), n_folds=10, shuffle=True)
        y_pred = self.y_train.copy()

        # Iterate through folds
        for train_index, test_index in kf:
            X_train_local, X_test_local = self.X_train[train_index], self.X_train[test_index]
            y_train_local = self.y_train[train_index]
            # Initialize a classifier with key word arguments
            clf = clf_class(**kwargs)
            clf.fit(self.X_train, self.y_train)
            y_pred[test_index] = clf.predict(X_test_local)
        return y_pred


class PredictModels(FitModels):
    def __init__(self):
        pass


def no_nans(label):
    return np.sum(np.isnan(label))


def remove_last_element(arr):
    return arr[np.arange(arr.size - 1)]

sc = StandardScaler()



In [2]:
def open_pickle_file(path, pickle_file):
    file_loc = os.path.join(path, pickle_file)
    pickle_to_file = pickle.load(open(file_loc, "rb"))
    return pickle_to_file
def common_member(a, b): 
      
    a_set = set(a) 
    b_set = set(b) 
      
    # check length  
    if len(a_set.intersection(b_set)) > 0: 
        return(a_set.intersection(b_set))   
    else: 
        return("no common elements") 

In [3]:
#   sort out locations

#   data_dir: main directory , data_only_drive: the big drive where everything is saved

data_dir = os.getenv('FINANCE_DATA')
data_only_drive = '/mnt/usb-Seagate_Expansion_Desk_NA8XEHR6-0:0-part2' #external date only drive




# this is the central location for all the features/models/predictions
features_models = os.path.join(data_dir, 'features_models')

# this is the central location for all the labels
labels = os.path.join(features_models, 'labels')
# this is the central location for all the features
features = os.path.join(features_models, 'features')

# location to save results
model_save_loc = os.path.join(data_only_drive, 'Data','features_models','models')
# from the main directory select all the symbols that are finishing in .L for FTSE
symbols_ftse = [s for s in os.listdir(features) if s.endswith('.L')]

In [None]:
os.listdir()

In [5]:

# this is the central location for all the saved HMM models



main_path = os.path.join(data_dir, 'features_models') #main directory

features_path = os.path.join(main_path, 'features') # all the features
labels_path = os.path.join(main_path, 'labels') # all the labels

In [386]:
all_symbols_hmm=dict()
all_symbols_list=[]
all_symbol_test_dict={}
from collections import defaultdict
all_symbols_d = defaultdict(dict)

for symbol in symbols_ftse: # for all symbols
    datacls = DataLoader(path_main=data_dir, ticker=symbol) #test symbol -create class
    symbol_compute_date = datacls.compute_date
    print symbol
    symbol_labels_path = os.path.join(labels_path, symbol, 'NON_DIRECTIONAL')
    symbol_features_path = os.path.join(features_path, symbol,'MODEL_BASED')
    hmm_models_dates_list = os.listdir(symbol_features_path) #list of all the OOS pre-computed lists of features- each indexed by the date
    for hmm_idx, hmm_date in enumerate(sorted(hmm_models_dates_list)):
        oos_features_date_path= os.path.join(symbol_features_path, hmm_models_dates_list[hmm_idx]) #location list of feature files
        oos_dates_list = sorted([oos_date.split("_")[5] for oos_date in sorted(os.listdir(oos_features_date_path))]) #list of oos features
        print len(oos_dates_list)      
        oos_labels_features =[(os.path.join(symbol_labels_path, oos_date+'.csv'), \
                                   os.path.join(oos_features_date_path,\
                                                "_".join((symbol,'3','states_features_date:',oos_date,'now:',symbol_compute_date,'.pickle')))) \
                                  for oos_date in oos_dates_list ]
        all_symbols_d[symbol][hmm_idx] = oos_labels_features


SMIN.L
50
80
22
68
184
54
108
4
120
24
106
196
88
2
92
90
12
116
130
172
142
146
26
36
160
96
126
122
136
138
194
42
52
128
132
20
176
114
192
48
74
186
156
150
64
58
72
148
10
62
18
40
98
44
112
94
38
78
162
84
100
118
174
144
190
166
180
30
134
110
16
28
168
60
56
104
102
70
32
170
158
178
152
66
182
188
6
14
0
8
140
164
82
124
154
76
34
46
86
CEY.L
102
122
22
30
226
256
82
150
4
162
24
148
238
130
42
2
134
110
132
260
12
158
172
276
78
214
184
58
188
26
98
202
272
138
168
38
54
164
178
180
236
240
170
274
246
174
258
20
218
156
72
234
62
56
116
228
198
86
192
114
190
10
242
18
88
140
278
44
264
266
92
48
154
136
84
104
254
270
120
50
204
126
252
142
76
160
216
186
52
32
232
28
250
208
68
222
176
70
152
16
210
80
46
268
100
66
146
144
64
36
112
40
108
34
244
212
200
220
74
194
248
224
230
6
14
0
8
182
206
60
124
166
262
90
196
106
118
94
96
128
RR.L
25
40
11
34
92
27
54
2
60
12
53
98
44
1
46
45
6
58
65
86
71
73
13
18
80
48
63
61
68
69
97
21
26
64
66
10
88
57
96
24
37
93
78
75
32
29
3

92
29
94
13
49
101
69
84
19
27
82
89
90
118
85
87
10
109
78
36
117
31
28
58
114
99
43
96
57
95
5
9
44
70
22
46
24
77
68
42
52
60
25
102
63
71
38
80
108
93
26
16
116
14
104
34
111
88
35
76
8
105
40
23
50
33
73
72
32
18
56
20
54
17
106
100
110
37
97
112
115
3
7
0
4
91
103
30
62
83
45
98
53
59
47
48
64
AAL.L
36
4
54
42
2
9
30
15
45
17
24
7
5
12
13
41
8
10
32
1
40
55
37
22
19
18
50
49
44
0
53
25
3
46
31
16
39
48
27
34
52
11
28
51
29
23
33
20
35
38
14
26
6
21
47
43
SDR.L
38
22
142
66
4
78
24
64
154
46
2
50
48
12
74
88
130
100
104
26
118
54
84
80
94
96
152
86
90
20
134
72
150
32
144
114
108
30
106
10
18
56
70
52
36
120
42
58
76
132
102
148
124
138
92
68
16
126
62
60
28
128
116
136
110
140
146
6
14
0
8
98
122
40
82
112
34
44
STAN.L
50
80
22
68
184
54
108
4
120
24
106
196
88
2
92
90
12
116
130
198
172
142
146
26
36
160
96
126
122
136
138
194
42
52
128
132
20
176
114
192
48
74
186
156
150
64
58
72
148
10
62
18
40
98
44
200
112
94
38
78
162
84
100
118
174
144
190
166
180
30
134
110
16
28
168
60


In [404]:
# for symbol_key in all_symbols_d.keys():
#     print symbol_key
#     fitted_model_symbol_path = os.path.join(data_only_drive, 'Data', 'features_models', 'models', str(symbol_key),
#                                             'SINGLE_KERNEL')
symbol_key= 'MKS.L'
for day_no in (all_symbols_d[symbol_key].keys()):
    print(day_no)
    print('doing symbol:', symbol_key,'for date:', day_no)
    oos_file_location= all_symbols_d[symbol_key][day_no]
    if len(oos_file_location) == 0:
        continue
    else:
        for i in xrange(len(oos_file_location)):
            print oos_file_location[i]
        print oos_file_location[0][0].split("/")[-1].split(".")[0]
# #     for idx,_ in enumerate(oos_file_locations):
#         print idx
#         print oos_file_locations[idx][0]
#         print oos_file_locations[idx][1]
        

0
('doing symbol:', 'MKS.L', 'for date:', 0)
20170925
1
('doing symbol:', 'MKS.L', 'for date:', 1)
20180406
2
('doing symbol:', 'MKS.L', 'for date:', 2)
20180228
3
('doing symbol:', 'MKS.L', 'for date:', 3)
20170712
4
('doing symbol:', 'MKS.L', 'for date:', 4)
20170905
5
('doing symbol:', 'MKS.L', 'for date:', 5)
20180419
6
('doing symbol:', 'MKS.L', 'for date:', 6)
20170825
7
('doing symbol:', 'MKS.L', 'for date:', 7)
20180405
8
('doing symbol:', 'MKS.L', 'for date:', 8)
20170118
9
('doing symbol:', 'MKS.L', 'for date:', 9)
20170906
10
('doing symbol:', 'MKS.L', 'for date:', 10)
20170704
11
('doing symbol:', 'MKS.L', 'for date:', 11)
20170919
12
('doing symbol:', 'MKS.L', 'for date:', 12)
20180220
13
('doing symbol:', 'MKS.L', 'for date:', 13)
20180420
14
('doing symbol:', 'MKS.L', 'for date:', 14)
20170915
15
('doing symbol:', 'MKS.L', 'for date:', 15)
20170918
16
('doing symbol:', 'MKS.L', 'for date:', 16)
20180413
17
('doing symbol:', 'MKS.L', 'for date:', 17)
20170830
18
('doing s

In [468]:
#checks#
symbol_key='HSBA.L'
day_no=0
oos_file_location= all_symbols_d[symbol_key][day_no]
for idx, file_locs in enumerate(oos_file_location):
    print idx
    print file_locs[0].split("/")[-1].split(".")[0]
# oos_file_location[][0]

0
20171009
1
20171010
2
20171011
3
20171012
4
20171013
5
20171016
6
20171017
7
20171018
8
20171019
9
20171020
10
20171023
11
20171024
12
20171025
13
20171026
14
20171027
15
20171030
16
20171031
17
20180201
18
20180202
19
20180205
20
20180206
21
20180207
22
20180208
23
20180209
24
20180212
25
20180213
26
20180214
27
20180215
28
20180216
29
20180219
30
20180220
31
20180221
32
20180222
33
20180223
34
20180226
35
20180227
36
20180228
37
20180403
38
20180404
39
20180405
40
20180406
41
20180409
42
20180410
43
20180411
44
20180412
45
20180413
46
20180416
47
20180417
48
20180418
49
20180419
50
20180420


In [465]:
file_locs[1]
pickle.load(open(file_locs[1]), 'rb')

TypeError: load() takes exactly 1 argument (2 given)

In [466]:
# create file object with permissions
with open(file_locs[1], 'rb') as f: feature_files = pickle.load(f)
    # load using pickle de-serializer


In [467]:
feature_files

(      fischer_score_dlambda  fischer_score_dsigma  fischer_score_dweight
 0              1.533441e-09              0.000000               1.303356
 1              1.480137e+00            748.223172              -1.892495
 2             -8.440577e+00             97.193938              -3.520032
 3             -4.995809e+00           -502.900213              -5.204684
 4             -1.153023e+01          -1102.994364              -6.889336
 5             -1.066085e+01          -1102.994364              -5.423727
 6             -1.040978e+01          -1102.994364              -3.981188
 7             -1.004655e+01          -1102.994364              -2.534124
 8             -3.373598e+01          -1102.994364              -0.065394
 9             -2.549448e+01          -1102.994364               2.395201
 10            -5.562124e+01          -1703.088514               0.710549
 11            -5.474924e+01          -1703.088514               2.176735
 12            -6.617941e+01          

In [318]:
file_1 =[(symbol_key, day_key, loc_idx) for symbol_key in all_symbols_d[symbol_key] \
         for day_key in all_symbols_d[symbol_key].keys() \
         for loc_idx in all_symbols_d[symbol_key][day_key][loc_idx][1]]

In [335]:
for k, v in all_symbols_d.items():
    print v

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [374]:
for k, v in all_symbols_d.iteritems():
    for v in all_symbols_d[k].items():
        (idx, files)=v
    

M = len(_keys) - 1  # number of models- essentially one for each day
    T = 1
    ###array for results###

    _clfs_svm =[]

    all_scores={
        'test-mean': np.empty((M, 5)),
        'test-std':  np.empty((M, 5)),
        'train-mean': np.empty((M, 5)),
        'train-std': np.empty((M, 5)),
        }
# 5 is from the train sizes below
    _fitted_model_results = {
        'clfs': np.empty((M, T)),
        'svm_test_F1': np.empty((M, T)),
        'svm_data_date': np.empty((M, T)),
        'svm_test_recall': np.empty((M, T)),
        'svm_train_recall': np.empty((M, T)),
        'svm_test_accuracy': np.empty((M, T)),
        'svm_train_accuracy': np.empty((M, T)),
    }

In [372]:
files[1][1]

'/media/ak/WorkDrive/Data/features_models/features/SMIN.L/MODEL_BASED/20170919/SMIN.L_3_states_features_date:_20170921_now:_20181223_.pickle'

In [205]:
print len((all_symbols_hmm.keys()))
print len(symbols_ftse)
for key, value in all_symbols_hmm.items():
    #print value
    print(key, len([item for item in value if item]))

47
47
('RBS.L', 76)
('CCL.L', 128)
('CEY.L', 128)
('LLOY.L', 122)
('MKS.L', 42)
('DMGOa.L', 64)
('RR.L', 43)
('DGE.L', 64)
('BATS.L', 44)
('BLT.L', 128)
('MAB.L', 100)
('KGF.L', 64)
('WPP.L', 86)
('SGE.L', 44)
('STAN.L', 86)
('NG.L', 170)
('AZN.L', 8)
('APF.L', 16)
('CPI.L', 64)
('ULVR.L', 43)
('ECM.L', 64)
('AV.L', 8)
('GKN.L', 64)
('SPT.L', 86)
('PSON.L', 80)
('RTO.L', 44)
('TSCO.L', 43)
('SDR.L', 44)
('ITV.L', 64)
('BARC.L', 8)
('CPG.L', 64)
('AAL.L', 43)
('LGEN.L', 64)
('LAND.L', 64)
('VOD.L', 42)
('RB.L', 76)
('HSBA.L', 64)
('RSA.L', 24)
('RDSa.L', 61)
('PRU.L', 77)
('III.L', 64)
('RDSb.L', 82)
('UU.L', 38)
('REL.L', 41)
('CNA.L', 64)
('SHP.L', 22)
('SMIN.L', 43)


In [None]:
all_symbols['MKS.L'][1][0].split("/")[-1].split(".")[0]

for key in all_symbols.keys():
    t0 = time.time()
    for day_no in xrange(len(all_symbols[key])):
        features_tuple= pickle.load(open(all_symbols[key][day_no][1], "rb"))
        labels_df = pd.read_csv(all_symbols[key][day_no][0], index_col=0)
        features_df = pd.concat([features_tuple[0], features_tuple[1], \
                                 features_tuple[2], features_tuple[3]], axis=1, sort=False)

                          

In [381]:
potential_issues = dict()
bad_symbols_list = list()
good_symbols_list = list()
for key in all_symbols.keys():
    print(key)
    for day_no in xrange(len(all_symbols[key])):
        print(day_no)
        features_tuple = pickle.load(open(all_symbols[key][day_no][1], "rb"))
        df_labels = pd.read_csv(all_symbols[key][day_no][0], index_col=0)
        features_df = pd.concat([features_tuple[0], features_tuple[1], \
                                 features_tuple[2], features_tuple[3]], axis=1, sort=False)

        df_w_market_features = MarketFeatures( \
               df=MarketFeatures(df=MarketFeatures(df=df_labels).obv_calc()).chaikin_mf()).ma_spread()

        df_concat = pd.concat([features_df, df_w_market_features], axis=1, sort='False').dropna()

        if df_concat.shape[0] > 100:
            np.random.seed(0)
            good_symbols_list.append(key)

        else:
            print('something smelly with this symbol, day:', key, day_no)
            potential_issues[key]=day_no
            bad_symbols_list.append(key)
            print(df_concat.shape[0])
            pass

RBS.L
0
('something smelly with this symbol, day:', 'RBS.L', 0)
0
1
('something smelly with this symbol, day:', 'RBS.L', 1)
0
2
('something smelly with this symbol, day:', 'RBS.L', 2)
0
3
('something smelly with this symbol, day:', 'RBS.L', 3)
0
4
('something smelly with this symbol, day:', 'RBS.L', 4)
0
5
('something smelly with this symbol, day:', 'RBS.L', 5)
0
6
('something smelly with this symbol, day:', 'RBS.L', 6)
0
7
('something smelly with this symbol, day:', 'RBS.L', 7)
0
8
('something smelly with this symbol, day:', 'RBS.L', 8)
0
9
('something smelly with this symbol, day:', 'RBS.L', 9)
0
10
('something smelly with this symbol, day:', 'RBS.L', 10)
0
11
('something smelly with this symbol, day:', 'RBS.L', 11)
0
12
('something smelly with this symbol, day:', 'RBS.L', 12)
0
13
('something smelly with this symbol, day:', 'RBS.L', 13)
0
14
('something smelly with this symbol, day:', 'RBS.L', 14)
0
15
('something smelly with this symbol, day:', 'RBS.L', 15)
0
16
('something smelly 

('something smelly with this symbol, day:', 'CCL.L', 62)
0
63
('something smelly with this symbol, day:', 'CCL.L', 63)
0
64
('something smelly with this symbol, day:', 'CCL.L', 64)
0
65
('something smelly with this symbol, day:', 'CCL.L', 65)
0
66
('something smelly with this symbol, day:', 'CCL.L', 66)
0
67
('something smelly with this symbol, day:', 'CCL.L', 67)
0
68
('something smelly with this symbol, day:', 'CCL.L', 68)
0
69
('something smelly with this symbol, day:', 'CCL.L', 69)
0
70
('something smelly with this symbol, day:', 'CCL.L', 70)
0
71
('something smelly with this symbol, day:', 'CCL.L', 71)
0
72
('something smelly with this symbol, day:', 'CCL.L', 72)
0
73
('something smelly with this symbol, day:', 'CCL.L', 73)
0
74
('something smelly with this symbol, day:', 'CCL.L', 74)
0
75
('something smelly with this symbol, day:', 'CCL.L', 75)
0
76
('something smelly with this symbol, day:', 'CCL.L', 76)
0
77
('something smelly with this symbol, day:', 'CCL.L', 77)
0
78
('someth

('something smelly with this symbol, day:', 'CEY.L', 74)
0
75
('something smelly with this symbol, day:', 'CEY.L', 75)
0
76
('something smelly with this symbol, day:', 'CEY.L', 76)
0
77
('something smelly with this symbol, day:', 'CEY.L', 77)
0
78
('something smelly with this symbol, day:', 'CEY.L', 78)
0
79
('something smelly with this symbol, day:', 'CEY.L', 79)
0
80
('something smelly with this symbol, day:', 'CEY.L', 80)
0
81
('something smelly with this symbol, day:', 'CEY.L', 81)
0
82
('something smelly with this symbol, day:', 'CEY.L', 82)
0
83
('something smelly with this symbol, day:', 'CEY.L', 83)
0
84
('something smelly with this symbol, day:', 'CEY.L', 84)
0
85
('something smelly with this symbol, day:', 'CEY.L', 85)
0
86
('something smelly with this symbol, day:', 'CEY.L', 86)
0
87
('something smelly with this symbol, day:', 'CEY.L', 87)
0
88
('something smelly with this symbol, day:', 'CEY.L', 88)
0
89
('something smelly with this symbol, day:', 'CEY.L', 89)
0
90
('someth

('something smelly with this symbol, day:', 'LLOY.L', 78)
0
79
('something smelly with this symbol, day:', 'LLOY.L', 79)
0
80
('something smelly with this symbol, day:', 'LLOY.L', 80)
0
81
('something smelly with this symbol, day:', 'LLOY.L', 81)
0
82
('something smelly with this symbol, day:', 'LLOY.L', 82)
0
83
('something smelly with this symbol, day:', 'LLOY.L', 83)
0
84
('something smelly with this symbol, day:', 'LLOY.L', 84)
0
85
('something smelly with this symbol, day:', 'LLOY.L', 85)
0
86
('something smelly with this symbol, day:', 'LLOY.L', 86)
0
87
('something smelly with this symbol, day:', 'LLOY.L', 87)
0
88
('something smelly with this symbol, day:', 'LLOY.L', 88)
0
89
('something smelly with this symbol, day:', 'LLOY.L', 89)
0
90
('something smelly with this symbol, day:', 'LLOY.L', 90)
0
91
('something smelly with this symbol, day:', 'LLOY.L', 91)
0
92
('something smelly with this symbol, day:', 'LLOY.L', 92)
0
93
('something smelly with this symbol, day:', 'LLOY.L', 9

('something smelly with this symbol, day:', 'WPP.L', 6)
0
7
('something smelly with this symbol, day:', 'WPP.L', 7)
0
8
('something smelly with this symbol, day:', 'WPP.L', 8)
0
9
('something smelly with this symbol, day:', 'WPP.L', 9)
0
10
('something smelly with this symbol, day:', 'WPP.L', 10)
0
11
('something smelly with this symbol, day:', 'WPP.L', 11)
0
12
('something smelly with this symbol, day:', 'WPP.L', 12)
0
13
('something smelly with this symbol, day:', 'WPP.L', 13)
0
14
('something smelly with this symbol, day:', 'WPP.L', 14)
0
15
('something smelly with this symbol, day:', 'WPP.L', 15)
0
16
('something smelly with this symbol, day:', 'WPP.L', 16)
0
17
('something smelly with this symbol, day:', 'WPP.L', 17)
0
18
('something smelly with this symbol, day:', 'WPP.L', 18)
0
19
('something smelly with this symbol, day:', 'WPP.L', 19)
0
20
('something smelly with this symbol, day:', 'WPP.L', 20)
0
21
('something smelly with this symbol, day:', 'WPP.L', 21)
0
22
('something sme

('something smelly with this symbol, day:', 'STAN.L', 31)
0
32
('something smelly with this symbol, day:', 'STAN.L', 32)
0
33
('something smelly with this symbol, day:', 'STAN.L', 33)
0
34
('something smelly with this symbol, day:', 'STAN.L', 34)
0
35
('something smelly with this symbol, day:', 'STAN.L', 35)
0
36
('something smelly with this symbol, day:', 'STAN.L', 36)
0
37
('something smelly with this symbol, day:', 'STAN.L', 37)
0
38
('something smelly with this symbol, day:', 'STAN.L', 38)
0
39
('something smelly with this symbol, day:', 'STAN.L', 39)
0
40
('something smelly with this symbol, day:', 'STAN.L', 40)
0
41
('something smelly with this symbol, day:', 'STAN.L', 41)
0
42
('something smelly with this symbol, day:', 'STAN.L', 42)
0
NG.L
0
('something smelly with this symbol, day:', 'NG.L', 0)
0
1
('something smelly with this symbol, day:', 'NG.L', 1)
0
2
('something smelly with this symbol, day:', 'NG.L', 2)
0
3
('something smelly with this symbol, day:', 'NG.L', 3)
0
4
('so

PSON.L
0
('something smelly with this symbol, day:', 'PSON.L', 0)
0
1
('something smelly with this symbol, day:', 'PSON.L', 1)
0
2
('something smelly with this symbol, day:', 'PSON.L', 2)
0
3
('something smelly with this symbol, day:', 'PSON.L', 3)
0
4
('something smelly with this symbol, day:', 'PSON.L', 4)
0
5
('something smelly with this symbol, day:', 'PSON.L', 5)
0
6
('something smelly with this symbol, day:', 'PSON.L', 6)
0
7
('something smelly with this symbol, day:', 'PSON.L', 7)
0
8
('something smelly with this symbol, day:', 'PSON.L', 8)
0
9
('something smelly with this symbol, day:', 'PSON.L', 9)
0
10
('something smelly with this symbol, day:', 'PSON.L', 10)
0
11
('something smelly with this symbol, day:', 'PSON.L', 11)
0
12
('something smelly with this symbol, day:', 'PSON.L', 12)
0
13
('something smelly with this symbol, day:', 'PSON.L', 13)
0
14
('something smelly with this symbol, day:', 'PSON.L', 14)
0
15
('something smelly with this symbol, day:', 'PSON.L', 15)
0
16
('

('something smelly with this symbol, day:', 'SDR.L', 30)
0
31
('something smelly with this symbol, day:', 'SDR.L', 31)
0
32
('something smelly with this symbol, day:', 'SDR.L', 32)
0
33
('something smelly with this symbol, day:', 'SDR.L', 33)
0
34
('something smelly with this symbol, day:', 'SDR.L', 34)
0
35
('something smelly with this symbol, day:', 'SDR.L', 35)
0
36
('something smelly with this symbol, day:', 'SDR.L', 36)
0
37
('something smelly with this symbol, day:', 'SDR.L', 37)
0
38
('something smelly with this symbol, day:', 'SDR.L', 38)
0
39
('something smelly with this symbol, day:', 'SDR.L', 39)
0
40
('something smelly with this symbol, day:', 'SDR.L', 40)
0
41
('something smelly with this symbol, day:', 'SDR.L', 41)
0
42
('something smelly with this symbol, day:', 'SDR.L', 42)
0
43
('something smelly with this symbol, day:', 'SDR.L', 43)
0
ITV.L
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
4

('something smelly with this symbol, day:', 'UU.L', 13)
0
14
('something smelly with this symbol, day:', 'UU.L', 14)
0
15
('something smelly with this symbol, day:', 'UU.L', 15)
0
16
('something smelly with this symbol, day:', 'UU.L', 16)
0
17
('something smelly with this symbol, day:', 'UU.L', 17)
0
18
('something smelly with this symbol, day:', 'UU.L', 18)
0
19
('something smelly with this symbol, day:', 'UU.L', 19)
0
20
('something smelly with this symbol, day:', 'UU.L', 20)
0
21
('something smelly with this symbol, day:', 'UU.L', 21)
0
22
('something smelly with this symbol, day:', 'UU.L', 22)
0
23
('something smelly with this symbol, day:', 'UU.L', 23)
0
24
('something smelly with this symbol, day:', 'UU.L', 24)
0
25
('something smelly with this symbol, day:', 'UU.L', 25)
0
26
('something smelly with this symbol, day:', 'UU.L', 26)
0
27
('something smelly with this symbol, day:', 'UU.L', 27)
0
28
('something smelly with this symbol, day:', 'UU.L', 28)
0
29
('something smelly with 

In [382]:
good_symbols=list(set(good_symbols_list))
bad_symbols_list=list(set(bad_symbols_list))

In [171]:
bad_symbols=bad_symbols_list.extend(['IEER.L','IOG.L','EGS.L'])

In [209]:
good_symbols_dict={}
for good_symbol in good_symbols:
    symbol_labels_path = os.path.join(labels_path, good_symbol, 'NON_DIRECTIONAL')
    symbol_models_path = os.path.join(features_path, good_symbol,'MODEL_BASED')
    print good_symbol
    print('no of feature dates:', len(os.listdir(symbol_models_path)))
    print('no of labels dates:', len(os.listdir(symbol_labels_path)))
    print('--------------------------------------------------------')
    good_symbols_dict[good_symbol] = len(os.listdir(symbol_labels_path))
    

MKS.L
('no of feature dates:', 110)
('no of labels dates:', 110)
--------------------------------------------------------
DMGOa.L
('no of feature dates:', 120)
('no of labels dates:', 120)
--------------------------------------------------------
RR.L
('no of feature dates:', 99)
('no of labels dates:', 99)
--------------------------------------------------------
DGE.L
('no of feature dates:', 124)
('no of labels dates:', 124)
--------------------------------------------------------
BATS.L
('no of feature dates:', 93)
('no of labels dates:', 93)
--------------------------------------------------------
MAB.L
('no of feature dates:', 118)
('no of labels dates:', 118)
--------------------------------------------------------
KGF.L
('no of feature dates:', 134)
('no of labels dates:', 134)
--------------------------------------------------------
SPT.L
('no of feature dates:', 99)
('no of labels dates:', 99)
--------------------------------------------------------
AZN.L
('no of feature dates:

In [385]:
good_symbols

['MKS.L',
 'DMGOa.L',
 'RR.L',
 'DGE.L',
 'BATS.L',
 'MAB.L',
 'KGF.L',
 'SPT.L',
 'AZN.L',
 'CPI.L',
 'ULVR.L',
 'ECM.L',
 'AV.L',
 'GKN.L',
 'TSCO.L',
 'ITV.L',
 'BARC.L',
 'CPG.L',
 'AAL.L',
 'LGEN.L',
 'LAND.L',
 'VOD.L',
 'HSBA.L',
 'RSA.L',
 'RDSa.L',
 'PRU.L',
 'III.L',
 'REL.L',
 'CNA.L',
 'SHP.L']

In [211]:

     
# Keys they have in common:
set(good_symbols_dict.keys()) & set(all_symbols_hmm.keys())  # set intersection operation
# ... returns {'a', 'b'} for this example


{'AAL.L',
 'AV.L',
 'AZN.L',
 'BARC.L',
 'BATS.L',
 'CNA.L',
 'CPG.L',
 'CPI.L',
 'DGE.L',
 'DMGOa.L',
 'ECM.L',
 'GKN.L',
 'HSBA.L',
 'III.L',
 'ITV.L',
 'KGF.L',
 'LAND.L',
 'LGEN.L',
 'MAB.L',
 'MKS.L',
 'PRU.L',
 'RDSa.L',
 'REL.L',
 'RR.L',
 'RSA.L',
 'SHP.L',
 'SPT.L',
 'TSCO.L',
 'ULVR.L',
 'VOD.L'}

In [212]:
a= good_symbols_dict
b= all_symbols_hmm
# Comparing the values of their common keys:
same = set()
different = dict()
for key in set(a.keys()) & set(b.keys()):
    if a[key] == b[key]:
        same.add(key)
    else:
        if key not in different:
            different[key] = []   # alternatively use defaultdict
        different[key].extend([a[key], b[key]])

# same contains just 'a' an different contains 'b': [2, 22]

In [216]:
same

set()

# checks below###

In [131]:
key='REL.L'
day_no=32
features_tuple= pickle.load(open(all_symbols[key][day_no][1], "rb"))
df = pd.read_csv(all_symbols[key][day_no][0], index_col=0)
features_df = pd.concat([features_tuple[0], features_tuple[1], \
                         features_tuple[2], features_tuple[3]], axis=1, sort=False)
df_w_market_features = MarketFeatures(df=MarketFeatures( \
                   df=MarketFeatures\
                                (df=MarketFeatures(df=df).obv_calc()).chaikin_mf()).ma_spread()).ma_spread_duration()

df_concat = pd.concat([features_df, df_w_market_features], axis=1, sort='False').dropna()





In [132]:
x1=df_concat['label_PrMov__window_5__thres_arbitrary__0.1'].value_counts()[0]

In [133]:
x2=df_concat['label_PrMov__window_5__thres_arbitrary__0.1'].shape[0]
print(x2,x1)

(2251, 2250)


In [66]:
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler()
window=5
y_train = np.asanyarray(df_concat['Duration'].shift(window).dropna())
yp_train = np.asanyarray(df_concat['TradedPrice'].shift(window).dropna())
df_final = df_concat.drop(columns=['TradedPrice', 'Duration', 'TradedTime'])
X = df_final
from sklearn.linear_model import Ridge
X_train_sc= sc.fit_transform(X)
X_train_mms= sc.fit_transform(X)
y_labels= df_final['label_PrMov__window_5__thres_arbitrary__0.1']

In [67]:
test=df_final.filter(regex='label').values
type(test)
index_test= df_final.columns[df_final.columns.str.contains(pat = 'label')].values[0]
index_test

'label_PrMov__window_5__thres_arbitrary__0.1'

In [68]:
 # # Train a SVM classification model
from sklearn.model_selection import ShuffleSplit
param_grid = dict(kernel=["rbf"], C=[1, 5, 10, 25, 100,1000], gamma=[0.0001, 0.001, 0.01])


#SVC is more expensive so we do a lower number of CV iterations:
n_jobs=-1
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
clf = GridSearchCV(SVC(class_weight='balanced'), param_grid, verbose=1, n_jobs=-1, cv=5)

In [69]:
if y_labels.value_counts()[0] == y_labels.shape[0]:
    pass
    #print('houston you have a problem')
else:
    print('get going')
    clf.fit(X_train_mms, y_labels)

In [70]:
best_c = clf.best_params_['C']
best_gamma = clf.best_params_['gamma']
estimator = SVC(C=best_c, cache_size=200, class_weight='balanced', coef0=0.0,
                decision_function_shape='ovr', degree=3, gamma=best_gamma, kernel='rbf',
                max_iter=-1, probability=False, random_state=None, shrinking=True,
                tol=0.001, verbose=False)
estimator.fit(X_train_mms, y_labels)

AttributeError: 'GridSearchCV' object has no attribute 'best_params_'

In [None]:
y_predict_train = clf.predict(X_train_mms)
accuracy_score(y_labels, y_predict_train)

In [None]:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C

np.random.seed(1)

In [None]:
# Instantiate a Gaussian Process model
kernel = C(1.0, (1e-3, 1e3)) * RBF(10, (1e-2, 1e2))
gp = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=9)

# Fit to data using Maximum Likelihood Estimation of the parameters
gp.fit(X_train_mms, y_train)

# Make the prediction on the meshed x-axis (ask for MSE as well)
y_pred, sigma = gp.predict(X_train_mms, return_std=True)


In [82]:
dod_models=os.path.join(data_only_drive,'Data','features_models','models')

In [148]:
model_symbol = 'MKS.L'
symbol_dod_models = os.path.join(dod_models, model_symbol,'SINGLE_KERNEL')
symbol_labels_path = os.path.join(labels_path, model_symbol, 'NON_DIRECTIONAL')
symbol_models_path = os.path.join(features_path, model_symbol,'MODEL_BASED')
models_list = os.listdir(symbol_dod_models)
####
model_idx= 0
model_location = os.path.join(symbol_dod_models, models_list[model_idx])
model_date= models_list[model_idx].split("_")[1]

In [112]:

clfs = pickle.load(open(model_location, "rb"))
svc_clf=clfs['SVC']

In [115]:
svc_clf.fit

<bound method OneVsRestClassifier.get_params of OneVsRestClassifier(estimator=GridSearchCV(cv=10, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
...     pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1),
          n_jobs=1)>

In [104]:
a_list = [os.listdir(symbol_labels_path)[label_idx].split(".")[0] for label_idx, _ in enumerate(os.listdir(symbol_labels_path))]

In [106]:
def fwd_dates(_dates_list, _key_date):
    #returns a list of dates that are forward from the key_date
    fwd_dates_list  = [i for i in _dates_list if i > _key_date]
    return fwd_dates_list

In [169]:
oos_dates_list=fwd_dates(a_list, model_date)
oos_feature_locations = []
len(oos_dates_list)

26

In [164]:
fitted_models_no =list()
for idx, _ in enumerate(all_symbols[model_symbol]):
    print model_symbol
    
    oos_date = all_symbols[model_symbol][idx][0].split("/")[-1].split(".")[0]
    features_tuple= pickle.load(open(all_symbols[model_symbol][idx][1], "rb"))
    df = pd.read_csv(all_symbols[model_symbol][idx][0], index_col=0)
    features_df = pd.concat([features_tuple[0], features_tuple[1], \
                     features_tuple[2], features_tuple[3]], axis=1, sort=False)
    df_w_market_features = MarketFeatures(df=MarketFeatures( \
               df=MarketFeatures\
                            (df=MarketFeatures(df=df).obv_calc()).chaikin_mf()).ma_spread()).ma_spread_duration()

    df_concat = pd.concat([features_df, df_w_market_features], axis=1, sort='False').dropna()
    label_name = str(df_concat.columns[df_concat.columns.str.contains(pat='label')].values[0])

    df_final = df_concat.drop(columns=['TradedPrice', 'Duration', 'TradedTime', 'ReturnTradedPrice',\
                                       'Volume', label_name])
    # get the labels
    X_train = MinMaxScaler().fit_transform(df_final)

    y_labels = df_concat[df_concat.columns[df_concat.columns.str.contains(pat = 'label')]].iloc[:, 0]
    if model_date == oos_date:
        print ('skipping day:', idx)
        continue
    else:
        #  get the label name to use for saving the clf name.
        # this can be done a little better as it is re-used in a few places
        if y_labels.shape[0] - y_labels.value_counts()[0] < 5:
            print(' the ratio of classes is too low:', y_labels.shape[0] - y_labels.value_counts()[0])
            continue
        else:
            print(idx, "would predict here")
            fitted_models_no.append(idx)



print('the end')


MKS.L
(0, 'would predict here')
MKS.L
(1, 'would predict here')
MKS.L
(2, 'would predict here')
MKS.L
(3, 'would predict here')
MKS.L
(4, 'would predict here')
MKS.L
(5, 'would predict here')
MKS.L
(6, 'would predict here')
MKS.L
(7, 'would predict here')
MKS.L
(8, 'would predict here')
MKS.L
(9, 'would predict here')
MKS.L
(10, 'would predict here')
MKS.L
(11, 'would predict here')
MKS.L
(12, 'would predict here')
MKS.L
(13, 'would predict here')
MKS.L
(14, 'would predict here')
MKS.L
('skipping day:', 15)
MKS.L
(16, 'would predict here')
MKS.L
(17, 'would predict here')
MKS.L
(18, 'would predict here')
MKS.L
(19, 'would predict here')
MKS.L
(20, 'would predict here')
MKS.L
(21, 'would predict here')
MKS.L
(22, 'would predict here')
MKS.L
(23, 'would predict here')
MKS.L
(24, 'would predict here')
MKS.L
(25, 'would predict here')
MKS.L
(26, 'would predict here')
MKS.L
(27, 'would predict here')
MKS.L
(28, 'would predict here')
MKS.L
(29, 'would predict here')
MKS.L
(30, 'would predict

In [161]:
fitted_models_no

[]

In [140]:
# comparison and metrics

In [175]:
symbol_features_path

'/media/ak/WorkDrive/Data/features_models/features/SPT.L/MODEL_BASED'

In [189]:
os.listdir(os.path.join(symbol_models_path,os.listdir(symbol_models_path)[0]))[1].split(":_")[-1].split("_.")[0]

'20181229'

In [472]:
for k in all_symbols_d.keys():
    print ('doing this Symbol', k) #for each symbol
    fitted_model_symbol_path = os.path.join(data_only_drive, 'Data', 'features_models', 'models', str(k),'SINGLE_KERNEL')
     

('doing this Symbol', 'RBS.L')
('doing this Symbol', 'CCL.L')
('doing this Symbol', 'CEY.L')
('doing this Symbol', 'LLOY.L')
('doing this Symbol', 'MKS.L')
('doing this Symbol', 'DMGOa.L')
('doing this Symbol', 'RR.L')
('doing this Symbol', 'DGE.L')
('doing this Symbol', 'BATS.L')
('doing this Symbol', 'BLT.L')
('doing this Symbol', 'MAB.L')
('doing this Symbol', 'KGF.L')
('doing this Symbol', 'WPP.L')
('doing this Symbol', 'SGE.L')
('doing this Symbol', 'STAN.L')
('doing this Symbol', 'NG.L')
('doing this Symbol', 'AZN.L')
('doing this Symbol', 'APF.L')
('doing this Symbol', 'CPI.L')
('doing this Symbol', 'ULVR.L')
('doing this Symbol', 'ECM.L')
('doing this Symbol', 'AV.L')
('doing this Symbol', 'GKN.L')
('doing this Symbol', 'SPT.L')
('doing this Symbol', 'PSON.L')
('doing this Symbol', 'RTO.L')
('doing this Symbol', 'TSCO.L')
('doing this Symbol', 'SDR.L')
('doing this Symbol', 'ITV.L')
('doing this Symbol', 'BARC.L')
('doing this Symbol', 'CPG.L')
('doing this Symbol', 'AAL.L')
('

In [478]:
k = 'HSBA.L' 
symbol_model_dates={}
for v in all_symbols_d[k].keys():
    print ('second key', v) # and each out of sample that an hmm was fitted
    for day_no in (all_symbols_d[k].keys()):
        print(day_no)
        print('doing symbol:', k, 'for date:', day_no)

        # this is the location essentially of all the files for that day, i.e all the forward feature files

        oos_file_location = all_symbols_d[k][day_no]
        # if there are no files, just skip
        if len(oos_file_location) == 0:
            continue
        else:
        # otherwise go into that location and pull out all the various files you are working with
            for idx, file_locs in enumerate(oos_file_location):
                print idx
                oos_label_date_no = file_locs[0].split("/")[-1].split(".")[0] # strip out the date- will be used later
                symbol_model_dates[k] = oos_label_date_no
                df_labels = pd.read_csv(file_locs[0], index_col=0)
                with open(file_locs[1], 'rb') as f:
                    features_tuple = pickle.load(f)

                features_df = pd.concat([features_tuple[0], features_tuple[1],\
                                         features_tuple[2], features_tuple[3]], axis=1, sort=False)

                df_w_market_features = MarketFeatures(df=MarketFeatures( \
                    df=MarketFeatures(
                        df=MarketFeatures(df=df_labels).obv_calc()).chaikin_mf()).ma_spread()).ma_spread_duration()

                df_concat = pd.concat([features_df, df_w_market_features], axis=1, sort='False').dropna()

                # ok start putting in the magic
                # y_duration = np.asanyarray(df_concat['Duration'].shift(window).dropna())
                # y_price = np.asanyarray(df_concat['TradedPrice'].shift(window).dropna())

                # drop things we dont need: traded price, duration, traded time, labels etc!
                label_name = str(df_concat.columns[df_concat.columns.str.contains(pat='label')].values[0])

                df_final = df_concat.drop(columns=['TradedPrice', 'Duration', 'TradedTime', 'ReturnTradedPrice', \
                                                   'Volume', label_name])
                # get the labels
                ins
                else:
                    
                    X_train = MinMaxScaler().fit_transform(df_final)

                    y_labels = df_concat[df_concat.columns[df_concat.columns.str.contains(pat='label')]].iloc[:, 0]
            #
            # #  get the label name to use for saving the clf name.oos_date_no
            # # this can be done a little better as it is re-used in a few places
                    if y_labels.shape[0] - y_labels.value_counts()[0] < 5:
                        print(' the ratio of classes is too low. try another label permutation')
                        continue
                    else:
                        models_cls = FitModels(X_train, y_labels)
                        best_clfs = {'SVC': models_cls.best_svm_clf(kernel_choice="rbf"),
                                     'RF_clf': models_cls.best_random_forest_clf()}

                        seq_clf = "_".join((str(k), str(oos_label_date_no), label_name, "clf", ".pickle"))
                        save_loc = os.path.join(fitted_model_symbol_path, seq_clf)
        #                 pickle.dump(best_clfs, open(save_loc, 'wb'))
        #                 print('just saved: ', save_loc)

        #                 #  create a hash like dictionary where you can basically store everything so you can easily access all the models
        #                 model_spec_hash = {'all_symbol_locations': all_symbols_d, 'model_symbol_dates': symbol_model_dates,
        #                                    'labels_config': label_name}

        #                 with open(os.path.join(model_loc, 'clfs_model_hash.pickle'), 'wb') as handle:
        #                     pickle.dump(model_spec_hash, handle, protocol=pickle.HIGHEST_PROTOCOL)



('second key', 0)
0
('doing symbol:', 'HSBA.L', 'for date:', 0)
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
1
('doing symbol:', 'HSBA.L', 'for date:', 1)
0
Fitting 10 folds for each of 36 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   10.4s


KeyboardInterrupt: 

In [479]:
oos_file_location

[('/media/ak/WorkDrive/Data/features_models/labels/HSBA.L/NON_DIRECTIONAL/20170925.csv',
  '/media/ak/WorkDrive/Data/features_models/features/HSBA.L/MODEL_BASED/20170922/HSBA.L_3_states_features_date:_20170925_now:_20181228_.pickle'),
 ('/media/ak/WorkDrive/Data/features_models/labels/HSBA.L/NON_DIRECTIONAL/20170926.csv',
  '/media/ak/WorkDrive/Data/features_models/features/HSBA.L/MODEL_BASED/20170922/HSBA.L_3_states_features_date:_20170926_now:_20181228_.pickle'),
 ('/media/ak/WorkDrive/Data/features_models/labels/HSBA.L/NON_DIRECTIONAL/20170927.csv',
  '/media/ak/WorkDrive/Data/features_models/features/HSBA.L/MODEL_BASED/20170922/HSBA.L_3_states_features_date:_20170927_now:_20181228_.pickle'),
 ('/media/ak/WorkDrive/Data/features_models/labels/HSBA.L/NON_DIRECTIONAL/20170928.csv',
  '/media/ak/WorkDrive/Data/features_models/features/HSBA.L/MODEL_BASED/20170922/HSBA.L_3_states_features_date:_20170928_now:_20181228_.pickle'),
 ('/media/ak/WorkDrive/Data/features_models/labels/HSBA.L/NO