In [1]:
import pickle
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_recall_fscore_support
from sklearn.model_selection import KFold, cross_val_score
from sklearn.multiclass import OneVsRestClassifier #support from multiclass

from sklearn.svm import SVC


sc = StandardScaler()

In [2]:

####paths####
labels_path = '/home/ak/Documents/Data/features_models/labels'
main_path = '/home/ak/Documents/Data/features_models/'

models_path=os.path.join(main_path,'models')


In [5]:
os.listdir('/media/ak/WorkDrive/Data/SYNT_2states')

OSError: [Errno 2] No such file or directory: '/media/ak/WorkDrive/Data/SYNT_2states'

In [8]:
#####OOP#####
class DataLoader(object):
    def __init__(self, path_, ticker):
        self.main_path = path_
        self.ticker = ticker
        self.labels_path = os.path.join(self.main_path, 'labels')
        self.features_path = os.path.join(self.main_path, 'features')
        self.ticker_labels_path = os.path.join(self.labels_path, self.ticker)
        self.ticker_features_path = os.path.join(self.features_path, self.ticker)

    def ticker_features(self, date):
        file_loc = os.path.join(self.ticker_features_path, str(date) + '.pickle')
        with open(file_loc, 'rb') as handle:
            ticker_features = pickle.load(handle)
        return ticker_features

    def ticker_labels_pickle(self, date):
        file_loc = os.path.join(self.ticker_labels_path, str(date) + '.pickle')
        with open(file_loc, 'rb') as handle:
            ticker_labels = pickle.load(handle)
        return ticker_labels

    def ticker_labels_csv(self, date):
        file_loc = os.path.join(self.ticker_labels_path, str(date) + '.csv')
        ticker_labels = pd.read_csv(file_loc, index_col=0)
        return ticker_labels

    @staticmethod
    def open_pickle_file(path, pickle_file):
        file_loc = os.path.join(path, pickle_file)
        pickle_to_file = pickle.load(open(file_loc, "rb"))
        return pickle_to_file

    @staticmethod
    def get_date_from_file(file_, numb_):
        return os.path.splitext(file_[numb_])[0]


class PriceIndicators(object):
    # a class to be expanded that uses features for base case -price only-indicators
    """"Requires:
    symbol - A stock symbol on which to form a strategy on.
    short_window - Lookback period for short moving average.
    long_window - Lookback period for long moving average."""

    def __init__(self, symbol, labels_df):
        self.symbol = symbol
        self.labels = labels_df

    def MACD(self, short_window=5, long_window=20):
        short_rolling_px = self.labels['TradedPrice'].rolling(window=short_window).mean()
        long_rolling_px = self.labels['TradedPrice'].rolling(window=long_window).mean()
        px_indx = long_rolling_px - short_rolling_px
        return px_indx


class FitModels(object):

    def __init__(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train
    # # Train a SVM classification model

    def best_kernel_ridge(self, kernel_choice):

        kr_clf =OneVsRestClassifier(GridSearchCV(KernelRidge(kernel=str(kernel_choice)), cv=10,
                          param_grid={"alpha": [1e0, 0.1, 1e-2, 1e-3],
                                      "gamma": np.logspace(-2, 2, 5)})).fit(self.X_train, self.y_train)

        return kr_clf

    def best_svm_clf(self, kernel_choice):

        param_grid = dict(kernel=[str(kernel_choice)],
                          C=[1, 5, 10, 25, 50, 100],
                          gamma=[0.0001, 0.001, 0.01, 0.02, 0.05, 0.01])

        clf = OneVsRestClassifier(
            GridSearchCV(SVC(class_weight='balanced'), param_grid, verbose=1, n_jobs=-1, cv=10))\
            .fit(self.X_train, self.y_train)
        return clf

    def best_gradient_boost_clf(self):
        #this needs to be written properly- just a baseline placeholder here!
        GBR = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                           max_depth=4, max_features='sqrt',
                                           min_samples_leaf=15, min_samples_split=10,loss='huber',
                                        random_state=5)

        gb_boost_clf = OneVsRestClassifier(GBR).fit(self.X_train, self.y_train)

        return gb_boost_clf
    def best_MKL_clf(self):
        pass
    def best_knn_clf(self):
        pass
    def best_random_forest_clf(self):
        pass


    def run_cv(self, clf_class, **kwargs):
        # Construct a kfolds object
        kf = KFold(len(self.y_train), n_folds=10, shuffle=True)
        y_pred = self.y_train.copy()

        # Iterate through folds
        for train_index, test_index in kf:
            X_train_local, X_test_local = self.X_train[train_index], self.X_train[test_index]
            y_train_local = self.y_train[train_index]
            # Initialize a classifier with key word arguments
            clf = clf_class(**kwargs)
            clf.fit(self.X_train, self.y_train)
            y_pred[test_index] = clf.predict(X_test_local)
        return y_pred


class PredictModels(FitModels):
    def __init__(self):
        pass

def no_nans(label):
    return np.sum(np.isnan(label))


def remove_last_element(arr):
    return arr[np.arange(arr.size - 1)]


In [9]:
def fwd_dates(_dates_list, _key_date):
    #returns a list of dates that are forward from the key_date
    fwd_dates_list  = [i for i in _dates_list if i > _key_date]
    return fwd_dates_list

def prec_recall_report(y_true_, y_predict_):
    #function to ge the sci-kit learn classification metrics into a pretty DF for csv!
    report = pd.DataFrame(list(precision_recall_fscore_support(y_true_, y_predict_)),
                          index=['Precision', 'Recall', 'F1-score', 'Support']).T
    # Now add the 'Avg/Total' row
    report.loc['Avg/Total', :] = precision_recall_fscore_support(y_true_, y_predict_, average='weighted')
    report.loc['Avg/Total', 'Support'] = report['Support'].sum()
    return report

In [15]:


# if __name__ == '__main__':

###below may be unnecessary

ticker = 'SYNT_3states'

features_path = os.path.join(main_path, 'features')

ticker_labels_path = os.path.join(labels_path, ticker)
ticker_models_path = os.path.join(models_path, ticker)

ticker_features_path = os.path.join(features_path, ticker)

###

# list of files    # list of files
labels_list = os.listdir(ticker_labels_path)
# clfs=[LR, GBC, RF, KNN]

features_list = os.listdir(ticker_features_path)
models_list= os.listdir(ticker_models_path)

####
data_cls = DataLoader(path_=main_path, ticker=ticker)
#     idx = 1  # take first label-index for the data-frame of labels
#     no_files=len(features_list)
#     ticker_models_path = os.path.join(models_path, ticker)
#     ###


In [16]:
list_models_dates=[]
for model_id, _ in enumerate(models_list):
    list_models_dates.append(models_list[model_id].split("_")[0])

# [mm.split("_")[0] for mm in models_list]
# result=fwd_dates(list_models_dates,list_models_dates[0])

In [17]:
#datex=0 #which date we pick
import time
start = time.time()

for datex in range(0,10):
    print("the model date is:", list_models_dates[datex])
    fwd_dates_list_for_date = fwd_dates(list_models_dates,list_models_dates[datex]) #dates for OOS

    model_= models_list[datex] #model we will test

    loaded_models=data_cls.open_pickle_file(ticker_models_path, model_)#load models

    best_gboost=loaded_models['GBOOST']#best gradient boosting model
    best_kr = loaded_models['KR'] #best kernel ridge
    best_svc= loaded_models['SVC'] #best svc

    #######start testing
    for test_date_x in range(0,10):
        test_date = fwd_dates_list_for_date[test_date_x]
        print("you are testing on:", test_date)
        
        test_features_pickle=data_cls.ticker_features(test_date)
        test_labels = data_cls.ticker_labels_csv(date=test_date)
        #one liner converting tuple into pandas dataframe of all the features
        df_features_ = pd.concat([test_features_pickle[item]for item in range(0,4)], axis=1,sort=False) 
        #abels
        all_labels=test_labels.drop(columns=['Duration','ReturnTradedPrice','states','TradedTime','TradedPrice','ticker'], axis=1)
        idx=1
        df_labels = all_labels.iloc[:, 0:idx]
        df_concat=pd.concat([df_features_, df_labels], axis=1, sort=False)
        df= df_concat.dropna()
        label_column_loc = df.shape[1] - 1 #location of labels column
        labels = df.iloc[:, label_column_loc:label_column_loc + 1]
        features = df.drop(df.columns[label_column_loc], axis=1)
        y_predict_gboost =best_gboost.predict(features)
        y_predict_svc = best_svc.predict(features)
        y_predict_kr =best_kr.predict(features)
        print(prec_recall_report(labels,y_predict_gboost))

end = time.time()
print(end - start)

IndexError: list index out of range

put this all below in a dictionary so you can manipulate it:
    print('ROC AUC: %.3f' % roc_auc_score(y, y_predict))
        print('Accuracy: %.2f' % accuracy_score(y, y_predict))
        print('Precision: %.3f' % precision_score(y_true=y, y_pred=y_predict))
        print('Recall: %.3f' % recall_score(y_true=y, y_pred=y_predict))
        print('F1: %.3f' % f1_score(y_true=y, y_pred=y_predict))
     

_model_results= {
    'clfs': np.empty((M, T)),
    'model_date': np.empty((M, T)),
    'data_date': np.empty((M, T)),
    'ROC': np.empty((M, T)),
    'accuracy' :np.empty((M, T))
}

##unit test for sequential prediction in sklearn###

In [37]:
no_of_steps =features.shape[0]


In [20]:
best_gboost.predict(features[:1])

array([1.])

In [41]:
for step in range(1,5):
    y_predict=best_svc.predict(features[:step])
    print y_predict


[0.]
[0. 0.]
[0. 0. 0.]
[0. 0. 0. 0.]


In [11]:
best_svc.predict(features)

array([0., 0., 0., ..., 0., 0., 0.])