### Model Selection

#### Cross Validation

In [1]:
#!/usr/bin/python
# -*- coding: utf-8 -*-
# train_test_split.py

from __future__ import print_function

import datetime
import sklearn

from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.lda import LDA
from sklearn.metrics import confusion_matrix
from sklearn.qda import QDA
from sklearn.svm import LinearSVC, SVC
from create_lagged_series import create_lagged_series

# The test data is split into two parts: Before and after 1st Jan 2005.
start_test = datetime.datetime(2005,1,1)

# Create training and test sets
X_train = X[X.index < start_test]
X_test = X[X.index >= start_test]
y_train = y[y.index < start_test]
y_test = y[y.index >= start_test]


# train_test_split.py
if __name__ == "__main__":
    # Create a lagged series of the S&P500 US stock market index
    snpret = create_lagged_series(
    "^GSPC", datetime.datetime(2001,1,10),
    datetime.datetime(2005,12,31), lags=5
    )
    # Use the prior two days of returns as predictor
    # values, with direction as the response
    X = snpret[["Lag1","Lag2"]]
    y = snpret["Direction"]
    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.8, random_state=42
    )
    
    # Create the (parametrised) models
    print("Hit Rates/Confusion Matrices:\n")
    models = [("LR", LogisticRegression()),
              ("LDA", LDA()),
              ("QDA", QDA()),
              ("LSVC", LinearSVC()),
              ("RSVM", SVC(C = 1000000.0, 
                           cache_size = 200,
                           class_weight = None,
                           coef0 = 0.0, 
                           degree = 3, 
                           gamma = 0.0001,
                           kernel = 'rbf',
                           max_iter =- 1,
                           probability = False,
                           random_state = None,
                           shrinking = True, 
                           tol = 0.001, 
                           verbose = False)),
              ("RF", RandomForestClassifier(n_estimators = 1000, 
                                            criterion = 'gini',
                                            max_depth = None,
                                            min_samples_split = 2,
                                            min_samples_leaf = 1,
                                            max_features = 'auto',
                                            bootstrap = True,
                                            oob_score = False, 
                                            n_jobs = 1,
                                            random_state = None,
                                            verbose = 0))]
    
# Iterate through the models
for m in models:

    # Train each of the models on the training set
    m[1].fit(X_train, y_train)

    # Make an array of predictions on the test set
    pred = m[1].predict(X_test)

    # Output the hit-rate and the confusion matrix for each model
    print("%s:\n%0.3f" % (m[0], m[1].score(X_test, y_test)))
    print("%s\n" % confusion_matrix(pred, y_test))

In [2]:
#!/usr/bin/python
# -*- coding: utf-8 -*-
# k_fold_cross_val.py

from __future__ import print_function

import datetime
import pandas as pd
import sklearn

from sklearn.feature_selection import cross_validation
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC

from create_lagged_series import create_lagged_series

if __name__ == "__main__":

    # Create a lagged series of the S&P500 US stock market index
    snpret = create_lagged_series("^GSPC", 
                                  datetime.datetime(2001,1,10), 
                                  datetime.datetime(2005,12,31), 
                                  lags=5)
    
    # Use the prior two days of returns as predictor
    # values, with direction as the response
    X = snpret[["Lag1","Lag2"]]
    y = snpret["Direction"]
    
    # Create a k-fold cross validation object
    kf = cross_validation.KFold(len(snpret),
                                n_folds = 10,
                                indices = False,
                                shuffle = True,
                                random_state = 42)
    
    # Use the kf object to create index arrays that
    # state which elements have been retained for training
    # and which elements have beenr retained for testing
    # for each k-element iteration
    for train_index, test_index in kf:
        X_train = X.ix[X.index[train_index]]
        X_test = X.ix[X.index[test_index]]
        y_train = y.ix[y.index[train_index]]
        y_test = y.ix[y.index[test_index]]
        
        # In this instance only use the
        # Radial Support Vector Machine (SVM)
        print("Hit Rate/Confusion Matrix:")
        model = SVC(C = 1000000.0, 
                    cache_size = 200,
                    class_weight = None,
                    coef0 = 0.0,
                    degree = 3, 
                    gamma = 0.0001,
                    kernel = 'rbf',
                    max_iter =-1 ,
                    probability = False,
                    random_state = None,
                    shrinking = True,
                    tol = 0.001,
                    verbose = False)
        
        # Train the model on the retained training data
        model.fit(X_train, y_train)
        
        # Make an array of predictions on the test set
        pred = model.predict(X_test)
        
        # Output the hit-rate and the confusion matrix for each model
        print("%0.3f" % model.score(X_test, y_test))
        print("%s\n" % confusion_matrix(pred, y_test))

ImportError: cannot import name 'cross_validation' from 'sklearn.feature_selection' (c:\Users\Caíque Miranda\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\feature_selection\__init__.py)

#### Grid Search

In [None]:
# sklearn.grid_search was removed in scikit-learn 0.20; ParameterGrid is
# now provided by sklearn.model_selection.
from sklearn.model_selection import ParameterGrid

# Candidate hyper-parameter values; ParameterGrid expands them into every
# combination (4 values of C x 2 values of gamma).
param_grid = {'C': [1, 10, 100, 1000],
              'gamma': [0.001, 0.0001]}

list(ParameterGrid(param_grid))

In [None]:
#!/usr/bin/python
# -*- coding: utf-8 -*-
# grid_search.py

from __future__ import print_function

import datetime
import sklearn

from sklearn import cross_validation
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC

from create_lagged_series import create_lagged_series


if __name__ == "__main__":
    
    # Create a lagged series of the S&P500 US stock market index
    snpret = create_lagged_series("^GSPC", 
                                  datetime.datetime(2001,1,10),
                                  datetime.datetime(2005,12,31), lags=5)
    
    # Use the prior two days of returns as predictor
    # values, with direction as the response
    X = snpret[["Lag1","Lag2"]]
    y = snpret["Direction"]
    
    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                        test_size=0.5,
                                                        random_state=42)
    
    # Set the parameters by cross-validation
    tuned_parameters = [{'kernel': ['rbf'], 
                         'gamma': [1e-3, 1e-4],
                         'C': [1, 10, 100, 1000]}]
    
    # Perform the grid search on the tuned parameters
    model = GridSearchCV(SVC(C=1), tuned_parameters, cv=10)
    model.fit(X_train, y_train)
    
    print("Optimised parameters found on training set:")
    print(model.best_estimator_, "\n")
    print("Grid scores calculated on training set:")
    
    for params, mean_score, scores in model.grid_scores_:
        print("%0.3f for %r" % (mean_score, params))
        
        
# Optimised parameters found on training set:
# (Sample output pasted from a run of the grid search above. As written it is
# a live expression: it builds an SVC with these settings and discards it.)

SVC(C = 1,
    cache_size = 200, 
    class_weight = None,
    coef0 = 0.0,
    degree = 3,
    gamma = 0.001,
    kernel = 'rbf',
    max_iter =- 1,
    probability = False,
    random_state = None,
    shrinking = True,
    tol = 0.001,
    verbose = False)

# Grid scores calculated on training set:
# (Sample output kept as a bare string literal; it is evaluated and discarded.
# Each line reads "<mean score> for <parameter combination>".)
"""
0.541 for {'kernel': 'rbf', 'C': 1, 'gamma': 0.001}
0.541 for {'kernel': 'rbf', 'C': 1, 'gamma': 0.0001}
0.541 for {'kernel': 'rbf', 'C': 10, 'gamma': 0.001}
0.541 for {'kernel': 'rbf', 'C': 10, 'gamma': 0.0001}
0.541 for {'kernel': 'rbf', 'C': 100, 'gamma': 0.001}
0.541 for {'kernel': 'rbf', 'C': 100, 'gamma': 0.0001}
0.538 for {'kernel': 'rbf', 'C': 1000, 'gamma': 0.001}
0.541 for {'kernel': 'rbf', 'C': 1000, 'gamma': 0.0001}
"""


### Optimising Strategies

In [None]:
from scripts.intraday_mr import *

if __name__ == "__main__":
    # Backtest configuration
    csv_dir = 'stocks_data/'  # CHANGE THIS!
    symbol_list = ['AREX', 'WLL']
    initial_capital = 100000.0
    heartbeat = 0.0
    start_date = datetime.datetime(2007, 11, 8, 10, 41, 0)

    # Candidate values for each strategy parameter
    strat_lookback = [50, 100, 200]
    strat_z_entry = [2.0, 3.0, 4.0]
    strat_z_exit = [0.5, 1.0, 1.5]

    # Every (lookback, entry, exit) combination; `product` is expected to
    # be the itertools cartesian product brought in by the star import.
    strat_params_list = list(
        product(strat_lookback, strat_z_entry, strat_z_exit)
    )

    # Turn each combination into the keyword arguments that the strategy
    # receives
    strat_params_dict_list = [
        {"ols_window": lookback, "zscore_high": z_entry, "zscore_low": z_exit}
        for lookback, z_entry, z_exit in strat_params_list
    ]

    # Carry out the set of backtests for all parameter combinations
    backtest = Backtest(
        csv_dir,
        symbol_list,
        initial_capital,
        heartbeat,
        start_date,
        HistoricCSVDataHandlerHFT,
        SimulatedExecutionHandler,
        PortfolioHFT,
        IntradayOLSMRStrategy,
        strat_params_list=strat_params_dict_list,
    )

    backtest.simulate_trading()

In [None]:
from scripts.backtest import *

def _generate_trading_instances(self, strategy_params_dict):
    """
    Instantiate the backtest components for one parameter set.

    Builds a data handler, strategy, portfolio and execution handler from
    the class objects stored on ``self`` and stores each new instance back
    on ``self``. ``strategy_params_dict`` is expanded into keyword
    arguments for the strategy constructor.
    """
    print("Creating DataHandler, Strategy, Portfolio and ExecutionHandler for")
    print("strategy parameter list: %s..." % strategy_params_dict)

    # The same event queue is handed to every component.
    events = self.events

    # The data handler comes first: strategy and portfolio both take it.
    handler = self.data_handler_cls(
        events, self.csv_dir, self.symbol_list, self.header_strings
    )
    self.data_handler = handler

    self.strategy = self.strategy_cls(handler, events, **strategy_params_dict)

    self.portfolio = self.portfolio_cls(
        handler,
        events,
        self.start_date,
        self.num_strats,
        self.periods,
        self.initial_capital,
    )

    self.execution_handler = self.execution_handler_cls(events)
    
def simulate_trading(self):
    """
    Simulates the backtest for every parameter set and outputs
    portfolio performance.

    For each dictionary in ``self.strat_params_list`` the trading
    instances are rebuilt, the backtest is run, the statistics are
    pretty-printed, and one CSV row is appended to "output/opt.csv":

        ols_window,zscore_high,zscore_low,tot_ret,cagr,sharpe,max_dd,dd_dur

    The statistics are read from the second element of the first five
    entries returned by ``self._output_performance()``; percentage values
    have their "%" sign stripped before conversion.
    """
    # The context manager guarantees the file is flushed and closed even
    # if one of the backtests raises part-way through the sweep.
    with open("output/opt.csv", "w") as out:
        spl = len(self.strat_params_list)

        for i, sp in enumerate(self.strat_params_list):
            print("Strategy %s out of %s..." % (i+1, spl))
            self._generate_trading_instances(sp)
            self._run_backtest()
            stats = self._output_performance()
            pprint.pprint(stats)

            tot_ret = float(stats[0][1].replace("%", ""))
            cagr = float(stats[1][1].replace("%", ""))
            sharpe = float(stats[2][1])
            max_dd = float(stats[3][1].replace("%", ""))
            dd_dur = int(stats[4][1])

            out.write("%s,%s,%s,%s,%s,%s,%s,%s\n" % (sp["ols_window"],
                                                     sp["zscore_high"],
                                                     sp["zscore_low"],
                                                     tot_ret,
                                                     cagr,
                                                     sharpe,
                                                     max_dd,
                                                     dd_dur))

#### Visualisation

In [None]:
#!/usr/bin/python
# -*- coding: utf-8 -*-
# plot_sharpe.py

import matplotlib.pyplot as plt
import numpy as np

def create_data_matrix(csv_ref, col_index, n_rows=3, n_cols=3):
    """
    Arrange one column of parsed CSV rows into a 2-D float grid.

    The first ``n_rows * n_cols`` entries of ``csv_ref`` are laid out in
    row-major order: entry ``i * n_cols + j`` lands in cell ``[i, j]``.

    Parameters:
    csv_ref - Sequence of rows, each an indexable sequence of values
        that ``float`` can convert.
    col_index - Position of the value to extract from every row.
    n_rows, n_cols - Shape of the grid. The defaults reproduce the
        original fixed 3x3 layout.

    Returns:
    A numpy array of shape (n_rows, n_cols) holding floats.

    Raises:
    IndexError if ``csv_ref`` holds fewer than ``n_rows * n_cols`` rows.
    """
    needed = n_rows * n_cols
    if len(csv_ref) < needed:
        # Fail with an explicit message rather than an opaque
        # "list index out of range" from inside the loop.
        raise IndexError(
            "need %d rows to fill a %dx%d grid, got %d"
            % (needed, n_rows, n_cols, len(csv_ref))
        )

    data = np.zeros((n_rows, n_cols))
    for i in range(n_rows):
        for j in range(n_cols):
            data[i, j] = float(csv_ref[i * n_cols + j][col_index])

    return data

if __name__ == "__main__":

    # Open the CSV file and obtain only the lines
    # with a lookback value of 100. The context manager closes the file,
    # and matching on "100," avoids also picking up e.g. "1000,...".
    with open("stocks_data/opt.csv", "r") as csv_handle:
        csv_file = csv_handle.readlines()
    csv_ref = [c.strip().split(",") for c in csv_file if c.startswith("100,")]

    # Column 5 of each row is plotted as the Sharpe ratio
    data = create_data_matrix(csv_ref, 5)
    fig, ax = plt.subplots()
    heatmap = ax.pcolor(data, cmap=plt.cm.Blues)

    # row_labels are drawn along the x axis (exit thresholds) and
    # column_labels along the y axis (entry thresholds), matching the
    # axis titles set below.
    row_labels = [0.5, 1.0, 1.5]
    column_labels = [2.0, 3.0, 4.0]

    # Annotate every cell with its value. Dedicated loop names avoid
    # overwriting any notebook-level `x` / `y` variables.
    for row in range(data.shape[0]):
        for col in range(data.shape[1]):
            plt.text(col + 0.5, row + 0.5, '%.2f' % data[row, col],
                     horizontalalignment='center',
                     verticalalignment='center',)

    plt.colorbar(heatmap)

    # x ticks run over the columns (shape[1]), y ticks over the rows
    # (shape[0]); the +0.5 centres each tick on its cell.
    ax.set_xticks(np.arange(data.shape[1])+0.5, minor=False)
    ax.set_yticks(np.arange(data.shape[0])+0.5, minor=False)

    ax.set_xticklabels(row_labels, minor=False)
    ax.set_yticklabels(column_labels, minor=False)

    plt.suptitle('Sharpe Ratio Heatmap', fontsize=18)
    plt.xlabel('Z-Score Exit Threshold', fontsize=14)
    plt.ylabel('Z-Score Entry Threshold', fontsize=14)
    plt.show()

In [None]:
#!/usr/bin/python
# -*- coding: utf-8 -*-
# plot_drawdown.py

import matplotlib.pyplot as plt
import numpy as np

def create_data_matrix(csv_ref, col_index, n_rows=3, n_cols=3):
    """
    Arrange one column of parsed CSV rows into a 2-D float grid.

    The first ``n_rows * n_cols`` entries of ``csv_ref`` are laid out in
    row-major order: entry ``i * n_cols + j`` lands in cell ``[i, j]``.

    Parameters:
    csv_ref - Sequence of rows, each an indexable sequence of values
        that ``float`` can convert.
    col_index - Position of the value to extract from every row.
    n_rows, n_cols - Shape of the grid. The defaults reproduce the
        original fixed 3x3 layout.

    Returns:
    A numpy array of shape (n_rows, n_cols) holding floats.

    Raises:
    IndexError if ``csv_ref`` holds fewer than ``n_rows * n_cols`` rows.
    """
    needed = n_rows * n_cols
    if len(csv_ref) < needed:
        # Fail with an explicit message rather than an opaque
        # "list index out of range" from inside the loop.
        raise IndexError(
            "need %d rows to fill a %dx%d grid, got %d"
            % (needed, n_rows, n_cols, len(csv_ref))
        )

    data = np.zeros((n_rows, n_cols))
    for i in range(n_rows):
        for j in range(n_cols):
            data[i, j] = float(csv_ref[i * n_cols + j][col_index])
    return data

if __name__ == "__main__":

    # Open the CSV file and obtain only the lines
    # with a lookback value of 100. The context manager closes the file,
    # and matching on "100," avoids also picking up e.g. "1000,...".
    with open("stocks_data/opt.csv", "r") as csv_handle:
        csv_file = csv_handle.readlines()
    csv_ref = [c.strip().split(",") for c in csv_file if c.startswith("100,")]

    # Column 6 of each row is plotted as the maximum drawdown
    data = create_data_matrix(csv_ref, 6)
    fig, ax = plt.subplots()
    heatmap = ax.pcolor(data, cmap=plt.cm.Reds)

    # row_labels are drawn along the x axis (exit thresholds) and
    # column_labels along the y axis (entry thresholds), matching the
    # axis titles set below.
    row_labels = [0.5, 1.0, 1.5]
    column_labels = [2.0, 3.0, 4.0]

    # Annotate every cell with its value as a percentage. The alignment
    # was misspelt 'cente', which matplotlib rejects with a ValueError.
    # Dedicated loop names avoid overwriting any notebook-level
    # `x` / `y` variables.
    for row in range(data.shape[0]):
        for col in range(data.shape[1]):
            plt.text(col + 0.5, row + 0.5, '%.2f%%' % data[row, col],
                     horizontalalignment='center',
                     verticalalignment='center',)

    plt.colorbar(heatmap)

    # x ticks run over the columns (shape[1]), y ticks over the rows
    # (shape[0]); the +0.5 centres each tick on its cell.
    ax.set_xticks(np.arange(data.shape[1])+0.5, minor=False)
    ax.set_yticks(np.arange(data.shape[0])+0.5, minor=False)
    ax.set_xticklabels(row_labels, minor=False)
    ax.set_yticklabels(column_labels, minor=False)

    plt.suptitle('Maximum Drawdown Heatmap', fontsize=18)
    plt.xlabel('Z-Score Exit Threshold', fontsize=14)
    plt.ylabel('Z-Score Entry Threshold', fontsize=14)
    plt.show()

### End.