### Model Selection

#### Cross Validation

In [None]:
#!/usr/bin/python
# -*- coding: utf-8 -*-
# train_test_split.py

from __future__ import print_function

import datetime
import sklearn

from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.lda import LDA
from sklearn.metrics import confusion_matrix
from sklearn.qda import QDA
from sklearn.svm import LinearSVC, SVC
from create_lagged_series import create_lagged_series

# Script entry point: fit several classifiers on a single random
# train/test split of lagged S&P500 returns and print each model's
# hit rate and confusion matrix.
if __name__ == "__main__":
    # Create a lagged series of the S&P500 US stock market index
    snpret = create_lagged_series(
        "^GSPC", datetime.datetime(2001, 1, 10),
        datetime.datetime(2005, 12, 31), lags=5
    )

    # Use the prior two days of returns as predictor
    # values, with direction as the response
    X = snpret[["Lag1", "Lag2"]]
    y = snpret["Direction"]

    # Random train/test split; this is the only split used by the
    # models below, and it must run after X and y have been built.
    # NOTE(review): test_size=0.8 holds out 80% of the rows for testing,
    # so the models are fitted on only 20% of the data -- confirm that
    # this ratio (rather than test_size=0.2) is intended.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.8, random_state=42
    )

    # Create the (parametrised) models
    print("Hit Rates/Confusion Matrices:\n")
    models = [
        ("LR", LogisticRegression()),
        ("LDA", LDA()),
        ("QDA", QDA()),
        ("LSVC", LinearSVC()),
        ("RSVM", SVC(C=1000000.0,
                     cache_size=200,
                     class_weight=None,
                     coef0=0.0,
                     degree=3,
                     gamma=0.0001,
                     kernel='rbf',
                     max_iter=-1,
                     probability=False,
                     random_state=None,
                     shrinking=True,
                     tol=0.001,
                     verbose=False)),
        ("RF", RandomForestClassifier(n_estimators=1000,
                                      criterion='gini',
                                      max_depth=None,
                                      min_samples_split=2,
                                      min_samples_leaf=1,
                                      max_features='auto',
                                      bootstrap=True,
                                      oob_score=False,
                                      n_jobs=1,
                                      random_state=None,
                                      verbose=0)),
    ]

    # Iterate through the models. The loop lives inside the __main__
    # guard because it depends on names that are only created here.
    for name, model in models:
        # Train each of the models on the training set
        model.fit(X_train, y_train)

        # Make an array of predictions on the test set
        pred = model.predict(X_test)

        # Output the hit-rate and the confusion matrix for each model.
        # confusion_matrix expects (y_true, y_pred), so rows of the
        # printed matrix are true classes and columns are predictions.
        print("%s:\n%0.3f" % (name, model.score(X_test, y_test)))
        print("%s\n" % confusion_matrix(y_test, pred))

In [None]:
#!/usr/bin/python
# -*- coding: utf-8 -*-
# k_fold_cross_val.py

from __future__ import print_function

import datetime
import pandas as pd
import sklearn

from sklearn import cross_validation
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC

from create_lagged_series import create_lagged_series

# Script entry point: evaluate a radial SVM on lagged S&P500 returns
# with shuffled 10-fold cross validation, printing the hit rate and
# confusion matrix of every fold.
if __name__ == "__main__":

    # Create a lagged series of the S&P500 US stock market index
    snpret = create_lagged_series("^GSPC",
                                  datetime.datetime(2001, 1, 10),
                                  datetime.datetime(2005, 12, 31),
                                  lags=5)

    # Use the prior two days of returns as predictor
    # values, with direction as the response
    X = snpret[["Lag1", "Lag2"]]
    y = snpret["Direction"]

    # Create a k-fold cross validation object. The deprecated
    # `indices=False` (boolean mask) option is not passed; the default
    # integer positions select the same rows in the same order.
    kf = cross_validation.KFold(len(snpret),
                                n_folds=10,
                                shuffle=True,
                                random_state=42)

    # Use the kf object to create index arrays that
    # state which elements have been retained for training
    # and which elements have been retained for testing
    # for each k-element iteration
    for train_index, test_index in kf:
        # Positional selection with .iloc replaces the deprecated .ix
        X_train = X.iloc[train_index]
        X_test = X.iloc[test_index]
        y_train = y.iloc[train_index]
        y_test = y.iloc[test_index]

        # In this instance only use the
        # Radial Support Vector Machine (SVM)
        print("Hit Rate/Confusion Matrix:")
        model = SVC(C=1000000.0,
                    cache_size=200,
                    class_weight=None,
                    coef0=0.0,
                    degree=3,
                    gamma=0.0001,
                    kernel='rbf',
                    max_iter=-1,
                    probability=False,
                    random_state=None,
                    shrinking=True,
                    tol=0.001,
                    verbose=False)

        # Train the model on the retained training data
        model.fit(X_train, y_train)

        # Make an array of predictions on the test set
        pred = model.predict(X_test)

        # Output the hit-rate and the confusion matrix for this fold.
        # confusion_matrix expects (y_true, y_pred), so rows of the
        # printed matrix are true classes and columns are predictions.
        print("%0.3f" % model.score(X_test, y_test))
        print("%s\n" % confusion_matrix(y_test, pred))

#### Grid Search