#### Python Implementations

In [23]:
from __future__ import print_function
import warnings
warnings.filterwarnings('ignore')

import datetime
import numpy as np
import pandas as pd
import sklearn
import yfinance as yf

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import confusion_matrix
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.svm import LinearSVC, SVC

In [16]:
def create_lagged_series(symbol, start_date, end_date, lags=5):
    """
    Este DataFrame é projetado para armazenar os retornos percentuais 
    do valor de fechamento ajustado de uma ação obtida do Yahoo Finance, 
    juntamente com uma série de retornos atrasados dos dias de negociação 
    anteriores (o padrão é 5 dias). Além disso, o volume de negociação, 
    bem como a direção do dia anterior, também estão incluídos no DataFrame.
    """

    # Obtain stock information from Yahoo Finance
    ts = yf.download(symbol,
                    start_date-datetime.timedelta(days=365),
                    end_date)

    # Create the new lagged DataFrame
    tslag = pd.DataFrame(index=ts.index)
    tslag["Today"] = ts["Adj Close"]
    tslag["Volume"] = ts["Volume"]

    # Create the shifted lag series of prior trading period close values
    for i in range(0, lags):
        tslag["Lag%s" % str(i+1)] = ts["Adj Close"].shift(i+1)

    # Create the returns DataFrame
    tsret = pd.DataFrame(index=tslag.index)
    tsret["Volume"] = tslag["Volume"]
    tsret["Today"] = tslag["Today"].pct_change()*100.0

    # If any of the values of percentage returns equal zero, set them to
    # a small number (stops issues with QDA model in Scikit-Learn)
    for i, x in enumerate(tsret["Today"]):
        if (abs(x) < 0.0001):
            tsret["Today"][i] = 0.0001

    # Create the lagged percentage returns columns
    for i in range(0, lags):
        tsret["Lag%s" % str(i+1)] = \
            tslag["Lag%s" % str(i+1)].pct_change()*100.0

    # Create the "Direction" column (+1 or -1) indicating an up/down day
    tsret["Direction"] = np.sign(tsret["Today"])
    tsret = tsret[tsret.index >= start_date]

    return tsret

In [17]:
# Create a lagged series of the S&P500 US stock market index
snpret = create_lagged_series("^GSPC", 
                                datetime.datetime(2010,1,10),
                                datetime.datetime(2020,12,31), 
                                lags=5)

[*********************100%***********************]  1 of 1 completed


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tsret["Today"][i] = 0.0001


In [18]:
# Use the prior two days of returns as predictor
# values, with direction as the response
X = snpret[["Lag1", "Lag2"]]
y = snpret["Direction"]

In [19]:
# The test data is split into two parts: Before and after 1st Jan 2005.
start_test = datetime.datetime(2020, 1, 1)

In [20]:
# Create training and test sets
X_train = X[X.index < start_test]
X_test = X[X.index >= start_test]
y_train = y[y.index < start_test]
y_test = y[y.index >= start_test]

In [24]:
# Create the (parametrised) models
print("Hit Rates/Confusion Matrices:\n")
models = [("LR", LogisticRegression()),
          ("LDA", LinearDiscriminantAnalysis()),
          ("QDA", QuadraticDiscriminantAnalysis()),
          ("LSVC", LinearSVC()),
          ("RSVM", SVC(C=1000000.0,
                       cache_size=200,
                       class_weight=None,
                       coef0=0.0,
                       degree=3,
                       gamma=0.0001,
                       kernel='rbf',
                       max_iter=-1,
                       probability=False,
                       random_state=None,
                       shrinking=True,
                       tol=0.001,
                       verbose=False)),

          ("RF", RandomForestClassifier(n_estimators=1000,
                                        criterion='gini',
                                        max_depth=None,
                                        min_samples_split=2,
                                        min_samples_leaf=1, max_features='auto',
                                        bootstrap=True,
                                        oob_score=False,
                                        n_jobs=1,
                                        random_state=None,
                                        verbose=0))]

Hit Rates/Confusion Matrices:



In [25]:
# Iterate through the models
for m in models:
    # Train each of the models on the training set
    m[1].fit(X_train, y_train)

    # Make an array of predictions on the test set
    pred = m[1].predict(X_test)

    # Output the hit-rate and the confusion matrix for each model
    print("%s:\n%0.3f" % (m[0], m[1].score(X_test, y_test)))
    print("%s\n" % confusion_matrix(pred, y_test))

LR:
0.611
[[ 13   3]
 [ 95 141]]

LDA:
0.611
[[ 13   3]
 [ 95 141]]

QDA:
0.595
[[  8   2]
 [100 142]]

LSVC:
0.611
[[ 13   3]
 [ 95 141]]

RSVM:
0.591
[[  6   1]
 [102 143]]

RF:
0.544
[[49 56]
 [59 88]]



### END.