In [39]:
import numpy as np
from hmmlearn import hmm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC

# 1. Utils

In [105]:
# Calculate the accuracy of prediction
def calculate_accuracy(prediction, test):
    prediction = pd.DataFrame(prediction)
    test = pd.DataFrame(test)

    return (prediction == test).sum() / len(prediction)

def calculate_return(prediction, log_rets):
    return (pd.DataFrame(prediction) * pd.DataFrame(log_rets)).sum()

# read csv files and add relevant columns
# if add_direction_col is True, add a prediction column -> indicating tomorrow's price will rise or fall
# if add_original_price_col is True, save original stock_price as a col -> necessary to keep original price after normalization
def prepare_data(filename, stockname='', add_original_price_col=True, add_log_rets=True, add_direction_col=True):
    df = pd.read_csv(filename, index_col=0, parse_dates=True)
    
    if add_original_price_col:
        df['price'] = df[stockname]

    # note that our log_rets is shifted up by 1 compared to lecture
    # this is to facilitate easier calculation of our strategy
    # our log ret implementation -> log of (tomorrow's price / today's price)
    # thus, log_rets * prediction will give the return of today's trading decision (long or short) based on prediction
    if add_log_rets:
        df['log_rets'] = np.log(df[stockname].shift(-1) / df[stockname])

    # note that if this is true, prediction will be right-most col
    if add_direction_col:
        df['direction'] = np.where(df[stockname] <= df[stockname].shift(-1), 1, -1)
    
    return df

# Split data sequentially into X_train, y_train, X_test, y_test (assuming y is right-most col) based on given ration
def sequential_train_test_split(data, ratio = 0.8, get_log_rets=True):
    data = data.dropna()
    df = pd.DataFrame(data.drop(columns=['price', 'log_rets'])).to_numpy()

    # note that we exclude price and log_rets return in our training as they are not normalized
    # they are not normalized as we need their original values for return calculation
    split_index = int(len(df) * ratio)
    train_data, test_data = df[:split_index], df[split_index:]
    X_train, y_train, X_test, y_test = train_data[:, :-1], train_data[:, -1], test_data[:, :-1], test_data[:, -1]

    if get_log_rets:
        log_rets_df = data['log_rets'].to_numpy()
        _, log_rets = log_rets_df[:split_index], log_rets_df[split_index:]
        return X_train, y_train, X_test, y_test, log_rets

    return X_train, y_train, X_test, y_test


In [98]:
data = prepare_data('./data/apple.csv', 'AAPL.O', add_original_price_col=True, add_log_rets=True, add_direction_col=True)
data

Unnamed: 0_level_0,AAPL.O,SlowK,SlowD,RSI,ADX,CCI,Aroon Down,Aroon Up,OBV,Chaikin A/D,SMA,EMA,price,log_rets,direction
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2010-01-01,,,,,,,,,,,,,,,-1
2010-04-01,30.572827,78.7484,82.8722,75.0702,38.0954,96.8893,42.8571,100.0000,8.816356e+10,5.524930e+10,6.9552,6.9908,30.572827,0.001727,1
2010-05-01,30.625684,,,,,,,,,,,,30.625684,-0.016034,-1
2010-06-01,30.138541,89.5060,82.1665,57.5986,35.3551,151.4643,57.1429,100.0000,8.681483e+10,6.043320e+10,7.5207,7.6163,30.138541,-0.001850,-1
2010-07-01,30.082827,11.7287,14.7644,38.9499,24.7259,-152.8759,100.0000,42.8571,8.390127e+10,5.897666e+10,8.1018,7.9595,30.082827,0.006626,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-06-25,182.170000,24.7765,28.7326,33.5305,28.2742,-132.0057,100.0000,14.2857,9.108196e+10,7.277260e+10,44.7743,44.5714,182.170000,0.012330,1
2018-06-26,184.430000,30.9769,28.9171,32.5318,28.2014,-70.9622,92.8571,7.1429,9.118024e+10,7.276743e+10,44.5744,44.3849,184.430000,-0.001465,-1
2018-06-27,184.160000,37.4891,31.0808,42.8627,27.4502,-43.5508,85.7143,0.0000,9.107910e+10,7.267438e+10,44.4033,44.3292,184.160000,0.007250,1
2018-06-28,185.500000,55.4647,41.3102,39.4747,26.9167,-30.3030,78.5714,21.4286,9.114856e+10,7.270291e+10,44.2260,44.2348,185.500000,-0.002105,-1


# 2. Normalization

In [99]:
def normalize(df, exclude_columns = []):
    scaler = MinMaxScaler()

    for i in range(len(df.columns)):
        if df.columns[i] in exclude_columns:
            continue

        normalized_columns = scaler.fit_transform(pd.DataFrame(df.iloc[:, i]))
        df.iloc[:, i] = normalized_columns

    return df

In [100]:
data = normalize(data, ['price', 'log_rets', 'direction'])
data

Unnamed: 0_level_0,AAPL.O,SlowK,SlowD,RSI,ADX,CCI,Aroon Down,Aroon Up,OBV,Chaikin A/D,SMA,EMA,price,log_rets,direction
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2010-01-01,,,,,,,,,,,,,,,-1
2010-04-01,0.018837,0.797082,0.851941,0.769280,0.413634,0.658869,0.428571,1.000000,0.466485,0.197673,0.021918,0.021476,30.572827,0.001727,1
2010-05-01,0.019154,,,,,,,,,,,,30.625684,-0.016034,-1
2010-06-01,0.016229,0.907128,0.844247,0.536156,0.374250,0.748379,0.571429,1.000000,0.417465,0.386577,0.033742,0.034488,30.138541,-0.001850,-1
2010-07-01,0.015895,0.111497,0.109400,0.287327,0.221484,0.249224,1.000000,0.428571,0.311572,0.333500,0.045891,0.041627,30.082827,0.006626,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-06-25,0.929088,0.244971,0.261688,0.215016,0.272481,0.283454,1.000000,0.142857,0.572554,0.836229,0.812646,0.803231,182.170000,0.012330,1
2018-06-26,0.942658,0.308398,0.263699,0.201690,0.271435,0.383573,0.928571,0.071429,0.576126,0.836040,0.808467,0.799351,184.430000,-0.001465,-1
2018-06-27,0.941037,0.375016,0.287289,0.339535,0.260639,0.428531,0.857143,0.000000,0.572450,0.832650,0.804889,0.798193,184.160000,0.007250,1
2018-06-28,0.949083,0.558899,0.398814,0.294329,0.252971,0.450259,0.785714,0.214286,0.574974,0.833689,0.801182,0.796229,185.500000,-0.002105,-1


# 3. Optimization: Machine Learning Model Selection, Feature Selection, Hyperparameters Tuning

In [161]:
def benchmark(model_name, model, log_rets, X_train, y_train, X_test, y_test):
    pred = model.fit(X_train, y_train).predict(X_test)
    acc = calculate_accuracy(pred, y_test)[0]
    log_ret = calculate_return(pred, log_rets)[0]
    up_percentage = ((pred > 0).sum() / len(pred))
    simple_ret = np.exp(log_ret)
    print(f"{model_name}\naccuracy: {acc}\nlog_return: {log_ret}\nsimple_return: {simple_ret}\n up%: {up_percentage}")
    return acc, log_ret, simple_ret

In [139]:
X_train, y_train, X_test, y_test, log_rets = sequential_train_test_split(data)
print(len(y_test))
print(len(log_rets))

363
363


## 3.1 Machine Learning Model Selection

In [140]:
# linear model is excluded due non-linear-separability of data which leads to linear model to always predict all up or all down
model_name = 'SVC Linear Model'
model = SVC(kernel='linear')
benchmark(model_name, model, log_rets, X_train, y_train, X_test, y_test)

model_name = 'SVC Gaussian Model'
model = SVC(kernel='rbf')
benchmark(model_name, model, log_rets, X_train, y_train, X_test, y_test)

model_name = 'SVC Gaussian Model'
model = SVC(kernel='poly')
benchmark(model_name, model, log_rets, X_train, y_train, X_test, y_test)

model_name = 'SVC Sigmoid Model'
model = SVC(kernel='sigmoid', degree=2, C=3/10)
benchmark(model_name, model, log_rets, X_train, y_train, X_test, y_test)

SVC Linear Model
accuracy: 0.4986225895316804
log_return: 0.24198995575396315
simple_return: 1.2737813985781763

SVC Gaussian Model
accuracy: 0.46831955922865015
log_return: -0.4840671360067562
simple_return: 0.6162718264939079

SVC Gaussian Model
accuracy: 0.5206611570247934
log_return: -0.12894069878822068
simple_return: 0.8790260913124078

SVC Sigmoid Model
accuracy: 0.4738292011019284
log_return: -0.4764104347959125
simple_return: 0.6210085464177639



(0.4738292011019284, -0.4764104347959125, 0.6210085464177639)

## 3.2 Feature Selection

In [155]:
import itertools

all_features = list(range(len(data.columns) - 3))

best_acc = 0
best_selected_features_for_acc = []

best_ret = 0
best_selected_features_for_ret = []

for i in range(1, len(data.columns) - 3):
    possible_selected_features = list(itertools.combinations(all_features, i))
    for selected_features in possible_selected_features:
        print("---")
        curr_X_train = X_train[:, selected_features].reshape(-1, len(selected_features))
        curr_X_test = X_test[:, selected_features].reshape(-1, len(selected_features))
        model = SVC(kernel='poly', degree = 2, C= 0.3)
        acc, log_ret, simple_ret = benchmark(f"SVC Poly with Features: {selected_features}", model, log_rets, curr_X_train, y_train, curr_X_test, y_test)

        if acc > best_acc:
            best_acc = acc
            best_selected_features_for_acc = selected_features

        if log_ret > best_ret:
            best_ret = log_ret
            best_selected_features_for_ret = selected_features

        print(f"Current Best Acc: {best_selected_features_for_acc} acc: {best_acc}")
        print(f"Current Best Ret: {best_selected_features_for_ret} log ret: {best_ret} simple ret: {np.exp(best_ret)}")
        print("---")

print(f"Overall Best Acc: {best_selected_features_for_acc} acc: {best_acc}")
print(f"Overall Best Ret: {best_selected_features_for_ret} log ret: {best_ret} simple ret: {np.exp(best_ret)}")

---
SVC Poly with Features: (0,)
accuracy: 0.5013774104683195
log_return: 0.2578317011364651
simple_return: 1.2941210012017605

Current Best Acc: (0,) acc: 0.5013774104683195
Current Best Ret: (0,) log ret: 0.2578317011364651 simple ret: 1.2941210012017605
---
---
SVC Poly with Features: (1,)
accuracy: 0.5013774104683195
log_return: 0.2578317011364651
simple_return: 1.2941210012017605

Current Best Acc: (0,) acc: 0.5013774104683195
Current Best Ret: (0,) log ret: 0.2578317011364651 simple ret: 1.2941210012017605
---
---
SVC Poly with Features: (2,)
accuracy: 0.5013774104683195
log_return: 0.2578317011364651
simple_return: 1.2941210012017605

Current Best Acc: (0,) acc: 0.5013774104683195
Current Best Ret: (0,) log ret: 0.2578317011364651 simple ret: 1.2941210012017605
---
---
SVC Poly with Features: (3,)
accuracy: 0.5013774104683195
log_return: 0.2578317011364651
simple_return: 1.2941210012017605

Current Best Acc: (0,) acc: 0.5013774104683195
Current Best Ret: (0,) log ret: 0.25783170

KeyboardInterrupt: 

## 3.3 Hyperparameter Tuning

In [162]:
import itertools

all_features = list(range(len(data.columns) - 3))

best_acc = 0
best_degree_for_acc = -1
best_C_for_acc = -1

best_ret = 0
best_degree_for_ret= -1
best_C_for_ret = -1

for d in range(1, 11):
    for c in range(1, 101, 1):
        degree = d
        C = c / 10

        print("---")
        model = SVC(kernel='poly', degree = degree, C = C)
        acc, log_ret, simple_ret = benchmark(f"SVC Poly with degree: {degree} C: {C}", model, log_rets, X_train, y_train, X_test, y_test)

        if acc > best_acc:
            best_acc = acc
            best_degree_for_acc = degree
            best_C_for_acc = C

        if log_ret > best_ret:
            best_ret = log_ret
            best_degree_for_ret= degree
            best_C_for_ret = C

        # print(f"Current Best Acc: degree: {best_degree_for_acc} C: {best_C_for_acc} acc: {best_acc}")
        print(d, C)
        print(f"Current Best Ret: degree: {best_degree_for_ret} C: {best_C_for_ret} log ret: {best_ret} simple ret: {np.exp(best_ret)}")
        print("---")

print(f"Overall Best Acc: degree: {best_degree_for_acc} C: {best_C_for_acc} acc: {best_acc}")
print(f"Overall Best Ret: degree: {best_degree_for_ret} C: {best_C_for_ret} log ret: {best_ret} simple ret: {np.exp(best_ret)}")

---
SVC Poly with degree: 1 C: 0.1
accuracy: 0.5013774104683195
log_return: 0.2578317011364651
simple_return: 1.2941210012017605
 up%: 1.0
1 0.1
Current Best Ret: degree: 1 C: 0.1 log ret: 0.2578317011364651 simple ret: 1.2941210012017605
---
---
SVC Poly with degree: 1 C: 0.2
accuracy: 0.5013774104683195
log_return: 0.2578317011364651
simple_return: 1.2941210012017605
 up%: 1.0
1 0.2
Current Best Ret: degree: 1 C: 0.1 log ret: 0.2578317011364651 simple ret: 1.2941210012017605
---
---
SVC Poly with degree: 1 C: 0.3
accuracy: 0.5013774104683195
log_return: 0.2578317011364651
simple_return: 1.2941210012017605
 up%: 1.0
1 0.3
Current Best Ret: degree: 1 C: 0.1 log ret: 0.2578317011364651 simple ret: 1.2941210012017605
---
---
SVC Poly with degree: 1 C: 0.4
accuracy: 0.5013774104683195
log_return: 0.2578317011364651
simple_return: 1.2941210012017605
 up%: 1.0
1 0.4
Current Best Ret: degree: 1 C: 0.1 log ret: 0.2578317011364651 simple ret: 1.2941210012017605
---
---
SVC Poly with degree: 1 

KeyboardInterrupt: 