In [1]:
import numpy as np
from hmmlearn import hmm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [50]:
# Calculate the accuracy of prediction
def calculate_accuracy(prediction, test):
    """Return the fraction of predictions that equal the reference labels.

    Both inputs are wrapped in DataFrames, so the result is a pandas Series
    holding one accuracy value per column (callers read element ``[0]`` for
    1-D inputs).
    """
    predicted_frame = pd.DataFrame(prediction)
    reference_frame = pd.DataFrame(test)

    # Element-wise equality -> count of hits per column -> share of rows.
    hits_per_column = (predicted_frame == reference_frame).sum()
    return hits_per_column / len(predicted_frame)

def calculate_return(prediction, log_rets):
    """Sum the log returns earned by trading on ``prediction``.

    Each prediction (+1 long / -1 short) is multiplied by the log return of
    the same row and the products are summed per column. The result is a
    pandas Series; NaN products are skipped by ``sum``.
    """
    positions = pd.DataFrame(prediction)
    period_log_rets = pd.DataFrame(log_rets)
    realised = positions * period_log_rets
    return realised.sum()

# read csv files and add relevant columns
# if add_direction_col is True, add a prediction column -> indicating tomorrow's price will rise or fall
# if add_original_price_col is True, save original stock_price as a col -> necessary to keep original price after normalization
def prepare_data(filename, stockname='', add_original_price_col=True, add_log_rets=True, add_direction_col=True):
    """Load a stock CSV and append the derived columns used by the strategy.

    The CSV is read with its first column as a parsed date index and rows
    containing missing values are dropped. Depending on the flags, these
    columns are appended in this order:

    * ``price``     - copy of the ``stockname`` column (kept so the raw price
                      survives later normalization).
    * ``log_rets``  - log(next row's price / this row's price).
    * ``direction`` - 1 if the next row's price is >= this row's, else -1.

    Returns the resulting DataFrame.
    """
    frame = pd.read_csv(filename, index_col=0, parse_dates=True).dropna()

    if add_original_price_col:
        frame['price'] = frame[stockname]

    # The log return is shifted up by one row compared to the usual
    # definition: each row holds the return from this row to the next one.
    # That way log_rets * prediction is directly the payoff of the long/short
    # decision taken on that row.
    # The last row has no successor, so its log return is NaN.
    if add_log_rets:
        next_price = frame[stockname].shift(-1)
        frame['log_rets'] = np.log(next_price / frame[stockname])

    # When enabled this ends up as the right-most column.
    # On the last row the comparison against NaN is False, so it is labelled -1.
    if add_direction_col:
        not_falling = frame[stockname] <= frame[stockname].shift(-1)
        frame['direction'] = np.where(not_falling, 1, -1)

    return frame

# Split data sequentially into X_train, y_train, X_test, y_test (assuming y is the right-most col) based on the given ratio
def sequential_train_test_split(data, ratio = 0.8, get_log_rets=True):
    """Split ``data`` chronologically (no shuffling) into train and test parts.

    The ``price`` and ``log_rets`` columns are removed before splitting: they
    are kept un-normalized for return calculation and are not model inputs.
    Of the remaining columns, the right-most one is the target ``y`` and the
    rest form ``X``.

    Returns ``(X_train, y_train, X_test, y_test)`` as numpy arrays; when
    ``get_log_rets`` is true, the test-period slice of ``log_rets`` is
    appended as a fifth element.
    """
    model_matrix = pd.DataFrame(data.drop(columns=['price', 'log_rets'])).to_numpy()

    # The first `ratio` share of the rows is used for training, the rest for testing.
    cut = int(len(model_matrix) * ratio)
    head, tail = model_matrix[:cut], model_matrix[cut:]
    splits = (head[:, :-1], head[:, -1], tail[:, :-1], tail[:, -1])

    if not get_log_rets:
        return splits

    # Only the test-period log returns are handed back.
    test_log_rets = data['log_rets'].to_numpy()[cut:]
    return (*splits, test_log_rets)

# Calculate basic info: the "perfect" return on the whole and test data, and the "normal" return, where the strategy is simply long on every row until the next trade
def basic_info(log_rets, log_rets_test):
    """Print and return reference returns for the whole and the test data.

    * "normal" return  - exp(sum of log returns), i.e. always being long.
    * "perfect" return - exp(sum of |log returns|), i.e. every guess right.

    The summed test log return is printed first, followed by the four
    figures. Returns
    ``(perfect_ret, perfect_ret_test, normal_ret, normal_ret_test)``.
    """
    test_log_total = log_rets_test.sum()
    print(test_log_total)

    # A perfect forecaster earns the absolute value of every period's log return.
    perfect_ret, perfect_ret_test = (
        np.exp(np.abs(series).sum()) for series in (log_rets, log_rets_test)
    )

    # Always-long baseline.
    normal_ret = np.exp(log_rets.sum())
    normal_ret_test = np.exp(test_log_total)

    print(f"perfect_return: {perfect_ret}\nperfect_return_test: {perfect_ret_test}\nnormal_return: {normal_ret}\nnormal_return_test: {normal_ret_test}\n")
    return perfect_ret, perfect_ret_test, normal_ret, normal_ret_test

def normalize(df, exclude_columns=None):
    """Min-max scale every column of ``df`` except those in ``exclude_columns``.

    Each remaining column is scaled independently with ``MinMaxScaler``
    (fitted on that column alone). ``df`` is modified in place and also
    returned, so existing callers that rely on either keep working.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are to be scaled.
    exclude_columns : container of column labels, optional
        Columns to leave untouched. Defaults to excluding nothing.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the scaled columns replaced.
    """
    # Use None as the sentinel instead of a shared mutable [] default.
    excluded = () if exclude_columns is None else exclude_columns

    scaler = MinMaxScaler()

    for position, column_name in enumerate(df.columns):
        if column_name in excluded:
            continue

        # Double brackets keep the column 2-D, as the scaler expects.
        scaled = scaler.fit_transform(df.iloc[:, [position]])

        # isetitem swaps in a new column array instead of writing the floats
        # into the existing one, so integer columns are not assigned values
        # of an incompatible dtype (which `df.iloc[:, i] = ...` warns about in
        # recent pandas). Positional access keeps duplicate labels working.
        df.isetitem(position, scaled.ravel())

    return df

def benchmark(model_name, model, log_rets, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training split, score it on ``X_test`` and report.

    To evaluate on the whole dataset instead of the held-out part, pass the
    full feature matrix (with matching ``y_test`` / ``log_rets``) as ``X_test``.

    Prints the accuracy, the summed log return of trading on the
    predictions, the corresponding simple return and the share of "up"
    predictions. Returns ``(acc, log_ret, simple_ret)``.
    """
    fitted_model = model.fit(X_train, y_train)
    pred = fitted_model.predict(X_test)

    # Both helpers return a one-element Series for 1-D input; take the scalar.
    acc = calculate_accuracy(pred, y_test)[0]
    log_ret = calculate_return(pred, log_rets)[0]
    simple_ret = np.exp(log_ret)

    # Share of predictions that are positive (i.e. "price goes up").
    up_count = (pred > 0).sum()
    up_percentage = (up_count / len(pred))

    print(f"{model_name}\naccuracy: {acc}\nlog_return: {log_ret}\nsimple_return: {simple_ret}\nup%: {up_percentage}\n")
    return acc, log_ret, simple_ret


In [22]:
import os

# One entry per stock: [price column name in the CSV, path to the CSV,
# positional indices of the feature columns fed to the model].
stocks = [
    ['AAPL.O', './data-collection/data/apple.csv', [1, 5, 11, 13]],
    ['AMZN.O', './data-collection/data/amazon.csv', [3, 4, 6, 7, 8, 9, 10, 11, 15]],
    ['MSFT.O', './data-collection/data/microsoft.csv', [0, 1, 2, 3, 4, 6, 7, 9, 10, 11, 12, 13, 14, 15]],
    ['INTC.O', './data-collection/data/intel.csv', [0, 2, 6, 8, 9, 12, 15]],
    ['GS.N', './data-collection/data/gs.csv', [1, 2, 3, 5, 6, 8, 9, 13]],
]


In [70]:
# Evaluate a polynomial-kernel SVC on every configured stock and collect the
# per-stock metrics into a summary table.

# SVC hyper-parameters are the same for every stock, so they are set once here
# (and no longer share a name with one of basic_info's return values).
degree = 2
svc_penalty = 0.3

results = []
features = []

# Iterate over the configured stocks directly instead of a hard-coded count,
# so adding or removing an entry in `stocks` needs no change here.
for stock_name, csv_path, feature_idx in stocks:
    data = prepare_data(csv_path, stock_name, add_original_price_col=True, add_log_rets=True, add_direction_col=True)

    # NOTE(review): scaling is fitted on the full frame before the train/test
    # split, so each column's min/max also reflect test-period rows.
    data = normalize(data, ['price', 'log_rets', 'direction'])
    X_train, y_train, X_test, y_test, log_rets = sequential_train_test_split(data)
    print(stock_name)

    # Drop NaN entries before summing (the final row has no next-day price,
    # so its log return is NaN).
    log_rets_test = pd.DataFrame(log_rets).dropna().to_numpy()
    perfect_ret, perfect_ret_test, normal_ret, benchmark_return = basic_info(data['log_rets'], log_rets_test)

    print(data.columns[feature_idx])

    model = SVC(kernel='poly', degree=degree, C=svc_penalty)

    # Indexing columns with a list already yields a 2-D (rows, n_features) array.
    curr_X_train = X_train[:, feature_idx]
    curr_X_test = X_test[:, feature_idx]
    acc, log_ret, simple_ret = benchmark(f"SVC Poly with Features: {feature_idx}", model, log_rets, curr_X_train, y_train, curr_X_test, y_test)

    results.append({
        'Stock': stock_name,
        'Accuracy': acc,
        'Log Return': log_ret,
        'Simple Return': simple_ret,
        'benchmark ret': benchmark_return,
    })
    features.append({
        'Stock': stock_name,
        'features': feature_idx,
    })
    print("---\n")

df_results = pd.DataFrame(results)

# Display the DataFrame
print(pd.DataFrame(features))
print(df_results)


AAPL.O
0.45323612779822187
perfect_return: 5558774849.644141
perfect_return_test: 33.42600813758216
normal_return: 6.675104552706013
normal_return_test: 1.5733956651083716

Index(['SlowK', 'CCI', 'EMA', 'SMA_Short'], dtype='object')
SVC Poly with Features: [1, 5, 11, 13]
accuracy: 0.5882352941176471
log_return: 1.008779675306789
simple_return: 2.742252533833267
up%: 0.7299465240641712

---

AMZN.O
0.7071403827812376
perfect_return: 396302124736.1031
perfect_return_test: 62.7822729822795
normal_return: 14.54934520243093
normal_return_test: 2.0281831306900227

Index(['RSI', 'ADX', 'Aroon Down', 'Aroon Up', 'OBV', 'Chaikin A/D', 'SMA',
       'EMA', 'SMA_Long_Short'],
      dtype='object')
SVC Poly with Features: [3, 4, 6, 7, 8, 9, 10, 11, 15]
accuracy: 0.5721925133689839
log_return: 0.8528014104098228
simple_return: 2.34621035220882
up%: 0.9786096256684492

---

MSFT.O
0.4802988121709258
perfect_return: 337358856.1567629
perfect_return_test: 25.360250559887437
normal_return: 3.5573593073