In [1]:
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn import svm
from sklearn.preprocessing import StandardScaler
import joblib

In [2]:
# Build the modelling table for AAPL: one row per price bar, containing
# look-back features, the binary target `Call`, and the open-to-close `Return`.
#
# Column order matters: `Open` must stay first and `Call`, `Return` must stay
# last, because the train/test cell selects the features by position
# (`iloc[:, 1:-2]`).

# Rolling-window lengths (in rows) used for the moving-average features.
ma_list = [3,5,10,20,30]
# Look-back distances (in rows) used for the lagged High/Low/Close/Volume features.
shift_days = [1,3,5,10]

# Download the full price history and keep only the OHLCV columns.
# The columns are selected by name rather than by dropping the last two by
# position, so an extra or missing column in the yfinance result cannot
# silently shift the feature layout.
df = yf.Ticker('AAPL').history(period='max')
df = df[['Open','High','Low','Close','Volume']].reset_index()

# Gap between this row's open and the previous row's close (absolute value).
# df['Premarket_Change'] = (df.Open - df.Close.shift(1))/df.Close.shift(1) <== With scaling (Percentage change)
df['Premarket_Change'] = df['Open'] - df['Close'].shift(1)

# Lagged features: the High/Low/Close/Volume values from `day` rows earlier.
for day in shift_days:
    for col in ['High','Low','Close','Volume']:
        df[f'{col}_Shift_{day}'] = df[col].shift(day)

# Moving averages, shifted by one row so each window ends at the previous row
# and never includes the current row's Close/Volume.
for col in ['Close','Volume']:
    for ma in ma_list:
        df[f'{col}_{ma}MA'] = df[col].rolling(ma).mean().shift(1)

# Target variable: 1 when the bar closes above its open, otherwise 0.
# Vectorized comparison; a NaN on either side compares False and yields 0.
df['Call'] = (df['Close'] > df['Open']).astype(int)

# Drop the leading rows whose look-back windows are incomplete (NaN features),
# then renumber the rows from 0.
df = df.dropna().reset_index(drop=True)

# Open-to-close return of the same row (used later to evaluate the strategy).
df['Return'] = (df['Close'] - df['Open']) / df['Open']

# Remove the date and the same-row High/Low/Close/Volume columns; only the
# lagged/averaged versions of those remain as features.
df = df.drop(columns=['Date','High','Low','Close','Volume'])
df

Unnamed: 0,Open,Premarket_Change,High_Shift_1,Low_Shift_1,Close_Shift_1,Volume_Shift_1,High_Shift_3,Low_Shift_3,Close_Shift_3,Volume_Shift_3,...,Close_10MA,Close_20MA,Close_30MA,Volume_3MA,Volume_5MA,Volume_10MA,Volume_20MA,Volume_30MA,Call,Return
0,0.112374,-5.007119e-08,0.112810,0.112374,0.112374,24640000.0,0.115423,0.114552,0.114552,35548800.0,...,0.110980,0.113049,0.109108,2.380373e+07,23479680.0,22377600.0,33241600.0,6.039637e+07,0,-0.007751
1,0.108454,-3.048700e-03,0.112374,0.111503,0.111503,23699200.0,0.114987,0.114116,0.114116,11222400.0,...,0.111503,0.112352,0.109485,1.985387e+07,22202880.0,22442560.0,29768480.0,4.555189e+07,0,-0.004016
2,0.104535,-3.484238e-03,0.108454,0.108019,0.108019,28156800.0,0.112810,0.112374,0.112374,24640000.0,...,0.111634,0.111634,0.109921,2.549867e+07,24653440.0,23829120.0,27732320.0,4.062763e+07,0,-0.004166
3,0.099307,-4.791671e-03,0.104535,0.104099,0.104099,43904000.0,0.112374,0.111503,0.111503,23699200.0,...,0.111155,0.110893,0.110458,3.192000e+07,26324480.0,26812800.0,28140000.0,3.856683e+07,0,-0.008771
4,0.093210,-5.226376e-03,0.099307,0.098436,0.098436,46188800.0,0.108454,0.108019,0.108019,28156800.0,...,0.110196,0.109804,0.110734,3.941653e+07,33317760.0,30092160.0,29366400.0,3.722507e+07,0,-0.004673
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10424,149.070007,-5.699921e-01,149.679993,145.259995,149.639999,90978500.0,141.789993,138.339996,140.520004,92482700.0,...,142.795001,148.363036,153.010866,9.135423e+07,99184340.0,104393490.0,114840880.0,1.071714e+08,0,-0.001543
10425,149.899994,1.059998e+00,150.660004,146.839996,148.839996,103718400.0,144.339996,137.139999,143.779999,90601500.0,...,143.125002,147.918623,152.477938,9.509947e+07,96382760.0,106100950.0,113874035.0,1.083279e+08,0,-0.007939
10426,147.830002,-8.800049e-01,151.740005,147.679993,148.710007,74286600.0,149.679993,145.259995,149.639999,90978500.0,...,143.072002,147.391822,151.863125,8.966117e+07,90413540.0,105695980.0,113140040.0,1.085466e+08,1,0.022864
10427,146.899994,-4.310013e+00,151.270004,146.860001,151.210007,72348100.0,150.660004,146.839996,148.839996,103718400.0,...,144.111002,146.663501,151.337304,8.345103e+07,86386620.0,101956500.0,111344620.0,1.086939e+08,0,-0.010347


In [3]:
# Chronological 80/20 split: the first 80% of rows train the models and the
# remaining 20% are held out (no shuffling).
# train, test = train_test_split(df, test_size=0.2)
split_at = int(len(df)*0.8)
train = df.iloc[:split_at,:]
test = df.iloc[split_at:,:]

# Features are every column except the first one (Open) and the last two
# (Call, Return); the target is the binary `Call` column.
train_X = train.iloc[:,1:-2]
test_X = test.iloc[:,1:-2]
train_Y = train['Call']
test_Y = test['Call']

# Open-to-close returns, re-indexed from 0 so they can be addressed with a
# plain 0-based position.
train_return = train['Return'].reset_index(drop=True)
test_return = test['Return'].reset_index(drop=True)

# Scaling the data: the scaler is fitted on the training features only and
# then applied to both splits.
scaler = StandardScaler().fit(train_X)
train_X = scaler.transform(train_X)
test_X = scaler.transform(test_X)

In [4]:
%%time
# Major vote with all model that the accuracy is larger than 0.5
target_score = 0.5
training_record = []
result_list = []

# Use Adaboost method with decision tree to train the model
max_d, max_e = 15,101

for d in range(4,max_d,2):
    for e in range(5,max_e,5):
        model = AdaBoostClassifier(DecisionTreeClassifier(max_depth=d), n_estimators=e, random_state=1)
        model.fit(train_X, train_Y)

        score = model.score(test_X, test_Y)
        selected = True if score > target_score else False
        
        training_record.append({
            'Classifier': 'AdaBoost',
            'Parameter': (d,e),
            'Score': score,
            'Selected': selected
        })

        if selected:
            result_list.append(model.predict(test_X))

for e in range(2, max_e, 2):
    model = RandomForestClassifier(n_estimators=e, random_state=1)
    model.fit(train_X,train_Y)

    score = model.score(test_X, test_Y)
    selected = True if score > target_score else False
    
    training_record.append({
        'Classifier': 'RandomForest',
        'Parameter': (e),
        'Score': score,
        'Selected': selected
    })

    if selected:
        result_list.append(model.predict(test_X))

for c in [0.1,0.5,1,10,100,1000]:
    for g in [0.1,0.5,1,10,100,1000]:
        model = svm.SVC(C=c, gamma=g)
        model.fit(train_X, train_Y)

        score = model.score(test_X, test_Y)
        selected = True if score > target_score else False
        
        training_record.append({
            'Classifier': 'SVM',
            'Parameter': (c,g),
            'Score': score,
            'Selected': selected
        })

        if selected:
            result_list.append(model.predict(test_X))

Wall time: 18min 11s


In [5]:
# Get final results by major vote.
# For every test row, count how many selected models predicted 1; the ensemble
# predicts 1 only when those votes strictly outnumber the remaining votes.
# With an even number of voters a tie is possible, and ties resolve to 0.
n_models = len(result_list)
if n_models:
    # One row per selected model, one column per test sample.
    votes_for_up = (np.vstack(result_list) == 1).sum(axis=0)
else:
    # No model was selected: nobody votes, so every prediction is 0.
    votes_for_up = np.zeros(len(test_Y), dtype=int)

votes_for_down = n_models - votes_for_up
final_result = (votes_for_up > votes_for_down).astype(int)

In [6]:
# Share of test rows on which the majority-vote prediction equals the target.
is_correct = final_result == test_Y
accuracy = int(is_correct.sum()) / len(test_Y)
print(f'The major vote model accuracy will be: {accuracy}')

The major vote model accuracy will be: 0.5263662511984659


In [25]:
# Equity curve of the strategy, starting from 1.
# On rows where the ensemble predicts 1 the equity is compounded by that row's
# open-to-close return; on all other rows it is carried forward unchanged.
profit_list = [1]

for day, signal in enumerate(final_result):
    equity = profit_list[-1]
    if signal == 1:
        equity = equity*(1+test_return[day])
    profit_list.append(equity)

In [8]:
# Persist the per-model evaluation log (classifier, parameters, score, selected flag) as CSV.
pd.DataFrame(data=training_record).to_csv(path_or_buf='training record.csv')

In [9]:
# Reload the saved evaluation log; the first CSV column is used as the row index.
record = pd.read_csv(filepath_or_buffer='training record.csv', index_col=0)
record

Unnamed: 0,Classifier,Parameter,Score,Selected
0,AdaBoost,"(4, 5)",0.495686,False
1,AdaBoost,"(4, 10)",0.491850,False
2,AdaBoost,"(4, 15)",0.477948,False
3,AdaBoost,"(4, 20)",0.481783,False
4,AdaBoost,"(4, 25)",0.490412,False
...,...,...,...,...
201,SVM,"(1000, 0.5)",0.523011,True
202,SVM,"(1000, 1)",0.467881,False
203,SVM,"(1000, 10)",0.472196,False
204,SVM,"(1000, 100)",0.472196,False


In [10]:
# Show only the models that passed the accuracy threshold and joined the vote.
record.loc[record['Selected'].eq(True)]

Unnamed: 0,Classifier,Parameter,Score,Selected
20,AdaBoost,"(6, 5)",0.529243,True
21,AdaBoost,"(6, 10)",0.526846,True
22,AdaBoost,"(6, 15)",0.526846,True
23,AdaBoost,"(6, 20)",0.531160,True
24,AdaBoost,"(6, 25)",0.532119,True
...,...,...,...,...
135,RandomForest,32,0.513902,True
136,RandomForest,34,0.509588,True
194,SVM,"(100, 0.1)",0.519175,True
200,SVM,"(1000, 0.1)",0.522531,True


In [11]:
# 1. Calculate the return of each day
# 2. Calculate the PnL for the model
# 3. Try using Linear Regression
# 4. Try finding new features for the model

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=1a7fe019-844d-4914-80df-2f49b95fd9e5' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>