In [1]:
import numpy as np
import pandas as pd


from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
import joblib

First, we’ll identify a target that we’re trying to predict. Our target will be if the next close price will go up or down tomorrow. If the price went up, the target will be 1.0, and if it went down, the target will be 0.0.

EMA explanation

In [47]:
import yfinance as yf
def prepare_stock_data(stock):
    df = pd.read_csv('./data/' + stock + '_stock_data_with_polarity.csv')

    # Set date as index
    df['Date'] = df['Date'].astype(str).str.split(' ').str[0]
    df = df.set_index('Date')
    
    # Add label
    df['Label'] = df.rolling(2).apply(lambda x: x.iloc[1] > x.iloc[0])['Close']
    
    # Shift one day, we can not use the future to predict the past
    df[['Open', 'High', 'Low', 'Close', 'Volume']] = df[['Open', 'High', 'Low', 'Close', 'Volume']].shift(1)
    
    df = df.rename(columns={'Open': 'Prev Open', 'High': 'Prev High', 'Low': 'Prev Low', 
                            'Close': 'Prev Close', 'Volume': 'Prev Volume', })
    
    # Compute Exponential Mobile Average (EMA) for stock price daily increments
    delta = df['Prev Close'] - df['Prev Open']
    df['10 Days Incr EMA'] = np.round(delta.copy().ewm(span=10, adjust=False).mean(), decimals=3)
    df['5 Days Incr EMA'] = np.round(delta.copy().ewm(span=5, adjust=False).mean(), decimals=3)
    df['3 Days Incr EMA'] = np.round(delta.copy().ewm(span=3, adjust=False).mean(), decimals=3)
    
    # Compute Exponential Mobile Average (EMA) for stock polarity
    df['10 Days Pol EMA'] = np.round(df['Polarity'].copy().ewm(span=10, adjust=False).mean(), decimals=3)
    df['5 Days Pol EMA'] = np.round(df['Polarity'].copy().ewm(span=5, adjust=False).mean(), decimals=3)
    df['3 Days Pol EMA'] = np.round(df['Polarity'].copy().ewm(span=3, adjust=False).mean(), decimals=3)

    # Drop rows with NaN values
    df.dropna(inplace=True)

    return df

df = prepare_stock_data('AMZN')
df

Unnamed: 0_level_0,Prev Close,Prev High,Prev Low,Prev Open,Polarity,Ticker,Prev Volume,Label,10 Days Incr EMA,5 Days Incr EMA,3 Days Incr EMA,10 Days Pol EMA,5 Days Pol EMA,3 Days Pol EMA
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2021-01-04T00:00:00.000Z,3256.929932,3282.919922,3241.199951,3275.000000,39.350200,AMZN,2957200.0,0.0,-18.070,-18.070,-18.070,64.340,59.712,54.622
2021-01-05T00:00:00.000Z,3186.629883,3272.000000,3144.020020,3270.000000,49.959878,AMZN,4411400.0,1.0,-29.943,-39.837,-50.720,61.725,56.461,52.291
2021-01-06T00:00:00.000Z,3218.510010,3223.379883,3165.060059,3166.010010,22.887350,AMZN,2655500.0,0.0,-14.953,-9.058,0.890,54.664,45.270,37.589
2021-01-07T00:00:00.000Z,3138.379883,3197.510010,3131.159912,3146.479980,15.317456,AMZN,4394800.0,1.0,-13.707,-8.739,-3.605,47.510,35.286,26.453
2021-01-08T00:00:00.000Z,3162.159912,3208.540039,3155.000000,3157.000000,21.230500,AMZN,3514500.0,1.0,-10.277,-4.106,0.777,42.732,30.601,23.842
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-01-10T00:00:00.000Z,3251.080078,3304.870117,3240.620117,3276.780029,17.298125,AMZN,2329300.0,0.0,-16.919,-21.893,-22.987,20.988,18.621,17.651
2022-01-11T00:00:00.000Z,3229.719971,3233.229980,3126.090088,3211.709961,31.623548,AMZN,4389900.0,1.0,-10.568,-8.592,-2.488,22.922,22.955,24.637
2022-01-12T00:00:00.000Z,3307.239990,3327.000000,3214.030029,3230.000000,7.483831,AMZN,3140300.0,0.0,5.397,20.019,37.376,20.115,17.798,16.061
2022-01-13T00:00:00.000Z,3304.139893,3337.560059,3288.340088,3331.500000,12.161957,AMZN,2501500.0,0.0,-0.559,4.226,5.008,18.669,15.919,14.111


the training and test set have to follow chronological order.

In [59]:
predictors = ['Prev Close', # SI
              'Prev Volume', # SI
              'Polarity',
              '10 Days Incr EMA', # SI
              '5 Days Incr EMA', # SI
              '3 Days Incr EMA', # SI
              '10 Days Pol EMA', # SI
              '5 Days Pol EMA', # SI
              '3 Days Pol EMA' # SI
            ]

training_stocks = ['AMZN', 'AAPL', 'MSFT', 'GOOGL']

x_train = pd.DataFrame()
x_test = pd.DataFrame()
y_train = pd.DataFrame()
y_test = pd.DataFrame()

for train_stock in training_stocks:
    
    df = prepare_stock_data(train_stock)
    x_tr, x_te, y_tr, y_te = train_test_split(df[predictors],
                                                df[['Label']], test_size=.333,
                                                shuffle=False, random_state=0)
    
    x_train = x_train.append(x_tr, ignore_index=True)
    x_test = x_test.append(x_te, ignore_index=True)
    y_train = y_train.append(y_tr, ignore_index=True)
    y_test = y_test.append(y_te, ignore_index=True)

print('Size of train set: ', x_train.shape)
print('Size of test set: ', x_test.shape)
print('Size of train set: ', y_train.shape)
print('Size of test set: ', y_test.shape)

Size of train set:  (696, 9)
Size of test set:  (352, 9)
Size of train set:  (696, 1)
Size of test set:  (352, 1)


In [60]:
from sklearn.compose import ColumnTransformer

# Spot-Check Algorithms
classifiers = [
    RandomForestClassifier(),
    XGBClassifier(),
    AdaBoostClassifier(),
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    GaussianNB()
]

numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ("pca", PCA())
])

numeric_features = predictors

preprocessor = ColumnTransformer(
   transformers=[
    ('numeric', numeric_transformer, numeric_features)
]) 


In [61]:
for classifier in classifiers:  
    
    pipe = Pipeline(steps = [
               ('preprocessor', preprocessor)
              ,('classifier', classifier)
           ])
    
    # Train the model
    pipe.fit(x_train, y_train.values.ravel())
    
    # Use model to make predictions
    y_pred = pipe.predict(x_test)
    
    # Evaluate the performance
    print("\nTraining ", classifier)
    accuracy = accuracy_score(y_pred, y_test)
    print("Accuracy on test set: ", accuracy)
    print("Metrics per class on test set:")

    print("Confusion matrix:")
    metrics.confusion_matrix(y_test, y_pred)

    print(metrics.classification_report(y_test, y_pred))


Training  RandomForestClassifier()
Accuracy on test set:  0.5227272727272727
Metrics per class on test set:
Confusion matrix:
              precision    recall  f1-score   support

         0.0       0.50      0.37      0.42       168
         1.0       0.54      0.66      0.59       184

    accuracy                           0.52       352
   macro avg       0.52      0.52      0.51       352
weighted avg       0.52      0.52      0.51       352






Training  XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)
Accuracy on test set:  0.5454545454545454
Metrics per class on test set:
Confusion matrix:
              precision    recall  f1-score   support

         0.0       0.54      0.36      0.43       168
         1.0       0.55      0.71      0.62       184

    accuracy                           0.55       352
   macro avg       0.54      0.54      0.5

In [62]:
# Save the Model to disk
filename = '../model/stock_trend_predictor.pkl'
final_pipe = Pipeline(steps = [
               ('preprocessor', preprocessor)
              ,('classifier', classifiers[2])
           ])
joblib.dump(final_pipe, filename)

['../model/stock_trend_predictor.pkl']

Evaluate the model using other test set