In [2]:
import numpy as np
import pandas as pd


from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
import joblib

First, we’ll identify a target that we’re trying to predict. Our target will be if the next close price will go up or down tomorrow. If the price went up, the target will be 1.0, and if it went down, the target will be 0.0.

EMA explanation

In [3]:
import yfinance as yf
def prepare_stock_data(stock, start_date='2018-01-01', end_date='2021-01-18'):
    df = pd.read_csv('./data/' + stock + '_stock_data_with_polarity.csv')

    # Set date as index
    df['Date'] = df['Date'].astype(str).str.split(' ').str[0]
    df = df.set_index('Date')
    
    # Add label
    df['Label'] = df.rolling(2).apply(lambda x: x.iloc[1] > x.iloc[0])['Close']
    
    # Shift one day, we can not use the future to predict the past
    df[['Open', 'High', 'Low', 'Close', 'Volume']] = df[['Open', 'High', 'Low', 'Close', 'Volume']].shift(1)
    
    df = df.rename(columns={'Open': 'Prev Open', 'High': 'Prev High', 'Low': 'Prev Low', 
                            'Close': 'Prev Close', 'Volume': 'Prev Volume', })
    
    # Add sentiment analysis

    # Compute Exponential Mobile Average (EMA) for stock price daily increments
    delta = df['Prev Close'] - df['Prev Open']
    df['10 Days Incr EMA'] = np.round(delta.copy().ewm(span=10, adjust=False).mean(), decimals=3)
    df['5 Days Incr EMA'] = np.round(delta.copy().ewm(span=5, adjust=False).mean(), decimals=3)
    df['3 Days Incr EMA'] = np.round(delta.copy().ewm(span=3, adjust=False).mean(), decimals=3)
    
    # Compute Exponential Mobile Average (EMA) for stock polarity
    df['10 Days Pol EMA'] = np.round(df['Polarity'].copy().ewm(span=10, adjust=False).mean(), decimals=3)
    df['5 Days Pol EMA'] = np.round(df['Polarity'].copy().ewm(span=5, adjust=False).mean(), decimals=3)
    df['3 Days Pol EMA'] = np.round(df['Polarity'].copy().ewm(span=3, adjust=False).mean(), decimals=3)

    # Drop rows with NaN values
    df.dropna(inplace=True)

    # Re order columns
    #df = df[['Date', 'Stock Trend EMA', 'S&P 500 Trend EMA', 'Label']]

    return df

the training and test set have to follow chronological order.

In [4]:
predictors = [#'Prev Open', #NO
              #'Prev High', #NO
              #'Prev Low', #NO
              'Prev Close', # SI
              'Prev Volume', # SI
              'Polarity',
              '10 Days Incr EMA', # SI
              '5 Days Incr EMA', # SI
              '3 Days Incr EMA', # SI
              '10 Days Pol EMA', # SI
              '5 Days Pol EMA', # SI
              '3 Days Pol EMA' # SI
            ]

training_stocks = ['AMZN', 'AAPL', 'MSFT', 'GOOGL']  # GOOGL

x_train = pd.DataFrame()
x_test = pd.DataFrame()
y_train = pd.DataFrame()
y_test = pd.DataFrame()

for train_stock in training_stocks:
    
    df = prepare_stock_data(train_stock)
    x_tr, x_te, y_tr, y_te = train_test_split(df[predictors],
                                                df[['Label']], test_size=.3,
                                                shuffle=False, random_state=0)
    
    x_train = x_train.append(x_tr, ignore_index=True)
    x_test = x_test.append(x_te, ignore_index=True)
    y_train = y_train.append(y_tr, ignore_index=True)
    y_test = y_test.append(y_te, ignore_index=True)

print('Size of train set: ', x_train.shape)
print('Size of test set: ', x_test.shape)
print('Size of train set: ', y_train.shape)
print('Size of test set: ', y_test.shape)

Size of train set:  (732, 9)
Size of test set:  (316, 9)
Size of train set:  (732, 1)
Size of test set:  (316, 1)


In [5]:
from sklearn.compose import ColumnTransformer

# Spot-Check Algorithms
classifiers = [
    RandomForestClassifier(),
    XGBClassifier(),
    AdaBoostClassifier(),
    KNeighborsClassifier(),
    DecisionTreeClassifier()
]

numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ("pca", PCA())
])

numeric_features = predictors

preprocessor = ColumnTransformer(
   transformers=[
    ('numeric', numeric_transformer, numeric_features)
]) 


In [6]:
for classifier in classifiers:  
    
    pipe = Pipeline(steps = [
               ('preprocessor', preprocessor)
              ,('classifier', classifier)
           ])
    
    # Train the model
    pipe.fit(x_train, y_train.values.ravel())
    
    # Use model to make predictions
    y_pred = pipe.predict(x_test)
    
    # Evaluate the performance
    print("\nTraining ", classifier)
    accuracy = accuracy_score(y_pred, y_test)
    print("Accuracy on test set: ", accuracy)
    print("Metrics per class on test set:")

    print("Confusion matrix:")
    metrics.confusion_matrix(y_test, y_pred)

    print(metrics.classification_report(y_test, y_pred))


Training  RandomForestClassifier()
Accuracy on test set:  0.5537974683544303
Metrics per class on test set:
Confusion matrix:
              precision    recall  f1-score   support

         0.0       0.56      0.38      0.45       154
         1.0       0.55      0.72      0.62       162

    accuracy                           0.55       316
   macro avg       0.56      0.55      0.54       316
weighted avg       0.56      0.55      0.54       316


Training  XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scal




Training  AdaBoostClassifier()
Accuracy on test set:  0.5506329113924051
Metrics per class on test set:
Confusion matrix:
              precision    recall  f1-score   support

         0.0       0.55      0.43      0.48       154
         1.0       0.55      0.67      0.60       162

    accuracy                           0.55       316
   macro avg       0.55      0.55      0.54       316
weighted avg       0.55      0.55      0.54       316


Training  KNeighborsClassifier()
Accuracy on test set:  0.5063291139240507
Metrics per class on test set:
Confusion matrix:
              precision    recall  f1-score   support

         0.0       0.49      0.43      0.46       154
         1.0       0.52      0.58      0.55       162

    accuracy                           0.51       316
   macro avg       0.50      0.50      0.50       316
weighted avg       0.50      0.51      0.50       316


Training  DecisionTreeClassifier()
Accuracy on test set:  0.5253164556962026
Metrics per class on

In [7]:
# Save the Model to disk
filename = '../model/classification_model.pkl'
final_pipe = Pipeline(steps = [
               ('preprocessor', preprocessor)
              ,('classifier', classifiers[0])
           ])
joblib.dump(final_pipe, filename)

['../model/classification_model.pkl']

Evaluate the model using other test set

In [8]:
def predict_stock_value(stock):
    df = prepare_stock_data(stock)
    y_test = df['Label']
    
    # Load the Regression Model
    pipe = joblib.load('../model/classification_model.pkl')
    
    # Use model to make predictions
    y_pred = pipe.predict(df[predictors])

    # Evaluate the performance
    print("\n Evaluating ", pipe['classifier'])
    accuracy = accuracy_score(y_pred, y_test)
    print("Accuracy on test set: ", accuracy)
    print("Metrics per class on test set:")

    print("Confusion matrix:")
    metrics.confusion_matrix(y_test, y_pred)

    print(metrics.classification_report(y_test, y_pred))

In [9]:
predict_stock_value('GOOGL')


 Evaluating  RandomForestClassifier()
Accuracy on test set:  0.8587786259541985
Metrics per class on test set:
Confusion matrix:
              precision    recall  f1-score   support

         0.0       0.93      0.73      0.82       113
         1.0       0.82      0.96      0.89       149

    accuracy                           0.86       262
   macro avg       0.88      0.84      0.85       262
weighted avg       0.87      0.86      0.86       262

