In [1]:
import pandas as pd
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet
from sklearn.metrics import accuracy_score, mean_absolute_error, r2_score, mean_squared_error, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.model_selection import TimeSeriesSplit
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import TransformedTargetRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.feature_selection import SelectKBest
import joblib
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor


def prepare_stock_data(stock):
    df = pd.read_json('../data/historical_data/' + stock + '.json', lines=True)
    df1 = pd.read_json('../data/historical_data/S&P500.json', lines=True)

    # Rename columns
    df = df.rename(columns={'Open': 'Stock Open', 'Close': 'Stock Close'})
    df1 = df1.rename(columns={'Open': 'S&P 500 Open', 'Close': 'S&P 500 Close'})

    # Join dataframes
    df = pd.merge(df[['Date', 'Stock Open', 'Stock Close']], df1[['Date', 'S&P 500 Open', 'S&P 500 Close']],
                  on='Date', how='outer')

    # Add previous day close price
    df['Stock Close Prev'] = df['Stock Close'].shift(1)
    df['S&P 500 Close Prev'] = df['S&P 500 Close'].shift(1)

    # Compute Exponential Mobile Average (EMA) for stock values and index values
    stock_value_ema = df['Stock Close'].copy().ewm(span=10, adjust=False).mean()
    df['Stock Close EMA'] = np.round(stock_value_ema, decimals=3)
    sp500_ema = df['S&P 500 Close'].copy().ewm(span=10, adjust=False).mean()
    df['S&P 500 Close EMA'] = np.round(sp500_ema, decimals=3)

    # Shift EMAs in order to have the previous days trend along with today close value
    df['Stock Close EMA'] = df['Stock Close EMA'].shift(1)
    df['S&P 500 Close EMA'] = df['S&P 500 Close EMA'].shift(1)

    # Add sentiment analysis

    # Drop rows with NaN values
    df.dropna(inplace=True)

    # Re order columns
    df = df[['Date', 'Stock Close', 'Stock Close Prev', 'Stock Close EMA',
             'S&P 500 Close Prev', 'S&P 500 Close EMA']]

    return df

df = prepare_stock_data('AMZN')
df

Unnamed: 0,Date,Stock Close,Stock Close Prev,Stock Close EMA,S&P 500 Close Prev,S&P 500 Close EMA
1,2017-01-04 00:00:00+00:00,757.18,753.67,753.670,2257.83,2257.830
2,2017-01-05 00:00:00+00:00,780.45,757.18,754.308,2270.75,2260.179
3,2017-01-06 00:00:00+00:00,795.99,780.45,759.061,2269.00,2261.783
4,2017-01-09 00:00:00+00:00,796.92,795.99,765.776,2276.98,2264.546
5,2017-01-10 00:00:00+00:00,795.90,796.92,771.438,2268.90,2265.338
...,...,...,...,...,...,...
1264,2022-01-10 00:00:00+00:00,3229.72,3251.08,3330.825,4677.02,4726.486
1265,2022-01-11 00:00:00+00:00,3307.24,3229.72,3312.442,4670.29,4716.268
1266,2022-01-12 00:00:00+00:00,3304.14,3307.24,3311.496,4713.07,4715.687
1267,2022-01-13 00:00:00+00:00,3224.28,3304.14,3310.159,4726.35,4717.625


In [3]:
x_train, x_test, y_train, y_test = train_test_split(df[['Stock Close Prev', 'Stock Close EMA',
                                                        'S&P 500 Close Prev', 'S&P 500 Close EMA']],
                                                    df[['Stock Close']], test_size=.2,
                                                    shuffle=False, random_state=0)

scale = StandardScaler()

x_train = scale.fit_transform(x_train)
x_test = scale.transform(x_test)

In [6]:
from sklearn.model_selection import RandomizedSearchCV

model = RandomForestRegressor()
grid_rf = {
'n_estimators': [20, 50, 100, 500, 1000],  
'max_depth': np.arange(1, 15, 1),  
'min_samples_split': [2, 10, 9], 
'min_samples_leaf': np.arange(1, 15, 2, dtype=int),  
'bootstrap': [True, False], 
'random_state': [1, 2, 30, 42]
}

rscv = RandomizedSearchCV(estimator=model, param_distributions=grid_rf, cv=3, n_jobs=-1, verbose=2, n_iter=200)
rscv_fit = rscv.fit(x_train, y_train.values.ravel())
best_parameters = rscv_fit.best_params_
print(best_parameters)

Fitting 3 folds for each of 200 candidates, totalling 600 fits
{'random_state': 30, 'n_estimators': 1000, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 6, 'bootstrap': False}
