<a href="https://colab.research.google.com/github/anagh07/stock_price_predictor/blob/colab/lagged_feature_engineering_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install pyspark
from pyspark.sql import DataFrame
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import Window

import sklearn.metrics as metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV



Collecting pyspark
  Downloading pyspark-3.2.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 37 kB/s 
[?25hCollecting py4j==0.10.9.3
  Downloading py4j-0.10.9.3-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 66.6 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.1-py2.py3-none-any.whl size=281853642 sha256=3b1dd1afe365b89e1a62a03643f438a05df3c4d4a73409f4965ee2dd44d41360
  Stored in directory: /root/.cache/pip/wheels/9f/f5/07/7cd8017084dce4e93e84e92efd1e1d5334db05f2e83bcef74f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.3 pyspark-3.2.1


In [3]:
%pwd

'/content'

In [4]:
def init_spark():
    spark = SparkSession \
        .builder \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    return spark

# read and clean the csv
spark = init_spark()
filename = '/content/drive/MyDrive/data/stock_histories.csv'
df = spark.read.csv(filename, header=True, mode="DROPMALFORMED")

In [6]:
df = df.withColumn("volume",df.volume.cast(IntegerType()))
df = df.withColumn("open",df.open.cast(FloatType()))
df = df.withColumn("close",df.close.cast(FloatType()))
df = df.withColumn("adjusted_close",df.close.cast(FloatType()))
df = df.withColumn("high",df.high.cast(FloatType()))
df = df.withColumn("low",df.low.cast(FloatType()))

In [7]:
nvda = df.where(df.stock == "NVDA")

In [8]:
nvda.show()

+-----+----------+--------+------+------+------+------+--------------+
|stock|      date|  volume|  open| close|  high|   low|adjusted_close|
+-----+----------+--------+------+------+------+------+--------------+
| NVDA|2018-11-02|11320900|217.73|214.92| 222.0|210.21|        214.92|
| NVDA|2018-11-01|14163200| 212.3|218.11|218.49|207.19|        218.11|
| NVDA|2018-10-31|18644300|209.65|210.83|212.59|204.01|        210.83|
| NVDA|2018-10-30|20179800|186.55| 203.0| 203.4|185.62|         203.0|
| NVDA|2018-10-29|18950400|203.99|185.62|204.13|176.01|        185.62|
| NVDA|2018-10-26|16619600|198.31|198.29|204.84|193.12|        198.29|
| NVDA|2018-10-25|23793000|195.47|207.84|209.75|193.68|        207.84|
| NVDA|2018-10-24|22107200|219.51|199.41|221.39|198.85|        199.41|
| NVDA|2018-10-23|15660900|220.43|221.06|224.19|216.71|        221.06|
| NVDA|2018-10-22| 9221100|231.28|231.22|235.32|227.07|        231.22|
| NVDA|2018-10-19|15340200|241.76|229.17|242.55| 227.7|        229.17|
| NVDA

In [9]:
nvda.orderBy(["date"]).select("date").first()

Row(date='1999-01-22')

In [10]:
nvda.orderBy(["date"], ascending=False).select("date").first()

Row(date='2018-11-02')

In [11]:
# feature engineering: Quarter, week of year, year, day of week
nvda = nvda.withColumn("quarter", quarter(col("date")))
nvda = nvda.withColumn("week_of_year", weekofyear(col("date")))
nvda = nvda.withColumn("year", year(col("date")))
nvda = nvda.withColumn("day_of_week", dayofweek(col("date")))
nvda = nvda.withColumn("year_quarter", concat(nvda.year, lit("-"), nvda.quarter).alias("year_quarter"))

In [12]:
nvda.select("date", "quarter", "week_of_year", "year", "day_of_week", "year_quarter").show()

+----------+-------+------------+----+-----------+------------+
|      date|quarter|week_of_year|year|day_of_week|year_quarter|
+----------+-------+------------+----+-----------+------------+
|2018-11-02|      4|          44|2018|          6|      2018-4|
|2018-11-01|      4|          44|2018|          5|      2018-4|
|2018-10-31|      4|          44|2018|          4|      2018-4|
|2018-10-30|      4|          44|2018|          3|      2018-4|
|2018-10-29|      4|          44|2018|          2|      2018-4|
|2018-10-26|      4|          43|2018|          6|      2018-4|
|2018-10-25|      4|          43|2018|          5|      2018-4|
|2018-10-24|      4|          43|2018|          4|      2018-4|
|2018-10-23|      4|          43|2018|          3|      2018-4|
|2018-10-22|      4|          43|2018|          2|      2018-4|
|2018-10-19|      4|          42|2018|          6|      2018-4|
|2018-10-18|      4|          42|2018|          5|      2018-4|
|2018-10-17|      4|          42|2018|  

In [13]:
# get last full quarter
# step 1: calculate days in each quarters - a full quarter has at least 60 days 
quarter_day_count = nvda.groupby("year_quarter").count()\
                                                .filter("count >= 60")
last_quarter = quarter_day_count.orderBy(["year_quarter"], ascending=False).select("year_quarter").first()

In [14]:
last_quarter[0]

'2018-3'

In [15]:
quarter_day_count = quarter_day_count.withColumn("is_test", when(col("year_quarter") == last_quarter[0], 1).otherwise(0))

In [16]:
quarter_day_count.show()

+------------+-----+-------+
|year_quarter|count|is_test|
+------------+-----+-------+
|      2003-4|   64|      0|
|      2006-2|   63|      0|
|      2014-1|   61|      0|
|      2013-4|   64|      0|
|      2000-1|   63|      0|
|      2012-1|   62|      0|
|      2010-1|   61|      0|
|      2005-2|   64|      0|
|      2000-4|   63|      0|
|      2005-1|   61|      0|
|      2010-3|   64|      0|
|      2010-2|   63|      0|
|      2011-3|   64|      0|
|      2007-2|   63|      0|
|      2013-2|   64|      0|
|      2009-1|   61|      0|
|      2001-1|   62|      0|
|      2002-3|   64|      0|
|      2002-2|   64|      0|
|      2014-4|   64|      0|
+------------+-----+-------+
only showing top 20 rows



In [17]:
nvda = nvda.join(quarter_day_count, "year_quarter")

In [18]:
nvda.count()

4848

In [19]:
open_average = nvda.groupby("year_quarter")\
                   .agg(avg("open").alias("open_avg"))
windowSpec = Window.orderBy("year_quarter")
open_average = open_average.withColumn("open_avg_l1", lag("open_avg", 1).over(windowSpec))
open_average = open_average.withColumn("open_avg_l2", lag("open_avg", 2).over(windowSpec))
open_average = open_average.withColumn("open_avg_l3", lag("open_avg", 3).over(windowSpec))
open_average = open_average.withColumn("open_avg_l4", lag("open_avg", 4).over(windowSpec))
open_average = open_average.drop("open_avg") # drop unused col

In [20]:
volume_average = nvda.groupby("year_quarter")\
                     .agg(avg("volume").alias("volume_avg"))
volume_average = volume_average.withColumn("volume_avg_l1", lag("volume_avg", 1).over(windowSpec))
volume_average = volume_average.withColumn("volume_avg_l2", lag("volume_avg", 2).over(windowSpec))
volume_average = volume_average.withColumn("volume_avg_l3", lag("volume_avg", 3).over(windowSpec))
volume_average = volume_average.withColumn("volume_avg_l4", lag("volume_avg", 4).over(windowSpec))
volume_average = volume_average.drop("volume_avg") # drop unused col

In [21]:
high_average = nvda.groupby("year_quarter")\
                   .agg(avg("high").alias("high_avg"))
high_average = high_average.withColumn("high_avg_l1", lag("high_avg", 1).over(windowSpec))
high_average = high_average.withColumn("high_avg_l2", lag("high_avg", 2).over(windowSpec))
high_average = high_average.withColumn("high_avg_l3", lag("high_avg", 3).over(windowSpec))
high_average = high_average.withColumn("high_avg_l4", lag("high_avg", 4).over(windowSpec))
high_average = high_average.drop("high_avg") # drop unused col

In [22]:
low_average = nvda.groupby("year_quarter")\
                   .agg(avg("low").alias("low_avg"))
low_average = low_average.withColumn("low_avg_l1", lag("low_avg", 1).over(windowSpec))
low_average = low_average.withColumn("low_avg_l2", lag("low_avg", 2).over(windowSpec))
low_average = low_average.withColumn("low_avg_l3", lag("low_avg", 3).over(windowSpec))
low_average = low_average.withColumn("low_avg_l4", lag("low_avg", 4).over(windowSpec))
low_average = low_average.drop("low_avg") # drop unused col

In [23]:
close_average = nvda.groupby("year_quarter")\
                    .agg(avg("close").alias("close_avg"))
close_average = close_average.withColumn("close_avg_l1", lag("close_avg", 1).over(windowSpec))
close_average = close_average.withColumn("close_avg_l2", lag("close_avg", 2).over(windowSpec))
close_average = close_average.withColumn("close_avg_l3", lag("close_avg", 3).over(windowSpec))
close_average = close_average.withColumn("close_avg_l4", lag("close_avg", 4).over(windowSpec))
close_average = close_average.drop("close_avg") # drop unused col

In [24]:
adj_close_average = nvda.groupby("year_quarter")\
                        .agg(avg("adjusted_close").alias("adj_close_avg"))
adj_close_average = adj_close_average.withColumn("adj_close_avg_l1", lag("adj_close_avg", 1).over(windowSpec))
adj_close_average = adj_close_average.withColumn("adj_close_avg_l2", lag("adj_close_avg", 2).over(windowSpec))
adj_close_average = adj_close_average.withColumn("adj_close_avg_l3", lag("adj_close_avg", 3).over(windowSpec))
adj_close_average = adj_close_average.withColumn("adj_close_avg_l4", lag("adj_close_avg", 4).over(windowSpec))
adj_close_average = adj_close_average.drop("adj_close_avg") # drop unused col

In [25]:
nvda = nvda.join(open_average, "year_quarter")\
           .join(volume_average, "year_quarter")\
           .join(high_average, "year_quarter")\
           .join(low_average, "year_quarter")\
           .join(close_average, "year_quarter")\
           .join(adj_close_average, "year_quarter")

In [26]:
open_average.show(5)

+------------+------------------+------------------+------------------+------------------+
|year_quarter|       open_avg_l1|       open_avg_l2|       open_avg_l3|       open_avg_l4|
+------------+------------------+------------------+------------------+------------------+
|      1999-2|              null|              null|              null|              null|
|      1999-3|1.5172783136367798|              null|              null|              null|
|      1999-4|1.8688963577151299|1.5172783136367798|              null|              null|
|      2000-1| 2.627888672053814|1.8688963577151299|1.5172783136367798|              null|
|      2000-2| 5.248697753936526| 2.627888672053814|1.8688963577151299|1.5172783136367798|
+------------+------------------+------------------+------------------+------------------+
only showing top 5 rows



In [27]:
sc = spark.sparkContext
sc.setLogLevel("ERROR")

In [28]:
# drop rows with nulls (i.e. first four quarters)
nvda = nvda.na.drop()

In [29]:
nvda = nvda.toPandas()
nvda.columns

Index(['year_quarter', 'stock', 'date', 'volume', 'open', 'close', 'high',
       'low', 'adjusted_close', 'quarter', 'week_of_year', 'year',
       'day_of_week', 'count', 'is_test', 'open_avg_l1', 'open_avg_l2',
       'open_avg_l3', 'open_avg_l4', 'volume_avg_l1', 'volume_avg_l2',
       'volume_avg_l3', 'volume_avg_l4', 'high_avg_l1', 'high_avg_l2',
       'high_avg_l3', 'high_avg_l4', 'low_avg_l1', 'low_avg_l2', 'low_avg_l3',
       'low_avg_l4', 'close_avg_l1', 'close_avg_l2', 'close_avg_l3',
       'close_avg_l4', 'adj_close_avg_l1', 'adj_close_avg_l2',
       'adj_close_avg_l3', 'adj_close_avg_l4'],
      dtype='object')

In [36]:
predictor_features = ['quarter', 'week_of_year', 'year', 'day_of_week', 'open_avg_l1', 'open_avg_l2',
                      'open_avg_l3', 'open_avg_l4', 'volume_avg_l1', 'volume_avg_l2', 'volume_avg_l3', 
                      'volume_avg_l4', 'high_avg_l1', 'high_avg_l2', 'high_avg_l3', 'high_avg_l4', 
                      'low_avg_l1', 'low_avg_l2', 'low_avg_l3', 'low_avg_l4', 'close_avg_l1', 'close_avg_l2', 
                      'close_avg_l3', 'close_avg_l4', 'adj_close_avg_l1', 'adj_close_avg_l2', 'adj_close_avg_l3', 
                      'adj_close_avg_l4']

In [31]:
len(predictor_features)

28

In [32]:
x_train, x_test = nvda[nvda.is_test == 0][predictor_features], nvda[nvda.is_test == 1][predictor_features]
y_train, y_test = nvda[nvda.is_test == 0]['adjusted_close'], nvda[nvda.is_test == 1]['adjusted_close']
x_train

Unnamed: 0,quarter,week_of_year,year,day_of_week,open_avg_l1,open_avg_l2,open_avg_l3,open_avg_l4,volume_avg_l1,volume_avg_l2,...,low_avg_l3,low_avg_l4,close_avg_l1,close_avg_l2,close_avg_l3,close_avg_l4,adj_close_avg_l1,adj_close_avg_l2,adj_close_avg_l3,adj_close_avg_l4
0,2,14,2000,2,5.248698,2.627889,1.868896,1.517278,1.256130e+07,1.046445e+07,...,1.804606,1.471705,5.255477,2.649495,1.860433,1.506572,5.255477,2.649495,1.860433,1.506572
1,2,14,2000,3,5.248698,2.627889,1.868896,1.517278,1.256130e+07,1.046445e+07,...,1.804606,1.471705,5.255477,2.649495,1.860433,1.506572,5.255477,2.649495,1.860433,1.506572
2,2,14,2000,4,5.248698,2.627889,1.868896,1.517278,1.256130e+07,1.046445e+07,...,1.804606,1.471705,5.255477,2.649495,1.860433,1.506572,5.255477,2.649495,1.860433,1.506572
3,2,14,2000,5,5.248698,2.627889,1.868896,1.517278,1.256130e+07,1.046445e+07,...,1.804606,1.471705,5.255477,2.649495,1.860433,1.506572,5.255477,2.649495,1.860433,1.506572
4,2,14,2000,6,5.248698,2.627889,1.868896,1.517278,1.256130e+07,1.046445e+07,...,1.804606,1.471705,5.255477,2.649495,1.860433,1.506572,5.255477,2.649495,1.860433,1.506572
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4526,2,26,2018,2,235.049836,198.983333,166.225396,127.084763,1.659347e+07,1.324039e+07,...,163.738254,124.742381,234.976885,198.685555,166.285715,127.045237,234.976885,198.685555,166.285715,127.045237
4527,2,26,2018,3,235.049836,198.983333,166.225396,127.084763,1.659347e+07,1.324039e+07,...,163.738254,124.742381,234.976885,198.685555,166.285715,127.045237,234.976885,198.685555,166.285715,127.045237
4528,2,26,2018,4,235.049836,198.983333,166.225396,127.084763,1.659347e+07,1.324039e+07,...,163.738254,124.742381,234.976885,198.685555,166.285715,127.045237,234.976885,198.685555,166.285715,127.045237
4529,2,26,2018,5,235.049836,198.983333,166.225396,127.084763,1.659347e+07,1.324039e+07,...,163.738254,124.742381,234.976885,198.685555,166.285715,127.045237,234.976885,198.685555,166.285715,127.045237


In [33]:
# hypertuning steps
import numpy as np
from keras import models, layers

# import random

# model = RandomForestRegressor()  # blank/boilerplate model

# grid_rf = {
#     "n_estimators": [20, 50, 100, 500, 1000],
#     "max_depth": np.arange(1, 15, 1),
#     "min_samples_split": [2, 10, 9],
#     "min_samples_leaf": np.arange(1, 15, 2, dtype=int),
#     "bootstrap": [True, False],
#     "random_state": [1, 2, 30, 42, random.randint(0, (2**32 - 1))],
# }

# rscv = RandomizedSearchCV(
#     estimator=model, param_distributions=grid_rf, cv=3, n_jobs=-1, verbose=2, n_iter=200
# )
# rscv_fit = rscv.fit(x_train, y_train)
# best_parameters = rscv_fit.best_params_
# print(best_parameters)
# Build a Sequential model
model = models.Sequential()
model.add(layers.LSTM(1, input_shape=(1,len(predictor_features))))
model.add(layers.Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')

x_train = x_train.to_numpy()
x_test = x_test.to_numpy()
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()



In [35]:
from sklearn.preprocessing import MinMaxScaler

# scale the arrays to keep the values between 0 and 1 to match outcomes found in sigmoid or step functions that are used for activation.
minMaxScale = MinMaxScaler()
minMaxScale.fit(x_train)
y_train = minMaxScale.transform(y_train)
x_train = minMaxScale.transform(x_train)
x_test = minMaxScale.transform(x_test)
y_test = minMaxScale.transform(y_test)


# Transform the testing and training data sets into three-dimensional arrays,  batch size, a number of time steps, and a number of features
x_train = x_train.reshape((x_train.shape[0], 1, x_train.shape[1]))
x_test = x_test.reshape((x_test.shape[0], 1, x_test.shape[1]))
# y_train = y_train.reshape((y_train.shape[0], 1, y_train.shape[1]))
# y_test = y_test.reshape((y_test.shape[0], 1, y_test.shape[1]))

print('The shape of xtrain is {}: '.format(x_train.shape))
print('The shape of xtest is {}: '.format(x_test.shape))


ValueError: ignored

In [185]:
# train the model
loss = model.fit(x_train, y_train, batch_size=10, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [193]:
# testing the model
predict = model.predict(x_test)
predict

array([[26.759144],
       [26.759144],
       [26.759144],
       [26.759144],
       [26.759144],
       [26.759144],
       [26.759144],
       [26.759144],
       [26.759144],
       [26.759144],
       [26.759144],
       [26.759144],
       [26.759144],
       [26.759144],
       [26.759144],
       [26.759144],
       [26.759144],
       [26.759144],
       [26.759144],
       [26.759144],
       [26.759144],
       [26.759144],
       [26.759144],
       [26.759144],
       [26.759144],
       [26.759144],
       [26.759144],
       [26.759144],
       [26.759144],
       [26.759144],
       [26.759144],
       [26.759144],
       [26.759144],
       [26.759144],
       [26.759144],
       [26.759144],
       [26.759144],
       [26.759144],
       [26.759144],
       [26.759144],
       [26.759144],
       [26.759144],
       [26.759144],
       [26.759144],
       [26.759144],
       [26.759144],
       [26.759144],
       [26.759144],
       [26.759144],
       [26.759144],


In [191]:
# model = RandomForestRegressor(
#     random_state=best_parameters["random_state"],
#     n_estimators=best_parameters["n_estimators"],
#     min_samples_split=best_parameters["min_samples_split"],
#     min_samples_leaf=best_parameters["min_samples_leaf"],
#     max_depth=best_parameters["max_depth"],
#     bootstrap=best_parameters["bootstrap"],
# )

# model.fit(x_train, y_train)

# predict = model.predict(x_test)

In [189]:
print(metrics.mean_absolute_error(y_test, predict))
print(metrics.mean_squared_error(y_test, predict))
print(np.sqrt(metrics.mean_squared_error(y_test, predict)))
# print(metrics.r2_score(y_test, predict))

233.35846
54597.69
233.66148


In [190]:
[(y_test.tolist()[i], predict[i]) for i in range(len(y_test))]

[(242.24000549316406, array([26.759144], dtype=float32)),
 (236.83999633789062, array([26.759144], dtype=float32)),
 (242.72999572753906, array([26.759144], dtype=float32)),
 (247.3300018310547, array([26.759144], dtype=float32)),
 (249.25, array([26.759144], dtype=float32)),
 (253.25, array([26.759144], dtype=float32)),
 (247.52999877929688, array([26.759144], dtype=float32)),
 (251.22999572753906, array([26.759144], dtype=float32)),
 (249.32000732421875, array([26.759144], dtype=float32)),
 (248.1999969482422, array([26.759144], dtype=float32)),
 (253.69000244140625, array([26.759144], dtype=float32)),
 (251.6999969482422, array([26.759144], dtype=float32)),
 (252.02999877929688, array([26.759144], dtype=float32)),
 (250.88999938964844, array([26.759144], dtype=float32)),
 (249.41000366210938, array([26.759144], dtype=float32)),
 (248.7100067138672, array([26.759144], dtype=float32)),
 (251.8699951171875, array([26.759144], dtype=float32)),
 (254.83999633789062, array([26.759144], dt

In [None]:
nvda.head(10)

Unnamed: 0,year_quarter,stock,date,volume,open,close,high,low,adjusted_close,quarter,...,low_avg_l3,low_avg_l4,close_avg_l1,close_avg_l2,close_avg_l3,close_avg_l4,adj_close_avg_l1,adj_close_avg_l2,adj_close_avg_l3,adj_close_avg_l4
0,2003-4,NVDA,2003-12-31,12431700,7.913333,7.733333,7.95,7.606667,7.733333,4,...,3.918361,3.968229,6.558802,6.454656,4.020492,4.104323,6.558802,6.454656,4.020492,4.104323
1,2003-4,NVDA,2003-12-30,13772100,7.816667,7.916667,8.066667,7.81,7.916667,4,...,3.918361,3.968229,6.558802,6.454656,4.020492,4.104323,6.558802,6.454656,4.020492,4.104323
2,2003-4,NVDA,2003-12-29,18915600,7.723333,7.9,7.926667,7.686666,7.9,4,...,3.918361,3.968229,6.558802,6.454656,4.020492,4.104323,6.558802,6.454656,4.020492,4.104323
3,2003-4,NVDA,2003-12-26,9099000,7.323333,7.58,7.623333,7.293334,7.58,4,...,3.918361,3.968229,6.558802,6.454656,4.020492,4.104323,6.558802,6.454656,4.020492,4.104323
4,2003-4,NVDA,2003-12-24,5271300,7.316667,7.266667,7.34,7.246666,7.266667,4,...,3.918361,3.968229,6.558802,6.454656,4.020492,4.104323,6.558802,6.454656,4.020492,4.104323
5,2003-4,NVDA,2003-12-23,18069000,7.006667,7.32,7.39,7.003334,7.32,4,...,3.918361,3.968229,6.558802,6.454656,4.020492,4.104323,6.558802,6.454656,4.020492,4.104323
6,2003-4,NVDA,2003-12-22,9279600,6.92,7.033333,7.05,6.85,7.033333,4,...,3.918361,3.968229,6.558802,6.454656,4.020492,4.104323,6.558802,6.454656,4.020492,4.104323
7,2003-4,NVDA,2003-12-19,22923900,7.093333,7.023334,7.2,6.946667,7.023334,4,...,3.918361,3.968229,6.558802,6.454656,4.020492,4.104323,6.558802,6.454656,4.020492,4.104323
8,2003-4,NVDA,2003-12-18,10918200,6.78,6.92,6.99,6.773334,6.92,4,...,3.918361,3.968229,6.558802,6.454656,4.020492,4.104323,6.558802,6.454656,4.020492,4.104323
9,2003-4,NVDA,2003-12-17,12477900,6.843333,6.766667,6.866667,6.653333,6.766667,4,...,3.918361,3.968229,6.558802,6.454656,4.020492,4.104323,6.558802,6.454656,4.020492,4.104323


In [None]:
# try to engineer features a bit differently
# predict for the last 60 days, use last 7, 14, 60, 90, 120 day moving averages as predictors

nvda = df.where(df.stock == "NVDA")
nvda = nvda.withColumn("quarter", quarter(col("date")))
nvda = nvda.withColumn("week_of_year", weekofyear(col("date")))
nvda = nvda.withColumn("year", year(col("date")))
nvda = nvda.withColumn("day_of_week", dayofweek(col("date")))

In [None]:
w = (Window.orderBy(col("date")).rowsBetween(-7, -1))
nvda = nvda.withColumn('open_avg_l7', avg('open').over(w)).orderBy('date')
nvda = nvda.withColumn('lagged_open_avg_l7', lag('open_avg_l7', 60).over(Window.orderBy(col("date"))))
nvda = nvda.drop('open_avg_l7')

w = (Window.orderBy(col("date")).rowsBetween(-14, -1))
nvda = nvda.withColumn('open_avg_l14', avg('open').over(w)).orderBy('date')
nvda = nvda.withColumn('lagged_open_avg_l14', lag('open_avg_l14', 60).over(Window.orderBy(col("date"))))
nvda = nvda.drop('open_avg_l14')

w = (Window.orderBy(col("date")).rowsBetween(-60, -1))
nvda = nvda.withColumn('open_avg_l60', avg('open').over(w)).orderBy('date')
nvda = nvda.withColumn('lagged_open_avg_l60', lag('open_avg_l60', 60).over(Window.orderBy(col("date"))))
nvda = nvda.drop('open_avg_l60')

w = (Window.orderBy(col("date")).rowsBetween(-90, -1))
nvda = nvda.withColumn('open_avg_l90', avg('open').over(w)).orderBy('date')
nvda = nvda.withColumn('lagged_open_avg_l90', lag('open_avg_l90', 60).over(Window.orderBy(col("date"))))
nvda = nvda.drop('open_avg_l90')

w = (Window.orderBy(col("date")).rowsBetween(-120, -1))
nvda = nvda.withColumn('open_avg_l120', avg('open').over(w)).orderBy('date')
nvda = nvda.withColumn('lagged_open_avg_l120', lag('open_avg_l120', 60).over(Window.orderBy(col("date"))))
nvda = nvda.drop('open_avg_l120')

In [None]:
w = (Window.orderBy(col("date")).rowsBetween(-7, -1))
nvda = nvda.withColumn('close_avg_l7', avg('close').over(w)).orderBy('date')
nvda = nvda.withColumn('lagged_close_avg_l7', lag('close_avg_l7', 60).over(Window.orderBy(col("date"))))
nvda = nvda.drop('close_avg_l7')

w = (Window.orderBy(col("date")).rowsBetween(-14, -1))
nvda = nvda.withColumn('close_avg_l14', avg('close').over(w)).orderBy('date')
nvda = nvda.withColumn('lagged_close_avg_l14', lag('close_avg_l14', 60).over(Window.orderBy(col("date"))))
nvda = nvda.drop('close_avg_l14')

w = (Window.orderBy(col("date")).rowsBetween(-60, -1))
nvda = nvda.withColumn('close_avg_l60', avg('close').over(w)).orderBy('date')
nvda = nvda.withColumn('lagged_close_avg_l60', lag('close_avg_l60', 60).over(Window.orderBy(col("date"))))
nvda = nvda.drop('close_avg_l60')

w = (Window.orderBy(col("date")).rowsBetween(-90, -1))
nvda = nvda.withColumn('close_avg_l90', avg('close').over(w)).orderBy('date')
nvda = nvda.withColumn('lagged_close_avg_l90', lag('close_avg_l90', 60).over(Window.orderBy(col("date"))))
nvda = nvda.drop('close_avg_l90')

w = (Window.orderBy(col("date")).rowsBetween(-120, -1))
nvda = nvda.withColumn('close_avg_l120', avg('close').over(w)).orderBy('date')
nvda = nvda.withColumn('lagged_close_avg_l120', lag('close_avg_l120', 60).over(Window.orderBy(col("date"))))
nvda = nvda.drop('close_avg_l120')

In [None]:
w = (Window.orderBy(col("date")).rowsBetween(-7, -1))
nvda = nvda.withColumn('high_avg_l7', avg('high').over(w)).orderBy('date')
nvda = nvda.withColumn('lagged_high_avg_l7', lag('high_avg_l7', 60).over(Window.orderBy(col("date"))))
nvda = nvda.drop('high_avg_l7')

w = (Window.orderBy(col("date")).rowsBetween(-14, -1))
nvda = nvda.withColumn('high_avg_l14', avg('high').over(w)).orderBy('date')
nvda = nvda.withColumn('lagged_high_avg_l14', lag('high_avg_l14', 60).over(Window.orderBy(col("date"))))
nvda = nvda.drop('high_avg_l14')

w = (Window.orderBy(col("date")).rowsBetween(-60, -1))
nvda = nvda.withColumn('high_avg_l60', avg('high').over(w)).orderBy('date')
nvda = nvda.withColumn('lagged_high_avg_l60', lag('high_avg_l60', 60).over(Window.orderBy(col("date"))))
nvda = nvda.drop('high_avg_l60')

w = (Window.orderBy(col("date")).rowsBetween(-90, -1))
nvda = nvda.withColumn('high_avg_l90', avg('high').over(w)).orderBy('date')
nvda = nvda.withColumn('lagged_high_avg_l90', lag('high_avg_l90', 60).over(Window.orderBy(col("date"))))
nvda = nvda.drop('high_avg_l90')

w = (Window.orderBy(col("date")).rowsBetween(-120, -1))
nvda = nvda.withColumn('high_avg_l120', avg('high').over(w)).orderBy('date')
nvda = nvda.withColumn('lagged_high_avg_l120', lag('high_avg_l120', 60).over(Window.orderBy(col("date"))))
nvda = nvda.drop('high_avg_l120')

In [None]:
w = (Window.orderBy(col("date")).rowsBetween(-7, -1))
nvda = nvda.withColumn('low_avg_l7', avg('low').over(w)).orderBy('date')
nvda = nvda.withColumn('lagged_low_avg_l7', lag('low_avg_l7', 60).over(Window.orderBy(col("date"))))
nvda = nvda.drop('low_avg_l7')

w = (Window.orderBy(col("date")).rowsBetween(-14, -1))
nvda = nvda.withColumn('low_avg_l14', avg('low').over(w)).orderBy('date')
nvda = nvda.withColumn('lagged_low_avg_l14', lag('low_avg_l14', 60).over(Window.orderBy(col("date"))))
nvda = nvda.drop('low_avg_l14')

w = (Window.orderBy(col("date")).rowsBetween(-60, -1))
nvda = nvda.withColumn('low_avg_l60', avg('low').over(w)).orderBy('date')
nvda = nvda.withColumn('lagged_low_avg_l60', lag('low_avg_l60', 60).over(Window.orderBy(col("date"))))
nvda = nvda.drop('low_avg_l60')

w = (Window.orderBy(col("date")).rowsBetween(-90, -1))
nvda = nvda.withColumn('low_avg_l90', avg('low').over(w)).orderBy('date')
nvda = nvda.withColumn('lagged_low_avg_l90', lag('low_avg_l90', 60).over(Window.orderBy(col("date"))))
nvda = nvda.drop('low_avg_l90')

w = (Window.orderBy(col("date")).rowsBetween(-120, -1))
nvda = nvda.withColumn('low_avg_l120', avg('low').over(w)).orderBy('date')
nvda = nvda.withColumn('lagged_low_avg_l120', lag('low_avg_l120', 60).over(Window.orderBy(col("date"))))
nvda = nvda.drop('low_avg_l120')

In [None]:
w = (Window.orderBy(col("date")).rowsBetween(-7, -1))
nvda = nvda.withColumn('adjusted_close_avg_l7', avg('adjusted_close').over(w)).orderBy('date')
nvda = nvda.withColumn('lagged_adjusted_close_avg_l7', lag('adjusted_close_avg_l7', 60).over(Window.orderBy(col("date"))))
nvda = nvda.drop('adjusted_close_avg_l7')

w = (Window.orderBy(col("date")).rowsBetween(-14, -1))
nvda = nvda.withColumn('adjusted_close_avg_l14', avg('adjusted_close').over(w)).orderBy('date')
nvda = nvda.withColumn('lagged_adjusted_close_avg_l14', lag('adjusted_close_avg_l14', 60).over(Window.orderBy(col("date"))))
nvda = nvda.drop('adjusted_close_avg_l14')

w = (Window.orderBy(col("date")).rowsBetween(-60, -1))
nvda = nvda.withColumn('adjusted_close_avg_l60', avg('adjusted_close').over(w)).orderBy('date')
nvda = nvda.withColumn('lagged_adjusted_close_avg_l60', lag('adjusted_close_avg_l60', 60).over(Window.orderBy(col("date"))))
nvda = nvda.drop('adjusted_close_avg_l60')

w = (Window.orderBy(col("date")).rowsBetween(-90, -1))
nvda = nvda.withColumn('adjusted_close_avg_l90', avg('adjusted_close').over(w)).orderBy('date')
nvda = nvda.withColumn('lagged_adjusted_close_avg_l90', lag('adjusted_close_avg_l90', 60).over(Window.orderBy(col("date"))))
nvda = nvda.drop('adjusted_close_avg_l90')

w = (Window.orderBy(col("date")).rowsBetween(-120, -1))
nvda = nvda.withColumn('adjusted_close_avg_l120', avg('adjusted_close').over(w)).orderBy('date')
nvda = nvda.withColumn('lagged_adjusted_close_avg_l120', lag('adjusted_close_avg_l120', 60).over(Window.orderBy(col("date"))))
nvda = nvda.drop('adjusted_close_avg_l120')

In [None]:
from datetime import datetime, timedelta

last_day = nvda.orderBy('date', ascending=False).select('date').first()[0]
threshold = datetime.strptime(last_day, '%Y-%m-%d').date() - timedelta(days=60)
nvda = nvda.withColumn("is_test", when(col("date") >= threshold, 1).otherwise(0))

In [None]:
nvda.count()

4980

In [None]:
# drop rows with nulls (i.e. first four quarters)
nvda = nvda.na.drop()

In [None]:
nvda.count()

                                                                                

4919

In [None]:
nvda = nvda.toPandas()
nvda.columns

                                                                                

Index(['stock', 'date', 'volume', 'open', 'close', 'high', 'low',
       'adjusted_close', 'quarter', 'week_of_year', 'year', 'day_of_week',
       'lagged_open_avg_l7', 'lagged_open_avg_l14', 'lagged_open_avg_l60',
       'lagged_open_avg_l90', 'lagged_open_avg_l120', 'lagged_high_avg_l7',
       'lagged_high_avg_l14', 'lagged_high_avg_l60', 'lagged_high_avg_l90',
       'lagged_high_avg_l120', 'lagged_low_avg_l7', 'lagged_low_avg_l14',
       'lagged_low_avg_l60', 'lagged_low_avg_l90', 'lagged_low_avg_l120',
       'lagged_close_avg_l7', 'lagged_close_avg_l14', 'lagged_close_avg_l60',
       'lagged_close_avg_l90', 'lagged_close_avg_l120',
       'lagged_adjusted_close_avg_l7', 'lagged_adjusted_close_avg_l14',
       'lagged_adjusted_close_avg_l60', 'lagged_adjusted_close_avg_l90',
       'lagged_adjusted_close_avg_l120', 'is_test'],
      dtype='object')

In [None]:
predictor_features = ['quarter', 'week_of_year', 'year', 'day_of_week',
                       'lagged_open_avg_l7', 'lagged_open_avg_l14', 'lagged_open_avg_l60',
                       'lagged_open_avg_l90', 'lagged_open_avg_l120', 'lagged_high_avg_l7',
                       'lagged_high_avg_l14', 'lagged_high_avg_l60', 'lagged_high_avg_l90',
                       'lagged_high_avg_l120', 'lagged_low_avg_l7', 'lagged_low_avg_l14',
                       'lagged_low_avg_l60', 'lagged_low_avg_l90', 'lagged_low_avg_l120',
                       'lagged_close_avg_l7', 'lagged_close_avg_l14', 'lagged_close_avg_l60',
                       'lagged_close_avg_l90', 'lagged_close_avg_l120',
                       'lagged_adjusted_close_avg_l7', 'lagged_adjusted_close_avg_l14',
                       'lagged_adjusted_close_avg_l60', 'lagged_adjusted_close_avg_l90',
                       'lagged_adjusted_close_avg_l120']

In [None]:
x_train, x_test = nvda[nvda.is_test == 0][predictor_features], nvda[nvda.is_test == 1][predictor_features]
y_train, y_test = nvda[nvda.is_test == 0]['adjusted_close'], nvda[nvda.is_test == 1]['adjusted_close']

In [None]:
# hypertuning steps
import numpy as np
import random

model = RandomForestRegressor()  # blank/boilerplate model

grid_rf = {
    "n_estimators": [20, 50, 100, 500, 1000],
    "max_depth": np.arange(1, 15, 1),
    "min_samples_split": [2, 10, 9],
    "min_samples_leaf": np.arange(1, 15, 2, dtype=int),
    "bootstrap": [True, False],
    "random_state": [1, 2, 30, 42, random.randint(0, (2**32 - 1))],
}

rscv = RandomizedSearchCV(
    estimator=model, param_distributions=grid_rf, cv=3, n_jobs=-1, verbose=2, n_iter=200
)
rscv_fit = rscv.fit(x_train, y_train)
best_parameters = rscv_fit.best_params_
print(best_parameters)

Fitting 3 folds for each of 200 candidates, totalling 600 fits
{'random_state': 30, 'n_estimators': 20, 'min_samples_split': 10, 'min_samples_leaf': 5, 'max_depth': 12, 'bootstrap': False}


In [None]:
model = RandomForestRegressor(
    random_state=best_parameters["random_state"],
    n_estimators=best_parameters["n_estimators"],
    min_samples_split=best_parameters["min_samples_split"],
    min_samples_leaf=best_parameters["min_samples_leaf"],
    max_depth=best_parameters["max_depth"],
    bootstrap=best_parameters["bootstrap"],
)

model.fit(x_train, y_train)

predict = model.predict(x_test)

In [None]:
print(metrics.mean_absolute_error(y_test, predict))
print(metrics.mean_squared_error(y_test, predict))
print(np.sqrt(metrics.mean_squared_error(y_test, predict)))
# print(metrics.r2_score(y_test, predict))

26.07795333862306
1357.390781167133
36.842784655440106


In [None]:
x_test[predictor_features]

Unnamed: 0,quarter,week_of_year,year,day_of_week,lagged_open_avg_l7,lagged_open_avg_l14,lagged_open_avg_l60,lagged_open_avg_l90,lagged_open_avg_l120,lagged_high_avg_l7,...,lagged_close_avg_l7,lagged_close_avg_l14,lagged_close_avg_l60,lagged_close_avg_l90,lagged_close_avg_l120,lagged_adjusted_close_avg_l7,lagged_adjusted_close_avg_l14,lagged_adjusted_close_avg_l60,lagged_adjusted_close_avg_l90,lagged_adjusted_close_avg_l120
4875,3,36,2018,3,258.008571,252.496428,239.621,239.392667,232.883334,261.357141,...,260.110001,253.339286,239.452334,239.171889,233.0065,260.110001,253.339286,239.452334,239.171889,233.0065
4876,3,36,2018,4,259.43857,253.221428,239.796167,239.602111,233.500334,262.879998,...,261.437143,254.506428,239.678,239.389222,233.63825,261.437143,254.506428,239.678,239.389222,233.63825
4877,3,36,2018,5,260.877141,254.070712,240.004167,239.779889,234.113,264.058572,...,262.641427,255.676428,239.866,239.553889,234.21375,262.641427,255.676428,239.866,239.553889,234.21375
4878,3,36,2018,6,261.975712,255.27857,240.199,240.037334,234.68375,264.801431,...,263.349997,257.096427,240.067667,239.799222,234.75275,263.349997,257.096427,240.067667,239.799222,234.75275
4879,3,37,2018,2,262.495714,256.875714,240.440001,240.322223,235.226417,264.791434,...,262.999996,258.157855,240.424333,240.120111,235.305167,262.999996,258.157855,240.424333,240.120111,235.305167
4880,3,37,2018,3,262.387142,258.077857,240.825001,240.735778,235.78075,265.164294,...,263.262852,259.530713,240.713167,240.711333,235.889417,263.262852,259.530713,240.713167,240.711333,235.889417
4881,3,37,2018,4,262.840001,259.249286,241.079667,241.404667,236.344584,265.478581,...,263.278569,260.672142,240.9915,241.152222,236.4675,263.278569,260.672142,240.9915,241.152222,236.4675
4882,3,37,2018,5,262.622859,260.315715,241.371001,241.781334,236.92025,265.524292,...,263.591426,261.850713,241.378834,241.555444,237.049333,263.591426,261.850713,241.378834,241.555444,237.049333
4883,3,37,2018,6,262.760005,261.099288,241.679668,242.081778,237.486001,265.472866,...,263.29,262.363571,241.832167,242.029333,237.572084,263.29,262.363571,241.832167,242.029333,237.572084
4884,3,38,2018,2,262.768576,261.822859,242.076834,242.343778,238.027084,265.615723,...,263.531429,263.086428,242.129334,242.365222,238.114917,263.531429,263.086428,242.129334,242.365222,238.114917


In [None]:
[(y_test.tolist()[i], predict[i]) for i in range(len(y_test))]

[(283.70001220703125, 276.5799967447917),
 (278.4200134277344, 276.5799967447917),
 (272.7200012207031, 276.5799967447917),
 (271.8599853515625, 276.5799967447917),
 (274.7300109863281, 276.5799967447917),
 (272.79998779296875, 276.5799967447917),
 (268.20001220703125, 276.5799967447917),
 (271.3399963378906, 276.5799967447917),
 (276.42999267578125, 276.5799967447917),
 (273.92999267578125, 276.5799967447917),
 (271.0199890136719, 276.5799967447917),
 (271.9800109863281, 276.5799967447917),
 (266.2799987792969, 276.5799967447917),
 (263.45001220703125, 276.5799967447917),
 (265.70001220703125, 276.5799967447917),
 (268.4100036621094, 276.5799967447917),
 (266.9200134277344, 276.5799967447917),
 (267.3999938964844, 276.5799967447917),
 (281.0199890136719, 276.5799967447917),
 (289.3599853515625, 276.5799967447917),
 (286.4800109863281, 276.5799967447917),
 (286.7300109863281, 276.5799967447917),
 (279.2900085449219, 276.5799967447917),
 (269.8599853515625, 276.5799967447917),
 (265.769

[CV] END bootstrap=True, max_depth=6, min_samples_leaf=7, min_samples_split=2, n_estimators=20, random_state=1; total time=   0.7s
[CV] END bootstrap=True, max_depth=14, min_samples_leaf=1, min_samples_split=9, n_estimators=20, random_state=4101135339; total time=   1.0s
[CV] END bootstrap=False, max_depth=5, min_samples_leaf=1, min_samples_split=9, n_estimators=1000, random_state=30; total time=  55.8s
[CV] END bootstrap=False, max_depth=14, min_samples_leaf=11, min_samples_split=2, n_estimators=500, random_state=1; total time=  51.7s
[CV] END bootstrap=False, max_depth=6, min_samples_leaf=7, min_samples_split=10, n_estimators=500, random_state=42; total time=  33.4s
[CV] END bootstrap=True, max_depth=14, min_samples_leaf=7, min_samples_split=2, n_estimators=500, random_state=2; total time=  31.9s
[CV] END bootstrap=True, max_depth=4, min_samples_leaf=11, min_samples_split=2, n_estimators=100, random_state=30; total time=   3.0s
[CV] END bootstrap=False, max_depth=7, min_samples_leaf=

In [None]:
# look at feature importance after all the features are engineered