In [None]:
import csv

from pyspark.sql import DataFrame
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import Window

import sklearn.metrics as metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV

from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

import numpy as np
import random

def init_spark():
    """Return the active SparkSession, creating it first if none exists."""
    builder = SparkSession.builder.config("spark.some.config.option", "some-value")
    return builder.getOrCreate()

# Obtain the Spark session and load the raw stock-history CSV.
# The first row is used as the header; with mode DROPMALFORMED the reader
# discards malformed records instead of failing.
spark = init_spark()
filename = (
    '/home/sarsingh/OneDrive/Documents/SOEN6111+Winter+2022/project/github-repo/data/'
    'stock_histories.csv/part-00000-0933a1f1-003b-4a50-8b1e-99f98d978e84-c000.csv'
)
df = spark.read.options(header=True, mode="DROPMALFORMED").csv(filename)

In [None]:
# The CSV is read without a schema, so every column arrives as a string.
# Cast the numeric columns before any aggregation or modelling.
# NOTE(review): IntegerType is a 32-bit integer - confirm that no daily volume
# in the data exceeds ~2.1e9, otherwise switch to LongType.
df = df.withColumn("volume", df.volume.cast(IntegerType()))

# Each price column is cast from ITSELF. The previous code filled
# "adjusted_close" from df.close, silently replacing the adjusted price
# (the model target) with the unadjusted close.
for price_col in ("open", "close", "adjusted_close", "high", "low"):
    df = df.withColumn(price_col, col(price_col).cast(FloatType()))

In [None]:
# Collect the distinct ticker symbols from the "stock" column to the driver.
companies = [
    row["stock"]
    for row in df.select("stock").distinct().collect()
]
print(companies)

In [154]:
# (source column, name of its per-quarter average), in the order the lag
# frames are joined and the predictor columns are listed.
QUARTERLY_AVERAGES = [
    ("open", "open_avg"),
    ("volume", "volume_avg"),
    ("high", "high_avg"),
    ("low", "low_avg"),
    ("close", "close_avg"),
    ("adjusted_close", "adj_close_avg"),
]
N_QUARTER_LAGS = 4


def _quarterly_lag_features(quarterly_source, value_col, avg_name, n_lags=N_QUARTER_LAGS):
    """Average ``value_col`` per quarter and expose the previous quarters' averages.

    Args:
        quarterly_source: Spark DataFrame with a ``year_quarter`` column.
        value_col: name of the column to average within each quarter.
        avg_name: base name for the generated columns.
        n_lags: how many previous quarters to expose.

    Returns:
        Spark DataFrame with ``year_quarter`` and ``<avg_name>_l1`` ..
        ``<avg_name>_l<n_lags>``. The current quarter's own average is dropped.
    """
    # year_quarter is built as "<year>-<quarter>", so plain string ordering is
    # chronological (assuming four-digit years).
    by_quarter = Window.orderBy("year_quarter")
    averaged = quarterly_source.groupby("year_quarter").agg(avg(value_col).alias(avg_name))
    for k in range(1, n_lags + 1):
        averaged = averaged.withColumn(f"{avg_name}_l{k}", lag(avg_name, k).over(by_quarter))
    return averaged.drop(avg_name)


# Loop-invariant: quieten Spark logging once instead of on every iteration.
sc = spark.sparkContext
sc.setLogLevel("ERROR")

for company in companies:
    company_df = df.where(col("stock") == str(company))
    company_df.show()

    # feature engineering: quarter, week of year, year, day of week
    company_df = (
        company_df
        .withColumn("quarter", quarter(col("date")))
        .withColumn("week_of_year", weekofyear(col("date")))
        .withColumn("year", year(col("date")))
        .withColumn("day_of_week", dayofweek(col("date")))
    )
    company_df = company_df.withColumn(
        "year_quarter", concat(company_df.year, lit("-"), company_df.quarter)
    )
    company_df.select("date", "quarter", "week_of_year", "year", "day_of_week", "year_quarter").show()

    # Keep only "full" quarters (at least 60 rows) and hold out the most
    # recent full quarter as the test set.
    quarter_day_count = company_df.groupby("year_quarter").count().filter("count >= 60")
    last_quarter = (
        quarter_day_count
        .orderBy(["year_quarter"], ascending=False)
        .select("year_quarter")
        .first()
    )
    if last_quarter is None:
        # first() returns None on an empty frame; indexing it would raise TypeError.
        print("Skipping " + str(company) + ": no quarter with at least 60 rows")
        continue
    quarter_day_count = quarter_day_count.withColumn(
        "is_test", when(col("year_quarter") == last_quarter[0], 1).otherwise(0)
    )
    quarter_day_count.show()
    # Inner join: rows belonging to incomplete quarters are dropped here.
    company_df = company_df.join(quarter_day_count, "year_quarter")

    # Previous-quarter averages for every numeric column, all derived from the
    # same pre-join frame and then attached quarter by quarter.
    lag_frames = [
        _quarterly_lag_features(company_df, source_col, avg_name)
        for source_col, avg_name in QUARTERLY_AVERAGES
    ]
    for lag_frame in lag_frames:
        company_df = company_df.join(lag_frame, "year_quarter")
    lag_frames[0].show(5)  # lagged "open" averages

    # Drop rows with nulls (i.e. the first four quarters, which lack a full
    # set of lags), then continue in pandas for scikit-learn.
    company_df = company_df.na.drop()
    company_df = company_df.toPandas()

    predictor_features = ['quarter', 'week_of_year', 'year', 'day_of_week'] + [
        f"{avg_name}_l{k}"
        for _, avg_name in QUARTERLY_AVERAGES
        for k in range(1, N_QUARTER_LAGS + 1)
    ]
    train_rows = company_df[company_df.is_test == 0]
    test_rows = company_df[company_df.is_test == 1]
    x_train, x_test = train_rows[predictor_features], test_rows[predictor_features]
    y_train, y_test = train_rows['adjusted_close'], test_rows['adjusted_close']
    y_test_dates = test_rows['date']

    # Hyper-parameter search.
    model = RandomForestRegressor()  # blank/boilerplate model for randomSearch
    grid_rf = {
        "n_estimators": [20, 50, 100, 500, 1000],
        "max_depth": np.arange(1, 15, 1),
        "min_samples_split": [2, 10, 9],
        "min_samples_leaf": np.arange(1, 15, 2, dtype=int),
        "bootstrap": [True, False],
        "random_state": [1, 2, 30, 42, random.randint(0, (2**32 - 1))],
    }
    rscv = RandomizedSearchCV(
        estimator=model, param_distributions=grid_rf, cv=3, n_jobs=-1, verbose=2, n_iter=200
    )
    rscv_fit = rscv.fit(x_train, y_train)
    best_parameters = rscv_fit.best_params_
    print(best_parameters)

    # Refit a fresh forest with the winning parameters on the full training set.
    model = RandomForestRegressor(
        random_state=best_parameters["random_state"],
        n_estimators=best_parameters["n_estimators"],
        min_samples_split=best_parameters["min_samples_split"],
        min_samples_leaf=best_parameters["min_samples_leaf"],
        max_depth=best_parameters["max_depth"],
        bootstrap=best_parameters["bootstrap"],
    )
    model.fit(x_train, y_train)
    predict = model.predict(x_test)

    # MAE, MSE and RMSE on the held-out quarter.
    mse = metrics.mean_squared_error(y_test, predict)
    print(metrics.mean_absolute_error(y_test, predict))
    print(mse)
    print(np.sqrt(mse))

    # (date, actual, predicted) triples; zip avoids re-materialising the
    # lists once per row.
    output = list(zip(y_test_dates.tolist(), y_test.tolist(), predict))
    print(output)

    # Save the held-out predictions for this company to disk.
    header_list = ["date", "test_value", "predicted_value"]
    results_path = "/home/sarsingh/OneDrive/Documents/SOEN6111+Winter+2022/project/github-repo/dump/randomSearchCV-" + company + "-results.csv"
    # newline='' is what the csv module requires so the writer controls line
    # endings itself (otherwise extra blank lines appear on some platforms).
    with open(results_path, mode='w', newline='') as file:
        csv_out = csv.writer(file)
        csv_out.writerow(header_list)
        csv_out.writerows(output)

+-----+----------+--------+------+------+------+------+--------------+
|stock|      date|  volume|  open| close|  high|   low|adjusted_close|
+-----+----------+--------+------+------+------+------+--------------+
| AAPL|2018-11-02|91263400|209.55|207.48|213.65|205.43|        207.48|
| AAPL|2018-11-01|58323200|219.05|222.22|222.36|216.81|        222.22|
| AAPL|2018-10-31|38358900|216.88|218.86|220.45|216.62|        218.86|
| AAPL|2018-10-30|36660000|211.15| 213.3|215.18|209.27|         213.3|
| AAPL|2018-10-29|45935500|219.19|212.24|219.69|206.09|        212.24|
| AAPL|2018-10-26|47258400| 215.9| 216.3|220.19|212.67|         216.3|
| AAPL|2018-10-25|29855800|217.71| 219.8|221.38|216.75|         219.8|
| AAPL|2018-10-24|40925500| 222.6|215.09|224.23|214.54|        215.09|
| AAPL|2018-10-23|38767800|215.83|222.73|223.25| 214.7|        222.73|
| AAPL|2018-10-22|28792100|219.79|220.65|223.36|218.94|        220.65|
| AAPL|2018-10-19|33078700|218.06|219.31|221.26|217.43|        219.31|
| AAPL

In [155]:
# Preview the pandas frame that the last iteration of the loop above left in
# `company_df` (after toPandas()), to sanity-check the engineered columns.
company_df.head(10)

Unnamed: 0,year_quarter,stock,date,volume,open,close,high,low,adjusted_close,quarter,...,low_avg_l3,low_avg_l4,close_avg_l1,close_avg_l2,close_avg_l3,close_avg_l4,adj_close_avg_l1,adj_close_avg_l2,adj_close_avg_l3,adj_close_avg_l4
0,1984-3,AMD,1984-09-28,728000,17.4375,17.375,17.75,17.25,17.375,3,...,15.649802,14.706055,15.725198,15.179563,15.94246,15.023438,15.725198,15.179563,15.94246,15.023438
1,1984-3,AMD,1984-09-27,2224000,17.25,17.5625,17.6875,17.1875,17.5625,3,...,15.649802,14.706055,15.725198,15.179563,15.94246,15.023438,15.725198,15.179563,15.94246,15.023438
2,1984-3,AMD,1984-09-26,935800,18.375,17.875,18.5625,17.75,17.875,3,...,15.649802,14.706055,15.725198,15.179563,15.94246,15.023438,15.725198,15.179563,15.94246,15.023438
3,1984-3,AMD,1984-09-25,488400,18.375,18.3125,18.375,18.125,18.3125,3,...,15.649802,14.706055,15.725198,15.179563,15.94246,15.023438,15.725198,15.179563,15.94246,15.023438
4,1984-3,AMD,1984-09-24,560200,18.5625,18.3125,18.6875,18.3125,18.3125,3,...,15.649802,14.706055,15.725198,15.179563,15.94246,15.023438,15.725198,15.179563,15.94246,15.023438
5,1984-3,AMD,1984-09-21,987800,18.75,18.625,19.125,18.5625,18.625,3,...,15.649802,14.706055,15.725198,15.179563,15.94246,15.023438,15.725198,15.179563,15.94246,15.023438
6,1984-3,AMD,1984-09-20,1439000,18.5,18.75,18.8125,18.25,18.75,3,...,15.649802,14.706055,15.725198,15.179563,15.94246,15.023438,15.725198,15.179563,15.94246,15.023438
7,1984-3,AMD,1984-09-19,1194800,19.5625,18.5,19.5625,18.1875,18.5,3,...,15.649802,14.706055,15.725198,15.179563,15.94246,15.023438,15.725198,15.179563,15.94246,15.023438
8,1984-3,AMD,1984-09-18,890600,19.875,19.625,19.875,19.5,19.625,3,...,15.649802,14.706055,15.725198,15.179563,15.94246,15.023438,15.725198,15.179563,15.94246,15.023438
9,1984-3,AMD,1984-09-17,689800,19.6875,20.125,20.1875,19.6875,20.125,3,...,15.649802,14.706055,15.725198,15.179563,15.94246,15.023438,15.725198,15.179563,15.94246,15.023438


In [157]:
# List every distinct ticker found in the data ...
companies = [
    row["stock"]
    for row in df.select("stock").distinct().collect()
]
print(companies)
# ... then restrict the run below to a single ticker.
companies = ['AAPL']

['AAPL', 'GOOG', 'FB', 'TXN', 'AVGO', 'AMZN', 'MSFT', 'NVDA', 'CSCO', 'QCOM', 'EXPE', 'INTC', 'NFLX', 'ORCL', 'CRM', 'ADBE', 'BKNG', 'EA', 'HP', 'INTU', 'IBM', 'ADSK', 'EBAY', 'FTNT', 'AMD']


In [None]:
# Imports hoisted out of the loop body so they run once per cell execution.
from datetime import datetime, timedelta

import matplotlib.pyplot as plt

ROLLING_WINDOWS = (7, 14, 60, 90, 120)  # trailing window sizes, in rows
FEATURE_LAG_ROWS = 60                   # rows by which every rolling average is shifted back
TEST_HORIZON_DAYS = 60                  # calendar days held out at the end of the series
# Order in which the rolling features are appended to the frame.
ROLLING_SOURCE_COLUMNS = ("open", "close", "high", "low", "adjusted_close")
# Order in which the features are fed to the model.
PREDICTOR_SOURCE_COLUMNS = ("open", "high", "low", "close", "adjusted_close")


def _add_lagged_rolling_avg(frame, value_col, window_rows, lag_rows=FEATURE_LAG_ROWS):
    """Append ``lagged_<value_col>_avg_l<window_rows>`` to ``frame``.

    The new column is the mean of ``value_col`` over the ``window_rows`` rows
    preceding each row (ordered by ``date``, current row excluded), taken from
    the row ``lag_rows`` positions earlier.

    Args:
        frame: Spark DataFrame with ``date`` and ``value_col`` columns.
        value_col: name of the column to average.
        window_rows: number of preceding rows in the rolling window.
        lag_rows: number of rows to shift the rolling average back by.

    Returns:
        ``frame`` with the lagged column added; the intermediate rolling
        average is not kept.
    """
    by_date = Window.orderBy(col("date"))
    rolling_name = f"{value_col}_avg_l{window_rows}"
    # Window functions cannot be nested, hence the temporary column.
    frame = frame.withColumn(
        rolling_name, avg(value_col).over(by_date.rowsBetween(-window_rows, -1))
    )
    frame = frame.withColumn(f"lagged_{rolling_name}", lag(rolling_name, lag_rows).over(by_date))
    return frame.drop(rolling_name)


for company in companies:
    company_df = df.where(df.stock == company)
    company_df = (
        company_df
        .withColumn("quarter", quarter(col("date")))
        .withColumn("week_of_year", weekofyear(col("date")))
        .withColumn("year", year(col("date")))
        .withColumn("day_of_week", dayofweek(col("date")))
    )

    # Lagged rolling averages for every price column and window size.
    for value_col in ROLLING_SOURCE_COLUMNS:
        for window_rows in ROLLING_WINDOWS:
            company_df = _add_lagged_rolling_avg(company_df, value_col, window_rows)

    # Hold out the last TEST_HORIZON_DAYS calendar days as the test set.
    last_day = company_df.orderBy('date', ascending=False).select('date').first()[0]
    threshold = datetime.strptime(last_day, '%Y-%m-%d').date() - timedelta(days=TEST_HORIZON_DAYS)
    company_df = company_df.withColumn("is_test", when(col("date") >= threshold, 1).otherwise(0))

    # Drop rows with nulls (the earliest rows, which have no lagged value yet)
    # and sort once by date instead of after every feature.
    company_df = company_df.na.drop().orderBy('date')

    company_df = company_df.toPandas()

    predictor_features = ['quarter', 'week_of_year', 'year', 'day_of_week'] + [
        f"lagged_{value_col}_avg_l{window_rows}"
        for value_col in PREDICTOR_SOURCE_COLUMNS
        for window_rows in ROLLING_WINDOWS
    ]
    train_rows = company_df[company_df.is_test == 0]
    test_rows = company_df[company_df.is_test == 1]
    x_train, x_test = train_rows[predictor_features], test_rows[predictor_features]
    y_train, y_test = train_rows['adjusted_close'], test_rows['adjusted_close']
    y_test_dates = test_rows['date']

    model = RandomForestRegressor()  # blank/boilerplate model

    grid_rf = {
        "n_estimators": [20, 50, 100, 500, 1000],
        "max_depth": np.arange(1, 15, 1),
        "min_samples_split": [2, 10, 9],
        "min_samples_leaf": np.arange(1, 15, 2, dtype=int),
        "bootstrap": [True, False],
        "random_state": [1, 2, 30, 42, random.randint(0, (2**32 - 1))],
    }

    rscv = RandomizedSearchCV(
        estimator=model, param_distributions=grid_rf, cv=3, n_jobs=-1, verbose=2, n_iter=200
    )
    rscv_fit = rscv.fit(x_train, y_train)
    best_parameters = rscv_fit.best_params_
    print(best_parameters)

    # Refit a fresh forest with the winning parameters on the full training set.
    model = RandomForestRegressor(
        random_state=best_parameters["random_state"],
        n_estimators=best_parameters["n_estimators"],
        min_samples_split=best_parameters["min_samples_split"],
        min_samples_leaf=best_parameters["min_samples_leaf"],
        max_depth=best_parameters["max_depth"],
        bootstrap=best_parameters["bootstrap"],
    )
    model.fit(x_train, y_train)
    predict = model.predict(x_test)

    # Show model feature importances, most important first.
    importances = model.feature_importances_
    sorted_index = np.argsort(importances)[::-1]
    labels = np.array(model.feature_names_in_)
    # The labels must be reordered with the same index as the bar heights,
    # otherwise each bar is tagged with the wrong feature name.
    plt.bar(range(len(importances)), importances[sorted_index], tick_label=labels[sorted_index])
    plt.xticks(rotation=90)
    plt.ylabel("importance")
    plt.title(str(company) + ": random forest feature importances")
    plt.show()

    # MAE, MSE and RMSE on the held-out period.
    mse = metrics.mean_squared_error(y_test, predict)
    print(metrics.mean_absolute_error(y_test, predict))
    print(mse)
    print(np.sqrt(mse))

    # (actual, predicted) pairs; zip avoids re-materialising the list per row.
    output = list(zip(y_test.tolist(), predict))
    print(output)