In [0]:
%pip install tensorflow scikit-learn

In [0]:
from pyspark.sql import functions as F
import pandas as pd

# Columns pulled from the gold feature table: the date key plus the model inputs.
gold_columns = ["date", "adj_close", "daily_return", "ret_5d", "ret_20d", "vol_20d"]

# Order by date on the Spark side so the collected frame is chronological.
df = spark.table("main.stocks.nvda_gold_features").orderBy("date")

# Collect to pandas and drop every row that has a null in any selected column.
pdf = df.select(*gold_columns).toPandas().dropna()

pdf.head()


In [0]:
import numpy as np

feature_cols = ["adj_close", "daily_return", "ret_5d", "ret_20d", "vol_20d"]
lookback = 60  # past 60 days


def make_windows(features, target, timestamps, lookback):
    """Slice a time-ordered feature matrix into fixed-length supervised windows.

    Sample ``k`` uses rows ``k .. k + lookback - 1`` of ``features`` as input
    and ``target[k + lookback]`` (the row right after the window) as label.

    Args:
        features: 2-D array of shape (rows, n_features), in chronological order.
        target: 1-D array of length ``rows`` holding the value to predict.
        timestamps: 1-D array of length ``rows`` with the date of each row.
        lookback: number of consecutive rows in each input window.

    Returns:
        Tuple ``(X, y, dates)`` where ``X`` has shape
        (rows - lookback, lookback, n_features) and ``y`` / ``dates`` have
        length ``rows - lookback``.

    Raises:
        ValueError: if there are not enough rows to build a single window.
    """
    n_rows = len(features)
    if n_rows <= lookback:
        # Fail here with a clear message instead of producing an empty array
        # that only breaks later when the model reads X.shape[1].
        raise ValueError(
            f"Need more than {lookback} rows to build a window, got {n_rows}"
        )

    # window_idx[k] == [k, k + 1, ..., k + lookback - 1]
    window_starts = np.arange(n_rows - lookback)[:, None]
    window_idx = window_starts + np.arange(lookback)[None, :]

    return (
        features[window_idx],
        target[lookback:].copy(),
        timestamps[lookback:].copy(),
    )


# Cast explicitly to float so object-typed columns (e.g. decimals coming from
# Spark) cannot leak into the arrays handed to the model.
values = pdf[feature_cols].to_numpy(dtype=np.float64)
adj_close = pdf["adj_close"].to_numpy(dtype=np.float64)
dates = pd.to_datetime(pdf["date"]).values

X, y, date_arr = make_windows(values, adj_close, dates, lookback)
# X: (samples, timesteps, features); y: (samples,); date_arr: date of each label

print("X shape:", X.shape)
print("y shape:", y.shape)


In [0]:
# Chronological 80/20 split: the earliest windows train, the latest ones test.
split_idx = int(0.8 * len(X))
train_part = slice(None, split_idx)
test_part = slice(split_idx, None)

X_train, y_train = X[train_part], y[train_part]
X_test, y_test = X[test_part], y[test_part]
dates_test = date_arr[test_part]

len(X_train), len(X_test)


In [0]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (
    LSTM,
    Dense,
    Dropout,
    Input,
    Normalization,
    Rescaling,
)
from sklearn.metrics import mean_squared_error

# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation, dropout
# masks and batch shuffling are repeatable across notebook runs.
tf.keras.utils.set_random_seed(42)

n_timesteps = X.shape[1]
n_features = X.shape[2]

# The features mix price levels with returns/volatility, which live on very
# different scales. Standardise each feature inside the model, using statistics
# from the training windows only so nothing from the test period leaks in.
feature_scaler = Normalization(axis=-1)
feature_scaler.adapt(X_train)

# The target is a raw price. Let the network learn a roughly unit-scale value
# and map it back to price units with a fixed affine output layer, so
# `model_lstm.predict` still takes raw features and returns prices.
y_train_mean = float(np.mean(y_train))
y_train_std = float(np.std(y_train)) or 1.0  # guard against a constant target

model_lstm = Sequential(
    [
        Input(shape=(n_timesteps, n_features)),
        feature_scaler,
        LSTM(64),
        Dropout(0.2),
        Dense(1),
        Rescaling(scale=y_train_std, offset=y_train_mean),
    ]
)

model_lstm.compile(optimizer="adam", loss="mse")

# The test split is passed as validation data for monitoring only: no callbacks
# are registered, so it does not influence training or model selection.
history = model_lstm.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=20,
    batch_size=32,
    verbose=1
)

# Evaluate on the held-out (most recent) windows, in price units.
y_pred_test = model_lstm.predict(X_test).flatten()
rmse_lstm = np.sqrt(
    mean_squared_error(
        y_test,
        y_pred_test
    )
)
print("LSTM next-day close RMSE:", rmse_lstm)


In [0]:
# Score every window in X. Note that this covers the training windows as well
# as the test windows, so the resulting frame mixes in-sample and
# out-of-sample predictions.
y_pred_all = model_lstm.predict(X).flatten()

# One row per window: the label's calendar date, the actual close and the
# model's prediction for it.
pred_lstm_pdf = pd.DataFrame(
    {
        "date": pd.to_datetime(date_arr).astype("datetime64[ns]").date,
        "actual_close": y,
        "predicted_close_lstm": y_pred_all,
    }
)

pred_lstm_pdf.head()


In [0]:
# Convert the pandas predictions to a Spark DataFrame and replace the contents
# of the predictions table with them.
pred_lstm_spark = spark.createDataFrame(pred_lstm_pdf)

predictions_table = "main.stocks.nvda_predictions_lstm_nextday"
pred_lstm_spark.write.mode("overwrite").saveAsTable(predictions_table)
