In [0]:
%pip install tensorflow scikit-learn


In [0]:
from pyspark.sql import functions as F
import pandas as pd
import numpy as np

# Read the gold feature table from the metastore, sorted by date.
df = spark.table("main.stocks.nvda_gold_features").orderBy("date")

# Columns pulled to the driver: the date key, the model inputs, and the
# forward-return target.
gold_columns = [
    "date",
    "adj_close",
    "daily_return",
    "ret_5d",
    "ret_20d",
    "vol_20d",
    "fwd_30d_return",
]

# Collect into pandas, then discard every row that has a null in any of the
# selected columns.
collected_pdf = df.select(*gold_columns).toPandas()
pdf = collected_pdf.dropna()

pdf.head()


In [0]:
# Model inputs and the regression target, by column name.
feature_cols = ["adj_close", "daily_return", "ret_5d", "ret_20d", "vol_20d"]
target_col  = "fwd_30d_return"

lookback = 60  # number of preceding rows that make up one input sequence

# Pull plain arrays out of the frame once so the windowing below is pure NumPy.
values = pdf[feature_cols].values
targets = pdf[target_col].values
dates   = pd.to_datetime(pdf["date"]).values

# Row positions that have a full `lookback`-row history behind them.
sample_rows = range(lookback, len(pdf))

# For each such row: the window is the `lookback` rows strictly BEFORE it
# (the row itself is excluded), the label is that row's target, and the row's
# date is kept so predictions can be lined up with dates later.
X_list = [values[row - lookback:row, :] for row in sample_rows]
y_list = [targets[row] for row in sample_rows]
date_list = [dates[row] for row in sample_rows]

X = np.array(X_list)      # (samples, timesteps, features)
y = np.array(y_list)      # (samples,)
date_arr = np.array(date_list)

print("X shape:", X.shape)
print("y shape:", y.shape)


In [0]:
# Chronological 80/20 split: the first 80% of samples train, the tail tests.
# NOTE(review): assumes X / y / date_arr are in ascending date order - confirm.
split_idx = int(len(X) * 0.8)

# The target is a 30-day FORWARD return, so the label of a training sample
# taken just before the split is computed from prices that fall inside the
# test period. Training on those rows leaks look-ahead information and makes
# the test error look better than it is. Drop ("embargo") the last
# `label_horizon` training samples so no training label overlaps the test span.
# NOTE(review): assumes the forward window spans at most 30 rows (one row per
# day) - confirm against how fwd_30d_return is built upstream.
label_horizon = 30
train_end = max(split_idx - label_horizon, 0)  # never negative on tiny inputs

X_train, X_test = X[:train_end], X[split_idx:]
y_train, y_test = y[:train_end], y[split_idx:]
dates_test = date_arr[split_idx:]

len(X_train), len(X_test)


In [0]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout
from tensorflow.keras.utils import set_random_seed
from sklearn.metrics import mean_squared_error

# Seed the Python, NumPy and TensorFlow RNGs in one call so weight
# initialisation, dropout masks and batch shuffling repeat on every
# "Restart & Run All". Without a seed each run trains a different model.
# NOTE(review): some GPU kernels can still be nondeterministic even when seeded.
SEED = 42
set_random_seed(SEED)

n_timesteps = X.shape[1]
n_features  = X.shape[2]

# Architecture: one 64-unit LSTM over the input window, dropout for
# regularisation, then a single linear unit that outputs the regression value.
# The input shape is declared with an explicit Input layer; passing
# `input_shape=` to the first layer is deprecated in Keras 3.
model_lstm_30d = Sequential([
    Input(shape=(n_timesteps, n_features)),
    LSTM(64),
    Dropout(0.2),
    Dense(1),
])

model_lstm_30d.compile(optimizer="adam", loss="mse")

# validation_data is the held-out test split, so the reported val_loss is for
# monitoring only: the epoch count is fixed and nothing is selected on it.
history = model_lstm_30d.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=20,
    batch_size=32,
    verbose=1
)


In [0]:
# Score the fitted model on the held-out sequences.
# predict() output is flattened to 1-D so it lines up element-wise with y_test.
test_predictions = model_lstm_30d.predict(X_test)
y_pred_test = test_predictions.flatten()

# RMSE = square root of the mean squared error between labels and predictions.
test_mse = mean_squared_error(y_test, y_pred_test)
rmse = np.sqrt(test_mse)
print("LSTM 30-day return RMSE:", rmse)


In [0]:
# Run the model over every sequence in X, not only the test slice, so the
# output covers the whole history that has a full lookback window.
all_predictions = model_lstm_30d.predict(X)
y_pred_all = all_predictions.flatten()

# Label dates as plain `datetime.date` objects (one per sequence).
label_dates = pd.to_datetime(date_arr).astype("datetime64[ns]").date

# One row per sequence: its label date, the realised target, and the prediction.
pred_lstm_30d_pdf = pd.DataFrame(
    {
        "date": label_dates,
        "actual_fwd_30d_return": y,
        "pred_fwd_30d_return_lstm": y_pred_all,
    }
)

pred_lstm_30d_pdf.head()


In [0]:
# Destination table for the LSTM predictions.
predictions_table = "main.stocks.nvda_predictions_lstm_30d_return"

# Convert the pandas prediction frame into a Spark DataFrame.
pred_lstm_30d_spark = spark.createDataFrame(pred_lstm_30d_pdf)

# mode("overwrite") replaces whatever the table currently holds on each run.
table_writer = pred_lstm_30d_spark.write.mode("overwrite")
table_writer.saveAsTable(predictions_table)
