In [0]:
%pip install scikit-learn

Load Gold table into Pandas

In [0]:
from pyspark.sql import functions as F

# Columns pulled from the Gold feature table: the date key, the model inputs,
# and the forward-return target.
gold_columns = [
    "date",
    "adj_close",
    "daily_return",
    "ret_5d",
    "ret_20d",
    "vol_20d",
    "fwd_30d_return",
]

# Sort in Spark so rows arrive in pandas in date order.
df = spark.table("main.stocks.nvda_gold_features").orderBy("date")

# Collect to the driver as a pandas frame, then drop every row that has a
# null in any of the selected columns (features or target).
pdf = df.select(*gold_columns).toPandas().dropna()

pdf.head()

Train/test split & RandomForest

In [0]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

feature_cols = ["adj_close", "daily_return", "ret_5d", "ret_20d", "vol_20d"]
target_col = "fwd_30d_return"

# Number of rows removed from the END of the training window (an "embargo").
# The target name (fwd_30d_return) indicates a 30-day look-ahead, so the label of
# a row just before the split is presumably computed from prices that fall inside
# the test window. Training on those rows leaks test-period information and
# inflates the reported test metrics.
# NOTE(review): assumes one row per trading day; 30 rows then covers the horizon
# whether it is counted in trading days or calendar days — confirm against the
# Gold table definition.
EMBARGO_ROWS = 30

X = pdf[feature_cols].values
y = pdf[target_col].values

# Time-series: no shuffle so we preserve order (first 80% train, last 20% test).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False
)

# Purge the training rows whose labels overlap the test period.
if len(X_train) <= EMBARGO_ROWS:
    raise ValueError(
        f"Training window has {len(X_train)} rows, which is not more than the "
        f"{EMBARGO_ROWS}-row embargo; not enough history to train without leakage."
    )
X_train = X_train[:-EMBARGO_ROWS]
y_train = y_train[:-EMBARGO_ROWS]

# Fixed random_state keeps the fitted forest reproducible across re-runs.
model = RandomForestRegressor(
    n_estimators=200,
    max_depth=8,
    random_state=42
)
model.fit(X_train, y_train)

# Out-of-sample evaluation on the untouched, chronologically later test window.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Test MSE:", mse)
print("Test R2:", r2)


### Create predictions table 

Predict for all dates

In [0]:
import pandas as pd

# Score every date that has a complete feature vector.
# `pdf` cannot be reused here: it was built with a blanket dropna(), which also
# removes rows whose target is null. Presumably those are the most recent dates,
# whose forward return is not known yet — exactly the rows a forecast is needed
# for (verify against the Gold table). Re-collect from the Spark frame and drop
# only rows with missing FEATURES.
pdf_pred = (
    df.select("date", *feature_cols, target_col)
    .toPandas()
    .dropna(subset=feature_cols)
    .reset_index(drop=True)
)

# NOTE: predictions for dates inside the training window are in-sample and will
# look better than true out-of-sample forecasts.
pdf_pred["pred_fwd_30d_return"] = model.predict(pdf_pred[feature_cols].values)

# The latest rows may show a null actual next to the prediction.
pdf_pred[["date", "fwd_30d_return", "pred_fwd_30d_return"]].tail()


Save to Delta table

In [0]:
# Persist only the date key and the model output.
prediction_columns = ["date", "pred_fwd_30d_return"]
pred_spark = spark.createDataFrame(pdf_pred[prediction_columns])

# "overwrite" replaces the table contents on every run, so re-running the
# notebook does not append duplicate rows.
(
    pred_spark.write
    .mode("overwrite")
    .saveAsTable("main.stocks.nvda_predictions")
)


In [0]:
# Read the saved table back and show its five earliest dates as a quick check
# that the write succeeded.
saved_predictions = spark.table("main.stocks.nvda_predictions")
display(saved_predictions.orderBy("date").limit(5))