In [0]:
%pip install scikit-learn

Load the Gold feature table into a pandas DataFrame

In [0]:
from pyspark.sql import functions as F
import pandas as pd

# Columns pulled from the Gold feature table: the date key, the five model
# features, and the forward 30-day return used as the target.
gold_columns = [
    "date",
    "adj_close",
    "daily_return",
    "ret_5d",
    "ret_20d",
    "vol_20d",
    "fwd_30d_return",
]

# Read the table sorted by date so later positional (time-based) splits are valid.
df = spark.table("main.stocks.nvda_gold_features").orderBy("date")

# Collect the selected columns to the driver as pandas and drop every row
# that has a missing value in any of them.
pdf = (
    df.select(*gold_columns)
    .toPandas()
    .dropna()
)

pdf.head()

Train/test split & RandomForest

In [0]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Train a RandomForest regressor to predict `fwd_30d_return` from the feature
# columns, then evaluate it on a chronological hold-out set.
feature_cols = ["adj_close", "daily_return", "ret_5d", "ret_20d", "vol_20d"]
target_col  = "fwd_30d_return"

# Plain NumPy arrays for scikit-learn; `dates` is kept row-aligned with X / y.
X = pdf[feature_cols].values
y = pdf[target_col].values
dates = pd.to_datetime(pdf["date"]).values

# Time-based split (no shuffle): the first 80% of rows train the model and the
# remaining 20% are held out. This assumes `pdf` is already in date order.
# NOTE(review): if the target looks 30 rows ahead, the last training rows carry
# labels computed from prices inside the test window — consider leaving an
# embargo gap between train and test. TODO confirm how the target is built.
split_idx = int(len(X) * 0.8)

X_train, X_test = X[:split_idx], X[split_idx:]
y_train, y_test = y[:split_idx], y[split_idx:]
dates_test = dates[split_idx:]

model_rf_30d = RandomForestRegressor(
    n_estimators=300,
    max_depth=10,
    random_state=42,  # fixed seed so re-runs build the same forest
    n_jobs=-1         # use all available cores
)

model_rf_30d.fit(X_train, y_train)

y_pred_test = model_rf_30d.predict(X_test)

# mean_squared_error returns the MSE; take the square root so the number
# reported as "RMSE" really is a root-mean-squared error, in target units.
rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
r2   = r2_score(y_test, y_pred_test)

print("Random Forest 30-day return")
print("Test RMSE:", rmse)
print("Test R²:  ", r2)


### Create predictions table 

Predict for all dates (training and test rows alike, so predictions for the first 80% of dates are in-sample)

In [0]:
# Score the full feature matrix, i.e. every row, not just the hold-out slice.
y_pred_all = model_rf_30d.predict(X)

# Assemble date / actual / predicted side by side. Dates are converted to
# plain Python `date` objects rather than timestamps.
pred_rf_30d_pdf = pd.DataFrame(
    {
        "date": pd.to_datetime(dates).astype("datetime64[ns]").date,
        "actual_fwd_30d_return": y,
        "pred_fwd_30d_return_rf": y_pred_all,
    }
)

pred_rf_30d_pdf.head()


Save to Delta table

In [0]:
# Convert the pandas predictions into a Spark DataFrame.
pred_rf_30d_spark = spark.createDataFrame(pred_rf_30d_pdf)

# Persist the predictions, replacing any existing contents of the table.
(
    pred_rf_30d_spark.write
    .mode("overwrite")
    .saveAsTable("main.stocks.nvda_predictions_rf_30d_return")
)


In [0]:
# Sanity check: read the saved table back and show its five earliest rows.
saved_predictions_preview = (
    spark.table("main.stocks.nvda_predictions_rf_30d_return")
    .orderBy("date")
    .limit(5)
)
display(saved_predictions_preview)