In [0]:
%pip install scikit-learn

In [0]:
from pyspark.sql import functions as F

# Source feature table, sorted by date before anything is collected.
df = spark.table("main.stocks.nvda_gold_features").orderBy("date")

# Columns brought into pandas: the date key plus the columns that are
# later used as model features.
selected_columns = [
    "date",
    "adj_close",
    "daily_return",
    "ret_5d",
    "ret_20d",
    "vol_20d",
]

# Collect to the driver as a pandas frame, then discard every row that has
# a missing value in any of the selected columns.
pdf = df.select(*selected_columns).toPandas()
pdf = pdf.dropna()

pdf.head()


In [0]:
import pandas as pd

# Build the supervised target: every row is labelled with the following
# row's adjusted close (`next_close`) and the date that close belongs to
# (`target_date`).
#
# The guard makes this cell idempotent. Without it, re-running the cell would
# shift the already-trimmed frame a second time and silently drop one more
# row from the end of `pdf` on every execution.
if not {"next_close", "target_date"}.issubset(pdf.columns):
    pdf = (
        pdf.assign(
            # next row's close as label
            next_close=pdf["adj_close"].shift(-1),
            # date corresponding to that next_close
            target_date=pdf["date"].shift(-1),
        )
        # the final row has no following row, so its label is missing
        .dropna(subset=["next_close", "target_date"])
    )

pdf.head()


In [0]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Model inputs, in the column order the forest is fitted with.
feature_cols = ["adj_close", "daily_return", "ret_5d", "ret_20d", "vol_20d"]

X = pdf[feature_cols].values
y = pdf["next_close"].values

# Positional hold-out: the first 80% of rows train the model and the
# remaining rows are kept for testing. Rows are never shuffled.
split_idx = int(len(pdf) * 0.8)
train_rows = slice(None, split_idx)
test_rows = slice(split_idx, None)

X_train, y_train = X[train_rows], y[train_rows]
X_test, y_test = X[test_rows], y[test_rows]

# NOTE(review): the target is a price level; a tree ensemble predicts
# averages of training targets, so check behaviour when test prices move
# outside the range seen in training.
rf_params = {
    "n_estimators": 300,
    "max_depth": 10,
    "random_state": 42,  # fixed seed for repeatable fits
    "n_jobs": -1,
}
model_rf = RandomForestRegressor(**rf_params)
model_rf.fit(X_train, y_train)

# Predictions on the held-out rows only.
y_pred_test = model_rf.predict(X_test)

# Error metrics on the held-out rows.
# RMSE is the square root of the mean squared error, so it is expressed in
# the same units as y_test.
rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
r2 = r2_score(y_test, y_pred_test)

print("Random Forest next-day close")
print("Test RMSE:", rmse)
# The label uses the real superscript-two character; the mis-encoded form
# "RÂ²" printed garbage and broke alignment with the RMSE line above.
print("Test R²:  ", r2)


In [0]:
# Score every row of the feature matrix. This covers the rows the model was
# fitted on as well as the held-out rows, so the output mixes in-sample and
# out-of-sample predictions.
y_pred_all = model_rf.predict(X)

# One output row per labelled input row, keyed by the date the label
# belongs to (i.e. the day being predicted, not the day of the features).
pred_pdf = pd.DataFrame(
    {
        "date": pd.to_datetime(pdf["target_date"]).dt.date,  # tomorrow's date
        "actual_close": pdf["next_close"].values,
        "predicted_close_rf": y_pred_all,
    }
)

pred_pdf.head()


In [0]:
# Destination table for the next-day predictions.
predictions_table = "main.stocks.nvda_predictions_rf_nextday"

# Convert the pandas frame to a Spark DataFrame and save it as a table,
# using overwrite mode so existing contents are replaced.
pred_spark = spark.createDataFrame(pred_pdf)
overwrite_writer = pred_spark.write.mode("overwrite")
overwrite_writer.saveAsTable(predictions_table)
