In [21]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lag, when, lit
from pyspark.sql.window import Window
from pyspark.ml.feature import VectorAssembler, MinMaxScaler

# Initialize Spark
spark = SparkSession.builder.appName("StockPricePrediction").getOrCreate()

In [22]:
# Load Data
df = spark.read.csv("COCO COLA.csv", header=True, inferSchema=True)

In [23]:
# Feature Engineering
window_spec = Window().orderBy("Date")
df = df.withColumn("Prev_Close", lag("Close").over(window_spec))
df = df.withColumn("Price_Movement", when(col("Close") > col("Open"), 1).otherwise(0))

In [24]:
# Remove Nulls (from lag function)
df = df.dropna()

In [25]:
# Feature Selection
feature_cols = ["Prev_Close", "Open", "High", "Low", "Volume"]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
df = assembler.transform(df)

In [26]:
# Normalize Features
scaler = MinMaxScaler(inputCol="features", outputCol="scaled_features")
df = scaler.fit(df).transform(df)

df = df.select("scaled_features", "Close", "Price_Movement")

df.show(5)

+--------------------+--------+--------------+
|     scaled_features|   Close|Price_Movement|
+--------------------+--------+--------------+
|[0.00118395786148...|0.257161|             0|
|[0.00108619007685...|0.259115|             1|
|[0.00111879046101...|0.253255|             0|
|[0.00102102267638...|0.250651|             0|
|[9.77577742740058...|0.255208|             1|
+--------------------+--------+--------------+
only showing top 5 rows



In [30]:
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

# Train Random Forest Model
rf = RandomForestRegressor(featuresCol="scaled_features", labelCol="Close", numTrees=50)

In [31]:
# Train Model
rf_model = rf.fit(train)

# Predictions
rf_predictions = rf_model.transform(test)

In [32]:
# Evaluate Model
evaluator = RegressionEvaluator(labelCol="Close", metricName="rmse")
rmse = evaluator.evaluate(rf_predictions)
print(f"Random Forest RMSE: {rmse:.3f}")

# Show Results
rf_predictions.select("Close", "prediction").show(5)

Random Forest RMSE: 0.794
+--------+-------------------+
|   Close|         prediction|
+--------+-------------------+
|0.192708|0.33743747840463195|
|0.195313|0.33743747840463195|
|0.199219|0.33743747840463195|
|0.198242|0.33743747840463195|
|0.203125|0.33743747840463195|
+--------+-------------------+
only showing top 5 rows

