Importing Required Libraries

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lag, avg
from pyspark.sql.window import Window
from pyspark.ml.feature import VectorAssembler, MinMaxScaler
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator

Initialize Spark Session

In [5]:
spark = SparkSession.builder.appName("XGBoost_PySpark").getOrCreate()

Load Dataset

In [6]:
df = spark.read.csv("Data/coin_ChainLink.csv", header=True, inferSchema=True)
df = df.orderBy("Date")

Feature Engineering

In [7]:
windowSpec = Window.orderBy("Date")
df = df.withColumn("RSI", (col("Close") - lag("Close", 14).over(windowSpec)) / col("Close") * 100)
df = df.withColumn("MA7", avg("Close").over(Window.orderBy("Date").rowsBetween(-6, 0)))
df = df.withColumn("MA21", avg("Close").over(Window.orderBy("Date").rowsBetween(-20, 0)))
df = df.dropna()

Feature Selection

In [8]:
features = ["High", "Low", "Open", "Volume", "Marketcap", "RSI", "MA7", "MA21"]
target = "Close"
assembler = VectorAssembler(inputCols=features, outputCol="features")
df = assembler.transform(df)

Normalize Features

In [9]:
scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")
df = scaler.fit(df).transform(df)


25/03/04 04:25:28 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/04 04:25:28 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/04 04:25:28 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/04 04:25:28 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/04 04:25:28 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/04 04:25:28 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


Train-Test Split

In [10]:
train_df, test_df = df.randomSplit([0.8, 0.2], seed=42)

Define XGBoost Model

In [11]:
model = GBTRegressor(featuresCol="scaledFeatures", labelCol="Close", maxIter=50, maxDepth=5)


Train Model

In [12]:
xgb_model = model.fit(train_df)

25/03/04 04:25:29 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/04 04:25:29 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/04 04:25:29 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/04 04:25:29 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/04 04:25:29 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/04 04:25:29 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/04 0

Predictions

In [13]:
predictions = xgb_model.transform(test_df)

Evaluation


In [14]:
mse_evaluator = RegressionEvaluator(labelCol="Close", predictionCol="prediction", metricName="mse")
mae_evaluator = RegressionEvaluator(labelCol="Close", predictionCol="prediction", metricName="mae")
r2_evaluator = RegressionEvaluator(labelCol="Close", predictionCol="prediction", metricName="r2")

mse = mse_evaluator.evaluate(predictions)
mae = mae_evaluator.evaluate(predictions)
r2 = r2_evaluator.evaluate(predictions)

print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"R-Squared (R2 Score): {r2:.4f}")


25/03/04 04:25:37 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/04 04:25:37 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/04 04:25:37 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/04 04:25:37 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/04 04:25:37 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/04 04:25:37 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/04 0

Mean Squared Error (MSE): 0.3828
Mean Absolute Error (MAE): 0.2988
R-Squared (R2 Score): 0.9956


25/03/04 04:25:37 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/04 04:25:37 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


Stop Spark Session

In [15]:
spark.stop()