In [1]:
# Uncomment the line below to install PySpark when it is missing from the
# environment. The captured log that follows comes from a run that installed
# pyspark 3.5.0.
# !pip3 install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 316.9/316.9 MB 3.4 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... done
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425345 sha256=051f2938c3f239c9c30b7baa14fca5ae8014ee09d1241e1af7e4ad0f959a7de6
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


# **Introduction to Big Data**

# **Prediction Using Supervised ML - PySpark Linear Regression (Cryptocurrency Dataset - BTC)**

## **Data Science & Business Analytics**

## **Student 1: 11355 | Asad Tariq Sheikh**

## **Student 2: 10718 | Muzammil Ahmed**

Dataset: https://raw.githubusercontent.com/asadsheikh1/MachineLearningPredictions/main/bitcoin.csv

In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
import urllib.request

# **Create a Spark session**

In [3]:
# Entry point for all DataFrame and ML calls in this notebook; getOrCreate
# hands back an already-active session when there is one.
spark = (
    SparkSession.builder
    .appName("LinearRegressionExample")
    .getOrCreate()
)

# **Download the CSV file locally**

In [4]:
# Remote location of the Bitcoin price history.
url = "https://raw.githubusercontent.com/asadsheikh1/MachineLearningPredictions/main/bitcoin.csv"
# Name of the local copy, written relative to the working directory.
local_file_path = "bitcoin.csv"

# Fetch the file to disk. The call is left as the cell's last expression, so
# its (filename, headers) return value is what the cell displays.
urllib.request.urlretrieve(url=url, filename=local_file_path)

('bitcoin.csv', <http.client.HTTPMessage at 0x7dfc78c95120>)

# **Load data from the local CSV file**

In [5]:
# Read the downloaded CSV: the first row supplies the column names and Spark
# infers each column's type from the values.
data = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(local_file_path)
)

# **Display the schema and first few rows of the DataFrame**

In [6]:
# Column names and inferred types, printed as a tree.
data.printSchema()
# Tabular preview; n=20 and truncate=True are show()'s defaults, spelled out.
data.show(n=20, truncate=True)

root
 |-- Date: date (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: long (nullable = true)
 |-- Currency: string (nullable = true)

+----------+----+----+---+-----+------+--------+
|      Date|Open|High|Low|Close|Volume|Currency|
+----------+----+----+---+-----+------+--------+
|2010-07-18| 0.0| 0.1|0.1|  0.1|    75|     USD|
|2010-07-19| 0.1| 0.1|0.1|  0.1|   574|     USD|
|2010-07-20| 0.1| 0.1|0.1|  0.1|   262|     USD|
|2010-07-21| 0.1| 0.1|0.1|  0.1|   575|     USD|
|2010-07-22| 0.1| 0.1|0.1|  0.1|  2160|     USD|
|2010-07-23| 0.1| 0.1|0.1|  0.1|  2403|     USD|
|2010-07-24| 0.1| 0.1|0.1|  0.1|   496|     USD|
|2010-07-25| 0.1| 0.1|0.1|  0.1|  1551|     USD|
|2010-07-26| 0.1| 0.1|0.1|  0.1|   877|     USD|
|2010-07-27| 0.1| 0.1|0.1|  0.1|  3374|     USD|
|2010-07-28| 0.1| 0.1|0.1|  0.1|  4390|     USD|
|2010-07-29| 0.1| 0.1|0.1|  0.1|  8058|     USD|
|2010

# **Assemble the feature vector (`Close` is the column to predict)**

In [7]:
# Combine the numeric inputs into the single vector column that Spark ML
# estimators read. `Close` is not listed because it is the value being
# predicted.
assembler = VectorAssembler(inputCols=['Open', 'High', 'Low', 'Volume'], outputCol='features')

# Drop any `features` column left over from an earlier run of this cell before
# assembling: VectorAssembler refuses to write to an output column that already
# exists, so without the drop the cell fails when executed a second time.
# DataFrame.drop is a no-op when the column is absent, so the first run is
# unaffected.
data = assembler.transform(data.drop('features'))

# **Rename the `Close` column to `label`**

In [8]:
# Expose the target price under the column name `label`, which is the name the
# regression estimator and evaluator in this notebook are configured with.
data = data.withColumnRenamed(existing='Close', new='label')

# **Split the data into training and testing sets**

In [9]:
# Randomly assign rows to a training set (weight 0.8) and a test set
# (weight 0.2), using a fixed seed for the sampling.
train_data, test_data = data.randomSplit(
    weights=[0.8, 0.2],
    seed=1234,
)

# **Create a Linear Regression model**

In [10]:
# Estimator only; nothing is trained until fit() is called on it.
lr = LinearRegression(
    labelCol='label',
    featuresCol='features',
)

# **Train the model**

In [11]:
# Fit the regression on the training rows only, so the test rows stay unseen
# until evaluation.
lr_model = lr.fit(dataset=train_data)

# **Make predictions on the test set**

In [12]:
# Apply the fitted model to the held-out rows; the result carries the model's
# output alongside the original columns.
predictions = lr_model.transform(dataset=test_data)

# **Evaluate the model**

In [16]:
# Score the test-set predictions with root mean squared error, comparing the
# `prediction` column against the true `label` column. The evaluator is built
# and run once; repeating the same two statements only recomputed the same
# metric over the same DataFrame.
evaluator = RegressionEvaluator(labelCol='label', predictionCol='prediction', metricName='rmse')
rmse = evaluator.evaluate(predictions)

# **Metrics**

In [17]:
# Report the test error followed by the fitted parameters. Each value is
# printed once; the RMSE line was previously emitted a second time at the end.
print(f"Root Mean Squared Error (RMSE): {rmse}")
# Coefficients follow the order of the assembler's inputCols.
print(f"Coefficients: {lr_model.coefficients}")
print(f"Intercept: {lr_model.intercept}")

Root Mean Squared Error (RMSE): 313.07742149255967
Coefficients: [-0.5134050434234165,0.9155894600413454,0.5921274086035752,-4.8405627065459295e-08]
Intercept: -1.3033403227673066
Root Mean Squared Error (RMSE): 313.07742149255967


# **Stop the Spark session**

In [15]:
# Shut down the Spark session once the analysis is finished so its resources
# are released. After this runs, earlier cells need a fresh session (re-run the
# "Create a Spark session" cell) before they can be executed again.
spark.stop()