In [1]:
# Install step, intentionally left commented out: uncomment it on a fresh
# environment that does not already provide PySpark. The output recorded
# below this cell comes from an earlier run that installed pyspark 3.5.0.
# !pip3 install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 316.9/316.9 MB 2.5 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... done
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425345 sha256=b835d2b2742346bc75d2f02d62a9ac0802424a3591438d733c43eb304d6ad86a
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


# **Introduction to Big Data**

# **Prediction Using Supervised ML - PySpark Linear Regression (Cryptocurrency Dataset - DogeCoin)**

## **Data Science & Business Analytics**

## **Student 1: 11355 | Asad Tariq Sheikh**

## **Student 2: 10718 | Muzammil Ahmed**

Dataset: https://raw.githubusercontent.com/asadsheikh1/MachineLearningPredictions/main/dogecoin.csv

In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
import urllib.request

# **Create a Spark session**

In [3]:
# Create the notebook's Spark session, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("LinearRegressionExample")
    .getOrCreate()
)

# **Download the CSV file locally**

In [4]:
# Local filename for the dataset and the GitHub URL it is fetched from.
local_file_path = "dogecoin.csv"
url = "https://raw.githubusercontent.com/asadsheikh1/MachineLearningPredictions/main/dogecoin.csv"

# Kept as the trailing expression so the notebook echoes the
# (filename, headers) tuple returned by urlretrieve.
urllib.request.urlretrieve(url=url, filename=local_file_path)

('dogecoin.csv', <http.client.HTTPMessage at 0x7e7cf4155000>)

# **Load data from the local CSV file**

In [5]:
# Read the downloaded CSV: the first row supplies the column names and Spark
# infers each column's type from the data instead of reading everything as
# strings.
data = spark.read.options(header=True, inferSchema=True).csv(local_file_path)

# **Display the schema and first few rows of the DataFrame**

In [6]:
# Column names and inferred types first, then a preview of the rows.
data.printSchema()
# Explicit form of show()'s defaults: 20 rows, long cell values truncated.
data.show(n=20, truncate=True)

root
 |-- Date: date (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: long (nullable = true)
 |-- Currency: string (nullable = true)

+----------+--------+--------+--------+--------+-------+--------+
|      Date|    Open|    High|     Low|   Close| Volume|Currency|
+----------+--------+--------+--------+--------+-------+--------+
|2017-06-03|0.002803|0.003398|0.002655|0.003213|4369225|     USD|
|2017-06-04|0.003213| 0.00377|0.002544| 0.00377|5963160|     USD|
|2017-06-05| 0.00377|0.003825|0.002923|0.003718|4823315|     USD|
|2017-06-06|0.003718|0.003718|0.003152|0.003617|2114105|     USD|
|2017-06-07|0.003617| 0.00395|0.003132| 0.00325|1371475|     USD|
|2017-06-08| 0.00325|0.003511|0.003055|0.003431|1544340|     USD|
|2017-06-09|0.003431|0.003752| 0.00329|0.003426|3428495|     USD|
|2017-06-10|0.003426|0.003591| 0.00313|0.003278|2244725|     USD|
|2017-06-11|0.

# **Assemble the feature vector (assuming `Close` is the column you want to predict)**

In [7]:
# Pack the predictor columns into the single vector column ('features') that
# the LinearRegression estimator reads through its featuresCol setting.
assembler = VectorAssembler(inputCols=['Open', 'High', 'Low', 'Volume'], outputCol='features')

# `data` is reassigned in place, so re-running this cell would otherwise fail
# because the 'features' output column already exists. Dropping it first makes
# the cell idempotent; drop() is a no-op when the column is absent, so the
# first run is unaffected.
data = assembler.transform(data.drop('features'))

# **Rename the `Close` column to `label`**

In [8]:
# Expose the prediction target under the name 'label', which is the column
# the estimator and evaluator in later cells are configured to read.
data = data.withColumnRenamed(existing='Close', new='label')

# **Split the data into training and testing sets**

In [9]:
# 80/20 random split into training and test sets, with a fixed seed.
train_data, test_data = data.randomSplit(
    weights=[0.8, 0.2],
    seed=1234,
)

# **Create a Linear Regression model**

In [10]:
# Unfitted linear-regression estimator wired to the assembled feature vector
# and the renamed target column.
lr = LinearRegression(
    featuresCol='features',
    labelCol='label',
)

# **Train the model**

In [11]:
# Fit the estimator on the training split only; the result is the fitted model.
lr_model = lr.fit(dataset=train_data)

# **Make predictions on the test set**

In [12]:
# Apply the fitted model to the held-out split; the 'prediction' column of
# the result is what the evaluator reads.
predictions = lr_model.transform(dataset=test_data)

# **Evaluate the model**

In [13]:
# Score the test-set predictions with root-mean-squared error. The evaluator
# is built and run once; a verbatim repeat would only re-execute the same
# evaluation and overwrite `rmse` with an identical value.
evaluator = RegressionEvaluator(labelCol='label', predictionCol='prediction', metricName='rmse')
rmse = evaluator.evaluate(predictions)

# **Metrics**

In [14]:
# Report the test-set error followed by the fitted model's parameters.
# Each metric is printed exactly once.
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"Coefficients: {lr_model.coefficients}")
print(f"Intercept: {lr_model.intercept}")

Root Mean Squared Error (RMSE): 0.0032919806409871764
Coefficients: [-0.43902470153562245,0.8051312617780186,0.6254544463582187,-2.976581469242082e-14]
Intercept: 8.288226063960647e-05
Root Mean Squared Error (RMSE): 0.0032919806409871764


# **Stop the Spark session**

In [15]:
# Shut down the Spark session so the resources it holds are released once the
# notebook has finished. To execute earlier cells again afterwards, re-run the
# "Create a Spark session" cell first to obtain a fresh session.
spark.stop()