In [1]:
# Dependency note: every code cell below needs the `pyspark` package.
# The install command is kept disabled; uncomment it only when the kernel's
# environment does not already provide PySpark.
# !pip3 install pyspark

# **Introduction to Big Data**

# **Prediction Using Supervised ML - PySpark Linear Regression (Cryptocurrency Dataset - USD)**

## **Data Science & Business Analytics**

## **Student 1: 11355 | Asad Tariq Sheikh**

## **Student 2: 10718 | Muzammil Ahmed**

Dataset: https://raw.githubusercontent.com/asadsheikh1/MachineLearningPredictions/main/usd.csv

In [2]:
import urllib.request
from pathlib import Path

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.sql import SparkSession


# **Create a Spark session**

In [3]:
# Obtain the Spark session used to load the data: getOrCreate() returns the
# active session when there is one and builds a new one otherwise.
spark = (
    SparkSession.builder
    .appName("LinearRegressionExample")
    .getOrCreate()
)

# **Download the CSV file locally**

In [4]:
# Where the dataset lives and where its local copy is stored (relative to the
# kernel's working directory).
url = "https://raw.githubusercontent.com/asadsheikh1/MachineLearningPredictions/main/usd.csv"
local_file_path = "usd.csv"

# Download only when no local copy exists, so Restart & Run All does not hit
# the network on every run. Delete usd.csv to force a fresh download.
if not Path(local_file_path).is_file():
    # Fetch into a temporary name and rename on success: an interrupted
    # download then never leaves a truncated usd.csv that later runs would
    # mistake for the complete dataset.
    partial_path = local_file_path + ".part"
    # Calling urlretrieve inside the `if` also keeps its (filename, headers)
    # return value - whose repr embeds a memory address - out of the saved
    # cell output.
    urllib.request.urlretrieve(url, partial_path)
    Path(partial_path).replace(local_file_path)

('usd.csv', <http.client.HTTPMessage at 0x7d476a281e10>)

# **Load data from the local CSV file**

In [5]:
# Read the local CSV into a DataFrame: the first line provides the column
# names, and Spark infers each column's type from the data.
data = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(local_file_path)
)

# **Display the schema and the first 20 rows of the DataFrame**

In [6]:
# Column names and inferred types first, then a preview table.
data.printSchema()
# The arguments spell out show()'s defaults: 20 rows, long cell values truncated.
data.show(n=20, truncate=True)

root
 |-- Date: string (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: long (nullable = true)
 |-- Currency: string (nullable = true)

+----------+----+----+---+-----+------+--------+
|      Date|Open|High|Low|Close|Volume|Currency|
+----------+----+----+---+-----+------+--------+
|18/07/2010| 0.0| 0.1|0.1|  0.1|    75|     USD|
|19/07/2010| 0.1| 0.1|0.1|  0.1|   574|     USD|
|20/07/2010| 0.1| 0.1|0.1|  0.1|   262|     USD|
|21/07/2010| 0.1| 0.1|0.1|  0.1|   575|     USD|
|22/07/2010| 0.1| 0.1|0.1|  0.1|  2160|     USD|
|23/07/2010| 0.1| 0.1|0.1|  0.1|  2403|     USD|
|24/07/2010| 0.1| 0.1|0.1|  0.1|   496|     USD|
|25/07/2010| 0.1| 0.1|0.1|  0.1|  1551|     USD|
|26/07/2010| 0.1| 0.1|0.1|  0.1|   877|     USD|
|27/07/2010| 0.1| 0.1|0.1|  0.1|  3374|     USD|
|28/07/2010| 0.1| 0.1|0.1|  0.1|  4390|     USD|
|29/07/2010| 0.1| 0.1|0.1|  0.1|  8058|     USD|
|30

# **Assemble the feature vector (assuming `Close` is the column you want to predict)**

In [7]:
# Pack the predictor columns into the single vector column `features`.
# `Close` is left out of the inputs on purpose: it is the column to predict.
# NOTE(review): High and Low come from the same row as Close - confirm that
# using them as predictors is intended.
assembler = VectorAssembler(inputCols=['Open', 'High', 'Low', 'Volume'], outputCol='features')

# Remove any `features` column left over from an earlier run of this cell
# before transforming. VectorAssembler refuses to write to an output column
# that already exists, so without the drop this cell fails when executed a
# second time. drop() is a no-op when the column is absent (first run).
data = assembler.transform(data.drop(assembler.getOutputCol()))

# **Rename the `Close` column to `label`**

In [8]:
# Expose the target column under the name `label`; every other column is
# carried over unchanged.
data = data.withColumnRenamed(existing='Close', new='label')

# **Split the data into training and testing sets**

In [9]:
# Randomly partition the rows with weights 0.8 (training) and 0.2 (testing),
# using a fixed seed for the sampling.
train_data, test_data = data.randomSplit(
    weights=[0.8, 0.2],
    seed=1234,
)

# **Create a Linear Regression model**

In [10]:
# Unfitted linear-regression estimator; only the input and target column
# names are set here, everything else keeps the library defaults.
lr = LinearRegression(
    labelCol='label',
    featuresCol='features',
)

# **Train the model**

In [11]:
# Fit the estimator on the training split; the result is the fitted model
# whose coefficients and intercept are reported further down.
lr_model = lr.fit(dataset=train_data)

# **Make predictions on the test set**

In [12]:
# Score the held-out rows with the fitted model. The evaluation cell reads
# the result's `label` and `prediction` columns.
predictions = lr_model.transform(dataset=test_data)

# **Evaluate the model**

In [13]:
# Compare the `prediction` column against `label` on the test predictions
# and keep the root-mean-squared error.
evaluator = RegressionEvaluator(
    metricName='rmse',
    labelCol='label',
    predictionCol='prediction',
)
rmse = evaluator.evaluate(dataset=predictions)

# **Metrics and fitted model parameters**

In [14]:
# Report the test-set error and the fitted parameters, one item per line.
# Presumably the coefficients follow the order of the assembler's input
# columns (Open, High, Low, Volume) - verify before interpreting them.
print(
    f"Root Mean Squared Error (RMSE): {rmse}",
    f"Coefficients: {lr_model.coefficients}",
    f"Intercept: {lr_model.intercept}",
    sep="\n",
)

Root Mean Squared Error (RMSE): 328.3690506500255
Coefficients: [-0.5225964280228853,0.9436926781269785,0.5714438384313236,-8.648311819732884e-08]
Intercept: 1.2594583827791468


# **Stop the Spark session**

In [15]:
# Shut the Spark session down now that the analysis is complete, so it does
# not keep running after the notebook finishes. To continue working after
# this cell has run, re-execute the notebook from the session-creation cell:
# `spark` and the DataFrames built from it are no longer usable once stopped.
spark.stop()