In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.3.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.3-py2.py3-none-any.whl size=317840625 sha256=6a1f33bb27a7fdf1f9b83908d34b8433c35dee4847dece852e9fefcb85de54ab
  Stored in directory: /root/.cache/pip/wheels/1b/3a/92/28b93e2fbfdbb07509ca4d6f50c5e407f48dce4ddbda69a4ab
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.3


In [1]:
from pyspark.ml.regression import LinearRegression
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.sql.types import StructType, StructField, DoubleType

from pyspark.ml.linalg import Vectors, VectorUDT


Initialize the Spark session

In [2]:
# Obtain the session named "LinearRegression", reusing a live one if present.
spark = (
    SparkSession.builder
    .appName("LinearRegression")
    .getOrCreate()
)

Get input data

Input data example:

We have two columns separated by a comma.

-1.74,1.66<br>
1.24,-1.18<br>
0.29,-0.40<br>
-0.13,0.09<br>
-0.39,0.38<br>
-1.79,1.73<br>

In [4]:
# Load the raw text file through the session's SparkContext, one element per line.
inputLines = spark.sparkContext.textFile(
    "./regression.txt"
)

We use the RDD interface to parse the data. We map over each row `x` of the RDD and extract the first column as the label we are predicting (the amount spent); the columns after it are the features. In our case there is only one feature, the page speed. For multivariate linear regression we could instead build a dense vector containing multiple features.

In [5]:
# Peek at the first two raw lines to confirm the "label,feature" text format.
inputLines.take(2)

['-1.74,1.66', '1.24,-1.18']

In [6]:
def _parse_line(line):
    """Convert one comma-separated text line into a (label, features) pair.

    The first field becomes the float label; the second field becomes a
    one-element dense feature vector.
    """
    fields = line.split(",")
    label = float(fields[0])
    feature = float(fields[1])
    return (label, Vectors.dense(feature))


data = inputLines.map(_parse_line)

We now create a Spark DataFrame.

In [7]:
# Two nullable columns: a double-valued label and a vector of features.
schema = (
    StructType()
    .add("label", DoubleType(), True)
    .add("features", VectorUDT(), True)
)

# Attach the schema to the parsed RDD, then preview the first two rows.
df = spark.createDataFrame(data, schema)
df.take(2)

[Row(label=-1.74, features=DenseVector([1.66])),
 Row(label=1.24, features=DenseVector([-1.18]))]

In [9]:
# Fix the seed so the 80/20 split, and every result derived from it, is the
# same on each Restart & Run All instead of changing from run to run.
SPLIT_SEED = 42

# randomSplit returns one DataFrame per weight, in the order given.
trainTest = df.randomSplit([0.8, 0.2], seed=SPLIT_SEED)
trainDF, testDF = trainTest

In [10]:
# Display the training DataFrame's repr (column names and types).
trainDF

DataFrame[label: double, features: vector]

Initializing Linear Regression with hyperparameters:


- maxIter: The maximum number of iterations the optimization algorithm may run.
- regParam: The regularization strength (≥ 0); larger values shrink the coefficients more, helping to prevent overfitting.
- elasticNetParam: The elastic-net mixing parameter in [0, 1]; 0 applies a pure L2 (ridge) penalty, 1 applies a pure L1 (lasso) penalty, and values in between blend the two.

In [11]:
# Configure (but do not yet fit) the estimator; see the hyperparameter notes above.
spark_LR = LinearRegression(
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
    labelCol="label",
)

In [12]:
# Fit the configured estimator on the training split to obtain the model.
model = spark_LR.fit(trainDF)

Next, we generate predictions for the test set. We cache the resulting DataFrame so that further operations on it do not recompute the predictions.

In [13]:
# Score the held-out split and cache the result for the cells that follow.
predictions = (
    model
    .transform(testDF)
    .cache()
)

In [14]:
# Print the leading rows: the label, the feature vector, and the model's prediction.
predictions.show()

+-----+--------+-------------------+
|label|features|         prediction|
+-----+--------+-------------------+
|-2.09|  [1.97]|-1.3895233295705949|
| -2.0|  [2.02]|-1.4248063975333296|
|-1.91|  [1.86]|-1.3119005800525783|
|-1.77|  [1.66]|-1.1707683082016387|
|-1.75|  [1.69]|-1.1919381489792797|
|-1.64|  [1.84]|-1.2977873528674844|
| -1.6|  [1.63]|-1.1495984674239978|
|-1.58|  [1.45]|-1.0225794227581524|
|-1.57|  [1.56]|-1.1002021722761692|
|-1.48|  [1.38]|-0.9731831276103237|
| -1.4|  [1.32]| -0.930843446055042|
|-1.36|  [1.41]|-0.9943529683879646|
|-1.26|  [1.17]|-0.8249942421668374|
|-1.25|  [1.32]| -0.930843446055042|
|-1.22|   [1.2]|-0.8461640829444783|
|-1.11|  [1.23]|-0.8673339237221191|
|-1.09|  [1.06]|-0.7473714926488209|
|-1.09|  [1.18]|-0.8320508557593843|
|-0.95|  [0.84]|-0.5921259936127875|
|-0.91|  [1.03]|-0.7262016518711799|
+-----+--------+-------------------+
only showing top 20 rows



In [15]:
# Shut down the Spark session now that the analysis is finished.
spark.stop()