# Linear Regression w/ Spark & MLlib

In [19]:
import findspark
findspark.init('/home/aforestier10/Downloads/spark-3.5.3-bin-hadoop3')
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('LinReg').getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

In [20]:
#import model
from pyspark.ml.regression import LinearRegression

In [21]:
### Load data
training = spark.read.format('libsvm').load('sample_linear_regression_data.txt')

                                                                                

## Build a very simple model

In [22]:
# Spark must be in only 2 columns for models. 1 label and features column
training.show()

+-------------------+--------------------+
|              label|            features|
+-------------------+--------------------+
| -9.490009878824548|(10,[0,1,2,3,4,5,...|
| 0.2577820163584905|(10,[0,1,2,3,4,5,...|
| -4.438869807456516|(10,[0,1,2,3,4,5,...|
|-19.782762789614537|(10,[0,1,2,3,4,5,...|
| -7.966593841555266|(10,[0,1,2,3,4,5,...|
| -7.896274316726144|(10,[0,1,2,3,4,5,...|
| -8.464803554195287|(10,[0,1,2,3,4,5,...|
| 2.1214592666251364|(10,[0,1,2,3,4,5,...|
| 1.0720117616524107|(10,[0,1,2,3,4,5,...|
|-13.772441561702871|(10,[0,1,2,3,4,5,...|
| -5.082010756207233|(10,[0,1,2,3,4,5,...|
|  7.887786536531237|(10,[0,1,2,3,4,5,...|
| 14.323146365332388|(10,[0,1,2,3,4,5,...|
|-20.057482615789212|(10,[0,1,2,3,4,5,...|
|-0.8995693247765151|(10,[0,1,2,3,4,5,...|
| -19.16829262296376|(10,[0,1,2,3,4,5,...|
|  5.601801561245534|(10,[0,1,2,3,4,5,...|
|-3.2256352187273354|(10,[0,1,2,3,4,5,...|
| 1.5299675726687754|(10,[0,1,2,3,4,5,...|
| -0.250102447941961|(10,[0,1,2,3,4,5,...|
+----------

In [23]:
# instantiate model object
model = LinearRegression(featuresCol='features', labelCol='label', predictionCol='my_model_predictions')

In [24]:
# fit model
lr_model = model.fit(training)

In [25]:
# show some attributes
lr_model.coefficients

DenseVector([0.0073, 0.8314, -0.8095, 2.4412, 0.5192, 1.1535, -0.2989, -0.5129, -0.6197, 0.6956])

In [26]:
lr_model.intercept

0.14228558260358093

In [27]:
# use training summary to call key methods and attributes
training_summary = lr_model.summary

In [28]:
training_summary.r2, training_summary.rootMeanSquaredError

(0.027839179518600154, 10.16309157133015)

In [29]:
# Actually split into train, test, split
data = spark.read.format('libsvm').load('sample_linear_regression_data.txt')
train_data, test_data = data.randomSplit([.7, .3])

In [30]:
train_data.describe().show(), test_data.describe().show()

+-------+-------------------+
|summary|              label|
+-------+-------------------+
|  count|                354|
|   mean| 0.8072096920319102|
| stddev| 10.387608867929963|
|    min|-28.571478869743427|
|    max|  27.78383192005107|
+-------+-------------------+

+-------+-------------------+
|summary|              label|
+-------+-------------------+
|  count|                147|
|   mean| -1.068373680677591|
| stddev| 10.059556471892726|
|    min|-28.046018037776633|
|    max| 26.903524792043335|
+-------+-------------------+



(None, None)

In [31]:
# Now make model
base = LinearRegression(featuresCol='features', labelCol='label', predictionCol='my_predictions')
lr_model = base.fit(train_data)

In [32]:
# Evaluate model
results = lr_model.evaluate(test_data)

In [33]:
results.residuals.show(), results.rootMeanSquaredError

+-------------------+
|          residuals|
+-------------------+
|-28.345748237830676|
| -24.02172473713055|
|-26.167282374698473|
|-20.311888308422922|
|-21.639654505695756|
|-21.354648434486172|
|-21.783744333213125|
| -20.06051186822888|
| -17.43712684760353|
| -19.41830393150117|
|-15.537844903112127|
| -16.72956778506602|
|-13.024918838818882|
|-17.515576846886447|
|-13.196149167020742|
| -16.75192211858612|
|-14.193691540420028|
|-14.076361860984653|
|-12.390582077370933|
|-11.143446572387965|
+-------------------+
only showing top 20 rows



(None, 10.2188915489845)

Improve model through tweaking model parameters when initializing object

In [34]:
unlabeled_data = test_data.select('features')

In [35]:
# How to show predictions based on the features
y_pred = lr_model.transform(unlabeled_data)
y_pred.show()

+--------------------+--------------------+
|            features|      my_predictions|
+--------------------+--------------------+
|(10,[0,1,2,3,4,5,...|  0.2997302000540432|
|(10,[0,1,2,3,4,5,...|  0.5342846161940393|
|(10,[0,1,2,3,4,5,...|  3.2174564385023974|
|(10,[0,1,2,3,4,5,...|   -2.52557210849642|
|(10,[0,1,2,3,4,5,...|  1.5821718899065438|
|(10,[0,1,2,3,4,5,...|  1.4700876602127464|
|(10,[0,1,2,3,4,5,...|   2.000981543598587|
|(10,[0,1,2,3,4,5,...|  1.7852983022242477|
|(10,[0,1,2,3,4,5,...| 0.11040611492758101|
|(10,[0,1,2,3,4,5,...|  3.1568736560484387|
|(10,[0,1,2,3,4,5,...| -0.4136676626824456|
|(10,[0,1,2,3,4,5,...|   1.370022905233344|
|(10,[0,1,2,3,4,5,...| -1.7378394141122435|
|(10,[0,1,2,3,4,5,...|   3.186598337811003|
|(10,[0,1,2,3,4,5,...| -0.6709387281380252|
|(10,[0,1,2,3,4,5,...|  3.8296990152156996|
|(10,[0,1,2,3,4,5,...|  1.6351157515638401|
|(10,[0,1,2,3,4,5,...|  1.5755880756295984|
|(10,[0,1,2,3,4,5,...|-0.08869813408056526|
|(10,[0,1,2,3,4,5,...| -1.324209