one of main quirks of using mllib is that you need to format you data so that eventually it just has one or two columns     
* Features , Label >> SuperVised
* Feature >> UnsuperVised

In [7]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression

In [8]:
spark = SparkSession.builder.appName("ml").getOrCreate()

In [9]:
# libsvm is a type of file format
training = spark.read.format("libsvm").load("data/sample_linear_regression_data.txt")
training.show()

+-------------------+--------------------+
|              label|            features|
+-------------------+--------------------+
| -9.490009878824548|(10,[0,1,2,3,4,5,...|
| 0.2577820163584905|(10,[0,1,2,3,4,5,...|
| -4.438869807456516|(10,[0,1,2,3,4,5,...|
|-19.782762789614537|(10,[0,1,2,3,4,5,...|
| -7.966593841555266|(10,[0,1,2,3,4,5,...|
| -7.896274316726144|(10,[0,1,2,3,4,5,...|
| -8.464803554195287|(10,[0,1,2,3,4,5,...|
| 2.1214592666251364|(10,[0,1,2,3,4,5,...|
| 1.0720117616524107|(10,[0,1,2,3,4,5,...|
|-13.772441561702871|(10,[0,1,2,3,4,5,...|
| -5.082010756207233|(10,[0,1,2,3,4,5,...|
|  7.887786536531237|(10,[0,1,2,3,4,5,...|
| 14.323146365332388|(10,[0,1,2,3,4,5,...|
|-20.057482615789212|(10,[0,1,2,3,4,5,...|
|-0.8995693247765151|(10,[0,1,2,3,4,5,...|
| -19.16829262296376|(10,[0,1,2,3,4,5,...|
|  5.601801561245534|(10,[0,1,2,3,4,5,...|
|-3.2256352187273354|(10,[0,1,2,3,4,5,...|
| 1.5299675726687754|(10,[0,1,2,3,4,5,...|
| -0.250102447941961|(10,[0,1,2,3,4,5,...|
+----------

In [14]:
lr = LinearRegression(featuresCol='features', labelCol='label', predictionCol='prediction')

In [16]:
lr_model = lr.fit(training)

In [19]:
lr_model.coefficients

DenseVector([0.0073, 0.8314, -0.8095, 2.4412, 0.5192, 1.1535, -0.2989, -0.5129, -0.6197, 0.6956])

In [20]:
lr_model.intercept

0.14228558260358093

In [21]:
training_summary = lr_model.summary

In [23]:
training_summary.residuals.show()

+-------------------+
|          residuals|
+-------------------+
|-11.011130022096554|
| 0.9236590911176538|
|-4.5957401897776675|
|  -20.4201774575836|
|-10.339160314788181|
|-5.9552091439610555|
|-10.726906349283922|
|  2.122807193191233|
|  4.077122222293811|
|-17.316168071241652|
| -4.593044343959059|
|  6.380476690746936|
| 11.320566035059846|
|-20.721971774534094|
| -2.736692773777401|
| -16.66886934252847|
|  8.242186378876315|
|-1.3723486332690233|
|-0.7060332131264666|
|-1.1591135969994064|
+-------------------+
only showing top 20 rows



In [27]:
print(f"RMSE {training_summary.rootMeanSquaredError} \n")
print(f"R2 {training_summary.r2}")

RMSE 10.16309157133015 

R2 0.027839179518600154


In [29]:
all_data = spark.read.format("libsvm").load("data/sample_linear_regression_data.txt")

In [34]:
print(f"There are {all_data.count()} Rows and {len(all_data.columns)} Columns")

There are 501 Rows and 2 Columns


In [31]:
train_data, test_data = all_data.randomSplit([0.7, 0.3])

In [35]:
print("Training")
print(f"There are {train_data.count()} Rows and {len(train_data.columns)} Columns")

Training
There are 355 Rows and 2 Columns


In [36]:
print("Testing")
print(f"There are {test_data.count()} Rows and {len(test_data.columns)} Columns")

Testing
There are 146 Rows and 2 Columns


In [37]:
unlabled_data = all_data.select('features')

In [38]:
unlabled_data.show()

+--------------------+
|            features|
+--------------------+
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
+--------------------+
only showing top 20 rows



In [39]:
correct_model = lr.fit(train_data)

In [40]:
test_results = correct_model.evaluate(test_data)

In [41]:
test_results.residuals.show()

+-------------------+
|          residuals|
+-------------------+
| -29.59776154827857|
|-25.016061054788523|
| -21.67142554915997|
| -22.53110146576099|
|-22.410377207823245|
|-20.091037013534436|
|-19.603284018659487|
| -16.78122037391696|
|-18.592009998150267|
| -18.75072941313241|
|-20.217341343047078|
| -16.52765134293253|
|-19.104198719097578|
|-20.292748684324465|
| -17.43274349628169|
| -18.53590762263824|
|-13.320150401815622|
|-13.644468109708876|
|-15.298392007021267|
|-11.407973103354054|
+-------------------+
only showing top 20 rows



In [42]:
print(f"RMSE {test_results.rootMeanSquaredError} \n")
print(f"R2 {test_results.r2}")

RMSE 11.239738045609387 

R2 -0.0459108649882316


In [44]:
preductions = correct_model.transform(unlabled_data)

In [46]:
preductions.show()

+--------------------+-------------------+
|            features|         prediction|
+--------------------+-------------------+
|(10,[0,1,2,3,4,5,...|-0.3793332739483283|
|(10,[0,1,2,3,4,5,...| -2.116063421575468|
|(10,[0,1,2,3,4,5,...| 0.9805292224741291|
|(10,[0,1,2,3,4,5,...| 1.0774562822217295|
|(10,[0,1,2,3,4,5,...|  3.083990098278862|
|(10,[0,1,2,3,4,5,...|-0.8032483869308691|
|(10,[0,1,2,3,4,5,...| 1.5805780536506338|
|(10,[0,1,2,3,4,5,...| 1.2867074643450136|
|(10,[0,1,2,3,4,5,...| -2.355141989996113|
|(10,[0,1,2,3,4,5,...|  3.223042986570298|
|(10,[0,1,2,3,4,5,...|-0.7931105775916838|
|(10,[0,1,2,3,4,5,...|  2.745308714701633|
|(10,[0,1,2,3,4,5,...| 3.0298750009224626|
|(10,[0,1,2,3,4,5,...| 2.4736188499717793|
|(10,[0,1,2,3,4,5,...| 3.1687839254522547|
|(10,[0,1,2,3,4,5,...|-1.4243931221853994|
|(10,[0,1,2,3,4,5,...| -2.080136855444721|
|(10,[0,1,2,3,4,5,...| -2.876698866487407|
|(10,[0,1,2,3,4,5,...|  2.097532142859017|
|(10,[0,1,2,3,4,5,...|0.15578820441411012|
+----------

In [47]:
unlabled_data.show()

+--------------------+
|            features|
+--------------------+
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
+--------------------+
only showing top 20 rows

