In [1]:
# findspark.init() has to run before the pyspark imports in the cells below.
# NOTE(review): presumably it locates the local Spark install and puts pyspark
# on sys.path -- confirm against the findspark documentation.
import findspark
findspark.init()

In [2]:
from pyspark.context import SparkContext
from pyspark.sql import SparkSession

In [3]:
# Go through the session builder so that re-running this cell reuses the
# already-active context. Constructing SparkContext(...) directly raises
# "Cannot run multiple SparkContexts at once" on a second execution.
spark = SparkSession.builder.appName('LI1').getOrCreate()
# Keep `sc` available for any later cell that uses the raw context.
sc = spark.sparkContext

In [4]:
from pyspark.ml.regression import LinearRegression

In [6]:
# Read the LIBSVM-formatted sample file and print the inferred column types.
df_sample = spark.read.load('data/sample_linear_regression_data.txt', format='libsvm')
df_sample.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)



In [7]:
# Preview the first 20 rows; long feature vectors are truncated in the display.
df_sample.show()

+-------------------+--------------------+
|              label|            features|
+-------------------+--------------------+
| -9.490009878824548|(10,[0,1,2,3,4,5,...|
| 0.2577820163584905|(10,[0,1,2,3,4,5,...|
| -4.438869807456516|(10,[0,1,2,3,4,5,...|
|-19.782762789614537|(10,[0,1,2,3,4,5,...|
| -7.966593841555266|(10,[0,1,2,3,4,5,...|
| -7.896274316726144|(10,[0,1,2,3,4,5,...|
| -8.464803554195287|(10,[0,1,2,3,4,5,...|
| 2.1214592666251364|(10,[0,1,2,3,4,5,...|
| 1.0720117616524107|(10,[0,1,2,3,4,5,...|
|-13.772441561702871|(10,[0,1,2,3,4,5,...|
| -5.082010756207233|(10,[0,1,2,3,4,5,...|
|  7.887786536531237|(10,[0,1,2,3,4,5,...|
| 14.323146365332388|(10,[0,1,2,3,4,5,...|
|-20.057482615789212|(10,[0,1,2,3,4,5,...|
|-0.8995693247765151|(10,[0,1,2,3,4,5,...|
| -19.16829262296376|(10,[0,1,2,3,4,5,...|
|  5.601801561245534|(10,[0,1,2,3,4,5,...|
|-3.2256352187273354|(10,[0,1,2,3,4,5,...|
| 1.5299675726687754|(10,[0,1,2,3,4,5,...|
| -0.250102447941961|(10,[0,1,2,3,4,5,...|
+----------

In [8]:
mod_lr_sample = LinearRegression(featuresCol='features', labelCol='label',
                                predictionCol='prediction')

In [9]:
mod_lr_sample = mod_lr_sample.fit(df_sample)

In [10]:
# Fitted weights, one per input feature (10 in the saved output).
mod_lr_sample.coefficients

DenseVector([0.0073, 0.8314, -0.8095, 2.4412, 0.5192, 1.1535, -0.2989, -0.5129, -0.6197, 0.6956])

In [11]:
# Fitted intercept (bias) term.
mod_lr_sample.intercept

0.14228558260358093

In [14]:
# Summary object attached to the fitted model; the metric cells below read from it.
mod_lr_sample_summary = mod_lr_sample.summary

In [15]:
# Root-mean-squared error reported by the model summary.
mod_lr_sample_summary.rootMeanSquaredError

10.16309157133015

In [16]:
# R^2 reported by the model summary. A value close to 0 (about 0.03 in the saved
# output) means the linear fit explains very little of the label variance.
mod_lr_sample_summary.r2

0.027839179518600154

### Train/Test Split

In [17]:
# Re-read the LIBSVM sample file so this section starts from the raw data.
df_sample = spark.read.load('data/sample_linear_regression_data.txt', format='libsvm')

In [18]:
# 70/30 train/test split. A fixed seed makes the split (and every metric derived
# from it) reproducible; without it each run draws different rows.
df_sample_train, df_sample_test = df_sample.randomSplit([0.7, 0.3], seed=42)

In [20]:
# Row counts as (train, test, total); the first two should add up to the third.
tuple(frame.count() for frame in (df_sample_train, df_sample_test, df_sample))

(355, 146, 501)

In [21]:
mod_lr_sample = LinearRegression(featuresCol='features', labelCol='label', 
                                predictionCol='prediction')

In [22]:
mod_lr_sample = mod_lr_sample.fit(df_sample_train)

In [23]:
# Score the held-out split; the returned summary exposes residuals and metrics.
test_results = mod_lr_sample.evaluate(df_sample_test)

In [24]:
# Check what kind of object evaluate() returned.
type(test_results)

pyspark.ml.regression.LinearRegressionSummary

In [28]:
# Per-row residuals on the test split. show() prints only the first 20 rows,
# so this is a peek at the head of the frame, not a representative sample.
test_results.residuals.show()

+-------------------+
|          residuals|
+-------------------+
|-23.209817026651397|
|-23.813424943849412|
|-21.844114492549004|
|-21.566958112973065|
| -19.55469050788568|
| -21.26358466720319|
|-19.804936012796702|
|-20.300105351713064|
|-18.641296578505788|
| -17.58802325293179|
|-15.981155019778843|
| -18.14415715718417|
|-16.064155925981364|
|-15.263211206917457|
|-16.747452342702008|
|-16.806982809935484|
|-18.796741265127213|
|-14.219278549170202|
|-13.430858606006652|
|-12.429636782470496|
+-------------------+
only showing top 20 rows



In [29]:
# RMSE on the held-out test split.
test_results.rootMeanSquaredError

10.792053220083027

### Deployment on Unlabeled Data

In [30]:
# Mimic unlabeled data by keeping only the features column of the test split.
df_sample_unlabeled = df_sample_test.select('features')

In [31]:
# Sanity check: only the features column should be present.
df_sample_unlabeled.show()

+--------------------+
|            features|
+--------------------+
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
+--------------------+
only showing top 20 rows



In [32]:
# transform() returns the input frame with the model's 'prediction' column appended.
df_predictions = mod_lr_sample.transform(df_sample_unlabeled)

In [33]:
# Inspect the first predictions next to their feature vectors.
df_predictions.show()

+--------------------+-------------------+
|            features|         prediction|
+--------------------+-------------------+
|(10,[0,1,2,3,4,5,...|-3.5263901559503266|
|(10,[0,1,2,3,4,5,...| 0.3259848229129004|
|(10,[0,1,2,3,4,5,...|0.41172672838319657|
|(10,[0,1,2,3,4,5,...| 1.6823973386996396|
|(10,[0,1,2,3,4,5,...|-0.3183005301827245|
|(10,[0,1,2,3,4,5,...| 1.5962660518314709|
|(10,[0,1,2,3,4,5,...| 0.4025999825821493|
|(10,[0,1,2,3,4,5,...| 1.4541828788144826|
|(10,[0,1,2,3,4,5,...| 0.3660830125011559|
|(10,[0,1,2,3,4,5,...| 0.5615309887222417|
|(10,[0,1,2,3,4,5,...|-0.7379418138262475|
|(10,[0,1,2,3,4,5,...| 1.4519501358730662|
|(10,[0,1,2,3,4,5,...|-0.0871934252957473|
|(10,[0,1,2,3,4,5,...|-0.8224478341040322|
|(10,[0,1,2,3,4,5,...| 1.0153640704627616|
|(10,[0,1,2,3,4,5,...| 1.4474379301028066|
|(10,[0,1,2,3,4,5,...|  3.461973785204873|
|(10,[0,1,2,3,4,5,...| 0.2431476180174983|
|(10,[0,1,2,3,4,5,...|0.39093054190203663|
|(10,[0,1,2,3,4,5,...|-0.5482119429216084|
+----------