# Simple Linear Regression Using Spark MLlib

In [17]:
# findspark.init() has to run before the pyspark imports below so that the
# pyspark package can be located.
import findspark
findspark.init()
from pyspark import SparkConf,SparkContext
from pyspark.sql import SparkSession

# Create the session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("Spark ML")
    .master("local")
    .getOrCreate()
)


In [19]:
# Bare expression: the notebook renders the SparkSession object as the cell output.
spark


In [22]:
# Load the CSV, treating its first row as column names and letting Spark infer
# each column's type instead of reading everything as strings.
data = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("Student_Grades_Data")
)

In [23]:
# Print the schema to confirm the column types that inferSchema picked.
data.printSchema()

root
 |-- Time_to_Study: integer (nullable = true)
 |-- Grades: double (nullable = true)



In [24]:
# Preview the loaded rows (show() prints only the first 20 by default).
data.show()

+-------------+------+
|Time_to_Study|Grades|
+-------------+------+
|            1|   1.5|
|            5|   2.7|
|            7|   3.1|
|            3|   2.1|
|            2|   1.8|
|            9|   3.9|
|            6|   2.9|
|           12|   4.5|
|           11|   4.3|
|            2|   1.8|
|            4|   2.4|
|            8|   3.5|
|           13|   4.8|
|            9|   3.9|
|           14|   5.0|
|           10|   4.1|
|            6|   2.9|
|           12|   4.5|
|            1|   1.5|
|            4|   2.4|
+-------------+------+
only showing top 20 rows



In [25]:
# Use every column except the last as a model input; this assumes the label
# column is the final column of the DataFrame.
feature_cols = list(data.columns[:-1])

In [28]:
from pyspark.ml.feature import VectorAssembler

# Pack the input column(s) into the single vector column named "features".
vect_Assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features",
)
data_w_features = vect_Assembler.transform(data)
data_w_features.show()

+-------------+------+--------+
|Time_to_Study|Grades|features|
+-------------+------+--------+
|            1|   1.5|   [1.0]|
|            5|   2.7|   [5.0]|
|            7|   3.1|   [7.0]|
|            3|   2.1|   [3.0]|
|            2|   1.8|   [2.0]|
|            9|   3.9|   [9.0]|
|            6|   2.9|   [6.0]|
|           12|   4.5|  [12.0]|
|           11|   4.3|  [11.0]|
|            2|   1.8|   [2.0]|
|            4|   2.4|   [4.0]|
|            8|   3.5|   [8.0]|
|           13|   4.8|  [13.0]|
|            9|   3.9|   [9.0]|
|           14|   5.0|  [14.0]|
|           10|   4.1|  [10.0]|
|            6|   2.9|   [6.0]|
|           12|   4.5|  [12.0]|
|            1|   1.5|   [1.0]|
|            4|   2.4|   [4.0]|
+-------------+------+--------+
only showing top 20 rows



In [29]:
# Keep only the assembled vector and the label. The source column is "Grades";
# it is selected here as "grades", and the selected column carries that
# lowercase name from this point on.
final_data = data_w_features.select(["features", "grades"])
final_data.show()

+--------+------+
|features|grades|
+--------+------+
|   [1.0]|   1.5|
|   [5.0]|   2.7|
|   [7.0]|   3.1|
|   [3.0]|   2.1|
|   [2.0]|   1.8|
|   [9.0]|   3.9|
|   [6.0]|   2.9|
|  [12.0]|   4.5|
|  [11.0]|   4.3|
|   [2.0]|   1.8|
|   [4.0]|   2.4|
|   [8.0]|   3.5|
|  [13.0]|   4.8|
|   [9.0]|   3.9|
|  [14.0]|   5.0|
|  [10.0]|   4.1|
|   [6.0]|   2.9|
|  [12.0]|   4.5|
|   [1.0]|   1.5|
|   [4.0]|   2.4|
+--------+------+
only showing top 20 rows



In [44]:
# Split roughly 70% / 30% into training and test sets. The fixed seed makes the
# split repeatable, so a fresh run of the notebook trains and evaluates on the
# same rows instead of a different random partition each time.
train_data,test_data = final_data.randomSplit([0.7,0.3], seed=42)

In [45]:
# Summary statistics for the training split; only the grades column appears in
# the summary output.
train_data.describe().show()

+-------+------------------+
|summary|            grades|
+-------+------------------+
|  count|                35|
|   mean|3.1799999999999993|
| stddev|1.0810670110372165|
|    min|               1.5|
|    max|               5.0|
+-------+------------------+



In [46]:
# Summary statistics for the test split, for comparison with the training
# split; only the grades column appears in the summary output.
test_data.describe().show()

+-------+------------------+
|summary|            grades|
+-------+------------------+
|  count|                15|
|   mean|              3.32|
| stddev|1.1911579000523578|
|    min|               1.8|
|    max|               5.0|
+-------+------------------+



In [48]:
from pyspark.ml.regression import LinearRegression

# Estimator configuration only -- nothing is trained until fit() is called.
LinReg = LinearRegression(
    featuresCol="features",
    labelCol="grades",
)


In [49]:
# Fit the linear regression on the training split only.
Model = LinReg.fit(train_data)

In [50]:
# Evaluate the fitted model on the held-out test split; the result exposes the
# scored rows through its `predictions` attribute (used in the cells below).
pred = Model.evaluate(test_data)

In [51]:
# Show each test row's actual grade next to the model's prediction.
pred.predictions.show()

+--------+------+------------------+
|features|grades|        prediction|
+--------+------+------------------+
|   [2.0]|   1.8|1.8172513321491996|
|   [3.0]|   2.1| 2.094554840142095|
|   [3.0]|   2.1| 2.094554840142095|
|   [4.0]|   2.4|  2.37185834813499|
|   [4.0]|   2.4|  2.37185834813499|
|   [5.0]|   2.7|2.6491618561278854|
|   [6.0]|   2.9| 2.926465364120781|
|   [6.0]|   2.9| 2.926465364120781|
|   [7.0]|   3.1| 3.203768872113676|
|   [7.0]|   3.1| 3.203768872113676|
|  [12.0]|   4.5| 4.590286412078152|
|  [13.0]|   4.8|4.8675899200710475|
|  [14.0]|   5.0| 5.144893428063943|
|  [14.0]|   5.0| 5.144893428063943|
|  [14.0]|   5.0| 5.144893428063943|
+--------+------+------------------+



In [56]:
# Fitted coefficient vector of the model (a single entry for this data).
coefficient = Model.coefficients
print('The coefficient of the Model : %a ' % (coefficient,))

The coefficient of the Model : DenseVector([0.2773]) 


In [57]:
# Fitted intercept term of the model.
intercept = Model.intercept
print('The intercept of the Model : %a' % (intercept,))

The intercept of the Model : 1.262644316163409


In [62]:
from pyspark.ml.evaluation import RegressionEvaluator

# A single evaluator is shared by all metrics; the metric is selected per call
# by overriding metricName in the param map passed to evaluate().
evaluation = RegressionEvaluator(labelCol="grades",predictionCol="prediction")


def _score(metric):
    """Return the requested regression metric for the test-set predictions."""
    return evaluation.evaluate(pred.predictions, {evaluation.metricName: metric})


rmse = _score("rmse")
print("RMSE : %.3f" % rmse)

mse = _score("mse")
print("MSE : %.3f" % mse)

mae = _score("mae")
print("MAE: %.3f" % mae)

r2 = _score("r2")
print("r2 : %.3f" % r2)



RMSE : 0.083
MSE : 0.007
MAE: 0.066
r2 : 0.995


In [65]:
# Keep only the feature vectors so the test rows look like unlabeled input.
unlabeled_dataset = test_data.select(['features'])
unlabeled_dataset.show()

+--------+
|features|
+--------+
|   [2.0]|
|   [3.0]|
|   [3.0]|
|   [4.0]|
|   [4.0]|
|   [5.0]|
|   [6.0]|
|   [6.0]|
|   [7.0]|
|   [7.0]|
|  [12.0]|
|  [13.0]|
|  [14.0]|
|  [14.0]|
|  [14.0]|
+--------+



In [68]:
# Score the unlabeled rows with the fitted model; the result contains the
# input features alongside a "prediction" column.
new_predictions = Model.transform(unlabeled_dataset)
new_predictions.show()

+--------+------------------+
|features|        prediction|
+--------+------------------+
|   [2.0]|1.8172513321491996|
|   [3.0]| 2.094554840142095|
|   [3.0]| 2.094554840142095|
|   [4.0]|  2.37185834813499|
|   [4.0]|  2.37185834813499|
|   [5.0]|2.6491618561278854|
|   [6.0]| 2.926465364120781|
|   [6.0]| 2.926465364120781|
|   [7.0]| 3.203768872113676|
|   [7.0]| 3.203768872113676|
|  [12.0]| 4.590286412078152|
|  [13.0]|4.8675899200710475|
|  [14.0]| 5.144893428063943|
|  [14.0]| 5.144893428063943|
|  [14.0]| 5.144893428063943|
+--------+------------------+

