In [1]:
# findspark.init() must run before the pyspark import below so that the
# pyspark package can be found on this machine.
import findspark
findspark.init()

from pyspark.sql import SparkSession

# Reuse the active Spark session if there is one, otherwise create it.
spark = SparkSession.builder.getOrCreate()

# Load the student-grades CSV: the first row holds the column names and the
# column types are inferred from the values.
data = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('student_grades')
)

# Show the inferred schema, then preview the first rows.
data.printSchema()
data.show()

root
 |-- Time_to_Study: integer (nullable = true)
 |-- Grades: double (nullable = true)

+-------------+------+
|Time_to_Study|Grades|
+-------------+------+
|            1|   1.5|
|            5|   2.7|
|            7|   3.1|
|            3|   2.1|
|            2|   1.8|
|            9|   3.9|
|            6|   2.9|
|           12|   4.5|
|           11|   4.3|
|            2|   1.8|
|            4|   2.4|
|            8|   3.5|
|           13|   4.8|
|            9|   3.9|
|           14|   5.0|
|           10|   4.1|
|            6|   2.9|
|           12|   4.5|
|            1|   1.5|
|            4|   2.4|
+-------------+------+
only showing top 20 rows



In [2]:
# Every column except the label is a feature. The label is excluded by name
# rather than by position, so a reordered CSV cannot silently turn 'Grades'
# into an input feature.
feature_cols = [col for col in data.columns if col != 'Grades']


In [3]:
from pyspark.ml.feature import VectorAssembler

# Combines the selected input columns into one vector column named 'features'.
assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol='features',
)

In [4]:
# Run the assembler over the loaded rows; the result carries the extra
# 'features' column that the next cell selects.
data_w_features = assembler.transform(dataset=data)

In [5]:
# Keep only what the regression needs: the assembled feature vector and the
# 'Grades' label.
final_data = data_w_features.select(
    'features',
    'Grades',
)
final_data.show()

+--------+------+
|features|Grades|
+--------+------+
|   [1.0]|   1.5|
|   [5.0]|   2.7|
|   [7.0]|   3.1|
|   [3.0]|   2.1|
|   [2.0]|   1.8|
|   [9.0]|   3.9|
|   [6.0]|   2.9|
|  [12.0]|   4.5|
|  [11.0]|   4.3|
|   [2.0]|   1.8|
|   [4.0]|   2.4|
|   [8.0]|   3.5|
|  [13.0]|   4.8|
|   [9.0]|   3.9|
|  [14.0]|   5.0|
|  [10.0]|   4.1|
|   [6.0]|   2.9|
|  [12.0]|   4.5|
|   [1.0]|   1.5|
|   [4.0]|   2.4|
+--------+------+
only showing top 20 rows



In [7]:
# Hold out roughly 20% of the rows for testing. randomSplit assigns rows at
# random, so a fixed seed is passed to keep the split (and therefore the
# fitted model and every metric derived from it) repeatable between runs.
SPLIT_SEED = 42
train_dataset, test_dataset = final_data.randomSplit([0.8, 0.2], seed=SPLIT_SEED)

# Summary statistics of each partition, to compare their label distributions.
train_dataset.describe().show()
test_dataset.describe().show()

+-------+-----------------+
|summary|           Grades|
+-------+-----------------+
|  count|               38|
|   mean|3.321052631578947|
| stddev|1.146359194389113|
|    min|              1.5|
|    max|              5.0|
+-------+-----------------+

+-------+------------------+
|summary|            Grades|
+-------+------------------+
|  count|                12|
|   mean| 2.908333333333333|
| stddev|0.9356167954236325|
|    min|               1.8|
|    max|               4.3|
+-------+------------------+



In [9]:
from pyspark.ml.regression import LinearRegression

# Linear regression estimator: inputs come from the 'features' vector column,
# the target from the 'Grades' column.
linReg = LinearRegression(
    featuresCol='features',
    labelCol='Grades',
)

# Fit on the training partition only; the test partition stays unseen.
model = linReg.fit(dataset=train_dataset)

In [11]:
# Evaluate the fitted model on the test partition. `pred` is the evaluation
# result object; its `predictions` DataFrame is what gets displayed here and
# scored by the metric cells.
pred = model.evaluate(dataset=test_dataset)
pred.predictions.show()

+--------+------+------------------+
|features|Grades|        prediction|
+--------+------+------------------+
|   [2.0]|   1.8|1.8229050279329608|
|   [2.0]|   1.8|1.8229050279329608|
|   [3.0]|   2.1| 2.093998403830806|
|   [3.0]|   2.1| 2.093998403830806|
|   [3.0]|   2.1| 2.093998403830806|
|   [4.0]|   2.4| 2.365091779728651|
|   [8.0]|   3.5|3.4494652833200306|
|   [8.0]|   3.5|3.4494652833200306|
|   [8.0]|   3.5|3.4494652833200306|
|   [9.0]|   3.9|3.7205586592178754|
|   [9.0]|   3.9|3.7205586592178754|
|  [11.0]|   4.3| 4.262745411013565|
+--------+------+------------------+



In [13]:
# Coefficients of the fitted model, one entry per element of the 'features'
# vector.
coefficient = model.coefficients
print(coefficient)

[0.27109337589784493]


In [14]:
# Intercept term of the fitted model.
intercept = model.intercept
print(intercept)

1.280718276137271


In [19]:
from pyspark.ml.evaluation import RegressionEvaluator

# Evaluator that compares the 'Grades' label column with the 'prediction'
# column. The metric itself is chosen per call in the cells that use it.
evaluation = RegressionEvaluator(
    labelCol="Grades",
    predictionCol='prediction',
)

In [20]:
# Root-mean-squared error of the test predictions; the metric is selected for
# this call by overriding the evaluator's `metricName` param.
rmse = evaluation.evaluate(
    dataset=pred.predictions,
    params={evaluation.metricName: 'rmse'},
)
print("RMSE : %.3f" % (rmse,))

RMSE : 0.079


In [22]:
# Mean squared error of the test predictions, selected via a per-call
# override of the evaluator's `metricName` param.
mse = evaluation.evaluate(
    dataset=pred.predictions,
    params={evaluation.metricName: "mse"},
)
print("MSE : %.3f" % (mse,))

MSE : 0.006


In [23]:
# Mean absolute error of the test predictions, selected via a per-call
# override of the evaluator's `metricName` param.
mae = evaluation.evaluate(
    dataset=pred.predictions,
    params={evaluation.metricName: 'mae'},
)
print("MAE : %.3f" % (mae,))

MAE : 0.054


In [24]:
# R-squared of the test predictions, selected via a per-call override of the
# evaluator's `metricName` param.
r2 = evaluation.evaluate(
    dataset=pred.predictions,
    params={evaluation.metricName: "r2"},
)
print("R2 : %.3f" % (r2,))

R2 : 0.992
