In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.feature import *
from pyspark.ml.functions import *
from pyspark.ml.classification import *

In [2]:
# Start a Spark session for this notebook, or reuse one that already exists.
session_builder = SparkSession.builder.appName("Amith")
spark = session_builder.getOrCreate()

In [3]:
# Load the exam-score data set: the first CSV row supplies the column names
# and Spark infers each column's type from the data.
df = spark.read.csv(
    path="Exam_Score_Prediction.csv",
    inferSchema=True,
    header=True,
)
df.show()

+----------+---+------+-------+-----------+----------------+---------------+-----------+-------------+-------------+---------------+---------------+----------+
|student_id|age|gender| course|study_hours|class_attendance|internet_access|sleep_hours|sleep_quality| study_method|facility_rating|exam_difficulty|exam_score|
+----------+---+------+-------+-----------+----------------+---------------+-----------+-------------+-------------+---------------+---------------+----------+
|         1| 17|  male|diploma|       2.78|            92.9|            yes|        7.4|         poor|     coaching|            low|           hard|      58.9|
|         2| 23| other|    bca|       3.37|            64.8|            yes|        4.6|      average|online videos|         medium|       moderate|      54.8|
|         3| 22|  male|   b.sc|       7.88|            76.8|            yes|        8.5|         poor|     coaching|           high|       moderate|      90.3|
|         4| 20| other|diploma|       0.

In [4]:
# Print every column's name, inferred data type and nullability.
df.printSchema()

root
 |-- student_id: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- course: string (nullable = true)
 |-- study_hours: double (nullable = true)
 |-- class_attendance: double (nullable = true)
 |-- internet_access: string (nullable = true)
 |-- sleep_hours: double (nullable = true)
 |-- sleep_quality: string (nullable = true)
 |-- study_method: string (nullable = true)
 |-- facility_rating: string (nullable = true)
 |-- exam_difficulty: string (nullable = true)
 |-- exam_score: double (nullable = true)



In [5]:
# Count the null values of every column in ONE aggregation (a single Spark
# job) instead of launching a separate filter+count job per column.
# `when(...)` without `otherwise` yields NULL for non-matching rows, and
# `count` only counts non-NULL values, so each aggregate is that column's
# number of nulls.
null_counts = df.select(
    [count(when(col(c).isNull(), 1)).alias(c) for c in df.columns]
).first()

# Same "<column> : <count>" report as before, in column order.
for i in df.columns:
    print(i,":",null_counts[i])

student_id : 0
age : 0
gender : 0
course : 0
study_hours : 0
class_attendance : 0
internet_access : 0
sleep_hours : 0
sleep_quality : 0
study_method : 0
facility_rating : 0
exam_difficulty : 0
exam_score : 0


In [6]:
# Gender distribution: rows per gender and each group's share of all rows.
gender_counts = df.groupBy("gender").count()
s = gender_counts.agg(sum("count")).first()[0]  # total number of rows over all groups
df_gender = gender_counts.withColumn("percentage", col("count") * 100 / s)
df_gender.show()

+------+-----+----------+
|gender|count|percentage|
+------+-----+----------+
|female| 6579|    32.895|
| other| 6726|     33.63|
|  male| 6695|    33.475|
+------+-----+----------+



In [7]:
# Course distribution: rows per course and each group's share of all rows.
course_counts = df.groupBy("course").count()
s = course_counts.agg(sum("count")).first()[0]  # total number of rows over all groups
df_course = course_counts.withColumn("percentage", col("count") * 100 / s)
df_course.show()

+-------+-----+----------+
| course|count|percentage|
+-------+-----+----------+
|    bca| 2902|     14.51|
|   b.sc| 2878|     14.39|
|     ba| 2896|     14.48|
| b.tech| 2798|     13.99|
|  b.com| 2864|     14.32|
|    bba| 2836|     14.18|
|diploma| 2826|     14.13|
+-------+-----+----------+



In [8]:
# Summary statistics for study_hours, shown as one single-row table each:
# maximum, minimum, median and mean (in that order).
# `max` / `min` here are the Spark column functions from the star import.
for stat_fn in (max, min, median, mean):
    df.agg(stat_fn(col("study_hours"))).show()

+----------------+
|max(study_hours)|
+----------------+
|            7.91|
+----------------+

+----------------+
|min(study_hours)|
+----------------+
|            0.08|
+----------------+

+-------------------+
|median(study_hours)|
+-------------------+
|               4.04|
+-------------------+

+-----------------+
| avg(study_hours)|
+-----------------+
|4.007603500000015|
+-----------------+



In [9]:
# Summary statistics for class_attendance, shown as one single-row table each:
# maximum, minimum, median and mean (in that order).
# `max` / `min` here are the Spark column functions from the star import.
for stat_fn in (max, min, median, mean):
    df.agg(stat_fn(col("class_attendance"))).show()

+---------------------+
|max(class_attendance)|
+---------------------+
|                 99.4|
+---------------------+

+---------------------+
|min(class_attendance)|
+---------------------+
|                 40.6|
+---------------------+

+------------------------+
|median(class_attendance)|
+------------------------+
|                    69.9|
+------------------------+

+---------------------+
|avg(class_attendance)|
+---------------------+
|    70.01736499999947|
+---------------------+



In [10]:
# Internet-access distribution: rows per answer and each group's share of all rows.
df_internet_access = df.groupBy("internet_access").count()
# The total must come from THIS grouping. It was previously summed from
# `df_course`, which only gave the right answer because both groupings cover
# the same rows, and made this cell depend on the course cell having run.
s = df_internet_access.agg(sum(col("count"))).first()[0]
df_internet_access = df_internet_access.withColumn("percentage",col("count")*100/s)
df_internet_access.show()

+---------------+-----+----------+
|internet_access|count|percentage|
+---------------+-----+----------+
|             no| 3012|     15.06|
|            yes|16988|     84.94|
+---------------+-----+----------+



In [11]:
# Summary statistics for sleep_hours, shown as one single-row table each:
# maximum, minimum, median and mean (in that order).
# `max` / `min` here are the Spark column functions from the star import.
for stat_fn in (max, min, median, mean):
    df.agg(stat_fn(col("sleep_hours"))).show()

+----------------+
|max(sleep_hours)|
+----------------+
|             9.9|
+----------------+

+----------------+
|min(sleep_hours)|
+----------------+
|             4.1|
+----------------+

+-------------------+
|median(sleep_hours)|
+-------------------+
|                7.0|
+-------------------+

+-----------------+
| avg(sleep_hours)|
+-----------------+
|7.008560000000006|
+-----------------+



In [12]:
# Sleep-quality distribution: rows per level and each group's share of all rows.
sleep_quality_counts = df.groupBy("sleep_quality").count()
s = sleep_quality_counts.agg(sum("count")).first()[0]  # total number of rows over all groups
df_sleep_quality = sleep_quality_counts.withColumn("percentage", col("count") * 100 / s)
df_sleep_quality.show()

+-------------+-----+----------+
|sleep_quality|count|percentage|
+-------------+-----+----------+
|      average| 6694|     33.47|
|         poor| 6687|    33.435|
|         good| 6619|    33.095|
+-------------+-----+----------+



In [13]:
# Study-method distribution: rows per method and each group's share of all rows.
study_method_counts = df.groupBy("study_method").count()
s = study_method_counts.agg(sum("count")).first()[0]  # total number of rows over all groups
df_study_method = study_method_counts.withColumn("percentage", col("count") * 100 / s)
df_study_method.show()

+-------------+-----+----------+
| study_method|count|percentage|
+-------------+-----+----------+
|        mixed| 3894|     19.47|
|online videos| 4069|    20.345|
|     coaching| 4036|     20.18|
|   self-study| 4079|    20.395|
|  group study| 3922|     19.61|
+-------------+-----+----------+



In [14]:
# Facility-rating distribution: rows per rating and each group's share of all rows.
facility_rating_counts = df.groupBy("facility_rating").count()
s = facility_rating_counts.agg(sum("count")).first()[0]  # total number of rows over all groups
df_facility_rating = facility_rating_counts.withColumn("percentage", col("count") * 100 / s)
df_facility_rating.show()

+---------------+-----+----------+
|facility_rating|count|percentage|
+---------------+-----+----------+
|            low| 6638|     33.19|
|           high| 6602|     33.01|
|         medium| 6760|      33.8|
+---------------+-----+----------+



In [15]:
# Exam-difficulty distribution: rows per level and each group's share of all rows.
exam_difficulty_counts = df.groupBy("exam_difficulty").count()
s = exam_difficulty_counts.agg(sum("count")).first()[0]  # total number of rows over all groups
df_exam_difficulty = exam_difficulty_counts.withColumn("percentage", col("count") * 100 / s)
df_exam_difficulty.show()

+---------------+-----+----------+
|exam_difficulty|count|percentage|
+---------------+-----+----------+
|       moderate| 9878|     49.39|
|           hard| 3981|    19.905|
|           easy| 6141|    30.705|
+---------------+-----+----------+



In [16]:
# Summary statistics for the target column exam_score, shown as one
# single-row table each: maximum, minimum, median and mean (in that order).
# `max` / `min` here are the Spark column functions from the star import.
for stat_fn in (max, min, median, mean):
    df.agg(stat_fn(col("exam_score"))).show()

+---------------+
|max(exam_score)|
+---------------+
|          100.0|
+---------------+

+---------------+
|min(exam_score)|
+---------------+
|         19.599|
+---------------+

+------------------+
|median(exam_score)|
+------------------+
|              62.6|
+------------------+

+-----------------+
|  avg(exam_score)|
+-----------------+
|62.51322500000027|
+-----------------+



In [17]:
# Display the first rows of the raw DataFrame again.
df.show()

+----------+---+------+-------+-----------+----------------+---------------+-----------+-------------+-------------+---------------+---------------+----------+
|student_id|age|gender| course|study_hours|class_attendance|internet_access|sleep_hours|sleep_quality| study_method|facility_rating|exam_difficulty|exam_score|
+----------+---+------+-------+-----------+----------------+---------------+-----------+-------------+-------------+---------------+---------------+----------+
|         1| 17|  male|diploma|       2.78|            92.9|            yes|        7.4|         poor|     coaching|            low|           hard|      58.9|
|         2| 23| other|    bca|       3.37|            64.8|            yes|        4.6|      average|online videos|         medium|       moderate|      54.8|
|         3| 22|  male|   b.sc|       7.88|            76.8|            yes|        8.5|         poor|     coaching|           high|       moderate|      90.3|
|         4| 20| other|diploma|       0.

In [18]:
# Encode gender with a fixed label order (female=0, other=1, male=2) and then
# shift by one, so the stored values are female=-1, other=0, male=1.
indexer = StringIndexerModel.from_labels(
    labels=["female","other","male"],
    inputCol="gender",
    outputCol="gender_indexed",
)
df_indexed = indexer.transform(df).withColumn(
    "gender_indexed", col("gender_indexed") - 1
)
# Rename so the name "gender_indexed" is free again on the resulting frame.
df_1 = df_indexed.withColumnRenamed("gender_indexed","gender_encoded")
df_1.show()

+----------+---+------+-------+-----------+----------------+---------------+-----------+-------------+-------------+---------------+---------------+----------+--------------+
|student_id|age|gender| course|study_hours|class_attendance|internet_access|sleep_hours|sleep_quality| study_method|facility_rating|exam_difficulty|exam_score|gender_encoded|
+----------+---+------+-------+-----------+----------------+---------------+-----------+-------------+-------------+---------------+---------------+----------+--------------+
|         1| 17|  male|diploma|       2.78|            92.9|            yes|        7.4|         poor|     coaching|            low|           hard|      58.9|           1.0|
|         2| 23| other|    bca|       3.37|            64.8|            yes|        4.6|      average|online videos|         medium|       moderate|      54.8|           0.0|
|         3| 22|  male|   b.sc|       7.88|            76.8|            yes|        8.5|         poor|     coaching|         

In [19]:
# Index the course strings, then one-hot encode the resulting index.
string_indexer = StringIndexer(outputCol="course_indexed", inputCol="course")
one_hot_encoder = OneHotEncoder(outputCol="course_encoded", inputCol="course_indexed")

# Step 1: string -> numeric index (the indexer is fitted on df_1 itself).
course_indexer_model = string_indexer.fit(df_1)
df_string_indexer = course_indexer_model.transform(df_1)

# Step 2: numeric index -> one-hot vector.
course_encoder_model = one_hot_encoder.fit(df_string_indexer)
df_one_hot_encoder = course_encoder_model.transform(df_string_indexer)

# Second handle on the same frame; it is the one that gets extended later on.
df_one_hot_encoder_1 = df_one_hot_encoder
df_one_hot_encoder.show()

+----------+---+------+-------+-----------+----------------+---------------+-----------+-------------+-------------+---------------+---------------+----------+--------------+--------------+--------------+
|student_id|age|gender| course|study_hours|class_attendance|internet_access|sleep_hours|sleep_quality| study_method|facility_rating|exam_difficulty|exam_score|gender_encoded|course_indexed|course_encoded|
+----------+---+------+-------+-----------+----------------+---------------+-----------+-------------+-------------+---------------+---------------+----------+--------------+--------------+--------------+
|         1| 17|  male|diploma|       2.78|            92.9|            yes|        7.4|         poor|     coaching|            low|           hard|      58.9|           1.0|           5.0| (6,[5],[1.0])|
|         2| 23| other|    bca|       3.37|            64.8|            yes|        4.6|      average|online videos|         medium|       moderate|      54.8|           0.0|      

In [20]:
# Expand the one-hot vector in "course_encoded" into one plain numeric column
# per vector slot: course_encoded_0, course_encoded_1, ...
#
# The number of slots is read from the encoded vector itself. Previously the
# loop iterated over `df.select("course").distinct().collect()[:6]` purely as
# a counter, which hard-coded the slot count and ran a distinct job over the
# raw data just to count to six.
num_course_slots = len(df_one_hot_encoder_1.first()["course_encoded"])
course_encoded_array = vector_to_array("course_encoded")
for ctr in range(num_course_slots):
    df_one_hot_encoder_1 = df_one_hot_encoder_1.withColumn(
        "course_encoded_"+str(ctr), course_encoded_array[ctr]
    )
df_one_hot_encoder_1.show()

+----------+---+------+-------+-----------+----------------+---------------+-----------+-------------+-------------+---------------+---------------+----------+--------------+--------------+--------------+----------------+----------------+----------------+----------------+----------------+----------------+
|student_id|age|gender| course|study_hours|class_attendance|internet_access|sleep_hours|sleep_quality| study_method|facility_rating|exam_difficulty|exam_score|gender_encoded|course_indexed|course_encoded|course_encoded_0|course_encoded_1|course_encoded_2|course_encoded_3|course_encoded_4|course_encoded_5|
+----------+---+------+-------+-----------+----------------+---------------+-----------+-------------+-------------+---------------+---------------+----------+--------------+--------------+--------------+----------------+----------------+----------------+----------------+----------------+----------------+
|         1| 17|  male|diploma|       2.78|            92.9|            yes|   

In [21]:
# Ordinal encoding of sleep_quality with a fixed label order
# (poor=0, average=1, good=2), shifted by one to poor=-1, average=0, good=1.
string_indexer_2 = StringIndexerModel.from_labels(
    labels=["poor","average","good"],
    inputCol="sleep_quality",
    outputCol="sleep_quality_indexed",
)

df_string_indexed_2 = string_indexer_2.transform(df_one_hot_encoder_1).withColumn(
    "sleep_quality_indexed", col("sleep_quality_indexed") - 1
)
df_string_indexed_2.show()

+----------+---+------+-------+-----------+----------------+---------------+-----------+-------------+-------------+---------------+---------------+----------+--------------+--------------+--------------+----------------+----------------+----------------+----------------+----------------+----------------+---------------------+
|student_id|age|gender| course|study_hours|class_attendance|internet_access|sleep_hours|sleep_quality| study_method|facility_rating|exam_difficulty|exam_score|gender_encoded|course_indexed|course_encoded|course_encoded_0|course_encoded_1|course_encoded_2|course_encoded_3|course_encoded_4|course_encoded_5|sleep_quality_indexed|
+----------+---+------+-------+-----------+----------------+---------------+-----------+-------------+-------------+---------------+---------------+----------+--------------+--------------+--------------+----------------+----------------+----------------+----------------+----------------+----------------+---------------------+
|         1| 

In [22]:
# Index the study_method strings, one-hot encode the index, and expand the
# resulting vector into plain numeric columns study_method_encoded_0, _1, ...
string_indexer_2 = StringIndexer(inputCol="study_method",outputCol="study_method_indexed")
one_hot_encoder_2 = OneHotEncoder(inputCol="study_method_indexed",outputCol="study_method_encoded")

df_string_indexed_3 = string_indexer_2.fit(df_string_indexed_2).transform(df_string_indexed_2)
df_one_hot_encoded_2 = one_hot_encoder_2.fit(df_string_indexed_3).transform(df_string_indexed_3)

# The number of slots is read from the encoded vector itself. Previously the
# loop iterated over `distinct().collect()[:-1]` purely as a counter, which
# tied the column count to an assumption about the encoder's output size and
# ran an extra distinct job.
num_study_method_slots = len(df_one_hot_encoded_2.first()["study_method_encoded"])
study_method_encoded_array = vector_to_array("study_method_encoded")
for ctr in range(num_study_method_slots):
    df_one_hot_encoded_2 = df_one_hot_encoded_2.withColumn(
        colName="study_method_encoded_"+str(ctr),
        col=study_method_encoded_array[ctr],
    )
df_one_hot_encoded_2.show()

+----------+---+------+-------+-----------+----------------+---------------+-----------+-------------+-------------+---------------+---------------+----------+--------------+--------------+--------------+----------------+----------------+----------------+----------------+----------------+----------------+---------------------+--------------------+--------------------+----------------------+----------------------+----------------------+----------------------+
|student_id|age|gender| course|study_hours|class_attendance|internet_access|sleep_hours|sleep_quality| study_method|facility_rating|exam_difficulty|exam_score|gender_encoded|course_indexed|course_encoded|course_encoded_0|course_encoded_1|course_encoded_2|course_encoded_3|course_encoded_4|course_encoded_5|sleep_quality_indexed|study_method_indexed|study_method_encoded|study_method_encoded_0|study_method_encoded_1|study_method_encoded_2|study_method_encoded_3|
+----------+---+------+-------+-----------+----------------+--------------

In [23]:
# Ordinal encoding of facility_rating with a fixed label order
# (low=0, medium=1, high=2), shifted by one to low=-1, medium=0, high=1.
string_indexer_3 = StringIndexerModel.from_labels(
    labels=["low","medium","high"],
    inputCol="facility_rating",
    outputCol="facility_rating_indexed",
)

df_string_indexed_3 = string_indexer_3.transform(df_one_hot_encoded_2).withColumn(
    "facility_rating_indexed", col("facility_rating_indexed") - 1
)
df_string_indexed_3.show()

+----------+---+------+-------+-----------+----------------+---------------+-----------+-------------+-------------+---------------+---------------+----------+--------------+--------------+--------------+----------------+----------------+----------------+----------------+----------------+----------------+---------------------+--------------------+--------------------+----------------------+----------------------+----------------------+----------------------+-----------------------+
|student_id|age|gender| course|study_hours|class_attendance|internet_access|sleep_hours|sleep_quality| study_method|facility_rating|exam_difficulty|exam_score|gender_encoded|course_indexed|course_encoded|course_encoded_0|course_encoded_1|course_encoded_2|course_encoded_3|course_encoded_4|course_encoded_5|sleep_quality_indexed|study_method_indexed|study_method_encoded|study_method_encoded_0|study_method_encoded_1|study_method_encoded_2|study_method_encoded_3|facility_rating_indexed|
+----------+---+------+---

In [24]:
# Ordinal encoding of exam_difficulty with a fixed label order
# (easy=0, moderate=1, hard=2), shifted by one to easy=-1, moderate=0, hard=1.
string_indexer_4 = StringIndexerModel.from_labels(
    labels=["easy","moderate","hard"],
    inputCol="exam_difficulty",
    outputCol="exam_difficulty_indexed",
)

df_string_indexed_4 = string_indexer_4.transform(df_string_indexed_3).withColumn(
    "exam_difficulty_indexed", col("exam_difficulty_indexed") - 1
)
df_string_indexed_4.show()

+----------+---+------+-------+-----------+----------------+---------------+-----------+-------------+-------------+---------------+---------------+----------+--------------+--------------+--------------+----------------+----------------+----------------+----------------+----------------+----------------+---------------------+--------------------+--------------------+----------------------+----------------------+----------------------+----------------------+-----------------------+-----------------------+
|student_id|age|gender| course|study_hours|class_attendance|internet_access|sleep_hours|sleep_quality| study_method|facility_rating|exam_difficulty|exam_score|gender_encoded|course_indexed|course_encoded|course_encoded_0|course_encoded_1|course_encoded_2|course_encoded_3|course_encoded_4|course_encoded_5|sleep_quality_indexed|study_method_indexed|study_method_encoded|study_method_encoded_0|study_method_encoded_1|study_method_encoded_2|study_method_encoded_3|facility_rating_indexed|exa

In [25]:
# Encode gender into "gender_indexed" with the label order other=0, female=1,
# male=2, shifted by one to other=-1, female=0, male=1.
# NOTE(review): gender is a nominal category, so this numeric ordering is a
# modelling choice rather than a natural ranking — confirm it is intended.
string_indexer_5 = StringIndexerModel.from_labels(
    labels=["other","female","male"],
    inputCol="gender",
    outputCol="gender_indexed",
)

df_string_indexed_5 = string_indexer_5.transform(df_string_indexed_4).withColumn(
    "gender_indexed", col("gender_indexed") - 1
)
df_string_indexed_5.show()

+----------+---+------+-------+-----------+----------------+---------------+-----------+-------------+-------------+---------------+---------------+----------+--------------+--------------+--------------+----------------+----------------+----------------+----------------+----------------+----------------+---------------------+--------------------+--------------------+----------------------+----------------------+----------------------+----------------------+-----------------------+-----------------------+--------------+
|student_id|age|gender| course|study_hours|class_attendance|internet_access|sleep_hours|sleep_quality| study_method|facility_rating|exam_difficulty|exam_score|gender_encoded|course_indexed|course_encoded|course_encoded_0|course_encoded_1|course_encoded_2|course_encoded_3|course_encoded_4|course_encoded_5|sleep_quality_indexed|study_method_indexed|study_method_encoded|study_method_encoded_0|study_method_encoded_1|study_method_encoded_2|study_method_encoded_3|facility_rat

In [26]:
# Binary encoding of internet_access with a fixed label order: no=0, yes=1.
# Unlike the three-level columns, no shift is applied here.
string_indexer_6 = StringIndexerModel.from_labels(labels=["no","yes"],inputCol="internet_access",outputCol="internet_access_indexed")

# The former `withColumn("internet_access_indexed", col("internet_access_indexed"))`
# step replaced the column with itself and has been dropped as a no-op.
df_string_indexed_6 = string_indexer_6.transform(df_string_indexed_5)
df_string_indexed_6.show()

+----------+---+------+-------+-----------+----------------+---------------+-----------+-------------+-------------+---------------+---------------+----------+--------------+--------------+--------------+----------------+----------------+----------------+----------------+----------------+----------------+---------------------+--------------------+--------------------+----------------------+----------------------+----------------------+----------------------+-----------------------+-----------------------+--------------+-----------------------+
|student_id|age|gender| course|study_hours|class_attendance|internet_access|sleep_hours|sleep_quality| study_method|facility_rating|exam_difficulty|exam_score|gender_encoded|course_indexed|course_encoded|course_encoded_0|course_encoded_1|course_encoded_2|course_encoded_3|course_encoded_4|course_encoded_5|sleep_quality_indexed|study_method_indexed|study_method_encoded|study_method_encoded_0|study_method_encoded_1|study_method_encoded_2|study_metho

In [27]:
# Model inputs: numeric raw columns plus the encoded categorical columns.
# The target (exam_score) is deliberately not part of this selection.
feature_column_names = [
    "age",
    "gender_indexed",
    "course_encoded_0",
    "course_encoded_1",
    "course_encoded_2",
    "course_encoded_3",
    "course_encoded_4",
    "course_encoded_5",
    "study_hours",
    "class_attendance",
    "internet_access_indexed",
    "sleep_hours",
    "sleep_quality_indexed",
    "study_method_encoded_0",
    "study_method_encoded_1",
    "study_method_encoded_2",
    "study_method_encoded_3",
    "facility_rating_indexed",
    "exam_difficulty_indexed",
]
final_df_1 = df_string_indexed_6.select(*feature_column_names)
final_df_1.show()

+---+--------------+----------------+----------------+----------------+----------------+----------------+----------------+-----------+----------------+-----------------------+-----------+---------------------+----------------------+----------------------+----------------------+----------------------+-----------------------+-----------------------+
|age|gender_indexed|course_encoded_0|course_encoded_1|course_encoded_2|course_encoded_3|course_encoded_4|course_encoded_5|study_hours|class_attendance|internet_access_indexed|sleep_hours|sleep_quality_indexed|study_method_encoded_0|study_method_encoded_1|study_method_encoded_2|study_method_encoded_3|facility_rating_indexed|exam_difficulty_indexed|
+---+--------------+----------------+----------------+----------------+----------------+----------------+----------------+-----------+----------------+-----------------------+-----------+---------------------+----------------------+----------------------+----------------------+----------------------

In [28]:
from pyspark.ml import *

# Every column of final_df_1 is a model input.
cols_to_be_featured = final_df_1.columns

# Assemble the inputs into one vector column, then scale each component to
# unit standard deviation without centering (withMean=False).
vec_assemble = VectorAssembler(inputCols=cols_to_be_featured,outputCol="features")
scaler = StandardScaler(inputCol="features",outputCol="features_scaled",withStd=True,withMean=False)

pipeline = Pipeline(stages=[vec_assemble,scaler])

# Keep the label in the SAME DataFrame as the features while transforming.
# The previous approach tagged the feature frame and a separate label frame
# with monotonically_increasing_id() and inner-joined them. Those IDs depend
# on how each frame happens to be partitioned, so the two sides are not
# guaranteed to line up: rows could be paired with the wrong score or silently
# dropped by the join. The assembler only reads `inputCols`, so the extra
# exam_score column does not affect the features.
df_labeled = df_string_indexed_6.select(*cols_to_be_featured, "exam_score")

# NOTE(review): the scaler is fitted on all rows, i.e. before any train/test
# split — confirm that is acceptable for the evaluation.
df_scaler = pipeline.fit(df_labeled).transform(df_labeled)

df_final = df_scaler.select("features_scaled","exam_score")

df_final.show()

+--------------------+----------+
|     features_scaled|exam_score|
+--------------------+----------+
|(19,[0,1,7,8,9,10...|      58.9|
|(19,[0,1,2,8,9,10...|      54.8|
|(19,[0,1,4,8,9,10...|      90.3|
|(19,[0,1,7,8,9,10...|      29.7|
|(19,[0,7,8,9,10,1...|      43.7|
|(19,[0,1,8,9,10,1...|      58.2|
|(19,[0,8,9,10,11,...|      53.7|
|(19,[0,1,4,8,9,10...|      47.3|
|(19,[0,1,2,8,9,10...|      44.9|
|(19,[0,1,6,8,9,10...|      77.7|
|(19,[0,3,8,9,10,1...|      63.2|
|(19,[0,1,5,8,9,11...|      53.5|
|(19,[0,3,8,9,10,1...|      63.9|
|(19,[0,6,8,9,10,1...|      34.1|
|(19,[0,1,8,9,10,1...|      83.5|
|(19,[0,1,6,8,9,10...|      98.5|
|(19,[0,1,4,8,9,10...|      70.3|
|(19,[0,1,2,8,9,10...|      32.3|
|(19,[0,4,8,9,10,1...|      56.2|
|(19,[0,1,8,9,10,1...|      49.3|
+--------------------+----------+
only showing top 20 rows



In [29]:
from pyspark.ml.regression import *
from pyspark.ml.evaluation import *


# Hold out 25% of the rows for evaluation; the seed makes the split repeatable.
df_train, df_test = df_final.randomSplit(weights=[0.75, 0.25], seed=42)

# Random-forest regressor predicting exam_score from the scaled feature vector.
rf_settings = {
    "featuresCol": "features_scaled",
    "labelCol": "exam_score",
    "numTrees": 200,
    "maxDepth": 10,
    "seed": 42,
}
random_forest = RandomForestRegressor(**rf_settings)

# Train on the training split and score the held-out split.
model = random_forest.fit(df_train)
df_pred = model.transform(df_test)

# R² evaluator, comparing the "prediction" column with the true exam_score.
r2_eval = RegressionEvaluator(
    metricName="r2",
    labelCol="exam_score",
    predictionCol="prediction",
)

In [30]:
# Display the first rows of the prediction DataFrame (true score next to the prediction).
df_pred.show()

+--------------------+----------+------------------+
|     features_scaled|exam_score|        prediction|
+--------------------+----------+------------------+
|(19,[0,1,2,8,9,10...|      71.0| 68.07121396820149|
|(19,[0,1,2,8,9,10...|      59.8| 50.07291668231619|
|(19,[0,1,2,8,9,10...|      45.4| 39.42717614068068|
|(19,[0,1,2,8,9,10...|      82.3| 79.87864757922695|
|(19,[0,1,2,8,9,10...|      41.3|  48.8485306515473|
|(19,[0,1,2,8,9,10...|      68.5|63.407855244826294|
|(19,[0,1,2,8,9,10...|      69.1| 80.06231972267841|
|(19,[0,1,2,8,9,10...|      55.0| 51.79639292361672|
|(19,[0,1,2,8,9,10...|      88.7|  73.9555276182132|
|(19,[0,1,2,8,9,10...|      60.1| 50.43744462894288|
|(19,[0,1,2,8,9,10...|      58.5|60.456580567773614|
|(19,[0,1,2,8,9,10...|      51.0| 63.18502212026953|
|(19,[0,1,2,8,9,10...|      32.7|34.536317137368314|
|(19,[0,1,2,8,9,10...|      53.9|  70.2931906964182|
|(19,[0,1,2,8,9,10...|      42.6| 46.16610330301407|
|(19,[0,1,2,8,9,10...|     100.0| 71.512072320

In [31]:
# Print the R² score of the predictions currently held in df_pred.
print(r2_eval.evaluate(df_pred))

0.7095177336003389


In [34]:
# Linear-regression baseline on the same train/test split.
# Note: `model` and `df_pred` are reassigned here, so from this point on they
# refer to the linear model and its predictions.
lr = LinearRegression(labelCol="exam_score", featuresCol="features_scaled")

model = lr.fit(df_train)

# Score the held-out rows and preview the predictions.
df_pred = model.transform(df_test)
df_pred.show()

+--------------------+----------+------------------+
|     features_scaled|exam_score|        prediction|
+--------------------+----------+------------------+
|(19,[0,1,2,8,9,10...|      71.0|  71.7249920239265|
|(19,[0,1,2,8,9,10...|      59.8| 52.13351771811109|
|(19,[0,1,2,8,9,10...|      45.4|42.186070739543375|
|(19,[0,1,2,8,9,10...|      82.3| 83.50145603560112|
|(19,[0,1,2,8,9,10...|      41.3| 50.79167216058575|
|(19,[0,1,2,8,9,10...|      68.5| 60.34541067860492|
|(19,[0,1,2,8,9,10...|      69.1| 82.87404404324081|
|(19,[0,1,2,8,9,10...|      55.0| 54.31083554295025|
|(19,[0,1,2,8,9,10...|      88.7| 75.43124909056561|
|(19,[0,1,2,8,9,10...|      60.1| 52.63730668357138|
|(19,[0,1,2,8,9,10...|      58.5|61.903511871103554|
|(19,[0,1,2,8,9,10...|      51.0| 63.22897896673055|
|(19,[0,1,2,8,9,10...|      32.7| 34.38855466195114|
|(19,[0,1,2,8,9,10...|      53.9|  70.1479829504877|
|(19,[0,1,2,8,9,10...|      42.6|48.626808467559535|
|(19,[0,1,2,8,9,10...|     100.0| 73.865998515

In [35]:
# Root-mean-squared error of the predictions currently held in df_pred
# (the linear-regression ones when the cells are run in order).
met = RegressionEvaluator(
    metricName="rmse",
    labelCol="exam_score",
    predictionCol="prediction",
)
met.evaluate(df_pred)


9.745200660965594