In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.feature import *
from pyspark.ml.classification import *

In [2]:
spark = SparkSession.builder.appName("Amith").getOrCreate()

In [3]:
df = spark.read.csv("Exam_Score_Prediction.csv",header=True,inferSchema=True)
df.show()

+----------+---+------+-------+-----------+----------------+---------------+-----------+-------------+-------------+---------------+---------------+----------+
|student_id|age|gender| course|study_hours|class_attendance|internet_access|sleep_hours|sleep_quality| study_method|facility_rating|exam_difficulty|exam_score|
+----------+---+------+-------+-----------+----------------+---------------+-----------+-------------+-------------+---------------+---------------+----------+
|         1| 17|  male|diploma|       2.78|            92.9|            yes|        7.4|         poor|     coaching|            low|           hard|      58.9|
|         2| 23| other|    bca|       3.37|            64.8|            yes|        4.6|      average|online videos|         medium|       moderate|      54.8|
|         3| 22|  male|   b.sc|       7.88|            76.8|            yes|        8.5|         poor|     coaching|           high|       moderate|      90.3|
|         4| 20| other|diploma|       0.

In [4]:
df.printSchema()

root
 |-- student_id: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- course: string (nullable = true)
 |-- study_hours: double (nullable = true)
 |-- class_attendance: double (nullable = true)
 |-- internet_access: string (nullable = true)
 |-- sleep_hours: double (nullable = true)
 |-- sleep_quality: string (nullable = true)
 |-- study_method: string (nullable = true)
 |-- facility_rating: string (nullable = true)
 |-- exam_difficulty: string (nullable = true)
 |-- exam_score: double (nullable = true)



In [5]:
for i in df.columns:
    dd = df.select(i)
    print(i,":",dd.filter(col(i).isNull()).count())

student_id : 0
age : 0
gender : 0
course : 0
study_hours : 0
class_attendance : 0
internet_access : 0
sleep_hours : 0
sleep_quality : 0
study_method : 0
facility_rating : 0
exam_difficulty : 0
exam_score : 0


In [6]:
df_gender = df.groupBy("gender").count()
s = df_gender.agg(sum(col("count"))).first()[0]
df_gender = df_gender.withColumn("percentage",col("count")*100/s)
df_gender.show()

+------+-----+----------+
|gender|count|percentage|
+------+-----+----------+
|female| 6579|    32.895|
| other| 6726|     33.63|
|  male| 6695|    33.475|
+------+-----+----------+



In [7]:
df_course = df.groupBy("course").count()
s = df_course.agg(sum(col("count"))).first()[0]
df_course = df_course.withColumn("percentage",col("count")*100/s)
df_course.show()

+-------+-----+----------+
| course|count|percentage|
+-------+-----+----------+
|    bca| 2902|     14.51|
|   b.sc| 2878|     14.39|
|     ba| 2896|     14.48|
| b.tech| 2798|     13.99|
|  b.com| 2864|     14.32|
|    bba| 2836|     14.18|
|diploma| 2826|     14.13|
+-------+-----+----------+



In [8]:
df.agg(max(col("study_hours"))).show()
df.agg(min(col("study_hours"))).show()
df.agg(median(col("study_hours"))).show()
df.agg(mean(col("study_hours"))).show()

+----------------+
|max(study_hours)|
+----------------+
|            7.91|
+----------------+

+----------------+
|min(study_hours)|
+----------------+
|            0.08|
+----------------+

+-------------------+
|median(study_hours)|
+-------------------+
|               4.04|
+-------------------+

+-----------------+
| avg(study_hours)|
+-----------------+
|4.007603500000015|
+-----------------+



In [9]:
df.agg(max(col("class_attendance"))).show()
df.agg(min(col("class_attendance"))).show()
df.agg(median(col("class_attendance"))).show()
df.agg(mean(col("class_attendance"))).show()

+---------------------+
|max(class_attendance)|
+---------------------+
|                 99.4|
+---------------------+

+---------------------+
|min(class_attendance)|
+---------------------+
|                 40.6|
+---------------------+

+------------------------+
|median(class_attendance)|
+------------------------+
|                    69.9|
+------------------------+

+---------------------+
|avg(class_attendance)|
+---------------------+
|    70.01736499999947|
+---------------------+



In [10]:
df_internet_access = df.groupBy("internet_access").count()
s = df_course.agg(sum(col("count"))).first()[0]
df_internet_access = df_internet_access.withColumn("percentage",col("count")*100/s)
df_internet_access.show()

+---------------+-----+----------+
|internet_access|count|percentage|
+---------------+-----+----------+
|             no| 3012|     15.06|
|            yes|16988|     84.94|
+---------------+-----+----------+



In [11]:
df.agg(max(col("sleep_hours"))).show()
df.agg(min(col("sleep_hours"))).show()
df.agg(median(col("sleep_hours"))).show()
df.agg(mean(col("sleep_hours"))).show()

+----------------+
|max(sleep_hours)|
+----------------+
|             9.9|
+----------------+

+----------------+
|min(sleep_hours)|
+----------------+
|             4.1|
+----------------+

+-------------------+
|median(sleep_hours)|
+-------------------+
|                7.0|
+-------------------+

+-----------------+
| avg(sleep_hours)|
+-----------------+
|7.008560000000006|
+-----------------+



In [12]:
df_sleep_quality = df.groupBy("sleep_quality").count()
s = df_sleep_quality.agg(sum(col("count"))).first()[0]
df_sleep_quality = df_sleep_quality.withColumn("percentage",col("count")*100/s)
df_sleep_quality.show()

+-------------+-----+----------+
|sleep_quality|count|percentage|
+-------------+-----+----------+
|      average| 6694|     33.47|
|         poor| 6687|    33.435|
|         good| 6619|    33.095|
+-------------+-----+----------+



In [13]:
df_study_method = df.groupBy("study_method").count()
s = df_study_method.agg(sum(col("count"))).first()[0]
df_study_method = df_study_method.withColumn("percentage",col("count")*100/s)
df_study_method.show()

+-------------+-----+----------+
| study_method|count|percentage|
+-------------+-----+----------+
|        mixed| 3894|     19.47|
|online videos| 4069|    20.345|
|     coaching| 4036|     20.18|
|   self-study| 4079|    20.395|
|  group study| 3922|     19.61|
+-------------+-----+----------+



In [14]:
df_facility_rating = df.groupBy("facility_rating").count()
s = df_facility_rating.agg(sum(col("count"))).first()[0]
df_facility_rating = df_facility_rating.withColumn("percentage",col("count")*100/s)
df_facility_rating.show()

+---------------+-----+----------+
|facility_rating|count|percentage|
+---------------+-----+----------+
|            low| 6638|     33.19|
|           high| 6602|     33.01|
|         medium| 6760|      33.8|
+---------------+-----+----------+



In [15]:
df_exam_difficulty = df.groupBy("exam_difficulty").count()
s = df_exam_difficulty.agg(sum(col("count"))).first()[0]
df_exam_difficulty = df_exam_difficulty.withColumn("percentage",col("count")*100/s)
df_exam_difficulty.show()

+---------------+-----+----------+
|exam_difficulty|count|percentage|
+---------------+-----+----------+
|       moderate| 9878|     49.39|
|           hard| 3981|    19.905|
|           easy| 6141|    30.705|
+---------------+-----+----------+



In [16]:
df.agg(max(col("exam_score"))).show()
df.agg(min(col("exam_score"))).show()
df.agg(median(col("exam_score"))).show()
df.agg(mean(col("exam_score"))).show()

+---------------+
|max(exam_score)|
+---------------+
|          100.0|
+---------------+

+---------------+
|min(exam_score)|
+---------------+
|         19.599|
+---------------+

+------------------+
|median(exam_score)|
+------------------+
|              62.6|
+------------------+

+-----------------+
|  avg(exam_score)|
+-----------------+
|62.51322500000027|
+-----------------+



In [17]:
df.show()

+----------+---+------+-------+-----------+----------------+---------------+-----------+-------------+-------------+---------------+---------------+----------+
|student_id|age|gender| course|study_hours|class_attendance|internet_access|sleep_hours|sleep_quality| study_method|facility_rating|exam_difficulty|exam_score|
+----------+---+------+-------+-----------+----------------+---------------+-----------+-------------+-------------+---------------+---------------+----------+
|         1| 17|  male|diploma|       2.78|            92.9|            yes|        7.4|         poor|     coaching|            low|           hard|      58.9|
|         2| 23| other|    bca|       3.37|            64.8|            yes|        4.6|      average|online videos|         medium|       moderate|      54.8|
|         3| 22|  male|   b.sc|       7.88|            76.8|            yes|        8.5|         poor|     coaching|           high|       moderate|      90.3|
|         4| 20| other|diploma|       0.

In [18]:
indexer = StringIndexerModel.from_labels(labels=["female","other","male"],inputCol="gender",outputCol="gender_indexed")
df_indexed = indexer.transform(df)
df_indexed = df_indexed.withColumn("gender_indexed",col("gender_indexed")-1)
df_1 = df_indexed.withColumnRenamed("gender_indexed","gender_encoded")
df_1.show()

+----------+---+------+-------+-----------+----------------+---------------+-----------+-------------+-------------+---------------+---------------+----------+--------------+
|student_id|age|gender| course|study_hours|class_attendance|internet_access|sleep_hours|sleep_quality| study_method|facility_rating|exam_difficulty|exam_score|gender_encoded|
+----------+---+------+-------+-----------+----------------+---------------+-----------+-------------+-------------+---------------+---------------+----------+--------------+
|         1| 17|  male|diploma|       2.78|            92.9|            yes|        7.4|         poor|     coaching|            low|           hard|      58.9|           1.0|
|         2| 23| other|    bca|       3.37|            64.8|            yes|        4.6|      average|online videos|         medium|       moderate|      54.8|           0.0|
|         3| 22|  male|   b.sc|       7.88|            76.8|            yes|        8.5|         poor|     coaching|         

In [19]:
string_indexer = StringIndexer(inputCol="course",outputCol="course_indexed")
one_hot_encoder = OneHotEncoder(inputCol="course_indexed",outputCol="course_encoded")

df_string_indexer = string_indexer.fit(df_1).transform(df_1)
df_one_hot_encoder = one_hot_encoder.fit(df_string_indexer).transform(df_string_indexer)
df_one_hot_encoder.show()

+----------+---+------+-------+-----------+----------------+---------------+-----------+-------------+-------------+---------------+---------------+----------+--------------+--------------+--------------+
|student_id|age|gender| course|study_hours|class_attendance|internet_access|sleep_hours|sleep_quality| study_method|facility_rating|exam_difficulty|exam_score|gender_encoded|course_indexed|course_encoded|
+----------+---+------+-------+-----------+----------------+---------------+-----------+-------------+-------------+---------------+---------------+----------+--------------+--------------+--------------+
|         1| 17|  male|diploma|       2.78|            92.9|            yes|        7.4|         poor|     coaching|            low|           hard|      58.9|           1.0|           5.0| (6,[5],[1.0])|
|         2| 23| other|    bca|       3.37|            64.8|            yes|        4.6|      average|online videos|         medium|       moderate|      54.8|           0.0|      

In [98]:
from typing import Any


pt = []
for i in df_one_hot_encoder.select("course_encoded").collect():
    lk = list[Any](i[0])
    for j in lk:
        print(j[0])
    pt.append(lk)
print(pt)

IndexError: invalid index to scalar variable.

In [97]:
pt[0]

[np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(0.0),
 np.float64(1.0)]