In [3]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

# Spark session used by every cell in this notebook.
spark = (
    SparkSession.builder
    .appName('Student percentage')
    .getOrCreate()
)

# Student master data: one (student_id, student_name) tuple per student.
studentSchema = ["student_id", "student_name"]
studentData = [
    (1, "Ram"),
    (2, "Sai"),
    (3, "Victor"),
    (4, "Anitha"),
]

# Marks as (student_id, subject, marks) tuples.
# Note: student 3 has no "eng" row in this sample.
marksSchema = ["student_id", "subject", "marks"]
marksData = [
    # math
    (1, "math", 90), (2, "math", 80), (3, "math", 40), (4, "math", 20),
    # eng
    (1, "eng", 80), (2, "eng", 60), (4, "eng", 30),
    # sci
    (1, "sci", 90), (2, "sci", 50), (3, "sci", 50), (4, "sci", 10),
]

# Build both DataFrames from the in-memory samples, then display them
# (students first, marks second).
studentDf = spark.createDataFrame(studentData, schema=studentSchema)
markDf = spark.createDataFrame(marksData, schema=marksSchema)

studentDf.show()
markDf.show()


+----------+------------+
|student_id|student_name|
+----------+------------+
|         1|         Ram|
|         2|         Sai|
|         3|      Victor|
|         4|      Anitha|
+----------+------------+

+----------+-------+-----+
|student_id|subject|marks|
+----------+-------+-----+
|         1|   math|   90|
|         2|   math|   80|
|         3|   math|   40|
|         4|   math|   20|
|         1|    eng|   80|
|         2|    eng|   60|
|         4|    eng|   30|
|         1|    sci|   90|
|         2|    sci|   50|
|         3|    sci|   50|
|         4|    sci|   10|
+----------+-------+-----+



In [46]:
# Compute each student's percentage and derive their result (grade)

from pyspark.sql.window import Window
from pyspark.sql.functions import sum,col,count,when,round

# Inner-join the marks onto the students by student_id, giving one row per
# (student, subject) mark with the student's name attached.
joinDf = studentDf.join(markDf, on="student_id", how="inner")
joinDf.show()

+----------+------------+-------+-----+
|student_id|student_name|subject|marks|
+----------+------------+-------+-----+
|         1|         Ram|   math|   90|
|         1|         Ram|    eng|   80|
|         1|         Ram|    sci|   90|
|         2|         Sai|   math|   80|
|         2|         Sai|    eng|   60|
|         2|         Sai|    sci|   50|
|         3|      Victor|   math|   40|
|         3|      Victor|    sci|   50|
|         4|      Anitha|   math|   20|
|         4|      Anitha|    eng|   30|
|         4|      Anitha|    sci|   10|
+----------+------------+-------+-----+



In [47]:
# Percentage per student = total marks / number of subject rows, rounded to
# 2 decimals.
#
# `sum`, `count` and `round` are the pyspark.sql.functions versions imported
# in the join cell above, not the Python builtins.
#
# A groupBy/agg yields exactly one row per student directly; this replaces a
# window aggregate computed on every mark row followed by dropDuplicates().
# Grouping by (student_id, student_name) assumes student_id uniquely
# identifies a student name.
#
# NOTE(review): only subjects that have a marks row are counted, so a missing
# subject (e.g. student 3 has no "eng" row) is excluded from the denominator
# rather than treated as 0 -- confirm this is the intended rule. The value is
# a percentage only if each subject is marked out of 100 -- confirm.
sumDf = (
    joinDf
    .groupBy("student_id", "student_name")
    .agg(
        round(sum(col("marks")) / count(col("subject")), 2).alias("percentage")
    )
    # Explicit ordering so the displayed rows are deterministic.
    .orderBy("student_id")
)
sumDf.show()
sumDf.printSchema()

+----------+------------+----------+
|student_id|student_name|percentage|
+----------+------------+----------+
|         1|         Ram|     86.67|
|         2|         Sai|     63.33|
|         3|      Victor|      45.0|
|         4|      Anitha|      20.0|
+----------+------------+----------+

root
 |-- student_id: long (nullable = true)
 |-- student_name: string (nullable = true)
 |-- percentage: double (nullable = true)



In [48]:
# Classify each student by percentage:
#   >= 75 -> 'Distinction', >= 60 -> 'First Class',
#   >= 35 -> 'second Class', anything else -> 'Fail'.
#
# `when` branches are evaluated in order and the first match wins, so each
# branch only needs a lower bound. Closed integer ranges via between(60, 74) /
# between(35, 59) must not be used here: `percentage` is fractional, so values
# such as 74.5 or 59.67 would match no band and be wrongly labelled 'Fail'.
(
    sumDf
    .withColumn(
        "result",
        when(col("percentage") >= 75, 'Distinction')
        .when(col("percentage") >= 60, 'First Class')
        .when(col("percentage") >= 35, 'second Class')
        .otherwise('Fail'),
    )
    .show()
)

+----------+------------+----------+------------+
|student_id|student_name|percentage|      result|
+----------+------------+----------+------------+
|         1|         Ram|     86.67| Distinction|
|         2|         Sai|     63.33| First Class|
|         3|      Victor|      45.0|second Class|
|         4|      Anitha|      20.0|        Fail|
+----------+------------+----------+------------+

