In [0]:
 csv_data = """student_id,name,subject,score,grade
1,Ankit,Math,85,A
2,Divya,Science,92,A
3,Rahul,English,78,B
4,Sneha,Math,65,C
5,Aryan,Science,55,D
6,Isha,English,88,A
7,Tanvi,Math,91,A
8,Kunal,Science,72,B
9,Megha,English,60,C
10,Rohan,Math,40,F
"""
with open("/tmp/student_scores.csv", "w") as f: 
  f.write(csv_data)

Read into DataFrame

In [0]:
df = spark.read.option("header", True).option("inferSchema", 
True).csv("/tmp/student_scores.csv") 
df.show()

+----------+-----+-------+-----+-----+
|student_id| name|subject|score|grade|
+----------+-----+-------+-----+-----+
|         1|Ankit|   Math|   85|    A|
|         2|Divya|Science|   92|    A|
|         3|Rahul|English|   78|    B|
|         4|Sneha|   Math|   65|    C|
|         5|Aryan|Science|   55|    D|
|         6| Isha|English|   88|    A|
|         7|Tanvi|   Math|   91|    A|
|         8|Kunal|Science|   72|    B|
|         9|Megha|English|   60|    C|
|        10|Rohan|   Math|   40|    F|
+----------+-----+-------+-----+-----+



write to delta


In [0]:

df.write.format("delta").mode("overwrite").save("/tmp/delta/student_scores")
delta_df=spark.read.format("delta").load("/tmp/delta/student_scores")
delta_df.show()


+----------+-----+-------+-----+-----+
|student_id| name|subject|score|grade|
+----------+-----+-------+-----+-----+
|         1|Ankit|   Math|   85|    A|
|         2|Divya|Science|   92|    A|
|         3|Rahul|English|   78|    B|
|         4|Sneha|   Math|   65|    C|
|         5|Aryan|Science|   55|    D|
|         6| Isha|English|   88|    A|
|         7|Tanvi|   Math|   91|    A|
|         8|Kunal|Science|   72|    B|
|         9|Megha|English|   60|    C|
|        10|Rohan|   Math|   40|    F|
+----------+-----+-------+-----+-----+



register delta table

In [0]:
spark.sql("USE hive_metastore.default")
spark.sql("DROP TABLE IF EXISTS student_scores") 
spark.sql("""CREATE TABLE student_scores USING DELTA LOCATION 
'dbfs:/tmp/delta/student_scores' """)

DataFrame[]

basic tasks

In [0]:
df=spark.read.format("delta").load("/tmp/delta/student_scores")
selected=df.select("name","score")
selected.show()
no_students_sub=df.groupBy("subject").count()
display(no_students_sub)
avg_score_sub=df.groupBy("subject").avg("score")
display(avg_score_sub)
more_80=df.filter(df.score>80).select("name")
display(more_80)


+-----+-----+
| name|score|
+-----+-----+
|Ankit|   85|
|Divya|   92|
|Rahul|   78|
|Sneha|   65|
|Aryan|   55|
| Isha|   88|
|Tanvi|   91|
|Kunal|   72|
|Megha|   60|
|Rohan|   40|
+-----+-----+



subject,count
Science,3
Math,4
English,3


subject,avg(score)
Science,73.0
Math,70.25
English,75.33333333333333


name
Ankit
Divya
Isha
Tanvi


advanced queries

In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import rank,desc
w=Window.partitionBy("subject").orderBy(desc("score"))
df.withColumn("rank",rank().over(w)).filter("rank=1").show()



+----------+-----+-------+-----+-----+----+
|student_id| name|subject|score|grade|rank|
+----------+-----+-------+-----+-----+----+
|         6| Isha|English|   88|    A|   1|
|         7|Tanvi|   Math|   91|    A|   1|
|         2|Divya|Science|   92|    A|   1|
+----------+-----+-------+-----+-----+----+



In [0]:
grades_student_count=df.groupBy("grade").count()
display(grades_student_count)
failed=df.filter(df.grade=="F").select("name")
display(failed)
scre_60_90=df.filter(df.score.between(60,90)).select("name")
display(scre_60_90)
w=Window.partitionBy("subject").orderBy(desc("score"))
df=df.withColumn("ranks",rank().over(w))
df.show()


grade,count
F,1
B,2
D,1
C,2
A,4


name
Rohan


name
Ankit
Rahul
Sneha
Isha
Kunal
Megha


+----------+-----+-------+-----+-----+-----+
|student_id| name|subject|score|grade|ranks|
+----------+-----+-------+-----+-----+-----+
|         6| Isha|English|   88|    A|    1|
|         3|Rahul|English|   78|    B|    2|
|         9|Megha|English|   60|    C|    3|
|         7|Tanvi|   Math|   91|    A|    1|
|         1|Ankit|   Math|   85|    A|    2|
|         4|Sneha|   Math|   65|    C|    3|
|        10|Rohan|   Math|   40|    F|    4|
|         2|Divya|Science|   92|    A|    1|
|         8|Kunal|Science|   72|    B|    2|
|         5|Aryan|Science|   55|    D|    3|
+----------+-----+-------+-----+-----+-----+



update amd delete

In [0]:
from delta.tables import DeltaTable
deltatable=DeltaTable.forPath(spark,"/tmp/delta/student_scores")
deltatable.update(condition="subject='English'",set={"score":"score+5"})
deltatable.toDF().show()

+----------+-----+-------+-----+-----+
|student_id| name|subject|score|grade|
+----------+-----+-------+-----+-----+
|         1|Ankit|   Math|   85|    A|
|         2|Divya|Science|   92|    A|
|         4|Sneha|   Math|   65|    C|
|         7|Tanvi|   Math|   91|    A|
|         8|Kunal|Science|   72|    B|
|         3|Rahul|English|  128|    B|
|         6| Isha|English|  138|    A|
|         9|Megha|English|  110|    C|
+----------+-----+-------+-----+-----+



In [0]:
deltatable.delete("score<60")
deltatable.toDF().show()

+----------+-----+-------+-----+-----+
|student_id| name|subject|score|grade|
+----------+-----+-------+-----+-----+
|         3|Rahul|English|  128|    B|
|         6| Isha|English|  138|    A|
|         9|Megha|English|  110|    C|
|         1|Ankit|   Math|   85|    A|
|         2|Divya|Science|   92|    A|
|         4|Sneha|   Math|   65|    C|
|         7|Tanvi|   Math|   91|    A|
|         8|Kunal|Science|   72|    B|
+----------+-----+-------+-----+-----+



In [0]:
from pyspark.sql.functions import col,when
df2=deltatable.toDF().withColumn("pass_status",when(col("score")>=50,"Pass").otherwise("Fail"))
df2.show()


+----------+-----+-------+-----+-----+-----------+
|student_id| name|subject|score|grade|pass_status|
+----------+-----+-------+-----+-----+-----------+
|         3|Rahul|English|  128|    B|       Pass|
|         6| Isha|English|  138|    A|       Pass|
|         9|Megha|English|  110|    C|       Pass|
|         1|Ankit|   Math|   85|    A|       Pass|
|         2|Divya|Science|   92|    A|       Pass|
|         4|Sneha|   Math|   65|    C|       Pass|
|         7|Tanvi|   Math|   91|    A|       Pass|
|         8|Kunal|Science|   72|    B|       Pass|
+----------+-----+-------+-----+-----+-----------+



Data transformation and views

In [0]:
df2.createOrReplaceTempView("students_views")
spark.sql("select avg(score)from students_views").show()

+----------+
|avg(score)|
+----------+
|    97.625|
+----------+



In [0]:
df2.write.format("delta").mode("overwrite").save("dbfs:/tmp/delta/student_scores2")

In [0]:
spark.sql("use hive_metastore.default")
spark.sql("DROP TABLE IF EXISTS student_scores2")
spark.sql(""" create table student_scores2 using delta location 'dbfs:/tmp/delta/student_scores2' """)


DataFrame[]

In [0]:
df2.write.mode("overwrite").parquet("/tmp/delta/student_scores_parquet_file")
df2.write.mode("overwrite").json("/tmp/delta/student_scores_json_file")