In [0]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window 
# Obtain the Spark session for this notebook, registered under the app name "Day5".
spark = (
    SparkSession.builder
    .appName("Day5")
    .getOrCreate()
)

In [0]:
# Column names only; no types are given here, so they are left for Spark to infer.
schema = ["name", "dept", "salary", "city"]

# Ten sample employee records, one tuple per row: (name, dept, salary, city).
data = [
    ("Alice",   "HR",          50000, "New York"),
    ("Bob",     "Engineering", 60000, "San Francisco"),
    ("Charlie", "HR",          55000, "Los Angeles"),
    ("David",   "Engineering", 62000, "Seattle"),
    ("Eva",     "Finance",     70000, "Chicago"),
    ("Frank",   "Finance",     75000, "Houston"),
    ("Grace",   "Engineering", 65000, "Boston"),
    ("Hannah",  "HR",          48000, "Miami"),
    ("Ian",     "Finance",     68000, "Dallas"),
    ("Jessica", "Engineering", 63000, "Atlanta"),
]

# Turn the in-memory rows into a DataFrame and print it.
df = spark.createDataFrame(data, schema=schema)
df.show()

+-------+-----------+------+-------------+
|   name|       dept|salary|         city|
+-------+-----------+------+-------------+
|  Alice|         HR| 50000|     New York|
|    Bob|Engineering| 60000|San Francisco|
|Charlie|         HR| 55000|  Los Angeles|
|  David|Engineering| 62000|      Seattle|
|    Eva|    Finance| 70000|      Chicago|
|  Frank|    Finance| 75000|      Houston|
|  Grace|Engineering| 65000|       Boston|
| Hannah|         HR| 48000|        Miami|
|    Ian|    Finance| 68000|       Dallas|
|Jessica|Engineering| 63000|      Atlanta|
+-------+-----------+------+-------------+



In [0]:
# First name in each department.
# F.first() depends on the order of rows inside the window. A window that only
# partitions has no defined order, so the value picked could differ between
# runs once the data is shuffled. Ordering by name fixes which row is "first",
# and the explicit unbounded frame makes every row look at its whole department.
df.withColumn(
    "first_value",
    F.first("name").over(
        Window.partitionBy("dept")
        .orderBy("name")
        .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
    ),
).show()

+-------+-----------+------+-------------+-----------+
|   name|       dept|salary|         city|first_value|
+-------+-----------+------+-------------+-----------+
|    Bob|Engineering| 60000|San Francisco|        Bob|
|  David|Engineering| 62000|      Seattle|        Bob|
|  Grace|Engineering| 65000|       Boston|        Bob|
|Jessica|Engineering| 63000|      Atlanta|        Bob|
|    Eva|    Finance| 70000|      Chicago|        Eva|
|  Frank|    Finance| 75000|      Houston|        Eva|
|    Ian|    Finance| 68000|       Dallas|        Eva|
|  Alice|         HR| 50000|     New York|      Alice|
|Charlie|         HR| 55000|  Los Angeles|      Alice|
| Hannah|         HR| 48000|        Miami|      Alice|
+-------+-----------+------+-------------+-----------+



In [0]:
# Last name in each department.
# F.last() depends on the order of rows inside the window. A window that only
# partitions has no defined order, so the value picked could differ between
# runs once the data is shuffled. Ordering by name fixes which row is "last".
# The unbounded frame is required: with an ordering, the default frame ends at
# the current row, which would make last() return each row's own name.
df.withColumn(
    "last_value",
    F.last("name").over(
        Window.partitionBy("dept")
        .orderBy("name")
        .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
    ),
).show()

+-------+-----------+------+-------------+----------+
|   name|       dept|salary|         city|last_value|
+-------+-----------+------+-------------+----------+
|    Bob|Engineering| 60000|San Francisco|   Jessica|
|  David|Engineering| 62000|      Seattle|   Jessica|
|  Grace|Engineering| 65000|       Boston|   Jessica|
|Jessica|Engineering| 63000|      Atlanta|   Jessica|
|    Eva|    Finance| 70000|      Chicago|       Ian|
|  Frank|    Finance| 75000|      Houston|       Ian|
|    Ian|    Finance| 68000|       Dallas|       Ian|
|  Alice|         HR| 50000|     New York|    Hannah|
|Charlie|         HR| 55000|  Los Angeles|    Hannah|
| Hannah|         HR| 48000|        Miami|    Hannah|
+-------+-----------+------+-------------+----------+



In [0]:
# Sequential position of each employee inside their department, lowest salary first.
salary_order_in_dept = Window.partitionBy("dept").orderBy("salary")
df.withColumn("row_number", F.row_number().over(salary_order_in_dept)).show()

+-------+-----------+------+-------------+----------+
|   name|       dept|salary|         city|row_number|
+-------+-----------+------+-------------+----------+
|    Bob|Engineering| 60000|San Francisco|         1|
|  David|Engineering| 62000|      Seattle|         2|
|Jessica|Engineering| 63000|      Atlanta|         3|
|  Grace|Engineering| 65000|       Boston|         4|
|    Ian|    Finance| 68000|       Dallas|         1|
|    Eva|    Finance| 70000|      Chicago|         2|
|  Frank|    Finance| 75000|      Houston|         3|
| Hannah|         HR| 48000|        Miami|         1|
|  Alice|         HR| 50000|     New York|         2|
|Charlie|         HR| 55000|  Los Angeles|         3|
+-------+-----------+------+-------------+----------+



In [0]:
# Rank of each employee inside their department by ascending salary.
rank_in_dept = F.rank().over(
    Window.partitionBy("dept").orderBy("salary")
)
df.withColumn("rank", rank_in_dept).show()

+-------+-----------+------+-------------+----+
|   name|       dept|salary|         city|rank|
+-------+-----------+------+-------------+----+
|    Bob|Engineering| 60000|San Francisco|   1|
|  David|Engineering| 62000|      Seattle|   2|
|Jessica|Engineering| 63000|      Atlanta|   3|
|  Grace|Engineering| 65000|       Boston|   4|
|    Ian|    Finance| 68000|       Dallas|   1|
|    Eva|    Finance| 70000|      Chicago|   2|
|  Frank|    Finance| 75000|      Houston|   3|
| Hannah|         HR| 48000|        Miami|   1|
|  Alice|         HR| 50000|     New York|   2|
|Charlie|         HR| 55000|  Los Angeles|   3|
+-------+-----------+------+-------------+----+



In [0]:
# Dense rank of each employee inside their department by ascending salary.
df.withColumn(
    "dense_rank",
    F.dense_rank().over(Window.partitionBy("dept").orderBy("salary")),
).show()

+-------+-----------+------+-------------+----------+
|   name|       dept|salary|         city|dense_rank|
+-------+-----------+------+-------------+----------+
|    Bob|Engineering| 60000|San Francisco|         1|
|  David|Engineering| 62000|      Seattle|         2|
|Jessica|Engineering| 63000|      Atlanta|         3|
|  Grace|Engineering| 65000|       Boston|         4|
|    Ian|    Finance| 68000|       Dallas|         1|
|    Eva|    Finance| 70000|      Chicago|         2|
|  Frank|    Finance| 75000|      Houston|         3|
| Hannah|         HR| 48000|        Miami|         1|
|  Alice|         HR| 50000|     New York|         2|
|Charlie|         HR| 55000|  Los Angeles|         3|
+-------+-----------+------+-------------+----------+



In [0]:
# Lowest salary of each department, attached to every row of that department.
dept_min_salary = F.min("salary").over(Window.partitionBy("dept"))
df.withColumn("min", dept_min_salary).show()

+-------+-----------+------+-------------+-----+
|   name|       dept|salary|         city|  min|
+-------+-----------+------+-------------+-----+
|    Bob|Engineering| 60000|San Francisco|60000|
|  David|Engineering| 62000|      Seattle|60000|
|  Grace|Engineering| 65000|       Boston|60000|
|Jessica|Engineering| 63000|      Atlanta|60000|
|    Eva|    Finance| 70000|      Chicago|68000|
|  Frank|    Finance| 75000|      Houston|68000|
|    Ian|    Finance| 68000|       Dallas|68000|
|  Alice|         HR| 50000|     New York|48000|
|Charlie|         HR| 55000|  Los Angeles|48000|
| Hannah|         HR| 48000|        Miami|48000|
+-------+-----------+------+-------------+-----+



In [0]:
# Lowest salary across the whole table, returned as a single-row result.
df.agg(
    F.min("salary").alias("min")
).show()

+-----+
|  min|
+-----+
|48000|
+-----+



In [0]:
# Highest salary of each department, attached to every row of that department.
per_dept = Window.partitionBy("dept")
df.withColumn("max", F.max("salary").over(per_dept)).show()

+-------+-----------+------+-------------+-----+
|   name|       dept|salary|         city|  max|
+-------+-----------+------+-------------+-----+
|    Bob|Engineering| 60000|San Francisco|65000|
|  David|Engineering| 62000|      Seattle|65000|
|  Grace|Engineering| 65000|       Boston|65000|
|Jessica|Engineering| 63000|      Atlanta|65000|
|    Eva|    Finance| 70000|      Chicago|75000|
|  Frank|    Finance| 75000|      Houston|75000|
|    Ian|    Finance| 68000|       Dallas|75000|
|  Alice|         HR| 50000|     New York|55000|
|Charlie|         HR| 55000|  Los Angeles|55000|
| Hannah|         HR| 48000|        Miami|55000|
+-------+-----------+------+-------------+-----+



In [0]:
# Highest salary across the whole table, returned as a single-row result.
df.agg(
    F.max("salary").alias("max")
).show()

+-----+
|  max|
+-----+
|75000|
+-----+

