In [20]:
import findspark
findspark.init()

from pyspark.sql import SparkSession,Row
spark = SparkSession.builder.appName('Windows Function').getOrCreate()

from pyspark.sql.window import Window
from pyspark.sql.functions import col, row_number

data = [("James","Sales",3000),("Michael","Sales",4600),
      ("Robert","Sales",4100),("Maria","Finance",3000),
      ("Raman","Finance",3000),("Scott","Finance",3300),
      ("Jen","Finance",3900),("Jeff","Marketing",3000),
      ("Kumar","Marketing",2000),("Jim","Sales",2400),
       ("Kumar","Sales",3000)]

df = spark.createDataFrame(data,["Name","Department","Salary"])
df.show()

# Add new column row_number
wdf=Window.partitionBy("department").orderBy("salary")
df.withColumn("row_number",row_number().over(wdf)).show()

## Max salary in each department
wdf = Window.partitionBy("department").orderBy(col("salary").desc())
df.withColumn("row",row_number().over(wdf)).filter(col("row") ==1).show()


## Min salary in each department
wdf = Window.partitionBy("department").orderBy("salary")
df.withColumn("row",row_number().over(wdf)).filter(col("row") ==1).show()


## Top 3 salary in each department
wdf = Window.partitionBy("department").orderBy(col("salary").desc())
df.withColumn("row",row_number().over(wdf)).filter(col("row") <=3).show()




+-------+----------+------+
|   Name|Department|Salary|
+-------+----------+------+
|  James|     Sales|  3000|
|Michael|     Sales|  4600|
| Robert|     Sales|  4100|
|  Maria|   Finance|  3000|
|  Raman|   Finance|  3000|
|  Scott|   Finance|  3300|
|    Jen|   Finance|  3900|
|   Jeff| Marketing|  3000|
|  Kumar| Marketing|  2000|
|    Jim|     Sales|  2400|
|  Kumar|     Sales|  3000|
+-------+----------+------+

+-------+----------+------+----------+
|   Name|Department|Salary|row_number|
+-------+----------+------+----------+
|  Maria|   Finance|  3000|         1|
|  Raman|   Finance|  3000|         2|
|  Scott|   Finance|  3300|         3|
|    Jen|   Finance|  3900|         4|
|  Kumar| Marketing|  2000|         1|
|   Jeff| Marketing|  3000|         2|
|    Jim|     Sales|  2400|         1|
|  James|     Sales|  3000|         2|
|  Kumar|     Sales|  3000|         3|
| Robert|     Sales|  4100|         4|
|Michael|     Sales|  4600|         5|
+-------+----------+------+------

In [21]:
# max salary using max function

from pyspark.sql.functions import max

# Define a window specification partitioned by department
windowSpec = Window.partitionBy("Department")

# Add a column for the maximum salary within each department
df_max_salary = df.withColumn("max_salary", max(col("Salary")).over(windowSpec))

# Filter the DataFrame to retain only rows where the salary matches the maximum salary within its department
result = df_max_salary.filter(col("Salary") == col("max_salary")).drop("max_salary")

# Show the result
result.show()

+-------+----------+------+
|   Name|Department|Salary|
+-------+----------+------+
|    Jen|   Finance|  3900|
|   Jeff| Marketing|  3000|
|Michael|     Sales|  4600|
+-------+----------+------+



In [22]:
#Get max, min, avg, sum of each group
from pyspark.sql.functions import col, row_number,avg,sum,min,max
w4 = Window.partitionBy("department")
df.withColumn("row",row_number().over(wdf)) \
  .withColumn("avg", avg(col("salary")).over(w4)) \
  .withColumn("sum", sum(col("salary")).over(w4)) \
  .withColumn("min", min(col("salary")).over(w4)) \
  .withColumn("max", max(col("salary")).over(w4)) \
  .where(col("row")==1).select("department","avg","sum","min","max") \
  .show()

+----------+------+-----+----+----+
|department|   avg|  sum| min| max|
+----------+------+-----+----+----+
|   Finance|3300.0|13200|3000|3900|
| Marketing|2500.0| 5000|2000|3000|
|     Sales|3420.0|17100|2400|4600|
+----------+------+-----+----+----+



In [23]:
# The RANK() function assigns a unique rank to each distinct row within a result set.
from pyspark.sql.functions import rank
df.withColumn("rank",rank().over(wdf)).show()

# DENSE_RANK() ensures that the ranks are continuous without any gaps, regardless of ties. 
from pyspark.sql.functions import dense_rank
df.withColumn("dense_rank",dense_rank().over(wdf)).show()

# Ranking in the form of percentage: rank=1, percent_rank=0.0: rank=2 out of 4 ranks, percent_rank=0.25
from pyspark.sql.functions import percent_rank
df.withColumn("percent_rank",percent_rank().over(wdf)).show()
    
# The NTILE() function is a window function that distributes rows of an ordered partition 
# into a specified number of approximately equal groups, or buckets.
from pyspark.sql.functions import ntile
df.withColumn("ntile",ntile(2).over(wdf)).show()

# CUME_DIST returns the cumulative distribution of a value in a group of values.
from pyspark.sql.functions import cume_dist    
df.withColumn("cume_dist",cume_dist().over(wdf)).show()

# creates a new column that accesses a previous row from another column.
from pyspark.sql.functions import lag    
df.withColumn("lag",lag("salary",1).over(wdf)).show()


# creates a new column that accesses a next row from current column.
from pyspark.sql.functions import lead    
df.withColumn("lead",lead("salary",1).over(wdf)).show()


+-------+----------+------+----+
|   Name|Department|Salary|rank|
+-------+----------+------+----+
|    Jen|   Finance|  3900|   1|
|  Scott|   Finance|  3300|   2|
|  Maria|   Finance|  3000|   3|
|  Raman|   Finance|  3000|   3|
|   Jeff| Marketing|  3000|   1|
|  Kumar| Marketing|  2000|   2|
|Michael|     Sales|  4600|   1|
| Robert|     Sales|  4100|   2|
|  James|     Sales|  3000|   3|
|  Kumar|     Sales|  3000|   3|
|    Jim|     Sales|  2400|   5|
+-------+----------+------+----+

+-------+----------+------+----------+
|   Name|Department|Salary|dense_rank|
+-------+----------+------+----------+
|    Jen|   Finance|  3900|         1|
|  Scott|   Finance|  3300|         2|
|  Maria|   Finance|  3000|         3|
|  Raman|   Finance|  3000|         3|
|   Jeff| Marketing|  3000|         1|
|  Kumar| Marketing|  2000|         2|
|Michael|     Sales|  4600|         1|
| Robert|     Sales|  4100|         2|
|  James|     Sales|  3000|         3|
|  Kumar|     Sales|  3000|         

In [27]:

# First provides the max value in the given group displayed as new column
from pyspark.sql.functions import first    
df.withColumn("first",first("salary").over(wdf)).show()


# Last provides the min value in the given group displayed as new column
from pyspark.sql.functions import last    
df.withColumn("last",last("salary").over(wdf)).show()


# standard deviation
from pyspark.sql.functions import stddev    
df.withColumn("stddev",stddev("salary").over(wdf)).show()

+-------+----------+------+----+
|   Name|Department|Salary|lead|
+-------+----------+------+----+
|    Jen|   Finance|  3900|3900|
|  Scott|   Finance|  3300|3900|
|  Maria|   Finance|  3000|3900|
|  Raman|   Finance|  3000|3900|
|   Jeff| Marketing|  3000|3000|
|  Kumar| Marketing|  2000|3000|
|Michael|     Sales|  4600|4600|
| Robert|     Sales|  4100|4600|
|  James|     Sales|  3000|4600|
|  Kumar|     Sales|  3000|4600|
|    Jim|     Sales|  2400|4600|
+-------+----------+------+----+

+-------+----------+------+----+
|   Name|Department|Salary|lead|
+-------+----------+------+----+
|    Jen|   Finance|  3900|3900|
|  Scott|   Finance|  3300|3300|
|  Maria|   Finance|  3000|3000|
|  Raman|   Finance|  3000|3000|
|   Jeff| Marketing|  3000|3000|
|  Kumar| Marketing|  2000|2000|
|Michael|     Sales|  4600|4600|
| Robert|     Sales|  4100|4100|
|  James|     Sales|  3000|3000|
|  Kumar|     Sales|  3000|3000|
|    Jim|     Sales|  2400|2400|
+-------+----------+------+----+

+-------