# <font color=Blue>Window Functions</font>

* Window functions operate on a group of rows and return a single value for every input row

In [None]:
# Create SparkSession
# SparkSession is imported in this cell so the notebook runs on a fresh kernel
# (no earlier cell brings the name into scope).
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()

# Sample rows in (employee_name, department, salary) order.
# ("James", "Sales", 3000) appears twice and several salaries repeat within a
# department, so the window-function cells below have ties to work with.
# No backslash continuations are needed inside the parentheses.
simpleData = (
    ("James", "Sales", 3000),
    ("Michael", "Sales", 4600),
    ("Robert", "Sales", 4100),
    ("Maria", "Finance", 3000),
    ("James", "Sales", 3000),
    ("Scott", "Finance", 3300),
    ("Jen", "Finance", 3900),
    ("Jeff", "Marketing", 3000),
    ("Kumar", "Marketing", 2000),
    ("Saif", "Sales", 4100),
)

columns = ["employee_name", "department", "salary"]

# Build the DataFrame used by every later cell and print it without truncating values.
df = spark.createDataFrame(data=simpleData, schema=columns)
df.show(truncate=False)

In [None]:
# Output
# +-------------+----------+------+
# |employee_name|department|salary|
# +-------------+----------+------+
# |        James|     Sales|  3000|
# |      Michael|     Sales|  4600|
# |       Robert|     Sales|  4100|
# |        Maria|   Finance|  3000|
# |        James|     Sales|  3000|
# |        Scott|   Finance|  3300|
# |          Jen|   Finance|  3900|
# |         Jeff| Marketing|  3000|
# |        Kumar| Marketing|  2000|
# |         Saif|     Sales|  4100|
# +-------------+----------+------+

### 1) row_number()

In [None]:
from pyspark.sql.functions import row_number
from pyspark.sql.window import Window

# Window spec: one partition per department, rows ordered by salary.
dept_salary_window = Window.partitionBy("department").orderBy("salary")

# Attach the row_number() result computed over that window as column "row_num".
res_df = df.withColumn("row_num", row_number().over(dept_salary_window))

### 2) rank()

In [None]:
from pyspark.sql.functions import rank
from pyspark.sql.window import Window

# Window spec: one partition per department, rows ordered by salary.
dept_salary_window = Window.partitionBy("department").orderBy("salary")

# Attach the rank() result computed over that window as column "rnk".
res_df = df.withColumn("rnk", rank().over(dept_salary_window))

### 3) dense_rank()

In [None]:
from pyspark.sql.functions import dense_rank
from pyspark.sql.window import Window

# Window spec: one partition per department, rows ordered by salary.
dept_salary_window = Window.partitionBy("department").orderBy("salary")

# Attach the dense_rank() result computed over that window as column "dns_rnk".
res_df = df.withColumn("dns_rnk", dense_rank().over(dept_salary_window))

### 4) percent_rank()

* Returns the percentile rank of rows within a window partition

In [None]:
from pyspark.sql.window import Window
# Import the function this cell actually calls; the cell previously imported
# dense_rank, which left percent_rank undefined.
from pyspark.sql.functions import percent_rank

# Window spec: one partition per department, rows ordered by salary.
dept_salary_window = Window.partitionBy("department").orderBy("salary")

# The result column is named "percent_rank" so it matches the header shown in
# the output table for this section.
res_df = df.withColumn("percent_rank", percent_rank().over(dept_salary_window))

In [None]:
# Output:
# +-------------+----------+------+------------+
# |employee_name|department|salary|percent_rank|
# +-------------+----------+------+------------+
# |        James|     Sales|  3000|         0.0|
# |        James|     Sales|  3000|         0.0|
# |       Robert|     Sales|  4100|         0.5|
# |         Saif|     Sales|  4100|         0.5|
# |      Michael|     Sales|  4600|         1.0|
# |        Maria|   Finance|  3000|         0.0|
# |        Scott|   Finance|  3300|         0.5|
# |          Jen|   Finance|  3900|         1.0|
# |        Kumar| Marketing|  2000|         0.0|
# |         Jeff| Marketing|  3000|         1.0|
# +-------------+----------+------+------------+

### 5) ntile()

* Returns the relative rank of result rows within a window partition
* Below we pass 2 as the argument, so it returns a ranking between 2 values (1 and 2)

In [None]:
from pyspark.sql.functions import ntile
from pyspark.sql.window import Window

# Window spec: one partition per department, rows ordered by salary.
dept_salary_window = Window.partitionBy("department").orderBy("salary")

# Attach the ntile(2) result computed over that window as column "ntile".
res_df = df.withColumn("ntile", ntile(2).over(dept_salary_window))

In [None]:
# Output:
# +-------------+----------+------+-----+
# |employee_name|department|salary|ntile|
# +-------------+----------+------+-----+
# |        James|     Sales|  3000|    1|
# |        James|     Sales|  3000|    1|
# |       Robert|     Sales|  4100|    1|
# |         Saif|     Sales|  4100|    2|
# |      Michael|     Sales|  4600|    2|
# |        Maria|   Finance|  3000|    1|
# |        Scott|   Finance|  3300|    1|
# |          Jen|   Finance|  3900|    2|
# |        Kumar| Marketing|  2000|    1|
# |         Jeff| Marketing|  3000|    2|
# +-------------+----------+------+-----+

### 6) cume_dist()

* This function computes the cumulative distribution of the value within a window partition
* The result ranges from 0 to 1

In [None]:
from pyspark.sql.window import Window
from pyspark.sql.functions import cume_dist

# Window spec: one partition per department, rows ordered by salary.
dept_salary_window = Window.partitionBy("department").orderBy("salary")

# The result column is named "cume_dist" (previously "cumeDist") so it matches
# the header in this section's output table and the snake_case column names
# used by the other cells.
res_df = df.withColumn("cume_dist", cume_dist().over(dept_salary_window))

In [None]:
# Output:
# +-------------+----------+------+------------------+
# |employee_name|department|salary|         cume_dist|
# +-------------+----------+------+------------------+
# |        James|     Sales|  3000|               0.4|
# |        James|     Sales|  3000|               0.4|
# |       Robert|     Sales|  4100|               0.8|
# |         Saif|     Sales|  4100|               0.8|
# |      Michael|     Sales|  4600|               1.0|
# |        Maria|   Finance|  3000|0.3333333333333333|
# |        Scott|   Finance|  3300|0.6666666666666666|
# |          Jen|   Finance|  3900|               1.0|
# |        Kumar| Marketing|  2000|               0.5|
# |         Jeff| Marketing|  3000|               1.0|
# +-------------+----------+------+------------------+

### 7) lag()

* lag() function allows you to access a previous row's value within the partition based on a specified offset

In [None]:
from pyspark.sql.functions import lag
from pyspark.sql.window import Window

# Window spec: one partition per department, rows ordered by salary.
dept_salary_window = Window.partitionBy("department").orderBy("salary")

# lag("salary") is called without an explicit offset, so the library default applies.
# Its result over the window is attached as column "lag".
res_df = df.withColumn("lag", lag("salary").over(dept_salary_window))

### 8) lead()

* lead() retrieves the column value from the following row within the partition based on specified offset

In [None]:
from pyspark.sql.window import Window
from pyspark.sql.functions import lead

# Window spec: one partition per department, rows ordered by salary.
dept_salary_window = Window.partitionBy("department").orderBy("salary")

# lead("salary", 2) is called with an explicit offset of 2.
# The result column is named "lead"; it was previously mislabelled "lag",
# copied over from the lag() cell.
res_df = df.withColumn("lead", lead("salary", 2).over(dept_salary_window))