<a href="https://colab.research.google.com/github/anaferreira744/DE-DP-ADF/blob/main/spark/examples/09-windows-function.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Window Functions
- Window functions operate on a group of rows, referred to as a window, and calculate a return value for each row based on the group of rows.
- Window functions are useful for processing tasks such as calculating a moving average, computing a cumulative statistic, or accessing the value of rows given the relative position of the current row.

# Setting up PySpark

In [None]:
# Install PySpark into the running kernel's environment (e.g. on a fresh Colab runtime).
%pip install pyspark



In [1]:
from pyspark.sql import SparkSession

# Create (or reuse) the local-mode Spark session shared by every cell below.
# The Spark UI port is set explicitly to 4050.
spark = (
    SparkSession.builder
    .master('local')
    .appName('Spark Course')
    .config('spark.ui.port', '4050')
    .getOrCreate()
)

# Window Functions

In [2]:
# Prepare the sample data: register a temporary view named `employees`,
# built from an inline VALUES table with the columns (name, dept, salary, age).

qry = """CREATE OR REPLACE TEMPORARY VIEW employees AS SELECT * FROM VALUES("Lisa", "Sales", 10000, 35),
("Evan", "Sales", 32000, 38),
("Fred", "Engineering", 21000, 28),
("Alex", "Sales", 30000, 33),
("Tom", "Engineering", 23000, 33),
("Jane", "Marketing", 29000, 28),
("Jeff", "Marketing", 35000, 38),
("Paul", "Engineering", 29000, 23),
("Chloe", "Engineering", 23000, 25)
AS employees(name, dept, salary, age)"""

# Run the statement that (re)creates the view, then print the view's rows.
spark.sql(qry)
spark.table("employees").show()

+-----+-----------+------+---+
| name|       dept|salary|age|
+-----+-----------+------+---+
| Lisa|      Sales| 10000| 35|
| Evan|      Sales| 32000| 38|
| Fred|Engineering| 21000| 28|
| Alex|      Sales| 30000| 33|
|  Tom|Engineering| 23000| 33|
| Jane|  Marketing| 29000| 28|
| Jeff|  Marketing| 35000| 38|
| Paul|Engineering| 29000| 23|
|Chloe|Engineering| 23000| 25|
+-----+-----------+------+---+



In [3]:
# Dense-rank employees by salary within each department: the window is
# partitioned by dept and ordered by salary (ascending). Employees with the same
# salary share a rank and the next rank is not skipped (see Tom and Chloe in the
# output below).
qry1 = """SELECT name, dept, salary, DENSE_RANK() OVER (PARTITION BY dept ORDER BY salary ROWS BETWEEN
    UNBOUNDED PRECEDING AND CURRENT ROW) AS dense_rank FROM employees;"""

spark.sql(qry1).show()

+-----+-----------+------+----------+
| name|       dept|salary|dense_rank|
+-----+-----------+------+----------+
| Fred|Engineering| 21000|         1|
|  Tom|Engineering| 23000|         2|
|Chloe|Engineering| 23000|         2|
| Paul|Engineering| 29000|         3|
| Jane|  Marketing| 29000|         1|
| Jeff|  Marketing| 35000|         2|
| Lisa|      Sales| 10000|         1|
| Alex|      Sales| 30000|         2|
| Evan|      Sales| 32000|         3|
+-----+-----------+------+----------+



In [12]:
# Flag the employees who earn the minimum salary of their department.
#
# The window has no ORDER BY, so MIN(salary) is always evaluated over the whole
# department. With an ORDER BY the aggregate becomes a running value that only
# equals the department minimum when the sort happens to be ascending.
#
# The flag repeats the window expression instead of referencing the `min` alias
# defined in the same SELECT list; referencing a sibling alias only resolves on
# Spark releases that support lateral column aliases.
qry2 = """SELECT name, dept, salary,
    MIN(salary) OVER (PARTITION BY dept) AS min,
    CASE WHEN salary = MIN(salary) OVER (PARTITION BY dept) THEN true ELSE false END AS min_salary
    FROM employees
    ORDER BY name"""

spark.sql(qry2).show()



+-----+-----------+------+-----+----------+
| name|       dept|salary|  min|min_salary|
+-----+-----------+------+-----+----------+
| Alex|      Sales| 30000|10000|     false|
|Chloe|Engineering| 23000|21000|     false|
| Evan|      Sales| 32000|10000|     false|
| Fred|Engineering| 21000|21000|      true|
| Jane|  Marketing| 29000|29000|      true|
| Jeff|  Marketing| 35000|29000|     false|
| Lisa|      Sales| 10000|10000|      true|
| Paul|Engineering| 29000|21000|     false|
|  Tom|Engineering| 23000|21000|     false|
+-----+-----------+------+-----+----------+



In [8]:
# Same per-department minimum, expressed with the DataFrame API.
# The functions module is imported under the alias F so that Spark's `min`
# does not replace Python's built-in min() for the rest of the kernel session.
from pyspark.sql import functions as F
from pyspark.sql.window import Window

df = spark.table("employees")

# No orderBy on the window: the aggregate must cover the whole department,
# not a running frame that depends on the sort direction.
windowSpec = Window.partitionBy("dept")

# The sort is applied only to the displayed result, to keep the printout readable.
df.withColumn("min", F.min("salary").over(windowSpec)).orderBy("dept", "salary").show()

+-----+-----------+------+---+-----+
| name|       dept|salary|age|  min|
+-----+-----------+------+---+-----+
| Fred|Engineering| 21000| 28|21000|
|  Tom|Engineering| 23000| 33|21000|
|Chloe|Engineering| 23000| 25|21000|
| Paul|Engineering| 29000| 23|21000|
| Jane|  Marketing| 29000| 28|29000|
| Jeff|  Marketing| 35000| 38|29000|
| Lisa|      Sales| 10000| 35|10000|
| Alex|      Sales| 30000| 33|10000|
| Evan|      Sales| 32000| 38|10000|
+-----+-----------+------+---+-----+



In [15]:
 # Same logic with the DataFrame API, plus a flag marking the lowest earner(s) per dept.
# `functions` is imported under the alias F rather than with a wildcard import,
# which would overwrite Python built-ins such as min(), max() and sum().
from pyspark.sql import functions as F
from pyspark.sql.window import Window

df = spark.table("employees")

# Unordered window: min() is evaluated over every row of the department.
windowSpec = Window.partitionBy("dept")
df = df.withColumn("min", F.min("salary").over(windowSpec))

# is_min is a real boolean column (True/False literals), matching the SQL
# version, instead of the strings "true"/"false".
df = df.withColumn("is_min", F.when(F.col("salary") == F.col("min"), F.lit(True)).otherwise(F.lit(False)))

# Sort only the displayed result; `df` itself is left unsorted.
df.orderBy("dept", "salary").show()

+-----+-----------+------+---+-----+------+
| name|       dept|salary|age|  min|is_min|
+-----+-----------+------+---+-----+------+
| Fred|Engineering| 21000| 28|21000|  true|
|  Tom|Engineering| 23000| 33|21000| false|
|Chloe|Engineering| 23000| 25|21000| false|
| Paul|Engineering| 29000| 23|21000| false|
| Jane|  Marketing| 29000| 28|29000|  true|
| Jeff|  Marketing| 35000| 38|29000| false|
| Lisa|      Sales| 10000| 35|10000|  true|
| Alex|      Sales| 30000| 33|10000| false|
| Evan|      Sales| 32000| 38|10000| false|
+-----+-----------+------+---+-----+------+



# Question

In [None]:
# Q1
# Use window functions to identify the highest salary in each dept
# Create a new column "highest_salary_dept" and assign TRUE/FALSE accordingly
# Identify the highest salary in the company (across all depts)
# Create a new column "highest_salary_company" and assign TRUE/FALSE accordingly

In [14]:
# Q1 (SQL): flag the employees who earn the maximum salary of their department.
#
# The window has no ORDER BY, so MAX(salary) is always evaluated over the whole
# department rather than over a running frame that needs a descending sort to
# produce the right value. The flag repeats the window expression instead of
# referencing the `MAX` alias from the same SELECT list, which only resolves on
# Spark releases that support lateral column aliases.
#
# The final ORDER BY makes the displayed row order explicit instead of leaving
# it as a side effect of the window's internal sort.
qry3 = """SELECT name, dept, salary,
                MAX(salary) OVER (PARTITION BY dept) AS MAX,
                CASE WHEN salary = MAX(salary) OVER (PARTITION BY dept) THEN true ELSE false END AS max_salary
          FROM employees
          ORDER BY dept, salary DESC"""

spark.sql(qry3).show()

+-----+-----------+------+-----+----------+
| name|       dept|salary|  MAX|max_salary|
+-----+-----------+------+-----+----------+
| Paul|Engineering| 29000|29000|      true|
|  Tom|Engineering| 23000|29000|     false|
|Chloe|Engineering| 23000|29000|     false|
| Fred|Engineering| 21000|29000|     false|
| Jeff|  Marketing| 35000|35000|      true|
| Jane|  Marketing| 29000|35000|     false|
| Evan|      Sales| 32000|32000|      true|
| Alex|      Sales| 30000|32000|     false|
| Lisa|      Sales| 10000|32000|     false|
+-----+-----------+------+-----+----------+



In [27]:
 # Q1 (DataFrame API): flag the top earner(s) of each department and of the whole company.
# `functions` is imported under the alias F rather than with a wildcard import,
# which would overwrite Python built-ins such as max(), min() and sum().
from pyspark.sql import functions as F
from pyspark.sql.window import Window

df = spark.table("employees")

# One window per department, and one window with no partition columns that
# therefore spans every row of the table (the whole company).
windowSpec = Window.partitionBy("dept")
companyWindow = Window.partitionBy()

# Department maximum and the matching boolean flag.
df = df.withColumn("max", F.max("salary").over(windowSpec))
df = df.withColumn(
    "highest_salary_dept",
    F.when(F.col("salary") == F.col("max"), F.lit(True)).otherwise(F.lit(False)),
)

# Company-wide maximum, computed with a window function as the exercise asks,
# so no value has to be collected to the driver. The column holds the boolean
# flag only; it is never temporarily overwritten with the numeric maximum.
df = df.withColumn(
    "highest_salary_company",
    F.when(
        F.col("salary") == F.max("salary").over(companyWindow), F.lit(True)
    ).otherwise(F.lit(False)),
)

df.show()

+-----+-----------+------+---+-----+-------------------+----------------------+
| name|       dept|salary|age|  max|highest_salary_dept|highest_salary_company|
+-----+-----------+------+---+-----+-------------------+----------------------+
| Fred|Engineering| 21000| 28|29000|              false|                 false|
|  Tom|Engineering| 23000| 33|29000|              false|                 false|
| Paul|Engineering| 29000| 23|29000|               true|                 false|
|Chloe|Engineering| 23000| 25|29000|              false|                 false|
| Jane|  Marketing| 29000| 28|35000|              false|                 false|
| Jeff|  Marketing| 35000| 38|35000|               true|                  true|
| Lisa|      Sales| 10000| 35|32000|              false|                 false|
| Evan|      Sales| 32000| 38|32000|               true|                 false|
| Alex|      Sales| 30000| 33|32000|              false|                 false|
+-----+-----------+------+---+-----+----