<a href="https://colab.research.google.com/github/arulrajgopal-zerotoone/zero_to_one_spark/blob/main/apache_spark/10_windows_functions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pyspark

from pyspark.sql import SparkSession

# Obtain the SparkSession used by every cell below; getOrCreate() hands back
# an already-running session when there is one instead of starting another.
spark = (
    SparkSession.builder
    .appName('mysparksession')
    .getOrCreate()
)

# Handle on the SparkContext that backs the session.
sc = spark.sparkContext

Collecting pyspark
  Downloading pyspark-3.5.3.tar.gz (317.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 317.3/317.3 MB 3.4 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... done
  Created wheel for pyspark: filename=pyspark-3.5.3-py2.py3-none-any.whl size=317840625 sha256=76d100746ef2f8c2b6774c9aa156692453412428916d7d0fb2dca12291e4685b
  Stored in directory: /root/.cache/pip/wheels/1b/3a/92/28b93e2fbfdbb07509ca4d6f50c5e407f48dce4ddbda69a4ab
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.3


In [2]:
# Sample employee records as (employee_name, department, salary) tuples.
# Several Sales rows share the salary 3000 (one James row is repeated verbatim),
# which is what lets the ranking functions further down show how they treat ties.
simpleData = (
    ("James", "Sales", 3000),
    ("Michael", "Sales", 4600),
    ("Robert", "Sales", 4100),
    ("Maria", "Sales", 3000),
    ("James", "Sales", 3000),
    ("Scott", "Finance", 3300),
    ("Jen", "Finance", 3900),
    ("Jeff", "Finance", 3000),
    ("Kumar", "Finance", 2000),
    ("Saif", "Finance", 4100),
)

# Column names, in the same order as the fields of each tuple above.
columns = ["employee_name", "department", "salary"]

# Build the DataFrame, then print its schema followed by its full contents.
df = spark.createDataFrame(simpleData, schema=columns)
df.printSchema()
df.show(truncate=False)

root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- salary: long (nullable = true)

+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|James        |Sales     |3000  |
|Michael      |Sales     |4600  |
|Robert       |Sales     |4100  |
|Maria        |Sales     |3000  |
|James        |Sales     |3000  |
|Scott        |Finance   |3300  |
|Jen          |Finance   |3900  |
|Jeff         |Finance   |3000  |
|Kumar        |Finance   |2000  |
|Saif         |Finance   |4100  |
+-------------+----------+------+



In [3]:
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, rank, dense_rank

In [4]:
# Window shared by the ranking cells: one partition per department, rows
# ordered by ascending salary inside each partition.
windowSpec = Window.partitionBy("department").orderBy("salary")

# row_number(): consecutive numbering (1, 2, 3, ...) within each department, so
# rows with the same salary still receive distinct numbers.
# NOTE(review): the ordering has no tie-breaker column, so which of the tied
# rows gets which number is presumably not guaranteed -- confirm before relying on it.
(
    df
    .withColumn("row_number", row_number().over(windowSpec))
    .show(truncate=False)
)

+-------------+----------+------+----------+
|employee_name|department|salary|row_number|
+-------------+----------+------+----------+
|Kumar        |Finance   |2000  |1         |
|Jeff         |Finance   |3000  |2         |
|Scott        |Finance   |3300  |3         |
|Jen          |Finance   |3900  |4         |
|Saif         |Finance   |4100  |5         |
|James        |Sales     |3000  |1         |
|Maria        |Sales     |3000  |2         |
|James        |Sales     |3000  |3         |
|Robert       |Sales     |4100  |4         |
|Michael      |Sales     |4600  |5         |
+-------------+----------+------+----------+



In [5]:
# rank(): rows tied on salary share one rank and the next rank skips ahead by
# the size of the tie (the Sales partition comes out as 1, 1, 1, 4, 5).
(
    df
    .withColumn("rank", rank().over(windowSpec))
    .show()
)

+-------------+----------+------+----+
|employee_name|department|salary|rank|
+-------------+----------+------+----+
|        Kumar|   Finance|  2000|   1|
|         Jeff|   Finance|  3000|   2|
|        Scott|   Finance|  3300|   3|
|          Jen|   Finance|  3900|   4|
|         Saif|   Finance|  4100|   5|
|        James|     Sales|  3000|   1|
|        Maria|     Sales|  3000|   1|
|        James|     Sales|  3000|   1|
|       Robert|     Sales|  4100|   4|
|      Michael|     Sales|  4600|   5|
+-------------+----------+------+----+



In [6]:
# dense_rank(): rows tied on salary share one rank and the following rank is
# the next integer, leaving no gaps (the Sales partition comes out as 1, 1, 1, 2, 3).
(
    df
    .withColumn("dense_rank", dense_rank().over(windowSpec))
    .show()
)

+-------------+----------+------+----------+
|employee_name|department|salary|dense_rank|
+-------------+----------+------+----------+
|        Kumar|   Finance|  2000|         1|
|         Jeff|   Finance|  3000|         2|
|        Scott|   Finance|  3300|         3|
|          Jen|   Finance|  3900|         4|
|         Saif|   Finance|  4100|         5|
|        James|     Sales|  3000|         1|
|        Maria|     Sales|  3000|         1|
|        James|     Sales|  3000|         1|
|       Robert|     Sales|  4100|         2|
|      Michael|     Sales|  4600|         3|
+-------------+----------+------+----------+

