# Rank Functions
Differences between the `rank`, `dense_rank` and `row_number` window functions in `pyspark`.

In [5]:
from pyspark.sql import SparkSession
import pandas as pd
from datetime import datetime as dt, timedelta
import numpy as np 

# Configure a builder for an application named 'windows', then fetch the
# matching session (getOrCreate hands back an existing one when available).
session_builder = SparkSession.builder.appName('windows')
spark = session_builder.getOrCreate()

In [41]:
# Fixed seed so a fresh "Restart & Run All" regenerates the same `num` values.
SEED = 42
rng = np.random.default_rng(SEED)

N = 24  # number of rows generated for each id

# id 'a': one row per month from 2022-01 through 2023-12, so every date is unique.
id1 = pd.DataFrame({
    'id': ['a'] * N,
    'date': np.arange(
        start=np.datetime64('2022-01'),
        stop=np.datetime64('2024-01'),
        step=np.timedelta64(1, 'M')),
    'num': rng.normal(loc=1000, scale=100, size=N)
})

# id 'b': two 12-month runs (2022-01..2022-12 and 2022-06..2023-05).
# The runs overlap on 2022-06..2022-12, so those seven months appear twice;
# these duplicate dates are the ties the ranking functions are compared on.
id2 = pd.DataFrame({
    'id': ['b'] * N,
    'date': [
        *np.arange(
            start=np.datetime64('2022-01'),
            stop=np.datetime64('2023-01'),
            step=np.timedelta64(1, 'M')),
        *np.arange(
            start=np.datetime64('2022-06'),
            stop=np.datetime64('2023-06'),
            step=np.timedelta64(1, 'M')),
    ],
    'num': rng.normal(loc=1000, scale=100, size=N)
})

# Stack both ids into one frame and order it chronologically.
df = pd.concat([id1, id2], ignore_index=True)
df = df.sort_values('date', ignore_index=True)

In [42]:
# `df` enters this cell as a pandas DataFrame and is rebound to a Spark DataFrame.
# createDataFrame refuses an object that is already a Spark DataFrame, so the
# conversion only runs while `df` is still pandas; this keeps the cell re-runnable.
if isinstance(df, pd.DataFrame):
    df = spark.createDataFrame(df)
df.head()

Row(id='a', date=datetime.datetime(2022, 1, 1, 0, 0), num=956.091853989788)

In [43]:
# Print only the first five rows as a text table.
df.show(n=5)

+---+-------------------+------------------+
| id|               date|               num|
+---+-------------------+------------------+
|  a|2022-01-01 00:00:00|  956.091853989788|
|  b|2022-01-01 00:00:00|1045.8760933695205|
|  a|2022-02-01 00:00:00|1009.2635822947411|
|  b|2022-02-01 00:00:00|1002.4961729983769|
|  a|2022-03-01 00:00:00|  696.513394183562|
+---+-------------------+------------------+
only showing top 5 rows



# `row_number`

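`row_number` assigns a unique, consecutive number (1, 2, 3, …) to every row within a window partition. Rows that tie on the `orderBy` column still receive different numbers, and the order among tied rows is arbitrary.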

In [44]:
from pyspark.sql import functions as F
from pyspark.sql import Window

# One window per id, rows ordered by date.
w = Window.partitionBy('id').orderBy('date')

# Add a sequential row number within each id, then print the rows of id 'b'.
numbered = df.withColumn('row_number', F.row_number().over(w))
numbered.where('id="b"').show()

+---+-------------------+------------------+----------+
| id|               date|               num|row_number|
+---+-------------------+------------------+----------+
|  b|2022-01-01 00:00:00|1045.8760933695205|         1|
|  b|2022-02-01 00:00:00|1002.4961729983769|         2|
|  b|2022-03-01 00:00:00| 920.9210263670503|         3|
|  b|2022-04-01 00:00:00| 960.1723130221495|         4|
|  b|2022-05-01 00:00:00| 868.4408646892177|         5|
|  b|2022-06-01 00:00:00| 969.0217679727573|         6|
|  b|2022-06-01 00:00:00|1023.6725538430055|         7|
|  b|2022-07-01 00:00:00| 952.8071251576209|         8|
|  b|2022-07-01 00:00:00|1068.0256346416263|         9|
|  b|2022-08-01 00:00:00|1094.5945678105602|        10|
|  b|2022-08-01 00:00:00|1072.5133646720624|        11|
|  b|2022-09-01 00:00:00| 945.9308164379078|        12|
|  b|2022-09-01 00:00:00|1014.7616006406685|        13|
|  b|2022-10-01 00:00:00|1018.9798686384539|        14|
|  b|2022-10-01 00:00:00|1057.2859234993714|    

# `dense_rank`

`dense_rank` gives rows that tie on the `orderBy` column the same rank and leaves no gaps in the sequence: the next distinct value always gets the next consecutive rank (1, 2, 2, 3, …).

In [45]:
from pyspark.sql import functions as F
from pyspark.sql import Window

# One window per id, rows ordered by date.
w = Window.partitionBy('id').orderBy('date')

# Add the dense rank within each id, then print the rows of id 'b'.
dense_ranked = df.withColumn('dense_rank', F.dense_rank().over(w))
dense_ranked.where('id="b"').show()

+---+-------------------+------------------+----------+
| id|               date|               num|dense_rank|
+---+-------------------+------------------+----------+
|  b|2022-01-01 00:00:00|1045.8760933695205|         1|
|  b|2022-02-01 00:00:00|1002.4961729983769|         2|
|  b|2022-03-01 00:00:00| 920.9210263670503|         3|
|  b|2022-04-01 00:00:00| 960.1723130221495|         4|
|  b|2022-05-01 00:00:00| 868.4408646892177|         5|
|  b|2022-06-01 00:00:00| 969.0217679727573|         6|
|  b|2022-06-01 00:00:00|1023.6725538430055|         6|
|  b|2022-07-01 00:00:00| 952.8071251576209|         7|
|  b|2022-07-01 00:00:00|1068.0256346416263|         7|
|  b|2022-08-01 00:00:00|1094.5945678105602|         8|
|  b|2022-08-01 00:00:00|1072.5133646720624|         8|
|  b|2022-09-01 00:00:00| 945.9308164379078|         9|
|  b|2022-09-01 00:00:00|1014.7616006406685|         9|
|  b|2022-10-01 00:00:00|1018.9798686384539|        10|
|  b|2022-10-01 00:00:00|1057.2859234993714|    

# `rank`
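Like `dense_rank`, `rank` gives rows that tie on the `orderBy` column the same rank, but it leaves gaps after ties: the rank following a tie skips ahead by the number of tied rows (1, 2, 2, 4, …).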

In [46]:
# One window per id, rows ordered by date.
w = Window.partitionBy('id').orderBy('date')

# Add the rank within each id, then print the rows of id 'b'.
ranked = df.withColumn('rank', F.rank().over(w))
ranked.where('id="b"').show()

+---+-------------------+------------------+----+
| id|               date|               num|rank|
+---+-------------------+------------------+----+
|  b|2022-01-01 00:00:00|1045.8760933695205|   1|
|  b|2022-02-01 00:00:00|1002.4961729983769|   2|
|  b|2022-03-01 00:00:00| 920.9210263670503|   3|
|  b|2022-04-01 00:00:00| 960.1723130221495|   4|
|  b|2022-05-01 00:00:00| 868.4408646892177|   5|
|  b|2022-06-01 00:00:00| 969.0217679727573|   6|
|  b|2022-06-01 00:00:00|1023.6725538430055|   6|
|  b|2022-07-01 00:00:00| 952.8071251576209|   8|
|  b|2022-07-01 00:00:00|1068.0256346416263|   8|
|  b|2022-08-01 00:00:00|1094.5945678105602|  10|
|  b|2022-08-01 00:00:00|1072.5133646720624|  10|
|  b|2022-09-01 00:00:00| 945.9308164379078|  12|
|  b|2022-09-01 00:00:00|1014.7616006406685|  12|
|  b|2022-10-01 00:00:00|1018.9798686384539|  14|
|  b|2022-10-01 00:00:00|1057.2859234993714|  14|
|  b|2022-11-01 00:00:00| 850.5474922395113|  16|
|  b|2022-11-01 00:00:00|  915.771250781334|  16|


# All Together

In [47]:
# One window per id, rows ordered by date (also read by the following cell).
w = Window.partitionBy('id').orderBy('date')

# Column name -> ranking function, in the order the columns are appended.
ranking_functions = {
    'rank': F.rank(),
    'dense_rank': F.dense_rank(),
    'row_number': F.row_number(),
}

# Append the three ranking columns one after another over the same window.
compared = df
for column_name, ranking_function in ranking_functions.items():
    compared = compared.withColumn(column_name, ranking_function.over(w))

# Print the rows of id 'b', the id that contains duplicate dates.
compared.filter('id="b"').show()

+---+-------------------+------------------+----+----------+----------+
| id|               date|               num|rank|dense_rank|row_number|
+---+-------------------+------------------+----+----------+----------+
|  b|2022-01-01 00:00:00|1045.8760933695205|   1|         1|         1|
|  b|2022-02-01 00:00:00|1002.4961729983769|   2|         2|         2|
|  b|2022-03-01 00:00:00| 920.9210263670503|   3|         3|         3|
|  b|2022-04-01 00:00:00| 960.1723130221495|   4|         4|         4|
|  b|2022-05-01 00:00:00| 868.4408646892177|   5|         5|         5|
|  b|2022-06-01 00:00:00| 969.0217679727573|   6|         6|         6|
|  b|2022-06-01 00:00:00|1023.6725538430055|   6|         6|         7|
|  b|2022-07-01 00:00:00| 952.8071251576209|   8|         7|         8|
|  b|2022-07-01 00:00:00|1068.0256346416263|   8|         7|         9|
|  b|2022-08-01 00:00:00|1094.5945678105602|  10|         8|        10|
|  b|2022-08-01 00:00:00|1072.5133646720624|  10|         8|    

In [48]:
# Same three ranking columns, appended in one projection over the window `w`
# defined in the previous cell, this time printing the rows of id 'a'.
compared_a = df.select(
    '*',
    F.rank().over(w).alias('rank'),
    F.dense_rank().over(w).alias('dense_rank'),
    F.row_number().over(w).alias('row_number'),
)
compared_a.where('id="a"').show()

+---+-------------------+------------------+----+----------+----------+
| id|               date|               num|rank|dense_rank|row_number|
+---+-------------------+------------------+----+----------+----------+
|  a|2022-01-01 00:00:00|  956.091853989788|   1|         1|         1|
|  a|2022-02-01 00:00:00|1009.2635822947411|   2|         2|         2|
|  a|2022-03-01 00:00:00|  696.513394183562|   3|         3|         3|
|  a|2022-04-01 00:00:00|1085.3574742888084|   4|         4|         4|
|  a|2022-05-01 00:00:00|1021.8673297882576|   5|         5|         5|
|  a|2022-06-01 00:00:00| 956.0018590176743|   6|         6|         6|
|  a|2022-07-01 00:00:00| 969.5424268316444|   7|         7|         7|
|  a|2022-08-01 00:00:00|1032.0584101482918|   8|         8|         8|
|  a|2022-09-01 00:00:00| 868.8469927305041|   9|         9|         9|
|  a|2022-10-01 00:00:00|1215.4821477653463|  10|        10|        10|
|  a|2022-11-01 00:00:00|1058.3849716373102|  11|        11|    