In [0]:
# Only column names are supplied here; no explicit column types are declared.
schema = ['name', 'dep', 'salary']

# One (name, dep, salary) tuple per employee.
sampleData = [
    ('Ashu', 'IT', 20000),
    ('Ashi', 'HR', 30000),
    ('Jai', 'WPS', 40000),
    ('Prakhar', 'IT', 20000),
]

# Build the demo DataFrame from the rows above and print it.
df = spark.createDataFrame(sampleData, schema)
df.show()

+-------+---+------+
|   name|dep|salary|
+-------+---+------+
|   Ashu| IT| 20000|
|   Ashi| HR| 30000|
|    Jai|WPS| 40000|
|Prakhar| IT| 20000|
+-------+---+------+



In [0]:
from pyspark.sql.functions import approx_count_distinct, avg, collect_list, collect_set, countDistinct, count

# Each entry pairs an aggregate over the salary column with the `truncate`
# flag handed to show(). The two collect_* aggregates produce arrays, so they
# are printed untruncated; the rest use show()'s default (truncate=True).
salary_aggregates = [
    (approx_count_distinct('salary'), True),
    (avg('salary'), True),
    (collect_list('salary'), False),
    (collect_set('salary'), False),
    (countDistinct('salary'), True),
    (count('salary'), True),
]

# One select + show per aggregate, in the order listed above.
for aggregate, truncate in salary_aggregates:
    df.select(aggregate).show(truncate=truncate)


+-----------------------------+
|approx_count_distinct(salary)|
+-----------------------------+
|                            3|
+-----------------------------+

+-----------+
|avg(salary)|
+-----------+
|    27500.0|
+-----------+

+----------------------------+
|collect_list(salary)        |
+----------------------------+
|[20000, 30000, 40000, 20000]|
+----------------------------+

+---------------------+
|collect_set(salary)  |
+---------------------+
|[40000, 30000, 20000]|
+---------------------+

+----------------------+
|count(DISTINCT salary)|
+----------------------+
|                     3|
+----------------------+

+-------------+
|count(salary)|
+-------------+
|            4|
+-------------+



In [0]:
from pyspark.sql.functions import row_number, rank, dense_rank
from pyspark.sql.window import Window

# Larger demo data set: several employees per department, including a salary
# tie inside IT (Ashu and Prakhar both earn 20000) so that the three ranking
# functions below produce visibly different results.
sampleData1 = [('Ashu','IT',20000),
              ('Ashi','HR',30000),
              ('Jai','WPS',40000),
              ('Prakhar','IT',20000),
              ('Annu','HR',50000),
              ('Abhishek','IT',50000),
              ('Adi','Payroll',60000),
              ('Akarsh','Payroll',70000),
              ('Anjali','HR',60000),
              ('Saumya','IT',40000)]

schema = ['name','dep','salary']
df = spark.createDataFrame(data=sampleData1, schema=schema)
df.sort('dep').show()

# Window used by rank() and dense_rank(): rows are grouped per department and
# ordered by salary only, so employees with equal salaries are treated as ties.
window = Window.partitionBy('dep').orderBy('salary')

# row_number() must assign a distinct number to every row, so ordering by
# salary alone leaves the order of tied rows undefined and the numbering can
# change between runs. Adding `name` as a tie-breaker makes it deterministic.
row_window = Window.partitionBy('dep').orderBy('salary', 'name')

# rank() leaves a gap in the numbering after tied rows (1, 1, 3, ...), whereas
# dense_rank() does not leave a gap (1, 1, 2, ...).
(
    df.withColumn('rowNumber', row_number().over(row_window))
      .withColumn('rank', rank().over(window))
      .withColumn('dense_rank', dense_rank().over(window))
      .show()
)

+--------+-------+------+
|    name|    dep|salary|
+--------+-------+------+
|    Annu|     HR| 50000|
|    Ashi|     HR| 30000|
|  Anjali|     HR| 60000|
|    Ashu|     IT| 20000|
|Abhishek|     IT| 50000|
| Prakhar|     IT| 20000|
|  Saumya|     IT| 40000|
|     Adi|Payroll| 60000|
|  Akarsh|Payroll| 70000|
|     Jai|    WPS| 40000|
+--------+-------+------+

+--------+-------+------+---------+----+----------+
|    name|    dep|salary|rowNumber|rank|dense_rank|
+--------+-------+------+---------+----+----------+
|    Ashi|     HR| 30000|        1|   1|         1|
|    Annu|     HR| 50000|        2|   2|         2|
|  Anjali|     HR| 60000|        3|   3|         3|
|    Ashu|     IT| 20000|        1|   1|         1|
| Prakhar|     IT| 20000|        2|   1|         1|
|  Saumya|     IT| 40000|        3|   3|         2|
|Abhishek|     IT| 50000|        4|   4|         3|
|     Adi|Payroll| 60000|        1|   1|         1|
|  Akarsh|Payroll| 70000|        2|   2|         2|
|     Jai| 