In [0]:
import pyspark.sql.functions as F


# Read the invoice summary (Country / week / NumInvoices) from Parquet.
summary_df = (
    spark.read
    .format("parquet")
    .load("/FileStore/tables/my_invoices")
)

# Preview the first five rows of the loaded frame.
summary_df.show(5)

+---------------+----+-----------+
|        Country|week|NumInvoices|
+---------------+----+-----------+
|           EIRE|  17|          1|
|         Norway|  47|          1|
|Channel Islands|  46|          1|
|         France|  10|          2|
|         France|   6|          3|
+---------------+----+-----------+
only showing top 5 rows



In [0]:
# List the summary ordered by country, then by week within each country.
summary_df.orderBy("Country", "week").show()

+---------+----+-----------+
|  Country|week|NumInvoices|
+---------+----+-----------+
|Australia|   1|          2|
|Australia|   2|          3|
|Australia|   3|          3|
|Australia|   4|          1|
|Australia|   6|          3|
|Australia|   7|          1|
|Australia|   8|          1|
|Australia|   9|          1|
|Australia|  10|          1|
|Australia|  12|          1|
|Australia|  13|          1|
|Australia|  14|          1|
|Australia|  17|          0|
|Australia|  19|          1|
|Australia|  20|          2|
|Australia|  21|          1|
|Australia|  22|          0|
|Australia|  24|          2|
|Australia|  26|          2|
|Australia|  28|          3|
+---------+----+-----------+
only showing top 20 rows



In [0]:
from pyspark.sql import Window

# Window over each country's rows, ordered from the highest invoice count
# down; the frame spans from the start of the partition to the current row.
rank_window = (
    Window
    .partitionBy("Country")
    .orderBy(F.col("NumInvoices").desc())
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# Attach a dense rank per country: weeks with the same NumInvoices share a
# rank, and the next distinct value gets the next consecutive rank.
top_one_df = summary_df.withColumn("Rank", F.dense_rank().over(rank_window))

In [0]:
# Inspect the ranking: best-ranked weeks first within each country.
top_one_df.orderBy("Country", "Rank").show(10)

+---------+----+-----------+----+
|  Country|week|NumInvoices|Rank|
+---------+----+-----------+----+
|Australia|  40|          4|   1|
|Australia|   2|          3|   2|
|Australia|  29|          3|   2|
|Australia|  28|          3|   2|
|Australia|   3|          3|   2|
|Australia|  44|          3|   2|
|Australia|   6|          3|   2|
|Australia|  39|          3|   2|
|Australia|  46|          2|   3|
|Australia|  24|          2|   3|
+---------+----+-----------+----+
only showing top 10 rows



In [0]:
# Keep only the rank-1 rows (a country appears more than once when several
# weeks tie for its highest invoice count).
top_one_df.filter("Rank == 1").show(5)

+---------+----+-----------+----+
|  Country|week|NumInvoices|Rank|
+---------+----+-----------+----+
|Australia|  40|          4|   1|
|  Austria|   5|          2|   1|
|  Austria|  17|          2|   1|
|  Bahrain|  51|          1|   1|
|  Bahrain|  19|          1|   1|
+---------+----+-----------+----+
only showing top 5 rows

