In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [2]:
# Get (or reuse) the session for the "spark-test" application, pointed at the
# master reachable at spark://spark-master:7077.
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("spark-test")
    .getOrCreate()
)

In [3]:
# Read the articles CSV from the local filesystem; the first line of the file
# is treated as the header and fields are comma-separated.
articles_df = spark.read.options(header="true", delimiter=",").csv(
    "file:///home/jovyan/work/data/articles.csv"
)

In [4]:
# Only the product type column is needed for the aggregation that follows.
transformed_df = articles_df.select(F.col("product_type_name"))

In [5]:
# Group by product type and count the values in each group.
# Note: count() over a column counts non-null values only, so a group whose
# product_type_name is null would be reported with a count of 0.
counted_df = transformed_df.groupBy("product_type_name").agg(
    F.count(F.col("product_type_name")).alias("product_type_name_count")
)

In [6]:
# Most frequent product types first. The product type name is a secondary sort
# key so that rows with equal counts come out in the same order on every run;
# sorting on the count alone leaves the order of ties unspecified.
ordered_df = counted_df.orderBy(
    F.col("product_type_name_count").desc(),
    F.col("product_type_name").asc(),
)

In [7]:
# Print the first ten rows of the ordered counts.
ordered_df.show(n=10)

+-----------------+-----------------------+
|product_type_name|product_type_name_count|
+-----------------+-----------------------+
|         Trousers|                  11169|
|            Dress|                  10362|
|          Sweater|                   9302|
|          T-shirt|                   7904|
|              Top|                   4155|
|           Blouse|                   3979|
|           Jacket|                   3940|
|           Shorts|                   3939|
|            Shirt|                   3405|
|         Vest top|                   2991|
+-----------------+-----------------------+
only showing top 10 rows



In [None]:
# Shut down the Spark session once the notebook's work is done.
spark.stop()