In [1]:
# Echo the `spark` object so the notebook displays its repr. It is never assigned
# in the visible cells, so it is presumably the session pre-created by the
# PySpark kernel -- TODO confirm before running on a plain Python kernel.
spark

In [2]:
import sys
from pyspark.sql.window import Window
from pyspark.sql.functions import *


In [4]:
# Load every CSV file in the product-revenue directory into one DataFrame.
# The first line of each file is treated as the header, and column types are
# inferred from the data rather than declared up front.
revenue_df = (
    spark.read
    .format("csv")
    .options(header="true", inferSchema="true")
    .load("../data/product-revenue/*.csv")
)

In [5]:
# Print the column names, types and nullability that schema inference produced
# for the loaded CSV data.
revenue_df.printSchema()

root
 |-- product: string (nullable = true)
 |-- category: string (nullable = true)
 |-- revenue: integer (nullable = true)



In [6]:
# Preview the loaded rows as a text table to sanity-check the data.
revenue_df.show()

+----------+----------+-------+
|   product|  category|revenue|
+----------+----------+-------+
|      Thin|Cell Phone|   6000|
|    Normal|    Tablet|   1500|
|      Mini|    Tablet|   5500|
|Ultra thin|Cell Phone|   5000|
| Very thin|Cell Phone|   6000|
|       Big|    Tablet|   2500|
|  Bendable|Cell Phone|   3000|
|  Foldable|Cell Phone|   3000|
|       Pro|    Tablet|   4500|
|      Pro2|    Tablet|   6500|
+----------+----------+-------+



### Group Aggregate Functions

In [7]:
# Collapse the data to one row per category: total revenue plus the list of
# products that belong to it. `sum` and `collect_list` are the
# pyspark.sql.functions versions pulled in by the wildcard import, so `sum`
# here is NOT the Python builtin.
(
    revenue_df
    .groupBy("category")
    .agg(
        sum("revenue"),
        collect_list("product"),
    )
    .show()
)

+----------+------------+---------------------+
|  category|sum(revenue)|collect_list(product)|
+----------+------------+---------------------+
|    Tablet|       20500| [Normal, Mini, Bi...|
|Cell Phone|       23000| [Thin, Ultra thin...|
+----------+------------+---------------------+



### Window Aggregate Functions

#### What is the difference between each product's revenue and the revenue of the best-selling product in the same category?

In [7]:
# Window over all rows that share the current row's category, ordered by
# revenue from highest to lowest. The frame is unbounded on both sides, so an
# aggregate evaluated over it sees the whole category no matter which row is
# current.
# Window.unboundedPreceding / Window.unboundedFollowing are the documented
# markers for an unbounded frame; they are preferred over a platform-dependent
# sentinel such as sys.maxsize.
windowSpec = (
    Window
    .partitionBy(revenue_df['category'])
    .orderBy(revenue_df['revenue'].desc())
    .rangeBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)

In [8]:
# Column expression: the highest revenue within the row's window (its category)
# minus the row's own revenue. `max` is the pyspark.sql.functions aggregate
# brought in by the wildcard import, not the Python builtin.
revenue_difference = max(revenue_df.revenue).over(windowSpec) - revenue_df.revenue

In [9]:
# Show each product next to its gap to the best-selling product of its category.
(
    revenue_df
    .select(
        "product",
        "category",
        "revenue",
        revenue_difference.alias("revenue_difference"),
    )
    .show()
)

+----------+----------+-------+------------------+
|   product|  category|revenue|revenue_difference|
+----------+----------+-------+------------------+
|      Pro2|    Tablet|   6500|               0.0|
|      Mini|    Tablet|   5500|            1000.0|
|       Pro|    Tablet|   4500|            2000.0|
|       Big|    Tablet|   2500|            4000.0|
|    Normal|    Tablet|   1500|            5000.0|
|      Thin|Cell Phone|   6000|               0.0|
| Very thin|Cell Phone|   6000|               0.0|
|Ultra thin|Cell Phone|   5000|            1000.0|
|  Bendable|Cell Phone|   3000|            3000.0|
|  Foldable|Cell Phone|   3000|            3000.0|
+----------+----------+-------+------------------+



### Rollup

In [31]:
# Hierarchical subtotals: revenue per (category, product), per category, and
# the grand total. grouping_id() tags each output row with its aggregation
# level. Up to 100 rows are printed so the table is not cut off.
(
    revenue_df
    .rollup("category", "product")
    .agg(
        sum("revenue"),
        grouping_id(),
    )
    .show(100)
)

+----------+----------+------------+-------------+
|  category|   product|sum(revenue)|grouping_id()|
+----------+----------+------------+-------------+
|    Tablet|      null|       20500|            1|
|Cell Phone| Very thin|        6000|            0|
|    Tablet|      Mini|        5500|            0|
|      null|      null|       43500|            3|
|    Tablet|       Pro|        4500|            0|
|Cell Phone|      null|       23000|            1|
|Cell Phone|      Thin|        6000|            0|
|Cell Phone|  Bendable|        3000|            0|
|Cell Phone|  Foldable|        3000|            0|
|Cell Phone|Ultra thin|        5000|            0|
|    Tablet|       Big|        2500|            0|
|    Tablet|    Normal|        1500|            0|
|    Tablet|      Pro2|        6500|            0|
+----------+----------+------------+-------------+



### Cube

In [30]:
# Subtotals for every combination of the two grouping columns: per
# (category, product), per category, per product, and the grand total.
# grouping_id() tags each output row with its aggregation level. Up to 100
# rows are printed so the table is not cut off.
(
    revenue_df
    .cube("category", "product")
    .agg(
        sum("revenue"),
        grouping_id(),
    )
    .show(100)
)

+----------+----------+------------+-------------+
|  category|   product|sum(revenue)|grouping_id()|
+----------+----------+------------+-------------+
|      null|  Bendable|        3000|            2|
|      null| Very thin|        6000|            2|
|      null|    Normal|        1500|            2|
|    Tablet|      null|       20500|            1|
|      null|      Mini|        5500|            2|
|Cell Phone| Very thin|        6000|            0|
|      null|      Pro2|        6500|            2|
|    Tablet|      Mini|        5500|            0|
|      null|      null|       43500|            3|
|    Tablet|       Pro|        4500|            0|
|      null|  Foldable|        3000|            2|
|      null|       Pro|        4500|            2|
|      null|      Thin|        6000|            2|
|Cell Phone|      null|       23000|            1|
|Cell Phone|      Thin|        6000|            0|
|Cell Phone|  Bendable|        3000|            0|
|Cell Phone|  Foldable|        