In [1]:
from pyspark.sql import SparkSession
spark = SparkSession\
  .builder\
  .appName("SGD_Chapter07")\
  .getOrCreate()

In [17]:
import numpy as np
arr = np.array([1.0, 2.0, 3.2])
arr.tolist()

[1.0, 2.0, 3.2]

In [18]:
from pyspark.sql.types import StructType, StructField, FloatType
from pyspark.sql.functions import lit
from pyspark.sql import SQLContext
sc = spark.sparkContext
sqlContext = SQLContext(spark)
# arrSchema = StructType([StructField("arr", FloatType(), True)])
# r_rdd = sc.parallelize(arr.tolist)
df_arr = spark.createDataFrame(arr.tolist(), FloatType())

In [19]:
df_arr.show()

+-----+
|value|
+-----+
|  1.0|
|  2.0|
|  3.2|
+-----+



In [20]:
df = spark.read.format("csv")\
  .option("header", "true")\
  .option("inferSchema", "true")\
  .load("/home/jagadeesh/git/Spark-The-Definitive-Guide/data/retail-data/all/*.csv")\
  .coalesce(5)
df.cache()
df.createOrReplaceTempView("dfTable")

In [21]:
# count method used as an action
df.count() == 541909

True

In [22]:
df.count()

541909

In [23]:
# count method is used as a Lazy evalution(transformation) 
from pyspark.sql.functions import count
df.select(count("StockCode")).show()

+----------------+
|count(StockCode)|
+----------------+
|          541909|
+----------------+



In [24]:
# count(*) --> count null values (including rows containing all nulls)
# count("col_name") --> spark will not count the null values, when counting an individual column
df.select(count("*")).show()

+--------+
|count(1)|
+--------+
|  541909|
+--------+



In [25]:
df.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Country: string (nullable = true)



In [26]:
from pyspark.sql.functions import countDistinct
df.select(countDistinct("StockCode")).show()

+-------------------------+
|count(DISTINCT StockCode)|
+-------------------------+
|                     4070|
+-------------------------+



In [27]:
df.select(countDistinct("InvoiceNo")).show()

+-------------------------+
|count(DISTINCT InvoiceNo)|
+-------------------------+
|                    25900|
+-------------------------+



In [28]:
from pyspark.sql.functions import approx_count_distinct
df.select(approx_count_distinct("StockCode")).show()

+--------------------------------+
|approx_count_distinct(StockCode)|
+--------------------------------+
|                            3804|
+--------------------------------+



In [29]:
from pyspark.sql.functions import first, last, min, max, sum, avg, sumDistinct
df.select(min("Quantity"), max("Quantity"), first("StockCode"), last("StockCode"), sum("Quantity"), sumDistinct("Quantity")).show()

+-------------+-------------+-----------------------+----------------------+-------------+----------------------+
|min(Quantity)|max(Quantity)|first(StockCode, false)|last(StockCode, false)|sum(Quantity)|sum(DISTINCT Quantity)|
+-------------+-------------+-----------------------+----------------------+-------------+----------------------+
|       -80995|        80995|                  22030|                85099B|      5176450|                 29310|
+-------------+-------------+-----------------------+----------------------+-------------+----------------------+



In [30]:
from pyspark.sql.functions import expr, avg
df.select(
  count("Quantity").alias("total_transctions"),
  sum("Quantity").alias("total_purchases"),
  avg("Quantity").alias("avg_purchases"),
  expr("mean(Quantity)").alias("mean_purchases"))\
  .selectExpr(
  "total_purchases/total_transctions",
  "avg_purchases",
  "mean_purchases").show()

+-------------------------------------+----------------+----------------+
|(total_purchases / total_transctions)|   avg_purchases|  mean_purchases|
+-------------------------------------+----------------+----------------+
|                     9.55224954743324|9.55224954743324|9.55224954743324|
+-------------------------------------+----------------+----------------+



In [32]:
from pyspark.sql.functions import collect_set, collect_list
df.agg(collect_set("Country"), collect_list("Country")).show(5, False)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [33]:
df.groupBy("InvoiceNo", "CustomerId").count().show()

+---------+----------+-----+
|InvoiceNo|CustomerId|count|
+---------+----------+-----+
|   536846|     14573|   76|
|   537026|     12395|   12|
|   537883|     14437|    5|
|   538068|     17978|   12|
|   538279|     14952|    7|
|   538800|     16458|   10|
|   538942|     17346|   12|
|  C539947|     13854|    1|
|   540096|     13253|   16|
|   540530|     14755|   27|
|   541225|     14099|   19|
|   541978|     13551|    4|
|   542093|     17677|   16|
|   543188|     12567|   63|
|   543590|     17377|   19|
|  C543757|     13115|    1|
|  C544318|     12989|    1|
|   544578|     12365|    1|
|   545165|     16339|   20|
|   545289|     14732|   30|
+---------+----------+-----+
only showing top 20 rows



In [34]:
df.groupby("InvoiceNo").count().show()

+---------+-----+
|InvoiceNo|count|
+---------+-----+
|   536596|    6|
|   536938|   14|
|   537252|    1|
|   537691|   20|
|   538041|    1|
|   538184|   26|
|   538517|   53|
|   538879|   19|
|   539275|    6|
|   539630|   12|
|   540499|   24|
|   540540|   22|
|  C540850|    1|
|   540976|   48|
|   541432|    4|
|   541518|  101|
|   541783|   35|
|   542026|    9|
|   542375|    6|
|  C542604|    8|
+---------+-----+
only showing top 20 rows



In [36]:
from pyspark.sql.functions import count
df.groupBy("InvoiceNo").agg(
   count("Quantity").alias("quan"),
   expr("count(Quantity)")).show()

+---------+----+---------------+
|InvoiceNo|quan|count(Quantity)|
+---------+----+---------------+
|   536596|   6|              6|
|   536938|  14|             14|
|   537252|   1|              1|
|   537691|  20|             20|
|   538041|   1|              1|
|   538184|  26|             26|
|   538517|  53|             53|
|   538879|  19|             19|
|   539275|   6|              6|
|   539630|  12|             12|
|   540499|  24|             24|
|   540540|  22|             22|
|  C540850|   1|              1|
|   540976|  48|             48|
|   541432|   4|              4|
|   541518| 101|            101|
|   541783|  35|             35|
|   542026|   9|              9|
|   542375|   6|              6|
|  C542604|   8|              8|
+---------+----+---------------+
only showing top 20 rows



In [39]:
df.selectExpr("InvoiceNo", "Quantity").show()

+---------+--------+
|InvoiceNo|Quantity|
+---------+--------+
|   536365|       6|
|   536365|       6|
|   536365|       8|
|   536365|       6|
|   536365|       6|
|   536365|       2|
|   536365|       6|
|   536366|       6|
|   536366|       6|
|   536367|      32|
|   536367|       6|
|   536367|       6|
|   536367|       8|
|   536367|       6|
|   536367|       6|
|   536367|       3|
|   536367|       2|
|   536367|       3|
|   536367|       3|
|   536367|       4|
+---------+--------+
only showing top 20 rows



In [62]:
from pyspark.sql.functions import stddev_pop, asc, desc
df.groupBy(col("InvoiceNo")==536367).count().show()

+--------------------+------+
|(InvoiceNo = 536367)| count|
+--------------------+------+
|                null|  9291|
|                true|    12|
|               false|532606|
+--------------------+------+



In [63]:
from pyspark.sql.functions import col
df.groupby("InvoiceNo").agg(
   count("Quantity").alias("quan")).show()

+---------+----+
|InvoiceNo|quan|
+---------+----+
|   536596|   6|
|   536938|  14|
|   537252|   1|
|   537691|  20|
|   538041|   1|
|   538184|  26|
|   538517|  53|
|   538879|  19|
|   539275|   6|
|   539630|  12|
|   540499|  24|
|   540540|  22|
|  C540850|   1|
|   540976|  48|
|   541432|   4|
|   541518| 101|
|   541783|  35|
|   542026|   9|
|   542375|   6|
|  C542604|   8|
+---------+----+
only showing top 20 rows



In [78]:
from pyspark.sql.functions import instr
df.where(col("InvoiceNo") == 536367).select("*").show()

+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|   InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|   536367|    84879|ASSORTED COLOUR B...|      32|12/1/2010 8:34|     1.69|     13047|United Kingdom|
|   536367|    22745|POPPY'S PLAYHOUSE...|       6|12/1/2010 8:34|      2.1|     13047|United Kingdom|
|   536367|    22748|POPPY'S PLAYHOUSE...|       6|12/1/2010 8:34|      2.1|     13047|United Kingdom|
|   536367|    22749|FELTCRAFT PRINCES...|       8|12/1/2010 8:34|     3.75|     13047|United Kingdom|
|   536367|    22310|IVORY KNITTED MUG...|       6|12/1/2010 8:34|     1.65|     13047|United Kingdom|
|   536367|    84969|BOX OF 6 ASSORTED...|       6|12/1/2010 8:34|     4.25|     13047|United Kingdom|
|   536367|    22623|BOX OF VINTAGE JI...|       3|12/1/2010 8:34|     4.

In [73]:
df.groupBy("InvoiceNo").agg(expr("avg(Quantity)"), expr("stddev_pop(Quantity)")).show()

+---------+------------------+--------------------+
|InvoiceNo|     avg(Quantity)|stddev_pop(Quantity)|
+---------+------------------+--------------------+
|   536596|               1.5|  1.1180339887498947|
|   536938|33.142857142857146|  20.698023172885524|
|   537252|              31.0|                 0.0|
|   537691|              8.15|   5.597097462078001|
|   538041|              30.0|                 0.0|
|   538184|12.076923076923077|   8.142590198943392|
|   538517|3.0377358490566038|  2.3946659604837897|
|   538879|21.157894736842106|  11.811070444356483|
|   539275|              26.0|  12.806248474865697|
|   539630|20.333333333333332|  10.225241100118645|
|   540499|              3.75|  2.6653642652865788|
|   540540|2.1363636363636362|  1.0572457590557278|
|  C540850|              -1.0|                 0.0|
|   540976|10.520833333333334|   6.496760677872902|
|   541432|             12.25|  10.825317547305483|
|   541518| 23.10891089108911|  20.550782784878713|
|   541783|1

In [79]:
from pyspark.sql.functions import col, to_date
dfWithDate = df.withColumn("date", to_date(col("InvoiceDate"), "MM/d/yyyy H:mm"))
dfWithDate.createOrReplaceTempView("dfWithDate")

In [81]:
dfWithDate.where(col("CustomerID") == 13047).show()

+---------+---------+--------------------+--------+--------------+---------+----------+--------------+----------+
|InvoiceNo|StockCode|         Description|Quantity|   InvoiceDate|UnitPrice|CustomerID|       Country|      date|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+----------+
|   536367|    84879|ASSORTED COLOUR B...|      32|12/1/2010 8:34|     1.69|     13047|United Kingdom|2010-12-01|
|   536367|    22745|POPPY'S PLAYHOUSE...|       6|12/1/2010 8:34|      2.1|     13047|United Kingdom|2010-12-01|
|   536367|    22748|POPPY'S PLAYHOUSE...|       6|12/1/2010 8:34|      2.1|     13047|United Kingdom|2010-12-01|
|   536367|    22749|FELTCRAFT PRINCES...|       8|12/1/2010 8:34|     3.75|     13047|United Kingdom|2010-12-01|
|   536367|    22310|IVORY KNITTED MUG...|       6|12/1/2010 8:34|     1.65|     13047|United Kingdom|2010-12-01|
|   536367|    84969|BOX OF 6 ASSORTED...|       6|12/1/2010 8:34|     4.25|     13047|U

In [87]:
dfWithDate.distinct().orderBy(col("InvoiceNo")).show()

+---------+---------+--------------------+--------+--------------+---------+----------+--------------+----------+
|InvoiceNo|StockCode|         Description|Quantity|   InvoiceDate|UnitPrice|CustomerID|       Country|      date|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+----------+
|   536365|    21730|GLASS STAR FROSTE...|       6|12/1/2010 8:26|     4.25|     17850|United Kingdom|2010-12-01|
|   536365|    22752|SET 7 BABUSHKA NE...|       2|12/1/2010 8:26|     7.65|     17850|United Kingdom|2010-12-01|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|2010-12-01|
|   536365|   84406B|CREAM CUPID HEART...|       8|12/1/2010 8:26|     2.75|     17850|United Kingdom|2010-12-01|
|   536365|    71053| WHITE METAL LANTERN|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|2010-12-01|
|   536365|   85123A|WHITE HANGING HEA...|       6|12/1/2010 8:26|     2.55|     17850|U

In [88]:
dfWithDate.distinct().orderBy(col("InvoiceNo")).count()

536641

In [100]:
dfWithDate.groupBy("CustomerID").count().orderBy(asc("CustomerID")).show()

+----------+------+
|CustomerID| count|
+----------+------+
|      null|135080|
|     12346|     2|
|     12347|   182|
|     12348|    31|
|     12349|    73|
|     12350|    17|
|     12352|    95|
|     12353|     4|
|     12354|    58|
|     12355|    13|
|     12356|    59|
|     12357|   131|
|     12358|    19|
|     12359|   254|
|     12360|   129|
|     12361|    10|
|     12362|   274|
|     12363|    23|
|     12364|    85|
|     12365|    23|
+----------+------+
only showing top 20 rows



In [104]:
dfWithDate.select("InvoiceNo").distinct().count()

25900

In [109]:
from pyspark.sql.functions import desc, max, col
from pyspark.sql.window import Window
windowSpec = Window\
  .partitionBy("CustomerId", "Date")\
  .orderBy("Quantity")\
  .rowsBetween(Window.unboundedPreceding, Window.currentRow)
maxPurchaseQuatity = max(col("Quantity")).over(windowSpec)

In [113]:
from pyspark.sql.functions import dense_rank, rank
purchaseDenseRank = dense_rank().over(windowSpec)
purchasedRank = rank().over(windowSpec)

In [114]:
dfWithDate.where("CustomerId IS NOT NULL").count()

406829

In [118]:
dfWithDate.where("CustomerId IS NOT NULL")\ # .orderBy("CustomerId")
  .select(
    col("CustomerId"),
    col("date"),
    col("Quantity"),
    purchasedRank.alias("QuantityRank"),
    purchaseDenseRank.alias("QuantityDenseRank"),
    maxPurchaseQuatity.alias("maxPurchaseQuantity")).show()

+----------+----------+--------+------------+-----------------+-------------------+
|CustomerId|      date|Quantity|QuantityRank|QuantityDenseRank|maxPurchaseQuantity|
+----------+----------+--------+------------+-----------------+-------------------+
|     12477|2011-04-14|       2|           1|                1|                  2|
|     12477|2011-04-14|       2|           1|                1|                  2|
|     12477|2011-04-14|       2|           1|                1|                  2|
|     12477|2011-04-14|       2|           1|                1|                  2|
|     12477|2011-04-14|       2|           1|                1|                  2|
|     12477|2011-04-14|       2|           1|                1|                  2|
|     12477|2011-04-14|       2|           1|                1|                  2|
|     12477|2011-04-14|       3|           8|                2|                  3|
|     12477|2011-04-14|       3|           8|                2|             

In [125]:
# GROUPING SETS operator is only available in SQL and to perform the same in DataFrames, 'rollup', 'cube' are used.
dfWithDate.dropna().count() # default "any"

406829

In [126]:
dfWithDate.drop().count()

541909

In [121]:
dfWithDate.count()

541909

In [127]:
dfNoNull = dfWithDate.dropna()

In [139]:
dfNoNull.createOrReplaceTempView("dfNoNull")

In [159]:
spark.sql(
  "SELECT CustomerId, StockCode, sum(Quantity) FROM dfNoNull\
   GROUP BY CustomerId, StockCode\
   ORDER BY CustomerId, StockCode").show()

+----------+---------+-------------+
|CustomerId|StockCode|sum(Quantity)|
+----------+---------+-------------+
|     12346|    23166|            0|
|     12347|    16008|           24|
|     12347|    17021|           36|
|     12347|    20665|            6|
|     12347|    20719|           40|
|     12347|    20780|           12|
|     12347|    20782|            6|
|     12347|    20966|           10|
|     12347|    21035|            6|
|     12347|    21041|           12|
|     12347|    21064|           30|
|     12347|    21154|           10|
|     12347|    21171|           12|
|     12347|    21265|           24|
|     12347|    21578|            6|
|     12347|    21636|           12|
|     12347|    21731|           72|
|     12347|    21791|           48|
|     12347|    21832|           12|
|     12347|    21975|           48|
+----------+---------+-------------+
only showing top 20 rows



In [161]:
spark.sql(
  "SELECT CustomerId, StockCode, sum(Quantity) FROM dfNoNull\
   GROUP BY CustomerId, StockCode GROUPING SETS((CustomerId, StockCode))\
   ORDER BY CustomerId, StockCode").show()

+----------+---------+-------------+
|CustomerId|StockCode|sum(Quantity)|
+----------+---------+-------------+
|     12346|    23166|            0|
|     12347|    16008|           24|
|     12347|    17021|           36|
|     12347|    20665|            6|
|     12347|    20719|           40|
|     12347|    20780|           12|
|     12347|    20782|            6|
|     12347|    20966|           10|
|     12347|    21035|            6|
|     12347|    21041|           12|
|     12347|    21064|           30|
|     12347|    21154|           10|
|     12347|    21171|           12|
|     12347|    21265|           24|
|     12347|    21578|            6|
|     12347|    21636|           12|
|     12347|    21731|           72|
|     12347|    21791|           48|
|     12347|    21832|           12|
|     12347|    21975|           48|
+----------+---------+-------------+
only showing top 20 rows



In [168]:
dfNoNull.withColumnRenamed("date", "Date").columns

['InvoiceNo',
 'StockCode',
 'Description',
 'Quantity',
 'InvoiceDate',
 'UnitPrice',
 'CustomerID',
 'Country',
 'Date']

In [169]:
df1 = dfNoNull.withColumnRenamed("date", "Date")

In [170]:
df1.show(5)

+---------+---------+--------------------+--------+--------------+---------+----------+--------------+----------+
|InvoiceNo|StockCode|         Description|Quantity|   InvoiceDate|UnitPrice|CustomerID|       Country|      Date|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+----------+
|   536365|   85123A|WHITE HANGING HEA...|       6|12/1/2010 8:26|     2.55|     17850|United Kingdom|2010-12-01|
|   536365|    71053| WHITE METAL LANTERN|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|2010-12-01|
|   536365|   84406B|CREAM CUPID HEART...|       8|12/1/2010 8:26|     2.75|     17850|United Kingdom|2010-12-01|
|   536365|   84029G|KNITTED UNION FLA...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|2010-12-01|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|2010-12-01|
+---------+---------+--------------------+--------+--------------+---------+----------+-

In [190]:
# cube takes the rollup one level deeper. Rather than treating elements hierarchically, a cube does the same thing across all dimensions
from pyspark.sql.functions import sum
cubeDF = df1.cube("Date", "Country").agg(sum(col("Quantity"))).select(col("Date"), col("Country"), "sum(Quantity)").orderBy("Date")

In [191]:
cubeDF.show()

+----+--------------------+-------------+
|Date|             Country|sum(Quantity)|
+----+--------------------+-------------+
|null|               Italy|         7999|
|null|            Portugal|        16044|
|null|               Japan|        25218|
|null|             Finland|        10666|
|null|           Australia|        83653|
|null|             Germany|       117448|
|null|                null|      4906888|
|null|           Singapore|         5234|
|null|United Arab Emirates|          982|
|null|                 RSA|          352|
|null|                 USA|         1034|
|null|              Cyprus|         6317|
|null|               Spain|        26824|
|null|             Denmark|         8188|
|null|         Unspecified|         1789|
|null|  European Community|          497|
|null|      Czech Republic|          592|
|null|              Norway|        19247|
|null|             Lebanon|          386|
|null|     Channel Islands|         9479|
+----+--------------------+-------

In [193]:
cubeDF.where("Country IS NULL").show(5)
cubeDF.where("Date IS NULL").show(5)

+----------+-------+-------------+
|      Date|Country|sum(Quantity)|
+----------+-------+-------------+
|      null|   null|      4906888|
|2010-12-01|   null|        24032|
|2010-12-02|   null|        20855|
|2010-12-03|   null|        11548|
|2010-12-05|   null|        16394|
+----------+-------+-------------+
only showing top 5 rows

+----+------------------+-------------+
|Date|           Country|sum(Quantity)|
+----+------------------+-------------+
|null|European Community|          497|
|null|            Norway|        19247|
|null|           Denmark|         8188|
|null|    Czech Republic|          592|
|null|               USA|         1034|
+----+------------------+-------------+
only showing top 5 rows



In [194]:
cubeDF.where("(Country IS NULL) AND (Date IS NULL)").show()

+----+-------+-------------+
|Date|Country|sum(Quantity)|
+----+-------+-------------+
|null|   null|      4906888|
+----+-------+-------------+



In [186]:
rolledUpDF = df1.rollup("Date", "Country").agg(sum(col("Quantity")))\
   .select("Date", "Country", "`sum(Quantity)`").orderBy("Date")

In [187]:
rolledUpDF.show()

+----------+--------------+-------------+
|      Date|       Country|sum(Quantity)|
+----------+--------------+-------------+
|      null|          null|      4906888|
|2010-12-01|   Netherlands|           97|
|2010-12-01|        France|          449|
|2010-12-01|       Germany|          117|
|2010-12-01|     Australia|          107|
|2010-12-01|United Kingdom|        21167|
|2010-12-01|          EIRE|          243|
|2010-12-01|          null|        24032|
|2010-12-01|        Norway|         1852|
|2010-12-02|          EIRE|            4|
|2010-12-02|          null|        20855|
|2010-12-02|       Germany|          146|
|2010-12-02|United Kingdom|        20705|
|2010-12-03|       Belgium|          528|
|2010-12-03|   Switzerland|          110|
|2010-12-03|       Germany|          170|
|2010-12-03|         Italy|          164|
|2010-12-03|         Spain|          400|
|2010-12-03|        France|          239|
|2010-12-03|      Portugal|           65|
+----------+--------------+-------

In [189]:
rolledUpDF.where("Country IS NULL").show()
rolledUpDF.where("Date IS NULL").show()

+----------+-------+-------------+
|      Date|Country|sum(Quantity)|
+----------+-------+-------------+
|      null|   null|      4906888|
|2010-12-01|   null|        24032|
|2010-12-02|   null|        20855|
|2010-12-03|   null|        11548|
|2010-12-05|   null|        16394|
|2010-12-06|   null|        16095|
|2010-12-07|   null|        19351|
|2010-12-08|   null|        21275|
|2010-12-09|   null|        16904|
|2010-12-10|   null|        15388|
|2010-12-12|   null|        10561|
|2010-12-13|   null|        15234|
|2010-12-14|   null|        17108|
|2010-12-15|   null|        18169|
|2010-12-16|   null|        29482|
|2010-12-17|   null|        10517|
|2010-12-19|   null|         3735|
|2010-12-20|   null|        12617|
|2010-12-21|   null|        10888|
|2010-12-22|   null|         3053|
+----------+-------+-------------+
only showing top 20 rows

+----+-------+-------------+
|Date|Country|sum(Quantity)|
+----+-------+-------------+
|null|   null|      4906888|
+----+-------+----

In [210]:
from pyspark.sql.functions import grouping_id, expr, desc
df1.cube("Date", "Country").agg(grouping_id(), sum(col("Quantity")))\
   .orderBy(desc("grouping_id()")).show()

+----+--------------------+-------------+-------------+
|Date|             Country|grouping_id()|sum(Quantity)|
+----+--------------------+-------------+-------------+
|null|                null|            3|      4906888|
|null|               Italy|            2|         7999|
|null|             Finland|            2|        10666|
|null|               Japan|            2|        25218|
|null|     Channel Islands|            2|         9479|
|null|              Cyprus|            2|         6317|
|null|                 RSA|            2|          352|
|null|            Portugal|            2|        16044|
|null|             Denmark|            2|         8188|
|null|                 USA|            2|         1034|
|null|United Arab Emirates|            2|          982|
|null|             Lebanon|            2|          386|
|null|         Unspecified|            2|         1789|
|null|             Germany|            2|       117448|
|null|           Australia|            2|       

In [211]:
#pivots make it possible for you to convert a row into a column
pivoted = dfWithDate.groupby("date").pivot("Country").sum()

In [212]:
pivoted.show(5)

+----------+---------------------------------------+------------------------+-----------------------------------------+-------------------------------------+----------------------+---------------------------------------+-------------------------------------+----------------------+---------------------------------------+-------------------------------------+----------------------+---------------------------------------+------------------------------------+---------------------+--------------------------------------+------------------------------------+---------------------+--------------------------------------+---------------------------------------------+------------------------------+-----------------------------------------------+------------------------------------+---------------------+--------------------------------------+--------------------------------------------+-----------------------------+----------------------------------------------+-------------------------------------

In [221]:
pivoted.where("date > '2011-12-05'").select("date").show()

+----------+
|      date|
+----------+
|2011-12-06|
|2011-12-09|
|2011-12-08|
|2011-12-07|
+----------+



In [217]:
pivoted.columns

['date',
 'Australia_sum(CAST(Quantity AS BIGINT))',
 'Australia_sum(UnitPrice)',
 'Australia_sum(CAST(CustomerID AS BIGINT))',
 'Austria_sum(CAST(Quantity AS BIGINT))',
 'Austria_sum(UnitPrice)',
 'Austria_sum(CAST(CustomerID AS BIGINT))',
 'Bahrain_sum(CAST(Quantity AS BIGINT))',
 'Bahrain_sum(UnitPrice)',
 'Bahrain_sum(CAST(CustomerID AS BIGINT))',
 'Belgium_sum(CAST(Quantity AS BIGINT))',
 'Belgium_sum(UnitPrice)',
 'Belgium_sum(CAST(CustomerID AS BIGINT))',
 'Brazil_sum(CAST(Quantity AS BIGINT))',
 'Brazil_sum(UnitPrice)',
 'Brazil_sum(CAST(CustomerID AS BIGINT))',
 'Canada_sum(CAST(Quantity AS BIGINT))',
 'Canada_sum(UnitPrice)',
 'Canada_sum(CAST(CustomerID AS BIGINT))',
 'Channel Islands_sum(CAST(Quantity AS BIGINT))',
 'Channel Islands_sum(UnitPrice)',
 'Channel Islands_sum(CAST(CustomerID AS BIGINT))',
 'Cyprus_sum(CAST(Quantity AS BIGINT))',
 'Cyprus_sum(UnitPrice)',
 'Cyprus_sum(CAST(CustomerID AS BIGINT))',
 'Czech Republic_sum(CAST(Quantity AS BIGINT))',
 'Czech Republic_