In [0]:
import pyspark.sql.functions as F

In [0]:
invoices_df = spark.read\
    .format("csv")\
        .option("header", "true")\
            .option("inferSchema", "true")\
                .option("samplingRatio", 0.0001)\
                    .load("/FileStore/tables/invoices.csv")


invoices_df.show(5)

+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|    InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|   536365|     null|WHITE HANGING HEA...|       6|01-12-2010 8.26|     2.55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|01-12-2010 8.26|     3.39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|01-12-2010 8.26|     2.75|     17850|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|01-12-2010 8.26|     3.39|     17850|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|01-12-2010 8.26|     3.39|     17850|United Kingdom|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
only showing top 5 rows



In [0]:
invoices_df.printSchema()

root
 |-- InvoiceNo: integer (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Country: string (nullable = true)



In [0]:
invoices_df

In [0]:
NumInvoices = F.countDistinct("InvoiceNo").alias("NumInvoices")


ex_summary = invoices_df\
                .withColumn("InvoiceDate", F.expr("to_date(InvoiceDate, 'dd-MM-yyyy H.mm')"))\
                    .withColumn('year', F.expr("year(InvoiceDate)"))\
                        .withColumn("week", F.expr("weekofyear(InvoiceDate)"))\
                            .groupBy("Country", "week")\
                                .agg(NumInvoices)


ex_summary.show(5)

+---------------+----+-----------+
|        Country|week|NumInvoices|
+---------------+----+-----------+
|           EIRE|  17|          1|
|         Norway|  47|          1|
|Channel Islands|  46|          1|
|         France|  10|          2|
|         France|   6|          3|
+---------------+----+-----------+
only showing top 5 rows



In [0]:
ex_summary.write\
    .format("parquet")\
        .mode("overwrite")\
            .save("/FileStore/tables/my_invoices")

In [0]:
%fs ls /FileStore/tables/my_invoices

path,name,size,modificationTime
dbfs:/FileStore/tables/my_invoices/_SUCCESS,_SUCCESS,0,1753069457000
dbfs:/FileStore/tables/my_invoices/_committed_5573294304808682415,_committed_5573294304808682415,123,1753069457000
dbfs:/FileStore/tables/my_invoices/_started_5573294304808682415,_started_5573294304808682415,0,1753069455000
dbfs:/FileStore/tables/my_invoices/part-00000-tid-5573294304808682415-161a153a-cfa8-4388-9ebc-0cdeb2e69ed0-43-1-c000.snappy.parquet,part-00000-tid-5573294304808682415-161a153a-cfa8-4388-9ebc-0cdeb2e69ed0-43-1-c000.snappy.parquet,3868,1753069457000
