In [1]:
from pyspark.sql import SparkSession
import getpass
# OS-level login of whoever runs the notebook; used to locate that user's
# own warehouse directory instead of a hard-coded account.
username = getpass.getuser()

# Build (or reuse) a Hive-enabled SparkSession that runs on YARN.
spark = (
    SparkSession.builder
    # Compatibility switch for fetching shuffle blocks from an older external
    # shuffle service (presumably required by this cluster -- confirm).
    .config('spark.shuffle.useOldFetchProtocol', 'true')
    # Port 0 lets Spark pick a free UI port, avoiding clashes between users.
    .config('spark.ui.port', '0')
    # Per-user warehouse location; previously the user id was hard-coded and
    # `username` was never used.
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .master('yarn')
    .getOrCreate()
)

In [2]:
# DDL-style schema for the orders file, one column per line.
# order_date is declared as a string here, not as a timestamp.
order_schema = (
    "order_id long, "
    "order_date string, "
    "customer_id long, "
    "order_status string"
)

In [3]:
# Read the orders CSV data with the explicit schema above rather than
# letting Spark infer column types.
orders_df = (
    spark.read
    .format("csv")
    .schema(order_schema)
    .load("/public/trendytech/retail_db/ordersnew")
)

In [4]:
# Print a sample of the loaded rows as a quick sanity check of the data
# against the declared columns.
orders_df.show()

+--------+--------------------+-----------+------------+
|order_id|          order_date|customer_id|order_status|
+--------+--------------------+-----------+------------+
|    2480|2013-08-07 00:00:...|       3807|    COMPLETE|
|   30479|2014-01-30 00:00:...|       9265|    COMPLETE|
|    2481|2013-08-07 00:00:...|       2476|    COMPLETE|
|   30481|2014-01-30 00:00:...|       9240|    COMPLETE|
|    2483|2013-08-07 00:00:...|      10453|    COMPLETE|
|   30484|2014-01-30 00:00:...|       2876|    COMPLETE|
|    2484|2013-08-07 00:00:...|       9256|    COMPLETE|
|   30485|2014-01-30 00:00:...|       1069|    COMPLETE|
|    2488|2013-08-07 00:00:...|       1255|    COMPLETE|
|   30486|2014-01-30 00:00:...|       1151|    COMPLETE|
|    2491|2013-08-07 00:00:...|        247|    COMPLETE|
|   30487|2014-01-30 00:00:...|       6772|    COMPLETE|
|    2495|2013-08-07 00:00:...|       9011|    COMPLETE|
|   30489|2014-01-30 00:00:...|       5717|    COMPLETE|
|    2498|2013-08-07 00:00:...|

In [5]:
# Expose the DataFrame to Spark SQL as the temp view "orders" so it can be
# queried through spark.sql(...).
orders_df.createOrReplaceTempView("orders")

In [9]:
# Order count per customer per month. The sort key is the month *name*
# ('MMMM'), so the result is ordered alphabetically (April comes first),
# not chronologically.
orders_by_month_name_sql = """
select customer_id, date_format(order_date, 'MMMM') as order_month,
count(1) as total_count from orders
group by customer_id, order_month order by order_month"""
spark.sql(orders_by_month_name_sql).show()

+-----------+-----------+-----------+
|customer_id|order_month|total_count|
+-----------+-----------+-----------+
|      11141|      April|       2007|
|       3259|      April|       2007|
|      10064|      April|       2007|
|       8311|      April|       2385|
|        820|      April|       2007|
|       4538|      April|       2007|
|       1354|      April|       2007|
|      12396|      April|       2007|
|      11018|      April|        375|
|      10244|      April|        375|
|       1562|      April|        750|
|       2033|      April|        375|
|      11659|      April|        375|
|      10016|      April|        375|
|       3266|      April|        375|
|      12369|      April|        375|
|         49|      April|        375|
|       7185|      April|        375|
|       9043|      April|        750|
|       5677|      April|        375|
+-----------+-----------+-----------+
only showing top 20 rows



In [7]:
# Same per-customer, per-month count, plus month_num: the two-digit month
# ('MM') picked with first() inside each group. Sorting on month_num gives
# calendar order (January first) instead of alphabetical month names.
orders_by_month_num_sql = """
select customer_id, date_format(order_date, 'MMMM') as order_month,
count(1) as total_count, first(date_format(order_date, 'MM')) as month_num from orders
group by customer_id, order_month order by month_num"""
spark.sql(orders_by_month_num_sql).show()

+-----------+-----------+-----------+---------+
|customer_id|order_month|total_count|month_num|
+-----------+-----------+-----------+---------+
|       2346|    January|       2010|       01|
|       2335|    January|        375|       01|
|         18|    January|       4020|       01|
|       4249|    January|       2760|       01|
|        440|    January|        375|       01|
|       1182|    January|        375|       01|
|       1606|    January|       2010|       01|
|       2787|    January|       2010|       01|
|       4318|    January|       2385|       01|
|       1198|    January|        375|       01|
|       9905|    January|       2010|       01|
|      10148|    January|       2010|       01|
|      10209|    January|        375|       01|
|      10635|    January|       2010|       01|
|      11156|    January|        375|       01|
|      11347|    January|       2010|       01|
|       4409|    January|       2010|       01|
|       7268|    January|       2010|   

In [10]:
# Sort-aggregate variant: the rows are sorted first and then aggregated.
# month_num is a *string* here (first() over date_format(..., 'MM'));
# presumably that is why Spark cannot use a hash aggregate for it -- confirm
# with .explain() or the SQL tab of the Spark UI.
# The "noop" sink runs the whole query without writing any output.
sort_aggregate_sql = """
select customer_id, date_format(order_date, 'MMMM') as order_month,
count(1) as total_count, first(date_format(order_date, 'MM')) as month_num from orders
group by customer_id, order_month order by month_num"""
(
    spark.sql(sort_aggregate_sql)
    .write
    .format("noop")
    .mode("overwrite")
    .save()
)

In [11]:
# Hash-aggregate variant: identical query except that month_num is cast to
# int inside first(). With a numeric aggregate value Spark is expected to
# plan a hash aggregate instead of sorting before aggregating -- confirm
# with .explain() or the SQL tab of the Spark UI.
# The "noop" sink runs the whole query without writing any output.
hash_aggregate_sql = """
select customer_id, date_format(order_date, 'MMMM') as order_month,
count(1) as total_count, first(int(date_format(order_date, 'MM'))) as month_num from orders
group by customer_id, order_month order by month_num"""
(
    spark.sql(hash_aggregate_sql)
    .write
    .format("noop")
    .mode("overwrite")
    .save()
)