In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder \
.appName("Retail Sales Analysis") \
.getOrCreate()

In [3]:
sales_data = [
("T001","North","Delhi","Store-01","Laptop","2024-01-01",75000),
("T002","North","Delhi","Store-01","Mobile","2024-01-02",32000),
("T003","North","Chandigarh","Store-02","Tablet","2024-01-03",26000),
("T004","South","Bangalore","Store-03","Laptop","2024-01-01",78000),
("T005","South","Chennai","Store-04","Mobile","2024-01-02",30000),
("T006","South","Bangalore","Store-03","Tablet","2024-01-03",24000),
("T007","East","Kolkata","Store-05","Laptop","2024-01-01",72000),
("T008","East","Kolkata","Store-05","Mobile","2024-01-02",28000),
("T009","East","Patna","Store-06","Tablet","2024-01-03",23000),
("T010","West","Mumbai","Store-07","Laptop","2024-01-01",80000),
("T011","West","Mumbai","Store-07","Mobile","2024-01-02",35000),
("T012","West","Pune","Store-08","Tablet","2024-01-03",27000),
("T013","North","Delhi","Store-01","Laptop","2024-01-04",76000),
("T014","South","Chennai","Store-04","Laptop","2024-01-04",79000),
("T015","East","Patna","Store-06","Mobile","2024-01-04",29000),
("T016","West","Pune","Store-08","Laptop","2024-01-04",77000),
("T017","North","Chandigarh","Store-02","Mobile","2024-01-05",31000),
("T018","South","Bangalore","Store-03","Mobile","2024-01-05",34000),
("T019","East","Kolkata","Store-05","Tablet","2024-01-05",25000),
("T020","West","Mumbai","Store-07","Tablet","2024-01-05",29000),
("T021","North","Delhi","Store-01","Tablet","2024-01-06",28000),
("T022","South","Chennai","Store-04","Tablet","2024-01-06",26000),
("T023","East","Patna","Store-06","Laptop","2024-01-06",74000),
("T024","West","Pune","Store-08","Mobile","2024-01-06",33000)
]
columns = [
"txn_id","region","city","store_id",
"product","sale_date","amount"
]
df_sales = spark.createDataFrame(sales_data, columns)
df_sales.show(5)
df_sales.printSchema()

+------+------+----------+--------+-------+----------+------+
|txn_id|region|      city|store_id|product| sale_date|amount|
+------+------+----------+--------+-------+----------+------+
|  T001| North|     Delhi|Store-01| Laptop|2024-01-01| 75000|
|  T002| North|     Delhi|Store-01| Mobile|2024-01-02| 32000|
|  T003| North|Chandigarh|Store-02| Tablet|2024-01-03| 26000|
|  T004| South| Bangalore|Store-03| Laptop|2024-01-01| 78000|
|  T005| South|   Chennai|Store-04| Mobile|2024-01-02| 30000|
+------+------+----------+--------+-------+----------+------+
only showing top 5 rows
root
 |-- txn_id: string (nullable = true)
 |-- region: string (nullable = true)
 |-- city: string (nullable = true)
 |-- store_id: string (nullable = true)
 |-- product: string (nullable = true)
 |-- sale_date: string (nullable = true)
 |-- amount: long (nullable = true)



#EXERCISE SET 1 — SELECT OPERATION

1. Select only txn_id , region , product , and amount

In [4]:
df_sales.select("txn_id", "region", "product", "amount").show()

+------+------+-------+------+
|txn_id|region|product|amount|
+------+------+-------+------+
|  T001| North| Laptop| 75000|
|  T002| North| Mobile| 32000|
|  T003| North| Tablet| 26000|
|  T004| South| Laptop| 78000|
|  T005| South| Mobile| 30000|
|  T006| South| Tablet| 24000|
|  T007|  East| Laptop| 72000|
|  T008|  East| Mobile| 28000|
|  T009|  East| Tablet| 23000|
|  T010|  West| Laptop| 80000|
|  T011|  West| Mobile| 35000|
|  T012|  West| Tablet| 27000|
|  T013| North| Laptop| 76000|
|  T014| South| Laptop| 79000|
|  T015|  East| Mobile| 29000|
|  T016|  West| Laptop| 77000|
|  T017| North| Mobile| 31000|
|  T018| South| Mobile| 34000|
|  T019|  East| Tablet| 25000|
|  T020|  West| Tablet| 29000|
+------+------+-------+------+
only showing top 20 rows


2.  Rename amount to revenue

In [5]:
df_sales.select(
    col("amount").alias("revenue")
).show()

+-------+
|revenue|
+-------+
|  75000|
|  32000|
|  26000|
|  78000|
|  30000|
|  24000|
|  72000|
|  28000|
|  23000|
|  80000|
|  35000|
|  27000|
|  76000|
|  79000|
|  29000|
|  77000|
|  31000|
|  34000|
|  25000|
|  29000|
+-------+
only showing top 20 rows


3. Create a derived column amount_in_thousands

In [6]:
df_sales = df_sales.withColumn(
    "amount_in_thousands", col("amount") / 1000
)
df_sales.show()

+------+------+----------+--------+-------+----------+------+-------------------+
|txn_id|region|      city|store_id|product| sale_date|amount|amount_in_thousands|
+------+------+----------+--------+-------+----------+------+-------------------+
|  T001| North|     Delhi|Store-01| Laptop|2024-01-01| 75000|               75.0|
|  T002| North|     Delhi|Store-01| Mobile|2024-01-02| 32000|               32.0|
|  T003| North|Chandigarh|Store-02| Tablet|2024-01-03| 26000|               26.0|
|  T004| South| Bangalore|Store-03| Laptop|2024-01-01| 78000|               78.0|
|  T005| South|   Chennai|Store-04| Mobile|2024-01-02| 30000|               30.0|
|  T006| South| Bangalore|Store-03| Tablet|2024-01-03| 24000|               24.0|
|  T007|  East|   Kolkata|Store-05| Laptop|2024-01-01| 72000|               72.0|
|  T008|  East|   Kolkata|Store-05| Mobile|2024-01-02| 28000|               28.0|
|  T009|  East|     Patna|Store-06| Tablet|2024-01-03| 23000|               23.0|
|  T010|  West| 

4. Select distinct combinations of region and product

In [7]:
df_sales.select("region", "product").distinct().show()

+------+-------+
|region|product|
+------+-------+
| North| Laptop|
| North| Tablet|
|  East| Tablet|
|  East| Laptop|
| South| Tablet|
| North| Mobile|
|  West| Tablet|
|  East| Mobile|
| South| Mobile|
| South| Laptop|
|  West| Mobile|
|  West| Laptop|
+------+-------+



5. Select all columns but exclude store_id

In [8]:
df_sales.drop("store_id").show()

+------+------+----------+-------+----------+------+-------------------+
|txn_id|region|      city|product| sale_date|amount|amount_in_thousands|
+------+------+----------+-------+----------+------+-------------------+
|  T001| North|     Delhi| Laptop|2024-01-01| 75000|               75.0|
|  T002| North|     Delhi| Mobile|2024-01-02| 32000|               32.0|
|  T003| North|Chandigarh| Tablet|2024-01-03| 26000|               26.0|
|  T004| South| Bangalore| Laptop|2024-01-01| 78000|               78.0|
|  T005| South|   Chennai| Mobile|2024-01-02| 30000|               30.0|
|  T006| South| Bangalore| Tablet|2024-01-03| 24000|               24.0|
|  T007|  East|   Kolkata| Laptop|2024-01-01| 72000|               72.0|
|  T008|  East|   Kolkata| Mobile|2024-01-02| 28000|               28.0|
|  T009|  East|     Patna| Tablet|2024-01-03| 23000|               23.0|
|  T010|  West|    Mumbai| Laptop|2024-01-01| 80000|               80.0|
|  T011|  West|    Mumbai| Mobile|2024-01-02| 35000

6. Create a new column sale_year extracted from sale_date

In [10]:
from pyspark.sql.functions import year
df_sales = df_sales.withColumn(
    "sale_year", year(col("sale_date")))
df_sales.show()

+------+------+----------+--------+-------+----------+------+-------------------+---------+
|txn_id|region|      city|store_id|product| sale_date|amount|amount_in_thousands|sale_year|
+------+------+----------+--------+-------+----------+------+-------------------+---------+
|  T001| North|     Delhi|Store-01| Laptop|2024-01-01| 75000|               75.0|     2024|
|  T002| North|     Delhi|Store-01| Mobile|2024-01-02| 32000|               32.0|     2024|
|  T003| North|Chandigarh|Store-02| Tablet|2024-01-03| 26000|               26.0|     2024|
|  T004| South| Bangalore|Store-03| Laptop|2024-01-01| 78000|               78.0|     2024|
|  T005| South|   Chennai|Store-04| Mobile|2024-01-02| 30000|               30.0|     2024|
|  T006| South| Bangalore|Store-03| Tablet|2024-01-03| 24000|               24.0|     2024|
|  T007|  East|   Kolkata|Store-05| Laptop|2024-01-01| 72000|               72.0|     2024|
|  T008|  East|   Kolkata|Store-05| Mobile|2024-01-02| 28000|               28.0

7. Reorder columns in a business-friendly format

In [11]:
df_sales = df_sales.select(
    "txn_id",
    "sale_date",
    "sale_year",
    "region",
    "city",
    "store_id",
    "product",
    "amount",
    "amount_in_thousands"
)
df_sales.show()

+------+----------+---------+------+----------+--------+-------+------+-------------------+
|txn_id| sale_date|sale_year|region|      city|store_id|product|amount|amount_in_thousands|
+------+----------+---------+------+----------+--------+-------+------+-------------------+
|  T001|2024-01-01|     2024| North|     Delhi|Store-01| Laptop| 75000|               75.0|
|  T002|2024-01-02|     2024| North|     Delhi|Store-01| Mobile| 32000|               32.0|
|  T003|2024-01-03|     2024| North|Chandigarh|Store-02| Tablet| 26000|               26.0|
|  T004|2024-01-01|     2024| South| Bangalore|Store-03| Laptop| 78000|               78.0|
|  T005|2024-01-02|     2024| South|   Chennai|Store-04| Mobile| 30000|               30.0|
|  T006|2024-01-03|     2024| South| Bangalore|Store-03| Tablet| 24000|               24.0|
|  T007|2024-01-01|     2024|  East|   Kolkata|Store-05| Laptop| 72000|               72.0|
|  T008|2024-01-02|     2024|  East|   Kolkata|Store-05| Mobile| 28000|         

#EXERCISE SET 2 — FILTER OPERATIONS

1. Filter transactions where amount > 50000

In [12]:
df_sales.filter(col("amount") > 50000).show()

+------+----------+---------+------+---------+--------+-------+------+-------------------+
|txn_id| sale_date|sale_year|region|     city|store_id|product|amount|amount_in_thousands|
+------+----------+---------+------+---------+--------+-------+------+-------------------+
|  T001|2024-01-01|     2024| North|    Delhi|Store-01| Laptop| 75000|               75.0|
|  T004|2024-01-01|     2024| South|Bangalore|Store-03| Laptop| 78000|               78.0|
|  T007|2024-01-01|     2024|  East|  Kolkata|Store-05| Laptop| 72000|               72.0|
|  T010|2024-01-01|     2024|  West|   Mumbai|Store-07| Laptop| 80000|               80.0|
|  T013|2024-01-04|     2024| North|    Delhi|Store-01| Laptop| 76000|               76.0|
|  T014|2024-01-04|     2024| South|  Chennai|Store-04| Laptop| 79000|               79.0|
|  T016|2024-01-04|     2024|  West|     Pune|Store-08| Laptop| 77000|               77.0|
|  T023|2024-01-06|     2024|  East|    Patna|Store-06| Laptop| 74000|               74.0|

2. Filter only Laptop sales

In [13]:
df_sales.filter(col("product") == "Laptop").show()

+------+----------+---------+------+---------+--------+-------+------+-------------------+
|txn_id| sale_date|sale_year|region|     city|store_id|product|amount|amount_in_thousands|
+------+----------+---------+------+---------+--------+-------+------+-------------------+
|  T001|2024-01-01|     2024| North|    Delhi|Store-01| Laptop| 75000|               75.0|
|  T004|2024-01-01|     2024| South|Bangalore|Store-03| Laptop| 78000|               78.0|
|  T007|2024-01-01|     2024|  East|  Kolkata|Store-05| Laptop| 72000|               72.0|
|  T010|2024-01-01|     2024|  West|   Mumbai|Store-07| Laptop| 80000|               80.0|
|  T013|2024-01-04|     2024| North|    Delhi|Store-01| Laptop| 76000|               76.0|
|  T014|2024-01-04|     2024| South|  Chennai|Store-04| Laptop| 79000|               79.0|
|  T016|2024-01-04|     2024|  West|     Pune|Store-08| Laptop| 77000|               77.0|
|  T023|2024-01-06|     2024|  East|    Patna|Store-06| Laptop| 74000|               74.0|

3. Filter sales from North and South regions

In [18]:
df_sales = df_sales.filter(
    col("region").isin("North", "South"
))
df_sales.show()

+------+----------+---------+------+----------+--------+-------+------+-------------------+
|txn_id| sale_date|sale_year|region|      city|store_id|product|amount|amount_in_thousands|
+------+----------+---------+------+----------+--------+-------+------+-------------------+
|  T001|2024-01-01|     2024| North|     Delhi|Store-01| Laptop| 75000|               75.0|
|  T002|2024-01-02|     2024| North|     Delhi|Store-01| Mobile| 32000|               32.0|
|  T003|2024-01-03|     2024| North|Chandigarh|Store-02| Tablet| 26000|               26.0|
|  T004|2024-01-01|     2024| South| Bangalore|Store-03| Laptop| 78000|               78.0|
|  T005|2024-01-02|     2024| South|   Chennai|Store-04| Mobile| 30000|               30.0|
|  T006|2024-01-03|     2024| South| Bangalore|Store-03| Tablet| 24000|               24.0|
|  T013|2024-01-04|     2024| North|     Delhi|Store-01| Laptop| 76000|               76.0|
|  T014|2024-01-04|     2024| South|   Chennai|Store-04| Laptop| 79000|         

4.  Filter sales between 25000 and 75000

In [20]:
df_sales.filter((col("amount") >= 25000) & (col("amount") <= 75000)).show()

+------+----------+---------+------+----------+--------+-------+------+-------------------+
|txn_id| sale_date|sale_year|region|      city|store_id|product|amount|amount_in_thousands|
+------+----------+---------+------+----------+--------+-------+------+-------------------+
|  T001|2024-01-01|     2024| North|     Delhi|Store-01| Laptop| 75000|               75.0|
|  T002|2024-01-02|     2024| North|     Delhi|Store-01| Mobile| 32000|               32.0|
|  T003|2024-01-03|     2024| North|Chandigarh|Store-02| Tablet| 26000|               26.0|
|  T005|2024-01-02|     2024| South|   Chennai|Store-04| Mobile| 30000|               30.0|
|  T017|2024-01-05|     2024| North|Chandigarh|Store-02| Mobile| 31000|               31.0|
|  T018|2024-01-05|     2024| South| Bangalore|Store-03| Mobile| 34000|               34.0|
|  T021|2024-01-06|     2024| North|     Delhi|Store-01| Tablet| 28000|               28.0|
|  T022|2024-01-06|     2024| South|   Chennai|Store-04| Tablet| 26000|         

5. Filter transactions from Delhi stores only

In [21]:
df_sales.filter((col("city") == "Delhi")).show()

+------+----------+---------+------+-----+--------+-------+------+-------------------+
|txn_id| sale_date|sale_year|region| city|store_id|product|amount|amount_in_thousands|
+------+----------+---------+------+-----+--------+-------+------+-------------------+
|  T001|2024-01-01|     2024| North|Delhi|Store-01| Laptop| 75000|               75.0|
|  T002|2024-01-02|     2024| North|Delhi|Store-01| Mobile| 32000|               32.0|
|  T013|2024-01-04|     2024| North|Delhi|Store-01| Laptop| 76000|               76.0|
|  T021|2024-01-06|     2024| North|Delhi|Store-01| Tablet| 28000|               28.0|
+------+----------+---------+------+-----+--------+-------+------+-------------------+



6.  Apply multiple filters using both filter and where

In [22]:
df_sales.filter(col("amount")> 50000).where(col("region") == "North").show()

+------+----------+---------+------+-----+--------+-------+------+-------------------+
|txn_id| sale_date|sale_year|region| city|store_id|product|amount|amount_in_thousands|
+------+----------+---------+------+-----+--------+-------+------+-------------------+
|  T001|2024-01-01|     2024| North|Delhi|Store-01| Laptop| 75000|               75.0|
|  T013|2024-01-04|     2024| North|Delhi|Store-01| Laptop| 76000|               76.0|
+------+----------+---------+------+-----+--------+-------+------+-------------------+



7. Change the order of lters and compare explain(True)

In [23]:
df1 = df_sales.filter(col("amount") > 30000).filter(col("region") == "North")
df1.explain(True)
df2 = df_sales.filter(col("region") == "South").filter(col("amount") > 30000)
df2.explain(True)

== Parsed Logical Plan ==
'Filter '`=`('region, North)
+- Filter (amount#6L > cast(30000 as bigint))
   +- Filter region#1 IN (North,South)
      +- Filter region#1 IN (North,South)
         +- Filter ((region#1 = North) OR (region#1 = South))
            +- Filter ((region#1 = North) OR (region#1 = South))
               +- Project [txn_id#0, sale_date#5, sale_year#102, region#1, city#2, store_id#3, product#4, amount#6L, amount_in_thousands#47]
                  +- Project [txn_id#0, region#1, city#2, store_id#3, product#4, sale_date#5, amount#6L, amount_in_thousands#47, year(cast(sale_date#5 as date)) AS sale_year#102]
                     +- Project [txn_id#0, region#1, city#2, store_id#3, product#4, sale_date#5, amount#6L, (cast(amount#6L as double) / cast(1000 as double)) AS amount_in_thousands#47]
                        +- LogicalRDD [txn_id#0, region#1, city#2, store_id#3, product#4, sale_date#5, amount#6L], false

== Analyzed Logical Plan ==
txn_id: string, sale_date: string, 

8.  Identify which lters Spark pushes down

In [24]:
df_sales.filter(col("amount") > 50000).filter(col("region")=="North").explain(True)

== Parsed Logical Plan ==
'Filter '`=`('region, North)
+- Filter (amount#6L > cast(50000 as bigint))
   +- Filter region#1 IN (North,South)
      +- Filter region#1 IN (North,South)
         +- Filter ((region#1 = North) OR (region#1 = South))
            +- Filter ((region#1 = North) OR (region#1 = South))
               +- Project [txn_id#0, sale_date#5, sale_year#102, region#1, city#2, store_id#3, product#4, amount#6L, amount_in_thousands#47]
                  +- Project [txn_id#0, region#1, city#2, store_id#3, product#4, sale_date#5, amount#6L, amount_in_thousands#47, year(cast(sale_date#5 as date)) AS sale_year#102]
                     +- Project [txn_id#0, region#1, city#2, store_id#3, product#4, sale_date#5, amount#6L, (cast(amount#6L as double) / cast(1000 as double)) AS amount_in_thousands#47]
                        +- LogicalRDD [txn_id#0, region#1, city#2, store_id#3, product#4, sale_date#5, amount#6L], false

== Analyzed Logical Plan ==
txn_id: string, sale_date: string, 

#EXERCISE SET 3 — GROUPBY & AGGREGATE FUNCTIONS

1. Total sales amount per region

In [27]:
from pyspark.sql.functions import sum
df_sales.groupBy("region").agg(sum("amount").alias("total_sales")).show()

+------+-----------+
|region|total_sales|
+------+-----------+
| South|     271000|
| North|     268000|
+------+-----------+



2. Average sales amount per product

In [28]:
from pyspark.sql.functions import avg
df_sales.groupBy("product").agg(avg("amount").alias("avg_sales")).show()

+-------+---------+
|product|avg_sales|
+-------+---------+
| Laptop|  77000.0|
| Mobile|  31750.0|
| Tablet|  26000.0|
+-------+---------+



3. Maximum sale per city

In [29]:
from pyspark.sql.functions import max
df_sales.groupBy("city").agg(max("amount").alias("max_sale")).show()

+----------+--------+
|      city|max_sale|
+----------+--------+
| Bangalore|   78000|
|   Chennai|   79000|
|     Delhi|   76000|
|Chandigarh|   31000|
+----------+--------+



4. Minimum sale per store

In [30]:
from pyspark.sql.functions import min
df_sales.groupBy("store_id").agg(min("amount").alias("min_sale")).show()

+--------+--------+
|store_id|min_sale|
+--------+--------+
|Store-03|   24000|
|Store-01|   28000|
|Store-04|   26000|
|Store-02|   26000|
+--------+--------+



5. Count of transactions per region

In [31]:
from pyspark.sql.functions import count
df_sales.groupBy("region").agg(count("txn_id").alias("transaction_count")).show()

+------+-----------------+
|region|transaction_count|
+------+-----------------+
| South|                6|
| North|                6|
+------+-----------------+



6. Total revenue per store

In [32]:
df_sales.groupBy("store_id").agg(sum("amount").alias("total_revenue")).show()

+--------+-------------+
|store_id|total_revenue|
+--------+-------------+
|Store-03|       136000|
|Store-01|       211000|
|Store-04|       135000|
|Store-02|        57000|
+--------+-------------+



7. Region-wise product sales count

In [33]:
df_sales.groupBy("region", "product").agg(count("txn_id").alias("sales_count")).show()

+------+-------+-----------+
|region|product|sales_count|
+------+-------+-----------+
| North| Laptop|          2|
| North| Tablet|          2|
| South| Tablet|          2|
| North| Mobile|          2|
| South| Mobile|          2|
| South| Laptop|          2|
+------+-------+-----------+



8. Average transaction value per city

In [34]:
df_sales.groupBy("city").agg(avg("amount").alias("avg_transaction_value")).show()

+----------+---------------------+
|      city|avg_transaction_value|
+----------+---------------------+
| Bangalore|   45333.333333333336|
|   Chennai|              45000.0|
|     Delhi|              52750.0|
|Chandigarh|              28500.0|
+----------+---------------------+



9.  Identify regions with total sales above a threshold

In [37]:
#considering threshold 270000
df_sales.groupBy("region").agg(sum("amount").alias("total_sales")).filter(col("total_sales") > 270000).show()

+------+-----------+
|region|total_sales|
+------+-----------+
| South|     271000|
+------+-----------+



10. Use explain(True) and identify shuffle stages

---



In [38]:
df_sales.groupBy("region").agg(sum("amount").alias("total_sales")).explain(True)

== Parsed Logical Plan ==
'Aggregate ['region], ['region, 'sum('amount) AS total_sales#651]
+- Filter region#1 IN (North,South)
   +- Filter region#1 IN (North,South)
      +- Filter ((region#1 = North) OR (region#1 = South))
         +- Filter ((region#1 = North) OR (region#1 = South))
            +- Project [txn_id#0, sale_date#5, sale_year#102, region#1, city#2, store_id#3, product#4, amount#6L, amount_in_thousands#47]
               +- Project [txn_id#0, region#1, city#2, store_id#3, product#4, sale_date#5, amount#6L, amount_in_thousands#47, year(cast(sale_date#5 as date)) AS sale_year#102]
                  +- Project [txn_id#0, region#1, city#2, store_id#3, product#4, sale_date#5, amount#6L, (cast(amount#6L as double) / cast(1000 as double)) AS amount_in_thousands#47]
                     +- LogicalRDD [txn_id#0, region#1, city#2, store_id#3, product#4, sale_date#5, amount#6L], false

== Analyzed Logical Plan ==
region: string, total_sales: bigint
Aggregate [region#1], [region#1,

#EXERCISE SET 4 — MULTI-DIMENSIONAL AGGREGATION

1. Region + Product wise total sales

In [39]:
df_sales.groupBy("region", "product").agg(sum("amount").alias("total_sales")).show()

+------+-------+-----------+
|region|product|total_sales|
+------+-------+-----------+
| North| Laptop|     151000|
| North| Tablet|      54000|
| South| Tablet|      50000|
| North| Mobile|      63000|
| South| Mobile|      64000|
| South| Laptop|     157000|
+------+-------+-----------+



2. City + Store wise average sales

In [40]:
df_sales.groupBy("city", "store_id").agg(avg("amount").alias("avg_sales")).show()

+----------+--------+------------------+
|      city|store_id|         avg_sales|
+----------+--------+------------------+
| Bangalore|Store-03|45333.333333333336|
|   Chennai|Store-04|           45000.0|
|Chandigarh|Store-02|           28500.0|
|     Delhi|Store-01|           52750.0|
+----------+--------+------------------+



 3. Region + City wise transaction count

In [41]:
df_sales.groupBy("region", "city").agg(count("txn_id").alias("transaction_count")).show()

+------+----------+-----------------+
|region|      city|transaction_count|
+------+----------+-----------------+
| South| Bangalore|                3|
| North|     Delhi|                4|
| North|Chandigarh|                2|
| South|   Chennai|                3|
+------+----------+-----------------+



4.  Product + Store wise max sale

In [42]:
df_sales.groupBy("product", "store_id").agg(max("amount").alias("max_sale")).show()

+-------+--------+--------+
|product|store_id|max_sale|
+-------+--------+--------+
| Laptop|Store-01|   76000|
| Tablet|Store-02|   26000|
| Mobile|Store-01|   32000|
| Laptop|Store-03|   78000|
| Tablet|Store-03|   24000|
| Mobile|Store-04|   30000|
| Tablet|Store-01|   28000|
| Laptop|Store-04|   79000|
| Tablet|Store-04|   26000|
| Mobile|Store-03|   34000|
| Mobile|Store-02|   31000|
+-------+--------+--------+



5.  Identify top-selling product per region using aggregation only

In [43]:
region_product_sales = df_sales.groupBy("region", "product").agg(sum("amount").alias("total_sales"))
region_product_sales.show()

+------+-------+-----------+
|region|product|total_sales|
+------+-------+-----------+
| North| Laptop|     151000|
| North| Tablet|      54000|
| South| Tablet|      50000|
| North| Mobile|      63000|
| South| Mobile|      64000|
| South| Laptop|     157000|
+------+-------+-----------+



#EXERCISE SET 5 — WINDOW FUNCTIONS (OVER)

In [44]:
from pyspark.sql.window import Window
from pyspark.sql.functions import sum, rank, row_number, dense_rank

1. Compute running total of sales per region ordered by date

In [45]:
window_region_date = Window.partitionBy("region").orderBy("sale_date") \
.orderBy("sale_date") \
.rowsBetween(Window.unboundedPreceding, Window.currentRow)
df_sales.withColumn("running_total", sum("amount").over(window_region_date)).show()

+------+----------+---------+------+----------+--------+-------+------+-------------------+-------------+
|txn_id| sale_date|sale_year|region|      city|store_id|product|amount|amount_in_thousands|running_total|
+------+----------+---------+------+----------+--------+-------+------+-------------------+-------------+
|  T001|2024-01-01|     2024| North|     Delhi|Store-01| Laptop| 75000|               75.0|        75000|
|  T002|2024-01-02|     2024| North|     Delhi|Store-01| Mobile| 32000|               32.0|       107000|
|  T003|2024-01-03|     2024| North|Chandigarh|Store-02| Tablet| 26000|               26.0|       133000|
|  T013|2024-01-04|     2024| North|     Delhi|Store-01| Laptop| 76000|               76.0|       209000|
|  T017|2024-01-05|     2024| North|Chandigarh|Store-02| Mobile| 31000|               31.0|       240000|
|  T021|2024-01-06|     2024| North|     Delhi|Store-01| Tablet| 28000|               28.0|       268000|
|  T004|2024-01-01|     2024| South| Bangalore

2. Rank transactions by amount within each region

In [46]:
window_region_txn = Window.partitionBy("region").orderBy(df_sales["amount"].desc())
df_sales.withColumn("rank", rank().over(window_region_txn)).show()

+------+----------+---------+------+----------+--------+-------+------+-------------------+----+
|txn_id| sale_date|sale_year|region|      city|store_id|product|amount|amount_in_thousands|rank|
+------+----------+---------+------+----------+--------+-------+------+-------------------+----+
|  T013|2024-01-04|     2024| North|     Delhi|Store-01| Laptop| 76000|               76.0|   1|
|  T001|2024-01-01|     2024| North|     Delhi|Store-01| Laptop| 75000|               75.0|   2|
|  T002|2024-01-02|     2024| North|     Delhi|Store-01| Mobile| 32000|               32.0|   3|
|  T017|2024-01-05|     2024| North|Chandigarh|Store-02| Mobile| 31000|               31.0|   4|
|  T021|2024-01-06|     2024| North|     Delhi|Store-01| Tablet| 28000|               28.0|   5|
|  T003|2024-01-03|     2024| North|Chandigarh|Store-02| Tablet| 26000|               26.0|   6|
|  T014|2024-01-04|     2024| South|   Chennai|Store-04| Laptop| 79000|               79.0|   1|
|  T004|2024-01-01|     2024| 

3.  Assign row numbers per store ordered by sale amount

In [48]:
window_store_amount = Window.partitionBy("store_id").orderBy(df_sales["amount"].desc())
df_sales.withColumn("row_number", row_number().over(window_store_amount)).show()

+------+----------+---------+------+----------+--------+-------+------+-------------------+----------+
|txn_id| sale_date|sale_year|region|      city|store_id|product|amount|amount_in_thousands|row_number|
+------+----------+---------+------+----------+--------+-------+------+-------------------+----------+
|  T013|2024-01-04|     2024| North|     Delhi|Store-01| Laptop| 76000|               76.0|         1|
|  T001|2024-01-01|     2024| North|     Delhi|Store-01| Laptop| 75000|               75.0|         2|
|  T002|2024-01-02|     2024| North|     Delhi|Store-01| Mobile| 32000|               32.0|         3|
|  T021|2024-01-06|     2024| North|     Delhi|Store-01| Tablet| 28000|               28.0|         4|
|  T017|2024-01-05|     2024| North|Chandigarh|Store-02| Mobile| 31000|               31.0|         1|
|  T003|2024-01-03|     2024| North|Chandigarh|Store-02| Tablet| 26000|               26.0|         2|
|  T004|2024-01-01|     2024| South| Bangalore|Store-03| Laptop| 78000|  

4. Use dense rank to rank products per region

In [50]:
region_product_sales = df_sales.groupBy("region", "product").agg(sum("amount").alias("total_sales"))
window_region_product = Window.partitionBy("region").orderBy("region").orderBy(region_product_sales["total_sales"].desc())
region_product_sales.withColumn("dense_rank", dense_rank().over(window_region_product)).show()

+------+-------+-----------+----------+
|region|product|total_sales|dense_rank|
+------+-------+-----------+----------+
| North| Laptop|     151000|         1|
| North| Mobile|      63000|         2|
| North| Tablet|      54000|         3|
| South| Laptop|     157000|         1|
| South| Mobile|      64000|         2|
| South| Tablet|      50000|         3|
+------+-------+-----------+----------+



5.  Identify top 2 highest sales per region using window functions

In [52]:
region_product_sales.withColumn("rank", rank().over(window_region_product)).filter(col("rank") <= 2).show()

+------+-------+-----------+----+
|region|product|total_sales|rank|
+------+-------+-----------+----+
| North| Laptop|     151000|   1|
| North| Mobile|      63000|   2|
| South| Laptop|     157000|   1|
| South| Mobile|      64000|   2|
+------+-------+-----------+----+



6. Compare rank vs dense_rank output

In [54]:
region_product_sales.select("region", "total_sales", rank().over(window_region_product).alias("rank"),
                dense_rank().over(window_region_product).alias("dense_rank")).show()

+------+-----------+----+----------+
|region|total_sales|rank|dense_rank|
+------+-----------+----+----------+
| North|     151000|   1|         1|
| North|      63000|   2|         2|
| North|      54000|   3|         3|
| South|     157000|   1|         1|
| South|      64000|   2|         2|
| South|      50000|   3|         3|
+------+-----------+----+----------+



7. Calculate cumulative sales per store

In [56]:
window_store_date = Window.partitionBy("store_id").orderBy("sale_date")\
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
df_sales.withColumn("cumulative_sales", sum("amount").over(window_store_date)).show()

+------+----------+---------+------+----------+--------+-------+------+-------------------+----------------+
|txn_id| sale_date|sale_year|region|      city|store_id|product|amount|amount_in_thousands|cumulative_sales|
+------+----------+---------+------+----------+--------+-------+------+-------------------+----------------+
|  T001|2024-01-01|     2024| North|     Delhi|Store-01| Laptop| 75000|               75.0|           75000|
|  T002|2024-01-02|     2024| North|     Delhi|Store-01| Mobile| 32000|               32.0|          107000|
|  T013|2024-01-04|     2024| North|     Delhi|Store-01| Laptop| 76000|               76.0|          183000|
|  T021|2024-01-06|     2024| North|     Delhi|Store-01| Tablet| 28000|               28.0|          211000|
|  T003|2024-01-03|     2024| North|Chandigarh|Store-02| Tablet| 26000|               26.0|           26000|
|  T017|2024-01-05|     2024| North|Chandigarh|Store-02| Mobile| 31000|               31.0|           57000|
|  T004|2024-01-01|

8.  Identify rst and last transaction per city using windows



In [58]:
from pyspark.sql.functions import first, last
window_city_date = Window.partitionBy("city").orderBy("sale_date")
df_sales.withColumn("first_transaction", first("txn_id").over(window_city_date)) \
.withColumn("last_transaction", last("txn_id").over(window_city_date)).show()

+------+----------+---------+------+----------+--------+-------+------+-------------------+-----------------+----------------+
|txn_id| sale_date|sale_year|region|      city|store_id|product|amount|amount_in_thousands|first_transaction|last_transaction|
+------+----------+---------+------+----------+--------+-------+------+-------------------+-----------------+----------------+
|  T004|2024-01-01|     2024| South| Bangalore|Store-03| Laptop| 78000|               78.0|             T004|            T004|
|  T006|2024-01-03|     2024| South| Bangalore|Store-03| Tablet| 24000|               24.0|             T004|            T006|
|  T018|2024-01-05|     2024| South| Bangalore|Store-03| Mobile| 34000|               34.0|             T004|            T018|
|  T003|2024-01-03|     2024| North|Chandigarh|Store-02| Tablet| 26000|               26.0|             T003|            T003|
|  T017|2024-01-05|     2024| North|Chandigarh|Store-02| Mobile| 31000|               31.0|             T003|  

#EXERCISE SET 7 — DAG & PERFORMANCE OBSERVATION

1. Run explain(True) for:
Simple select
Filter
GroupBy
Window function

In [59]:
df_sales.select("region", "amount").explain(True)

== Parsed Logical Plan ==
'Project ['region, 'amount]
+- Filter region#1 IN (North,South)
   +- Filter region#1 IN (North,South)
      +- Filter ((region#1 = North) OR (region#1 = South))
         +- Filter ((region#1 = North) OR (region#1 = South))
            +- Project [txn_id#0, sale_date#5, sale_year#102, region#1, city#2, store_id#3, product#4, amount#6L, amount_in_thousands#47]
               +- Project [txn_id#0, region#1, city#2, store_id#3, product#4, sale_date#5, amount#6L, amount_in_thousands#47, year(cast(sale_date#5 as date)) AS sale_year#102]
                  +- Project [txn_id#0, region#1, city#2, store_id#3, product#4, sale_date#5, amount#6L, (cast(amount#6L as double) / cast(1000 as double)) AS amount_in_thousands#47]
                     +- LogicalRDD [txn_id#0, region#1, city#2, store_id#3, product#4, sale_date#5, amount#6L], false

== Analyzed Logical Plan ==
region: string, amount: bigint
Project [region#1, amount#6L]
+- Filter region#1 IN (North,South)
   +- Fil

In [61]:
df_sales.filter("amount>50000").explain(True)

== Parsed Logical Plan ==
'Filter ('amount > 50000)
+- Filter region#1 IN (North,South)
   +- Filter region#1 IN (North,South)
      +- Filter ((region#1 = North) OR (region#1 = South))
         +- Filter ((region#1 = North) OR (region#1 = South))
            +- Project [txn_id#0, sale_date#5, sale_year#102, region#1, city#2, store_id#3, product#4, amount#6L, amount_in_thousands#47]
               +- Project [txn_id#0, region#1, city#2, store_id#3, product#4, sale_date#5, amount#6L, amount_in_thousands#47, year(cast(sale_date#5 as date)) AS sale_year#102]
                  +- Project [txn_id#0, region#1, city#2, store_id#3, product#4, sale_date#5, amount#6L, (cast(amount#6L as double) / cast(1000 as double)) AS amount_in_thousands#47]
                     +- LogicalRDD [txn_id#0, region#1, city#2, store_id#3, product#4, sale_date#5, amount#6L], false

== Analyzed Logical Plan ==
txn_id: string, sale_date: string, sale_year: int, region: string, city: string, store_id: string, product: 

In [62]:
df_sales.groupBy("region").agg(sum("amount")).explain(True)

== Parsed Logical Plan ==
'Aggregate ['region], ['region, unresolvedalias('sum('amount))]
+- Filter region#1 IN (North,South)
   +- Filter region#1 IN (North,South)
      +- Filter ((region#1 = North) OR (region#1 = South))
         +- Filter ((region#1 = North) OR (region#1 = South))
            +- Project [txn_id#0, sale_date#5, sale_year#102, region#1, city#2, store_id#3, product#4, amount#6L, amount_in_thousands#47]
               +- Project [txn_id#0, region#1, city#2, store_id#3, product#4, sale_date#5, amount#6L, amount_in_thousands#47, year(cast(sale_date#5 as date)) AS sale_year#102]
                  +- Project [txn_id#0, region#1, city#2, store_id#3, product#4, sale_date#5, amount#6L, (cast(amount#6L as double) / cast(1000 as double)) AS amount_in_thousands#47]
                     +- LogicalRDD [txn_id#0, region#1, city#2, store_id#3, product#4, sale_date#5, amount#6L], false

== Analyzed Logical Plan ==
region: string, sum(amount): bigint
Aggregate [region#1], [region#1, s

In [64]:
window_spec = Window.partitionBy("region").orderBy("amount")
df_sales.withColumn("rank", rank().over(window_spec)).explain(True)

== Parsed Logical Plan ==
'Project [unresolvedstarwithcolumns(rank, 'rank() windowspecdefinition('region, 'amount ASC NULLS FIRST, unspecifiedframe$()), None)]
+- Filter region#1 IN (North,South)
   +- Filter region#1 IN (North,South)
      +- Filter ((region#1 = North) OR (region#1 = South))
         +- Filter ((region#1 = North) OR (region#1 = South))
            +- Project [txn_id#0, sale_date#5, sale_year#102, region#1, city#2, store_id#3, product#4, amount#6L, amount_in_thousands#47]
               +- Project [txn_id#0, region#1, city#2, store_id#3, product#4, sale_date#5, amount#6L, amount_in_thousands#47, year(cast(sale_date#5 as date)) AS sale_year#102]
                  +- Project [txn_id#0, region#1, city#2, store_id#3, product#4, sale_date#5, amount#6L, (cast(amount#6L as double) / cast(1000 as double)) AS amount_in_thousands#47]
                     +- LogicalRDD [txn_id#0, region#1, city#2, store_id#3, product#4, sale_date#5, amount#6L], false

== Analyzed Logical Plan ==


2. Identify:
Shuffles
Exchanges
Sorts

In [65]:
window_spec = Window.partitionBy("region").orderBy("amount")
df_sales.withColumn("rank", rank().over(window_spec)).show()

+------+----------+---------+------+----------+--------+-------+------+-------------------+----+
|txn_id| sale_date|sale_year|region|      city|store_id|product|amount|amount_in_thousands|rank|
+------+----------+---------+------+----------+--------+-------+------+-------------------+----+
|  T003|2024-01-03|     2024| North|Chandigarh|Store-02| Tablet| 26000|               26.0|   1|
|  T021|2024-01-06|     2024| North|     Delhi|Store-01| Tablet| 28000|               28.0|   2|
|  T017|2024-01-05|     2024| North|Chandigarh|Store-02| Mobile| 31000|               31.0|   3|
|  T002|2024-01-02|     2024| North|     Delhi|Store-01| Mobile| 32000|               32.0|   4|
|  T001|2024-01-01|     2024| North|     Delhi|Store-01| Laptop| 75000|               75.0|   5|
|  T013|2024-01-04|     2024| North|     Delhi|Store-01| Laptop| 76000|               76.0|   6|
|  T006|2024-01-03|     2024| South| Bangalore|Store-03| Tablet| 24000|               24.0|   1|
|  T022|2024-01-06|     2024| 

Shuffles / Exchanges: These refer to the movement of data between different partitions or nodes in your Spark cluster. Operations like groupBy or Window functions often require a shuffle (indicated by Exchange hashpartitioning(...)) to ensure all related data is processed together.

Sorts: These are operations where data is ordered based on one or more columns (indicated by Sort [...]). They are frequently a consequence of orderBy clauses in window functions or sorting requirements for certain join types, ensuring data is in the correct sequence for subsequent steps.


3. Explain why window functions introduce sorting

In [66]:
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, col

# Define a window specification that includes an orderBy clause
window_spec_sort_example = Window.partitionBy("region").orderBy(col("amount").desc())

# Apply a window function (rank) and explain the physical plan
df_sales.withColumn("rank_by_amount", rank().over(window_spec_sort_example)).explain(True)

== Parsed Logical Plan ==
'Project [unresolvedstarwithcolumns(rank_by_amount, 'rank() windowspecdefinition('region, 'amount DESC NULLS LAST, unspecifiedframe$()), None)]
+- Filter region#1 IN (North,South)
   +- Filter region#1 IN (North,South)
      +- Filter ((region#1 = North) OR (region#1 = South))
         +- Filter ((region#1 = North) OR (region#1 = South))
            +- Project [txn_id#0, sale_date#5, sale_year#102, region#1, city#2, store_id#3, product#4, amount#6L, amount_in_thousands#47]
               +- Project [txn_id#0, region#1, city#2, store_id#3, product#4, sale_date#5, amount#6L, amount_in_thousands#47, year(cast(sale_date#5 as date)) AS sale_year#102]
                  +- Project [txn_id#0, region#1, city#2, store_id#3, product#4, sale_date#5, amount#6L, (cast(amount#6L as double) / cast(1000 as double)) AS amount_in_thousands#47]
                     +- LogicalRDD [txn_id#0, region#1, city#2, store_id#3, product#4, sale_date#5, amount#6L], false

== Analyzed Logica

In the `Physical Plan` section of the output above, look for `Sort [...]`. This explicitly indicates where Spark has introduced a sort operation. It will typically show sorting by the partitioning key (if present) and then by the ordering keys defined in your `orderBy` clause within the window specification.