In [1]:
!pip install pyspark



# Dataset Loading

In [3]:
sales_data = [
("T001","North","Delhi","Store-01","Laptop","2024-01-01",75000),
("T002","North","Delhi","Store-01","Mobile","2024-01-02",32000),
("T003","North","Chandigarh","Store-02","Tablet","2024-01-03",26000),
("T004","South","Bangalore","Store-03","Laptop","2024-01-01",78000),
("T005","South","Chennai","Store-04","Mobile","2024-01-02",30000),
("T006","South","Bangalore","Store-03","Tablet","2024-01-03",24000),
("T007","East","Kolkata","Store-05","Laptop","2024-01-01",72000),
("T008","East","Kolkata","Store-05","Mobile","2024-01-02",28000),
("T009","East","Patna","Store-06","Tablet","2024-01-03",23000),
("T010","West","Mumbai","Store-07","Laptop","2024-01-01",80000),
("T011","West","Mumbai","Store-07","Mobile","2024-01-02",35000),
("T012","West","Pune","Store-08","Tablet","2024-01-03",27000),
("T013","North","Delhi","Store-01","Laptop","2024-01-04",76000),
("T014","South","Chennai","Store-04","Laptop","2024-01-04",79000),
("T015","East","Patna","Store-06","Mobile","2024-01-04",29000),
("T016","West","Pune","Store-08","Laptop","2024-01-04",77000),
("T017","North","Chandigarh","Store-02","Mobile","2024-01-05",31000),
("T018","South","Bangalore","Store-03","Mobile","2024-01-05",34000),
("T019","East","Kolkata","Store-05","Tablet","2024-01-05",25000),
("T020","West","Mumbai","Store-07","Tablet","2024-01-05",29000),
("T021","North","Delhi","Store-01","Tablet","2024-01-06",28000),
("T022","South","Chennai","Store-04","Tablet","2024-01-06",26000),
("T023","East","Patna","Store-06","Laptop","2024-01-06",74000),
("T024","West","Pune","Store-08","Mobile","2024-01-06",33000)
]


In [4]:
columns = [
"txn_id","region","city","store_id",
"product","sale_date","amount"
]

In [5]:
from pyspark.sql import SparkSession
spark = SparkSession.builder \
.appName("Retail Sales Analysis") \
.getOrCreate()

# Dataset show and select

In [6]:
df_sales = spark.createDataFrame(sales_data, columns)
df_sales.show(5)
df_sales.printSchema()

+------+------+----------+--------+-------+----------+------+
|txn_id|region|      city|store_id|product| sale_date|amount|
+------+------+----------+--------+-------+----------+------+
|  T001| North|     Delhi|Store-01| Laptop|2024-01-01| 75000|
|  T002| North|     Delhi|Store-01| Mobile|2024-01-02| 32000|
|  T003| North|Chandigarh|Store-02| Tablet|2024-01-03| 26000|
|  T004| South| Bangalore|Store-03| Laptop|2024-01-01| 78000|
|  T005| South|   Chennai|Store-04| Mobile|2024-01-02| 30000|
+------+------+----------+--------+-------+----------+------+
only showing top 5 rows
root
 |-- txn_id: string (nullable = true)
 |-- region: string (nullable = true)
 |-- city: string (nullable = true)
 |-- store_id: string (nullable = true)
 |-- product: string (nullable = true)
 |-- sale_date: string (nullable = true)
 |-- amount: long (nullable = true)



# Column selection, renaming, and derived columns.

In [9]:
from pyspark.sql.functions import col
df_trial=df_sales.select(col("txn_id") , col("region") , col("product") , col("amount").alias("revenue"))
df_trial.show()

+------+------+-------+-------+
|txn_id|region|product|revenue|
+------+------+-------+-------+
|  T001| North| Laptop|  75000|
|  T002| North| Mobile|  32000|
|  T003| North| Tablet|  26000|
|  T004| South| Laptop|  78000|
|  T005| South| Mobile|  30000|
|  T006| South| Tablet|  24000|
|  T007|  East| Laptop|  72000|
|  T008|  East| Mobile|  28000|
|  T009|  East| Tablet|  23000|
|  T010|  West| Laptop|  80000|
|  T011|  West| Mobile|  35000|
|  T012|  West| Tablet|  27000|
|  T013| North| Laptop|  76000|
|  T014| South| Laptop|  79000|
|  T015|  East| Mobile|  29000|
|  T016|  West| Laptop|  77000|
|  T017| North| Mobile|  31000|
|  T018| South| Mobile|  34000|
|  T019|  East| Tablet|  25000|
|  T020|  West| Tablet|  29000|
+------+------+-------+-------+
only showing top 20 rows


In [10]:
df_trial=df_trial.withColumn("amount_in_thousands", col("revenue")/100)
df_trial.show()

+------+------+-------+-------+-------------------+
|txn_id|region|product|revenue|amount_in_thousands|
+------+------+-------+-------+-------------------+
|  T001| North| Laptop|  75000|              750.0|
|  T002| North| Mobile|  32000|              320.0|
|  T003| North| Tablet|  26000|              260.0|
|  T004| South| Laptop|  78000|              780.0|
|  T005| South| Mobile|  30000|              300.0|
|  T006| South| Tablet|  24000|              240.0|
|  T007|  East| Laptop|  72000|              720.0|
|  T008|  East| Mobile|  28000|              280.0|
|  T009|  East| Tablet|  23000|              230.0|
|  T010|  West| Laptop|  80000|              800.0|
|  T011|  West| Mobile|  35000|              350.0|
|  T012|  West| Tablet|  27000|              270.0|
|  T013| North| Laptop|  76000|              760.0|
|  T014| South| Laptop|  79000|              790.0|
|  T015|  East| Mobile|  29000|              290.0|
|  T016|  West| Laptop|  77000|              770.0|
|  T017| Nor

In [11]:
df_sales.select("region", "product").distinct().show()

+------+-------+
|region|product|
+------+-------+
| North| Laptop|
| North| Tablet|
|  East| Tablet|
|  East| Laptop|
| South| Tablet|
| North| Mobile|
|  West| Tablet|
|  East| Mobile|
| South| Mobile|
| South| Laptop|
|  West| Mobile|
|  West| Laptop|
+------+-------+



In [12]:
df_sales.drop("store_id").show()

+------+------+----------+-------+----------+------+
|txn_id|region|      city|product| sale_date|amount|
+------+------+----------+-------+----------+------+
|  T001| North|     Delhi| Laptop|2024-01-01| 75000|
|  T002| North|     Delhi| Mobile|2024-01-02| 32000|
|  T003| North|Chandigarh| Tablet|2024-01-03| 26000|
|  T004| South| Bangalore| Laptop|2024-01-01| 78000|
|  T005| South|   Chennai| Mobile|2024-01-02| 30000|
|  T006| South| Bangalore| Tablet|2024-01-03| 24000|
|  T007|  East|   Kolkata| Laptop|2024-01-01| 72000|
|  T008|  East|   Kolkata| Mobile|2024-01-02| 28000|
|  T009|  East|     Patna| Tablet|2024-01-03| 23000|
|  T010|  West|    Mumbai| Laptop|2024-01-01| 80000|
|  T011|  West|    Mumbai| Mobile|2024-01-02| 35000|
|  T012|  West|      Pune| Tablet|2024-01-03| 27000|
|  T013| North|     Delhi| Laptop|2024-01-04| 76000|
|  T014| South|   Chennai| Laptop|2024-01-04| 79000|
|  T015|  East|     Patna| Mobile|2024-01-04| 29000|
|  T016|  West|      Pune| Laptop|2024-01-04| 

In [13]:
from pyspark.sql.functions import year
df_sales.withColumn("sale_year", year(col("sale_date"))).show()

+------+------+----------+--------+-------+----------+------+---------+
|txn_id|region|      city|store_id|product| sale_date|amount|sale_year|
+------+------+----------+--------+-------+----------+------+---------+
|  T001| North|     Delhi|Store-01| Laptop|2024-01-01| 75000|     2024|
|  T002| North|     Delhi|Store-01| Mobile|2024-01-02| 32000|     2024|
|  T003| North|Chandigarh|Store-02| Tablet|2024-01-03| 26000|     2024|
|  T004| South| Bangalore|Store-03| Laptop|2024-01-01| 78000|     2024|
|  T005| South|   Chennai|Store-04| Mobile|2024-01-02| 30000|     2024|
|  T006| South| Bangalore|Store-03| Tablet|2024-01-03| 24000|     2024|
|  T007|  East|   Kolkata|Store-05| Laptop|2024-01-01| 72000|     2024|
|  T008|  East|   Kolkata|Store-05| Mobile|2024-01-02| 28000|     2024|
|  T009|  East|     Patna|Store-06| Tablet|2024-01-03| 23000|     2024|
|  T010|  West|    Mumbai|Store-07| Laptop|2024-01-01| 80000|     2024|
|  T011|  West|    Mumbai|Store-07| Mobile|2024-01-02| 35000|   

# Business Friendly Format

In [15]:
df_sales.withColumnRenamed("amount", "revenue").withColumn("sale_year", year(col("sale_date"))).select("txn_id", "sale_date", "sale_year", "region", "city", "product", "revenue").withColumn("amount_in_thousands", col("revenue") / 1000).show()


+------+----------+---------+------+----------+-------+-------+-------------------+
|txn_id| sale_date|sale_year|region|      city|product|revenue|amount_in_thousands|
+------+----------+---------+------+----------+-------+-------+-------------------+
|  T001|2024-01-01|     2024| North|     Delhi| Laptop|  75000|               75.0|
|  T002|2024-01-02|     2024| North|     Delhi| Mobile|  32000|               32.0|
|  T003|2024-01-03|     2024| North|Chandigarh| Tablet|  26000|               26.0|
|  T004|2024-01-01|     2024| South| Bangalore| Laptop|  78000|               78.0|
|  T005|2024-01-02|     2024| South|   Chennai| Mobile|  30000|               30.0|
|  T006|2024-01-03|     2024| South| Bangalore| Tablet|  24000|               24.0|
|  T007|2024-01-01|     2024|  East|   Kolkata| Laptop|  72000|               72.0|
|  T008|2024-01-02|     2024|  East|   Kolkata| Mobile|  28000|               28.0|
|  T009|2024-01-03|     2024|  East|     Patna| Tablet|  23000|             

# Row-level filtering and Predicate pushdown.

## Filter transactions where amount > 50000

In [16]:
df_sales.filter(col("amount") > 50000).show()

+------+------+---------+--------+-------+----------+------+
|txn_id|region|     city|store_id|product| sale_date|amount|
+------+------+---------+--------+-------+----------+------+
|  T001| North|    Delhi|Store-01| Laptop|2024-01-01| 75000|
|  T004| South|Bangalore|Store-03| Laptop|2024-01-01| 78000|
|  T007|  East|  Kolkata|Store-05| Laptop|2024-01-01| 72000|
|  T010|  West|   Mumbai|Store-07| Laptop|2024-01-01| 80000|
|  T013| North|    Delhi|Store-01| Laptop|2024-01-04| 76000|
|  T014| South|  Chennai|Store-04| Laptop|2024-01-04| 79000|
|  T016|  West|     Pune|Store-08| Laptop|2024-01-04| 77000|
|  T023|  East|    Patna|Store-06| Laptop|2024-01-06| 74000|
+------+------+---------+--------+-------+----------+------+



## Filter only Laptop sales

In [17]:
df_sales.filter(df_sales.product=="Laptop").show()

+------+------+---------+--------+-------+----------+------+
|txn_id|region|     city|store_id|product| sale_date|amount|
+------+------+---------+--------+-------+----------+------+
|  T001| North|    Delhi|Store-01| Laptop|2024-01-01| 75000|
|  T004| South|Bangalore|Store-03| Laptop|2024-01-01| 78000|
|  T007|  East|  Kolkata|Store-05| Laptop|2024-01-01| 72000|
|  T010|  West|   Mumbai|Store-07| Laptop|2024-01-01| 80000|
|  T013| North|    Delhi|Store-01| Laptop|2024-01-04| 76000|
|  T014| South|  Chennai|Store-04| Laptop|2024-01-04| 79000|
|  T016|  West|     Pune|Store-08| Laptop|2024-01-04| 77000|
|  T023|  East|    Patna|Store-06| Laptop|2024-01-06| 74000|
+------+------+---------+--------+-------+----------+------+



## Filter sales from North and South regions

In [22]:
df_sales.filter((df_sales.region=="North") | (df_sales.region=="South")).show()

+------+------+----------+--------+-------+----------+------+
|txn_id|region|      city|store_id|product| sale_date|amount|
+------+------+----------+--------+-------+----------+------+
|  T001| North|     Delhi|Store-01| Laptop|2024-01-01| 75000|
|  T002| North|     Delhi|Store-01| Mobile|2024-01-02| 32000|
|  T003| North|Chandigarh|Store-02| Tablet|2024-01-03| 26000|
|  T004| South| Bangalore|Store-03| Laptop|2024-01-01| 78000|
|  T005| South|   Chennai|Store-04| Mobile|2024-01-02| 30000|
|  T006| South| Bangalore|Store-03| Tablet|2024-01-03| 24000|
|  T013| North|     Delhi|Store-01| Laptop|2024-01-04| 76000|
|  T014| South|   Chennai|Store-04| Laptop|2024-01-04| 79000|
|  T017| North|Chandigarh|Store-02| Mobile|2024-01-05| 31000|
|  T018| South| Bangalore|Store-03| Mobile|2024-01-05| 34000|
|  T021| North|     Delhi|Store-01| Tablet|2024-01-06| 28000|
|  T022| South|   Chennai|Store-04| Tablet|2024-01-06| 26000|
+------+------+----------+--------+-------+----------+------+



## Filter sales between 25000 and 75000

In [23]:
df_sales.filter((df_sales.amount>=25000) & (df_sales.amount<=75000)).show()

+------+------+----------+--------+-------+----------+------+
|txn_id|region|      city|store_id|product| sale_date|amount|
+------+------+----------+--------+-------+----------+------+
|  T001| North|     Delhi|Store-01| Laptop|2024-01-01| 75000|
|  T002| North|     Delhi|Store-01| Mobile|2024-01-02| 32000|
|  T003| North|Chandigarh|Store-02| Tablet|2024-01-03| 26000|
|  T005| South|   Chennai|Store-04| Mobile|2024-01-02| 30000|
|  T007|  East|   Kolkata|Store-05| Laptop|2024-01-01| 72000|
|  T008|  East|   Kolkata|Store-05| Mobile|2024-01-02| 28000|
|  T011|  West|    Mumbai|Store-07| Mobile|2024-01-02| 35000|
|  T012|  West|      Pune|Store-08| Tablet|2024-01-03| 27000|
|  T015|  East|     Patna|Store-06| Mobile|2024-01-04| 29000|
|  T017| North|Chandigarh|Store-02| Mobile|2024-01-05| 31000|
|  T018| South| Bangalore|Store-03| Mobile|2024-01-05| 34000|
|  T019|  East|   Kolkata|Store-05| Tablet|2024-01-05| 25000|
|  T020|  West|    Mumbai|Store-07| Tablet|2024-01-05| 29000|
|  T021|

## Filter transactions from Delhi stores only

In [24]:
df_sales.filter(df_sales.city=="Delhi").show()

+------+------+-----+--------+-------+----------+------+
|txn_id|region| city|store_id|product| sale_date|amount|
+------+------+-----+--------+-------+----------+------+
|  T001| North|Delhi|Store-01| Laptop|2024-01-01| 75000|
|  T002| North|Delhi|Store-01| Mobile|2024-01-02| 32000|
|  T013| North|Delhi|Store-01| Laptop|2024-01-04| 76000|
|  T021| North|Delhi|Store-01| Tablet|2024-01-06| 28000|
+------+------+-----+--------+-------+----------+------+



## Apply multiple filters using where and filter

In [25]:
df_sales.filter(col("product") == "Laptop") \
        .where(col("amount") > 50000) \
        .filter(col("region") == "North") \
        .show()

+------+------+-----+--------+-------+----------+------+
|txn_id|region| city|store_id|product| sale_date|amount|
+------+------+-----+--------+-------+----------+------+
|  T001| North|Delhi|Store-01| Laptop|2024-01-01| 75000|
|  T013| North|Delhi|Store-01| Laptop|2024-01-04| 76000|
+------+------+-----+--------+-------+----------+------+



## Change the order of filters and compare

In [26]:
df_sales.filter(col("region") == "North") \
        .filter(col("product") == "Laptop") \
        .filter(col("amount") > 50000) \
        .explain(True)

== Parsed Logical Plan ==
'Filter '`>`('amount, 50000)
+- Filter (product#4 = Laptop)
   +- Filter (region#1 = North)
      +- LogicalRDD [txn_id#0, region#1, city#2, store_id#3, product#4, sale_date#5, amount#6L], false

== Analyzed Logical Plan ==
txn_id: string, region: string, city: string, store_id: string, product: string, sale_date: string, amount: bigint
Filter (amount#6L > cast(50000 as bigint))
+- Filter (product#4 = Laptop)
   +- Filter (region#1 = North)
      +- LogicalRDD [txn_id#0, region#1, city#2, store_id#3, product#4, sale_date#5, amount#6L], false

== Optimized Logical Plan ==
Filter (((isnotnull(region#1) AND isnotnull(product#4)) AND isnotnull(amount#6L)) AND ((region#1 = North) AND ((product#4 = Laptop) AND (amount#6L > 50000))))
+- LogicalRDD [txn_id#0, region#1, city#2, store_id#3, product#4, sale_date#5, amount#6L], false

== Physical Plan ==
*(1) Filter (((isnotnull(region#1) AND isnotnull(product#4)) AND isnotnull(amount#6L)) AND ((region#1 = North) AND ((pr

## Total sales amount per region



In [27]:
df_sales.groupBy("region").sum("amount").withColumnRenamed("sum(amount)", "total_sales").show()

+------+-----------+
|region|total_sales|
+------+-----------+
| South|     271000|
|  East|     251000|
|  West|     281000|
| North|     268000|
+------+-----------+



## Average sales amount per product


In [28]:
df_sales.groupBy("product").avg("amount").withColumnRenamed("avg(amount)", "avg_sales").show()

+-------+---------+
|product|avg_sales|
+-------+---------+
| Laptop|  76375.0|
| Mobile|  31500.0|
| Tablet|  26000.0|
+-------+---------+



## Maximum sale per city


In [29]:
df_sales.groupBy("city").max("amount").withColumnRenamed("max(amount)", "max_sale").show()

+----------+--------+
|      city|max_sale|
+----------+--------+
| Bangalore|   78000|
|     Patna|   74000|
|   Chennai|   79000|
|    Mumbai|   80000|
|   Kolkata|   72000|
|      Pune|   77000|
|     Delhi|   76000|
|Chandigarh|   31000|
+----------+--------+



## Minimum sale per store

In [30]:
df_sales.groupBy("store_id").min("amount").withColumnRenamed("min(amount)", "min_sale").show()

+--------+--------+
|store_id|min_sale|
+--------+--------+
|Store-05|   25000|
|Store-06|   23000|
|Store-03|   24000|
|Store-01|   28000|
|Store-04|   26000|
|Store-07|   29000|
|Store-08|   27000|
|Store-02|   26000|
+--------+--------+



## Count of transactions per region

In [31]:
df_sales.groupBy("region").count().withColumnRenamed("count", "txn_count").show()

+------+---------+
|region|txn_count|
+------+---------+
| South|        6|
|  East|        6|
|  West|        6|
| North|        6|
+------+---------+



## Total revenue per store

In [32]:
df_sales.groupBy("store_id").sum("amount").withColumnRenamed("sum(amount)", "total_revenue").show()

+--------+-------------+
|store_id|total_revenue|
+--------+-------------+
|Store-05|       125000|
|Store-06|       126000|
|Store-03|       136000|
|Store-01|       211000|
|Store-04|       135000|
|Store-07|       144000|
|Store-08|       137000|
|Store-02|        57000|
+--------+-------------+



## Region-wise product sales count

In [33]:
df_sales.groupBy("region", "product").count().withColumnRenamed("count", "sales_count").show()

+------+-------+-----------+
|region|product|sales_count|
+------+-------+-----------+
| North| Laptop|          2|
| North| Tablet|          2|
|  East| Tablet|          2|
|  East| Laptop|          2|
| South| Tablet|          2|
| North| Mobile|          2|
|  West| Tablet|          2|
|  East| Mobile|          2|
| South| Mobile|          2|
| South| Laptop|          2|
|  West| Mobile|          2|
|  West| Laptop|          2|
+------+-------+-----------+



## Average transaction value per city

In [34]:
df_sales.groupBy("city").avg("amount").withColumnRenamed("avg(amount)", "avg_txn_value").show()

+----------+------------------+
|      city|     avg_txn_value|
+----------+------------------+
| Bangalore|45333.333333333336|
|     Patna|           42000.0|
|   Chennai|           45000.0|
|    Mumbai|           48000.0|
|   Kolkata|41666.666666666664|
|      Pune|45666.666666666664|
|     Delhi|           52750.0|
|Chandigarh|           28500.0|
+----------+------------------+



## Identify regions with total sales above a threshold

In [35]:
df_sales.groupBy("region").sum("amount") \
    .withColumnRenamed("sum(amount)", "total_sales") \
    .filter(col("total_sales") > 150000).show()

+------+-----------+
|region|total_sales|
+------+-----------+
| South|     271000|
|  East|     251000|
|  West|     281000|
| North|     268000|
+------+-----------+



## Use explain(True) and identify shuffle stages

In [36]:
df_sales.groupBy("region").sum("amount").explain(True)

== Parsed Logical Plan ==
'Aggregate ['region], ['region, unresolvedalias('sum(amount#6L))]
+- LogicalRDD [txn_id#0, region#1, city#2, store_id#3, product#4, sale_date#5, amount#6L], false

== Analyzed Logical Plan ==
region: string, sum(amount): bigint
Aggregate [region#1], [region#1, sum(amount#6L) AS sum(amount)#517L]
+- LogicalRDD [txn_id#0, region#1, city#2, store_id#3, product#4, sale_date#5, amount#6L], false

== Optimized Logical Plan ==
Aggregate [region#1], [region#1, sum(amount#6L) AS sum(amount)#517L]
+- Project [region#1, amount#6L]
   +- LogicalRDD [txn_id#0, region#1, city#2, store_id#3, product#4, sale_date#5, amount#6L], false

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[region#1], functions=[sum(amount#6L)], output=[region#1, sum(amount)#517L])
   +- Exchange hashpartitioning(region#1, 200), ENSURE_REQUIREMENTS, [plan_id=700]
      +- HashAggregate(keys=[region#1], functions=[partial_sum(amount#6L)], output=[region#1, sum#519L])
    

# Region + Product wise total sale

In [37]:
df_sales.groupBy("region", "product") \
        .sum("amount") \
        .withColumnRenamed("sum(amount)", "total_sales") \
        .show()

+------+-------+-----------+
|region|product|total_sales|
+------+-------+-----------+
| North| Laptop|     151000|
| North| Tablet|      54000|
|  East| Tablet|      48000|
|  East| Laptop|     146000|
| South| Tablet|      50000|
| North| Mobile|      63000|
|  West| Tablet|      56000|
|  East| Mobile|      57000|
| South| Mobile|      64000|
| South| Laptop|     157000|
|  West| Mobile|      68000|
|  West| Laptop|     157000|
+------+-------+-----------+



# City + Store wise average sales

In [38]:
df_sales.groupBy("city", "store_id") \
        .avg("amount") \
        .withColumnRenamed("avg(amount)", "avg_sales") \
        .show()

+----------+--------+------------------+
|      city|store_id|         avg_sales|
+----------+--------+------------------+
| Bangalore|Store-03|45333.333333333336|
|     Patna|Store-06|           42000.0|
|   Chennai|Store-04|           45000.0|
|      Pune|Store-08|45666.666666666664|
|Chandigarh|Store-02|           28500.0|
|   Kolkata|Store-05|41666.666666666664|
|    Mumbai|Store-07|           48000.0|
|     Delhi|Store-01|           52750.0|
+----------+--------+------------------+



# Region + City wise transaction coun

In [39]:
df_sales.groupBy("region", "city") \
        .count() \
        .withColumnRenamed("count", "txn_count") \
        .show()

+------+----------+---------+
|region|      city|txn_count|
+------+----------+---------+
|  West|    Mumbai|        3|
| South| Bangalore|        3|
| North|     Delhi|        4|
| North|Chandigarh|        2|
| South|   Chennai|        3|
|  West|      Pune|        3|
|  East|   Kolkata|        3|
|  East|     Patna|        3|
+------+----------+---------+



# Product + Store wise max sal

In [40]:
df_sales.groupBy("product", "store_id") \
        .max("amount") \
        .withColumnRenamed("max(amount)", "max_sale") \
        .show()

+-------+--------+--------+
|product|store_id|max_sale|
+-------+--------+--------+
| Tablet|Store-06|   23000|
| Laptop|Store-07|   80000|
| Laptop|Store-01|   76000|
| Tablet|Store-02|   26000|
| Mobile|Store-01|   32000|
| Laptop|Store-03|   78000|
| Tablet|Store-08|   27000|
| Tablet|Store-03|   24000|
| Mobile|Store-04|   30000|
| Mobile|Store-07|   35000|
| Mobile|Store-05|   28000|
| Laptop|Store-05|   72000|
| Tablet|Store-01|   28000|
| Tablet|Store-07|   29000|
| Laptop|Store-08|   77000|
| Mobile|Store-08|   33000|
| Laptop|Store-04|   79000|
| Tablet|Store-05|   25000|
| Tablet|Store-04|   26000|
| Laptop|Store-06|   74000|
+-------+--------+--------+
only showing top 20 rows


# Identify top-selling product per region

In [41]:
from pyspark.sql import Window
from pyspark.sql.functions import sum, row_number

df_region_product = df_sales.groupBy("region", "product") \
                            .sum("amount") \
                            .withColumnRenamed("sum(amount)", "total_sales")

windowSpec = Window.partitionBy("region").orderBy(col("total_sales").desc())

df_top_product = df_region_product.withColumn("rank", row_number().over(windowSpec)) \
                                  .filter(col("rank") == 1) \
                                  .drop("rank")

df_top_product.show()

+------+-------+-----------+
|region|product|total_sales|
+------+-------+-----------+
|  East| Laptop|     146000|
| North| Laptop|     151000|
| South| Laptop|     157000|
|  West| Laptop|     157000|
+------+-------+-----------+



# Window Functions

In [42]:
from pyspark.sql.window import Window
from pyspark.sql.functions import sum, rank, row_number, dense_rank, first, last
from pyspark.sql.functions import col

## Compute running total of sales per region ordered by date

In [43]:
windowSpec = Window.partitionBy("region").orderBy("sale_date") \
                   .rowsBetween(Window.unboundedPreceding, Window.currentRow)

df_running_total = df_sales.withColumn("running_total",
                                       sum("amount").over(windowSpec))
df_running_total.show()

+------+------+----------+--------+-------+----------+------+-------------+
|txn_id|region|      city|store_id|product| sale_date|amount|running_total|
+------+------+----------+--------+-------+----------+------+-------------+
|  T007|  East|   Kolkata|Store-05| Laptop|2024-01-01| 72000|        72000|
|  T008|  East|   Kolkata|Store-05| Mobile|2024-01-02| 28000|       100000|
|  T009|  East|     Patna|Store-06| Tablet|2024-01-03| 23000|       123000|
|  T015|  East|     Patna|Store-06| Mobile|2024-01-04| 29000|       152000|
|  T019|  East|   Kolkata|Store-05| Tablet|2024-01-05| 25000|       177000|
|  T023|  East|     Patna|Store-06| Laptop|2024-01-06| 74000|       251000|
|  T001| North|     Delhi|Store-01| Laptop|2024-01-01| 75000|        75000|
|  T002| North|     Delhi|Store-01| Mobile|2024-01-02| 32000|       107000|
|  T003| North|Chandigarh|Store-02| Tablet|2024-01-03| 26000|       133000|
|  T013| North|     Delhi|Store-01| Laptop|2024-01-04| 76000|       209000|
|  T017| Nor

## Rank transactions by amount within each region

In [44]:
windowSpec = Window.partitionBy("region").orderBy(col("amount").desc())

df_ranked = df_sales.withColumn("rank_in_region", rank().over(windowSpec))
df_ranked.show()

+------+------+----------+--------+-------+----------+------+--------------+
|txn_id|region|      city|store_id|product| sale_date|amount|rank_in_region|
+------+------+----------+--------+-------+----------+------+--------------+
|  T023|  East|     Patna|Store-06| Laptop|2024-01-06| 74000|             1|
|  T007|  East|   Kolkata|Store-05| Laptop|2024-01-01| 72000|             2|
|  T015|  East|     Patna|Store-06| Mobile|2024-01-04| 29000|             3|
|  T008|  East|   Kolkata|Store-05| Mobile|2024-01-02| 28000|             4|
|  T019|  East|   Kolkata|Store-05| Tablet|2024-01-05| 25000|             5|
|  T009|  East|     Patna|Store-06| Tablet|2024-01-03| 23000|             6|
|  T013| North|     Delhi|Store-01| Laptop|2024-01-04| 76000|             1|
|  T001| North|     Delhi|Store-01| Laptop|2024-01-01| 75000|             2|
|  T002| North|     Delhi|Store-01| Mobile|2024-01-02| 32000|             3|
|  T017| North|Chandigarh|Store-02| Mobile|2024-01-05| 31000|             4|

## Assign row numbers per store ordered by sale amount

In [45]:
windowSpec = Window.partitionBy("store_id").orderBy(col("amount").desc())

df_rownum = df_sales.withColumn("row_number_in_store", row_number().over(windowSpec))
df_rownum.show()

+------+------+----------+--------+-------+----------+------+-------------------+
|txn_id|region|      city|store_id|product| sale_date|amount|row_number_in_store|
+------+------+----------+--------+-------+----------+------+-------------------+
|  T013| North|     Delhi|Store-01| Laptop|2024-01-04| 76000|                  1|
|  T001| North|     Delhi|Store-01| Laptop|2024-01-01| 75000|                  2|
|  T002| North|     Delhi|Store-01| Mobile|2024-01-02| 32000|                  3|
|  T021| North|     Delhi|Store-01| Tablet|2024-01-06| 28000|                  4|
|  T017| North|Chandigarh|Store-02| Mobile|2024-01-05| 31000|                  1|
|  T003| North|Chandigarh|Store-02| Tablet|2024-01-03| 26000|                  2|
|  T004| South| Bangalore|Store-03| Laptop|2024-01-01| 78000|                  1|
|  T018| South| Bangalore|Store-03| Mobile|2024-01-05| 34000|                  2|
|  T006| South| Bangalore|Store-03| Tablet|2024-01-03| 24000|                  3|
|  T014| South| 

## Use dense rank to rank products per region

In [46]:
windowSpec = Window.partitionBy("region").orderBy(col("amount").desc())

df_dense_rank = df_sales.withColumn("dense_rank_in_region", dense_rank().over(windowSpec))
df_dense_rank.show()

+------+------+----------+--------+-------+----------+------+--------------------+
|txn_id|region|      city|store_id|product| sale_date|amount|dense_rank_in_region|
+------+------+----------+--------+-------+----------+------+--------------------+
|  T023|  East|     Patna|Store-06| Laptop|2024-01-06| 74000|                   1|
|  T007|  East|   Kolkata|Store-05| Laptop|2024-01-01| 72000|                   2|
|  T015|  East|     Patna|Store-06| Mobile|2024-01-04| 29000|                   3|
|  T008|  East|   Kolkata|Store-05| Mobile|2024-01-02| 28000|                   4|
|  T019|  East|   Kolkata|Store-05| Tablet|2024-01-05| 25000|                   5|
|  T009|  East|     Patna|Store-06| Tablet|2024-01-03| 23000|                   6|
|  T013| North|     Delhi|Store-01| Laptop|2024-01-04| 76000|                   1|
|  T001| North|     Delhi|Store-01| Laptop|2024-01-01| 75000|                   2|
|  T002| North|     Delhi|Store-01| Mobile|2024-01-02| 32000|                   3|
|  T

## Identify top 2 highest sales per region using window functions

In [47]:
windowSpec = Window.partitionBy("region").orderBy(col("amount").desc())

df_top2 = df_sales.withColumn("rank", row_number().over(windowSpec)) \
                  .filter(col("rank") <= 2)
df_top2.show()

+------+------+---------+--------+-------+----------+------+----+
|txn_id|region|     city|store_id|product| sale_date|amount|rank|
+------+------+---------+--------+-------+----------+------+----+
|  T023|  East|    Patna|Store-06| Laptop|2024-01-06| 74000|   1|
|  T007|  East|  Kolkata|Store-05| Laptop|2024-01-01| 72000|   2|
|  T013| North|    Delhi|Store-01| Laptop|2024-01-04| 76000|   1|
|  T001| North|    Delhi|Store-01| Laptop|2024-01-01| 75000|   2|
|  T014| South|  Chennai|Store-04| Laptop|2024-01-04| 79000|   1|
|  T004| South|Bangalore|Store-03| Laptop|2024-01-01| 78000|   2|
|  T010|  West|   Mumbai|Store-07| Laptop|2024-01-01| 80000|   1|
|  T016|  West|     Pune|Store-08| Laptop|2024-01-04| 77000|   2|
+------+------+---------+--------+-------+----------+------+----+



## Compare rank vs dense_rank outpu

In [48]:
windowSpec = Window.partitionBy("region").orderBy(col("amount").desc())

df_compare = df_sales.withColumn("rank", rank().over(windowSpec)) \
                     .withColumn("dense_rank", dense_rank().over(windowSpec))
df_compare.show()

+------+------+----------+--------+-------+----------+------+----+----------+
|txn_id|region|      city|store_id|product| sale_date|amount|rank|dense_rank|
+------+------+----------+--------+-------+----------+------+----+----------+
|  T023|  East|     Patna|Store-06| Laptop|2024-01-06| 74000|   1|         1|
|  T007|  East|   Kolkata|Store-05| Laptop|2024-01-01| 72000|   2|         2|
|  T015|  East|     Patna|Store-06| Mobile|2024-01-04| 29000|   3|         3|
|  T008|  East|   Kolkata|Store-05| Mobile|2024-01-02| 28000|   4|         4|
|  T019|  East|   Kolkata|Store-05| Tablet|2024-01-05| 25000|   5|         5|
|  T009|  East|     Patna|Store-06| Tablet|2024-01-03| 23000|   6|         6|
|  T013| North|     Delhi|Store-01| Laptop|2024-01-04| 76000|   1|         1|
|  T001| North|     Delhi|Store-01| Laptop|2024-01-01| 75000|   2|         2|
|  T002| North|     Delhi|Store-01| Mobile|2024-01-02| 32000|   3|         3|
|  T017| North|Chandigarh|Store-02| Mobile|2024-01-05| 31000|   

## Calculate cumulative sales per stor

In [49]:
windowSpec = Window.partitionBy("store_id").orderBy("sale_date") \
                   .rowsBetween(Window.unboundedPreceding, Window.currentRow)

df_cumulative = df_sales.withColumn("cumulative_sales",
                                    sum("amount").over(windowSpec))
df_cumulative.show()

+------+------+----------+--------+-------+----------+------+----------------+
|txn_id|region|      city|store_id|product| sale_date|amount|cumulative_sales|
+------+------+----------+--------+-------+----------+------+----------------+
|  T001| North|     Delhi|Store-01| Laptop|2024-01-01| 75000|           75000|
|  T002| North|     Delhi|Store-01| Mobile|2024-01-02| 32000|          107000|
|  T013| North|     Delhi|Store-01| Laptop|2024-01-04| 76000|          183000|
|  T021| North|     Delhi|Store-01| Tablet|2024-01-06| 28000|          211000|
|  T003| North|Chandigarh|Store-02| Tablet|2024-01-03| 26000|           26000|
|  T017| North|Chandigarh|Store-02| Mobile|2024-01-05| 31000|           57000|
|  T004| South| Bangalore|Store-03| Laptop|2024-01-01| 78000|           78000|
|  T006| South| Bangalore|Store-03| Tablet|2024-01-03| 24000|          102000|
|  T018| South| Bangalore|Store-03| Mobile|2024-01-05| 34000|          136000|
|  T005| South|   Chennai|Store-04| Mobile|2024-01-0

## Identify first and last transaction per city using windows

In [50]:
windowSpec = Window.partitionBy("city").orderBy("sale_date")

df_first_last = df_sales.withColumn("first_txn", first("txn_id").over(windowSpec)) \
                        .withColumn("last_txn", last("txn_id").over(windowSpec))
df_first_last.show()

+------+------+----------+--------+-------+----------+------+---------+--------+
|txn_id|region|      city|store_id|product| sale_date|amount|first_txn|last_txn|
+------+------+----------+--------+-------+----------+------+---------+--------+
|  T004| South| Bangalore|Store-03| Laptop|2024-01-01| 78000|     T004|    T004|
|  T006| South| Bangalore|Store-03| Tablet|2024-01-03| 24000|     T004|    T006|
|  T018| South| Bangalore|Store-03| Mobile|2024-01-05| 34000|     T004|    T018|
|  T003| North|Chandigarh|Store-02| Tablet|2024-01-03| 26000|     T003|    T003|
|  T017| North|Chandigarh|Store-02| Mobile|2024-01-05| 31000|     T003|    T017|
|  T005| South|   Chennai|Store-04| Mobile|2024-01-02| 30000|     T005|    T005|
|  T014| South|   Chennai|Store-04| Laptop|2024-01-04| 79000|     T005|    T014|
|  T022| South|   Chennai|Store-04| Tablet|2024-01-06| 26000|     T005|    T022|
|  T001| North|     Delhi|Store-01| Laptop|2024-01-01| 75000|     T001|    T001|
|  T002| North|     Delhi|St

# DAG & PERFORMANCE OBSERVATION

## Simple Select

In [51]:
df_sales.select("txn_id", "region").explain(True)

== Parsed Logical Plan ==
'Project ['txn_id, 'region]
+- LogicalRDD [txn_id#0, region#1, city#2, store_id#3, product#4, sale_date#5, amount#6L], false

== Analyzed Logical Plan ==
txn_id: string, region: string
Project [txn_id#0, region#1]
+- LogicalRDD [txn_id#0, region#1, city#2, store_id#3, product#4, sale_date#5, amount#6L], false

== Optimized Logical Plan ==
Project [txn_id#0, region#1]
+- LogicalRDD [txn_id#0, region#1, city#2, store_id#3, product#4, sale_date#5, amount#6L], false

== Physical Plan ==
*(1) Project [txn_id#0, region#1]
+- *(1) Scan ExistingRDD[txn_id#0,region#1,city#2,store_id#3,product#4,sale_date#5,amount#6L]



## Filter


In [52]:
df_sales.filter(col("amount") > 50000).explain(True)

== Parsed Logical Plan ==
'Filter '`>`('amount, 50000)
+- LogicalRDD [txn_id#0, region#1, city#2, store_id#3, product#4, sale_date#5, amount#6L], false

== Analyzed Logical Plan ==
txn_id: string, region: string, city: string, store_id: string, product: string, sale_date: string, amount: bigint
Filter (amount#6L > cast(50000 as bigint))
+- LogicalRDD [txn_id#0, region#1, city#2, store_id#3, product#4, sale_date#5, amount#6L], false

== Optimized Logical Plan ==
Filter (isnotnull(amount#6L) AND (amount#6L > 50000))
+- LogicalRDD [txn_id#0, region#1, city#2, store_id#3, product#4, sale_date#5, amount#6L], false

== Physical Plan ==
*(1) Filter (isnotnull(amount#6L) AND (amount#6L > 50000))
+- *(1) Scan ExistingRDD[txn_id#0,region#1,city#2,store_id#3,product#4,sale_date#5,amount#6L]



## Group-By

In [53]:
df_sales.groupBy("region").sum("amount").explain(True)

== Parsed Logical Plan ==
'Aggregate ['region], ['region, unresolvedalias('sum(amount#6L))]
+- LogicalRDD [txn_id#0, region#1, city#2, store_id#3, product#4, sale_date#5, amount#6L], false

== Analyzed Logical Plan ==
region: string, sum(amount): bigint
Aggregate [region#1], [region#1, sum(amount#6L) AS sum(amount)#909L]
+- LogicalRDD [txn_id#0, region#1, city#2, store_id#3, product#4, sale_date#5, amount#6L], false

== Optimized Logical Plan ==
Aggregate [region#1], [region#1, sum(amount#6L) AS sum(amount)#909L]
+- Project [region#1, amount#6L]
   +- LogicalRDD [txn_id#0, region#1, city#2, store_id#3, product#4, sale_date#5, amount#6L], false

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[region#1], functions=[sum(amount#6L)], output=[region#1, sum(amount)#909L])
   +- Exchange hashpartitioning(region#1, 200), ENSURE_REQUIREMENTS, [plan_id=1601]
      +- HashAggregate(keys=[region#1], functions=[partial_sum(amount#6L)], output=[region#1, sum#911L])
   

## Window Function

In [54]:
from pyspark.sql.window import Window
from pyspark.sql.functions import sum

windowSpec = Window.partitionBy("region").orderBy("sale_date") \
                   .rowsBetween(Window.unboundedPreceding, Window.currentRow)

df_sales.withColumn("running_total", sum("amount").over(windowSpec)).explain(True)

== Parsed Logical Plan ==
'Project [unresolvedstarwithcolumns(running_total, 'sum('amount) windowspecdefinition('region, 'sale_date ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())), None)]
+- LogicalRDD [txn_id#0, region#1, city#2, store_id#3, product#4, sale_date#5, amount#6L], false

== Analyzed Logical Plan ==
txn_id: string, region: string, city: string, store_id: string, product: string, sale_date: string, amount: bigint, running_total: bigint
Project [txn_id#0, region#1, city#2, store_id#3, product#4, sale_date#5, amount#6L, running_total#912L]
+- Project [txn_id#0, region#1, city#2, store_id#3, product#4, sale_date#5, amount#6L, running_total#912L, running_total#912L]
   +- Window [sum(amount#6L) windowspecdefinition(region#1, sale_date#5 ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS running_total#912L], [region#1], [sale_date#5 ASC NULLS FIRST]
      +- Project [txn_id#0, region#1, city#2, store_i