# E-Commerce Orders

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr, to_date, month, avg, max, count, dense_rank, row_number
from pyspark.sql.window import Window

# create spark session (reuses the active one if the kernel already has it)
spark = SparkSession.builder.appName("orders_analysis").getOrCreate()

# sample data: one tuple per order, positionally aligned with `columns` below
data = [
    (1, "alice", "laptop", "electronics", 2, 50000, "2023-01-15"),
    (2, "bob", "t-shirt", "clothing", 1, 800, "2023-01-20"),
    (3, "carol", "bookshelf", "furniture", 1, 7000, "2023-02-10"),
    (4, "dave", "novel", "books", 3, 300, "2023-01-05"),
    (5, "eve", "sofa", "furniture", 2, 15000, "2023-03-12"),
    (6, "frank", "tv", "electronics", 1, 40000, "2023-01-25"),
    (7, "grace", "jeans", "clothing", 4, 1200, "2023-02-18"),
    (8, "heidi", "desk", "furniture", 1, 9000, "2023-03-05"),
    (9, "ivan", "textbook", "books", 2, 500, "2023-04-10"),
    (10, "judy", "headphones", "electronics", 2, 2500, "2023-01-11"),
    (11, "mallory", "jacket", "clothing", 2, 2500, "2023-04-15"),
    (12, "oscar", "lamp", "furniture", 5, 1000, "2023-01-18"),
]

# column names, in the same order as the tuple fields above
columns = ["orderdate" if name == "orderdate" else name for name in
           ["orderid", "customername", "product", "category", "quantity", "unitprice", "orderdate"]]

# create dataframe and convert the orderdate strings to a date column in one step
df = spark.createDataFrame(data, columns).withColumn("orderdate", to_date("orderdate"))

# create local and global views
# Both registrations use the "create or replace" variant so this cell can be
# re-run in the same kernel: a plain createGlobalTempView raises once the
# view name is already registered.
df.createOrReplaceTempView("orders_local")
df.createOrReplaceGlobalTempView("orders_global")

# Part A: Local View – orders_local
## 1. List all orders placed for "Electronics" with a Quantity of 2 or more.

In [11]:
# Electronics orders in which at least two units were bought.
electronics_bulk_sql = """
    select *
    from orders_local
    where category = 'electronics' and quantity >= 2
"""
electronics_bulk_orders = spark.sql(electronics_bulk_sql)
electronics_bulk_orders.show()

+-------+------------+----------+-----------+--------+---------+----------+
|orderid|customername|   product|   category|quantity|unitprice| orderdate|
+-------+------------+----------+-----------+--------+---------+----------+
|      1|       alice|    laptop|electronics|       2|    50000|2023-01-15|
|     10|        judy|headphones|electronics|       2|     2500|2023-01-11|
+-------+------------+----------+-----------+--------+---------+----------+



## 2. Calculate TotalAmount (Quantity × UnitPrice) for each order.

In [10]:
# Every order plus a derived totalamount column (quantity * unitprice).
orders_with_total_sql = """
    select *,
           quantity * unitprice as totalamount
    from orders_local
"""
orders_with_total = spark.sql(orders_with_total_sql)
orders_with_total.show()

+-------+------------+----------+-----------+--------+---------+----------+-----------+
|orderid|customername|   product|   category|quantity|unitprice| orderdate|totalamount|
+-------+------------+----------+-----------+--------+---------+----------+-----------+
|      1|       alice|    laptop|electronics|       2|    50000|2023-01-15|     100000|
|      2|         bob|   t-shirt|   clothing|       1|      800|2023-01-20|        800|
|      3|       carol| bookshelf|  furniture|       1|     7000|2023-02-10|       7000|
|      4|        dave|     novel|      books|       3|      300|2023-01-05|        900|
|      5|         eve|      sofa|  furniture|       2|    15000|2023-03-12|      30000|
|      6|       frank|        tv|electronics|       1|    40000|2023-01-25|      40000|
|      7|       grace|     jeans|   clothing|       4|     1200|2023-02-18|       4800|
|      8|       heidi|      desk|  furniture|       1|     9000|2023-03-05|       9000|
|      9|        ivan|  textbook

## 3. Show the total number of orders per Category.

In [9]:
# Row count per category; no ORDER BY, so the display order is whatever Spark returns.
orders_per_category_sql = """
    select category,
           count(*) as total_orders
    from orders_local
    group by category
"""
orders_per_category = spark.sql(orders_per_category_sql)
orders_per_category.show()

+-----------+------------+
|   category|total_orders|
+-----------+------------+
|      books|           2|
|  furniture|           4|
|electronics|           3|
|   clothing|           3|
+-----------+------------+



## 4. List orders placed in "January 2023" only.

In [12]:
# Keep only orders whose date formats to the year-month string '2023-01'.
january_orders_sql = """
    select *
    from orders_local
    where date_format(orderdate, 'yyyy-MM') = '2023-01'
"""
january_orders = spark.sql(january_orders_sql)
january_orders.show()

+-------+------------+----------+-----------+--------+---------+----------+
|orderid|customername|   product|   category|quantity|unitprice| orderdate|
+-------+------------+----------+-----------+--------+---------+----------+
|      1|       alice|    laptop|electronics|       2|    50000|2023-01-15|
|      2|         bob|   t-shirt|   clothing|       1|      800|2023-01-20|
|      4|        dave|     novel|      books|       3|      300|2023-01-05|
|      6|       frank|        tv|electronics|       1|    40000|2023-01-25|
|     10|        judy|headphones|electronics|       2|     2500|2023-01-11|
|     12|       oscar|      lamp|  furniture|       5|     1000|2023-01-18|
+-------+------------+----------+-----------+--------+---------+----------+



## 5. Show the average UnitPrice per category.

In [13]:
# Mean unit price per category, rounded to two decimal places.
avg_price_sql = """
    select category,
           round(avg(unitprice), 2) as avg_unitprice
    from orders_local
    group by category
"""
avg_price_per_category = spark.sql(avg_price_sql)
avg_price_per_category.show()

+-----------+-------------+
|   category|avg_unitprice|
+-----------+-------------+
|      books|        400.0|
|  furniture|       8000.0|
|electronics|     30833.33|
|   clothing|       1500.0|
+-----------+-------------+



## 6. Find the order with the highest total amount.

In [14]:
# Sort by the derived total, largest first, and keep a single row.
# NOTE(review): with tied totals the row that survives `limit 1` is not pinned
# by the query; there is no tie at the top of this sample data.
top_order_sql = """
    select *,
           quantity * unitprice as totalamount
    from orders_local
    order by totalamount desc
    limit 1
"""
top_order = spark.sql(top_order_sql)
top_order.show()

+-------+------------+-------+-----------+--------+---------+----------+-----------+
|orderid|customername|product|   category|quantity|unitprice| orderdate|totalamount|
+-------+------------+-------+-----------+--------+---------+----------+-----------+
|      1|       alice| laptop|electronics|       2|    50000|2023-01-15|     100000|
+-------+------------+-------+-----------+--------+---------+----------+-----------+



## 7. Drop the local view

In [16]:
# Unregister the session-scoped view; the call's return value is the cell output.
local_view_name = "orders_local"
spark.catalog.dropTempView(local_view_name)

False

# Part B: Global View – orders_global
## 1. Display all "Furniture" orders with TotalAmount above 10,000.

In [17]:
# Furniture orders whose quantity * unitprice exceeds 10000.
# The product is repeated in the WHERE clause rather than referenced by its alias.
big_furniture_sql = """
    select *,
           quantity * unitprice as totalamount
    from global_temp.orders_global
    where category = 'furniture' and (quantity * unitprice) > 10000
"""
big_furniture_orders = spark.sql(big_furniture_sql)
big_furniture_orders.show()

+-------+------------+-------+---------+--------+---------+----------+-----------+
|orderid|customername|product| category|quantity|unitprice| orderdate|totalamount|
+-------+------------+-------+---------+--------+---------+----------+-----------+
|      5|         eve|   sofa|furniture|       2|    15000|2023-03-12|      30000|
+-------+------------+-------+---------+--------+---------+----------+-----------+



## 2. Create a column called DiscountFlag:
- Mark "Yes" if Quantity > 3
- Otherwise "No"

In [18]:
# Add discountflag: 'yes' when more than three units were ordered, else 'no'.
discount_flag_sql = """
    select *,
           case when quantity > 3 then 'yes' else 'no' end as discountflag
    from global_temp.orders_global
"""
orders_with_discount_flag = spark.sql(discount_flag_sql)
orders_with_discount_flag.show()

+-------+------------+----------+-----------+--------+---------+----------+------------+
|orderid|customername|   product|   category|quantity|unitprice| orderdate|discountflag|
+-------+------------+----------+-----------+--------+---------+----------+------------+
|      1|       alice|    laptop|electronics|       2|    50000|2023-01-15|          no|
|      2|         bob|   t-shirt|   clothing|       1|      800|2023-01-20|          no|
|      3|       carol| bookshelf|  furniture|       1|     7000|2023-02-10|          no|
|      4|        dave|     novel|      books|       3|      300|2023-01-05|          no|
|      5|         eve|      sofa|  furniture|       2|    15000|2023-03-12|          no|
|      6|       frank|        tv|electronics|       1|    40000|2023-01-25|          no|
|      7|       grace|     jeans|   clothing|       4|     1200|2023-02-18|         yes|
|      8|       heidi|      desk|  furniture|       1|     9000|2023-03-05|          no|
|      9|        ivan

## 3. List customers who ordered more than 1 product type

In [19]:
# Customers whose orders span more than one distinct category.
# NOTE(review): "product type" is read here as category — confirm that is the
# intended meaning (as opposed to distinct products).
multi_category_sql = """
    select customername
    from global_temp.orders_global
    group by customername
    having count(distinct category) > 1
"""
multi_category_customers = spark.sql(multi_category_sql)
multi_category_customers.show()

+------------+
|customername|
+------------+
+------------+



## 4. Count number of orders per month across the dataset.

In [20]:
# Order count per 'yyyy-MM' bucket, sorted by that year-month string.
orders_per_month_sql = """
    select date_format(orderdate, 'yyyy-MM') as month,
           count(*) as order_count
    from global_temp.orders_global
    group by date_format(orderdate, 'yyyy-MM')
    order by month
"""
orders_per_month = spark.sql(orders_per_month_sql)
orders_per_month.show()

+-------+-----------+
|  month|order_count|
+-------+-----------+
|2023-01|          6|
|2023-02|          2|
|2023-03|          2|
|2023-04|          2|
+-------+-----------+



## 5. Rank all products by total quantity sold across all orders using a window function.

In [22]:
# Sum quantity per product, then rank those sums with a window over the
# aggregated rows. rank() gives tied products the same rank and skips the
# following positions.
product_rank_sql = """
    select product,
           sum(quantity) as total_quantity,
           rank() over (order by sum(quantity) desc) as product_rank
    from global_temp.orders_global
    group by product
"""
product_ranking = spark.sql(product_rank_sql)
product_ranking.show()

+----------+--------------+------------+
|   product|total_quantity|product_rank|
+----------+--------------+------------+
|      lamp|             5|           1|
|     jeans|             4|           2|
|     novel|             3|           3|
|    laptop|             2|           4|
|      sofa|             2|           4|
|  textbook|             2|           4|
|    jacket|             2|           4|
|headphones|             2|           4|
| bookshelf|             1|           9|
|        tv|             1|           9|
|   t-shirt|             1|           9|
|      desk|             1|           9|
+----------+--------------+------------+



## 6. Run a query using a new SparkSession and the global view.

In [23]:
# `SparkSession.builder.getOrCreate()` hands back the session that is already
# active, so it would not demonstrate anything cross-session. `newSession()`
# returns a separate session on the same application, which is the case the
# `global_temp` database exists for.
new_spark = spark.newSession()

# The global view is resolved through the reserved `global_temp` database.
new_spark.sql("""
    select *
    from global_temp.orders_global
    where category = 'books'
""").show()

+-------+------------+--------+--------+--------+---------+----------+
|orderid|customername| product|category|quantity|unitprice| orderdate|
+-------+------------+--------+--------+--------+---------+----------+
|      4|        dave|   novel|   books|       3|      300|2023-01-05|
|      9|        ivan|textbook|   books|       2|      500|2023-04-10|
+-------+------------+--------+--------+--------+---------+----------+



# Bonus Challenges
## 1. Save a filtered subset (only "Books" category) as a new global temp view.

In [24]:
# Select the 'books' rows and publish them as the global temp view books_only.
books_only_sql = """
    select *
    from global_temp.orders_global
    where category = 'books'
"""
books_only_orders = spark.sql(books_only_sql)
books_only_orders.createOrReplaceGlobalTempView("books_only")

## 2. Find the most purchased product per category.

In [26]:
# Inner query: total units per (category, product), ranked within each category.
# Outer query: keep rank 1 only. dense_rank() keeps ties, so a category can
# return more than one product.
top_product_sql = """
    select *
    from (
        select category,
               product,
               sum(quantity) as total_sold,
               dense_rank() over (partition by category order by sum(quantity) desc) as rnk
        from global_temp.orders_global
        group by category, product
    ) where rnk = 1
"""
top_product_per_category = spark.sql(top_product_sql)
top_product_per_category.show()

+-----------+----------+----------+---+
|   category|   product|total_sold|rnk|
+-----------+----------+----------+---+
|      books|     novel|         3|  1|
|   clothing|     jeans|         4|  1|
|electronics|    laptop|         2|  1|
|electronics|headphones|         2|  1|
|  furniture|      lamp|         5|  1|
+-----------+----------+----------+---+



## 3. Create a view that excludes all "Clothing" orders and call it "filtered_orders".

In [27]:
# Everything except 'clothing', published as the global temp view filtered_orders.
non_clothing_sql = """
    select *
    from global_temp.orders_global
    where category != 'clothing'
"""
non_clothing_orders = spark.sql(non_clothing_sql)
non_clothing_orders.createOrReplaceGlobalTempView("filtered_orders")