In [5]:
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .appName("ProductSalesAnalysis") \
    .getOrCreate()
spark


In [76]:
csv_data="""
OrderID,Product,Category,Quantity,UnitPrice,Region
1001,Mobile,Electronics,2,15000,North
1002,Laptop,Electronics,1,55000,South
1003,T-Shirt,Apparel,3,500,East
1004,Jeans,Apparel,2,1200,North
1005,TV,Electronics,1,40000,West
1006,Shoes,Footwear,4,2000,South
1007,Watch,Accessories,2,3000,East
1008,Headphones,Electronics,3,2500,North"""
with open("sales.csv", "w") as f:
    f.write(csv_data)

In [77]:
df=spark.read.csv("sales.csv",header=True,inferSchema=True)
df.printSchema()
df.show(5)

root
 |-- OrderID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- UnitPrice: integer (nullable = true)
 |-- Region : string (nullable = true)

+-------+-------+-----------+--------+---------+-------+
|OrderID|Product|   Category|Quantity|UnitPrice|Region |
+-------+-------+-----------+--------+---------+-------+
|   1001| Mobile|Electronics|       2|    15000|  North|
|   1002| Laptop|Electronics|       1|    55000|  South|
|   1003|T-Shirt|    Apparel|       3|      500|   East|
|   1004|  Jeans|    Apparel|       2|     1200|  North|
|   1005|     TV|Electronics|       1|    40000|   West|
+-------+-------+-----------+--------+---------+-------+
only showing top 5 rows



In [78]:
from pyspark.sql.functions import col
df=df.withColumn("TotalPrice",col("Quantity")*col("UnitPrice"))
df.show()

+-------+----------+-----------+--------+---------+-------+----------+
|OrderID|   Product|   Category|Quantity|UnitPrice|Region |TotalPrice|
+-------+----------+-----------+--------+---------+-------+----------+
|   1001|    Mobile|Electronics|       2|    15000|  North|     30000|
|   1002|    Laptop|Electronics|       1|    55000|  South|     55000|
|   1003|   T-Shirt|    Apparel|       3|      500|   East|      1500|
|   1004|     Jeans|    Apparel|       2|     1200|  North|      2400|
|   1005|        TV|Electronics|       1|    40000|   West|     40000|
|   1006|     Shoes|   Footwear|       4|     2000|  South|      8000|
|   1007|     Watch|Accessories|       2|     3000|   East|      6000|
|   1008|Headphones|Electronics|       3|     2500|  North|      7500|
+-------+----------+-----------+--------+---------+-------+----------+



In [79]:
from pyspark.sql.functions import sum
Total_revenue=df.select(sum("TotalPrice").alias("Total Revenue"))
Total_revenue.show()

+-------------+
|Total Revenue|
+-------------+
|       150400|
+-------------+



In [80]:
df.groupBy("Category").agg(sum("TotalPrice").alias("Category-wise revenue")).orderBy(col("Category-wise revenue").desc()).show()

+-----------+---------------------+
|   Category|Category-wise revenue|
+-----------+---------------------+
|Electronics|               132500|
|   Footwear|                 8000|
|Accessories|                 6000|
|    Apparel|                 3900|
+-----------+---------------------+



In [81]:
from pyspark.sql.functions import count
df.groupBy("Region ").agg(count("OrderID").alias("Region with highest orders")).orderBy(col("Region with highest orders")).show()

+-------+--------------------------+
|Region |Region with highest orders|
+-------+--------------------------+
|   West|                         1|
|  South|                         2|
|   East|                         2|
|  North|                         3|
+-------+--------------------------+



In [82]:
df.groupBy("Category").agg(sum("UnitPrice").alias("Avg-unit price")).orderBy(col("Avg-unit price").desc()).show()

+-----------+--------------+
|   Category|Avg-unit price|
+-----------+--------------+
|Electronics|        112500|
|Accessories|          3000|
|   Footwear|          2000|
|    Apparel|          1700|
+-----------+--------------+



In [83]:
df.filter(col("TotalPrice")>30000).show()

+-------+-------+-----------+--------+---------+-------+----------+
|OrderID|Product|   Category|Quantity|UnitPrice|Region |TotalPrice|
+-------+-------+-----------+--------+---------+-------+----------+
|   1002| Laptop|Electronics|       1|    55000|  South|     55000|
|   1005|     TV|Electronics|       1|    40000|   West|     40000|
+-------+-------+-----------+--------+---------+-------+----------+



In [84]:
from pyspark.sql.functions import when
df=df.withColumn("HighValueOrder",when(col("TotalPrice")>20000,"Yes").otherwise("No"))
df.show()

+-------+----------+-----------+--------+---------+-------+----------+--------------+
|OrderID|   Product|   Category|Quantity|UnitPrice|Region |TotalPrice|HighValueOrder|
+-------+----------+-----------+--------+---------+-------+----------+--------------+
|   1001|    Mobile|Electronics|       2|    15000|  North|     30000|           Yes|
|   1002|    Laptop|Electronics|       1|    55000|  South|     55000|           Yes|
|   1003|   T-Shirt|    Apparel|       3|      500|   East|      1500|            No|
|   1004|     Jeans|    Apparel|       2|     1200|  North|      2400|            No|
|   1005|        TV|Electronics|       1|    40000|   West|     40000|           Yes|
|   1006|     Shoes|   Footwear|       4|     2000|  South|      8000|            No|
|   1007|     Watch|Accessories|       2|     3000|   East|      6000|            No|
|   1008|Headphones|Electronics|       3|     2500|  North|      7500|            No|
+-------+----------+-----------+--------+---------+---

In [87]:
df.filter((col("HighValueOrder")=="Yes") & (col("Region ")=="North")).show()

+-------+-------+-----------+--------+---------+-------+----------+--------------+
|OrderID|Product|   Category|Quantity|UnitPrice|Region |TotalPrice|HighValueOrder|
+-------+-------+-----------+--------+---------+-------+----------+--------------+
|   1001| Mobile|Electronics|       2|    15000|  North|     30000|           Yes|
+-------+-------+-----------+--------+---------+-------+----------+--------------+



In [90]:
df.filter(col("HighValueOrder")=="Yes").groupBy("Region ").agg(count("orderID").alias("HighValue-OrderCount")).show()

+-------+--------------------+
|Region |HighValue-OrderCount|
+-------+--------------------+
|  South|                   1|
|   West|                   1|
|  North|                   1|
+-------+--------------------+



In [91]:
df.write.option("header",True).csv("high_value_orders.csv")