In [0]:
import pandas as pd

# Create the dataset
data = [
    (101, "Laptop", "Electronics", 55000, 10),
    (102, "Smartphone", "Electronics", 30000, 25),
    (103, "Chair", "Furniture", 2500, 50),
    (104, "Book", "Stationery", 400, 200),
    (105, "Headphones", "Electronics", 1500, 100),
    (106, "Table", "Furniture", 3200, 40),
    (107, "Pen", "Stationery", 20, 500),
    (108, "Monitor", "Electronics", 12000, 15),
    (109, "Notebook", "Stationery", 60, 300),
    (110, "Sofa", "Furniture", 45000, 5),
]

columns = ["product_id", "product_name", "category", "price", "quantity"]

# Save to CSV & JSON
df = pd.DataFrame(data, columns=columns)
df.to_csv("products.csv", index=False)
df.to_json("products.json", orient="records", lines=True)


In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, expr, sum as Fsum, avg, count
spark = SparkSession.builder.appName("ProductAnalysis").getOrCreate()

###1. Read the above data from CSV and JSON into a DataFrame and print the schema.

In [0]:
# Read the CSV
df_csv = spark.read.option("header", True).option("inferSchema", True).csv("dbfs:/FileStore/tables/products-2.csv")
df_csv.printSchema()

# Read the JSON
df_json = spark.read.option("inferSchema", True).json("dbfs:/FileStore/tables/products-2.json")
df_json.printSchema()

root
 |-- product_id: integer (nullable = true)
 |-- product_name: string (nullable = true)
 |-- category: string (nullable = true)
 |-- price: integer (nullable = true)
 |-- quantity: integer (nullable = true)

root
 |-- category: string (nullable = true)
 |-- price: long (nullable = true)
 |-- product_id: long (nullable = true)
 |-- product_name: string (nullable = true)
 |-- quantity: long (nullable = true)



### 3. Convert the CSV data into Parquet format and save to disk.

In [0]:
# Read CSV
df = spark.read.csv("dbfs:/FileStore/tables/products-2.csv", header=True, inferSchema=True)

# Write to Parquet
df.write.mode("overwrite").parquet("dbfs:/FileStore/tables/products_parquet")


In [0]:
df_parquet = spark.read.parquet("dbfs:/FileStore/tables/products_parquet")
df_parquet.show()

+----------+------------+-----------+-----+--------+
|product_id|product_name|   category|price|quantity|
+----------+------------+-----------+-----+--------+
|       101|      Laptop|Electronics|55000|      10|
|       102|  Smartphone|Electronics|30000|      25|
|       103|       Chair|  Furniture| 2500|      50|
|       104|        Book| Stationery|  400|     200|
|       105|  Headphones|Electronics| 1500|     100|
|       106|       Table|  Furniture| 3200|      40|
|       107|         Pen| Stationery|   20|     500|
|       108|     Monitor|Electronics|12000|      15|
|       109|    Notebook| Stationery|   60|     300|
|       110|        Sofa|  Furniture|45000|       5|
+----------+------------+-----------+-----+--------+



### Measure the size of CSV vs JSON vs Parquet on disk.

In [0]:
files = dbutils.fs.ls("dbfs:/FileStore/tables/")
counts = {"CSV": 0, "JSON": 0, "Parquet": 0}

for file in files:
    path = file.path.lower()
    if path.endswith(".csv"):
        counts["CSV"] += 1
    elif path.endswith(".json"):
        counts["JSON"] += 1
    elif path.endswith(".parquet") or "parquet" in path:
        counts["Parquet"] += 1

print(counts)


{'CSV': 3, 'JSON': 3, 'Parquet': 1}


### Add a column total_revenue = price * quantity for each record.

In [0]:
df = df_csv.withColumn("total_revenue", col("price") * col("quantity"))
df.show()

+----------+------------+-----------+-----+--------+-------------+
|product_id|product_name|   category|price|quantity|total_revenue|
+----------+------------+-----------+-----+--------+-------------+
|       101|      Laptop|Electronics|55000|      10|       550000|
|       102|  Smartphone|Electronics|30000|      25|       750000|
|       103|       Chair|  Furniture| 2500|      50|       125000|
|       104|        Book| Stationery|  400|     200|        80000|
|       105|  Headphones|Electronics| 1500|     100|       150000|
|       106|       Table|  Furniture| 3200|      40|       128000|
|       107|         Pen| Stationery|   20|     500|        10000|
|       108|     Monitor|Electronics|12000|      15|       180000|
|       109|    Notebook| Stationery|   60|     300|        18000|
|       110|        Sofa|  Furniture|45000|       5|       225000|
+----------+------------+-----------+-----+--------+-------------+



### Find the top 3 products with the highest total revenue.

In [0]:
df.orderBy(col("total_revenue").desc()).select("product_name", "total_revenue").show(3)

+------------+-------------+
|product_name|total_revenue|
+------------+-------------+
|  Smartphone|       750000|
|      Laptop|       550000|
|        Sofa|       225000|
+------------+-------------+
only showing top 3 rows


### Filter and display only Furniture products with price > 3000.

In [0]:
df.filter((col("category") == "Furniture") & (col("price") > 3000)).show()

+----------+------------+---------+-----+--------+-------------+
|product_id|product_name| category|price|quantity|total_revenue|
+----------+------------+---------+-----+--------+-------------+
|       106|       Table|Furniture| 3200|      40|       128000|
|       110|        Sofa|Furniture|45000|       5|       225000|
+----------+------------+---------+-----+--------+-------------+



###Create a new column price_band with values:
'High' if price > 10000
'Medium' if 3000 < price <= 10000
'Low' if price ≤ 3000

In [0]:
df = df.withColumn("price_band", when(col("price") > 10000, "High")
.when((col("price") > 3000) & (col("price") <= 10000), "Medium")
.otherwise("Low"))
df.select("product_name", "price", "price_band").show()


+------------+-----+----------+
|product_name|price|price_band|
+------------+-----+----------+
|      Laptop|55000|      High|
|  Smartphone|30000|      High|
|       Chair| 2500|       Low|
|        Book|  400|       Low|
|  Headphones| 1500|       Low|
|       Table| 3200|    Medium|
|         Pen|   20|       Low|
|     Monitor|12000|      High|
|    Notebook|   60|       Low|
|        Sofa|45000|      High|
+------------+-----+----------+



### Group by category and calculate total quantity sold.

In [0]:
df.groupBy("category").agg(Fsum("quantity").alias("total_quantity")).show()

+-----------+--------------+
|   category|total_quantity|
+-----------+--------------+
|  Furniture|            95|
| Stationery|          1000|
|Electronics|           150|
+-----------+--------------+



### Calculate average price of products for each category.

In [0]:
df.groupBy("category").agg(avg("price").alias("avg_price")).show()

+-----------+---------+
|   category|avg_price|
+-----------+---------+
|  Furniture|  16900.0|
| Stationery|    160.0|
|Electronics|  24625.0|
+-----------+---------+



### Count how many products fall in each price_band .

In [0]:
df.groupBy("price_band").agg(count("*").alias("product_count")).show()

+----------+-------------+
|price_band|product_count|
+----------+-------------+
|    Medium|            1|
|      High|            4|
|       Low|            5|
+----------+-------------+



### Write the filtered Electronics products (price > 5000) into a Parquet file.

In [0]:
df.filter((col("category") == "Electronics") & (col("price") > 5000))\
.write.mode("overwrite").parquet("filtered_electronics.parquet")


### Write the Stationery products into a JSON file.

In [0]:
df.filter(col("category") == "Stationery")\
.write.mode("overwrite").json("stationery_products.json")

### Load Parquet back and run a query to find which category has highest total revenue.

In [0]:
df_parquet = spark.read.parquet("dbfs:/FileStore/tables/products_parquet")
df_rev = df_parquet.withColumn("total_revenue", col("price") * col("quantity"))
df_rev.groupBy("category").agg(Fsum("total_revenue").alias("category_revenue"))\
.orderBy(col("category_revenue").desc()).show(1)
from pyspark.sql.functions import col, sum as Fsum

+-----------+----------------+
|   category|category_revenue|
+-----------+----------------+
|Electronics|         1630000|
+-----------+----------------+
only showing top 1 row


### BONUS: Create a temporary view from the DataFrame and run Spark SQL to find all products with quantity > 100 and price < 1000.

In [0]:
df.createOrReplaceTempView("products")

spark.sql("""SELECT product_id, product_name, quantity, price
FROM products
WHERE quantity > 100 AND price < 1000""").show()

+----------+------------+--------+-----+
|product_id|product_name|quantity|price|
+----------+------------+--------+-----+
|       104|        Book|     200|  400|
|       107|         Pen|     500|   20|
|       109|    Notebook|     300|   60|
+----------+------------+--------+-----+

