In [0]:
dbutils.fs.cp("file:/Workspace/Shared/product_data.json", "dbfs:/FileStore/product_data.json")

True

In [0]:
# Load the file from DBFS
df = spark.read.option("multiline", "true").json("/FileStore/product_data.json")
df.show(10)
df.printSchema()

+-----------+-----+---------+-----------+-----+
|   Category|Price|ProductID|ProductName|Stock|
+-----------+-----+---------+-----------+-----+
|Electronics| 1200|      101|     Laptop|   35|
|Electronics|  800|      102| Smartphone|   80|
|  Furniture|  150|      103| Desk Chair|   60|
|Electronics|  300|      104|    Monitor|   45|
|  Furniture|  350|      105|       Desk|   25|
+-----------+-----+---------+-----------+-----+

root
 |-- Category: string (nullable = true)
 |-- Price: long (nullable = true)
 |-- ProductID: long (nullable = true)
 |-- ProductName: string (nullable = true)
 |-- Stock: long (nullable = true)



In [0]:
# 2. Data Cleaning:
# Remove rows where Stock is less than 30.
# Filter the products that belong to the "Electronics" category.
df_cleaned_product = df.filter((df['Stock'] >= 30) & (df['Category'] == 'Electronics'))
df_cleaned_product.show()

+-----------+-----+---------+-----------+-----+
|   Category|Price|ProductID|ProductName|Stock|
+-----------+-----+---------+-----------+-----+
|Electronics| 1200|      101|     Laptop|   35|
|Electronics|  800|      102| Smartphone|   80|
|Electronics|  300|      104|    Monitor|   45|
+-----------+-----+---------+-----------+-----+



In [0]:
# 3. Data Aggregation:
# Calculate the total stock for products in the "Furniture" category.
df_total_furniture_stock = df.filter(df['Category'] == 'Furniture').groupBy('Category').agg({'Stock': 'sum'}).withColumnRenamed('sum(Stock)', 'TotalStock')
df_total_furniture_stock.show()

# Find the average price of all products in the dataset.
df_avg_price = df.groupBy('Category').agg({'Price': 'avg'}).withColumnRenamed('avg(Price)', 'AvgPrice')
df_avg_price.show()

+---------+----------+
| Category|TotalStock|
+---------+----------+
|Furniture|        85|
+---------+----------+

+-----------+-----------------+
|   Category|         AvgPrice|
+-----------+-----------------+
|Electronics|766.6666666666666|
|  Furniture|            250.0|
+-----------+-----------------+



In [0]:
# 4. Write the Data to JSON:
# Save the cleaned and aggregated data into a new JSON file.
df_cleaned_product.coalesce(1).write.json('/FileStore/cleaned_product_data.json')