In [8]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import sum as spark_sum, col, round
from pyspark.sql.types import IntegerType

from pyspark.sql.window import Window

# Tunable parameters of the analysis (inclusive age bounds, size of the ranking).
COHORT_MIN_AGE = 18
COHORT_MAX_AGE = 25
TOP_N_CATEGORIES = 3


def _with_line_amount(df):
    """Return `df` plus an `amount` column: quantity * price, rounded to 2 decimals.

    `df` must expose `quantity` and `price` columns.
    NOTE: `round` is pyspark.sql.functions.round (see the notebook's import
    cell), not the Python builtin.
    """
    return df.withColumn('amount', round(col('quantity') * col('price'), 2))


def _total_by_category(df, total_col):
    """Sum `amount` per `category`, rounded to 2 decimals, into column `total_col`."""
    return df.groupBy('category').agg(round(spark_sum('amount'), 2).alias(total_col))


spark = SparkSession.builder.appName("Data Processing").getOrCreate()

# Load the inputs and drop every row that has a null in any column.
# NOTE(review): dropna() without `subset` also discards rows whose only missing
# value is in a column this analysis never reads -- confirm that is intended.
purchases_df = spark.read.csv('purchases.csv', header=True, inferSchema=True).dropna()
users_df = spark.read.csv('users.csv', header=True, inferSchema=True).dropna()
products_df = spark.read.csv('products.csv', header=True, inferSchema=True).dropna()

# --- Total purchase amount per category, all users ---------------------------
merged_df = _with_line_amount(purchases_df.join(products_df, 'product_id'))
total_purchase_per_category = _total_by_category(merged_df, 'total_amount')

# --- Same totals restricted to the age cohort (bounds inclusive) -------------
filtered_users_df = users_df.filter(
    (col('age') >= COHORT_MIN_AGE) & (col('age') <= COHORT_MAX_AGE)
)
merged_age_df = _with_line_amount(
    purchases_df.join(filtered_users_df, 'user_id').join(products_df, 'product_id')
)
total_purchase_18_25 = _total_by_category(merged_age_df, 'total_amount_18_25')

# --- Each category's share of the cohort's total spend -----------------------
# An empty partitionBy() makes the window span every row, so the windowed sum
# is the grand total. The input is already one row per category (groupBy above).
windowSpec = Window.partitionBy()
percentage_share = total_purchase_18_25.withColumn(
    'percentage',
    round((col('total_amount_18_25') / spark_sum('total_amount_18_25').over(windowSpec)) * 100, 2),
)

# --- Top categories by share --------------------------------------------------
# Secondary sort keys make the result deterministic when two categories tie on
# the rounded percentage; without them limit() could return either one.
top_3_categories = percentage_share.orderBy(
    col('percentage').desc(),
    col('total_amount_18_25').desc(),
    col('category'),
).limit(TOP_N_CATEGORIES)

total_purchase_per_category.show()
total_purchase_18_25.show()

+-----------+------------+
|   category|total_amount|
+-----------+------------+
|       Home|      1523.5|
|     Sports|      1802.5|
|Electronics|      1174.8|
|   Clothing|       790.3|
|     Beauty|       459.9|
+-----------+------------+

+-----------+------------------+
|   category|total_amount_18_25|
+-----------+------------------+
|       Home|             361.1|
|     Sports|             310.5|
|Electronics|             249.6|
|   Clothing|             245.0|
|     Beauty|              41.4|
+-----------+------------------+



In [9]:
# Print the per-category share table first, then the top-3 ranking.
for result_frame in (percentage_share, top_3_categories):
    result_frame.show()

# Shut down the Spark session; cells above must be re-run before any further Spark work.
spark.stop()

+-----------+------------------+----------+
|   category|total_amount_18_25|percentage|
+-----------+------------------+----------+
|       Home|             361.1|      29.9|
|     Sports|             310.5|     25.71|
|Electronics|             249.6|     20.67|
|   Clothing|             245.0|     20.29|
|     Beauty|              41.4|      3.43|
+-----------+------------------+----------+

+-----------+------------------+----------+
|   category|total_amount_18_25|percentage|
+-----------+------------------+----------+
|       Home|             361.1|      29.9|
|     Sports|             310.5|     25.71|
|Electronics|             249.6|     20.67|
+-----------+------------------+----------+

