# Lab Assignment 4

- Name - Aryan Gupta
- Roll No. - 230150003
- Date - 20 Aug, 2025
- Course - DA331 Big Data Analytics: Tools & Techniques

## Importing Libraries

In [2]:
# One-time environment setup: uncomment and run if `opendatasets` is not installed yet.
# !pip install opendatasets

Collecting opendatasets
  Downloading opendatasets-0.1.22-py3-none-any.whl.metadata (9.2 kB)
Downloading opendatasets-0.1.22-py3-none-any.whl (15 kB)
Installing collected packages: opendatasets
Successfully installed opendatasets-0.1.22


In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, count, sum as spark_sum, hour, countDistinct

import opendatasets as od
import numpy as np

# Start (or reuse) the Spark session shared by every query in this notebook.
spark = (
    SparkSession.builder
    .appName("EcommerceAnalysis")
    .getOrCreate()
)

## Load Data

In [4]:
# Show the current working directory; the dataset path used below is relative to it.
!pwd

/content


In [5]:
# Download the Kaggle e-commerce behaviour dataset (asks for Kaggle credentials).
dataset_url = "https://www.kaggle.com/datasets/mkechinov/ecommerce-behavior-data-from-multi-category-store/data?select=2019-Nov.csv"
od.download(dataset_url)

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username: aryanthepain
Your Kaggle Key: ··········
Dataset URL: https://www.kaggle.com/datasets/mkechinov/ecommerce-behavior-data-from-multi-category-store
Downloading ecommerce-behavior-data-from-multi-category-store.zip to ./ecommerce-behavior-data-from-multi-category-store


100%|██████████| 4.29G/4.29G [01:06<00:00, 69.3MB/s]





KeyboardInterrupt: 

In [6]:
# CSV of November 2019 events inside the downloaded dataset folder.
path = "./ecommerce-behavior-data-from-multi-category-store/2019-Nov.csv"

In [7]:
# Load the raw event log; column types are inferred from the data.
df = spark.read.csv(path, header=True, inferSchema=True)

# Basic overview of the dataset.
# printSchema() and show() write to stdout themselves and return None, so they
# are called bare; wrapping them in print() only emitted a stray "None".
df.printSchema()
print(df.count())         # number of rows
print(len(df.columns))    # number of columns
df.show(5)

root
 |-- event_time: timestamp (nullable = true)
 |-- event_type: string (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- category_id: long (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: double (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- user_session: string (nullable = true)

None
67501979
9
+-------------------+----------+----------+-------------------+--------------------+------+------+---------+--------------------+
|         event_time|event_type|product_id|        category_id|       category_code| brand| price|  user_id|        user_session|
+-------------------+----------+----------+-------------------+--------------------+------+------+---------+--------------------+
|2019-11-01 00:00:00|      view|   1003461|2053013555631882655|electronics.smart...|xiaomi|489.07|520088904|4d3b30da-a5e4-49d...|
|2019-11-01 00:00:00|      view|   5000088|2053013566100866035|appliances.sewing...|

## Highest View to Cart Conversion Rate by Category

In [8]:

# Distinct users per category, split into "view" and "cart" columns.
category_stats = (
    df.groupBy("category_code")
    .pivot("event_type", ["view", "cart"])
    .agg(countDistinct("user_id"))
)

# Distinct carting users per distinct viewing user, as a percentage.
view_to_cart_pct = (col("cart") / col("view")) * 100

# Rank categories by that rate, highest first.
conversion_view_cart = (
    category_stats
    .withColumn("view_to_cart_rate", view_to_cart_pct)
    .orderBy(col("view_to_cart_rate").desc())
)

conversion_view_cart.show(1, truncate=False)

+-----------------------------------+-------+------+------------------+
|category_code                      |view   |cart  |view_to_cart_rate |
+-----------------------------------+-------+------+------------------+
|electronics.smartphone             |1574985|353154|22.422689739902285|
|appliances.environment.air_heater  |28962  |5359  |18.50355638422761 |
|appliances.environment.water_heater|29882  |5047  |16.88976641456395 |
|appliances.kitchen.washer          |159992 |26438 |16.52457622881144 |
|electronics.video.tv               |260770 |42801 |16.413314415001725|
|appliances.environment.vacuum      |175190 |28726 |16.397054626405616|
|electronics.audio.headphone        |316021 |51419 |16.270754158742616|
|appliances.iron                    |48577  |7364  |15.15943759392305 |
|NULL                               |1958987|293341|14.974116724613282|
|appliances.kitchen.microwave       |53807  |7860  |14.607764788967978|
+-----------------------------------+-------+------+------------

## Highest View to Purchase Conversion Rate by Brand During Night Hours

In [9]:
# Night-time subset: events whose hour of day is 0 through 5.
event_hour = col("hour")
df_night = (
    df.withColumn("hour", hour("event_time"))
    .filter((event_hour >= 0) & (event_hour < 6))
)

# Distinct users per brand, split into "view" and "purchase" columns.
brand_stats = (
    df_night.groupBy("brand")
    .pivot("event_type", ["view", "purchase"])
    .agg(countDistinct("user_id"))
)

# Distinct purchasing users per distinct viewing user (percentage), highest first.
night_purchase_pct = (col("purchase") / col("view")) * 100
conversion_view_purchase_brand = (
    brand_stats
    .withColumn("view_to_purchase_rate", night_purchase_pct)
    .orderBy(col("view_to_purchase_rate").desc())
)

conversion_view_purchase_brand.show(1, truncate=False)

+-----------+----+--------+---------------------+
|brand      |view|purchase|view_to_purchase_rate|
+-----------+----+--------+---------------------+
|pixiebelles|1   |1       |100.0                |
+-----------+----+--------+---------------------+
only showing top 1 row



## Highest View to Purchase Conversion Rate by Category for the Top Brand

In [10]:
# Brand ranked first by the night-time view-to-purchase query.
top_brand_row = conversion_view_purchase_brand.first()
top_brand = top_brand_row["brand"]

# Night-time events of that brand only, counted as distinct users per category.
top_brand_events = df_night.filter(col("brand") == top_brand)
brand_category_stats = (
    top_brand_events.groupBy("category_code")
    .pivot("event_type", ["view", "purchase"])
    .agg(countDistinct("user_id"))
)

# Conversion rate per category (percentage), highest first.
category_purchase_pct = (col("purchase") / col("view")) * 100
conversion_view_purchase_cat = (
    brand_category_stats
    .withColumn("view_to_purchase_rate", category_purchase_pct)
    .orderBy(col("view_to_purchase_rate").desc())
)

conversion_view_purchase_cat.show(1, truncate=False)

+-------------+----+--------+---------------------+
|category_code|view|purchase|view_to_purchase_rate|
+-------------+----+--------+---------------------+
|NULL         |1   |1       |100.0                |
+-------------+----+--------+---------------------+



## Highest Total Revenue by Category

In [11]:
# Revenue comes from purchase events only.
purchases = df.filter(col("event_type") == "purchase")

# Sum of purchase prices per category, largest total first.
revenue_stats = (
    purchases.groupBy("category_code")
    .agg(spark_sum("price").alias("total_revenue"))
    .orderBy(col("total_revenue").desc())
)

revenue_stats.show(1, truncate=False)

+----------------------+--------------------+
|category_code         |total_revenue       |
+----------------------+--------------------+
|electronics.smartphone|1.7782166160999876E8|
+----------------------+--------------------+
only showing top 1 row

