In [15]:
# Import necessary libraries
from pyspark.sql import SparkSession
# Initialize a Spark session
spark = SparkSession.builder \
    .appName("KPI Queries for Hive Tables") \
    .enableHiveSupport() \
    .getOrCreate()

In [16]:
# 1. Product View Table (emp.product_view)
# KPI 1: Total Number of Product Views
total_product_views = spark.sql("""
        SELECT COUNT(*) AS total_product_views
        FROM stream.product_view
    """)
# Show the results
total_product_views.show()


+-------------------+
|total_product_views|
+-------------------+
|                340|
+-------------------+



In [4]:
# KPI 2: Number of Product Views by Category
views_per_category = spark.sql("""
    SELECT category, COUNT(*) AS views_per_category
    FROM stream.product_view
    GROUP BY category
    ORDER BY views_per_category DESC
""")
views_per_category.show()


+--------------+------------------+
|      category|views_per_category|
+--------------+------------------+
|      Clothing|                98|
|Home & Kitchen|                93|
|   Electronics|                78|
|         Books|                71|
+--------------+------------------+



In [5]:
# KPI 3: Unique Customers per Category
unique_customers_per_category = spark.sql("""
    SELECT category, COUNT(DISTINCT customerId) AS unique_customers
    FROM stream.product_view
    GROUP BY category
    ORDER BY unique_customers DESC
""")
unique_customers_per_category.show()

+--------------+----------------+
|      category|unique_customers|
+--------------+----------------+
|      Clothing|              98|
|Home & Kitchen|              93|
|   Electronics|              78|
|         Books|              71|
+--------------+----------------+



In [7]:
# 2. Add to Cart Table (emp.add_to_cart)
# KPI 1: Total Number of Add to Cart Actions
total_add_to_cart = spark.sql("""
    SELECT COUNT(*) AS total_add_to_cart
    FROM stream.add_to_cart
""")
total_add_to_cart.show()

+-----------------+
|total_add_to_cart|
+-----------------+
|              293|
+-----------------+



In [6]:
# KPI 2: Number of Add to Cart Actions by Product
add_to_cart_per_product = spark.sql("""
    SELECT productId, COUNT(*) AS add_to_cart_per_product
    FROM stream.add_to_cart
    GROUP BY productId
    ORDER BY add_to_cart_per_product DESC
""")
add_to_cart_per_product.show()

+---------+-----------------------+
|productId|add_to_cart_per_product|
+---------+-----------------------+
|     2244|                      2|
|     8334|                      2|
|     5766|                      2|
|     9732|                      2|
|     8089|                      1|
|     2701|                      1|
|     9544|                      1|
|     8865|                      1|
|     1987|                      1|
|     4685|                      1|
|     5170|                      1|
|     8975|                      1|
|     4403|                      1|
|     7101|                      1|
|     5990|                      1|
|     6280|                      1|
|     5473|                      1|
|     9071|                      1|
|     8496|                      1|
|     8328|                      1|
+---------+-----------------------+
only showing top 20 rows



In [7]:
# KPI 3: Total Quantity Added to Cart by Customer
total_quantity_added_by_customer = spark.sql("""
    SELECT customerId, SUM(quantity) AS total_quantity_added
    FROM stream.add_to_cart
    GROUP BY customerId
    ORDER BY total_quantity_added DESC
""")
total_quantity_added_by_customer.show()

+----------+--------------------+
|customerId|total_quantity_added|
+----------+--------------------+
|     77111|                   5|
|     85819|                   5|
|     15037|                   5|
|     44634|                   5|
|     43838|                   5|
|     98110|                   5|
|     19197|                   5|
|     14824|                   5|
|     44973|                   5|
|     44550|                   5|
|     24351|                   5|
|     76533|                   5|
|     97641|                   5|
|     13170|                   5|
|     97970|                   5|
|     34399|                   5|
|     78422|                   5|
|     32888|                   5|
|     25644|                   5|
|     48624|                   5|
+----------+--------------------+
only showing top 20 rows



In [8]:
# 3. Purchase Table (emp.purchase)

# KPI 1: Total Revenue Generated
total_revenue = spark.sql("""
    SELECT SUM(totalAmount ) AS total_revenue
    FROM stream.purchase
""")
total_revenue.show()

+-----------------+
|    total_revenue|
+-----------------+
|71998.27000000003|
+-----------------+



In [9]:
# KPI 2: Number of Purchases by Payment Method
purchases_per_method = spark.sql("""
    SELECT paymentMethod, COUNT(*) AS purchases_per_method
    FROM stream.purchase
    GROUP BY paymentMethod
    ORDER BY purchases_per_method DESC
""")
purchases_per_method.show()

+-------------+--------------------+
|paymentMethod|purchases_per_method|
+-------------+--------------------+
|  Credit Card|                 100|
|   Debit Card|                  97|
|       PayPal|                  91|
+-------------+--------------------+



In [10]:
# KPI 3: Average Order Value
average_order_value = spark.sql("""
    SELECT AVG(totalAmount) AS average_order_value
    FROM stream.purchase
""")
average_order_value.show()

+-------------------+
|average_order_value|
+-------------------+
| 249.99399305555568|
+-------------------+



In [11]:
# 4. Recommendation Click Table (emp.recommendation_click)

# KPI 1: Total Number of Recommendation Clicks
total_recommendation_clicks = spark.sql("""
    SELECT COUNT(*) AS total_recommendation_clicks
    FROM stream.recommendation_click
""")
total_recommendation_clicks.show()

+---------------------------+
|total_recommendation_clicks|
+---------------------------+
|                        312|
+---------------------------+



In [12]:
# KPI 2: Number of Clicks by Recommendation Algorithm
clicks_per_algorithm = spark.sql("""
    SELECT algorithm, COUNT(*) AS clicks_per_algorithm
    FROM stream.recommendation_click
    GROUP BY algorithm
    ORDER BY clicks_per_algorithm DESC
""")
clicks_per_algorithm.show()

+--------------------+--------------------+
|           algorithm|clicks_per_algorithm|
+--------------------+--------------------+
|       content_based|                 158|
|collaborative_fil...|                 154|
+--------------------+--------------------+



In [13]:
# KPI 3: Unique Customers Clicking on Recommendations
unique_customers_clicking = spark.sql("""
    SELECT COUNT(DISTINCT customerId) AS unique_customers_clicking
    FROM stream.recommendation_click
""")
unique_customers_clicking.show()


+-------------------------+
|unique_customers_clicking|
+-------------------------+
|                      310|
+-------------------------+



In [14]:
spark.stop()