In [9]:
# Import necessary libraries
from pyspark.sql import SparkSession

In [10]:
spark = SparkSession.builder \
    .appName("HiveQueries") \
    .enableHiveSupport() \
    .getOrCreate()

In [11]:
#Total Sales Amount by Payment Method
total_sales_amount_by_payment_method = spark.sql("""
        SELECT 
            paymentMethod, 
            SUM(totalAmount) AS total_sales
        FROM 
            kafka
        WHERE 
            eventType = 'purchase'
        GROUP BY 
            paymentMethod
        ORDER BY 
            total_sales DESC;
    """)
# Show the results
total_sales_amount_by_payment_method.show()

+-------------+-----------------+
|paymentMethod|      total_sales|
+-------------+-----------------+
|   Debit Card|57267.98000000002|
|       PayPal|         45508.37|
|  Credit Card|         44203.07|
+-------------+-----------------+



In [12]:
#Most Popular Categories by Number of Events
most_popular_category = spark.sql("""
        SELECT 
            category, 
            COUNT(*) AS total_events
        FROM 
            kafka
        GROUP BY 
            category
        ORDER BY 
            total_events DESC;
    """)
# Show the results
most_popular_category.show()

+--------------+------------+
|      category|total_events|
+--------------+------------+
|          null|        1846|
|Home & Kitchen|         197|
|         Books|         173|
|   Electronics|         160|
|      Clothing|         128|
+--------------+------------+



In [17]:
#Revenue by Customer
revenue_by_customer = spark.sql("""
        SELECT 
            customerId, 
            SUM(totalAmount) AS total_revenue
        FROM 
            kafka
        WHERE 
            eventType = 'purchase'
        GROUP BY 
            customerId
        ORDER BY 
            total_revenue DESC;

    """)
# Show the results
revenue_by_customer.show()

+----------+-------------+
|customerId|total_revenue|
+----------+-------------+
|     40568|        798.8|
|     24184|        499.3|
|     36839|       497.62|
|     32730|       497.26|
|     66714|       496.19|
|     65429|        494.8|
|     41784|        493.9|
|     38514|       493.74|
|     32457|       491.46|
|     45241|       491.37|
|     29717|       490.85|
|     58979|       490.55|
|     58051|       490.06|
|     79665|       489.21|
|     11613|       488.36|
|     19200|       488.21|
|     40107|       486.33|
|     26946|       486.06|
|     89694|       485.79|
|     24218|       485.29|
+----------+-------------+
only showing top 20 rows



In [20]:
#Hourly Sales Analysis
hourly_sales_analysis = spark.sql("""
        SELECT 
            event_hour, 
            SUM(totalAmount) AS total_sales
        FROM 
            kafka
        WHERE 
            eventType = 'purchase'
        GROUP BY 
            event_hour
        ORDER BY 
            event_hour;

    """)
# Show the results
hourly_sales_analysis.show()

+----------+------------------+
|event_hour|       total_sales|
+----------+------------------+
|         3|            2350.4|
|         4|            2468.7|
|        11|11588.650000000001|
|        12|           4466.29|
|        13|           1436.93|
|        15|          16607.09|
|        16|          76627.44|
|        17|           2817.18|
|        18|          10504.49|
|        19|15367.099999999999|
|        20|           2745.15|
+----------+------------------+



In [None]:
spark.stop()