In [1]:
import pyspark
from pyspark.sql.functions import *

In [2]:
print(pyspark.__version__)

3.5.5


In [3]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
        .appName("Retail_Analysis") \
        .master("local[*]") \
        .getOrCreate()

25/05/04 18:30:17 WARN Utils: Your hostname, Ameys-Mac-mini.local resolves to a loopback address: 127.0.0.1; using 192.168.1.12 instead (on interface en1)
25/05/04 18:30:17 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/04 18:30:17 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/05/04 18:30:18 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [4]:
df = spark.read.csv("retail_sales_data_easy.csv", inferSchema=True, header=True)

## Data Exploration and Cleaning

### Print the schema and show the first 5 rows

In [5]:
df.show()

+--------------------+-----------+------------------+----------------+---------------+--------+--------------+-------------+
|      transaction_id|customer_id|     customer_name|product_category|purchase_amount|quantity|payment_method|purchase_date|
+--------------------+-----------+------------------+----------------+---------------+--------+--------------+-------------+
|1f5324d6-2a86-434...|       8028|      Kevin Rogers|        Clothing|         369.64|       4|          Cash|   2024-01-21|
|60326ab1-d515-40c...|       5037|     Richard Arias|            Toys|          14.41|       4|   Credit Card|   2023-03-30|
|db2bd023-03a8-4d4...|       3784|     Gilbert Reese|           Books|         367.36|       1|     Gift Card|   2023-07-10|
|4977751b-8726-4d0...|       1386|     Jason Wallace|        Clothing|         404.03|       1|   Credit Card|   2023-12-15|
|5143a0c9-8354-4f8...|       8837|        Karen Rich|  Home & Kitchen|         378.51|       5|        PayPal|   2024-10-28|


In [6]:
df.show(5)

+--------------------+-----------+-------------+----------------+---------------+--------+--------------+-------------+
|      transaction_id|customer_id|customer_name|product_category|purchase_amount|quantity|payment_method|purchase_date|
+--------------------+-----------+-------------+----------------+---------------+--------+--------------+-------------+
|1f5324d6-2a86-434...|       8028| Kevin Rogers|        Clothing|         369.64|       4|          Cash|   2024-01-21|
|60326ab1-d515-40c...|       5037|Richard Arias|            Toys|          14.41|       4|   Credit Card|   2023-03-30|
|db2bd023-03a8-4d4...|       3784|Gilbert Reese|           Books|         367.36|       1|     Gift Card|   2023-07-10|
|4977751b-8726-4d0...|       1386|Jason Wallace|        Clothing|         404.03|       1|   Credit Card|   2023-12-15|
|5143a0c9-8354-4f8...|       8837|   Karen Rich|  Home & Kitchen|         378.51|       5|        PayPal|   2024-10-28|
+--------------------+-----------+------

### Count the number of distinct customers

In [7]:
df.select("customer_id").distinct().count()

58

### Find the number of nulls in each columns

In [8]:
nulls = df.select([sum(col(c).isNull().cast("int")).alias(c) for c in df.columns])
nulls.show()

+--------------+-----------+-------------+----------------+---------------+--------+--------------+-------------+
|transaction_id|customer_id|customer_name|product_category|purchase_amount|quantity|payment_method|purchase_date|
+--------------+-----------+-------------+----------------+---------------+--------+--------------+-------------+
|             0|          0|            0|               0|              0|       0|             0|            0|
+--------------+-----------+-------------+----------------+---------------+--------+--------------+-------------+



### Get the count of transactions per product category

In [9]:
df.groupBy("product_category") \
    .count() \
    .withColumnRenamed("count", "number of transactions") \
    .show()

+----------------+----------------------+
|product_category|number of transactions|
+----------------+----------------------+
|  Home & Kitchen|                     8|
|     Electronics|                    12|
|        Clothing|                    12|
|           Books|                    11|
|            Toys|                    17|
+----------------+----------------------+



### What is the range of purchase dates?

In [10]:
df.createOrReplaceTempView("retail_sales")

In [11]:
spark.sql(
    "SELECT " \
    "min(purchase_date) as first_purchase_date, " \
    "max(purchase_date) as last_purchase_date FROM retail_sales"
    ) \
    .show()

+-------------------+------------------+
|first_purchase_date|last_purchase_date|
+-------------------+------------------+
|         2022-05-15|        2025-04-25|
+-------------------+------------------+



In [12]:
df.select(min(df.purchase_date).alias("first_purchase_date"), max(df.purchase_date).alias("last_purchase_date")) \
    .show()


+-------------------+------------------+
|first_purchase_date|last_purchase_date|
+-------------------+------------------+
|         2022-05-15|        2025-04-25|
+-------------------+------------------+



## Aggregations and Grouping

### What is the total revenue generated from all transactions?

In [13]:
spark.sql("""
    SELECT
        SUM(purchase_amount) as total_sales
    FROM retail_sales
"""
).show()

+-----------+
|total_sales|
+-----------+
|    18618.1|
+-----------+



In [14]:
df.select(sum(df.purchase_amount).alias("total_sales")) \
    .show()

+-----------+
|total_sales|
+-----------+
|    18618.1|
+-----------+



### Show the total quantity of products sold per category.

In [15]:
spark.sql("""
    SELECT
        product_category,
        SUM(quantity) as total_quantity
        FROM retail_sales
        GROUP BY product_category
""").show()

+----------------+--------------+
|product_category|total_quantity|
+----------------+--------------+
|  Home & Kitchen|            25|
|     Electronics|            42|
|        Clothing|            40|
|           Books|            34|
|            Toys|            50|
+----------------+--------------+



In [16]:
df.groupBy("product_category") \
    .sum("quantity") \
    .withColumnRenamed("sum(quantity)", "total_quantity") \
    .show()

+----------------+--------------+
|product_category|total_quantity|
+----------------+--------------+
|  Home & Kitchen|            25|
|     Electronics|            42|
|        Clothing|            40|
|           Books|            34|
|            Toys|            50|
+----------------+--------------+



### Which payment method was used most frequently?

In [17]:
spark.sql("""
    SELECT
        payment_method,
        count(payment_method) as count_payment_method
    FROM retail_sales
        GROUP BY payment_method
        ORDER BY count_payment_method DESC
""").show()

+--------------+--------------------+
|payment_method|count_payment_method|
+--------------+--------------------+
|   Credit Card|                  15|
|     Gift Card|                  14|
|        PayPal|                  11|
|    Debit Card|                  11|
|          Cash|                   9|
+--------------+--------------------+



In [18]:
df.groupBy("payment_method") \
    .agg(count("payment_method").alias("count_payment_method")) \
    .orderBy(desc("count_payment_method")) \
    .withColumnRenamed("count(payment_method)", "count_payment_method") \
    .show()

+--------------+--------------------+
|payment_method|count_payment_method|
+--------------+--------------------+
|   Credit Card|                  15|
|     Gift Card|                  14|
|        PayPal|                  11|
|    Debit Card|                  11|
|          Cash|                   9|
+--------------+--------------------+



### What is the average purchase amount for each product category?

In [19]:
spark.sql(
    """
    SELECT
        product_category,
        avg(purchase_amount) as average_purchase_amount
    FROM retail_sales
        GROUP BY product_category
    """
).show()

+----------------+-----------------------+
|product_category|average_purchase_amount|
+----------------+-----------------------+
|  Home & Kitchen|               308.4275|
|     Electronics|      277.2583333333334|
|        Clothing|      322.4533333333333|
|           Books|      308.3781818181819|
|            Toys|       327.175294117647|
+----------------+-----------------------+



In [20]:
df.groupBy("product_category") \
    .agg(avg("purchase_amount").alias("average_purchase_amount")) \
    .show()

+----------------+-----------------------+
|product_category|average_purchase_amount|
+----------------+-----------------------+
|  Home & Kitchen|               308.4275|
|     Electronics|      277.2583333333334|
|        Clothing|      322.4533333333333|
|           Books|      308.3781818181819|
|            Toys|       327.175294117647|
+----------------+-----------------------+



### List the top 5 highest value transactions

In [21]:
spark.sql(
    """
    SELECT
        transaction_id,
        purchase_amount
    FROM retail_sales
    ORDER BY purchase_amount DESC
    LIMIT 5
    """
).show()

+--------------------+---------------+
|      transaction_id|purchase_amount|
+--------------------+---------------+
|402d9a4e-7920-443...|         483.87|
|3eb312c4-78e0-4de...|         482.71|
|3ae6dec4-b804-4b3...|         468.24|
|012d0eda-c493-4d4...|         462.16|
|33af77a9-6445-468...|         459.28|
+--------------------+---------------+



In [22]:
df.select("transaction_id", "purchase_amount") \
    .orderBy(desc("purchase_amount")) \
    .limit(5) \
    .show()

+--------------------+---------------+
|      transaction_id|purchase_amount|
+--------------------+---------------+
|402d9a4e-7920-443...|         483.87|
|3eb312c4-78e0-4de...|         482.71|
|3ae6dec4-b804-4b3...|         468.24|
|012d0eda-c493-4d4...|         462.16|
|33af77a9-6445-468...|         459.28|
+--------------------+---------------+



## Insights

### Which customer made the highest total purchase amount?

In [23]:
spark.sql(
    """
    SELECT
        customer_id,
        customer_name,
        SUM(purchase_amount) as total_purchase_amount
    FROM retail_sales
    GROUP BY customer_id, customer_name
    ORDER BY SUM(purchase_amount) DESC
    LIMIT 1
    """
).show()

+-----------+-------------+---------------------+
|customer_id|customer_name|total_purchase_amount|
+-----------+-------------+---------------------+
|       7674|Kent Marshall|               483.87|
+-----------+-------------+---------------------+



In [24]:
df.groupBy("customer_id", "customer_name") \
    .agg(sum("purchase_amount").alias("total_purchase_amount")) \
    .orderBy(desc(sum("purchase_amount"))) \
    .limit(1) \
    .show()

+-----------+-------------+---------------------+
|customer_id|customer_name|total_purchase_amount|
+-----------+-------------+---------------------+
|       7674|Kent Marshall|               483.87|
+-----------+-------------+---------------------+



### What is the average purchase amount by payment method?

In [25]:
spark.sql(
    """
    SELECT
        payment_method,
        avg(purchase_amount) as average_purchase_amount
    FROM retail_sales
    GROUP BY payment_method
    """
).show()

+--------------+-----------------------+
|payment_method|average_purchase_amount|
+--------------+-----------------------+
|   Credit Card|                280.368|
|        PayPal|      276.4827272727273|
|          Cash|      402.2755555555555|
|     Gift Card|      276.6235714285715|
|    Debit Card|      352.5509090909091|
+--------------+-----------------------+



In [26]:
df.groupBy("payment_method") \
    .agg(avg("purchase_amount").alias("average_purchase_amount")) \
    .show()

+--------------+-----------------------+
|payment_method|average_purchase_amount|
+--------------+-----------------------+
|   Credit Card|                280.368|
|        PayPal|      276.4827272727273|
|          Cash|      402.2755555555555|
|     Gift Card|      276.6235714285715|
|    Debit Card|      352.5509090909091|
+--------------+-----------------------+



### For each year, how many transactions took place?

In [27]:
df.printSchema()

root
 |-- transaction_id: string (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- product_category: string (nullable = true)
 |-- purchase_amount: double (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- payment_method: string (nullable = true)
 |-- purchase_date: date (nullable = true)



In [28]:
spark.sql(
    """
    SELECT
        year(purchase_date) as purchase_year,
        count(transaction_id) as total_transactions
    FROM retail_sales
    GROUP BY year(purchase_date)
    """
).show()

+-------------+------------------+
|purchase_year|total_transactions|
+-------------+------------------+
|         2025|                 8|
|         2023|                20|
|         2022|                12|
|         2024|                20|
+-------------+------------------+



In [29]:
df.groupBy(year("purchase_date").alias("purchase_date")) \
    .agg(count("transaction_id").alias("total_transactions")) \
    .show()

+-------------+------------------+
|purchase_date|total_transactions|
+-------------+------------------+
|         2025|                 8|
|         2023|                20|
|         2022|                12|
|         2024|                20|
+-------------+------------------+



### Add a new column called total_amount = purchase_amount * quantity. Show the updated DataFrame.

In [30]:
spark.sql(
    """
    SELECT
        purchase_amount * quantity AS total_amount
    FROM retail_sales
    """
).show()

+------------------+
|      total_amount|
+------------------+
|           1478.56|
|             57.64|
|            367.36|
|            404.03|
|           1892.55|
|            468.24|
|             51.25|
|            914.25|
|            452.45|
|           1530.88|
|            798.84|
|            180.83|
|            1214.1|
|           1635.16|
|            1131.9|
|            284.48|
|             199.6|
| 890.9100000000001|
|           1074.84|
|1131.4499999999998|
+------------------+
only showing top 20 rows



In [31]:
df2 = df.withColumn("total_amount", col("purchase_amount") * col("quantity"))
df2.select("total_amount").show()

+------------------+
|      total_amount|
+------------------+
|           1478.56|
|             57.64|
|            367.36|
|            404.03|
|           1892.55|
|            468.24|
|             51.25|
|            914.25|
|            452.45|
|           1530.88|
|            798.84|
|            180.83|
|            1214.1|
|           1635.16|
|            1131.9|
|            284.48|
|             199.6|
| 890.9100000000001|
|           1074.84|
|1131.4499999999998|
+------------------+
only showing top 20 rows



### What is the average total_amount per product category?

In [32]:
df2.groupBy("product_category") \
    .agg(avg("total_amount").alias("average_total_amount")) \
    .show()

+----------------+--------------------+
|product_category|average_total_amount|
+----------------+--------------------+
|  Home & Kitchen|  1069.8174999999999|
|     Electronics|   901.5125000000002|
|        Clothing|             1060.49|
|           Books|   959.6081818181821|
|            Toys|   869.5723529411765|
+----------------+--------------------+



## Bonus

### Which day of the week has the highest number of purchases?

In [33]:
spark.sql(
    """
    SELECT
        dayofweek(purchase_date) as purchase_day,
        count(transaction_id) as total_transactions
    FROM retail_sales
    GROUP BY dayofweek(purchase_date)
    ORDER BY purchase_day
    """
).show()

+------------+------------------+
|purchase_day|total_transactions|
+------------+------------------+
|           1|                12|
|           2|                 9|
|           3|                 7|
|           4|                 5|
|           5|                 9|
|           6|                13|
|           7|                 5|
+------------+------------------+



In [34]:
df.groupBy(dayofweek("purchase_date").alias("purchase_day")) \
    .agg(count("transaction_id").alias("total_transactions")) \
    .orderBy("purchase_day") \
    .show()

+------------+------------------+
|purchase_day|total_transactions|
+------------+------------------+
|           1|                12|
|           2|                 9|
|           3|                 7|
|           4|                 5|
|           5|                 9|
|           6|                13|
|           7|                 5|
+------------+------------------+



### Create a temporary view and write a SQL query to find the most popular product category for each payment method.

In [35]:
spark.sql(
    """
    SELECT
        product_category,
        payment_method,
        total_transactions
    FROM (
        SELECT
            payment_method,
            product_category,
            count(*) as total_transactions,
            row_number() over (partition by payment_method order by count(*) desc) as rank
        FROM retail_sales
        GROUP BY payment_method, product_category
        ) ranked
    WHERE rank = 1
    """
).show()

+----------------+--------------+------------------+
|product_category|payment_method|total_transactions|
+----------------+--------------+------------------+
|     Electronics|          Cash|                 3|
|        Clothing|   Credit Card|                 5|
|            Toys|    Debit Card|                 4|
|            Toys|     Gift Card|                 4|
|            Toys|        PayPal|                 4|
+----------------+--------------+------------------+



### Filter and display all purchases made using PayPal over $200.

In [36]:
spark.sql(
    """
    SELECT
        product_category,
        payment_method,
        purchase_amount
    FROM retail_sales
    WHERE payment_method = "PayPal" and purchase_amount > 200
    """
).show()

+----------------+--------------+---------------+
|product_category|payment_method|purchase_amount|
+----------------+--------------+---------------+
|  Home & Kitchen|        PayPal|         378.51|
|           Books|        PayPal|         284.48|
|     Electronics|        PayPal|         342.98|
|  Home & Kitchen|        PayPal|         206.03|
|            Toys|        PayPal|         224.91|
|            Toys|        PayPal|         459.28|
|        Clothing|        PayPal|         427.28|
|           Books|        PayPal|         341.09|
+----------------+--------------+---------------+



In [37]:
df.select("product_category", "payment_method", "purchase_amount") \
    .filter((col("payment_method") == "PayPal") & (col("purchase_amount") > 200)) \
    .show()

+----------------+--------------+---------------+
|product_category|payment_method|purchase_amount|
+----------------+--------------+---------------+
|  Home & Kitchen|        PayPal|         378.51|
|           Books|        PayPal|         284.48|
|     Electronics|        PayPal|         342.98|
|  Home & Kitchen|        PayPal|         206.03|
|            Toys|        PayPal|         224.91|
|            Toys|        PayPal|         459.28|
|        Clothing|        PayPal|         427.28|
|           Books|        PayPal|         341.09|
+----------------+--------------+---------------+

