In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
spark, spark.version


(<pyspark.sql.session.SparkSession at 0x7ad6a40a7f90>, '3.5.0')

In [2]:
sc = spark.sparkContext
sc.parallelize(range(1000000)).sum()


499999500000

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, TimestampType
     

In [4]:
spark = SparkSession.builder.appName("Retail-Data-PySpark").getOrCreate()
INPUT_ROOT = "/home/jovyan/data"

In [5]:
orders_schema = StructType([
    StructField("order_id", IntegerType(), True),
    StructField("order_date", TimestampType(), True),
    StructField("order_customer_id", IntegerType(), True),
    StructField("order_status", StringType(), True)
])

order_items_schema = StructType([
    StructField("order_item_id", IntegerType(), True),
    StructField("order_item_order_id", IntegerType(), True),
    StructField("order_item_product_id", IntegerType(), True),
    StructField("order_item_quantity", IntegerType(), True),
    StructField("order_item_subtotal", DoubleType(), True),
    StructField("order_item_product_price", DoubleType(), True)
])

customers_schema = StructType([
    StructField("customer_id", IntegerType(), True),
    StructField("customer_fname", StringType(), True),
    StructField("customer_lname", StringType(), True),
    StructField("customer_email", StringType(), True),
    StructField("customer_password", StringType(), True),
    StructField("customer_street", StringType(), True),
    StructField("customer_city", StringType(), True),
    StructField("customer_state", StringType(), True),
    StructField("customer_zipcode", StringType(), True)
])

categories_schema = StructType([
    StructField("category_id", IntegerType(), True),
    StructField("category_department_id", IntegerType(), True),
    StructField("category_name", StringType(), True)
])

products_schema = StructType([
    StructField("product_id", IntegerType(), True),
    StructField("product_category_id", IntegerType(), True),
    StructField("product_name", StringType(), True),
    StructField("product_description", StringType(), True),
    StructField("product_price", DoubleType(), True),
    StructField("product_image", StringType(), True)
])

departments_schema = StructType([
    StructField("department_id", IntegerType(), True),
    StructField("department_name", StringType(), True)
])

In [6]:
orders = spark.read.option("header", False).schema(orders_schema).csv(f"{INPUT_ROOT}/orders.csv")
order_items = spark.read.option("header", False).schema(order_items_schema).csv(f"{INPUT_ROOT}/order_items.csv")
customers = spark.read.option("header", False).schema(customers_schema).csv(f"{INPUT_ROOT}/customers.csv")
categories = spark.read.option("header", False).schema(categories_schema).csv(f"{INPUT_ROOT}/categories.csv")
products = spark.read.option("header", False).schema(products_schema).csv(f"{INPUT_ROOT}/products.csv")
departments = spark.read.option("header", False).schema(departments_schema).csv(f"{INPUT_ROOT}/departments.csv")

In [7]:
orders.createOrReplaceTempView("orders")
order_items.createOrReplaceTempView("order_items")
customers.createOrReplaceTempView("customers")
categories.createOrReplaceTempView("categories")
products.createOrReplaceTempView("products")
departments.createOrReplaceTempView("departments")

### Retail SQL → Spark SQL Queries
- Each query is explained with a Markdown cell and executed in Spark SQL using spark.sql().

#### 1.Distinct order status

In [8]:
spark.sql("""
SELECT DISTINCT order_status
FROM orders
ORDER BY order_status
""").show()

+---------------+
|   order_status|
+---------------+
|       CANCELED|
|         CLOSED|
|       COMPLETE|
|        ON_HOLD|
| PAYMENT_REVIEW|
|        PENDING|
|PENDING_PAYMENT|
|     PROCESSING|
|SUSPECTED_FRAUD|
+---------------+



#### 2. Orders with COMPLETE status

In [9]:

spark.sql("SELECT * FROM orders WHERE order_status='COMPLETE'").show()

+--------+-------------------+-----------------+------------+
|order_id|         order_date|order_customer_id|order_status|
+--------+-------------------+-----------------+------------+
|       3|2013-07-25 00:00:00|            12111|    COMPLETE|
|       5|2013-07-25 00:00:00|            11318|    COMPLETE|
|       6|2013-07-25 00:00:00|             7130|    COMPLETE|
|       7|2013-07-25 00:00:00|             4530|    COMPLETE|
|      15|2013-07-25 00:00:00|             2568|    COMPLETE|
|      17|2013-07-25 00:00:00|             2667|    COMPLETE|
|      22|2013-07-25 00:00:00|              333|    COMPLETE|
|      26|2013-07-25 00:00:00|             7562|    COMPLETE|
|      28|2013-07-25 00:00:00|              656|    COMPLETE|
|      32|2013-07-25 00:00:00|             3960|    COMPLETE|
|      35|2013-07-25 00:00:00|             4840|    COMPLETE|
|      45|2013-07-25 00:00:00|             2636|    COMPLETE|
|      56|2013-07-25 00:00:00|            10519|    COMPLETE|
|      6

#### 3. Orders with CLOSED status

In [10]:
spark.sql("SELECT * FROM orders WHERE order_status='CLOSED'").show()

+--------+-------------------+-----------------+------------+
|order_id|         order_date|order_customer_id|order_status|
+--------+-------------------+-----------------+------------+
|       1|2013-07-25 00:00:00|            11599|      CLOSED|
|       4|2013-07-25 00:00:00|             8827|      CLOSED|
|      12|2013-07-25 00:00:00|             1837|      CLOSED|
|      18|2013-07-25 00:00:00|             1205|      CLOSED|
|      24|2013-07-25 00:00:00|            11441|      CLOSED|
|      25|2013-07-25 00:00:00|             9503|      CLOSED|
|      37|2013-07-25 00:00:00|             5863|      CLOSED|
|      51|2013-07-25 00:00:00|            12271|      CLOSED|
|      57|2013-07-25 00:00:00|             7073|      CLOSED|
|      61|2013-07-25 00:00:00|             4791|      CLOSED|
|      62|2013-07-25 00:00:00|             9111|      CLOSED|
|      87|2013-07-25 00:00:00|             3065|      CLOSED|
|      90|2013-07-25 00:00:00|             9131|      CLOSED|
|     10

#### 4. Orders with CLOSED or COMPLETE status

In [11]:
spark.sql("SELECT * FROM orders WHERE order_status IN ('CLOSED','COMPLETE')").show()


+--------+-------------------+-----------------+------------+
|order_id|         order_date|order_customer_id|order_status|
+--------+-------------------+-----------------+------------+
|       1|2013-07-25 00:00:00|            11599|      CLOSED|
|       3|2013-07-25 00:00:00|            12111|    COMPLETE|
|       4|2013-07-25 00:00:00|             8827|      CLOSED|
|       5|2013-07-25 00:00:00|            11318|    COMPLETE|
|       6|2013-07-25 00:00:00|             7130|    COMPLETE|
|       7|2013-07-25 00:00:00|             4530|    COMPLETE|
|      12|2013-07-25 00:00:00|             1837|      CLOSED|
|      15|2013-07-25 00:00:00|             2568|    COMPLETE|
|      17|2013-07-25 00:00:00|             2667|    COMPLETE|
|      18|2013-07-25 00:00:00|             1205|      CLOSED|
|      22|2013-07-25 00:00:00|              333|    COMPLETE|
|      24|2013-07-25 00:00:00|            11441|      CLOSED|
|      25|2013-07-25 00:00:00|             9503|      CLOSED|
|      2

#### 5. Count of orders

In [12]:
spark.sql("SELECT COUNT(*) AS order_count FROM orders").show()


+-----------+
|order_count|
+-----------+
|      68883|
+-----------+



#### 6. Count of order items

In [13]:
spark.sql("SELECT COUNT(*) AS order_items_count FROM order_items").show()


+-----------------+
|order_items_count|
+-----------------+
|           172198|
+-----------------+



#### 7. Count of distinct order statuses

In [14]:
spark.sql("SELECT COUNT(DISTINCT order_status) AS distinct_status_count FROM orders").show()


+---------------------+
|distinct_status_count|
+---------------------+
|                    9|
+---------------------+




#### 8. Order revenue per order

In [15]:

spark.sql("""
SELECT order_item_order_id,
       ROUND(SUM(order_item_subtotal),2) AS order_revenue
FROM order_items
GROUP BY order_item_order_id
ORDER BY order_item_order_id
""").show()

+-------------------+-------------+
|order_item_order_id|order_revenue|
+-------------------+-------------+
|                  1|       299.98|
|                  2|       579.98|
|                  4|       699.85|
|                  5|      1129.86|
|                  7|       579.92|
|                  8|       729.84|
|                  9|       599.96|
|                 10|       651.92|
|                 11|       919.79|
|                 12|      1299.87|
|                 13|       127.96|
|                 14|       549.94|
|                 15|       925.91|
|                 16|       419.93|
|                 17|       694.84|
|                 18|       449.96|
|                 19|       699.96|
|                 20|       879.86|
|                 21|       372.91|
|                 23|       299.98|
+-------------------+-------------+
only showing top 20 rows



#### 9. Orders with >=120 per date

In [16]:
spark.sql("""
SELECT order_date, COUNT(*) AS order_count
FROM orders
WHERE order_status IN ('COMPLETE','CLOSED')
GROUP BY order_date
HAVING COUNT(*) >= 120
ORDER BY order_count DESC
""").show()

+-------------------+-----------+
|         order_date|order_count|
+-------------------+-----------+
|2013-11-03 00:00:00|        146|
|2013-10-04 00:00:00|        127|
|2013-11-05 00:00:00|        126|
|2013-09-14 00:00:00|        126|
|2013-11-11 00:00:00|        125|
|2013-11-14 00:00:00|        125|
|2013-11-30 00:00:00|        125|
|2014-06-19 00:00:00|        124|
|2014-05-06 00:00:00|        123|
|2014-07-20 00:00:00|        123|
|2013-08-06 00:00:00|        123|
|2014-04-21 00:00:00|        123|
|2014-06-09 00:00:00|        122|
|2014-03-04 00:00:00|        122|
|2014-02-25 00:00:00|        122|
|2014-07-15 00:00:00|        121|
|2013-12-06 00:00:00|        121|
|2014-03-15 00:00:00|        121|
|2013-09-05 00:00:00|        120|
|2013-10-13 00:00:00|        120|
+-------------------+-----------+
only showing top 20 rows



#### 10. Orders with revenue >=2000

In [17]:
spark.sql("""
SELECT order_item_order_id,
       ROUND(SUM(order_item_subtotal),2) AS order_revenue
FROM order_items
GROUP BY order_item_order_id
HAVING ROUND(SUM(order_item_subtotal),2) >= 2000
ORDER BY order_revenue DESC
""").show()

+-------------------+-------------+
|order_item_order_id|order_revenue|
+-------------------+-------------+
|              68703|      3449.91|
|              68724|      2859.89|
|              68858|      2839.91|
|              68809|      2779.86|
|              68766|       2699.9|
|              68821|      2629.92|
|              68806|      2629.92|
|              68778|       2629.9|
|              68848|      2399.96|
|              68875|      2399.95|
|              68859|      2349.89|
|              68816|      2329.94|
|              68837|      2299.96|
|              68736|      2259.95|
|              68722|      2199.99|
|              68883|      2149.99|
|              22030|       2039.8|
+-------------------+-------------+



#### 11. Inner Join Orders + Order Items

In [18]:
spark.sql("""
SELECT o.order_date, oi.order_item_product_id, oi.order_item_subtotal
FROM orders o
JOIN order_items oi
ON o.order_id = oi.order_item_order_id
""").show()

+-------------------+---------------------+-------------------+
|         order_date|order_item_product_id|order_item_subtotal|
+-------------------+---------------------+-------------------+
|2013-07-25 00:00:00|                  957|             299.98|
|2013-07-25 00:00:00|                 1073|             199.99|
|2013-07-25 00:00:00|                  502|              250.0|
|2013-07-25 00:00:00|                  403|             129.99|
|2013-07-25 00:00:00|                  897|              49.98|
|2013-07-25 00:00:00|                  365|             299.95|
|2013-07-25 00:00:00|                  502|              150.0|
|2013-07-25 00:00:00|                 1014|             199.92|
|2013-07-25 00:00:00|                  957|             299.98|
|2013-07-25 00:00:00|                  365|             299.95|
|2013-07-25 00:00:00|                 1014|              99.96|
|2013-07-25 00:00:00|                  957|             299.98|
|2013-07-25 00:00:00|                  4

#### 12. Left Outer Join Orders + Order Items

In [19]:
spark.sql("""
SELECT o.order_id, o.order_date,
       oi.order_item_id, oi.order_item_product_id, oi.order_item_subtotal
FROM orders o
LEFT OUTER JOIN order_items oi
ON o.order_id = oi.order_item_order_id
ORDER BY o.order_id
""").show()

+--------+-------------------+-------------+---------------------+-------------------+
|order_id|         order_date|order_item_id|order_item_product_id|order_item_subtotal|
+--------+-------------------+-------------+---------------------+-------------------+
|       1|2013-07-25 00:00:00|            1|                  957|             299.98|
|       2|2013-07-25 00:00:00|            4|                  403|             129.99|
|       2|2013-07-25 00:00:00|            3|                  502|              250.0|
|       2|2013-07-25 00:00:00|            2|                 1073|             199.99|
|       3|2013-07-25 00:00:00|         NULL|                 NULL|               NULL|
|       4|2013-07-25 00:00:00|            8|                 1014|             199.92|
|       4|2013-07-25 00:00:00|            7|                  502|              150.0|
|       4|2013-07-25 00:00:00|            6|                  365|             299.95|
|       4|2013-07-25 00:00:00|            5

#### 13. Daily Revenue temp view

In [20]:
spark.sql("""
CREATE OR REPLACE TEMP VIEW daily_revenue AS
SELECT to_date(o.order_date) AS order_date,
       ROUND(SUM(oi.order_item_subtotal),2) AS order_revenue
FROM orders o
JOIN order_items oi
ON o.order_id = oi.order_item_order_id
WHERE o.order_status IN ('COMPLETE','CLOSED')
GROUP BY to_date(o.order_date)
""")
spark.sql("SELECT * FROM daily_revenue ORDER BY order_date").show()

+----------+-------------+
|order_date|order_revenue|
+----------+-------------+
|2013-07-25|     31547.23|
|2013-07-26|     54713.23|
|2013-07-27|     48411.48|
|2013-07-28|     35672.03|
|2013-07-29|      54579.7|
|2013-07-30|     49329.29|
|2013-07-31|     59212.49|
|2013-08-01|     49160.08|
|2013-08-02|     50688.58|
|2013-08-03|     43416.74|
|2013-08-04|     35093.01|
|2013-08-05|     34025.27|
|2013-08-06|     57843.89|
|2013-08-07|     45525.59|
|2013-08-08|     33549.47|
|2013-08-09|     29225.16|
|2013-08-10|     46435.04|
|2013-08-11|      31155.5|
|2013-08-12|     59014.74|
|2013-08-13|     17956.88|
+----------+-------------+
only showing top 20 rows



#### 14. Daily Product Revenue temp view

In [21]:
spark.sql("""
CREATE OR REPLACE TEMP VIEW daily_product_revenue AS
SELECT to_date(o.order_date) AS order_date,
       oi.order_item_product_id,
       ROUND(SUM(oi.order_item_subtotal),2) AS order_revenue
FROM orders o
JOIN order_items oi
ON o.order_id = oi.order_item_order_id
WHERE o.order_status IN ('COMPLETE','CLOSED')
GROUP BY to_date(o.order_date), oi.order_item_product_id
""")
spark.sql("SELECT * FROM daily_product_revenue ORDER BY order_date, order_revenue DESC").show()
     

+----------+---------------------+-------------+
|order_date|order_item_product_id|order_revenue|
+----------+---------------------+-------------+
|2013-07-25|                 1004|      5599.72|
|2013-07-25|                  191|      5099.49|
|2013-07-25|                  957|       4499.7|
|2013-07-25|                  365|      3359.44|
|2013-07-25|                 1073|      2999.85|
|2013-07-25|                 1014|      2798.88|
|2013-07-25|                  403|      1949.85|
|2013-07-25|                  502|       1650.0|
|2013-07-25|                  627|      1079.73|
|2013-07-25|                  226|       599.99|
|2013-07-25|                   24|       319.96|
|2013-07-25|                  821|       207.96|
|2013-07-25|                  625|       199.99|
|2013-07-25|                  705|       119.99|
|2013-07-25|                  572|       119.97|
|2013-07-25|                  666|       109.99|
|2013-07-25|                  725|        108.0|
|2013-07-25|        

#### 15. Monthly Revenue with Window

In [22]:
spark.sql("""
SELECT date_format(dr.order_date,'yyyy-MM') AS order_month,
       dr.order_date,
       dr.order_revenue,
       SUM(dr.order_revenue) OVER (PARTITION BY date_format(dr.order_date,'yyyy-MM')) AS monthly_order_revenue
FROM daily_revenue dr
ORDER BY dr.order_date
""").show()

+-----------+----------+-------------+---------------------+
|order_month|order_date|order_revenue|monthly_order_revenue|
+-----------+----------+-------------+---------------------+
|    2013-07|2013-07-25|     31547.23|   333465.44999999995|
|    2013-07|2013-07-26|     54713.23|   333465.44999999995|
|    2013-07|2013-07-27|     48411.48|   333465.44999999995|
|    2013-07|2013-07-28|     35672.03|   333465.44999999995|
|    2013-07|2013-07-29|      54579.7|   333465.44999999995|
|    2013-07|2013-07-30|     49329.29|   333465.44999999995|
|    2013-07|2013-07-31|     59212.49|   333465.44999999995|
|    2013-08|2013-08-01|     49160.08|   1221828.8999999997|
|    2013-08|2013-08-02|     50688.58|   1221828.8999999997|
|    2013-08|2013-08-03|     43416.74|   1221828.8999999997|
|    2013-08|2013-08-04|     35093.01|   1221828.8999999997|
|    2013-08|2013-08-05|     34025.27|   1221828.8999999997|
|    2013-08|2013-08-06|     57843.89|   1221828.8999999997|
|    2013-08|2013-08-07|

#### 16. Total Revenue with Window

In [23]:
spark.sql("""
SELECT dr.*,
       SUM(order_revenue) OVER () AS total_order_revenue
FROM daily_revenue dr
ORDER BY dr.order_date
""").show()

+----------+-------------+--------------------+
|order_date|order_revenue| total_order_revenue|
+----------+-------------+--------------------+
|2013-07-25|     31547.23|1.5012982480000008E7|
|2013-07-26|     54713.23|1.5012982480000008E7|
|2013-07-27|     48411.48|1.5012982480000008E7|
|2013-07-28|     35672.03|1.5012982480000008E7|
|2013-07-29|      54579.7|1.5012982480000008E7|
|2013-07-30|     49329.29|1.5012982480000008E7|
|2013-07-31|     59212.49|1.5012982480000008E7|
|2013-08-01|     49160.08|1.5012982480000008E7|
|2013-08-02|     50688.58|1.5012982480000008E7|
|2013-08-03|     43416.74|1.5012982480000008E7|
|2013-08-04|     35093.01|1.5012982480000008E7|
|2013-08-05|     34025.27|1.5012982480000008E7|
|2013-08-06|     57843.89|1.5012982480000008E7|
|2013-08-07|     45525.59|1.5012982480000008E7|
|2013-08-08|     33549.47|1.5012982480000008E7|
|2013-08-09|     29225.16|1.5012982480000008E7|
|2013-08-10|     46435.04|1.5012982480000008E7|
|2013-08-11|      31155.5|1.501298248000

#### 17. Top 5 Products per Day (Window)

In [24]:
spark.sql("""
WITH daily_product_revenue_ranks AS (
  SELECT order_date,
         order_item_product_id,
         order_revenue,
         RANK() OVER (PARTITION BY order_date ORDER BY order_revenue DESC) AS rnk,
         DENSE_RANK() OVER (PARTITION BY order_date ORDER BY order_revenue DESC) AS drnk
  FROM daily_product_revenue
  WHERE date_format(order_date,'yyyy-MM')='2014-01'
)
SELECT *
FROM daily_product_revenue_ranks
WHERE drnk <= 5
ORDER BY order_date, order_revenue DESC
""").show()

+----------+---------------------+-------------+---+----+
|order_date|order_item_product_id|order_revenue|rnk|drnk|
+----------+---------------------+-------------+---+----+
|2014-01-01|                 1004|      5599.72|  1|   1|
|2014-01-01|                  191|      4399.56|  2|   2|
|2014-01-01|                  365|      3839.36|  3|   3|
|2014-01-01|                  502|       3300.0|  4|   4|
|2014-01-01|                  957|      3299.78|  5|   5|
|2014-01-02|                 1004|      4799.76|  1|   1|
|2014-01-02|                  957|       2999.8|  2|   2|
|2014-01-02|                  365|      2939.51|  3|   3|
|2014-01-02|                 1073|      2599.87|  4|   4|
|2014-01-02|                  403|      2469.81|  5|   5|
|2014-01-03|                 1004|     11599.42|  1|   1|
|2014-01-03|                  365|      6958.84|  2|   2|
|2014-01-03|                  191|      5599.44|  3|   3|
|2014-01-03|                  957|      5399.64|  4|   4|
|2014-01-03|  