In [0]:
# Import Data
# Every source CSV sits in the same directory, so the directory is spelled once
# here instead of being repeated in each read call.
DATA_DIR = "/Volumes/workspace/default/instacartmarketbasketanalysis"


def read_instacart_csv(file_name):
    """Read one CSV file from DATA_DIR into a Spark DataFrame.

    The first row is treated as the header and column types are inferred,
    which are the same reader options for every file in this notebook.

    Args:
        file_name: File name inside DATA_DIR, e.g. "orders.csv".

    Returns:
        The DataFrame produced by spark.read.csv for that file.
    """
    return spark.read.csv(f"{DATA_DIR}/{file_name}", header=True, inferSchema=True)


aisles = read_instacart_csv("aisles.csv")
departments = read_instacart_csv("departments.csv")
# The prior/train file names contain a double underscore.
order_products_prior = read_instacart_csv("order_products__prior.csv")
order_products_train = read_instacart_csv("order_products__train.csv")
orders = read_instacart_csv("orders.csv")
products = read_instacart_csv("products.csv")

# Create Temporary Tables
# Each DataFrame is registered under its own variable name so the SQL cells
# and spark.sql(...) calls can refer to it.
aisles.createOrReplaceTempView("aisles")
departments.createOrReplaceTempView("departments")
order_products_prior.createOrReplaceTempView("order_products_prior")
order_products_train.createOrReplaceTempView("order_products_train")
orders.createOrReplaceTempView("orders")
products.createOrReplaceTempView("products")

In [0]:
%sql
-- Count the orders for each order_dow code, label the code with a weekday
-- name, and list the busiest day first.
SELECT
  COUNT(order_id) AS total_orders,
  CASE order_dow
    WHEN '0' THEN 'Sunday'
    WHEN '1' THEN 'Monday'
    WHEN '2' THEN 'Tuesday'
    WHEN '3' THEN 'Wednesday'
    WHEN '4' THEN 'Thursday'
    WHEN '5' THEN 'Friday'
    WHEN '6' THEN 'Saturday'
  END AS day_of_week  -- no ELSE branch: any other code yields NULL
FROM orders
GROUP BY order_dow
ORDER BY total_orders DESC

In [0]:
%sql
-- Number of orders placed in each hour of the day, listed in hour order.
SELECT
  COUNT(order_id)   AS total_orders,
  order_hour_of_day AS hour
FROM orders
GROUP BY order_hour_of_day
ORDER BY order_hour_of_day

In [0]:
%sql
-- Orders per hour of the day, sorted by hour.
-- NOTE(review): this query is identical to the one in the previous cell;
-- presumably it is kept to attach a second visualization - confirm, or drop one.
SELECT
  COUNT(order_id) AS total_orders,
  order_hour_of_day AS hour
FROM
  orders
GROUP BY
  order_hour_of_day
ORDER BY
  order_hour_of_day

In [0]:
# Organize the data by shopping basket
# `col` and `count` are not used in this cell; `count` is used by a later cell
# (orders per user), so the import list is left unchanged.
from pyspark.sql.functions import collect_set, col, count

# Pair each row of order_products_train with the name of its product.
# The join condition is an explicit ON clause (rather than a bare INNER JOIN
# filtered in WHERE), matching the join style of the top-products query.
rawData = spark.sql("""
    SELECT p.product_name, o.order_id
    FROM products p
    INNER JOIN order_products_train o ON o.product_id = p.product_id
""")

# One row per order_id; collect_set gathers that order's distinct product
# names into the array column `items`.
baskets = rawData.groupBy('order_id').agg(collect_set('product_name').alias('items'))

# Expose the baskets to SQL under the name `baskets`.
baskets.createOrReplaceTempView('baskets')

In [0]:
# Visualize the distribution of basket sizes
from pyspark.sql.functions import size

# Keep every existing basket column and append the number of elements in
# `items` as a new `basket_size` column.
basket_sizes = baskets.select("*", size("items").alias("basket_size"))

# Only the size column is handed to display().
display(basket_sizes.select(basket_sizes["basket_size"]))

# Visualize the most common products
# Joins products to order_products_train on product_id, counts the joined rows
# per product_name, and keeps the 20 names with the highest counts
# (sorted from most to least frequent).
top_products = spark.sql("""
    SELECT product_name, COUNT(*) as count
    FROM products p
    JOIN order_products_train o ON p.product_id = o.product_id
    GROUP BY product_name
    ORDER BY count DESC
    LIMIT 20
""")
# Result columns: product_name and count.
display(top_products)

# Visualize the number of orders per user
# `count` is the pyspark.sql.functions aggregate imported in an earlier cell.
orders_per_user = (
    orders
    .groupBy("user_id")
    .agg(count("order_id").alias("num_orders"))
)

# Only the per-user order count is handed to display().
display(orders_per_user.select(orders_per_user["num_orders"]))

In [0]:
# Preview: show all columns for ten rows of the `orders` temp view.
display(spark.table("orders").limit(10))