In [0]:
%sql
-- Order volume per day of week.
-- order_dow codes '0'..'6' are labelled Sunday..Saturday; any other code
-- gets a NULL label because the CASE has no ELSE branch.
SELECT
    COUNT(order_id) AS total_orders,
    CASE order_dow
        WHEN '0' THEN 'Sunday'
        WHEN '1' THEN 'Monday'
        WHEN '2' THEN 'Tuesday'
        WHEN '3' THEN 'Wednesday'
        WHEN '4' THEN 'Thursday'
        WHEN '5' THEN 'Friday'
        WHEN '6' THEN 'Saturday'
    END AS day_of_week
FROM workspace.instacart.orders
GROUP BY order_dow
ORDER BY order_dow ASC

Databricks visualization. Run in Databricks to view.

In [0]:
%sql
-- Order volume for each hour of the day, listed in ascending hour order.
SELECT
    COUNT(o.order_id) AS total_orders,
    o.order_hour_of_day AS hour
FROM workspace.instacart.orders AS o
GROUP BY o.order_hour_of_day
ORDER BY o.order_hour_of_day

Databricks visualization. Run in Databricks to view.

In [0]:
%sql
-- Top 10 departments by number of distinct products.
-- products.department_id is TRY_CAST to BIGINT before being compared with
-- departments.department_id; ids that cannot be cast become NULL and
-- therefore never match in the inner join.
-- Tables are fully qualified with the workspace catalog, like the other
-- cells in this notebook, so the result does not depend on the session's
-- current catalog.
SELECT
    d.department,
    COUNT(DISTINCT p.product_id) AS products
FROM (
    SELECT
        product_id,
        TRY_CAST(department_id AS BIGINT) AS department_id
    FROM workspace.instacart.products
) AS p
INNER JOIN workspace.instacart.departments AS d
    ON d.department_id = p.department_id
GROUP BY d.department
-- Department name breaks ties so the LIMIT always keeps the same rows.
ORDER BY products DESC, d.department ASC
LIMIT 10


Databricks visualization. Run in Databricks to view.


SECTION 1: DATASET OVERVIEW & DATA QUALITY


In [0]:
%sql
-- Data quality summary for the orders table: row and key cardinalities,
-- NULL counts for the two time columns, the order_number range and the
-- mean gap between orders.
SELECT
    'orders'                                          AS table_name,
    COUNT(*)                                          AS total_records,
    COUNT(DISTINCT order_id)                          AS unique_orders,
    COUNT(DISTINCT user_id)                           AS unique_users,
    COUNT(*) FILTER (WHERE order_dow IS NULL)         AS null_dow,
    COUNT(*) FILTER (WHERE order_hour_of_day IS NULL) AS null_hour,
    MIN(order_number)                                 AS min_order_num,
    MAX(order_number)                                 AS max_order_num,
    ROUND(AVG(days_since_prior_order), 2)             AS avg_days_between_orders
FROM workspace.instacart.orders

In [0]:
%sql
-- Data quality summary for the products table.
-- An id counts as invalid when TRY_CAST to BIGINT yields NULL.
WITH typed_products AS (
    SELECT
        product_id,
        product_name,
        TRY_CAST(department_id AS BIGINT) AS department_id_num,
        TRY_CAST(aisle_id AS BIGINT)      AS aisle_id_num
    FROM workspace.instacart.products
)
SELECT
    'products'                                                AS table_name,
    COUNT(*)                                                  AS total_records,
    COUNT(DISTINCT product_id)                                AS unique_products,
    COUNT(CASE WHEN product_name IS NULL THEN 1 END)          AS null_names,
    COUNT(CASE WHEN department_id_num IS NULL THEN 1 END)     AS invalid_dept_ids,
    COUNT(CASE WHEN aisle_id_num IS NULL THEN 1 END)          AS invalid_aisle_ids
FROM typed_products

In [0]:
%sql
-- Data quality summary for order_products_prior: cardinalities, mean
-- add-to-cart position, and the number and share of reordered line items.
WITH flagged_items AS (
    SELECT
        order_id,
        product_id,
        add_to_cart_order,
        -- 1 for a reordered line item, 0 otherwise (including NULL).
        CASE WHEN reordered = 1 THEN 1 ELSE 0 END AS is_reorder
    FROM workspace.instacart.order_products_prior
)
SELECT
    'order_products_prior'                       AS table_name,
    COUNT(*)                                     AS total_records,
    COUNT(DISTINCT order_id)                     AS unique_orders,
    COUNT(DISTINCT product_id)                   AS unique_products,
    ROUND(AVG(add_to_cart_order), 2)             AS avg_add_to_cart_position,
    SUM(is_reorder)                              AS reordered_items,
    ROUND(SUM(is_reorder) * 100.0 / COUNT(*), 2) AS reorder_rate_pct
FROM flagged_items


SECTION 2: USER BEHAVIOR ANALYSIS


In [0]:
%sql
-- User Purchase Frequency Segments
-- Groups users by how many distinct orders they placed and reports, per
-- segment, the user count, mean orders, mean days between orders and the
-- segment's share of all users.
WITH user_orders AS (
    SELECT
        user_id,
        COUNT(DISTINCT order_id) AS total_orders,
        -- Kept unrounded: rounding happens once, on the segment-level
        -- average, instead of averaging already-rounded values.
        AVG(days_since_prior_order) AS avg_days_between
    FROM workspace.instacart.orders
    GROUP BY user_id
)
SELECT
    CASE
        WHEN total_orders = 1 THEN '1 order (One-time)'
        WHEN total_orders BETWEEN 2 AND 5 THEN '2-5 orders (Occasional)'
        WHEN total_orders BETWEEN 6 AND 10 THEN '6-10 orders (Regular)'
        WHEN total_orders BETWEEN 11 AND 20 THEN '11-20 orders (Frequent)'
        -- Exactly 20 orders belongs to the bucket above, so this one
        -- starts at 21.
        ELSE '21+ orders (Very Frequent)'
    END AS user_segment,
    COUNT(*) AS user_count,
    ROUND(AVG(total_orders), 1) AS avg_orders_in_segment,
    ROUND(AVG(avg_days_between), 1) AS avg_reorder_days,
    ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 2) AS pct_of_users
FROM user_orders
GROUP BY user_segment
-- Sort by segment position rather than by label text; the open-ended
-- bucket is matched by ELSE and comes last.
ORDER BY
    CASE user_segment
        WHEN '1 order (One-time)' THEN 1
        WHEN '2-5 orders (Occasional)' THEN 2
        WHEN '6-10 orders (Regular)' THEN 3
        WHEN '11-20 orders (Frequent)' THEN 4
        ELSE 5
    END

In [0]:
%sql
-- Basket Size Distribution
-- Counts line items per prior order, then buckets orders by basket size
-- and reports count, mean/min/max items and share of orders per bucket.
WITH basket_sizes AS (
    SELECT
        order_id,
        COUNT(*) AS items_in_basket
    FROM workspace.instacart.order_products_prior
    GROUP BY order_id
)
SELECT
    CASE
        WHEN items_in_basket = 1 THEN '1 item'
        WHEN items_in_basket BETWEEN 2 AND 5 THEN '2-5 items (Small)'
        WHEN items_in_basket BETWEEN 6 AND 10 THEN '6-10 items (Medium)'
        WHEN items_in_basket BETWEEN 11 AND 20 THEN '11-20 items (Large)'
        WHEN items_in_basket BETWEEN 21 AND 30 THEN '21-30 items (Very Large)'
        -- Exactly 30 items belongs to the bucket above, so this one
        -- starts at 31.
        ELSE '31+ items (Mega)'
    END AS basket_category,
    COUNT(*) AS order_count,
    ROUND(AVG(items_in_basket), 1) AS avg_items,
    MIN(items_in_basket) AS min_items,
    MAX(items_in_basket) AS max_items,
    ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 2) AS pct_of_orders
FROM basket_sizes
GROUP BY basket_category
-- min_items rises from bucket to bucket, so it gives the natural order.
ORDER BY min_items



In [0]:
%sql
-- Reorder Behavior Distribution
-- Buckets every prior order by the share of its line items that are
-- reorders (reordered = 1) and reports, per bucket, the order count, mean
-- basket size, mean reorder share and the bucket's share of all orders.
WITH order_stats AS (
    SELECT
        order_id,
        COUNT(*) AS total_items,
        -- Left unrounded so bucket boundaries are tested on the exact
        -- percentage; rounding is applied once, in the final SELECT.
        SUM(CASE WHEN reordered = 1 THEN 1 ELSE 0 END) * 100.0 / COUNT(*) AS reorder_pct
    FROM workspace.instacart.order_products_prior
    GROUP BY order_id
)
SELECT
    CASE
        WHEN reorder_pct = 0 THEN '0% (All New Items)'
        WHEN reorder_pct > 0 AND reorder_pct <= 25 THEN '1-25% (Mostly New)'
        WHEN reorder_pct > 25 AND reorder_pct <= 50 THEN '26-50% (Mixed)'
        WHEN reorder_pct > 50 AND reorder_pct <= 75 THEN '51-75% (Mostly Reorders)'
        WHEN reorder_pct > 75 AND reorder_pct < 100 THEN '76-99% (Almost All Reorders)'
        ELSE '100% (All Reorders)'
    END AS reorder_category,
    COUNT(*) AS order_count,
    ROUND(AVG(total_items), 1) AS avg_basket_size,
    ROUND(AVG(reorder_pct), 1) AS avg_reorder_pct,
    ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 2) AS pct_of_orders
FROM order_stats
GROUP BY reorder_category
-- Sort by bucket position: a plain text sort of the labels would place
-- '100% ...' between '1-25% ...' and '26-50% ...'.
ORDER BY
    CASE reorder_category
        WHEN '0% (All New Items)' THEN 1
        WHEN '1-25% (Mostly New)' THEN 2
        WHEN '26-50% (Mixed)' THEN 3
        WHEN '51-75% (Mostly Reorders)' THEN 4
        WHEN '76-99% (Almost All Reorders)' THEN 5
        ELSE 6
    END


SECTION 3: PRODUCT POPULARITY ANALYSIS

In [0]:
%sql
-- Top 30 Most Popular Products
-- Ranks products by the number of distinct prior orders containing them,
-- with their department and aisle names, line-item count and reorder rate.
-- Department and aisle are LEFT JOINed on TRY_CAST ids, so a product whose
-- id cannot be cast or matched still appears, with NULL names.
SELECT
    p.product_name,
    d.department,
    a.aisle,
    COUNT(DISTINCT op.order_id) AS total_orders,
    COUNT(*) AS total_quantity,
    ROUND(COUNT(*) * 1.0 / COUNT(DISTINCT op.order_id), 2) AS avg_qty_per_order,
    SUM(CASE WHEN op.reordered = 1 THEN 1 ELSE 0 END) AS times_reordered,
    ROUND(SUM(CASE WHEN op.reordered = 1 THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 1) AS reorder_rate_pct
FROM workspace.instacart.order_products_prior AS op
INNER JOIN workspace.instacart.products AS p
    ON op.product_id = p.product_id
LEFT JOIN workspace.instacart.departments AS d
    ON TRY_CAST(p.department_id AS BIGINT) = d.department_id
LEFT JOIN workspace.instacart.aisles AS a
    ON TRY_CAST(p.aisle_id AS BIGINT) = a.aisle_id
GROUP BY p.product_name, d.department, a.aisle
-- Name, department and aisle break ties so the LIMIT always keeps the
-- same rows.
ORDER BY total_orders DESC, p.product_name ASC, d.department ASC, a.aisle ASC
LIMIT 30



In [0]:
%sql
-- Department Sales Ranking
-- Per department: distinct products sold, distinct orders containing the
-- department, line items sold, reorder rate and share of all line items.
SELECT
    d.department,
    COUNT(DISTINCT p.product_id) AS unique_products,
    COUNT(DISTINCT op.order_id) AS orders_containing_dept,
    COUNT(*) AS total_items_sold,
    ROUND(AVG(CASE WHEN op.reordered = 1 THEN 1.0 ELSE 0.0 END) * 100, 1) AS avg_reorder_rate,
    ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 2) AS pct_of_total_sales
FROM workspace.instacart.order_products_prior AS op
INNER JOIN workspace.instacart.products AS p
    ON op.product_id = p.product_id
-- Items without a matching, named department are excluded by the WHERE
-- filter below, so an inner join states the intent directly and returns
-- the same rows as an outer join followed by that filter.
INNER JOIN workspace.instacart.departments AS d
    ON TRY_CAST(p.department_id AS BIGINT) = d.department_id
WHERE d.department IS NOT NULL
GROUP BY d.department
-- Department name gives a stable order when item counts tie.
ORDER BY total_items_sold DESC, d.department ASC



In [0]:
%sql
-- Top 20 Most Popular Aisles
-- Ranks aisles by line items sold, with distinct product count and share
-- of sales. pct_of_total_sales is taken over every aisle group, since the
-- window is evaluated before LIMIT.
SELECT
    a.aisle,
    d.department,
    COUNT(DISTINCT p.product_id) AS unique_products,
    COUNT(*) AS total_items_sold,
    ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 2) AS pct_of_total_sales
FROM workspace.instacart.order_products_prior AS op
INNER JOIN workspace.instacart.products AS p
    ON op.product_id = p.product_id
-- Items without a matching, named aisle are excluded by the WHERE filter
-- below, so an inner join states the intent directly and returns the same
-- rows as an outer join followed by that filter.
INNER JOIN workspace.instacart.aisles AS a
    ON TRY_CAST(p.aisle_id AS BIGINT) = a.aisle_id
-- Department stays an outer join: an aisle row is kept even when its
-- department cannot be resolved (department is then NULL).
LEFT JOIN workspace.instacart.departments AS d
    ON TRY_CAST(p.department_id AS BIGINT) = d.department_id
WHERE a.aisle IS NOT NULL
GROUP BY a.aisle, d.department
-- Aisle and department break ties so the LIMIT always keeps the same rows.
ORDER BY total_items_sold DESC, a.aisle ASC, d.department ASC
LIMIT 20



SECTION 4: TEMPORAL PATTERNS (ENHANCED)

In [0]:
%sql
-- Enhanced Day of Week Analysis
-- Aggregates prior orders and their line items per order_dow code, then
-- labels each code with its day name and weekend/weekday type.
WITH dow_stats AS (
    SELECT
        o.order_dow,
        COUNT(DISTINCT o.order_id) AS total_orders,
        COUNT(op.product_id) AS total_items,
        ROUND(COUNT(op.product_id) * 1.0 / COUNT(DISTINCT o.order_id), 2) AS avg_basket_size
    FROM workspace.instacart.orders AS o
    INNER JOIN workspace.instacart.order_products_prior AS op
        ON o.order_id = op.order_id
    -- The day name is derived from order_dow alone, so grouping by the
    -- code yields the same groups as grouping by code and name.
    GROUP BY o.order_dow
)
SELECT
    CASE order_dow
        WHEN '0' THEN 'Sunday'
        WHEN '1' THEN 'Monday'
        WHEN '2' THEN 'Tuesday'
        WHEN '3' THEN 'Wednesday'
        WHEN '4' THEN 'Thursday'
        WHEN '5' THEN 'Friday'
        WHEN '6' THEN 'Saturday'
    END AS day_name,
    total_orders,
    total_items,
    avg_basket_size,
    ROUND(total_orders * 100.0 / SUM(total_orders) OVER (), 2) AS pct_of_orders,
    -- Codes '0' and '6' (labelled Sunday and Saturday above) are the weekend.
    CASE
        WHEN order_dow IN ('0', '6') THEN 'Weekend'
        ELSE 'Weekday'
    END AS day_type
FROM dow_stats
ORDER BY order_dow


In [0]:
%sql
-- Enhanced Hour of Day Analysis
-- Aggregates prior orders and their line items per hour, then tags each
-- hour with a six-hour time-of-day period.
WITH hour_stats AS (
    SELECT
        o.order_hour_of_day,
        COUNT(DISTINCT o.order_id) AS total_orders,
        COUNT(op.product_id) AS total_items,
        ROUND(COUNT(op.product_id) * 1.0 / COUNT(DISTINCT o.order_id), 2) AS avg_basket_size
    FROM workspace.instacart.orders AS o
    INNER JOIN workspace.instacart.order_products_prior AS op
        ON o.order_id = op.order_id
    -- The period is derived from the hour alone, so grouping by the hour
    -- yields the same groups as grouping by hour and period.
    GROUP BY o.order_hour_of_day
)
SELECT
    order_hour_of_day AS hour,
    CASE
        WHEN order_hour_of_day >= 0 AND order_hour_of_day <= 5 THEN 'Night (12am-6am)'
        WHEN order_hour_of_day >= 6 AND order_hour_of_day <= 11 THEN 'Morning (6am-12pm)'
        WHEN order_hour_of_day >= 12 AND order_hour_of_day <= 17 THEN 'Afternoon (12pm-6pm)'
        -- Everything else, including a NULL hour, is labelled evening.
        ELSE 'Evening (6pm-12am)'
    END AS time_period,
    total_orders,
    avg_basket_size,
    ROUND(total_orders * 100.0 / SUM(total_orders) OVER (), 2) AS pct_of_orders
FROM hour_stats
ORDER BY order_hour_of_day


In [0]:
%sql
-- Temporal Heatmap Data (Day x Hour)
-- One row per (day-of-week code, hour) with its distinct order count and
-- that cell's share of all orders, rounded to four decimals.
WITH cell_counts AS (
    SELECT
        o.order_dow,
        o.order_hour_of_day,
        COUNT(DISTINCT o.order_id) AS order_count
    FROM workspace.instacart.orders AS o
    GROUP BY o.order_dow, o.order_hour_of_day
)
SELECT
    CASE order_dow
        WHEN '0' THEN 'Sun'
        WHEN '1' THEN 'Mon'
        WHEN '2' THEN 'Tue'
        WHEN '3' THEN 'Wed'
        WHEN '4' THEN 'Thu'
        WHEN '5' THEN 'Fri'
        WHEN '6' THEN 'Sat'
    END AS day_of_week,
    order_hour_of_day AS hour,
    order_count,
    ROUND(order_count * 100.0 / SUM(order_count) OVER (), 4) AS pct_of_total_orders
FROM cell_counts
ORDER BY order_dow, order_hour_of_day


In [0]:
%sql
-- Weekend vs Weekday Comparison
-- Labels every prior-order line item as Weekend (order_dow '0' or '6') or
-- Weekday (anything else), then compares order counts, item counts,
-- basket size, reorder rate and share of orders between the two.
WITH labelled_items AS (
    SELECT
        o.order_id,
        op.product_id,
        op.reordered,
        CASE
            WHEN o.order_dow IN ('0', '6') THEN 'Weekend'
            ELSE 'Weekday'
        END AS day_type
    FROM workspace.instacart.orders AS o
    INNER JOIN workspace.instacart.order_products_prior AS op
        ON o.order_id = op.order_id
)
SELECT
    day_type,
    COUNT(DISTINCT order_id) AS total_orders,
    COUNT(product_id) AS total_items,
    ROUND(COUNT(product_id) * 1.0 / COUNT(DISTINCT order_id), 2) AS avg_basket_size,
    ROUND(AVG(CASE WHEN reordered = 1 THEN 1.0 ELSE 0.0 END) * 100, 1) AS reorder_rate_pct,
    ROUND(COUNT(DISTINCT order_id) * 100.0 /
          SUM(COUNT(DISTINCT order_id)) OVER (), 2) AS pct_of_orders
FROM labelled_items
GROUP BY day_type
-- Descending text order lists 'Weekend' before 'Weekday'.
ORDER BY day_type DESC



SECTION 5: ADVANCED INSIGHTS

In [0]:
%sql
-- Product Variety Analysis (by Order)
-- Measures, per prior order, how many distinct products, departments and
-- aisles it spans, then buckets orders by department count.
WITH order_diversity AS (
    SELECT
        op.order_id,
        COUNT(DISTINCT op.product_id) AS unique_products,
        -- Ids that fail TRY_CAST become NULL and are ignored by
        -- COUNT(DISTINCT ...), so these two counts can be 0.
        COUNT(DISTINCT TRY_CAST(p.department_id AS BIGINT)) AS unique_departments,
        COUNT(DISTINCT TRY_CAST(p.aisle_id AS BIGINT)) AS unique_aisles,
        COUNT(*) AS total_items
    FROM workspace.instacart.order_products_prior AS op
    INNER JOIN workspace.instacart.products AS p
        ON op.product_id = p.product_id
    GROUP BY op.order_id
)
SELECT
    CASE
        -- Without this branch, an order with no castable department id
        -- would fall through to ELSE and be reported as '6+ Departments'.
        WHEN unique_departments = 0 THEN 'No Valid Department'
        WHEN unique_departments = 1 THEN 'Single Department'
        WHEN unique_departments BETWEEN 2 AND 3 THEN '2-3 Departments'
        WHEN unique_departments BETWEEN 4 AND 5 THEN '4-5 Departments'
        ELSE '6+ Departments'
    END AS department_diversity,
    COUNT(*) AS order_count,
    ROUND(AVG(total_items), 1) AS avg_items,
    ROUND(AVG(unique_products), 1) AS avg_unique_products,
    ROUND(AVG(unique_aisles), 1) AS avg_aisles_visited,
    ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 2) AS pct_of_orders
FROM order_diversity
GROUP BY department_diversity
-- The label gives a stable order when bucket sizes tie.
ORDER BY order_count DESC, department_diversity ASC
