# Instacart Orders
The dataset is a relational set of files describing customers' orders over time.

In [1]:
from pyspark.sql import SparkSession, SQLContext
from pyspark import SparkContext, SparkConf
from pyspark.sql.functions import isnan, when, count, col, countDistinct

In [2]:
# Reuse the active Spark session if one exists; otherwise start a new one
# registered under this application name.
app_name = 'dataFrame'
spark = (
    SparkSession.builder
    .appName(app_name)
    .getOrCreate()
)

### Load data into Spark DataFrame
Two files, `order_products__prior.csv` and `orders.csv`, are not uploaded to GitHub because each exceeds the 100 MB file-size limit; they are read from a separate local folder below.

In [3]:
%%time
# Windows-style relative paths to the Instacart CSV files.
# Raw strings (r'...') keep every backslash literal. In a regular string the
# sequence '\a' (as in '\aisles.csv') is the BEL control character, which
# corrupts the path, and sequences such as '\p' or '\d' are invalid escapes.
aisles_path = r'..\pyspark-training\data\df-exercise-1\aisles.csv'
departments_path = r'..\pyspark-training\data\df-exercise-1\departments.csv'
order_products_prior_path = r'..\Module 6 - DataFrames and Spark SQL\case study 1 dataset\order_products__prior.csv'
order_products_train_path = r'..\pyspark-training\data\df-exercise-1\order_products__train.csv'
orders_path = r'..\Module 6 - DataFrames and Spark SQL\case study 1 dataset\orders.csv'
products_path = r'..\pyspark-training\data\df-exercise-1\products.csv'

# Every file has a header row; column types are inferred from the data
# (inferSchema makes Spark scan each file an extra time).
df_aisles = spark.read.csv(aisles_path, inferSchema=True, header=True)
df_departments = spark.read.csv(departments_path, inferSchema=True, header=True)
df_order_products_prior = spark.read.csv(order_products_prior_path, inferSchema=True, header=True)
df_order_products_train = spark.read.csv(order_products_train_path, inferSchema=True, header=True)
df_orders = spark.read.csv(orders_path, inferSchema=True, header=True)
df_products = spark.read.csv(products_path, inferSchema=True, header=True)

Wall time: 1min 19s


### Merge all data frames

In [4]:
%%time
# Expose each DataFrame as a temporary view so they can be joined in Spark SQL.
df_aisles.createOrReplaceTempView('aisles')
df_departments.createOrReplaceTempView('departments')
df_order_products_prior.createOrReplaceTempView('prior')
df_order_products_train.createOrReplaceTempView('train')
df_orders.createOrReplaceTempView('orders')
df_products.createOrReplaceTempView('products')

# Build one row per ordered product ("line item"):
#   1. attach the 'prior' line items to the orders with eval_set = 'prior',
#   2. attach the 'train' line items to the orders with eval_set = 'train',
#   3. stack the two sets and add product name, aisle and department.
# UNION ALL is used instead of UNION: the two branches are disjoint by
# eval_set, so a plain UNION would only add a full de-duplication pass over
# all rows (and would silently drop any genuinely repeated line item).
# Orders with any other eval_set value are not included.
df_all = spark.sql("""
    select o.*, p.product_name, a.aisle, dp.department
      from (select ord.order_id, ord.user_id, ord.order_hour_of_day as order_hour,
                   pr.product_id, pr.reordered
              from orders ord
              left join prior pr
                on ord.order_id = pr.order_id
             where ord.eval_set = 'prior'

            union all

            select ord.order_id, ord.user_id, ord.order_hour_of_day as order_hour,
                   tr.product_id, tr.reordered
              from orders ord
              left join train tr
                on ord.order_id = tr.order_id
             where ord.eval_set = 'train') o
      left join products p
        on o.product_id = p.product_id
      left join aisles a
        on p.aisle_id = a.aisle_id
      left join departments dp
        on p.department_id = dp.department_id
""").cache()  # mark for caching; the data is materialised by later actions
df_all.show(5)

+--------+-------+----------+----------+---------+--------------------+--------------------+------------+
|order_id|user_id|order_hour|product_id|reordered|        product_name|               aisle|  department|
+--------+-------+----------+----------+---------+--------------------+--------------------+------------+
|    2142|   2086|        16|     22823|        0|Roasted Bell Peppers|pickled goods olives|      pantry|
|    8638|  57582|        18|     18339|        0|    100% Lemon Juice|       juice nectars|   beverages|
|   19984| 122975|        10|     35752|        0|Hardwood SmokedCe...|hot dogs bacon sa...|meat seafood|
|   22346| 134817|        13|      4605|        0|       Yellow Onions|    fresh vegetables|     produce|
|   26623|  47715|        16|     32303|        0| Red Seedless Grapes|        fresh fruits|     produce|
+--------+-------+----------+----------+---------+--------------------+--------------------+------------+
only showing top 5 rows

Wall time: 4min 6s


In [6]:
# Total number of rows (ordered line items) in the merged DataFrame.
total_rows = df_all.count()
total_rows

33819106

### Check missing data

In [7]:
%%time
# Count missing values per column.
# NULL is checked for every column; NaN only exists for floating-point
# columns, so isnan() is applied to those alone. Calling isnan() on a string
# column forces a string-to-double cast, which can raise when ANSI mode is
# enabled and would miscount a literal "NaN" string as missing.
float_cols = {name for name, dtype in df_all.dtypes if dtype in ('float', 'double')}


def _missing_count(c):
    """Return an aggregate column counting the NULL (and, for floats, NaN) values of column `c`."""
    is_missing = col(c).isNull()
    if c in float_cols:
        is_missing = is_missing | isnan(c)
    # when() yields NULL for rows that are not missing, and count() skips NULLs.
    return count(when(is_missing, c)).alias(c)


df_all.select([_missing_count(c) for c in df_all.columns]).show()

+--------+-------+----------+----------+---------+------------+-----+----------+
|order_id|user_id|order_hour|product_id|reordered|product_name|aisle|department|
+--------+-------+----------+----------+---------+------------+-----+----------+
|       0|      0|         0|         0|        0|           0|    3|         3|
+--------+-------+----------+----------+---------+------------+-----+----------+

Wall time: 4min 5s


### List the most ordered products (top 10)

In [8]:
%%time
# Number of line items per product, most frequently ordered first.
product_cnt = df_all.groupBy('product_name').count()
product_cnt.orderBy(col('count').desc()).show(10)

+--------------------+------+
|        product_name| count|
+--------------------+------+
|              Banana|491291|
|Bag of Organic Ba...|394930|
|Organic Strawberries|275577|
|Organic Baby Spinach|251705|
|Organic Hass Avocado|220877|
|     Organic Avocado|184224|
|         Large Lemon|160792|
|        Strawberries|149445|
|               Limes|146660|
|  Organic Whole Milk|142813|
+--------------------+------+
only showing top 10 rows

Wall time: 1min 29s


### Do people usually reorder previously ordered products?


In [9]:
# Split the line items into reorders (1) and first-time purchases (0).
(
    df_all
    .groupBy('reordered')
    .agg(count('*').alias('count'))
    .show()
)

+---------+--------+
|reordered|   count|
+---------+--------+
|        1|19955360|
|        0|13863746|
+---------+--------+



People reorder previously purchased products more often than they order new ones: about 59% of all ordered items (19,955,360 of 33,819,106) are reorders.

### List most reordered products


In [10]:
# Keep only the reordered line items, then count them per product.
product_reordered_cnt = (
    df_all
    .where(col('reordered') == 1)
    .groupBy('product_name')
    .count()
)
product_reordered_cnt.sort('count', ascending=False).show(10)

+--------------------+------+
|        product_name| count|
+--------------------+------+
|              Banana|415166|
|Bag of Organic Ba...|329275|
|Organic Strawberries|214448|
|Organic Baby Spinach|194939|
|Organic Hass Avocado|176173|
|     Organic Avocado|140270|
|  Organic Whole Milk|118684|
|         Large Lemon|112178|
| Organic Raspberries|109688|
|        Strawberries|104588|
+--------------------+------+
only showing top 10 rows



### Most important department and aisle (by number of distinct products)


In [11]:
# Number of distinct product names ordered in each (aisle, department) pair,
# largest assortment first.
prod_by_dept_aisle = (
    df_all
    .groupBy('aisle', 'department')
    .agg(countDistinct('product_name').alias('cnt_product'))
)
prod_by_dept_aisle.orderBy(col('cnt_product').desc()).show(10)

+--------------------+-------------+-----------+
|               aisle|   department|cnt_product|
+--------------------+-------------+-----------+
|             missing|      missing|       1258|
|     candy chocolate|       snacks|       1246|
|       ice cream ice|       frozen|       1091|
|vitamins supplements|personal care|       1038|
|              yogurt|   dairy eggs|       1026|
|      chips pretzels|       snacks|        989|
|                 tea|    beverages|        894|
|     packaged cheese|   dairy eggs|        891|
|        frozen meals|       frozen|        880|
|       cookies cakes|       snacks|        874|
+--------------------+-------------+-----------+
only showing top 10 rows



### Get the top 10 departments


In [12]:
# Number of ordered line items per department, busiest department first.
dept_orders = df_all.groupBy('department').count()
dept_orders.sort('count', ascending=False).show(10)

+---------------+-------+
|     department|  count|
+---------------+-------+
|        produce|9888378|
|     dairy eggs|5631067|
|         snacks|3006412|
|      beverages|2804175|
|         frozen|2336858|
|         pantry|1956819|
|         bakery|1225181|
|   canned goods|1114857|
|           deli|1095540|
|dry goods pasta| 905340|
+---------------+-------+
only showing top 10 rows



### List top 10 products ordered in the morning (6AM to 10AM)


In [13]:
from pyspark.sql.types import IntegerType

In [14]:
# Add an explicitly integer-typed copy of the order hour to filter on.
df_all = df_all.withColumn('order_hour_int', df_all['order_hour'].cast(IntegerType()))

# Keep orders placed in hours 6 through 10 (between() includes both bounds),
# then count line items per product.
product_morning = (
    df_all
    .filter(df_all['order_hour_int'].between(6, 10))
    .groupBy('product_name')
    .count()
)
product_morning.orderBy(col('count').desc()).show(10)

+--------------------+------+
|        product_name| count|
+--------------------+------+
|              Banana|130034|
|Bag of Organic Ba...|103310|
|Organic Strawberries| 70885|
|Organic Baby Spinach| 61910|
|Organic Hass Avocado| 54537|
|     Organic Avocado| 44238|
|        Strawberries| 39794|
|         Large Lemon| 39366|
|  Organic Whole Milk| 38608|
| Organic Raspberries| 38489|
+--------------------+------+
only showing top 10 rows

