#### Data Reading and EDA 

#### Problem Statement

Analyze Instacart dataset and determine customer basket mix based on previous order history. 

In [68]:
# imports
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random

**Reading the Data**

In [69]:
# reading aisles dataset
aisles = pd.read_csv('data/aisles.csv')

In [70]:
# reading department dataset
department = pd.read_csv('data/departments.csv')

In [71]:
# reaading random 5% of order_prior dataset 
p = 0.05
order_prior = pd.read_csv('data/order_products__prior.csv', header = 0,  skiprows=lambda i: i>0 and random.random() > p)

In [72]:
# reading random 5% of the order dataset
p = 0.05
orders = pd.read_csv('data/orders.csv', header = 0,  skiprows=lambda i: i>0 and random.random() > p)

In [73]:
# reading in products data
products = pd.read_csv('data/products.csv')

In [74]:
order_train = pd.read_csv('data/order_products__train.csv')

In [75]:
aisles.head(2)

Unnamed: 0,aisle_id,aisle
0,1,prepared soups salads
1,2,specialty cheeses


In [76]:
department.head(2)

Unnamed: 0,department_id,department
0,1,frozen
1,2,other


In [77]:
order_prior.head(2)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,4,39758,3,1
1,4,34862,8,1


In [78]:
order_prior.shape

(1619550, 4)

In [80]:
orders.head(2)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,3194192,2,prior,7,2,12,14.0
1,788338,2,prior,8,1,15,27.0


In [81]:
orders.shape

(171845, 7)

In [82]:
order_train.shape

(1384617, 4)

In [83]:
products.head(2)

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13


In [84]:
order_train.head(2)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,1,49302,1,1
1,1,11109,2,1


In [85]:
order_prior.shape

(1619550, 4)

In [86]:
# check the orders dataset for null values
orders.isnull().sum()

order_id                      0
user_id                       0
eval_set                      0
order_number                  0
order_dow                     0
order_hour_of_day             0
days_since_prior_order    10383
dtype: int64

Missing values in the 'days since prior order' column mean that these customers did not make prior purchases.

In [87]:
orders.dropna(axis = 0, inplace = True) # delete rows with missing values

**Split Orders Dataframe into Prior, Train, and Test Dataframes**

In [88]:
orders['eval_set'].value_counts(normalize = True) # breakdown of orders dataframe

prior    0.934938
train    0.041075
test     0.023987
Name: eval_set, dtype: float64

In [89]:
# create filter to separate test orders
orders_test_filtered = orders['eval_set'] == 'test'

In [90]:
# apply filter to orders dataframe 
orders_test = orders[orders_test_filtered]

In [91]:
# save test orders to csv file for modeling 
orders_test.to_csv('data/orders_test.csv')

In [92]:
# create filter to separate prior orders
orders_filtered_prior = orders['eval_set'] == 'prior'

In [93]:
# apply filter to a dataframe
orders_prior_details = orders[orders_filtered_prior]

In [94]:
# create filter to separate train orders
orders_filtered_train = orders['eval_set'] == 'train' 

In [95]:
# apply filter to a dataframe
orders_train_details = orders[orders_filtered_train]

In [96]:
orders_train_details.shape

(6632, 7)

In [97]:
order_train.shape

(1384617, 4)

In [98]:
order_prior.shape

(1619550, 4)

In [99]:
orders_prior_details.shape

(150957, 7)

**Merge products and order_prior dataframes**

In [100]:
prod_orders_prior = pd.merge(products, order_prior,  on = 'product_id') # marge products and orders

In [101]:
prod_orders_prior.head(2)

Unnamed: 0,product_id,product_name,aisle_id,department_id,order_id,add_to_cart_order,reordered
0,1,Chocolate Sandwich Cookies,61,19,19479,5,0
1,1,Chocolate Sandwich Cookies,61,19,82631,2,1


In [102]:
prod_orders_prior.shape # check if all rows remain

(1619550, 7)

In [103]:
len(prod_orders_prior['order_id'].unique())

1174637

**Merge products and train, prior order dataframes**

In [105]:
prod_orders_train = pd.merge(products, order_train,  on = 'product_id') # merge products and train orders

In [106]:
prod_orders_train.head(2)

Unnamed: 0,product_id,product_name,aisle_id,department_id,order_id,add_to_cart_order,reordered
0,1,Chocolate Sandwich Cookies,61,19,6695,7,1
1,1,Chocolate Sandwich Cookies,61,19,48361,9,0


In [107]:
prod_orders_train.shape

(1384617, 7)

**Concat prod_orders_train and prod_order_prior dataframes**

In [108]:
orders_train_prior = pd.concat([prod_orders_train, prod_orders_prior], ignore_index = True, sort = False)

In [109]:
orders_train_prior.shape

(3004167, 7)

**Add quantity column**

In [110]:
# create quantity column reflecting number of products
orders_train_prior.loc[orders_train_prior['product_name'].notna(), 'quantity'] = 1

In [141]:
# lower case product names
orders_train_prior['product_name'] = orders_train_prior['product_name'].str.lower()

In [146]:
# take out commas in product names
orders_train_prior['product_name'] = orders_train_prior['product_name'].str.replace(",","")

**Merge prod_orders_prior, orders_prior_details dataframes**

In [112]:
all_orders_prior = pd.merge(prod_orders_prior, orders_prior_details, on = 'order_id')

In [113]:
all_orders_prior.head(2)

Unnamed: 0,product_id,product_name,aisle_id,department_id,order_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,1,Chocolate Sandwich Cookies,61,19,990267,3,0,168639,prior,3,4,13,30.0
1,1,Chocolate Sandwich Cookies,61,19,1582714,1,1,65325,prior,33,1,11,3.0


In [114]:
all_orders_prior.shape

(76013, 13)

In [139]:
all_orders_prior['product_name'] = all_orders_prior['product_name'].str.lower()

In [148]:
all_orders_prior['product_name'] = all_orders_prior['product_name'].str.replace(",","")

In [140]:
all_orders_prior.to_csv('data/all_orders_prior.csv', index = False)

**Merge prod_orders_train, prod_orders_prior and all orders dataframes**

In [116]:
all_orders_train = pd.merge(prod_orders_train, orders_train_details, on = 'order_id')

In [117]:
all_orders_train.head(2)

Unnamed: 0,product_id,product_name,aisle_id,department_id,order_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,1,Chocolate Sandwich Cookies,61,19,446766,16,1,126093,train,58,4,13,5.0
1,116,English Muffins,93,3,446766,8,1,126093,train,58,4,13,5.0


In [118]:
all_orders_train.shape

(69869, 13)

In [143]:
all_orders_train['product_name'] = all_orders_train['product_name'].str.lower()

In [149]:
all_orders_train['product_name'] = all_orders_train['product_name'].str.replace(",","")

In [152]:
all_orders_train.to_csv('data/all_orders_train.csv', index = False)

**Concat all train and prior orders**

In [120]:
all_orders = pd.concat([all_orders_train, all_orders_prior], ignore_index = True, sort = False)

In [121]:
all_orders.shape

(145882, 13)

**Add quantity column to all_orders dataframe**

In [122]:
# create quantity column reflecting number of products
all_orders.loc[all_orders['product_name'].notna(), 'quantity'] = 1

**Merge aisles, department, and all_orders dataframes**

In [123]:
all_orders = pd.merge(all_orders, aisles, on = 'aisle_id')

In [124]:
all_orders = pd.merge(all_orders, department, on = 'department_id')

In [132]:
all_orders['product_name'] = all_orders['product_name'].str.lower()

In [150]:
all_orders['product_name'] = all_orders['product_name'].str.replace(",","")

In [151]:
all_orders.to_csv('data/all_orders.csv', index = False)

In [145]:
all_orders.shape

(145882, 16)

**Morning Orders**

In [153]:
filter_hour_morning = all_orders['order_hour_of_day'] < 11
morning_orders = all_orders[filter_hour_morning]
morning_orders['product_name'] = morning_orders['product_name'].str.lower()
morning_orders['product_name'] = morning_orders['product_name'].str.replace(",","")
morning_orders.to_csv('data/morning_orders.csv', index = False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  morning_orders['product_name'] = morning_orders['product_name'].str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  morning_orders['product_name'] = morning_orders['product_name'].str.replace(",","")


**Daytime Orders**

In [154]:
filter_hour_day = all_orders['order_hour_of_day'] > 11
intermediate_orders = all_orders[filter_hour_day]
filter_day_2 = intermediate_orders['order_hour_of_day'] < 18
day_orders = intermediate_orders[filter_day_2]
day_orders['product_name'] = day_orders['product_name'].str.lower()
day_orders['product_name'] = day_orders['product_name'].str.replace(",","")
day_orders.to_csv('data/day_orders.csv', index = False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  day_orders['product_name'] = day_orders['product_name'].str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  day_orders['product_name'] = day_orders['product_name'].str.replace(",","")


**Evening Orders**

In [155]:
filter_hour_evening = all_orders['order_hour_of_day'] > 18
evening_orders = all_orders[filter_hour_evening]
evening_orders['product_name'] = evening_orders['product_name'].str.lower()
evening_orders['product_name'] = evening_orders['product_name'].str.replace(",","")
evening_orders.to_csv('data/evening_orders.csv', index = False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  evening_orders['product_name'] = evening_orders['product_name'].str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  evening_orders['product_name'] = evening_orders['product_name'].str.replace(",","")
