In [None]:
import numpy as np
import pandas as pd

from os import path
import re

import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.colors
from matplotlib.colors import ListedColormap
import matplotlib.ticker as ticker

import missingno as mn

In [None]:
pd.reset_option('display.float_format')

my_cmap = ListedColormap(sns.color_palette().as_hex())

### Import Data

First, we will import all of our datasets into a single dictionary for convenience:

In [None]:
# Folder holding the raw CSV files, expressed as path components
fd = ['..','data','raw']

# Loaded tables, keyed by file name without the '.csv' extension
data = {}
fn_list = ['orders.csv', 'products.csv', 'order_products__prior.csv', 'order_products__train.csv', 'departments.csv', 'aisles.csv']

for fn in fn_list:
    fp = path.join(*fd, fn)

    # Raw string so that '\.' reaches the regex engine instead of being
    # parsed as an (invalid) string escape sequence.
    label = re.sub(r'\.csv$', '', fn)

    # The handle is already opened as UTF-8 text, so read_csv needs no encoding of its own
    with open(file=fp, mode='r', encoding='utf8') as file:
        data[label] = pd.read_csv(file)

Checking the size of each dataset:

In [None]:
# Report the dimensions and the total number of null cells of every loaded table
for name, frame in data.items():
    n_rows = frame.shape[0]
    n_columns = frame.shape[1]
    n_null = frame.isnull().sum().sum()
    print('{}: {} rows, {} columns; {} null values'.format(name, n_rows, n_columns, n_null))

High volume of data on the magnitude of millions for the order/product tables. Prior data has 32 million records, which may pose problems for performance during evaluation. Worth considering methods of reducing the dimensionality.

### Null Data

Will check to see why we are missing values in the orders table:

Moving on to the days since prior order column:

In [None]:
data['orders']['days_since_prior_order'].describe()

In [None]:
plt.figure(figsize=(10,5))

ax = sns.distplot(data['orders'].dropna()['days_since_prior_order'], )

ax.set_title('Distribution of Days Since Last Order')
ax.set_xlabel('Days Since Last Order')
ax.set_ylabel('Density')

plt.show()

We can see that the majority of re-orders happen in the 0-10 day range. Again, we have an unnatural peak at the maximum value of 30 days, suggesting that this value was clipped for larger values. It may be necessary to discount this disproportionate volume of orders at 30 days.

Taking a look at the possible changes in mean time intervals between orders and the number of orders placed:

In [None]:
plt.figure(figsize=(10,5))

# Aggregate once and reuse the results: every groupby pass scans the whole orders table
days_by_order_number = data['orders'].groupby('order_number')['days_since_prior_order']
mean_days = days_by_order_number.mean()
std_days = days_by_order_number.std()

ax = mean_days.plot(label='Mean')
# Shade one standard deviation either side of the mean
ax.fill_between(x = std_days.index.values,
                 y1 = mean_days - std_days,
                 y2 = mean_days + std_days,
                 alpha=0.2, label='St. Dev.')

ax.set_title('Mean Days Between Orders vs. Order Number')
ax.set_xlabel('Order Number')
ax.set_ylabel('Days Since Last Order')

ax.legend()

plt.show()

The decay in time intervals between orders as well as its standard deviation with increasing order number aligns with the notion that long term customers begin shopping on more regular intervals. This could prove useful in building recommenders specific to the number of orders & timing.

### Basic Exploration: Order Products (Priors/Train)

In [None]:
# Make a temporary copy for convenience
df = data['order_products__prior'].copy()

In [None]:
df.head()

In [None]:
# Number of orders
df['order_id'].nunique()

Taking a look at the number of items per order (i.e. basket size):

In [None]:
plt.figure(figsize=(10,5))

ax = data_prior.groupby(['order_number', 'user_id'])['aisle_id'].nunique().unstack().mean(axis=1).plot(cmap=my_cmap)

ax.set_title('Number of Aisles Shopped from vs. Order Number')
ax.set_xlabel('Order Number')
ax.set_ylabel('Mean Number of Aisles in Order')
ax.set_ylim(0,8)

plt.show()

We see a similar trend with aisles, noting again that whilst there is a decrease over increasing order number the change is relatively small. Perhaps a stronger change can be observed at the individual product level:

In [None]:
plt.figure(figsize=(10,5))

ax = data_prior.groupby(['order_number', 'user_id'])['product_id'].nunique().unstack().mean(axis=1).plot(cmap=my_cmap)

ax.set_title('Number of Unique Products in Order vs. Order Number')
ax.set_xlabel('Order Number')
ax.set_ylabel('Mean Number of Unique Products in Order')
ax.set_ylim(0,12)

plt.show()

We do see a similar weak downward trend, but there is a noticeable bump around the 40th order mark which could be interpreted as customers experimenting with new products before focusing back on a consistent set of purchases. All of these trends are relatively weak, however, so it is hard to draw any firm conclusions.

Looking at the reorder rates over time (i.e. order number) may provide a stronger impression:

In [None]:
plt.figure(figsize=(10,5))

ax = data_prior[data_prior['order_number'] > 1].groupby(['order_number', 'user_id'])['reordered'].mean().unstack().mean(axis=1).plot(cmap=my_cmap)

ax.set_title('Mean Reorder Rate vs. Order Number')
ax.set_xlabel('Order Number')
ax.set_ylabel('Mean Reorder Rate')

plt.show()

We can see that starting at around 20 orders user reorder rates stabilize in the 0.7-0.8 range, meaning 70-80% of items ordered are reorders. We do not know if these reorders are consistently the same items, but it still does show us the establishment of a consistent purchase habit beginning at 20 orders. In the context of a recommender system, it may prove useful to prioritize past ordered items more as a customer reaches this mark.

To help fill in some of the unknowns with consistent ordering habits, we will take a look at how the set of unique purchases develops as the customer places subsequent orders:

In [None]:
# Keep one row per (user, product) pair, count those rows per (order_number, user),
# pivot users into columns, then accumulate down the order-number axis.
# NOTE(review): drop_duplicates keeps the first row it meets, so a product is only
# attributed to the customer's earliest order if data_prior rows are already in
# order_number order for each user - confirm.
# bfill() fills the gaps in a customer's column with the next available cumulative value.
plot_data = data_prior.drop_duplicates(subset=['user_id', 'product_id']).groupby(['order_number','user_id'])['product_id'].count().unstack().cumsum().bfill()

# Average and spread across customers at each order number
mu = plot_data.mean(axis=1)
std = plot_data.std(axis=1)

plt.figure(figsize=(10,5))

ax = mu.plot()
# Shade one standard deviation either side of the mean
ax.fill_between(std.keys(), mu - std.values, mu + std.values, alpha=0.2)

ax.set_ylim(bottom=0)
ax.set_title('Cumulative Number of Unique Products Ordered vs. Order Number')
ax.set_xlabel('Order Number')
ax.set_ylabel('Mean Cumulative Count')

plt.show()

It is important to note that in removing duplicates for the cumulative count approach above, values had to be backfilled, so there is a certain amount of interpolation and smoothing which has been introduced into this view. Regardless, the overall trend should remain true, which we can see shows a gradual decrease in the rate at which new products are ordered. There is an odd peak right at the end of this curve, but this is likely an artifact of the limited amount of order data at the largest order numbers.

Moving on to distributions of orders over time of day/week, we can perform a few views utilizing violin plots to see if there are some noticeable variations in demand for different departments over the course of the week:

In [None]:
plt.figure(figsize=(15,7))
ax = sns.violinplot(data=data_prior.sample(int(1e5)), x='department', y='order_dow', cut=1, scale='area', bw=0.25)

ax.set_xticklabels(ax.get_xticklabels(), rotation=50)
ax.set_title('Distribution of Order Times by Department')
ax.set_xlabel('Department')
ax.set_ylabel('Day of Week')

ax.set_yticks(np.arange(0,7,1))
ax.set_yticklabels(['Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat'])

plt.show()

We can notice a few variations in peaks between different departments and days of the week. We see the previously observed increases in demand on Sunday and Monday across most departments, and some departments have unique peaks such as "bulk" on Wednesday. However, overall this plot is a bit difficult to read in terms of singling out strong variations, so perhaps separating out our weekdays from weekends and analyzing demand by hour will prove more useful:

Let us see how our non-ID numerical fields are distributed:

In [None]:
pd.options.display.float_format = '{:.2f}'.format
data['orders'].drop(['order_id', 'user_id'], axis=1).describe()

Worth noting some of the ranges/scales:
- order_number: initiates at 1 vs. 0
- order_dow: 0-6, unclear if on a Mon-Sun or Sun-Sat schedule.
- order_hour_of_day: 0-23 scale (military time)

Taking a more intuitive view of distributions:

In [None]:
plt.figure(figsize=(10,5))

# Countplot of orders per dow
ax = sns.countplot(x=data['orders']['order_dow'], palette='GnBu', edgecolor='k')

# Format plot
ax.set_title('Count of Orders by Day')
ax.set_xlabel('Day of Week')
ax.set_ylabel('Count')

plt.show()

It is still unknown as to whether this scale represents Mon-Sun or Sun-Sat, but perhaps future analysis of purchase behavior can help indicate which is more likely. Initial conjecture based off the counts above would be that the largest volume of purchases happen Sunday/Monday versus Monday/Tuesday, but we will ultimately need to see if other data supports this hypothesis.

In [None]:
plt.figure(figsize=(10,5))

# Countplot of orders per hour of day
ax = sns.countplot(x=data['orders']['order_hour_of_day'], palette='GnBu', edgecolor='k')

# Format plot
ax.set_title('Count of Orders by Hour of Day')
ax.set_xlabel('Hour (0-24)')
ax.set_ylabel('Count')

plt.show()

Two slight humps in the late morning and afternoon, but as a whole the majority of orders are focused around the 9AM-5PM interval.

Taking a look at the same distribution split by day:

In [None]:
fig, axs = plt.subplots(4,2, figsize=[12,12], sharey=True)
panels = axs.flatten()

# One panel per day of week, showing that day's order counts by hour
for dow, ax in zip(range(7), panels):
    is_day = data['orders']['order_dow'] == dow
    hours = data['orders'][is_day]['order_hour_of_day']
    sns.countplot(x=hours, palette='GnBu', edgecolor='k', ax=ax)
    ax.set_title('Day = {}'.format(dow))

# The 4x2 grid has eight panels but there are only seven days: drop the spare one
fig.delaxes(panels[-1])
plt.tight_layout()
plt.show()

Days 0 and 6 appear to share a somewhat similar pattern of a single hump in orders towards the afternoon, whereas all other days show the double maximums in the morning/afternoon observed earlier. This could further suggest that day 0 is Sunday (and 6 is Saturday).

Moving on to orders and users:

In [None]:
# Basket size (number of product rows) of every order; computed once and reused below
basket_sizes = df['order_id'].value_counts()

# Number of basket sizes above 20 (vectorised reduction instead of Python's builtin sum)
n_above = (basket_sizes > 20).sum()
print(n_above)

# % orders with basket sizes above 20 (basket_sizes has one entry per distinct order)
print(n_above/len(basket_sizes) * 100)

Fortunately this seems to be limited to just 20 out of our 3+ million orders. Extending our search to basket sizes above 50 yields just over 3,000 orders, which still only comprises 0.1% of our data. Once we reach basket sizes of 20, however, we start seeing a considerable portion of data (approx. 10%). It may be worth keeping the potential for large basket sizes in consideration, particularly when working with association rules.

It is possible that these large basket sizes are a result of certain products being ordered in large quantities (e.g. 10-20 of the same item being added vs 10-20 unique products):

In [None]:
# Count of (order, product) pairs that appear more than once.
# The comparison yields one boolean per pair, so reduce it with the vectorised
# Series.sum() rather than iterating it with Python's builtin sum.
(df.groupby(['order_id','product_id'])['product_id'].count() > 1).sum()

No orders exist in which the same product is accounted for more than once, indicating that the quantity of an item ordered is not captured.

Moving on to the products purchased:

In [None]:
# Number of unique products present in orders
df['product_id'].nunique()

In [None]:
# Purchase counts per individual product
df['product_id'].value_counts().describe()

In [None]:
# Number of products only purchased once
sum(df['product_id'].value_counts() == 1)

The majority of products have been purchased less than 100 times across our 3 million orders. Additionally, we have 131 products which have only been purchased once. Making recommendations for products with a limited history of purchase means relying solely on popularity of items will lead to some biased results.

We also have some large outliers, with a maximum purchase count of 470k:

In [None]:
# Products with over 100k purchases
sum(df['product_id'].value_counts() > 100000)

15 products have been purchased over 100,000 times. We will need to look into what these extremely popular products are once we join this table with the product names.

### Basic Exploration: Products

In [None]:
df = data['products'].copy()

In [None]:
# Total number of unique products
df.shape[0]

Compared to our order table, we can see that almost every product has at least 1 purchase with the exception of 11 items (49677 purchased vs. 49688 on record). Taking a look at what we are missing:

In [None]:
# Number of prior order lines recorded against each product, as a one-column frame
product_order_counts = (
    data['order_products__prior']
    .groupby('product_id')['order_id']
    .count()
    .rename('order_count')
    .to_frame()
)

# Left join keeps every catalogue product; fillna(0) zero-fills the missing values left by the join
temp = pd.merge(
    data['products'],
    product_order_counts,
    left_on='product_id',
    right_index=True,
    how='left',
).fillna(0)

Taking a look at the items which were never ordered:

In [None]:
temp[temp['order_count'] == 0]

Nothing immediately discernible from these items, but we can guess that these are likely niche products. As for the most popular products identified earlier with order counts on the magnitude of 100k:

In [None]:
temp.sort_values(by='order_count', ascending=False).head(20)

Here we see that produce (fruits/vegetables) is clearly dominating in order counts. We have not taken a focused look at aisle/department counts just yet, but it is already evident that department 4 and aisle 24 are likely to be the most popular and recurring.

Looking through some of the product names listed above, it is evident that we have a lot of similar items with slight variations (e.g. bananas vs. bag of organic bananas). If we explore some products sharing the same words/naming:

Whilst we have not looked at the aisles just yet, this cleaning effort may also be further complicated if aisle information is also missing. Looking at records missing department and (potentially) not missing aisle:

In [None]:
# aisle ID for missing
data['aisles'][data['aisles']['aisle'] == 'missing']

In [None]:
# Number of missing departments not missing aisle
sum((temp['department'] == 'missing') & (temp['aisle_id'] != 100))

Every instance of missing department is also missing aisle, so this will at least double the effort required in trying to interpret and assign both correct department and aisles to these items.

### Basic Exploration: Aisles

In [None]:
df = data['aisles'].copy()

In [None]:
df.shape

We have 134 aisles to work with. Performing a similar analysis of order distribution across aisles:

In [None]:
temp = pd.merge(data['products'], data['departments'], on='department_id').merge(data['aisles'], on='aisle_id')

In [None]:
depts = temp['department'].unique()
depts.sort()

n_cols = 3
# Ceiling division (never fewer than one row) so that a department count which is
# not a multiple of n_cols still gets an axis for every department.
n_rows = max(1, -(-len(depts) // n_cols))
# squeeze=False keeps axs two-dimensional whatever the grid shape
fig, axs = plt.subplots(n_rows, n_cols, figsize=(5*n_cols,10*n_cols), squeeze=False)
axes = axs.flatten()

# One bar chart per department: number of rows of temp falling in each of its aisles
for i, dept in enumerate(depts):
    mask = temp['department'] == dept
    ax = axes[i]
    temp[mask]['aisle'].value_counts().plot.bar(ax=ax)
    ax.set_title('Product Counts: {}'.format(dept.title()))
    ax.set_xlabel('Aisle')
    ax.set_ylabel('Number of Products')

# Remove the unused axes left over in the final row, if any
for spare in axes[len(depts):]:
    fig.delaxes(spare)

plt.tight_layout()
plt.show()

We will refrain from exploring the details of each aisle in too much depth, but we can at least see that within any department there are typically 1-3 aisles with the largest proportion of products. Repeating the above view with the number of products ordered within each aisle:

In [None]:
# Number of basket sizes above 100 (vectorised reduction instead of Python's builtin sum)
(df['order_id'].value_counts() > 100).sum()

In [None]:
# Basket size (number of product rows) of every order; computed once and reused below
basket_sizes = df['order_id'].value_counts()

# Number of basket sizes above 50 (vectorised reduction instead of Python's builtin sum)
n_above = (basket_sizes > 50).sum()
print(n_above)

# % orders with basket sizes above 50 (basket_sizes has one entry per distinct order)
print(n_above/len(basket_sizes) * 100)

Out of curiosity we will see how many reorders we have:

In [None]:
df['reordered'].value_counts() / df.shape[0]

Approximately 59% of the ordered items are reorders. This is a good initial indicator that building recommendations of previously ordered items may prove successful.

In [None]:
data['orders']['user_id'].nunique()

As mentioned on the Kaggle page, we are working with just over 200,000 users. Looking at the number of orders each user has on record:

In [None]:
data['orders']['user_id'].value_counts().describe()

In [None]:
plt.figure(figsize=(12,5))

# Distribution of orders per user, default bins = 20
ax = sns.distplot(data['orders']['user_id'].value_counts(), kde=False)

ax.set_title('Distribution of Order Counts per User')
ax.set_xlabel('Number of Orders')
ax.set_ylabel('Count')

plt.show()

We can see that the number of orders strongly gravitates toward the 4-10 range, with 50% of users having placed 10 orders or less. It should also be noted that order numbers appear to be capped at 100 due to the unnatural peak in volume for the maximum value of 100 orders.

Fortunately, with a minimum order count of 4 per user we have at least a basic guarantee of having some information for each user. However, the analysis above is working with the entire 3 datasets so we need to ensure our training (prior) dataset also has enough information per user:

In [None]:
df[df['product_name'].str.contains('[Bb]anana')]

In the case of 'Banana', we have 376 products with the word banana. This is not to say that all of these products are similar or identical - for instance we have banana twin cakes versus banana peppers, which are two rather disparate items. However, there are a few instances where there are minor variations of the same concept, or at the very least we see banana prevalent as a flavoring throughout these items. It may be beneficial to explore utilizing common word features to identify favorite tastes, for example in this context finding users who show a strong liking toward banana & banana flavored items to then recommend other items of similar or compatible flavors.

### Basic Exploration: Department

In [None]:
df = data['departments'].copy()

Departments appear to be a broad categorization of products, with 21 possibilities. Let us first take a look at how many products we have in each department:

In [None]:
temp = pd.merge(data['products'], data['departments'], on='department_id')

In [None]:
plt.figure(figsize=(10,5))

ax = temp.groupby('department')['product_id'].nunique().sort_values(ascending=False).plot.bar()

ax.set_title('Number of Products per Department')
ax.set_xlabel('Department')
ax.set_ylabel('Number of Products')

plt.show()

Next let us join this table with our orders to see how purchases are distributed:

In [None]:
temp = pd.merge(data['order_products__prior'], data['products'], on='product_id').merge(data['departments'], on='department_id')

In [None]:
temp.head()

In [None]:
plt.figure(figsize=(10,5))

ax = temp['department'].value_counts().sort_values(ascending=False).plot.bar()

ax.set_title('Number of Products Ordered per Department')
ax.set_xlabel('Department')
ax.set_ylabel('Count')

plt.show()

In [None]:
# Fraction of ordered items belonging to produce or dairy/eggs.
# The mean of the boolean membership mask is the matching share of rows; it stays
# vectorised instead of iterating the whole column with Python's builtin sum.
temp['department'].isin(['produce', 'dairy eggs']).mean()

Produce and dairy/eggs are clearly the dominating departments, making up 46% of ordered items. Compared to our original product counts per department, we can see that whilst the most products available are in personal care, they actually constitute a small fraction of purchases.

We can try to capture a rough essence of popularity by weighing these purchase counts against the proportion of products available:

In [None]:
plt.figure(figsize=(10,5))

# Plot product order counts per department as fraction of total products available in said department
ax = (temp['department'].value_counts().sort_index() / temp.groupby('department')['product_id'].nunique().sort_index()).sort_values(ascending=False).plot.bar()

ax.set_title('Weighted Department Popularity')
ax.set_xlabel('Department')
ax.set_ylabel('Relative Popularity')

plt.show()

Whilst this view does not necessarily yield much insight with regards to our recommender system approach, it is interesting from a business perspective as to the approximate value/efficiency of stocking certain items. For example, even with the limited selection of produce relative to the number of personal care products, the amount of purchases and visibility of produce items greatly exceeds that of personal care items.

One concern worth noting before proceeding to aisle exploration is the "missing" department. Taking a look at some of the items in this department:

In [None]:
data['orders'].isnull().sum()

In [None]:
data['orders'].head(20)

In [None]:
data['orders'][data['orders']['order_number'] != 1].isnull().sum()

Null values are evidently a result of no prior orders existing. This is not something that necessarily needs to be fixed, but should be kept in mind during analysis.

### Basic Exploration: Orders

Volume of data:

In [None]:
# Number of 'prior' orders placed by each customer
N = data['orders'][data['orders']['eval_set'] == 'prior'].groupby(['user_id'])['order_id'].count()
for i in range(3,11):
    # Customers with at least i prior orders (vectorised reduction instead of builtin sum)
    n = (N >= i).sum()
    print('Number of customers with >= {} orders: {} ({:.1f}%)'.format(i, n, n / N.shape[0] * 100))

We can see that at a cut-off of 10 or more orders we are only left with just under 50% of our original set of customers. Considering our large dataset size, however, it may be worthwhile for the sake of performance to limit ourselves to such a minimum. At the very least, the minimum of 5 orders still leaves us with almost 80% of customers to work with. Translating these possibilities to our total number of orders to be processed:

In [None]:
# Everything below is independent of the threshold, so compute it once outside the loop
prior_orders = data['orders'][data['orders']['eval_set'] == 'prior']
user_orders = prior_orders.groupby(['user_id'])
orders_per_user = user_orders['order_id'].count()
n_prior_orders = prior_orders.shape[0]

for i in range(3,11):
    # Customers with at least i prior orders
    users = orders_per_user[orders_per_user >= i]
    # Prior orders placed by those customers
    n_orders = prior_orders['user_id'].isin(users.keys()).sum()
    print('Number of orders using customers with >= {} orders: {} ({:.1f}%)'.format(i, n_orders, n_orders / n_prior_orders * 100))

Whilst a threshold of 10 orders resulted in losing half the customer base, we can see we still utilize over 80% of the orders on record. This somewhat mitigates concerns with potentially choosing such a threshold to reduce the volume of data we are dealing with and improve information available per customer, but at thresholds of 10 orders (and higher) it is still hard to justify losing half of the customer base in evaluation. The ultimate decision as to how much data is excluded will likely come down to performance constraints when modelling.

In [None]:
# 90th percentile for number of purchases per product
df['product_id'].value_counts().quantile(0.9)

As a whole, approximately 90% of products have been purchased 1,000 times or less. Relative to our dataset of 3 million orders, we will need to see how this sparsity translates to our recommendation capabilities. Simplifying products into broader categories may prove necessary.

In [None]:
# Number of products in 'missing' department
temp[temp['department']=='missing']['product_name'].nunique()

In [None]:
# Sample of 'missing' department products
temp[temp['department']=='missing']['product_name'].unique()[:20]

These 'missing' department items do not seem to follow any consistent theme, with a variety of foods and beverages we would expect to find in some of our other department labels. It may be necessary to discard these items from our recommender, at least in scenarios where department is being leveraged in decisioning. Alternatively, we can attempt to assign appropriate labels based off our understanding of department definitions, but this is likely to be a highly manual process involving the 1255 records. We also do not know if Instacart has intentionally marked these items as missing due to discontinuation of products or other unique circumstances.

In [None]:
temp = pd.merge(data['order_products__prior'], data['products'], on='product_id').merge(data['departments'], on='department_id').merge(data['aisles'], on='aisle_id')

In [None]:
depts = temp['department'].unique()
depts.sort()

n_cols = 3
# Ceiling division (never fewer than one row) so that a department count which is
# not a multiple of n_cols still gets an axis for every department.
n_rows = max(1, -(-len(depts) // n_cols))
# squeeze=False keeps axs two-dimensional whatever the grid shape
fig, axs = plt.subplots(n_rows, n_cols, figsize=(5*n_cols,10*n_cols), squeeze=False)
axes = axs.flatten()

# One bar chart per department: number of rows of temp falling in each of its aisles
for i, dept in enumerate(depts):
    mask = temp['department'] == dept
    ax = axes[i]
    (temp[mask]['aisle'].value_counts()).sort_values(ascending=False).plot.bar(ax=ax)
    ax.set_title('Order Counts: {}'.format(dept.title()))
    ax.set_xlabel('Aisle')
    ax.set_ylabel('Number of Products Ordered')

# Remove the unused axes left over in the final row, if any
for spare in axes[len(depts):]:
    fig.delaxes(spare)

plt.tight_layout()
plt.show()

As with our departments, the most populated aisles do not necessarily correspond to the most purchased. For example, we can see that sparkling seltzer water is much more successful in terms of purchase quantity versus the number of products available.

### Full Data Exploration

Build a dataframe with all 'prior' orders & products:

In [None]:
df['order_id'].value_counts().describe()

In [None]:
plt.figure(figsize=(10,5))

# Basket size = number of product rows recorded against each order
basket_sizes = df['order_id'].value_counts()
# countplot draws one bar per distinct size at positions 0, 1, 2, ... in this order
size_order = np.sort(basket_sizes.unique())

# Pass the values as x=, matching the other countplot calls in this notebook
ax = sns.countplot(x=basket_sizes, order=size_order)

# Bars sit at categorical positions rather than at the basket size itself, so each
# size has to be translated to its bar position before it is used for ticks/limits.
tick_sizes = np.arange(5,60,5)
tick_sizes = tick_sizes[np.isin(tick_sizes, size_order)]  # only label sizes that have a bar
ax.set_xticks(np.searchsorted(size_order, tick_sizes))
ax.set_xticklabels(tick_sizes)
# Cut the axis just after the bar for a basket size of 60
ax.set_xlim(right=np.searchsorted(size_order, 60) + 0.5)

ax.set_title('Distribution of Basket Size')
ax.set_xlabel('Basket Size')
ax.set_ylabel('Count')

plt.show()

Majority of orders fall in the range of 1-10 items per order. It is also worth noting a select few basket sizes reach up to 100+ items (cut off from the graph above):

In [None]:
data['orders'].shape

In [None]:
data['orders']['eval_set'].value_counts()

In [None]:
data['orders']['eval_set'].value_counts()/data['orders'].shape[0]

Prior and train translate to training and validation datasets, respectively. Models will be fit using the prior set, and optimized according to performance on the train set. Test is the ultimate testing set for final performance evaluation of our recommendations.

In [None]:
data['orders'].head()

Columns are described as follows:

- order_id: unique ID for the order
- user_id: unique ID for the user
- eval_set: prior/train/test sets, as mentioned above
- order_number: sequential order number for a given user (i.e. 1st order, 2nd order, etc.)
- order_dow: day of the week on which order was placed
- order_hour_of_day: hour of the day in which order was placed
- days_since_prior_order: days since last order was placed. Value is NaN if first order (no prior order)

In [None]:
data['orders'][data['orders']['eval_set'] == 'prior']['user_id'].value_counts().describe()

Our minimum number of orders per user in working with our training dataset is 3. In an ideal world we would want 10-20 observations per user, but we at least have more than 1 record per user (which in itself may contain multiple items ordered) to work with in producing recommendations, somewhat mitigating the cold start problem.

Exploring how much of our customer base is available for each level of order quantities:

In [None]:
data_prior['product_id'].value_counts().describe()

In [None]:
npp = data_prior['product_id'].nunique()

print('{}/{} products purchased (coverage = {:.2f}%)'.format(npp,data['products'].shape[0], 100*npp/data['products'].shape[0]))

In [None]:
# Purchase count of every product that appears in at least one prior order (computed once)
pvc = data_prior['product_id'].value_counts()

# Catalogue products with no prior purchase have no entry in pvc, so derive their
# number from the catalogue size instead of hard-coding it.
n_products = data['products'].shape[0]
n_never_purchased = n_products - pvc.shape[0]

for i in range(1,11):
    # Products bought at most i times, including those never bought at all
    n = (pvc <= i).sum() + n_never_purchased
    print('Number of products with <= {} purchases: {} ({:.2f}%)'.format(i, n, 100*n/n_products))

Fortunately, only 11 products out of the 49k have never been purchased. However, we do have a number of products with a relatively sparse purchase history, with over 8000 products having only been purchased 10 or fewer times. With respect to the overall set of products available this amounts to 16% of the product inventory having very limited interactions, which will prove challenging when aiming to recommend such products.

Taking a closer look at the distribution in customer purchases:

In [None]:
# Number of distinct products each customer has bought, and a mask of the values
# lying below the 99th percentile of that distribution
plot_data = data_prior.groupby('user_id')['product_id'].nunique()
q_mask = plot_data < plot_data.quantile(.99)

fig, axs = plt.subplots(1,2, figsize=(12,5))

# Left panel: every customer; right panel: only the values selected by q_mask
panels = [(plot_data, 'All Data'), (plot_data[q_mask], '99th Percentile')]

for ax, (values, title) in zip(axs, panels):
    sns.distplot(values, ax=ax)
    ax.set_xlabel('Number of Unique Products Purchased')
    ax.set_ylabel('Density')
    ax.set_title(title)

fig.suptitle('Distribution of Unique Product Purchases per Customer')

plt.show()

In [None]:
data_prior.groupby('user_id')['product_id'].nunique().describe()

In terms of unique products purchased, we seem to peak in the range of 20-30 products purchased, with the major concentration of customers falling in the 25-85 range. In other words, we rarely see customers whose buying habits extend beyond a core set of products in this range. This can be interpreted in two ways: on the one hand customers may be habitual in their purchase habits and prefer sticking to a core set of products they are familiar with (adding challenge to effective recommendations), but alternatively one can also argue that there is great potential to introduce customers to new products and increase coverage assuming they are willing to step outside their comfort zone.

Extending this view to a level higher with aisles:

In [None]:
plt.figure(figsize=(10,5))

ax = sns.distplot(data_prior.groupby('user_id')['aisle_id'].nunique())

ax.set_title('Distribution of Aisle Purchases per Customer')
ax.set_xlabel('Number of Aisles Purchased From')
ax.set_ylabel('Density')

plt.show()

In [None]:
data_prior.groupby('user_id')['aisle_id'].nunique().describe()

In [None]:
data['aisles']['aisle_id'].nunique()

We again see a majority of customers limiting themselves to a selection of aisles from which they purchase in the 10-40 range. Relative to the total number of aisles (134), this represents a much more diverse range which is to be expected given we are dealing with fewer options. It can still be argued, however, that customers seem to favor a certain selection of aisles in their shopping habits and are less likely to expand or explore all options.

Repeating this view with distributions across departments:

In [None]:
plt.figure(figsize=(10,5))

# Plot as bar chart instead since we have a more limited list
ax = data_prior.groupby('user_id')['department_id'].nunique().value_counts().sort_index().plot.bar(cmap=my_cmap)

ax.set_title('Distribution of Department Purchases per Customer')
ax.set_xlabel('Number of Departments Purchased From')
ax.set_ylabel('Frequency')

plt.show()

With departments we see a significant rightward shift in the distribution. This indicates that whilst for individual products and aisles customers tend to limit themselves to a smaller subset, the variety in departments from which customers shop tends to be more encompassing of all available departments.

We can continue this analysis of shopping variety/coverage in looking at how purchase variety changes over time (i.e. order number):

In [None]:
plt.figure(figsize=(10,5))

ax = data_prior.groupby(['order_number', 'user_id'])['department_id'].nunique().unstack().mean(axis=1).plot(cmap=my_cmap)

ax.set_title('Number of Departments Shopped from vs. Order Number')
ax.set_xlabel('Order Number')
ax.set_ylabel('Mean Number of Departments in Order')
ax.set_ylim(0,5)

plt.show()

We see a gradual, albeit relatively small, decrease in the mean number of departments in orders with increasing order numbers, which might suggest that customers hone in on their preferences and "go-to's" over time. Repeating this view for aisles:

In [None]:
# Product catalogue joined to its department and aisle tables
product_details = (
    data['products']
    .merge(data['departments'], on='department_id')
    .merge(data['aisles'], on='aisle_id')
)

# Join orders to their prior order lines, then attach the product details to each line
data_prior = (
    pd.merge(data['orders'], data['order_products__prior'], on='order_id')
    .merge(product_details, on='product_id')
)

In [None]:
data_prior.shape

In [None]:
data_prior.head()

In [None]:
# Number of prior order lines per product, and a mask of the values lying below
# the 90th percentile of that distribution
plot_data = data_prior['product_id'].value_counts()
q_mask = plot_data < plot_data.quantile(.90)

fig, axs = plt.subplots(1,2, figsize=(12,5))

# Left panel: every product; right panel: only the values selected by q_mask
panels = [(plot_data, 'All Data'), (plot_data[q_mask], '90th Percentile')]

for ax, (values, title) in zip(axs, panels):
    sns.distplot(values, ax=ax)
    ax.set_xlabel('Number of Times Product was Purchased')
    ax.set_ylabel('Density')
    ax.set_title(title)

fig.suptitle('Distribution of Product Purchase Counts')

plt.show()