In [1]:
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [2]:
trainDf = pd.read_csv("order_products__train.csv")
productDf = pd.read_csv("products.csv")

In [3]:
trainDf.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,1,49302,1,1
1,1,11109,2,1
2,1,10246,3,0
3,1,49683,4,0
4,1,43633,5,1


In [4]:
productDf.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


In [5]:
trainDf['reordered'] = 1
productCountDf = trainDf.groupby("product_id",as_index = False)["order_id"].count()
topLev = 100
productCountDf = productCountDf.sort_values("order_id",ascending = False)
topProdFrame = productCountDf.iloc[0:topLev,:]
topProdFrame = topProdFrame.merge(productDf,on = "product_id")
productId= topProdFrame.loc[:,["product_id"]]

In [6]:
df = trainDf[0:0]
for i in range(0,99):
    pId = productId.iloc[i]['product_id'] 
    stDf = trainDf[trainDf.product_id == pId ]
    df = df.append(stDf,ignore_index = False)
print(df.head())

     order_id  product_id  add_to_cart_order  reordered
115       226       24852                  2          1
156       473       24852                  2          1
196       878       24852                  2          1
272      1042       24852                  1          1
297      1139       24852                  1          1


In [7]:
basket = df.groupby(['order_id', 'product_id'])['reordered'].sum().unstack().reset_index().fillna(0).set_index('order_id')

In [8]:
def encode_units(x):
    if x <= 0:
        return 0
    if x >= 1:
        return 1

In [9]:
basket_sets = basket.applymap(encode_units)
print(basket_sets.head())

product_id  196    4605   4920   5025   5077   5450   5785   5876   8174   \
order_id                                                                    
1               0      0      0      0      0      0      0      0      0   
36              0      0      0      0      0      0      0      0      0   
38              0      0      0      0      0      0      0      0      0   
96              0      0      0      0      0      0      0      0      0   
98              0      0      0      0      0      0      0      0      0   

product_id  8193   ...  46667  46906  46979  47144  47209  47626  47766  \
order_id           ...                                                    
1               0  ...      0      0      0      0      1      0      0   
36              0  ...      0      0      1      0      0      0      0   
38              0  ...      0      0      0      0      0      0      0   
96              0  ...      0      0      0      0      0      0      0   
98        

In [19]:
frequent_itemsets = apriori(basket_sets, min_support=0.02, use_colnames=True)
print(frequent_itemsets)

     support                        itemsets
0   0.024507        (100% Whole Wheat Bread)
1   0.024017      (Apple Honeycrisp Organic)
2   0.041251                     (Asparagus)
3   0.165088        (Bag of Organic Bananas)
4   0.199706                        (Banana)
..       ...                             ...
75  0.023014           (Banana, Large Lemon)
76  0.023633       (Banana, Organic Avocado)
77  0.021329  (Banana, Organic Baby Spinach)
78  0.023185  (Banana, Organic Strawberries)
79  0.020775          (Strawberries, Banana)

[80 rows x 2 columns]


In [20]:
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
print(rules)
print(rules[(rules['lift'] >= 2) & (rules['confidence'] >= 0.1)])

                 antecedents               consequents  antecedent support  \
0   (Bag of Organic Bananas)    (Organic Baby Spinach)            0.165088   
1     (Organic Baby Spinach)  (Bag of Organic Bananas)            0.104343   
2   (Bag of Organic Bananas)    (Organic Hass Avocado)            0.165088   
3     (Organic Hass Avocado)  (Bag of Organic Bananas)            0.077777   
4   (Bag of Organic Bananas)    (Organic Strawberries)            0.165088   
5     (Organic Strawberries)  (Bag of Organic Bananas)            0.116180   
6                   (Banana)             (Large Lemon)            0.199706   
7              (Large Lemon)                  (Banana)            0.086757   
8                   (Banana)         (Organic Avocado)            0.199706   
9          (Organic Avocado)                  (Banana)            0.079014   
10                  (Banana)    (Organic Baby Spinach)            0.199706   
11    (Organic Baby Spinach)                  (Banana)          

In [12]:
newproductCountDf = productCountDf.merge(productDf, left_on='product_id', right_on='product_id', how='inner')
newDf = newproductCountDf[['product_id','product_name']]
print(newDf)

       product_id                              product_name
0           24852                                    Banana
1           13176                    Bag of Organic Bananas
2           21137                      Organic Strawberries
3           21903                      Organic Baby Spinach
4           47626                               Large Lemon
...           ...                                       ...
39118       33470  Velveeta Cheesy Skillets Chicken Alfredo
39119       19973          Organic Vegetarian Refried Beans
39120       19969      12 Hour Chest Congestion Expectorant
39121        9782               Bagged Cinnamon Mini Donuts
39122       24836            Cat Litter, Scoopable, Scented

[39123 rows x 2 columns]


In [13]:
df = df.merge(newDf, left_on='product_id', right_on='product_id', how='inner')
df = df[['order_id','add_to_cart_order','reordered','product_name']]
print(df)

        order_id  add_to_cart_order  reordered              product_name
0            226                  2          1                    Banana
1            473                  2          1                    Banana
2            878                  2          1                    Banana
3           1042                  1          1                    Banana
4           1139                  1          1                    Banana
...          ...                ...        ...                       ...
312871   3405263                  7          1  Organic Broccoli Florets
312872   3410603                  1          1  Organic Broccoli Florets
312873   3411504                  4          1  Organic Broccoli Florets
312874   3412303                  1          1  Organic Broccoli Florets
312875   3420894                 13          1  Organic Broccoli Florets

[312876 rows x 4 columns]


In [14]:
basket = df.groupby(['order_id', 'product_name'])['reordered'].sum().unstack().reset_index().fillna(0).set_index('order_id')

In [15]:
def encode_units(x):
    if x <= 0:
        return 0
    if x >= 1:
        return 1

In [16]:
basket_sets = basket.applymap(encode_units)
print(basket_sets.head())

product_name  100% Whole Wheat Bread  2% Reduced Fat Milk  \
order_id                                                    
1                                  0                    0   
36                                 0                    0   
38                                 0                    0   
96                                 0                    0   
98                                 0                    0   

product_name  Apple Honeycrisp Organic  Asparagus  Bag of Organic Bananas  \
order_id                                                                    
1                                    0          0                       1   
36                                   0          1                       0   
38                                   0          0                       0   
96                                   0          0                       0   
98                                   0          0                       1   

product_name  Banana  Blueberrie

In [17]:
frequent_itemsets = apriori(basket_sets, min_support=0.01, use_colnames=True)
print(frequent_itemsets)

      support                                      itemsets
0    0.024507                      (100% Whole Wheat Bread)
1    0.016424                         (2% Reduced Fat Milk)
2    0.024017                    (Apple Honeycrisp Organic)
3    0.041251                                   (Asparagus)
4    0.165088                      (Bag of Organic Bananas)
..        ...                                           ...
138  0.011006      (Organic Cucumber, Organic Strawberries)
139  0.010867   (Organic Hass Avocado, Organic Raspberries)
140  0.016413  (Organic Hass Avocado, Organic Strawberries)
141  0.017810   (Organic Raspberries, Organic Strawberries)
142  0.010174    (Organic Whole Milk, Organic Strawberries)

[143 rows x 2 columns]


In [18]:
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
# print(rules)
rules = rules[['antecedents','consequents','antecedent support','consequent support','support','confidence','lift']]
rules = rules[(rules['lift'] >= 2) & (rules['confidence'] >= 0.1)]
print(rules)

                               antecedents               consequents  \
6                 (Bag of Organic Bananas)    (Organic Hass Avocado)   
7                   (Organic Hass Avocado)  (Bag of Organic Bananas)   
9   (Organic Large Extra Fancy Fuji Apple)  (Bag of Organic Bananas)   
44                                 (Limes)             (Large Lemon)   
45                           (Large Lemon)                   (Limes)   
46                           (Large Lemon)         (Organic Avocado)   
47                       (Organic Avocado)             (Large Lemon)   
52                                 (Limes)         (Organic Avocado)   
53                       (Organic Avocado)                   (Limes)   
56                      (Organic Cilantro)                   (Limes)   
57                                 (Limes)        (Organic Cilantro)   
68                   (Organic Blueberries)    (Organic Strawberries)   
69                  (Organic Strawberries)     (Organic Blueberr

Code from mlxtend

In [None]:
def generate_new_combinations(old_combinations):
    items_types_in_previous_step = np.unique(old_combinations.flatten())
    for old_combination in old_combinations:
        max_combination = old_combination[-1]
        mask = items_types_in_previous_step > max_combination
        valid_items = items_types_in_previous_step[mask]
        old_tuple = tuple(old_combination)
        for item in valid_items:
            yield from old_tuple
            yield item

In [None]:
def generate_new_combinations_low_memory(old_combinations, X, min_support,
                                         is_sparse):

    items_types_in_previous_step = np.unique(old_combinations.flatten())
    rows_count = X.shape[0]
    threshold = min_support * rows_count
    for old_combination in old_combinations:
        max_combination = old_combination[-1]
        mask = items_types_in_previous_step > max_combination
        valid_items = items_types_in_previous_step[mask]
        old_tuple = tuple(old_combination)
        if is_sparse:
            mask_rows = X[:, old_tuple].toarray().all(axis=1)
            X_cols = X[:, valid_items].toarray()
            supports = X_cols[mask_rows].sum(axis=0)
        else:
            mask_rows = X[:, old_tuple].all(axis=1)
            supports = X[mask_rows][:, valid_items].sum(axis=0)
        valid_indices = (supports >= threshold).nonzero()[0]
        for index in valid_indices:
            yield supports[index]
            yield from old_tuple
            yield valid_items[index]

In [None]:
def apriori(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0,
            low_memory=False):

    def _support(_x, _n_rows, _is_sparse):
        out = (np.sum(_x, axis=0) / _n_rows)
        return np.array(out).reshape(-1)

    if min_support <= 0.:
        raise ValueError('`min_support` must be a positive '
                         'number within the interval `(0, 1]`. '
                         'Got %s.' % min_support)

    fpc.valid_input_check(df)

    if hasattr(df, "sparse"):
        if df.size == 0:
            X = df.values
        else:
            X = df.sparse.to_coo().tocsc()
        is_sparse = True
    else:
        X = df.values
        is_sparse = False
    support = _support(X, X.shape[0], is_sparse)
    ary_col_idx = np.arange(X.shape[1])
    support_dict = {1: support[support >= min_support]}
    itemset_dict = {1: ary_col_idx[support >= min_support].reshape(-1, 1)}
    max_itemset = 1
    rows_count = float(X.shape[0])

    all_ones = np.ones((int(rows_count), 1))

    while max_itemset and max_itemset < (max_len or float('inf')):
        next_max_itemset = max_itemset + 1
        if low_memory:
            combin = generate_new_combinations_low_memory(
                itemset_dict[max_itemset], X, min_support, is_sparse)
            combin = np.fromiter(combin, dtype=int)
            combin = combin.reshape(-1, next_max_itemset + 1)

            if combin.size == 0:
                break
            if verbose:
                print(
                    '\rProcessing %d combinations | Sampling itemset size %d' %
                    (combin.size, next_max_itemset), end="")

            itemset_dict[next_max_itemset] = combin[:, 1:]
            support_dict[next_max_itemset] = combin[:, 0].astype(float) \
                / rows_count
            max_itemset = next_max_itemset
        else:
            combin = generate_new_combinations(itemset_dict[max_itemset])
            combin = np.fromiter(combin, dtype=int)
            combin = combin.reshape(-1, next_max_itemset)

            if combin.size == 0:
                break
            if verbose:
                print(
                    '\rProcessing %d combinations | Sampling itemset size %d' %
                    (combin.size, next_max_itemset), end="")

            if is_sparse:
                _bools = X[:, combin[:, 0]] == all_ones
                for n in range(1, combin.shape[1]):
                    _bools = _bools & (X[:, combin[:, n]] == all_ones)
            else:
                _bools = np.all(X[:, combin], axis=2)

            support = _support(np.array(_bools), rows_count, is_sparse)
            _mask = (support >= min_support).reshape(-1)
            if any(_mask):
                itemset_dict[next_max_itemset] = np.array(combin[_mask])
                support_dict[next_max_itemset] = np.array(support[_mask])
                max_itemset = next_max_itemset
            else:
                # Exit condition
                break

    all_res = []
    for k in sorted(itemset_dict):
        support = pd.Series(support_dict[k])
        itemsets = pd.Series([frozenset(i) for i in itemset_dict[k]],
                             dtype='object')

        res = pd.concat((support, itemsets), axis=1)
        all_res.append(res)

    res_df = pd.concat(all_res)
    res_df.columns = ['support', 'itemsets']
    if use_colnames:
        mapping = {idx: item for idx, item in enumerate(df.columns)}
        res_df['itemsets'] = res_df['itemsets'].apply(lambda x: frozenset([
                                                      mapping[i] for i in x]))
    res_df = res_df.reset_index(drop=True)

    if verbose:
        print()

    return res_df