In [113]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from mlxtend.frequent_patterns import apriori, association_rules
import plotly.express as px
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/market-basket-optimisation/Market_Basket_Optimisation.csv


# Notebook Link

https://www.kaggle.com/aliessamali/unsupervised-association-rules-for-market-basket/

# Exploring the Data

In [79]:
df = pd.read_csv("/kaggle/input/market-basket-optimisation/Market_Basket_Optimisation.csv", header = None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
1,burgers,meatballs,eggs,,,,,,,,,,,,,,,,,
2,chutney,,,,,,,,,,,,,,,,,,,
3,turkey,avocado,,,,,,,,,,,,,,,,,,
4,mineral water,milk,energy bar,whole wheat rice,green tea,,,,,,,,,,,,,,,


The dataset consists of the items that are purchased together in each transaction in a Supermarket.

In [18]:
# Get unique values from the entire DataFrame
unique_values = df.values.flatten()
num_unique_values = len(set(unique_values))

print(f"Unique values in the entire DataFrame: {list(set(unique_values))}")
print(f"Number of unique values: {num_unique_values}")

Unique values in the entire DataFrame: ['tea', 'milk', 'green grapes', 'whole wheat pasta', 'french fries', 'ham', 'champagne', 'eggplant', 'cider', 'burgers', 'antioxydant juice', 'turkey', 'tomato sauce', 'magazines', 'vegetables mix', 'shallot', 'butter', 'corn', 'whole wheat rice', 'frozen vegetables', 'oatmeal', 'body spray', 'fresh tuna', 'muffins', 'zucchini', 'parmesan cheese', 'protein bar', 'pickles', 'almonds', 'extra dark chocolate', 'red wine', ' asparagus', 'mashed potato', 'mint', 'honey', 'light cream', 'low fat yogurt', 'spinach', 'energy drink', 'green tea', 'energy bar', 'salmon', 'cottage cheese', 'avocado', 'chili', 'oil', 'salad', 'pancakes', 'chocolate', 'cream', 'strong cheese', 'herb & pepper', 'soda', 'grated cheese', 'mint green tea', 'hand protein bar', 'frozen smoothie', 'cooking oil', 'ground beef', 'meatballs', 'melons', nan, 'candy bars', 'ketchup', 'dessert wine', 'napkins', 'blueberries', 'tomatoes', 'rice', 'pet food', 'white wine', 'pasta', 'strawber

So, there are a total of 121 unique items.

In [87]:
# Apply one-hot encoding
df_encoded = pd.get_dummies(df.stack(), dtype = bool).groupby(level=0).max()

# Display the one-hot encoded DataFrame
df_encoded.head()

Unnamed: 0,asparagus,almonds,antioxydant juice,asparagus.1,avocado,babies food,bacon,barbecue sauce,black tea,blueberries,...,turkey,vegetables mix,water spray,white wine,whole weat flour,whole wheat pasta,whole wheat rice,yams,yogurt cake,zucchini
0,False,True,True,False,True,False,False,False,False,False,...,False,True,False,False,True,False,False,True,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,True,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False


In [88]:
# Count the frequency of each item
item_freq = df_encoded.sum().sort_values(ascending=False)

# Create a bar plot using Plotly
fig = px.bar(x=item_freq.index, y=item_freq.values, labels={'x': 'Items', 'y': 'Frequency'}, title='Item Frequencies')
fig.update_xaxes(tickangle=-45)
fig.show()

In [89]:
# Create a bar plot of the top 10 only
fig = px.bar(x=item_freq.index[:10], y=item_freq.values[:10], labels={'x': 'Items', 'y': 'Frequency'}, title='Item Frequencies')
fig.update_xaxes(tickangle=-45)
fig.show()

Top 10 bought products

# Algorithm

In [95]:
# Apply Apriori algorithm
market_basket_frequent_items = apriori(df_encoded, min_support=0.02, use_colnames = True, verbose = 1, max_len=2)
market_basket_frequent_items['length'] = market_basket_frequent_items['itemsets'].apply(lambda x: len(x))
print(market_basket_frequent_items[market_basket_frequent_items.length==2])

Processing 2756 combinations | Sampling itemset size 2
      support                            itemsets  length
53   0.028796                     (eggs, burgers)       2
54   0.021997             (burgers, french fries)       2
55   0.024397            (burgers, mineral water)       2
56   0.021464                (spaghetti, burgers)       2
57   0.027463               (cake, mineral water)       2
58   0.022797            (chicken, mineral water)       2
59   0.033196                   (eggs, chocolate)       2
60   0.034395           (french fries, chocolate)       2
61   0.022930      (frozen vegetables, chocolate)       2
62   0.023464              (chocolate, green tea)       2
63   0.023064            (ground beef, chocolate)       2
64   0.032129                   (milk, chocolate)       2
65   0.052660          (chocolate, mineral water)       2
66   0.039195              (spaghetti, chocolate)       2
67   0.020131        (cooking oil, mineral water)       2
68   0.036395    

In [108]:
# Show associations with lift more than 1.5 in descending order
market_basket_rules = association_rules(market_basket_frequent_items, metric='lift', min_threshold=1.5)
market_basket_rules.drop_duplicates(['support'], inplace=True)
top_association_rules = market_basket_rules.sort_values(by='lift', ascending=False)
top_association_rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
18,(spaghetti),(ground beef),0.17411,0.098254,0.039195,0.225115,2.291162,0.022088,1.163716,0.682343
28,(spaghetti),(olive oil),0.17411,0.065858,0.02293,0.1317,1.999758,0.011464,1.075829,0.605334
26,(soup),(mineral water),0.050527,0.238368,0.023064,0.456464,1.914955,0.01102,1.401255,0.503221
8,(milk),(frozen vegetables),0.129583,0.095321,0.023597,0.182099,1.910382,0.011245,1.106099,0.54749
0,(eggs),(burgers),0.179709,0.087188,0.028796,0.160237,1.83783,0.013128,1.086988,0.555754
24,(mineral water),(olive oil),0.238368,0.065858,0.027596,0.115772,1.757904,0.011898,1.056449,0.566075
34,(spaghetti),(tomatoes),0.17411,0.068391,0.020931,0.120214,1.757755,0.009023,1.058905,0.521973
16,(ground beef),(mineral water),0.098254,0.238368,0.040928,0.416554,1.747522,0.017507,1.305401,0.474369
14,(milk),(ground beef),0.129583,0.098254,0.021997,0.169753,1.727704,0.009265,1.086118,0.483903
32,(spaghetti),(shrimp),0.17411,0.071457,0.021197,0.121746,1.70376,0.008756,1.05726,0.500143


In [124]:
# Convert frozenset objects to strings for plotting
top_association_rules['antecedents_str'] = top_association_rules['antecedents'].apply(lambda x: ', '.join(list(x)))
top_association_rules['consequents_str'] = top_association_rules['consequents'].apply(lambda x: ', '.join(list(x)))

# Combine antecedents and consequents for plotting
top_association_rules['items'] = top_association_rules['antecedents_str'] + ' -> ' + top_association_rules['consequents_str']

# Create a bar plot to visualize top association rules
fig = px.bar(top_association_rules, x='items', y='lift',
             labels={'x': 'Item Combination', 'y': 'Lift'},
             title=f'Top {len(top_association_rules)} Most Bought Together Item Combinations')
fig.update_xaxes(tickangle=-45)
fig.show()

Most Bought Together Item Combinations