In [92]:
import pandas as pd

In [93]:
# Read the raw grocery purchase records from the CSV file into a DataFrame
dataset_path = "Groceries_dataset.csv"
df = pd.read_csv(dataset_path)

In [94]:
# First look at the data: a header line followed by the first five records
print("5 first rows:", df.head(), sep="\n")

5 first rows:
   Member_number        Date   itemDescription
0           1808  21-07-2015    tropical fruit
1           2552  05-01-2015        whole milk
2           2300  19-09-2015         pip fruit
3           1187  12-12-2015  other vegetables
4           3037  01-02-2015        whole milk


In [95]:
# Overall information: column names, non-null counts, dtypes and memory usage
print("\n information about dataset:")
# DataFrame.info() writes its report to stdout itself and returns None;
# wrapping it in print() only appended a stray "None" line after the report.
df.info()


 information about dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38765 entries, 0 to 38764
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Member_number    38765 non-null  int64 
 1   Date             38765 non-null  object
 2   itemDescription  38765 non-null  object
dtypes: int64(1), object(2)
memory usage: 908.7+ KB
None


In [96]:
# Summary statistics for every column (numeric and non-numeric alike)
print(" Statistics:", df.describe(include="all"), sep="\n")

 Statistics:
        Member_number        Date itemDescription
count    38765.000000       38765           38765
unique            NaN         728             167
top               NaN  21-01-2015      whole milk
freq              NaN          96            2502
mean      3003.641868         NaN             NaN
std       1153.611031         NaN             NaN
min       1000.000000         NaN             NaN
25%       2002.000000         NaN             NaN
50%       3005.000000         NaN             NaN
75%       4007.000000         NaN             NaN
max       5000.000000         NaN             NaN


In [97]:
# Count the missing (null) values in each column
print("\n🔹 Amount of empty spaces::", df.isna().sum(), sep="\n")


🔹 Amount of empty spaces::
Member_number      0
Date               0
itemDescription    0
dtype: int64


In [98]:
# Build one row per purchase, where a purchase is a unique (member, date) pair;
# every item bought in that purchase is collected into a single Python list.
purchase_keys = ['Member_number', 'Date']
transactions = (
    df.groupby(purchase_keys)['itemDescription']
    .apply(list)
    .reset_index()
)

In [99]:
# Rename the item-list column to "Basket" (no column is added, only renamed).
# The result is reassigned instead of using inplace=True, so the frame built in
# the previous cell is not mutated behind the reader's back.
transactions = transactions.rename(columns={'itemDescription': 'Basket'})

In [100]:
# Show the first five purchases together with their item lists
print("Baskets:", transactions.head(), sep="\n")

Baskets:
   Member_number        Date  \
0           1000  15-03-2015   
1           1000  24-06-2014   
2           1000  24-07-2015   
3           1000  25-11-2015   
4           1000  27-05-2015   

                                              Basket  
0  [sausage, whole milk, semi-finished bread, yog...  
1                  [whole milk, pastry, salty snack]  
2                     [canned beer, misc. beverages]  
3                        [sausage, hygiene articles]  
4                         [soda, pickled vegetables]  


In [101]:
from mlxtend.preprocessing import TransactionEncoder

In [102]:
# Pull the "Basket" column out as a plain Python list (one item list per purchase)
basket_column = transactions['Basket']
basket_list = basket_column.to_list()

In [103]:
# Create the encoder, fit it on the baskets, then transform the same baskets
# into the array that the next cell wraps in a DataFrame.
te = TransactionEncoder()
te.fit(basket_list)
te_array = te.transform(basket_list)

In [104]:
# Wrap the encoded array in a DataFrame, labelling columns with the encoder's item names
item_names = te.columns_
basket_df = pd.DataFrame(data=te_array, columns=item_names)

In [105]:
# Sanity-check the encoded matrix: its dimensions first, then the top rows
matrix_shape = basket_df.shape
print("Shape of one-hot matrix:", matrix_shape)
print("First 5 rows of matrix:", basket_df.head(), sep="\n")

Shape of one-hot matrix: (14963, 167)
First 5 rows of matrix:
   Instant food products  UHT-milk  abrasive cleaner  artif. sweetener  \
0                  False     False             False             False   
1                  False     False             False             False   
2                  False     False             False             False   
3                  False     False             False             False   
4                  False     False             False             False   

   baby cosmetics   bags  baking powder  bathroom cleaner   beef  berries  \
0           False  False          False             False  False    False   
1           False  False          False             False  False    False   
2           False  False          False             False  False    False   
3           False  False          False             False  False    False   
4           False  False          False             False  False    False   

   ...  turkey  vinegar  waffl

In [106]:
from mlxtend.frequent_patterns import apriori

In [107]:
# Run Apriori over the one-hot basket matrix to find frequent itemsets.
# min_support=0.002 is the minimum support an itemset needs to be kept;
# use_colnames=True reports itemsets by item name rather than column index.
frequent_itemsets = apriori(
    basket_df,
    min_support=0.002,
    use_colnames=True,
)

In [108]:
# Summarise the Apriori output: how many frequent itemsets were found,
# followed by the ten itemsets with the highest support.
# The count is the number of itemsets, so the label says so; the previous
# "Most popular products" wording mislabelled it.
print("Number of frequent itemsets:", len(frequent_itemsets))
print(" Top 10 results:")
print(frequent_itemsets.sort_values(by="support", ascending=False).head(10))

Most popular products: 330
 Top 10 results:
      support            itemsets
123  0.157923        (whole milk)
78   0.122101  (other vegetables)
93   0.110005        (rolls/buns)
104  0.097106              (soda)
124  0.085879            (yogurt)
94   0.069572   (root vegetables)
116  0.067767    (tropical fruit)
7    0.060683     (bottled water)
99   0.060349           (sausage)
24   0.053131      (citrus fruit)


In [109]:
from mlxtend.frequent_patterns import association_rules

In [110]:
# Derive association rules from the frequent itemsets, using lift as the
# selection metric with a minimum threshold of 1.0.
rules = association_rules(
    frequent_itemsets,
    metric="lift",
    min_threshold=1.0,
)

In [111]:
# Rank the rules from highest to lowest lift and keep the first ten
top_rules = (
    rules
    .sort_values(by="lift", ascending=False)
    .head(10)
)

In [112]:
# Print the rules held in top_rules, restricted to the columns needed to read them.
# The heading previously stopped at "10 rules with:"; it now names the metric.
print("10 rules with the highest lift:")
rule_columns = ['antecedents', 'consequents', 'support', 'confidence', 'lift']
print(top_rules[rule_columns])

10 rules with:
            antecedents          consequents   support  confidence      lift
12               (curd)            (sausage)  0.002941    0.087302  1.446615
13            (sausage)               (curd)  0.002941    0.048726  1.446615
4         (brown bread)        (canned beer)  0.002406    0.063943  1.362937
5         (canned beer)        (brown bread)  0.002406    0.051282  1.362937
21  (frozen vegetables)            (sausage)  0.002072    0.073986  1.225966
20            (sausage)  (frozen vegetables)  0.002072    0.034330  1.225966
2        (bottled beer)            (sausage)  0.003342    0.073746  1.222000
3             (sausage)       (bottled beer)  0.003342    0.055371  1.222000
15        (frankfurter)   (other vegetables)  0.005146    0.136283  1.116150
14   (other vegetables)        (frankfurter)  0.005146    0.042146  1.116150
