In [1]:
import numpy as np
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder

In [2]:
df = pd.read_csv("groceries_dataset.csv")

In [3]:
df.head()

Unnamed: 0,Member_number,Date,itemDescription
0,1808,21-07-2015,tropical fruit
1,2552,05-01-2015,whole milk
2,2300,19-09-2015,pip fruit
3,1187,12-12-2015,other vegetables
4,3037,01-02-2015,whole milk


In [4]:
df.describe()

Unnamed: 0,Member_number
count,38765.0
mean,3003.641868
std,1153.611031
min,1000.0
25%,2002.0
50%,3005.0
75%,4007.0
max,5000.0


In [5]:
df['itemDescription'].unique()

array(['tropical fruit', 'whole milk', 'pip fruit', 'other vegetables',
       'rolls/buns', 'pot plants', 'citrus fruit', 'beef', 'frankfurter',
       'chicken', 'butter', 'fruit/vegetable juice',
       'packaged fruit/vegetables', 'chocolate', 'specialty bar',
       'butter milk', 'bottled water', 'yogurt', 'sausage', 'brown bread',
       'hamburger meat', 'root vegetables', 'pork', 'pastry',
       'canned beer', 'berries', 'coffee', 'misc. beverages', 'ham',
       'turkey', 'curd cheese', 'red/blush wine',
       'frozen potato products', 'flour', 'sugar', 'frozen meals',
       'herbs', 'soda', 'detergent', 'grapes', 'processed cheese', 'fish',
       'sparkling wine', 'newspapers', 'curd', 'pasta', 'popcorn',
       'finished products', 'beverages', 'bottled beer', 'dessert',
       'dog food', 'specialty chocolate', 'condensed milk', 'cleaner',
       'white wine', 'meat', 'ice cream', 'hard cheese', 'cream cheese ',
       'liquor', 'pickled vegetables', 'liquor (appetizer

In [6]:
transactions = df.groupby('Member_number')["itemDescription"].apply(list).tolist()
encoder = TransactionEncoder()
transaction_matrix = encoder.fit(transactions).transform(transactions)
encoded_df = pd.DataFrame(transaction_matrix, columns = encoder.columns_)
encoded_df

Unnamed: 0,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,bags,baking powder,bathroom cleaner,beef,berries,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,True,False
1,False,False,False,False,False,False,False,False,True,False,...,False,False,False,True,False,True,False,True,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3893,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3894,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,True,False,False
3895,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3896,False,False,False,False,False,False,False,False,False,True,...,False,False,False,True,False,False,False,False,True,False


In [15]:
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
frequent_itemsets = apriori(encoded_df, min_support = 0.05, use_colnames = True)
rules = association_rules(frequent_itemsets, metric='lift', min_threshold = 1).sort_values(by="zhangs_metric", ascending = True)
rules.head(10)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
113,(tropical fruit),(other vegetables),0.23371,0.376603,0.091329,0.390779,1.037642,0.003313,1.023269,0.04734
58,(coffee),(whole milk),0.114931,0.458184,0.055156,0.479911,1.04742,0.002497,1.041776,0.051152
112,(other vegetables),(tropical fruit),0.376603,0.23371,0.091329,0.242507,1.037642,0.003313,1.011614,0.058191
40,(canned beer),(soda),0.165213,0.313494,0.054643,0.330745,1.055029,0.00285,1.025777,0.062481
111,(soda),(other vegetables),0.313494,0.376603,0.124166,0.396072,1.051695,0.006103,1.032237,0.071601
180,(soda),(whole milk),0.313494,0.458184,0.151103,0.481997,1.051973,0.007465,1.045971,0.071966
14,(bottled water),(rolls/buns),0.213699,0.349666,0.079271,0.370948,1.060863,0.004548,1.033832,0.072964
41,(soda),(canned beer),0.313494,0.165213,0.054643,0.174304,1.055029,0.00285,1.011011,0.075977
110,(other vegetables),(soda),0.376603,0.313494,0.124166,0.3297,1.051695,0.006103,1.024178,0.078849
96,(pastry),(other vegetables),0.177527,0.376603,0.071575,0.403179,1.070567,0.004718,1.044529,0.080143


In [18]:
'''
Вывод: 5 наиболее "ассоциативных" комбинаций согласно метрике Чанга:
    Молоко, Вода, Овощи
    Молоко, Йогурт, Овощи
    Молоко, Йогурт, Булочки
    Молоко, Овощи, Булочки
    Молоко, Йогурт, Сода

Наименее ассоциативные(при условии min_support >0.05) согласно метрике Чанга:
    Тропические фрукты, Овощи
    Кофе, Молоко
    Пиво в банках, Сода
    Сода, Овощи
    Сода, Молоко
'''

'\nВывод: 5 наиболее "ассоциативных" комбинаций согласно метрике Чанга:\n    Молоко, Вода, Овощи\n    Молоко, Йогурт, Овощи\n    Молоко, Йогурт, Булочки\n    Молоко, Овощи, Булочки\n    Молоко, Йогурт, Сода\n\nНаименее ассоциативные(при условии min_support >0.05) согласно метрике Чанга:\n    Тропические фрукты, Овощи\n    Кофе, Молоко\n    Пиво в банках, Сода\n    Сода, Овощи\n    Сода, Молоко\n'