In [11]:
import pandas as pd

# http://rasbt.github.io/mlxtend/
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, fpgrowth, association_rules

Učitavanje podataka o kupovinama

In [2]:
# Load the raw purchase table; the path is relative to the current working directory.
data = pd.read_csv('Grocery Products Purchase.csv')

In [3]:
# Preview the raw table: each row is one purchase, spread over the
# "Product 1" .. "Product 32" columns; unused slots hold no product name.
data

Unnamed: 0,Product 1,Product 2,Product 3,Product 4,Product 5,Product 6,Product 7,Product 8,Product 9,Product 10,...,Product 23,Product 24,Product 25,Product 26,Product 27,Product 28,Product 29,Product 30,Product 31,Product 32
0,citrus fruit,semi-finished bread,margarine,ready soups,,,,,,,...,,,,,,,,,,
1,tropical fruit,yogurt,coffee,,,,,,,,...,,,,,,,,,,
2,whole milk,,,,,,,,,,...,,,,,,,,,,
3,pip fruit,yogurt,cream cheese,meat spreads,,,,,,,...,,,,,,,,,,
4,other vegetables,whole milk,condensed milk,long life bakery product,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9830,sausage,chicken,beef,hamburger meat,citrus fruit,grapes,root vegetables,whole milk,butter,whipped/sour cream,...,,,,,,,,,,
9831,cooking chocolate,,,,,,,,,,...,,,,,,,,,,
9832,chicken,citrus fruit,other vegetables,butter,yogurt,frozen dessert,domestic eggs,rolls/buns,rum,cling film/bags,...,,,,,,,,,,
9833,semi-finished bread,bottled water,soda,bottled beer,,,,,,,...,,,,,,,,,,


Preprocesiranje transakcija u oblik "lista listi"

In [4]:
# Build the "list of lists" form: one inner list per row of `data`,
# keeping only the string cells (product names) and dropping everything else.
transactions = [
    [cell for cell in record.values if isinstance(cell, str)]
    for _, record in data.iterrows()
]

Transformacija liste proizvoda u matricu čiji redovi odgovaraju transakcijama, a kolone pojedinačnim predmetima. Presek reda $i$ i kolone $j$ sadrži indikator da li se proizvod $j$ nalazi u transakciji $i$.

In [7]:
# Fit the encoder on the transactions first, then encode those same
# transactions into the indicator matrix.
te = TransactionEncoder().fit(transactions)
item_set = te.transform(transactions)

In [8]:
# Show the encoded matrix of True/False indicators (rows: transactions, columns: products).
item_set

array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False,  True, False],
       [False, False, False, ...,  True, False, False],
       ...,
       [False, False, False, ..., False,  True, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

Nazivi kolona koji odgovaraju pojedinačnim proizvodima

In [9]:
# Product names held by the encoder; they are used below as the column labels of item_set.
te.columns_

['Instant food products',
 'UHT-milk',
 'abrasive cleaner',
 'artif. sweetener',
 'baby cosmetics',
 'baby food',
 'bags',
 'baking powder',
 'bathroom cleaner',
 'beef',
 'berries',
 'beverages',
 'bottled beer',
 'bottled water',
 'brandy',
 'brown bread',
 'butter',
 'butter milk',
 'cake bar',
 'candles',
 'candy',
 'canned beer',
 'canned fish',
 'canned fruit',
 'canned vegetables',
 'cat food',
 'cereals',
 'chewing gum',
 'chicken',
 'chocolate',
 'chocolate marshmallow',
 'citrus fruit',
 'cleaner',
 'cling film/bags',
 'cocoa drinks',
 'coffee',
 'condensed milk',
 'cooking chocolate',
 'cookware',
 'cream',
 'cream cheese',
 'curd',
 'curd cheese',
 'decalcifier',
 'dental care',
 'dessert',
 'detergent',
 'dish cleaner',
 'dishes',
 'dog food',
 'domestic eggs',
 'female sanitary products',
 'finished products',
 'fish',
 'flour',
 'flower (seeds)',
 'flower soil/fertilizer',
 'frankfurter',
 'frozen chicken',
 'frozen dessert',
 'frozen fish',
 'frozen fruits',
 'frozen me

Transformacija dobijene matrice u DataFrame objekat, radi bolje preglednosti

In [13]:
# Wrap the indicator matrix in a DataFrame so that every column carries its product name.
item_set_df = pd.DataFrame(item_set, columns=te.columns_)
item_set_df

Unnamed: 0,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,baby food,bags,baking powder,bathroom cleaner,beef,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9830,False,False,False,False,False,False,False,False,False,True,...,False,False,False,True,False,False,False,True,False,False
9831,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9832,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
9833,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


Izdvajanje čestih skupova proizvoda pomoću Apriori i FPGrowth algoritama

In [14]:
# Apriori (disabled alternative, kept for comparison). Note that its
# min_support (0.1) is ten times larger than the FP-Growth value used below,
# so enabling it as-is would not reproduce the same itemsets.
# sets = apriori(item_set_df, min_support=0.1, use_colnames=True,
#                max_len=None, verbose=1, low_memory=False)

# FP-Growth (active): support threshold 0.01, itemsets reported by product
# name (use_colnames=True), no cap on itemset length, no progress output.
sets = fpgrowth(item_set_df, min_support=0.01, use_colnames=True, max_len=None, verbose=0)

Primer izdvojenih čestih skupova proizvoda i vrednosti njihove podrške

In [15]:
# Show the mined itemsets together with their support values.
sets

Unnamed: 0,support,itemsets
0,0.082766,(citrus fruit)
1,0.058566,(margarine)
2,0.017692,(semi-finished bread)
3,0.139502,(yogurt)
4,0.104931,(tropical fruit)
...,...,...
328,0.010168,"(rolls/buns, frozen vegetables)"
329,0.012405,"(frozen vegetables, yogurt)"
330,0.014235,"(other vegetables, onions)"
331,0.012100,"(onions, whole milk)"


Pronalaženje pravila pridruživanja na osnovu čestih skupova

In [16]:
# Derive association rules from the frequent itemsets, filtering on the
# lift metric with a threshold of 2.
rules = association_rules(sets, metric='lift', min_threshold=2)

Sortiranje pravila pridruživanja na osnovu pouzdanosti

In [17]:
# Rank the rules from highest to lowest confidence (returns a sorted copy for display).
rules.sort_values('confidence', ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
11,"(root vegetables, citrus fruit)",(other vegetables),0.017692,0.193493,0.010371,0.586207,3.029608,0.006948,1.949059
41,"(root vegetables, tropical fruit)",(other vegetables),0.021047,0.193493,0.012303,0.584541,3.020999,0.008231,1.941244
79,"(yogurt, curd)",(whole milk),0.017285,0.255516,0.010066,0.582353,2.279125,0.005649,1.782567
69,"(other vegetables, butter)",(whole milk),0.020031,0.255516,0.011490,0.573604,2.244885,0.006371,1.745992
46,"(root vegetables, tropical fruit)",(whole milk),0.021047,0.255516,0.011998,0.570048,2.230969,0.006620,1.731553
...,...,...,...,...,...,...,...,...,...
155,(whole milk),"(domestic eggs, other vegetables)",0.255516,0.022267,0.012303,0.048150,2.162336,0.006613,1.027191
51,(whole milk),"(root vegetables, tropical fruit)",0.255516,0.021047,0.011998,0.046956,2.230969,0.006620,1.027185
72,(whole milk),"(other vegetables, butter)",0.255516,0.020031,0.011490,0.044966,2.244885,0.006371,1.026110
133,(whole milk),"(whipped/sour cream, yogurt)",0.255516,0.020742,0.010880,0.042579,2.052747,0.005580,1.022807


Primer korišćenja fpgrowth-py ([link](https://pypi.org/project/fpgrowth-py/)) biblioteke sa nešto drugačijim interfejsom

In [None]:
# Disabled example for the fpgrowth-py package, kept for reference only.
# NOTE: enabling this import would rebind the name `fpgrowth`, which the
# cells above use for mlxtend's implementation.
# from fpgrowth_py import fpgrowth


# itemSetList = [['eggs', 'bacon', 'soup'],
#                 ['eggs', 'bacon', 'apple'],
#                 ['soup', 'bacon', 'banana']]
# freqItemSet, rules = fpgrowth(itemSetList, minSupRatio=0.5, minConf=0.5)
# print(rules)
# NOTE(review): the sample output below mentions 'beer' and 'rice', which do
# not occur in itemSetList above, so it illustrates the output format only.
# # [[{'beer'}, {'rice'}, 0.6666666666666666], [{'rice'}, {'beer'}, 1.0]]
# Presumably each entry of `rules` is a triple read as:
# antecedent --> consequent, confidence (verify against the package docs).
# # rules[0] --> rules[1], confidence = rules[2]