In [1]:
import time # for calculation of times
import numpy as np # for efficient list
import pandas as pd # for efficient tables
from matplotlib import pyplot as plt # for visualization
import seaborn as sns # for visualization
from mlxtend.frequent_patterns import apriori # machine learning xtend tool for apriori algorithm
from mlxtend.frequent_patterns import association_rules # machine learning xtend tool for association rules
from mlxtend.frequent_patterns import fpgrowth # machine learning xtend tool for fp growth algorithm

In [2]:
# Apply matplotlib's 'fivethirtyeight' style sheet to figures drawn after this cell.
plt.style.use('fivethirtyeight')

In [3]:
path = 'groceries.txt'

# Each line of the file is one transaction: a comma-separated list of item names.
data = []
with open(path, 'r') as f:  # the context manager closes the file even if parsing raises
    for line in f:  # stream line by line instead of loading everything via readlines()
        # Strip only the line terminator, then discard empty fields (such as the
        # one left behind by a trailing comma). Unconditionally dropping the last
        # field would lose a real item on any line that lacks the trailing
        # separator or the final newline.
        items = [item for item in line.rstrip('\r\n').split(',') if item]
        if items:  # ignore blank lines
            data.append(items)
data

[['citrus fruit', 'semi-finished bread', 'margarine'],
 ['tropical fruit', 'yogurt'],
 ['pip fruit', 'yogurt', 'cream cheese'],
 ['other vegetables', 'whole milk', 'condensed milk'],
 ['whole milk', 'butter', 'yogurt', 'rice'],
 ['other vegetables', 'UHT-milk', 'rolls/buns', 'bottled beer'],
 ['whole milk'],
 ['tropical fruit', 'other vegetables', 'white bread', 'bottled water'],
 ['citrus fruit',
  'tropical fruit',
  'whole milk',
  'butter',
  'curd',
  'yogurt',
  'flour',
  'bottled water'],
 ['frankfurter', 'rolls/buns'],
 ['chicken'],
 ['butter', 'sugar', 'fruit/vegetable juice'],
 ['butter milk'],
 ['tropical fruit', 'cream cheese', 'processed cheese', 'detergent'],
 ['tropical fruit',
  'root vegetables',
  'other vegetables',
  'frozen dessert',
  'rolls/buns',
  'flour',
  'sweet spreads',
  'salty snack',
  'waffles',
  'candy'],
 ['bottled water'],
 ['sausage', 'rolls/buns', 'soda'],
 ['brown bread', 'soda', 'fruit/vegetable juice', 'canned beer', 'newspapers'],
 ['yogurt'

In [4]:
# Collect the distinct item names across all transactions. The set comprehension
# removes duplicates; sorted() returns a list in lexicographic (code-point)
# order, so names starting with an uppercase letter come first.
columns = sorted({item for basket in data for item in basket})
columns

['Instant food products',
 'UHT-milk',
 'abrasive cleaner',
 'artif. sweetener',
 'baby cosmetics',
 'baby food',
 'baking powder',
 'bathroom cleaner',
 'beef',
 'berries',
 'beverages',
 'bottled beer',
 'bottled water',
 'brandy',
 'brown bread',
 'butter',
 'butter milk',
 'cake bar',
 'candles',
 'candy',
 'canned beer',
 'canned fish',
 'canned fruit',
 'canned vegetables',
 'cat food',
 'cereals',
 'chewing gum',
 'chicken',
 'chocolate',
 'chocolate marshmallow',
 'citrus fruit',
 'cleaner',
 'cling film/bags',
 'cocoa drinks',
 'coffee',
 'condensed milk',
 'cooking chocolate',
 'cookware',
 'cream',
 'cream cheese',
 'curd',
 'curd cheese',
 'decalcifier',
 'dental care',
 'dessert',
 'detergent',
 'dish cleaner',
 'dishes',
 'dog food',
 'domestic eggs',
 'female sanitary products',
 'finished products',
 'fish',
 'flour',
 'flower (seeds)',
 'flower soil/fertilizer',
 'frankfurter',
 'frozen chicken',
 'frozen dessert',
 'frozen fish',
 'frozen fruits',
 'frozen meals',
 'f

In [5]:
# One-hot encode the transactions: one row per transaction and one boolean per
# entry of `columns`, True when that item occurs in the transaction.
bool_data = [[item in basket for item in columns] for basket in data]

In [6]:
# Wrap the boolean matrix in a DataFrame labelled by item name.
# Note: from here on `data` refers to this DataFrame, not the list of transactions.
data = pd.DataFrame(data=bool_data, columns=columns)

In [7]:
# Preview the first five encoded transactions.
data.head(n=5)

Unnamed: 0,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,baby food,baking powder,bathroom cleaner,beef,berries,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,True,False


In [8]:
# Total number of missing cells in the frame (per-column counts, then summed).
data.isna().sum().sum()

0

In [9]:
# Show the dtype of every column of the encoded frame.
data.dtypes

Instant food products    bool
UHT-milk                 bool
abrasive cleaner         bool
artif. sweetener         bool
baby cosmetics           bool
                         ... 
white bread              bool
white wine               bool
whole milk               bool
yogurt                   bool
zwieback                 bool
Length: 167, dtype: object

<font color=SpringGreen><h1>Apriori cells</h1></font>

In [10]:
# Mine frequent itemsets with Apriori, keeping itemsets whose support is at
# least 0.05; use_colnames=True reports item names instead of column indices.
apr = apriori(data, min_support=0.05, use_colnames=True)
apr

Unnamed: 0,support,itemsets
0,0.063445,(beef)
1,0.053153,(bottled beer)
2,0.10852,(bottled water)
3,0.068395,(brown bread)
4,0.068265,(butter)
5,0.053022,(chicken)
6,0.103179,(citrus fruit)
7,0.053022,(coffee)
8,0.064878,(curd)
9,0.073476,(domestic eggs)


In [11]:
# Time the Apriori call: 10 loops in a single run (-n 10 -r 1).
# With only one run, the reported standard deviation is always 0.
%timeit -n 10 -r 1 apriori(df = data, min_support = 0.05, use_colnames = True)

13.2 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 10 loops each)


In [12]:
# Derive association rules from the Apriori itemsets, keeping rules whose
# confidence (the default metric, spelled out here) is at least 0.3.
ar = association_rules(apr, metric="confidence", min_threshold=0.3)
ar

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(root vegetables),(other vegetables),0.134315,0.230849,0.058755,0.437439,1.894912,0.027748,1.367231
1,(whole milk),(other vegetables),0.301199,0.230849,0.093147,0.309256,1.339644,0.023616,1.11351
2,(other vegetables),(whole milk),0.230849,0.301199,0.093147,0.403499,1.339644,0.023616,1.171501
3,(yogurt),(other vegetables),0.163106,0.230849,0.052762,0.323482,1.40127,0.015109,1.136926
4,(rolls/buns),(whole milk),0.190464,0.301199,0.063705,0.334473,1.110475,0.006338,1.049998
5,(root vegetables),(whole milk),0.134315,0.301199,0.060448,0.450048,1.494192,0.019993,1.27066
6,(tropical fruit),(whole milk),0.130146,0.301199,0.052241,0.401401,1.33268,0.013041,1.167396
7,(yogurt),(whole milk),0.163106,0.301199,0.068265,0.41853,1.38955,0.019137,1.201785


<font color=SpringGreen><h1>FP Growth tree cells</h1></font>

In [13]:
# Mine frequent itemsets with FP-Growth using the same 0.05 support threshold
# as the Apriori cell; use_colnames=True reports item names instead of indices.
fp = fpgrowth(df=data, min_support=0.05, use_colnames=True)
fp

Unnamed: 0,support,itemsets
0,0.103179,(citrus fruit)
1,0.062923,(margarine)
2,0.163106,(yogurt)
3,0.130146,(tropical fruit)
4,0.091454,(pip fruit)
5,0.301199,(whole milk)
6,0.230849,(other vegetables)
7,0.068265,(butter)
8,0.190464,(rolls/buns)
9,0.053153,(bottled beer)


In [14]:
# Time the FP-Growth call: 10 loops in a single run (-n 10 -r 1).
# With only one run, the reported standard deviation is always 0.
%timeit -n 10 -r 1 fpgrowth(data, min_support = 0.05, use_colnames = True)

90.7 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 10 loops each)
