In [1]:
# importing module
import pandas as pd

# dataset
data = pd.read_csv("db/Market_Basket_Optimisation.csv")

# printing the shape of the dataset
data.shape

(7500, 20)

In [3]:
# importing module
import numpy as np

# Gather All Items of Each Transactions into Numpy Array
transaction = []
for i in range(0, data.shape[0]):
    for j in range(0, data.shape[1]):
        transaction.append(data.values[i,j])

# converting to numpy array
transaction = np.array(transaction)
print(transaction)
#  Transform Them a Pandas DataFrame
df = pd.DataFrame(transaction, columns=["items"]) 



# Put 1 to Each Item For Making Countable Table, to be able to perform Group By
df["incident_count"] = 1 

#  Delete NaN Items from Dataset
indexNames = df[df['items'] == "nan" ].index
df.drop(indexNames , inplace=True)


# Making a New Appropriate Pandas DataFrame for Visualizations  
df_table = df.groupby("items").sum().sort_values("incident_count", ascending=False).reset_index()

#  Initial Visualizations
df_table.head(10).style.background_gradient(cmap='Greens')


['burgers' 'meatballs' 'eggs' ... 'nan' 'nan' 'nan']


Unnamed: 0,items,incident_count
0,mineral water,1787
1,eggs,1348
2,spaghetti,1306
3,french fries,1282
4,chocolate,1230
5,green tea,990
6,milk,972
7,ground beef,737
8,frozen vegetables,715
9,pancakes,713


In [6]:
records=[]
for i in range(0,7500):
    records.append([str(data.values[i,j]) for j in range(0,20)])

# importing the required module
from mlxtend.preprocessing import TransactionEncoder

# initializing the transactionEncoder
te = TransactionEncoder()
te_ary = te.fit(records).transform(records)
dataset = pd.DataFrame(te_ary, columns=te.columns_)

# dataset after encoded
dataset

Unnamed: 0,asparagus,almonds,antioxydant juice,asparagus.1,avocado,babies food,bacon,barbecue sauce,black tea,blueberries,...,turkey,vegetables mix,water spray,white wine,whole weat flour,whole wheat pasta,whole wheat rice,yams,yogurt cake,zucchini
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,True,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7495,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7496,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7497,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7498,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [7]:
# select top 50 items
first50 = df_table["items"].head(50).values 

# Extract Top50
dataset = dataset.loc[:,first50] 

# shape of the dataset
dataset.shape

(7500, 50)

In [8]:
# importing the required module
from mlxtend.frequent_patterns import apriori, association_rules


# Extracting the most frequest itemsets via Mlxtend.
# The length column has been added to increase ease of filtering.
frequent_itemsets = apriori(dataset, min_support=0.01, use_colnames=True)
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(lambda x: len(x))

# printing the frequent itemset
frequent_itemsets

Unnamed: 0,support,itemsets,length
0,0.238267,(mineral water),1
1,0.179733,(eggs),1
2,0.174133,(spaghetti),1
3,0.170933,(french fries),1
4,0.163867,(chocolate),1
...,...,...,...
229,0.010933,"(chocolate, mineral water, ground beef)",3
230,0.011067,"(ground beef, mineral water, milk)",3
231,0.011067,"(mineral water, frozen vegetables, milk)",3
232,0.010533,"(chocolate, spaghetti, eggs)",3


In [9]:
# printing the frequntly items 
frequent_itemsets[ (frequent_itemsets['length'] == 2) &
                   (frequent_itemsets['support'] >= 0.05) ]

Unnamed: 0,support,itemsets,length
50,0.050933,"(mineral water, eggs)",2
51,0.059733,"(spaghetti, mineral water)",2
53,0.052667,"(chocolate, mineral water)",2


In [10]:
# printing the frequntly items with length 3
frequent_itemsets[ (frequent_itemsets['length'] == 3) ].head(3)

Unnamed: 0,support,itemsets,length
217,0.014267,"(spaghetti, mineral water, eggs)",3
218,0.013467,"(chocolate, mineral water, eggs)",3
219,0.013067,"(mineral water, milk, eggs)",3


In [11]:
#  We set our metric as "Lift" to define whether antecedents & consequents are dependent our not
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.2)
rules["antecedents_length"] = rules["antecedents"].apply(lambda x: len(x))
rules["consequents_length"] = rules["consequents"].apply(lambda x: len(x))
rules.sort_values("lift",ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedents_length,consequents_length
218,(herb & pepper),(ground beef),0.049467,0.098267,0.016000,0.323450,3.291555,0.011139,1.332841,1,1
219,(ground beef),(herb & pepper),0.098267,0.049467,0.016000,0.162822,3.291555,0.011139,1.135402,1,1
295,(ground beef),"(spaghetti, mineral water)",0.098267,0.059733,0.017067,0.173677,2.907540,0.011197,1.137893,1,2
290,"(spaghetti, mineral water)",(ground beef),0.059733,0.098267,0.017067,0.285714,2.907540,0.011197,1.262427,2,1
312,(olive oil),"(spaghetti, mineral water)",0.065733,0.059733,0.010267,0.156187,2.614731,0.006340,1.114306,1,2
...,...,...,...,...,...,...,...,...,...,...,...
60,(low fat yogurt),(eggs),0.076400,0.179733,0.016800,0.219895,1.223453,0.003068,1.051483,1,1
122,(escalope),(french fries),0.079333,0.170933,0.016400,0.206723,1.209376,0.002839,1.045116,1,1
123,(french fries),(escalope),0.170933,0.079333,0.016400,0.095944,1.209376,0.002839,1.018373,1,1
165,(shrimp),(green tea),0.071333,0.132000,0.011333,0.158879,1.203625,0.001917,1.031956,1,1


In [12]:
# Sort values based on confidence
rules.sort_values("confidence",ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedents_length,consequents_length
270,"(ground beef, eggs)",(mineral water),0.020000,0.238267,0.010133,0.506667,2.126469,0.005368,1.544054,2,1
327,"(ground beef, milk)",(mineral water),0.022000,0.238267,0.011067,0.503030,2.111207,0.005825,1.532756,2,1
321,"(chocolate, ground beef)",(mineral water),0.023067,0.238267,0.010933,0.473988,1.989319,0.005437,1.448130,2,1
334,"(frozen vegetables, milk)",(mineral water),0.023600,0.238267,0.011067,0.468927,1.968075,0.005444,1.434328,2,1
34,(soup),(mineral water),0.050533,0.238267,0.023067,0.456464,1.915771,0.011026,1.401441,1,1
...,...,...,...,...,...,...,...,...,...,...,...
47,(mineral water),(red wine),0.238267,0.028133,0.010933,0.045887,1.631053,0.004230,1.018607,1,1
313,(mineral water),"(spaghetti, olive oil)",0.238267,0.022933,0.010267,0.043089,1.878880,0.004802,1.021063,1,2
48,(mineral water),(cereals),0.238267,0.025733,0.010267,0.043089,1.674442,0.004135,1.018137,1,1
271,(mineral water),"(ground beef, eggs)",0.238267,0.020000,0.010133,0.042529,2.126469,0.005368,1.023530,1,2
