In [1]:
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from mlxtend.preprocessing import TransactionEncoder

In [2]:
# Read the basket data (one row per 'Row Labels' entry, one column per item)
# from the CSV file and preview the first rows.
csv_path = 'Association_Rule_Mining_Data.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Row Labels,Almonds,Poha,Banana,Beans,Besan,Boiled Rice,Bread,Brinjals,Buns & Pavs,...,Sooji & Rava,Soya Products,Sugar,Sugar Cubes,Sunflower Oils,Toor Dal,Urad Dal,Utensil Scrub Pads,Washing Bars,Whole Spices
0,6468572,,,1.0,,,,,,,...,,,1.0,1.0,,,,,,
1,6486475,,,,,,1.0,,,1.0,...,,,,,,1.0,,1.0,,
2,6504964,1.0,,,,,1.0,,,,...,,,,,,,1.0,,,
3,6529569,,,1.0,,,,,,,...,,,1.0,,,1.0,1.0,,,
4,6549521,,,1.0,,,,,,,...,,,,1.0,,,,,,


## Create Boolean Dataset : Replace null values with False and then replace the 1's with True

In [3]:
# Step 1: missing cells become False.
filled = df.fillna(value=False)
# Step 2: cells equal to 1.0 become True; any other value is left as it is.
df = filled.replace(to_replace=1.0, value=True)
df.head()

Unnamed: 0,Row Labels,Almonds,Poha,Banana,Beans,Besan,Boiled Rice,Bread,Brinjals,Buns & Pavs,...,Sooji & Rava,Soya Products,Sugar,Sugar Cubes,Sunflower Oils,Toor Dal,Urad Dal,Utensil Scrub Pads,Washing Bars,Whole Spices
0,6468572,False,False,True,False,False,False,False,False,False,...,False,False,True,True,False,False,False,False,False,False
1,6486475,False,False,False,False,False,True,False,False,True,...,False,False,False,False,False,True,False,True,False,False
2,6504964,True,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,True,False,False,False
3,6529569,False,False,True,False,False,False,False,False,False,...,False,False,True,False,False,True,True,False,False,False
4,6549521,False,False,True,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False


## Dropping the 'Row Labels' column so that only item columns remain (rows are identified by their index values)

In [4]:
# Remove the 'Row Labels' column; rows remain addressable through the index.
df = df.drop(columns=['Row Labels'])
df.head()

Unnamed: 0,Almonds,Poha,Banana,Beans,Besan,Boiled Rice,Bread,Brinjals,Buns & Pavs,Cakes,...,Sooji & Rava,Soya Products,Sugar,Sugar Cubes,Sunflower Oils,Toor Dal,Urad Dal,Utensil Scrub Pads,Washing Bars,Whole Spices
0,False,False,True,False,False,False,False,False,False,False,...,False,False,True,True,False,False,False,False,False,False
1,False,False,False,False,False,True,False,False,True,True,...,False,False,False,False,False,True,False,True,False,False
2,True,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
3,False,False,True,False,False,False,False,False,False,True,...,False,False,True,False,False,True,True,False,False,False
4,False,False,True,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False


## Some cells hold values other than 1 (e.g. 3.0 in 'Namkeen' and 2.0 in 'Other Dals'); those values are also set to True

In [5]:
# Show the distinct values (and their frequencies) of two columns, separated
# by a divider line, to check for entries that are neither True nor False.
namkeen_counts = df["Namkeen"].value_counts()
other_dals_counts = df["Other Dals"].value_counts()
print(namkeen_counts, '------', other_dals_counts, sep='\n')

False    29
True     20
3.0       1
Name: Namkeen, dtype: int64
------
False    25
True     24
2.0       1
Name: Other Dals, dtype: int64


In [6]:
# Earlier cells only mapped NaN -> False and 1.0 -> True, so cells holding other
# numbers (the 2.0 / 3.0 found in 'Other Dals' and 'Namkeen') are still floats.
# Casting every column to bool turns any non-zero value into True without
# hard-coding the specific values that happened to show up in two columns, and
# leaves a purely boolean frame for the apriori step below.
# NaNs were already replaced with False earlier, so none can be cast to True here.
df = df.astype(bool)
df.head()

Unnamed: 0,Almonds,Poha,Banana,Beans,Besan,Boiled Rice,Bread,Brinjals,Buns & Pavs,Cakes,...,Sooji & Rava,Soya Products,Sugar,Sugar Cubes,Sunflower Oils,Toor Dal,Urad Dal,Utensil Scrub Pads,Washing Bars,Whole Spices
0,False,False,True,False,False,False,False,False,False,False,...,False,False,True,True,False,False,False,False,False,False
1,False,False,False,False,False,True,False,False,True,True,...,False,False,False,False,False,True,False,True,False,False
2,True,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
3,False,False,True,False,False,False,False,False,False,True,...,False,False,True,False,False,True,True,False,False,False
4,False,False,True,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False


## Using Apriori algorithm with min_support = 0.2

In [7]:
# Mine frequent itemsets with a minimum support of 0.2; use_colnames=True
# reports the item (column) names instead of column positions.
frequent_itemsets = apriori(df, use_colnames=True, min_support=0.2)
print(frequent_itemsets)

     support                                           itemsets
0       0.72                                           (Banana)
1       0.48                                            (Beans)
2       0.40                                      (Boiled Rice)
3       0.20                                          (Cashews)
4       0.20                                             (Eggs)
..       ...                                                ...
163     0.20  (Beans, Other Vegetables, Root Vegetables, Oth...
164     0.20       (Beans, Sugar, Other Vegetables, Other Dals)
165     0.20               (Beans, Sugar, Other Dals, Urad Dal)
166     0.20         (Sugar, Other Dals, Boiled Rice, Urad Dal)
167     0.20    (Other Dals, Sugar, Other Vegetables, Urad Dal)

[168 rows x 2 columns]


In [8]:
# Record how many items each frequent itemset contains.

frequent_itemsets["itemsets_length"] = frequent_itemsets["itemsets"].map(len)
frequent_itemsets

Unnamed: 0,support,itemsets,itemsets_length
0,0.72,(Banana),1
1,0.48,(Beans),1
2,0.40,(Boiled Rice),1
3,0.20,(Cashews),1
4,0.20,(Eggs),1
...,...,...,...
163,0.20,"(Beans, Other Vegetables, Root Vegetables, Oth...",4
164,0.20,"(Beans, Sugar, Other Vegetables, Other Dals)",4
165,0.20,"(Beans, Sugar, Other Dals, Urad Dal)",4
166,0.20,"(Sugar, Other Dals, Boiled Rice, Urad Dal)",4


## Generating the rules for the itemsets
### 'min_threshold' value set to 0.8 for the 'confidence' metric

In [9]:
# Build association rules from the frequent itemsets, using 0.8 as the
# minimum threshold on the 'confidence' metric.
rules = association_rules(frequent_itemsets, min_threshold=0.8, metric="confidence")
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Liquid Soaps & Bars),(Banana),0.22,0.72,0.20,0.909091,1.262626,0.0416,3.080000
1,(Namkeen),(Banana),0.42,0.72,0.38,0.904762,1.256614,0.0776,2.940000
2,(Other Vegetables),(Banana),0.38,0.72,0.32,0.842105,1.169591,0.0464,1.773333
3,(Root Vegetables),(Banana),0.44,0.72,0.40,0.909091,1.262626,0.0832,3.080000
4,(Snacky Nuts),(Banana),0.40,0.72,0.36,0.900000,1.250000,0.0720,2.800000
...,...,...,...,...,...,...,...,...,...
116,"(Boiled Rice, Urad Dal)","(Sugar, Other Dals)",0.24,0.36,0.20,0.833333,2.314815,0.1136,3.840000
117,"(Sugar, Other Dals, Other Vegetables)",(Urad Dal),0.24,0.36,0.20,0.833333,2.314815,0.1136,3.840000
118,"(Other Dals, Other Vegetables, Urad Dal)",(Sugar),0.20,0.50,0.20,1.000000,2.000000,0.1000,inf
119,"(Sugar, Other Vegetables, Urad Dal)",(Other Dals),0.22,0.50,0.20,0.909091,1.818182,0.0900,5.500000


## Filtering conditions :
### Antecedent length > 2 (at least 3 items)
### Confidence > 75%
### Lift > 1.2

In [10]:
# Number of items on the left-hand side (antecedent) of each rule.
rules["antecedent_len"] = rules["antecedents"].map(len)

In [11]:
# Keep rules with more than 2 antecedent items, confidence above 0.75 and
# lift above 1.2; each condition is built as its own boolean mask.
has_long_antecedent = rules['antecedent_len'].gt(2)
is_confident = rules['confidence'].gt(0.75)
has_high_lift = rules['lift'].gt(1.2)
rules_filtered = rules.loc[has_long_antecedent & is_confident & has_high_lift]

In [12]:
# Print the (rows, columns) shape, a divider line, and then the filtered rules.
print(rules_filtered.shape, '------', rules_filtered, sep='\n')

(37, 10)
------
                                         antecedents         consequents  \
73                       (Beans, Other Dals, Banana)   (Root Vegetables)   
74              (Beans, Other Dals, Root Vegetables)            (Banana)   
75                  (Beans, Banana, Root Vegetables)        (Other Dals)   
76             (Other Dals, Banana, Root Vegetables)             (Beans)   
78        (Beans, Other Vegetables, Root Vegetables)            (Banana)   
79       (Other Vegetables, Banana, Root Vegetables)             (Beans)   
81                      (Beans, Banana, Snacky Nuts)   (Root Vegetables)   
82             (Beans, Snacky Nuts, Root Vegetables)            (Banana)   
83                            (Beans, Sugar, Banana)   (Root Vegetables)   
85                      (Beans, Sugar Cubes, Banana)       (Snacky Nuts)   
86                 (Beans, Sugar Cubes, Snacky Nuts)            (Banana)   
87                      (Beans, Banana, Snacky Nuts)       (Sugar Cubes)

## Sorting the values with 'lift' in descending order

In [13]:
# Display the filtered rules ordered from highest to lowest lift
# (returns a sorted copy; rules_filtered itself is not modified).
rules_filtered.sort_values(by='lift', ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len
113,"(Sugar, Other Dals, Boiled Rice)",(Urad Dal),0.22,0.36,0.2,0.909091,2.525253,0.1208,7.04,3
117,"(Sugar, Other Dals, Other Vegetables)",(Urad Dal),0.24,0.36,0.2,0.833333,2.314815,0.1136,3.84,3
111,"(Beans, Sugar, Other Dals)",(Urad Dal),0.24,0.36,0.2,0.833333,2.314815,0.1136,3.84,3
98,"(Sugar Cubes, Banana, Root Vegetables)",(Snacky Nuts),0.22,0.4,0.2,0.909091,2.272727,0.112,6.6,3
85,"(Beans, Sugar Cubes, Banana)",(Snacky Nuts),0.22,0.4,0.2,0.909091,2.272727,0.112,6.6,3
108,"(Beans, Sugar, Other Dals)",(Other Vegetables),0.24,0.38,0.2,0.833333,2.192982,0.1088,3.72,3
73,"(Beans, Other Dals, Banana)",(Root Vegetables),0.26,0.44,0.24,0.923077,2.097902,0.1256,7.28,3
87,"(Beans, Banana, Snacky Nuts)",(Sugar Cubes),0.24,0.4,0.2,0.833333,2.083333,0.104,3.6,3
90,"(Other Dals, Banana, Other Vegetables)",(Root Vegetables),0.22,0.44,0.2,0.909091,2.066116,0.1032,6.16,3
118,"(Other Dals, Other Vegetables, Urad Dal)",(Sugar),0.2,0.5,0.2,1.0,2.0,0.1,inf,3


In [14]:
# List the column names available on the filtered rules.
rules_filtered.columns

Index(['antecedents', 'consequents', 'antecedent support',
       'consequent support', 'support', 'confidence', 'lift', 'leverage',
       'conviction', 'antecedent_len'],
      dtype='object')

## Sorting the values by 'consequent support', 'confidence' and 'lift' (in that priority), all in descending order

In [15]:
# Order by consequent support first, then confidence, then lift — all descending.
sort_keys = ['consequent support', 'confidence', 'lift']
rules_filtered.sort_values(by=sort_keys, ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len
86,"(Beans, Sugar Cubes, Snacky Nuts)",(Banana),0.2,0.72,0.2,1.0,1.388889,0.056,inf,3
100,"(Sugar Cubes, Snacky Nuts, Root Vegetables)",(Banana),0.2,0.72,0.2,1.0,1.388889,0.056,inf,3
74,"(Beans, Other Dals, Root Vegetables)",(Banana),0.26,0.72,0.24,0.923077,1.282051,0.0528,3.64,3
78,"(Beans, Other Vegetables, Root Vegetables)",(Banana),0.22,0.72,0.2,0.909091,1.262626,0.0416,3.08,3
82,"(Beans, Snacky Nuts, Root Vegetables)",(Banana),0.22,0.72,0.2,0.909091,1.262626,0.0416,3.08,3
91,"(Other Dals, Root Vegetables, Other Vegetables)",(Banana),0.22,0.72,0.2,0.909091,1.262626,0.0416,3.08,3
94,"(Sugar, Other Dals, Root Vegetables)",(Banana),0.22,0.72,0.2,0.909091,1.262626,0.0416,3.08,3
96,"(Other Dals, Banana, Urad Dal)",(Sugar),0.2,0.5,0.2,1.0,2.0,0.1,inf,3
112,"(Beans, Other Dals, Urad Dal)",(Sugar),0.2,0.5,0.2,1.0,2.0,0.1,inf,3
118,"(Other Dals, Other Vegetables, Urad Dal)",(Sugar),0.2,0.5,0.2,1.0,2.0,0.1,inf,3


## Suggestions and Insights :

### 1. The items in the 'consequents' column can be suggested to customers when the items they have purchased include the items in the 'antecedents' column.
### 2. Combinations such as [Snacky Nuts, Beans, Sugar Cubes, Banana] and [Snacky Nuts, Sugar Cubes, Root Vegetables, Banana] are good combinations to offer customers, as their rules have high 'confidence' and 'consequent support'.
### 3. After sorting the association rules by 'consequent support', 'confidence', and 'lift', we can suggest these products to customers and also give discounts on them, since the probability of a customer purchasing these 4 products together is higher than for other available combinations.
### 4. By giving discounts on such combinations, the store will sell more units and thereby increase its profitability.