In [1]:
import numpy as np  
import pandas as pd  
from mlxtend.frequent_patterns import apriori, association_rules

In [None]:
# Read the retail workbook into a DataFrame, then preview its first rows
data1 = pd.read_excel(io='Online Retail.xlsx')
data1.head()

In [13]:
# Show the column labels of the loaded frame
data1.columns  

Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'UnitPrice', 'CustomerID', 'Country'],
      dtype='object')

In [14]:
# List the distinct values found in the Country column
data1['Country'].unique()

array(['United Kingdom', 'France', 'Australia', 'Netherlands', 'Germany',
       'Norway', 'EIRE', 'Switzerland', 'Spain', 'Poland', 'Portugal',
       'Italy', 'Belgium', 'Lithuania', 'Japan', 'Iceland',
       'Channel Islands', 'Denmark', 'Cyprus', 'Sweden', 'Austria',
       'Israel', 'Finland', 'Bahrain', 'Greece', 'Hong Kong', 'Singapore',
       'Lebanon', 'United Arab Emirates', 'Saudi Arabia',
       'Czech Republic', 'Canada', 'Unspecified', 'Brazil', 'USA',
       'European Community', 'Malta', 'RSA'], dtype=object)

In [15]:
# List the distinct values found in the Country column
# NOTE(review): this cell repeats the preceding cell's query verbatim
data1['Country'].unique()

array(['United Kingdom', 'France', 'Australia', 'Netherlands', 'Germany',
       'Norway', 'EIRE', 'Switzerland', 'Spain', 'Poland', 'Portugal',
       'Italy', 'Belgium', 'Lithuania', 'Japan', 'Iceland',
       'Channel Islands', 'Denmark', 'Cyprus', 'Sweden', 'Austria',
       'Israel', 'Finland', 'Bahrain', 'Greece', 'Hong Kong', 'Singapore',
       'Lebanon', 'United Arab Emirates', 'Saudi Arabia',
       'Czech Republic', 'Canada', 'Unspecified', 'Brazil', 'USA',
       'European Community', 'Malta', 'RSA'], dtype=object)

In [16]:
# Clean the loaded transactions in one chain that builds a fresh frame.
# Nothing is edited in place, so re-running this cell never writes into a
# filtered slice left over from a previous run.
data1 = (
    data1
    # Strip the extra spaces around the item descriptions
    .assign(Description=lambda df: df['Description'].str.strip())
    # Drop the rows which do not have any invoice number
    .dropna(axis=0, subset=['InvoiceNo'])
    # Invoice numbers are matched as text below, so cast them to str
    .assign(InvoiceNo=lambda df: df['InvoiceNo'].astype('str'))
    # Remove every invoice whose number contains the letter 'C'
    # (plain substring match, no regex needed).
    # NOTE(review): the earlier comment described these as credit
    # transactions -- confirm the meaning of 'C' against the dataset docs.
    .loc[lambda df: ~df['InvoiceNo'].str.contains('C', regex=False)]
    # Detach from the intermediate frames so later writes are unambiguous
    .copy()
)

In [17]:
def build_basket1(transactions, country):
    """Pivot one country's transactions into an invoice-by-item table.

    Parameters
    ----------
    transactions : pandas.DataFrame
        Frame with 'Country', 'InvoiceNo', 'Description' and 'Quantity'
        columns.
    country : str
        Value compared for exact equality against the 'Country' column.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'InvoiceNo' with one column per 'Description'. Each cell
        holds the summed 'Quantity' for that invoice/item pair, or 0 where
        the pair does not occur.
    """
    per_invoice_item = (transactions[transactions['Country'] == country]
                        .groupby(['InvoiceNo', 'Description'])['Quantity']
                        .sum())
    # unstack() already leaves 'InvoiceNo' as the index, so no
    # reset_index()/set_index() round trip is needed before filling gaps.
    return per_invoice_item.unstack().fillna(0)


# Transactions done in France
basket1_France = build_basket1(data1, "France")

# Transactions done in the United Kingdom
basket1_UK = build_basket1(data1, "United Kingdom")

# Transactions done in Portugal
basket1_Por = build_basket1(data1, "Portugal")

# Transactions done in Sweden
basket1_Sweden = build_basket1(data1, "Sweden")

In [18]:
# Here, we will define the hot encoding function
# for making the data suitable
# for the concerned libraries
def hot_encode1(P):
    """Map a quantity to a 0/1 presence flag.

    Parameters
    ----------
    P : int or float
        Quantity value taken from one cell of a basket table.

    Returns
    -------
    int
        1 when ``P >= 1``, otherwise 0.

    Notes
    -----
    Every input now yields 0 or 1. Values strictly between 0 and 1 (and
    NaN) previously matched neither branch and fell through to ``None``.
    """
    return 1 if P >= 1 else 0
  
# Run hot_encode1 over every cell of each country's basket table.
# basket1_encoded is rebound on each line and ends up holding the
# Sweden result, exactly as before.
# NOTE(review): DataFrame.applymap is deprecated in pandas >= 2.1 in favour
# of DataFrame.map -- confirm the pandas version this notebook targets.
basket1_France = basket1_encoded = basket1_France.applymap(hot_encode1)
basket1_UK = basket1_encoded = basket1_UK.applymap(hot_encode1)
basket1_Por = basket1_encoded = basket1_Por.applymap(hot_encode1)
basket1_Sweden = basket1_encoded = basket1_Sweden.applymap(hot_encode1)

In [22]:
# Frequent itemsets for the France basket (minimum support 0.05)
frq_items1 = apriori(basket1_France, min_support=0.05, use_colnames=True)

# Rules filtered on lift >= 1, ordered by confidence then lift, both descending
rules1 = (
    association_rules(frq_items1, metric="lift", min_threshold=1)
    .sort_values(by=['confidence', 'lift'], ascending=False)
)
print(rules1.head())

                                           antecedents  \
45                        (JUMBO BAG WOODLAND ANIMALS)   
260  (PLASTERS IN TIN CIRCUS PARADE, RED TOADSTOOL ...   
272  (RED TOADSTOOL LED NIGHT LIGHT, PLASTERS IN TI...   
302  (SET/6 RED SPOTTY PAPER CUPS, SET/20 RED RETRO...   
301  (SET/6 RED SPOTTY PAPER PLATES, SET/20 RED RET...   

                         consequents  antecedent support  consequent support  \
45                         (POSTAGE)            0.076531            0.765306   
260                        (POSTAGE)            0.051020            0.765306   
272                        (POSTAGE)            0.053571            0.765306   
302  (SET/6 RED SPOTTY PAPER PLATES)            0.102041            0.127551   
301    (SET/6 RED SPOTTY PAPER CUPS)            0.102041            0.137755   

      support  confidence      lift  leverage  conviction  
45   0.076531       1.000  1.306667  0.017961         inf  
260  0.051020       1.000  1.306667  0.011974     



In [29]:
# Frequent itemsets for the United Kingdom basket (minimum support 0.05)
# NOTE(review): the output saved with this cell is an empty rules frame,
# so this support level may be too high for this basket -- confirm.
frq_items = apriori(basket1_UK, min_support=0.05, use_colnames=True)

# Rules filtered on lift >= 1, ordered by confidence then lift, both descending
rules = (
    association_rules(frq_items, metric="lift", min_threshold=1)
    .sort_values(by=['confidence', 'lift'], ascending=False)
)
print(rules.head())



Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, leverage, conviction]
Index: []


In [34]:
# Frequent itemsets for the Portugal basket (minimum support 0.05)
frq_items1 = apriori(basket1_Por, min_support=0.05, use_colnames=True)

# Rules filtered on lift >= 1, ordered by confidence then lift, both descending
rules1 = (
    association_rules(frq_items1, metric="lift", min_threshold=1)
    .sort_values(by=['confidence', 'lift'], ascending=False)
)
print(rules1.head())



                              antecedents                         consequents  \
1170   (SET 12 COLOUR PENCILS DOLLY GIRL)    (SET 12 COLOUR PENCILS SPACEBOY)   
1171     (SET 12 COLOUR PENCILS SPACEBOY)  (SET 12 COLOUR PENCILS DOLLY GIRL)   
1172   (SET OF 4 KNICK KNACK TINS LONDON)  (SET 12 COLOUR PENCILS DOLLY GIRL)   
1173   (SET 12 COLOUR PENCILS DOLLY GIRL)  (SET OF 4 KNICK KNACK TINS LONDON)   
1174  (SET OF 4 KNICK KNACK TINS POPPIES)  (SET 12 COLOUR PENCILS DOLLY GIRL)   

      antecedent support  consequent support   support  confidence       lift  \
1170            0.051724            0.051724  0.051724         1.0  19.333333   
1171            0.051724            0.051724  0.051724         1.0  19.333333   
1172            0.051724            0.051724  0.051724         1.0  19.333333   
1173            0.051724            0.051724  0.051724         1.0  19.333333   
1174            0.051724            0.051724  0.051724         1.0  19.333333   

      leverage  conviction

In [33]:
# Frequent itemsets for the Sweden basket (minimum support 0.05)
frq_items1 = apriori(basket1_Sweden, min_support=0.05, use_colnames=True)

# Rules filtered on lift >= 1, ordered by confidence then lift, both descending
rules1 = (
    association_rules(frq_items1, metric="lift", min_threshold=1)
    .sort_values(by=['confidence', 'lift'], ascending=False)
)
print(rules1.head())



                        antecedents                        consequents  \
0     (12 PENCILS SMALL TUBE SKULL)      (PACK OF 72 SKULL CAKE CASES)   
1     (PACK OF 72 SKULL CAKE CASES)      (12 PENCILS SMALL TUBE SKULL)   
4    (ASSORTED BOTTLE TOP  MAGNETS)            (36 DOILIES DOLLY GIRL)   
5           (36 DOILIES DOLLY GIRL)     (ASSORTED BOTTLE TOP  MAGNETS)   
180  (CHILDRENS CUTLERY DOLLY GIRL)  (CHILDRENS CUTLERY CIRCUS PARADE)   

     antecedent support  consequent support   support  confidence  lift  \
0              0.055556            0.055556  0.055556         1.0  18.0   
1              0.055556            0.055556  0.055556         1.0  18.0   
4              0.055556            0.055556  0.055556         1.0  18.0   
5              0.055556            0.055556  0.055556         1.0  18.0   
180            0.055556            0.055556  0.055556         1.0  18.0   

     leverage  conviction  
0    0.052469         inf  
1    0.052469         inf  
4    0.052469       