In [8]:
# !pip install mlxtend

In [9]:
import numpy as np 
import pandas as pd 
from mlxtend.frequent_patterns import apriori, association_rules 

In [10]:
# Read the semicolon-separated transaction export into a DataFrame.
data = pd.read_csv(
    "Assignment-1_Data.csv",
    sep=";",
    low_memory=False,
)
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,BillNo,Itemname,Quantity,Date,Price,CustomerID,Country
0,536365,WHITE HANGING HEART T-LIGHT HOLDER,6,01.12.2010 08:26,255,17850.0,United Kingdom
1,536365,WHITE METAL LANTERN,6,01.12.2010 08:26,339,17850.0,United Kingdom
2,536365,CREAM CUPID HEARTS COAT HANGER,8,01.12.2010 08:26,275,17850.0,United Kingdom
3,536365,KNITTED UNION FLAG HOT WATER BOTTLE,6,01.12.2010 08:26,339,17850.0,United Kingdom
4,536365,RED WOOLLY HOTTIE WHITE HEART.,6,01.12.2010 08:26,339,17850.0,United Kingdom


In [11]:
# data_xl = pd.read_excel("Assignment-1_Data.xlsx")
# data.head()

In [12]:
# Show the column labels of the loaded frame.
data.columns

Index(['BillNo', 'Itemname', 'Quantity', 'Date', 'Price', 'CustomerID',
       'Country'],
      dtype='object')

In [13]:
# Distinct values found in the Country column; reused later to iterate per country.
unique_country = data["Country"].unique()
print(unique_country)

['United Kingdom' 'France' 'Australia' 'Netherlands' 'Germany' 'Norway'
 'Switzerland' 'Spain' 'Poland' 'Portugal' 'Italy' 'Belgium' 'Lithuania'
 'Japan' 'Iceland' 'Sweden' 'Austria' 'Bahrain' 'Israel' 'Greece'
 'Hong Kong' 'Singapore' 'Lebanon' 'United Arab Emirates' 'Saudi Arabia'
 'Unspecified' 'Brazil' 'USA' 'Malta' 'RSA']


In [14]:
# Clean the raw transactions in a single chain:
#   1. strip surrounding whitespace from the item names,
#   2. drop rows that have no bill number,
#   3. store the bill number as a string,
#   4. drop bills whose number contains 'C' (the credit transactions).
data = (
    data
    .assign(Itemname=lambda df: df['Itemname'].str.strip())
    .dropna(axis=0, subset=['BillNo'])
    .assign(BillNo=lambda df: df['BillNo'].astype('str'))
    .loc[lambda df: ~df['BillNo'].str.contains('C')]
)

In [15]:
def _country_basket(country):
    """Pivot one country's rows into a BillNo x Itemname table of summed quantities.

    Bill/item combinations that never occur are filled with 0.
    """
    rows = data[data['Country'] == country]
    totals = rows.groupby(['BillNo', 'Itemname'])['Quantity'].sum()
    return totals.unstack().reset_index().fillna(0).set_index('BillNo')


# Transactions done in France
basket_France = _country_basket("France")

# Transactions done in the United Kingdom
basket_UK = _country_basket("United Kingdom")

# Transactions done in Portugal
basket_Por = _country_basket("Portugal")

# Transactions done in Sweden
basket_Sweden = _country_basket("Sweden")

In [16]:
# Defining the hot encoding function to make the data suitable
# for the concerned libraries
def hot_encode(x):
    """Map a summed quantity to a 0/1 presence flag.

    Parameters
    ----------
    x : number
        Total quantity of one item on one bill.

    Returns
    -------
    int
        1 when the quantity is strictly positive, otherwise 0.

    Every input yields 0 or 1: fractional quantities between 0 and 1 count as
    present, and NaN counts as absent, so ``None`` never leaks into the basket
    matrix.
    """
    # NaN compares False against 0, so it falls through to 0.
    return 1 if x > 0 else 0
  
# Encoding the datasets: replace every basket with its element-wise 0/1 version.
basket_France, basket_UK, basket_Por, basket_Sweden = (
    raw_basket.map(hot_encode)
    for raw_basket in (basket_France, basket_UK, basket_Por, basket_Sweden)
)

# `basket_encoded` keeps pointing at the last encoded basket (Sweden).
basket_encoded = basket_Sweden

In [17]:
# Mine the itemsets that reach a support of 0.05 in the French baskets.
frq_items = apriori(basket_France, min_support=0.05, use_colnames=True)

# Derive the rules (lift threshold 1) and list the most confident,
# then highest-lift, rules first.
rules = (
    association_rules(frq_items, metric="lift", min_threshold=1)
    .sort_values(by=['confidence', 'lift'], ascending=False)
)
print(rules.head())
# rules.to_csv("out.csv",index=False)



                                           antecedents  \
44                        (JUMBO BAG WOODLAND ANIMALS)   
258  (PLASTERS IN TIN CIRCUS PARADE, RED TOADSTOOL ...   
271  (RED TOADSTOOL LED NIGHT LIGHT, PLASTERS IN TI...   
302  (SET/6 RED SPOTTY PAPER CUPS, SET/20 RED RETRO...   
301  (SET/6 RED SPOTTY PAPER PLATES, SET/20 RED RET...   

                         consequents  antecedent support  consequent support  \
44                         (POSTAGE)            0.076531            0.765306   
258                        (POSTAGE)            0.051020            0.765306   
271                        (POSTAGE)            0.053571            0.765306   
302  (SET/6 RED SPOTTY PAPER PLATES)            0.102041            0.127551   
301    (SET/6 RED SPOTTY PAPER CUPS)            0.102041            0.137755   

      support  confidence      lift  leverage  conviction  zhangs_metric  
44   0.076531       1.000  1.306667  0.017961         inf       0.254144  
258  0.051020       



In [18]:
# Mine the itemsets that reach a support of 0.03 in the UK baskets.
frq_items = apriori(basket_UK, min_support=0.03, use_colnames=True)

# Derive the rules (lift threshold 1) and list the most confident,
# then highest-lift, rules first.
rules = (
    association_rules(frq_items, metric="lift", min_threshold=1)
    .sort_values(by=['confidence', 'lift'], ascending=False)
)
print(rules.head())





                         antecedents                        consequents  \
3   (PINK REGENCY TEACUP AND SAUCER)  (GREEN REGENCY TEACUP AND SAUCER)   
5  (GREEN REGENCY TEACUP AND SAUCER)  (ROSES REGENCY TEACUP AND SAUCER)   
4  (ROSES REGENCY TEACUP AND SAUCER)  (GREEN REGENCY TEACUP AND SAUCER)   
8          (JUMBO BAG PINK POLKADOT)          (JUMBO BAG RED RETROSPOT)   
1       (ALARM CLOCK BAKELIKE GREEN)         (ALARM CLOCK BAKELIKE RED)   

   antecedent support  consequent support   support  confidence       lift  \
3            0.037660            0.050035  0.030910    0.820768  16.403939   
5            0.050035            0.051267  0.037553    0.750535  14.639752   
4            0.051267            0.050035  0.037553    0.732497  14.639752   
8            0.062088            0.103820  0.042053    0.677308   6.523895   
1            0.046928            0.049821  0.030160    0.642694  12.900183   

   leverage  conviction  zhangs_metric  
3  0.029026    5.300203       0.975787 

In [19]:
def apply_aprior_algo(basket,country):
    """Mine association rules for one country's basket and save them as CSV.

    Parameters
    ----------
    basket : pandas.DataFrame
        0/1 encoded bill x item table handed to ``apriori``.
    country : str
        Used to build the output file name ``Output/<country>.csv``.

    The function is best-effort: any ordinary failure is reported on stdout
    together with its cause instead of being raised, so a caller iterating
    over many countries keeps going. Nothing is returned.
    """
    import os  # local import so this cell does not depend on the import cell

    try:
        frq_items = apriori(basket, min_support=0.02, use_colnames=True)

        # Collecting the inferred rules in a dataframe
        rules = association_rules(frq_items, metric="lift", min_threshold=1)
        rules = rules.sort_values(['confidence', 'lift'], ascending=[False, False])
        print(rules.head())

        # The output folder may not exist yet on a fresh checkout.
        os.makedirs("Output", exist_ok=True)
        path = "Output/" + country + ".csv"
        print("path:", path)
        rules.to_csv(path, index=False)
    except Exception as exc:
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate, and the cause is shown instead of being hidden.
        print(f"error occurred when executing for {country}: {exc!r}")


def hot_encode(x):
    """Map a summed quantity to a 0/1 presence flag.

    Parameters
    ----------
    x : number
        Total quantity of one item on one bill.

    Returns
    -------
    int
        1 when the quantity is strictly positive, otherwise 0.

    Every input yields 0 or 1: fractional quantities between 0 and 1 count as
    present, and NaN counts as absent, so ``None`` never leaks into the basket
    matrix.
    """
    # NaN compares False against 0, so it falls through to 0.
    return 1 if x > 0 else 0

In [20]:
def _mine_rules(basket, min_support):
    """Return the association rules of `basket` at `min_support`, best rules first."""
    frq_items = apriori(basket, min_support=min_support, use_colnames=True)

    # Collecting the inferred rules in a dataframe
    rules = association_rules(frq_items, metric="lift", min_threshold=1)
    return rules.sort_values(['confidence', 'lift'], ascending=[False, False])


def apply(basket,country,min):
    """Mine association rules for one country and write them to ``Output/<country>.csv``.

    Parameters
    ----------
    basket : pandas.DataFrame
        0/1 encoded bill x item table handed to ``apriori``.
    country : str
        Used to build the output file name.
    min : float
        Minimum support for the first attempt. The name shadows the builtin
        but is kept so existing callers keep working.

    If the first attempt yields no rules and ``min`` is above 0.02, the mining
    is retried once at a support of 0.02. The rule set of the last attempt is
    the one written to disk, exactly once. Previously the empty first-attempt
    result overwrote the fallback's file, and a still-empty fallback recursed
    without bound.

    NOTE(review): mlxtend's ``association_rules`` appears to raise when it is
    given an empty frequent-itemset frame; that error is not handled here.
    Confirm against the installed mlxtend version.
    """
    import os  # local import so this cell does not depend on the import cell

    fallback_support = 0.02

    rules = _mine_rules(basket, min)
    print(rules.head())

    # Retry once with the looser threshold; the guard prevents endless retries.
    if len(rules) == 0 and min > fallback_support:
        rules = _mine_rules(basket, fallback_support)
        print(rules.head())

    # The output folder may not exist yet on a fresh checkout.
    os.makedirs("Output", exist_ok=True)
    path = "Output/" + country + ".csv"
    print("path:", path)
    rules.to_csv(path, index=False)
    

In [21]:
print("applying aprior algorithm")

# For every country: pivot its bills into a bill x item quantity table,
# encode it to 0/1, then mine and save the rules starting at support 0.05.
for country in unique_country:
    print("Country :", country)
    in_country = data['Country'] == country
    quantities = data[in_country].groupby(['BillNo', 'Itemname'])['Quantity'].sum()
    basket_encoded = (
        quantities.unstack().reset_index().fillna(0).set_index('BillNo').map(hot_encode)
    )
    basket = basket_encoded
    apply(basket, country, 0.05)
    print("\n\n")

applying aprior algorithm
Country : United Kingdom




Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, leverage, conviction, zhangs_metric]
Index: []




                                           antecedents  \
165  (ROSES REGENCY TEACUP AND SAUCER, PINK REGENCY...   
166  (GREEN REGENCY TEACUP AND SAUCER, PINK REGENCY...   
27                    (PINK REGENCY TEACUP AND SAUCER)   
170  (JUMBO BAG PINK POLKADOT, JUMBO STORAGE BAG SUKI)   
147                   (PINK REGENCY TEACUP AND SAUCER)   

                           consequents  antecedent support  \
165  (GREEN REGENCY TEACUP AND SAUCER)            0.029249   
166  (ROSES REGENCY TEACUP AND SAUCER)            0.030910   
27   (GREEN REGENCY TEACUP AND SAUCER)            0.037660   
170          (JUMBO BAG RED RETROSPOT)            0.027053   
147  (ROSES REGENCY TEACUP AND SAUCER)            0.037660   

     consequent support   support  confidence       lift  leverage  \
165            0.050035  0.026410    0.902930  18.046041  0.024947   
166            0.051267  0.026410    0.854419  16.666089  0.024826   
27             0.050035  0.030910    0.820768  16.403939  0.029026  



                                           antecedents  \
44                        (JUMBO BAG WOODLAND ANIMALS)   
258  (PLASTERS IN TIN CIRCUS PARADE, RED TOADSTOOL ...   
271  (RED TOADSTOOL LED NIGHT LIGHT, PLASTERS IN TI...   
302  (SET/6 RED SPOTTY PAPER CUPS, SET/20 RED RETRO...   
301  (SET/6 RED SPOTTY PAPER PLATES, SET/20 RED RET...   

                         consequents  antecedent support  consequent support  \
44                         (POSTAGE)            0.076531            0.765306   
258                        (POSTAGE)            0.051020            0.765306   
271                        (POSTAGE)            0.053571            0.765306   
302  (SET/6 RED SPOTTY PAPER PLATES)            0.102041            0.127551   
301    (SET/6 RED SPOTTY PAPER CUPS)            0.102041            0.137755   

      support  confidence      lift  leverage  conviction  zhangs_metric  
44   0.076531       1.000  1.306667  0.017961         inf       0.254144  
258  0.051020       