In [1]:
#Implementing Market Basket Analysis

#Loading necessary package

import numpy as np
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [2]:
#Reading Data

#Original web source, kept for reference; the notebook loads a local CSV copy instead.
#myretaildata=pd.read_excel("http://archive.ics.uci.edu/ml/machine-learning-databases/00352/Online%20Retail.xlsx")

#The path is a raw string: in a normal string literal the backslashes before "d" and "O"
#form invalid escape sequences, which recent Python versions warn about.
#NOTE(review): 'unicode_escape' also interprets backslash escapes found inside the data;
#confirm this is intended rather than a byte-preserving encoding such as 'latin-1'.
myretaildata = pd.read_csv(r"E:\datafiles\Online_Retail.csv", encoding = 'unicode_escape')
myretaildata.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,01-12-10 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,01-12-10 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,01-12-10 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,01-12-10 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,01-12-10 8:26,3.39,17850.0,United Kingdom


In [3]:
#Data Preparation:-

#Data Cleaning

#The whole clean-up is one non-mutating chain, so re-running this cell never
#modifies a filtered frame in place (which can raise SettingWithCopyWarning).
myretaildata = (
    myretaildata
    #Remove spaces from the beginning and end of each description
    .assign(Description=lambda frame: frame['Description'].str.strip())
    #Drop rows whose invoice number is missing (this removes NaN values, not duplicates)
    .dropna(axis=0, subset=['InvoiceNo'])
    #Convert the invoice number to a string so it can be searched as text
    .assign(InvoiceNo=lambda frame: frame['InvoiceNo'].astype('str'))
    #Remove the credit transactions, i.e. invoice numbers containing 'C'
    .loc[lambda frame: ~frame['InvoiceNo'].str.contains('C')]
)

myretaildata.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,01-12-10 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,01-12-10 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,01-12-10 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,01-12-10 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,01-12-10 8:26,3.39,17850.0,United Kingdom


In [4]:
#Count how many rows the dataset has for each value of the Country column

myretaildata['Country'].value_counts()

United Kingdom          487622
Germany                   9042
France                    8408
EIRE                      7894
Spain                     2485
Netherlands               2363
Belgium                   2031
Switzerland               1967
Portugal                  1501
Australia                 1185
Norway                    1072
Italy                      758
Channel Islands            748
Finland                    685
Cyprus                     614
Sweden                     451
Unspecified                446
Austria                    398
Denmark                    380
Poland                     330
Japan                      321
Israel                     295
Hong Kong                  284
Singapore                  222
Iceland                    182
USA                        179
Canada                     151
Greece                     145
Malta                      112
United Arab Emirates        68
European Community          60
RSA                         58
Lebanon 

In [5]:
#Check the size of the data as (number of rows, number of columns)

myretaildata.shape

(532621, 8)

In [6]:
#Filtering/Separating the data for the country 'Germany' to get a transaction basket:
#one row per invoice, one column per product description, cell = summed quantity
#(invoice/product combinations that never occur are filled with 0)

mybasket = (
    myretaildata
    .loc[lambda frame: frame['Country'] == "Germany"]
    .groupby(['InvoiceNo', 'Description'])['Quantity']
    .sum()
    .unstack()
    .reset_index()
    .fillna(0)
    .set_index('InvoiceNo')
)

In [7]:
#Viewing the transaction basket of Germany
#Each cell holds the summed Quantity of that product on that invoice:
#'0' ---> the product was not present in the invoice (missing combinations were filled with 0)
#'1','2','3','4'.. ---> the product was present in the invoice with that total quantity

mybasket.head()

Description,10 COLOUR SPACEBOY PEN,12 COLOURED PARTY BALLOONS,12 IVORY ROSE PEG PLACE SETTINGS,12 MESSAGE CARDS WITH ENVELOPES,12 PENCIL SMALL TUBE WOODLAND,12 PENCILS SMALL TUBE RED RETROSPOT,12 PENCILS SMALL TUBE SKULL,12 PENCILS TALL TUBE POSY,12 PENCILS TALL TUBE RED RETROSPOT,12 PENCILS TALL TUBE SKULLS,...,YULETIDE IMAGES GIFT WRAP SET,ZINC HEART T-LIGHT HOLDER,ZINC STAR T-LIGHT HOLDER,ZINC BOX SIGN HOME,ZINC FOLKART SLEIGH BELLS,ZINC HEART LATTICE T-LIGHT HOLDER,ZINC METAL HEART DECORATION,ZINC T-LIGHT HOLDER STAR LARGE,ZINC T-LIGHT HOLDER STARS SMALL,ZINC WILLIE WINKIE CANDLE STICK
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
536527,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536840,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536861,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536967,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536983,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
#Converting all positive values to 1 and everything else to 0

def my_encode_units(x):
    """Encode a basket quantity as a presence flag.

    Parameters
    ----------
    x : number
        Quantity of one product on one invoice.

    Returns
    -------
    int
        1 if ``x`` is strictly positive, otherwise 0.
    """
    # One comparison covers every input. The earlier pair of checks
    # (x <= 0 / x >= 1) fell through and returned None for values strictly
    # between 0 and 1 and for NaN; those now map to 1 and 0 respectively.
    return 1 if x > 0 else 0
    
# Flag every (invoice, product) cell as 1 / 0 with my_encode_units, then remove
# "POSTAGE" so that it is not treated as an item.
# The drop is chained instead of done in place, and errors='ignore' lets the
# cell run for a basket that has no POSTAGE column.
# NOTE(review): DataFrame.applymap is deprecated since pandas 2.1 in favour of
# DataFrame.map; switch once the minimum supported pandas version allows it.
my_basket_sets = (
    mybasket
    .applymap(my_encode_units)
    .drop(columns='POSTAGE', errors='ignore')
)

In [9]:
#Training the Model

#Generating frequent itemsets (use_colnames = True reports items by column name)

my_frequent_itemsets = apriori(my_basket_sets, min_support = 0.07, use_colnames = True)

#Note:- Here min_support is 0.07, i.e. an itemset must reach a support of at least 7% to be kept

In [10]:
#Generating Rules

#Take the frequent itemsets in my_frequent_itemsets (generated with min_support = 0.07)
#and derive association rules from them, using metric = "lift" with min_threshold = 1

my_rules = association_rules(my_frequent_itemsets, metric = "lift", min_threshold = 1)

In [11]:
#Viewing the first 100 rules (head(100) shows at most 100 rows; no sorting is applied here)

#The total number of rules depends on the volume of transactions used for the recommendation rule system

my_rules.head(100)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(PLASTERS IN TIN WOODLAND ANIMALS),(ROUND SNACK BOXES SET OF4 WOODLAND),0.137856,0.245077,0.074398,0.539683,2.202098,0.040613,1.640006
1,(ROUND SNACK BOXES SET OF4 WOODLAND),(PLASTERS IN TIN WOODLAND ANIMALS),0.245077,0.137856,0.074398,0.303571,2.202098,0.040613,1.237951
2,(ROUND SNACK BOXES SET OF4 WOODLAND),(ROUND SNACK BOXES SET OF 4 FRUITS),0.245077,0.157549,0.131291,0.535714,3.400298,0.092679,1.814509
3,(ROUND SNACK BOXES SET OF 4 FRUITS),(ROUND SNACK BOXES SET OF4 WOODLAND),0.157549,0.245077,0.131291,0.833333,3.400298,0.092679,4.52954
4,(SPACEBOY LUNCH BOX),(ROUND SNACK BOXES SET OF4 WOODLAND),0.102845,0.245077,0.070022,0.680851,2.778116,0.044817,2.365427
5,(ROUND SNACK BOXES SET OF4 WOODLAND),(SPACEBOY LUNCH BOX),0.245077,0.102845,0.070022,0.285714,2.778116,0.044817,1.256018


In [12]:
#Making recommendations

#How do we use these rules to make a product recommendation:-

#From Rule 5: Item A is 'ROUND SNACK BOXES SET OF4 WOODLAND', which occurs in 112 invoices of my_basket_sets
#From Rule 5: Item B is 'SPACEBOY LUNCH BOX', which occurs in 47 invoices of my_basket_sets
#Association_Rule_Analysis:- Based on Rule 5 above, items A and B have good support, confidence and lift
#metrics.
#Conclusion:- We can recommend the product SPACEBOY LUNCH BOX to someone who is buying the
#product ROUND SNACK BOXES SET OF4 WOODLAND, based on the my_rules table (association rule analysis).

#Note:- Transactions are used to create rules, and the rules are used to design the recommendation system.
#Working from the metrics support, confidence and lift is the basic, common recommendation approach in
#retail, known as Association Analysis or Market Basket Analysis. The metrics leverage and conviction are
#inter-related with support, confidence and lift.

my_basket_sets['ROUND SNACK BOXES SET OF4 WOODLAND'].sum()

112

In [13]:
#Sum of the encoded flags in the 'SPACEBOY LUNCH BOX' column of my_basket_sets
my_basket_sets['SPACEBOY LUNCH BOX'].sum()

47

In [14]:
#Keep only the rules whose lift is at least 3 and whose confidence is at least 0.3

my_rules.loc[my_rules['lift'].ge(3) & my_rules['confidence'].ge(0.3)]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
2,(ROUND SNACK BOXES SET OF4 WOODLAND),(ROUND SNACK BOXES SET OF 4 FRUITS),0.245077,0.157549,0.131291,0.535714,3.400298,0.092679,1.814509
3,(ROUND SNACK BOXES SET OF 4 FRUITS),(ROUND SNACK BOXES SET OF4 WOODLAND),0.157549,0.245077,0.131291,0.833333,3.400298,0.092679,4.52954
