#This notebook contains all the tools I will be using for association Rule learning projects:

In [41]:
#importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [42]:
#importing the dataset
def generate_data(location:str,sample_number=10):
  data=pd.read_csv(location,engine='python')
  head=data.head()
  tail=data.tail()
  sample=data.sample(sample_number)
  description=data.describe()
  columns=data.columns
  info=data.info()
  shape=data.shape
  size=data.size
  return {'data':data,'head':head,'tail':tail,'sample':sample,'description':description,'columns':columns,'info':info,
          'shape':shape,'size':size}


In [None]:
data_set1 = generate_data('/content/Market_Basket_Optimisation.csv')

In [44]:
data1 = data_set1['data']

In [None]:
#Analyzing the data
data_set1['head']

In [None]:
data_set1['tail']

In [None]:
data_set1['description']

In [9]:
data1.shape

(7500, 20)

In [46]:
#we need to project our dataset into a particular format
#this function will project our dataset into the required format
def project(data):
  transactions=[]
  rows,columns=data.shape
  # we need to include one extra value then the number of rows
  for i in range (0,rows):
    transactions.append([str(data.values[i,j]) for j in range(0, columns)])
  #this will give the required format with which we can continue now
  #there will be a total of rows entries each containing the entire rows of the dataset
  return transactions

transactions = project(data1)

In [None]:
#chechking the new format dataset
for val in transactions:
  print(val)

In [None]:
#installing the module
!pip install apyori

In [47]:
#building the apriori model
def apriori(transactions,min_support:float,min_confidence:float=0.8,min_lift:int =3,min_length:int=2,max_length:int=2):
  from apyori import apriori
  rules = apriori(transactions = transactions, min_support = min_support, min_confidence = min_confidence, min_lift = min_lift, min_length = min_length, max_length = max_length)
  return rules
# we need to define the following parametres
#1)min_support:let's say we want to consider only those products that appear in atlest 3 transactions in a say 
#ans since this dataset contains the data of an entire week we will multiplt  3 with 7
# so our min support is (3*7)/Number of entrires in our dataset i.e transactions over the week
#therfore min_support  = (3*21)/7500


#2)min_confidence:Some rule of thumbs (start with 0.8 and then keep decreasing the confidence by dividing with 2
#i.e 0.8,0.4,0.2 and so on)
#3)min_lift: rule of thum start with 3 and then keep on Increasing the value based on your dataset
#4)min_confidence:min numbe of elements we want in our rules
#5)max_confidence max number of elements we want in our rule

In [66]:
#before calling the function let's define some functions and run a loop for getting different values:
min_support=[(3*7)/7500,(4*7)/7500,(5*7)/7500,(6*7)/7500] # item should apper atleast 3,4,5,6 times in transcations done in a day
min_confidence=[0.8,0.4,0.2]#starting with 0.8 and then decreasing the value each time
min_lift=[3,4,5,6,7,8,9] #starting with 3 and increasing the value consecutively
length=[2,3,4] # will use the same value for both min_length and max_length
#you can try with all these possible values
rules =  apriori(transactions,(3*7)/7500,0.2,3,2,2)
display_Aprori(rules)

Unnamed: 0,Left Hand Side,Right Hand Side,Support,Confidence,Lift
4,fromage blanc,honey,0.003333,0.245098,5.178128
1,light cream,chicken,0.004533,0.290598,4.843305
3,pasta,escalope,0.005867,0.372881,4.700185
9,pasta,shrimp,0.005067,0.322034,4.514494
8,whole wheat pasta,olive oil,0.008,0.271493,4.130221
0,extra dark chocolate,chicken,0.0028,0.233333,3.888889
6,tomato sauce,ground beef,0.005333,0.377358,3.840147
2,mushroom cream sauce,escalope,0.005733,0.300699,3.790327
5,herb & pepper,ground beef,0.016,0.32345,3.291555
7,light cream,olive oil,0.0032,0.205128,3.120612


In [68]:
rules =  apriori(transactions,(3*7)/7500,0.2,3,2,2)
display_Eclat(rules)

Unnamed: 0,Product 1,Product 2,Support
5,herb & pepper,ground beef,0.016
8,whole wheat pasta,olive oil,0.008
3,pasta,escalope,0.005867
2,mushroom cream sauce,escalope,0.005733
6,tomato sauce,ground beef,0.005333
9,pasta,shrimp,0.005067
1,light cream,chicken,0.004533
4,fromage blanc,honey,0.003333
7,light cream,olive oil,0.0032
0,extra dark chocolate,chicken,0.0028


In [18]:
#installing the library
!pip install fpgrowth-py

Collecting fpgrowth-py
  Downloading fpgrowth_py-1.0.0-py3-none-any.whl (5.6 kB)
Installing collected packages: fpgrowth-py
Successfully installed fpgrowth-py-1.0.0


In [27]:
#this function deduces the rules based on the FP tree approach
def FP_Growth(transactions,min_support,min_confidence):
  from fpgrowth_py import fpgrowth
  freqItemSet, rules = fpgrowth(transactions, minSupRatio=min_support, minConf=min_confidence)
  return {'frequentItemSets':freqItemSet,'rules':rules}

#now how to find the optimal value for min_support and min_confidence
#trying with a set of differnt values

In [None]:
min_support=[(3*7)/7500,(4*7)/7500,(5*7)/7500,(6*7)/7500] # item should apper atleast 3,4,5,6 times in transcations done in a day
min_confidence=[0.8,0.4,0.2]#starting with 0.8 and then decreasing the value each time
rulesFP=[] #this will be a list containing all the rules
valuesFP = [] #this will be coressponding values combination for the rules

for support in min_support:
  for confidence in min_confidence:
    rule = FP_Growth(transactions,support,confidence)
    rulesFP.append(rule)
    value = {'min_support':support,'min_confidence':confidence}
    valuesFP.append(value)

#this takes quite a long time to execute so execute at your own risk

In [65]:
#this funnction will display the rules for the apriori model
#this function takes a rule set as an argument
#this is fora rule set with two elements
def display_Aprori(rules,Rows:int=10):
  results = list(rules)
  if len(results) != 0:
    resultsinDataFrame = pd.DataFrame(inspect(results), columns = ['Left Hand Side', 'Right Hand Side', 'Support', 'Confidence', 'Lift'])
  # sorting the results in decreasing order based on the lift
    resultsinDataFrame=resultsinDataFrame.nlargest(n = Rows, columns = 'Lift')
    display(resultsinDataFrame)

#this block is hard coded and is specific for a rule set containig only two elements
def inspect(results):
    lhs         = [tuple(result[2][0][0])[0] for result in results]
    rhs         = [tuple(result[2][0][1])[0] for result in results]
    supports    = [result[1] for result in results]
    confidences = [result[2][0][2] for result in results]
    lifts       = [result[2][0][3] for result in results]
    return list(zip(lhs, rhs, supports, confidences, lifts))



In [67]:
#this funnction will display the rules for the eclat model
#this function takes a rule set as an argument
#this is also hard coded for a 2 item rule set
def display_Eclat(rules,Rows:int=10):
  results = list(rules)
  if len(results) != 0:
    resultsinDataFrame = pd.DataFrame(inspect(results), columns = ['Product 1', 'Product 2', 'Support'])
  # sorting the results in decreasing order based on the lift
    resultsinDataFrame=resultsinDataFrame.nlargest(n = Rows, columns = 'Support')
    display(resultsinDataFrame)

#this block is hard coded and is specific for a rule set containig only two elements
def inspect(results):
    lhs         = [tuple(result[2][0][0])[0] for result in results]
    rhs         = [tuple(result[2][0][1])[0] for result in results]
    supports    = [result[1] for result in results]
    return list(zip(lhs, rhs, supports))


In [None]:
#need to modify the result for more than two items rule sets

In [None]:
#this funnction will display the rules for the FP growth model
#this function takes a rule set as an argument
def display_FP(rules):
  pass

In [99]:
#test
test =  apriori(transactions,(3*7)/7500,0.2,3,4,4)

In [100]:
test=list(test)
testing = test[34]

In [102]:
display(testing)
display(testing[0])
display(testing[1])#this is the support value
display(testing[2])
display(testing[2][0])
display(testing[2][0][0]) #this is the set of two values associated with the 1 element
display(testing[2][0][1]) #this is the single associated value
display(testing[2][0][2])#this is the confidence value
display(testing[2][0][3])#this is the lift value


  #observations let's say we have a rule of n elements
  #than it signifies
  #that n-1 elements is associated with 1 element
#for eg if the rule set contains 3 elements that 2 of them will be a set which will be associated to the element in some way

RelationRecord(items=frozenset({'milk', 'soup', 'chocolate'}), support=0.004, ordered_statistics=[OrderedStatistic(items_base=frozenset({'soup', 'chocolate'}), items_add=frozenset({'milk'}), confidence=0.39473684210526316, lift=3.045808966861599)])

frozenset({'chocolate', 'milk', 'soup'})

0.004

[OrderedStatistic(items_base=frozenset({'soup', 'chocolate'}), items_add=frozenset({'milk'}), confidence=0.39473684210526316, lift=3.045808966861599)]

OrderedStatistic(items_base=frozenset({'soup', 'chocolate'}), items_add=frozenset({'milk'}), confidence=0.39473684210526316, lift=3.045808966861599)

frozenset({'chocolate', 'soup'})

frozenset({'milk'})

0.39473684210526316

3.045808966861599

In [None]:
#this funnction will display the rules for the apriori model
#this function takes a rule set as an argument
#this is fora rule set with three elements
def display_Aprori(rules,Rows:int=10):
  results = list(rules)
  if len(results) != 0:
    resultsinDataFrame = pd.DataFrame(inspect(results), columns = ['Left Hand Side', 'Right Hand Side', 'Support', 'Confidence', 'Lift'])
  # sorting the results in decreasing order based on the lift
    resultsinDataFrame=resultsinDataFrame.nlargest(n = Rows, columns = 'Lift')
    display(resultsinDataFrame)

#this block is hard coded and is specific for a rule set containig more than elements
def inspect(results):
    lhs         = [tuple(result[2][0][0])[0] for result in results]
    rhs         = [tuple(result[2][0][1])[0] for result in results]
    supports    = [result[1] for result in results]
    confidences = [result[2][0][2] for result in results]
    lifts       = [result[2][0][3] for result in results]
    return list(zip(lhs, rhs, supports, confidences, lifts))



In [103]:
#this funnction will display the rules for the eclat model
#this function takes a rule set as an argument
#this is also hard coded for more than 2 items rule set
def display_Eclat(rules,Rows:int=10):
  results = list(rules)
  if len(results) != 0:
    resultsinDataFrame = pd.DataFrame(inspect(results), columns = ['Product Set', 'Product', 'Support'])
  # sorting the results in decreasing order based on the lift
    resultsinDataFrame=resultsinDataFrame.nlargest(n = Rows, columns = 'Support')
    display(resultsinDataFrame)

#this block is hard coded and is specific for a rule set containig more than two elements
def inspect(results):
    lhs         = [tuple(result[2][0][0])[0] for result in results]
    rhs         = [tuple(result[2][0][1])[0] for result in results]
    supports    = [result[1] for result in results]
    return list(zip(lhs, rhs, supports))