In [1]:
import pandas as pd
import numpy as np

from itertools import permutations

from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

Cross-selling products

The small grocery store has decided to cross-sell chewing gum with either coffee, cereal, or bread. To determine which of the three items is best to use, the store owner has performed an experiment. For one week, she sold chewing gum next to the register and recorded all transactions where it was purchased with either coffee, cereal, or bread. The transactions from that day are available as a list of lists named transactions. Each transaction is either ['coffee','gum'], ['cereal','gum'], or ['bread','gum'].

In [2]:
transactions = [['bread', 'gum'],
 ['bread', 'gum'],
 ['cereal', 'gum'],
 ['coffee', 'gum'],
 ['bread', 'gum'],
 ['coffee', 'gum'],
 ['coffee', 'gum'],
 ['coffee', 'gum'],
 ['cereal', 'gum'],
 ['coffee', 'gum'],
 ['coffee', 'gum'],
 ['cereal', 'gum'],
 ['coffee', 'gum'],
 ['cereal', 'gum'],
 ['coffee', 'gum'],
 ['coffee', 'gum'],
 ['cereal', 'gum'],
 ['coffee', 'gum'],
 ['bread', 'gum'],
 ['cereal', 'gum'],
 ['bread', 'gum'],
 ['bread', 'gum'],
 ['coffee', 'gum'],
 ['coffee', 'gum'],
 ['bread', 'gum'],
 ['bread', 'gum'],
 ['coffee', 'gum'],
 ['coffee', 'gum'],
 ['cereal', 'gum'],
 ['cereal', 'gum'],
 ['coffee', 'gum'],
 ['coffee', 'gum'],
 ['coffee', 'gum'],
 ['cereal', 'gum'],
 ['bread', 'gum'],
 ['coffee', 'gum'],
 ['bread', 'gum'],
 ['coffee', 'gum'],
 ['cereal', 'gum'],
 ['bread', 'gum'],
 ['cereal', 'gum'],
 ['cereal', 'gum'],
 ['coffee', 'gum'],
 ['coffee', 'gum'],
 ['cereal', 'gum'],
 ['coffee', 'gum'],
 ['cereal', 'gum'],
 ['coffee', 'gum'],
 ['coffee', 'gum'],
 ['bread', 'gum'],
 ['cereal', 'gum'],
 ['bread', 'gum'],
 ['coffee', 'gum'],
 ['cereal', 'gum'],
 ['coffee', 'gum'],
 ['cereal', 'gum'],
 ['cereal', 'gum'],
 ['bread', 'gum'],
 ['bread', 'gum'],
 ['coffee', 'gum'],
 ['coffee', 'gum'],
 ['coffee', 'gum'],
 ['bread', 'gum'],
 ['cereal', 'gum'],
 ['cereal', 'gum'],
 ['coffee', 'gum'],
 ['bread', 'gum'],
 ['coffee', 'gum'],
 ['coffee', 'gum'],
 ['cereal', 'gum'],
 ['cereal', 'gum'],
 ['coffee', 'gum'],
 ['bread', 'gum'],
 ['coffee', 'gum'],
 ['cereal', 'gum'],
 ['coffee', 'gum'],
 ['cereal', 'gum'],
 ['cereal', 'gum'],
 ['coffee', 'gum'],
 ['coffee', 'gum'],
 ['bread', 'gum'],
 ['coffee', 'gum'],
 ['bread', 'gum'],
 ['coffee', 'gum'],
 ['coffee', 'gum']]

In [3]:
# Count the number of transactions with coffee and gum
coffee = transactions.count(['coffee', 'gum'])
coffee

40

In [4]:
# Count the number of transactions with cereal and gum
cereal = transactions.count(['cereal', 'gum'])
cereal

25

In [5]:
# Count the number of transactions with bread and gum
bread = transactions.count(['bread', 'gum'])
bread

20

Preparing data for market basket analysis

Throughout this course, you will typically encounter data in one of two formats: a pandas DataFrame or a list of lists. DataFrame objects will be constructed by importing a csv file using pandas. They will consist of a single column of data, where each element contains a string of items in a transaction, separated by a comma, as in the table below.

In this exercise, you will practice loading the data from a csv file and will prepare it for use as a list of lists.

In [6]:
groceries_path = 'https://assets.datacamp.com/production/repositories/5654/datasets/5992818fd324b0de7d48311ee43fa038f7614ee5/small_grocery_store.csv'


In [7]:
# Load transactions from pandas
groceries = pd.read_csv(groceries_path)
groceries


Unnamed: 0,Transaction
0,"milk,bread,biscuit"
1,"bread,milk,biscuit,cereal"
2,"bread,tea"
3,"jam,bread,milk"
4,"tea,biscuit"
5,"bread,tea"
6,"tea,cereal"
7,"bread,tea,biscuit"
8,"jam,bread,tea"
9,"bread,milk"


In [8]:
# Split transaction strings into lists
transactions = groceries['Transaction'].apply(lambda t: t.split(','))
transactions


0                [milk, bread, biscuit]
1        [bread, milk, biscuit, cereal]
2                          [bread, tea]
3                    [jam, bread, milk]
4                        [tea, biscuit]
5                          [bread, tea]
6                         [tea, cereal]
7                 [bread, tea, biscuit]
8                     [jam, bread, tea]
9                         [bread, milk]
10    [coffee, orange, biscuit, cereal]
11    [coffee, orange, biscuit, cereal]
12                      [coffee, sugar]
13              [bread, coffee, orange]
14              [bread, sugar, biscuit]
15              [coffee, sugar, cereal]
16              [bread, sugar, biscuit]
17               [bread, coffee, sugar]
18               [bread, coffee, sugar]
19          [tea, milk, coffee, cereal]
Name: Transaction, dtype: object

In [9]:
# Convert DataFrame column into list of strings
transactions = list(transactions)
transactions

[['milk', 'bread', 'biscuit'],
 ['bread', 'milk', 'biscuit', 'cereal'],
 ['bread', 'tea'],
 ['jam', 'bread', 'milk'],
 ['tea', 'biscuit'],
 ['bread', 'tea'],
 ['tea', 'cereal'],
 ['bread', 'tea', 'biscuit'],
 ['jam', 'bread', 'tea'],
 ['bread', 'milk'],
 ['coffee', 'orange', 'biscuit', 'cereal'],
 ['coffee', 'orange', 'biscuit', 'cereal'],
 ['coffee', 'sugar'],
 ['bread', 'coffee', 'orange'],
 ['bread', 'sugar', 'biscuit'],
 ['coffee', 'sugar', 'cereal'],
 ['bread', 'sugar', 'biscuit'],
 ['bread', 'coffee', 'sugar'],
 ['bread', 'coffee', 'sugar'],
 ['tea', 'milk', 'coffee', 'cereal']]

Generating association rules

As you saw, the function permutations from the module itertools can be used to quickly generate the set of all one-antecedent, one-consequent rules. You do not, of course, know which of these rules are useful. You simply know that each is a valid way to combine two items.

Let's practice generating and counting the set of all rules for a subset of the grocery dataset: coffee, tea, milk, and sugar.

In [10]:
# Define the set of groceries
flattened = [i for t in transactions for i in t]
flattened

['milk',
 'bread',
 'biscuit',
 'bread',
 'milk',
 'biscuit',
 'cereal',
 'bread',
 'tea',
 'jam',
 'bread',
 'milk',
 'tea',
 'biscuit',
 'bread',
 'tea',
 'tea',
 'cereal',
 'bread',
 'tea',
 'biscuit',
 'jam',
 'bread',
 'tea',
 'bread',
 'milk',
 'coffee',
 'orange',
 'biscuit',
 'cereal',
 'coffee',
 'orange',
 'biscuit',
 'cereal',
 'coffee',
 'sugar',
 'bread',
 'coffee',
 'orange',
 'bread',
 'sugar',
 'biscuit',
 'coffee',
 'sugar',
 'cereal',
 'bread',
 'sugar',
 'biscuit',
 'bread',
 'coffee',
 'sugar',
 'bread',
 'coffee',
 'sugar',
 'tea',
 'milk',
 'coffee',
 'cereal']

In [11]:
groceries = list(set(flattened))

groceries

['cereal',
 'tea',
 'coffee',
 'orange',
 'sugar',
 'milk',
 'jam',
 'bread',
 'biscuit']

In [12]:
# Generate all possible rules from groceries list
rules = list(permutations(groceries, 2))

# Print the set of rules
print(rules)

# Print the number of rules
print(len(rules)) 

[('cereal', 'tea'), ('cereal', 'coffee'), ('cereal', 'orange'), ('cereal', 'sugar'), ('cereal', 'milk'), ('cereal', 'jam'), ('cereal', 'bread'), ('cereal', 'biscuit'), ('tea', 'cereal'), ('tea', 'coffee'), ('tea', 'orange'), ('tea', 'sugar'), ('tea', 'milk'), ('tea', 'jam'), ('tea', 'bread'), ('tea', 'biscuit'), ('coffee', 'cereal'), ('coffee', 'tea'), ('coffee', 'orange'), ('coffee', 'sugar'), ('coffee', 'milk'), ('coffee', 'jam'), ('coffee', 'bread'), ('coffee', 'biscuit'), ('orange', 'cereal'), ('orange', 'tea'), ('orange', 'coffee'), ('orange', 'sugar'), ('orange', 'milk'), ('orange', 'jam'), ('orange', 'bread'), ('orange', 'biscuit'), ('sugar', 'cereal'), ('sugar', 'tea'), ('sugar', 'coffee'), ('sugar', 'orange'), ('sugar', 'milk'), ('sugar', 'jam'), ('sugar', 'bread'), ('sugar', 'biscuit'), ('milk', 'cereal'), ('milk', 'tea'), ('milk', 'coffee'), ('milk', 'orange'), ('milk', 'sugar'), ('milk', 'jam'), ('milk', 'bread'), ('milk', 'biscuit'), ('jam', 'cereal'), ('jam', 'tea'), ('ja

One-hot encoding transaction data

Throughout the course, we will use a common pipeline for preprocessing data for use in market basket analysis. The first step is to import a pandas DataFrame and select the column that contains transactions. Each transaction in the column will be a string that consists of a number of items, each separated by a comma. The next step is to use a lambda function to split each transaction string into a list, thereby transforming the column into a list of lists.

In this exercise, you'll start with the list of lists from the grocery dataset, which is available to you as transactions. You will then transform transactions into a one-hot encoded DataFrame, where each column consists of TRUE and FALSE values that indicate whether an item was included in a transaction.

In [13]:
# Instantiate transaction encoder and identify unique items in transactions
encoder = TransactionEncoder().fit(transactions)

# One-hot encode transactions
onehot = encoder.transform(transactions)

onehot # 20 transactions by 9 products

array([[ True,  True, False, False, False,  True, False, False, False],
       [ True,  True,  True, False, False,  True, False, False, False],
       [False,  True, False, False, False, False, False, False,  True],
       [False,  True, False, False,  True,  True, False, False, False],
       [ True, False, False, False, False, False, False, False,  True],
       [False,  True, False, False, False, False, False, False,  True],
       [False, False,  True, False, False, False, False, False,  True],
       [ True,  True, False, False, False, False, False, False,  True],
       [False,  True, False, False,  True, False, False, False,  True],
       [False,  True, False, False, False,  True, False, False, False],
       [ True, False,  True,  True, False, False,  True, False, False],
       [ True, False,  True,  True, False, False,  True, False, False],
       [False, False, False,  True, False, False, False,  True, False],
       [False,  True, False,  True, False, False,  True, False, 

In [14]:
# Convert one-hot encoded data to DataFrame
onehot = pd.DataFrame(onehot, columns = encoder.columns_)

onehot

Unnamed: 0,biscuit,bread,cereal,coffee,jam,milk,orange,sugar,tea
0,True,True,False,False,False,True,False,False,False
1,True,True,True,False,False,True,False,False,False
2,False,True,False,False,False,False,False,False,True
3,False,True,False,False,True,True,False,False,False
4,True,False,False,False,False,False,False,False,True
5,False,True,False,False,False,False,False,False,True
6,False,False,True,False,False,False,False,False,True
7,True,True,False,False,False,False,False,False,True
8,False,True,False,False,True,False,False,False,True
9,False,True,False,False,False,True,False,False,False


Computing the support metric

In the previous exercise, you one-hot encoded a small grocery store's transactions as the DataFrame onehot. In this exercise, you'll make use of that DataFrame and the support metric to help the store's owner. First, she has asked you to identify frequently purchased items, which you'll do by computing support at the item-level. And second, she asked you to check whether the rule {jam} --> {bread} has a support of over 0.05

In [15]:
# Compute the support
support = onehot.mean()
support

biscuit    0.40
bread      0.65
cereal     0.30
coffee     0.40
jam        0.10
milk       0.25
orange     0.15
sugar      0.30
tea        0.35
dtype: float64

In [16]:
# Add a jam+bread column to the DataFrame onehot
onehot['jam+bread'] = np.logical_and(onehot['jam'], onehot['bread'])
onehot

Unnamed: 0,biscuit,bread,cereal,coffee,jam,milk,orange,sugar,tea,jam+bread
0,True,True,False,False,False,True,False,False,False,False
1,True,True,True,False,False,True,False,False,False,False
2,False,True,False,False,False,False,False,False,True,False
3,False,True,False,False,True,True,False,False,False,True
4,True,False,False,False,False,False,False,False,True,False
5,False,True,False,False,False,False,False,False,True,False
6,False,False,True,False,False,False,False,False,True,False
7,True,True,False,False,False,False,False,False,True,False
8,False,True,False,False,True,False,False,False,True,True
9,False,True,False,False,False,True,False,False,False,False


In [17]:
# Compute the support
support = onehot.mean()
support

biscuit      0.40
bread        0.65
cereal       0.30
coffee       0.40
jam          0.10
milk         0.25
orange       0.15
sugar        0.30
tea          0.35
jam+bread    0.10
dtype: float64

Recommending books with support

A library wants to get members to read more and has decided to use market basket analysis to figure out how. They approach you to do the analysis and ask that you use the five most highly-rated books from the goodbooks-10k dataset, which was introduced in the video. You are given the data in one-hot encoded format in a pandas DataFrame called books.

Each column in the DataFrame corresponds to a book and has the value TRUE if the book is contained in a reader's library and is rated highly. To make things simpler, we'll work with shortened book names: Hunger, Potter, and Twilight.

In [18]:
books = pd.read_csv('books.csv', index_col = 0)
books.reset_index(drop=True, inplace=True)
books


Unnamed: 0,Hunger,Potter,Twilight
0,FALSE,True,False
1,FALSE,True,True
2,FALSE,False,False
3,FALSE,True,False
4,FALSE,False,False
...,...,...,...
8045,5 False,False,False
8046,6 False,False,False
8047,7 False,False,True
8048,8 True,False,True


In [19]:
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8050 entries, 0 to 8049
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Hunger    8050 non-null   object
 1   Potter    8050 non-null   bool  
 2   Twilight  8050 non-null   bool  
dtypes: bool(2), object(1)
memory usage: 78.7+ KB


In [20]:
# remove wrong character to the 'Hunger' column and transform it to boolean
books['Hunger'] = books['Hunger'].apply(lambda x: True if 'true' in str(x).lower() else (False if 'false' in str(x).lower() else x))
books

Unnamed: 0,Hunger,Potter,Twilight
0,False,True,False
1,False,True,True
2,False,False,False
3,False,True,False
4,False,False,False
...,...,...,...
8045,False,False,False
8046,False,False,False
8047,False,False,True
8048,True,False,True


In [21]:
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8050 entries, 0 to 8049
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   Hunger    8050 non-null   bool 
 1   Potter    8050 non-null   bool 
 2   Twilight  8050 non-null   bool 
dtypes: bool(3)
memory usage: 23.7 KB


In [22]:
# Recall that support is the share of TRUE entries in the column of books.

# Compute support for Hunger and Potter
supportHP = np.logical_and(books['Hunger'], books['Potter']).mean()

# Compute support for Hunger and Twilight
supportHT = np.logical_and(books['Hunger'], books['Twilight']).mean()

# Compute support for Potter and Twilight
supportPT = np.logical_and(books['Potter'], books['Twilight']).mean()

# Print support values
print("Hunger Games and Harry Potter: %.2f" % supportHP)
print("Hunger Games and Twilight: %.2f" % supportHT)
print("Harry Potter and Twilight: %.2f" % supportPT)

Hunger Games and Harry Potter: 0.12
Hunger Games and Twilight: 0.09
Harry Potter and Twilight: 0.14


Refining support with confidence

After reporting your findings from the previous exercise, the library asks you about the direction of the relationship. Should they use Harry Potter to promote Twilight or Twilight to promote Harry Potter?

After thinking about this, you decide to compute the confidence metric, which has a direction, unlike support. You'll compute it for both {Potter} --> {Twilight} and {Twilight} --> {Potter}.

In [23]:
# Recall that the confidence of "if X then Y" is: support(X & Y) / support(X)

# Compute support for Potter and Twilight
supportPT = np.logical_and(books['Potter'], books['Twilight']).mean()

# Compute support for Potter
supportP = books['Potter'].mean()

# Compute support for Twilight
supportT = books['Twilight'].mean()

print('supportP', supportP, 'supportT', supportT)

supportP 0.47751552795031055 supportT 0.2567701863354037


In [24]:
# Compute confidence for both rules
confidencePT = supportPT / supportP
confidenceTP = supportPT / supportT

# Print results
print('{0:.2f}, {1:.2f}'.format(confidencePT, confidenceTP))

# Even though the support is identical for the two association rules, the confidence is much higher for Twilight -> Harry Potter, since Harry Potter has a higher support than Twilight.

0.29, 0.55


Further refinement with lift

Once again, you report your results to the library: Use Twilight to promote Harry Potter, since the rule has a higher confidence metric. The library thanks you for the suggestion, but asks you to confirm that this is a meaningful relationship using another metric.

You recall that lift may be useful here. If lift is less than 1, this means that Harry Potter and Twilight are paired together less frequently than we would expect if the pairings occurred by random chance.

In [25]:
# Remember that lift is computed for items X and Y as: support(X & Y) / [support(X) * support(Y)]

# Compute lift
lift = supportPT / (supportP * supportT)

lift

# As it turns out, lift is greater than 1.0. This could give us some confidence that the association rule we recommended did not arise by random chance.

1.146881247209129

Computing conviction

After hearing about the useful advice you provided to the library, the founder of a small ebook selling start-up approaches you for consulting services. As a test of your abilities, she asks you if you are able to compute conviction for the rule {Potter} --> {Hunger}, so she can decide whether to place the books next to each other on the company's website. 

In [26]:
# The conviction of "if X then Y" is computed as follows: support(X) * support(NOT Y) / support(X and NOT Y).

# Compute support for Potter AND Hunger
supportPH = np.logical_and(books['Potter'], books['Hunger']).mean()

# Compute support for NOT Hunger
supportnH = 1.0 - books['Hunger'].mean()

# Compute support for Potter and NOT Hunger
supportPnH = supportP - supportPH

# Compute and print conviction for Potter -> Hunger
conviction = supportP * supportnH / supportPnH
print("Conviction: %.2f" % conviction)

# Notice that the value of conviction is than 1, suggesting that the rule ``if Potter then Hunger'' is not supported.

Conviction: 0.92


Computing conviction with a function

After successful completion of her trial project, the ebook start-up's founder decides to hire you for a much bigger project. She asks you if you are able to compute conviction for every pair of books in the goodreads-10k dataset, so she can use that information to decide which books to locate closer together on the website.

You agree to take the job, but realize that you need more a efficient way to compute conviction, since you will need to compute it many times. You decide to write a function that computes it. It will take two columns of a pandas DataFrame as an input, one antecedent and one consequent, and output the conviction metric.

In [27]:
def conviction(antecedent, consequent):
    
    # Compute support for antecedent AND consequent
    supportAC = np.logical_and(antecedent, consequent).mean()

    # Compute support for antecedent
    supportA = antecedent.mean()

    # Compute support for NOT consequent
    supportnC = 1.0 - consequent.mean()

    # Compute support for antecedent and NOT consequent
    supportAnC = supportA - supportAC

    # Return conviction
    return supportA * supportnC / supportAnC

Promoting ebooks with conviction

In the previous exercise, we defined a function to compute conviction. We were asked to apply that function to all two-book permutations of the goodreads-10k dataset. In this exercise, we'll test the function by applying it to the three most popular books, which we used in earlier exercises: The Hunger Games, Harry Potter, and Twilight.

In [28]:
potter = books['Potter']
twilight = books['Twilight']
hunger = books['Hunger']

In [29]:
# Compute conviction for twilight -> potter and potter -> twilight
convictionTP = conviction(twilight, potter)
convictionPT = conviction(potter, twilight)

# Compute conviction for twilight -> hunger and hunger -> twilight
convictionTH = conviction(twilight, hunger)
convictionHT = conviction(hunger, twilight)

# Compute conviction for potter -> hunger and hunger -> potter
convictionPH = conviction(potter, hunger)
convictionHP = conviction(hunger,potter)

# Print results
print('Harry Potter -> Twilight: ', convictionHT)
print('Twilight -> Potter: ', convictionTP)

Harry Potter -> Twilight:  1.0315274939515657
Twilight -> Potter:  1.1550539077290998


Computing association and dissociation

The library has returned to you once again about your recommendation to promote Harry Potter using Twilight. They're worried that the two might be dissociated, which could have a negative impact on their promotional effort. They ask you to verify that this is not the case.

You immediately think of Zhang's metric, which measures association and dissociation continuously. Association is positive and dissociation is negative. 

In [30]:
# Compute the support of Twilight and Harry Potter
supportT = books['Twilight'].mean()
supportP = books['Potter'].mean()

# Compute the support of both books
supportTP = np.logical_and(books['Twilight'],books['Potter']).mean()

# Complete the expressions for the numerator and denominator
numerator = supportTP - supportT*supportP
denominator = max(supportTP*(1-supportT), supportT*(supportP-supportTP))

# Compute and print Zhang's metric
zhang = numerator / denominator
zhang

# the association rule ``if Twilight then Harry Potter'' proved robust. It had a positive value for Zhang's metric, indicating that the two books are not dissociated.

0.17231567178855997

Defining Zhang's metric

In general, when we want to perform a task many times, we'll write a function, rather than coding up each individual instance. In this exercise, we'll define a function for Zhang's metric that takes an antecedent and consequent and outputs the metric itself. When the problems we solve become increasingly complicated in the following chapter, having a convenient means of computing a metric will greatly simplify things.

In [31]:
# Define a function to compute Zhang's metric
def zhang(antecedent, consequent):
    # Compute the support of each book
    supportA = antecedent.mean()
    supportC = consequent.mean()

    # Compute the support of both books
    supportAC = np.logical_and(antecedent, consequent).mean()

    # Complete the expressions for the numerator and denominator
    numerator = supportAC - supportA*supportC
    denominator = max(supportAC*(1-supportA), supportA*(supportC-supportAC))

    # Return Zhang's metric
    return numerator / denominator

Applying Zhang's metric

The founder of the ebook start-up has returned for additional consulting services. She has sent you a list of itemsets she's investigating and has asked you to determine whether any of them contain items that are dissociated. When you're finished, she has asked that you add the metric you use to a column in the rules DataFrame, which is available to you, and currently contains columns for antecedents and consequents.

In [32]:
books_5 = pd.read_csv('books_5.csv', index_col = 0)
books_5.reset_index(drop=True, inplace=True)

# remove wrong character to the 'Hunger' column and transform it to boolean
books_5['Hunger'] = books_5['Hunger'].apply(lambda x: True if 'true' in str(x).lower() else (False if 'false' in str(x).lower() else x))
books_5

Unnamed: 0,Hunger,Potter,Twilight,Mockingbird,Gatsby
0,False,True,False,True,True
1,False,True,True,False,True
2,False,False,False,True,False
3,False,True,False,False,True
4,False,False,False,False,True
...,...,...,...,...,...
8045,False,False,False,True,True
8046,False,False,False,True,False
8047,False,False,True,False,False
8048,True,False,True,False,False


In [33]:
itemsets = [['Potter', 'Hunger'],
 ['Twilight', 'Hunger'],
 ['Mockingbird', 'Hunger'],
 ['Gatsby', 'Hunger'],
 ['Potter', 'Twilight'],
 ['Potter', 'Mockingbird'],
 ['Potter', 'Gatsby'],
 ['Mockingbird', 'Twilight'],
 ['Gatsby', 'Twilight'],
 ['Mockingbird', 'Gatsby']]

In [34]:
rules = pd.DataFrame(itemsets, columns=['antecedent', 'consequent'])
rules

Unnamed: 0,antecedent,consequent
0,Potter,Hunger
1,Twilight,Hunger
2,Mockingbird,Hunger
3,Gatsby,Hunger
4,Potter,Twilight
5,Potter,Mockingbird
6,Potter,Gatsby
7,Mockingbird,Twilight
8,Gatsby,Twilight
9,Mockingbird,Gatsby


In [35]:
books_5[itemsets[0][0]]

0        True
1        True
2       False
3        True
4       False
        ...  
8045    False
8046    False
8047    False
8048    False
8049    False
Name: Potter, Length: 8050, dtype: bool

In [36]:
# Define an empty list for Zhang's metric
zhangs_metric = []

# Loop over lists in itemsets
for itemset in itemsets:
    # Extract the antecedent and consequent columns
    antecedent = books_5[itemset[0]]
    consequent = books_5[itemset[1]]

    # Complete Zhang's metric and append it to the list
    zhangs_metric.append(zhang(antecedent, consequent))
    
# Print results
rules['zhang'] = zhangs_metric

rules

# Notice that most of the items are dissociated, which suggests that they would have been a poor choice to pair together for promotional purposes.

Unnamed: 0,antecedent,consequent,zhang
0,Potter,Hunger,-0.306049
1,Twilight,Hunger,0.109357
2,Mockingbird,Hunger,-0.525436
3,Gatsby,Hunger,-0.550446
4,Potter,Twilight,0.245118
5,Potter,Mockingbird,-0.065537
6,Potter,Gatsby,-0.165572
7,Mockingbird,Twilight,-0.319008
8,Gatsby,Twilight,-0.370875
9,Mockingbird,Gatsby,0.46646


Filtering with support and conviction

In the video, we discussed the continued consulting work you are doing for the founder of an ebook selling start-up. The founder has approached you with the DataFrame rules, which contains the work of a data scientist who was previously on staff. It includes columns for antecedents and consequents, along with the performance for each of those rules with respect to a number of metrics.

Your objective will be to perform multi-metric filtering on the dataset to identify potentially useful rules

In [37]:
rules_full = pd.read_csv('rules_full.csv', index_col = 0)
rules_full

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Potter),(Hunger),0.478,0.319,0.124,0.259,0.813,-0.02850,0.919
1,(Hunger),(Potter),0.319,0.478,0.124,0.388,0.813,-0.02850,0.854
2,(Twilight),(Hunger),0.257,0.319,0.089,0.347,1.088,0.00725,1.043
3,(Hunger),(Twilight),0.319,0.257,0.089,0.279,1.088,0.00725,1.032
4,(Mockingbird),(Hunger),0.477,0.319,0.096,0.202,0.633,-0.05580,0.853
...,...,...,...,...,...,...,...,...,...
145,"(Gatsby, Mockingbird)","(Potter, Twilight)",0.186,0.141,0.024,0.131,0.930,-0.00182,0.989
146,(Potter),"(Twilight, Gatsby, Mockingbird)",0.478,0.036,0.024,0.051,1.406,0.00703,1.016
147,(Twilight),"(Potter, Gatsby, Mockingbird)",0.257,0.090,0.024,0.095,1.056,0.00129,1.006
148,(Gatsby),"(Potter, Twilight, Mockingbird)",0.295,0.063,0.024,0.082,1.310,0.00576,1.021


In [38]:
# Select the subset of rules with antecedent support greater than 0.05
rules_filt = rules_full[rules_full['antecedent support'] > 0.05]

# Select the subset of rules with a consequent support greater than 0.02
rules_filt = rules_filt[rules_filt['consequent support'] > 0.02]

# Select the subset of rules with a conviction greater than 1.01
rules_filt = rules_filt[rules_filt['conviction'] > 1.01]

rules_filt

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
2,(Twilight),(Hunger),0.257,0.319,0.089,0.347,1.088,0.00725,1.043
3,(Hunger),(Twilight),0.319,0.257,0.089,0.279,1.088,0.00725,1.032
8,(Potter),(Twilight),0.478,0.257,0.141,0.294,1.147,0.01800,1.053
9,(Twilight),(Potter),0.257,0.478,0.141,0.548,1.147,0.01800,1.155
18,(Gatsby),(Mockingbird),0.295,0.477,0.186,0.630,1.323,0.04540,1.417
...,...,...,...,...,...,...,...,...,...
143,"(Twilight, Gatsby)","(Potter, Mockingbird)",0.054,0.220,0.024,0.455,2.072,0.01260,1.431
144,"(Twilight, Mockingbird)","(Potter, Gatsby)",0.098,0.128,0.024,0.248,1.940,0.01180,1.160
146,(Potter),"(Twilight, Gatsby, Mockingbird)",0.478,0.036,0.024,0.051,1.406,0.00703,1.016
148,(Gatsby),"(Potter, Twilight, Mockingbird)",0.295,0.063,0.024,0.082,1.310,0.00576,1.021


In [39]:
# The Zhang's metric can be easily added, unsing the antecendent, consequent and antecedent*consequent (support field) already defined in the table

supportA = rules_full['antecedent support']
supportC = rules_full['consequent support']
supportAC = rules_full['support']

# Complete the expressions for the numerator and denominator
numerator = supportAC - supportA*supportC
denominator = np.maximum(supportAC*(1-supportA), supportA*(supportC-supportAC))

rules_full['Zhang'] = numerator/denominator
rules_full

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,Zhang
0,(Potter),(Hunger),0.478,0.319,0.124,0.259,0.813,-0.02850,0.919,-0.305568
1,(Hunger),(Potter),0.319,0.478,0.124,0.388,0.813,-0.02850,0.854,-0.252218
2,(Twilight),(Hunger),0.257,0.319,0.089,0.347,1.088,0.00725,1.043,0.106114
3,(Hunger),(Twilight),0.319,0.257,0.089,0.279,1.088,0.00725,1.032,0.115775
4,(Mockingbird),(Hunger),0.477,0.319,0.096,0.202,0.633,-0.05580,0.853,-0.527992
...,...,...,...,...,...,...,...,...,...,...
145,"(Gatsby, Mockingbird)","(Potter, Twilight)",0.186,0.141,0.024,0.131,0.930,-0.00182,0.989,-0.102288
146,(Potter),"(Twilight, Gatsby, Mockingbird)",0.478,0.036,0.024,0.051,1.406,0.00703,1.016,0.542146
147,(Twilight),"(Potter, Gatsby, Mockingbird)",0.257,0.090,0.024,0.095,1.056,0.00129,1.006,0.048789
148,(Gatsby),"(Potter, Twilight, Mockingbird)",0.295,0.063,0.024,0.082,1.310,0.00576,1.021,0.320035


Using multi-metric filtering to cross-promote books

As a final request, the founder of the ebook selling start-up asks you to perform additional filtering. Your previous attempt returned 82 rules, but she wanted only one

In [40]:
# Set the lift threshold to 1.5
rules_best = rules_full[rules_full['lift'] > 1.5]

# Set the conviction threshold to 1.0
rules_best = rules_best[rules_best['conviction'] > 1.0]

# Set the threshold for Zhang's rule to 0.65
rules_best = rules_best[rules_best['Zhang'] > 0.65]

rules_best

# Print rule
# print(rules_best[['antecedents','consequents']])

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,Zhang
113,"(Potter, Mockingbird)","(Gatsby, Hunger)",0.22,0.051,0.023,0.103,2.037,0.0115,1.058,0.656633
120,(Mockingbird),"(Potter, Gatsby, Hunger)",0.477,0.031,0.023,0.047,1.528,0.00781,1.017,0.682767
127,"(Twilight, Mockingbird)","(Gatsby, Hunger)",0.098,0.051,0.013,0.129,2.55,0.0077,1.09,0.682415
131,"(Mockingbird, Hunger)","(Twilight, Gatsby)",0.096,0.054,0.013,0.132,2.458,0.00752,1.09,0.665078


Performing aggregation

After completing minor consulting jobs for a library and an ebook seller, you've finally received your first big market basket analysis project: advising an online novelty gifts retailer on cross-promotions. Since the retailer has never previously hired a data scientist, it would like you to start the project by exploring its transaction data. It has asked you to perform aggregation for all signs in the dataset and also compute the support for this category. 

In [41]:
online_retail = pd.read_csv('online_retail.csv')
online_retail

Unnamed: 0,InvoiceNo,StockCode,Description
0,562583,35637A,IVORY STRING CURTAIN WITH POLE
1,562583,35638A,PINK AND BLACK STRING CURTAIN
2,562583,84927F,PSYCHEDELIC TILE HOOK
3,562583,22425,ENAMEL COLANDER CREAM
4,562583,16008,SMALL FOLDING SCISSOR(POINTED EDGE)
...,...,...,...
227755,C581229,23158,SET OF 5 LUCKY CAT MAGNETS
227756,C581229,22712,CARD DOLLY GIRL
227757,C581229,22027,TEA PARTY BIRTHDAY CARD
227758,C581229,21508,VINTAGE KID DOLLY CARD


In [42]:
online_retail.InvoiceNo.nunique() # unique invoices

9709

In [43]:
online_retail.StockCode.nunique() # unique items

3353

In [44]:
online_retail.Description.nunique() # unique descriptions

3460

In [45]:
onehot = pd.crosstab(online_retail['InvoiceNo'], online_retail['Description'])
onehot

Description,4 PURPLE FLOCK DINNER CANDLES,50'S CHRISTMAS GIFT BAG LARGE,DOLLY GIRL BEAKER,I LOVE LONDON MINI BACKPACK,NINE DRAWER OFFICE TIDY,OVAL WALL MIRROR DIAMANTE,RED SPOT GIFT BAG LARGE,SET 2 TEA TOWELS I LOVE LONDON,SPACEBOY BABY GIFT SET,TRELLIS COAT RACK,...,wet boxes,wet pallet,wet rusty,wet?,wrongly coded 20713,wrongly coded 23343,wrongly coded-23343,wrongly marked,wrongly marked 23343,wrongly marked carton 22804
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
549687,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
550644,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
552695,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
553857,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
557499,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C581484,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C581490,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C581499,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C581568,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [46]:
# Select the column headers for sign items
sign_headers = [i for i in onehot.columns if i.lower().find('sign')>=0]

len(sign_headers)

173

In [47]:
# Select columns of sign items using sign_headers
sign_columns = onehot[sign_headers]

print(sign_columns.mean()) # very sparse columns (LOW SUPPORT)
sign_columns

Description
16 PIECE CUTLERY SET PANTRY DESIGN    0.001925
60 CAKE CASES DOLLY GIRL DESIGN       0.018604
AIRLINE LOUNGE,METAL SIGN             0.013258
AREA PATROLLED METAL SIGN             0.020528
ASSTD DESIGN 3D PAPER STICKERS        0.001176
                                        ...   
WRAP VINTAGE LEAF DESIGN              0.010906
WRAP VINTAGE PETALS  DESIGN           0.000107
YELLOW/PINK FLOWER DESIGN BIG MUG     0.000962
YOU'RE CONFUSING ME METAL SIGN        0.019245
ZINC BOX SIGN HOME                    0.006522
Length: 173, dtype: float64


Description,16 PIECE CUTLERY SET PANTRY DESIGN,60 CAKE CASES DOLLY GIRL DESIGN,"AIRLINE LOUNGE,METAL SIGN",AREA PATROLLED METAL SIGN,ASSTD DESIGN 3D PAPER STICKERS,ASSTD DESIGN RACING CAR PEN,BAKING SET SPACEBOY DESIGN,BATHROOM METAL SIGN,BATHROOM METAL SIGN,BATHROOM SET LOVE HEART DESIGN,...,WRAP ALPHABET DESIGN,WRAP BILLBOARD FONTS DESIGN,WRAP DOILEY DESIGN,WRAP POPPIES DESIGN,WRAP SUMMER ROSE DESIGN,WRAP VINTAGE LEAF DESIGN,WRAP VINTAGE PETALS DESIGN,YELLOW/PINK FLOWER DESIGN BIG MUG,YOU'RE CONFUSING ME METAL SIGN,ZINC BOX SIGN HOME
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
549687,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
550644,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
552695,0,0,0,1,0,0,0,0,0,0,...,0,1,1,1,0,0,1,0,0,0
553857,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
557499,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C581484,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C581490,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C581499,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C581568,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [48]:
# Perform aggregation of sign items into sign category
signs = sign_columns.sum(axis = 1) >= 1.0 # set the new columns to True if at least a sub-column contains a 1

print(signs.mean()) # aggregated column less sparse (HIGHER SUPPORT)
signs

0.45856944295947827


InvoiceNo
549687     False
550644      True
552695      True
553857     False
557499     False
           ...  
C581484    False
C581490    False
C581499    False
C581568    False
C581569    False
Length: 9353, dtype: bool

Defining an aggregation function

Surprised by the high share of sign items in its inventory, the retailer decides that it makes sense to do further aggregation for different categories to explore the data better. This seems trivial to you, but the retailer has not previously been able to perform even a basic descriptive analysis of its transaction and items.

The retailer asks you to perform aggregation for the candles, bags, and boxes categories. To simplify the task, you decide to write a function. It will take a string that contains an item's category. It will then output a DataFrame that indicates whether each transaction includes items from that category. 

In [49]:
def aggregate(item):
    # Select the column headers for sign items in onehot
    item_headers = [i for i in onehot.columns if i.lower().find(item)>=0]

    # Select columns of sign items
    item_columns = onehot[item_headers]

    # Return category of aggregated items
    return item_columns.sum(axis = 1) >= 1.0


In [50]:
# Aggregate items for the bags, boxes, and candles categories  
bags = aggregate('bag')
boxes = aggregate('boxes')
candles = aggregate('candles')

bags.mean(), boxes.mean(), candles.mean()

(0.3858654977012723, 0.07077942906019459, 0.11953383940981503)

Identifying frequent itemsets with Apriori

The aggregation exercise you performed for the online retailer proved helpful. It offered a starting point for understanding which categories of items appear frequently in transactions. The retailer now wants to explore the individual items themselves to find out which are frequent.

In this exercise, you'll apply the Apriori algorithm to the online retail dataset without aggregating first. Your objective will be to prune the itemsets using a minimum value of support and a maximum item number threshold.

In [51]:
# Convert the DataFrame to boolean values (True/False) and sample only some rown and columns (to let the apriori algorithm run faster)
onehot_bool = onehot.astype(bool)
# onehot_samp = onehot_bool.sample(n=1000)
# onehot_samp = onehot_samp.sample(n=500, axis=1)

In [52]:
# Compute frequent itemsets using the Apriori algorithm
frequent_itemsets = apriori(onehot_bool, 
                            min_support = 0.05, 
                            max_len = 3, 
                            use_colnames = True)

frequent_itemsets

Unnamed: 0,support,itemsets
0,0.050144,(ALARM CLOCK BAKELIKE GREEN)
1,0.062974,(ASSORTED COLOUR BIRD ORNAMENT)
2,0.056987,(CHOCOLATE HOT WATER BOTTLE)
3,0.051748,(GARDENERS KNEELING PAD CUP OF TEA )
4,0.062547,(GARDENERS KNEELING PAD KEEP CALM )
5,0.056987,(HAND WARMER OWL DESIGN)
6,0.081578,(HOT WATER BOTTLE KEEP CALM)
7,0.056559,(HOT WATER BOTTLE TEA AND SYMPATHY)
8,0.066717,(JUMBO BAG 50'S CHRISTMAS )
9,0.050786,(JUMBO BAG ALPHABET)


Selecting a support threshold

The manager of the online gift store looks at the results you provided from the previous exercise and commends you for the good work. She does, however, raise an issue: all of the itemsets you identified contain only one item. She asks whether it would be possible to use a less restrictive rule and to generate more itemsets, possibly including those with multiple items.

After agreeing to do this, you think about what might explain the lack of itemsets with more than 1 item. It can't be the max_len parameter, since that was set to three. You decide it must be support and decide to test two different values, each time checking how many additional itemsets are generated.

In [53]:
# Compute frequent itemsets using a support of 0.03 and length of 3
frequent_itemsets_1 = apriori(onehot_bool, min_support = 0.03, 
                            max_len = 3, use_colnames = True)

# Compute frequent itemsets using a support of 0.01 and length of 3
frequent_itemsets_2 = apriori(onehot_bool, min_support = 0.01, 
                            max_len = 3, use_colnames = True)

# Print the number of freqeuent itemsets
print(len(frequent_itemsets_1), len(frequent_itemsets_2))

135 2515


Generating association rules

In the final exercise of the previous section, you computed itemsets for the novelty gift store owner using the Apriori algorithm. 
Satisfied with the descriptive work you've done, the store manager asks you to identify some association rules from those two sets of frequent itemsets you computed.

In [54]:
# Compute all association rules for frequent_itemsets_1
rules_1 = association_rules(frequent_itemsets_1, 
                            metric = "support", 
                         	min_threshold = 0.04)

# Compute all association rules for frequent_itemsets_2
rules_2 = association_rules(frequent_itemsets_2, 
                            metric = "support",
                        	min_threshold = 0.04)

# Print the number of association rules generated
print(len(rules_1), len(rules_2))

2 2


In [55]:
rules_1

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(PAPER CHAIN KIT VINTAGE CHRISTMAS),(PAPER CHAIN KIT 50'S CHRISTMAS ),0.068534,0.096226,0.046616,0.680187,7.068657,0.040021,2.825948,0.921698
1,(PAPER CHAIN KIT 50'S CHRISTMAS ),(PAPER CHAIN KIT VINTAGE CHRISTMAS),0.096226,0.068534,0.046616,0.484444,7.068657,0.040021,1.806723,0.949939


In [56]:
rules_2

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(PAPER CHAIN KIT VINTAGE CHRISTMAS),(PAPER CHAIN KIT 50'S CHRISTMAS ),0.068534,0.096226,0.046616,0.680187,7.068657,0.040021,2.825948,0.921698
1,(PAPER CHAIN KIT 50'S CHRISTMAS ),(PAPER CHAIN KIT VINTAGE CHRISTMAS),0.096226,0.068534,0.046616,0.484444,7.068657,0.040021,1.806723,0.949939


Pruning with lift

Once again, you report back to the novelty gift store manager. This time, you tell her that you identified no rules when you used a higher support threshold for the Apriori algorithm and only two rules when you used a lower threshold. She commends you for the good work, but asks you to consider using another metric to reduce the two rules to one.

You remember that lift had a simple interpretation: values greater than 1 indicate that items co-occur more than we would expect if they were independently distributed across transactions. You decide to use lift, since that message will be simple to convey.

In [57]:
frequent_itemsets_1

Unnamed: 0,support,itemsets
0,0.031434,(6 GIFT TAGS 50'S CHRISTMAS )
1,0.031006,(6 GIFT TAGS VINTAGE CHRISTMAS )
2,0.039880,(6 RIBBONS RUSTIC CHARM)
3,0.049289,(60 CAKE CASES VINTAGE CHRISTMAS)
4,0.050144,(ALARM CLOCK BAKELIKE GREEN)
...,...,...
130,0.031755,"(JUMBO BAG VINTAGE CHRISTMAS , JUMBO BAG 50'S ..."
131,0.030685,"(JUMBO BAG RED RETROSPOT, JUMBO BAG PINK POLKA..."
132,0.035604,"(JUMBO BAG VINTAGE DOILY , JUMBO BAG RED RETRO..."
133,0.046616,"(PAPER CHAIN KIT VINTAGE CHRISTMAS, PAPER CHAI..."


In [58]:
# Compute all association rules for frequent_itemsets1
rules = association_rules(frequent_itemsets, 
                            metric = "lift", 
                         	min_threshold = 15)

rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric


Pruning with confidence

Once again, you've come up short: you found multiple useful rules, but can't narrow it down to one. Even worse, the two rules you found used the same itemset, but just swapped the antecedents and consequents. You decide to see whether pruning by another metric might allow you to narrow things down to a single association rule.

What would be the right metric? Both lift and support are identical for all rules that can be generated from an itemset, so you decide to use confidence instead, which differs for rules produced from the same itemset.

In [59]:
# Compute all association rules using confidence
rules = association_rules(frequent_itemsets_2, 
                            metric = "confidence", 
                         	min_threshold = 1)

rules


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,"(SKULL SHOULDER BAG, JAM MAKING SET PRINTED)",(DOTCOM POSTAGE),0.010906,0.019566,0.010906,1.0,51.10929,0.010692,inf,0.991244


Aggregation and filtering

In the video, we helped a gift store manager arrange the sections in her physical retail location according to association rules. The layout of the store forced us to group sections into two pairs of product types. After applying advanced filtering techniques, we proposed the floor layout below.

Boxes - Bags
Signs - Candles

The image shows the store layout that was selected in the video.
The store manager is now asking you to generate another floorplan proposal, but with a different criterion: each pair of sections should contain one high support product and one low support product

In [60]:
aggregated = pd.read_csv('aggregated.csv', index_col = 0)
aggregated

Unnamed: 0,bag,box,candle,sign
0,True,False,False,True
1,True,False,False,False
2,False,True,False,False
3,True,False,True,True
4,False,True,False,False
...,...,...,...,...
366,False,False,False,True
367,False,False,False,True
368,True,False,False,True
369,True,False,False,False


In [61]:
# Apply the apriori algorithm with a minimum support of 0.0001
frequent_itemsets_aggr = apriori(aggregated, min_support = 0.0001, use_colnames = True)
frequent_itemsets_aggr

Unnamed: 0,support,itemsets
0,0.466307,(bag)
1,0.256065,(box)
2,0.088949,(candle)
3,0.355795,(sign)
4,0.021563,"(box, bag)"
5,0.010782,"(candle, bag)"
6,0.097035,"(sign, bag)"
7,0.016173,"(box, candle)"
8,0.018868,"(box, sign)"
9,0.008086,"(candle, sign)"


In [62]:
# Generate the initial set of rules using a minimum support of 0.0001
rules = association_rules(frequent_itemsets_aggr, 
                          metric = "support", min_threshold = 0.0001)

rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(box),(bag),0.256065,0.466307,0.021563,0.084211,0.18059,-0.097841,0.582768,-0.859139
1,(bag),(box),0.466307,0.256065,0.021563,0.046243,0.18059,-0.097841,0.780005,-0.894758
2,(candle),(bag),0.088949,0.466307,0.010782,0.121212,0.25994,-0.030696,0.607306,-0.757576
3,(bag),(candle),0.466307,0.088949,0.010782,0.023121,0.25994,-0.030696,0.932615,-0.842137
4,(sign),(bag),0.355795,0.466307,0.097035,0.272727,0.584866,-0.068875,0.733827,-0.52422
5,(bag),(sign),0.466307,0.355795,0.097035,0.208092,0.584866,-0.068875,0.813485,-0.570809
6,(box),(candle),0.256065,0.088949,0.016173,0.063158,0.710048,-0.006604,0.97247,-0.354386
7,(candle),(box),0.088949,0.256065,0.016173,0.181818,0.710048,-0.006604,0.909254,-0.309499
8,(box),(sign),0.256065,0.355795,0.018868,0.073684,0.207097,-0.072239,0.695448,-0.837305
9,(sign),(box),0.355795,0.256065,0.018868,0.05303,0.207097,-0.072239,0.785596,-0.855975


In [63]:
# Set minimum antecedent support to 0.35
rules = rules[rules['antecedent support'] > 0.35]

# Set maximum consequent support to 0.35
rules = rules[rules['consequent support'] < 0.35]

rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
1,(bag),(box),0.466307,0.256065,0.021563,0.046243,0.18059,-0.097841,0.780005,-0.894758
3,(bag),(candle),0.466307,0.088949,0.010782,0.023121,0.25994,-0.030696,0.932615,-0.842137
9,(sign),(box),0.355795,0.256065,0.018868,0.05303,0.207097,-0.072239,0.785596,-0.855975
11,(sign),(candle),0.355795,0.088949,0.008086,0.022727,0.25551,-0.023561,0.932238,-0.818939
16,(sign),"(candle, bag)",0.355795,0.010782,0.005391,0.015152,1.405303,0.001555,1.004437,0.447699
17,(bag),"(candle, sign)",0.466307,0.008086,0.005391,0.011561,1.429672,0.00162,1.003515,0.563131


Applying Zhang's rule

In Chapter 2, we learned that Zhang's rule is a continuous measure of association between two items that takes values in the [-1,+1] interval. A -1 value indicates a perfectly negative association and a +1 value indicates a perfectly positive association. In this exercise, you'll determine whether Zhang's rule can be used to refine a set of rules a gift store is currently using to promote products.

In [64]:
# Generate the initial set of rules using a minimum lift of 1.00
rules = association_rules(frequent_itemsets_aggr, metric = "lift", min_threshold = 1)

rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,"(candle, sign)",(bag),0.008086,0.466307,0.005391,0.666667,1.429672,0.00162,1.601078,0.302989
1,"(candle, bag)",(sign),0.010782,0.355795,0.005391,0.5,1.405303,0.001555,1.28841,0.291553
2,(sign),"(candle, bag)",0.355795,0.010782,0.005391,0.015152,1.405303,0.001555,1.004437,0.447699
3,(bag),"(candle, sign)",0.466307,0.008086,0.005391,0.011561,1.429672,0.00162,1.003515,0.563131


In [65]:
# Set antecedent support to 0.005
rules = rules[rules['antecedent support'] > 0.05]

# Set consequent support to 0.005
rules = rules[rules['consequent support'] > 0.005]

rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
2,(sign),"(candle, bag)",0.355795,0.010782,0.005391,0.015152,1.405303,0.001555,1.004437,0.447699
3,(bag),"(candle, sign)",0.466307,0.008086,0.005391,0.011561,1.429672,0.00162,1.003515,0.563131


In [66]:
def zhangs_rule(rules):
    PAB = rules['support'].copy()
    PA = rules['antecedent support'].copy()
    PB = rules['consequent support'].copy()
    NUMERATOR = PAB - PA*PB
    DENOMINATOR = np.max((PAB*(1-PA).values,PA*(PB-PAB).values), axis = 0)
    return NUMERATOR / DENOMINATOR 

In [67]:
# Compute Zhang's rule
rules['zhang'] = zhangs_rule(rules)
rules


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric,zhang
2,(sign),"(candle, bag)",0.355795,0.010782,0.005391,0.015152,1.405303,0.001555,1.004437,0.447699,0.447699
3,(bag),"(candle, sign)",0.466307,0.008086,0.005391,0.011561,1.429672,0.00162,1.003515,0.563131,0.563131


In [68]:
# Set the lower bound for Zhang's rule to 0.98
rules = rules[rules['zhang'] > 0.5]
print(rules[['antecedents', 'consequents']])

  antecedents     consequents
3       (bag)  (candle, sign)


Advanced filtering with multiple metrics

Earlier, we used data from an online novelty gift store to find antecedents that could be used to promote a targeted consequent. Since the set of potential rules was large, we had to rely on the Apriori algorithm and multi-metric filtering to narrow it down. In this exercise, we'll examine the full set of rules and find a useful one, rather than targeting a particular antecedent.

In [71]:
frequent_itemsets_2

Unnamed: 0,support,itemsets
0,0.012723,( 50'S CHRISTMAS GIFT BAG LARGE)
1,0.013258,( DOLLY GIRL BEAKER)
2,0.010050,( RED SPOT GIFT BAG LARGE)
3,0.014113,(10 COLOUR SPACEBOY PEN)
4,0.010906,(12 MESSAGE CARDS WITH ENVELOPES)
...,...,...
2510,0.017000,"(SET OF 3 WOODEN TREE DECORATIONS, SET OF 3 WO..."
2511,0.010906,"(WOODEN HEART CHRISTMAS SCANDINAVIAN, WOODEN S..."
2512,0.015182,"(SET OF 3 WOODEN TREE DECORATIONS, SET OF 3 WO..."
2513,0.010157,"(WOODEN HEART CHRISTMAS SCANDINAVIAN, WOOD STO..."


In [82]:
# Recover association rules using a minium support threshold of 0.001
rules = association_rules(frequent_itemsets_2, metric = 'support', min_threshold = 0.001)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(SET 12 COLOUR PENCILS SPACEBOY ),(12 PENCIL SMALL TUBE WOODLAND),0.027371,0.021597,0.011013,0.402344,18.629312,0.010421,1.637066,0.972952
1,(12 PENCIL SMALL TUBE WOODLAND),(SET 12 COLOUR PENCILS SPACEBOY ),0.021597,0.027371,0.011013,0.509901,18.629312,0.010421,1.984556,0.967210
2,(WOOD 2 DRAWER CABINET WHITE FINISH),(3 DRAWER ANTIQUE WHITE WOOD CABINET),0.027371,0.020956,0.012082,0.441406,21.063636,0.011508,1.752694,0.979330
3,(3 DRAWER ANTIQUE WHITE WOOD CABINET),(WOOD 2 DRAWER CABINET WHITE FINISH),0.020956,0.027371,0.012082,0.576531,21.063636,0.011508,2.296811,0.972913
4,(4 TRADITIONAL SPINNING TOPS),(VINTAGE SNAP CARDS),0.023094,0.051641,0.011013,0.476852,9.233945,0.009820,1.812792,0.912784
...,...,...,...,...,...,...,...,...,...,...
5111,"(WOODEN TREE CHRISTMAS SCANDINAVIAN, WOODEN ST...",(WOODEN HEART CHRISTMAS SCANDINAVIAN),0.025981,0.047364,0.021811,0.839506,17.724382,0.020581,5.935652,0.968750
5112,"(WOODEN HEART CHRISTMAS SCANDINAVIAN, WOODEN S...",(WOODEN TREE CHRISTMAS SCANDINAVIAN),0.037207,0.029723,0.021811,0.586207,19.722277,0.020705,2.344836,0.985982
5113,(WOODEN TREE CHRISTMAS SCANDINAVIAN),"(WOODEN HEART CHRISTMAS SCANDINAVIAN, WOODEN S...",0.029723,0.037207,0.021811,0.733813,19.722277,0.020705,3.616978,0.978376
5114,(WOODEN HEART CHRISTMAS SCANDINAVIAN),"(WOODEN TREE CHRISTMAS SCANDINAVIAN, WOODEN ST...",0.047364,0.025981,0.021811,0.460497,17.724382,0.020581,1.805399,0.990495


In [88]:

# Apply a 0.002 antecedent support threshold, 0.60 confidence threshold, and 2.50 lift threshold
filtered_rules = rules[(rules['antecedent support'] > 0.02) &
						(rules['consequent support'] > 0.02) &
						(rules['confidence'] > 0.8) &
						(rules['lift'] > 29)]

filtered_rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
5088,"(SET OF 3 WOODEN STOCKING DECORATION, SET OF 3...",(SET OF 3 WOODEN TREE DECORATIONS),0.021063,0.027799,0.017,0.807107,29.034108,0.016414,5.040097,0.986333
