# Market Basket Analysis

<img src = "img/basket.jpg" height="300" width="300">


Affinity analysis is a data analysis and data mining technique that discovers co-occurrence relationships among activities performed by (or recorded about) specific individuals or groups. In general, this can be applied to any process where agents can be uniquely identified and information about their activities can be recorded. In retail, affinity analysis is used to perform market basket analysis, in which retailers seek to understand the purchase behavior of customers. This information can then be used for purposes of cross-selling and up-selling, in addition to influencing sales promotions, loyalty programs, store design, and discount plans

**Example** 

1. Market basket analysis might tell a retailer that customers often purchase shampoo and conditioner together, so putting both items on promotion at the same time would not create a significant increase in revenue, while a promotion involving just one of the items would likely drive sales of the other.

2.  one super market chain discovered in its analysis that male customers that bought diapers often bought beer as well, have put the diapers close to beer coolers, and their sales increased dramatically

*src*: https://en.wikipedia.org/wiki/Affinity_analysis

In [1]:
#Import required libraries
import itertools
import sys
import time
import pandas as pd

In [2]:
def tokenize(file_name):
    # Assumes that sentences are separated by a single '\n'.
    # Assumes that words are separated by a single ' '.
    # Tokenizes each sentence, removes duplicate tokens, sorts tokens.
    return [sorted(list(set(e.split(",")))) for e in
            open(file_name).read().strip(",").split('\n')]

In [3]:
#Run tokenize function on groceries_mba dataset and get the output as shown below
tokenize("../data/groceries_mba.csv")

[['citrus_fruit', 'margarine', 'ready_soups', 'semi-finished_bread'],
 ['coffee', 'tropical_fruit', 'yogurt'],
 ['whole_milk'],
 ['cream_cheese_', 'meat_spreads', 'pip_fruit', 'yogurt'],
 ['condensed_milk',
  'long_life_bakery_product',
  'other_vegetables',
  'whole_milk'],
 ['abrasive_cleaner', 'butter', 'rice', 'whole_milk', 'yogurt'],
 ['rolls/buns'],
 ['UHT-milk',
  'bottled_beer',
  'liquor_(appetizer)',
  'other_vegetables',
  'rolls/buns'],
 ['pot_plants'],
 ['cereals', 'whole_milk'],
 ['bottled_water',
  'chocolate',
  'other_vegetables',
  'tropical_fruit',
  'white_bread'],
 ['bottled_water',
  'butter',
  'citrus_fruit',
  'curd',
  'dishes',
  'flour',
  'tropical_fruit',
  'whole_milk',
  'yogurt'],
 ['beef'],
 ['frankfurter', 'rolls/buns', 'soda'],
 ['chicken', 'tropical_fruit'],
 ['butter', 'fruit/vegetable_juice', 'newspapers', 'sugar'],
 ['fruit/vegetable_juice'],
 ['packaged_fruit/vegetables'],
 ['chocolate'],
 ['specialty_bar'],
 ['other_vegetables'],
 ['butter_milk

In [4]:
def frequent_itemsets(sentences):
    # Counts sets with Apriori algorithm.
    SUPP_THRESHOLD = 100
    supps = []
    
    supp = {}
    for sentence in sentences:
        for key in sentence:
            if key in supp:
                supp[key] += 1
            else:
                supp[key] = 1
    print "|C1| = " + str(len(supp))
    supps.append({k:v for k,v in supp.iteritems() if v >= SUPP_THRESHOLD})
    print "|L1| = " + str(len(supps[0]))
    
    supp = {}
    for sentence in sentences:
        for combination in itertools.combinations(sentence, 2):
            if combination[0] in supps[0] and combination[1] in supps[0]:
                key = ','.join(combination)
                if key in supp:
                    supp[key] += 1
                else:
                    supp[key] = 1
    print "|C2| = " + str(len(supp))
    supps.append({k:v for k,v in supp.iteritems() if v >= SUPP_THRESHOLD})
    print "|L2| = " + str(len(supps[1]))
 
    supp = {}
    for sentence in sentences:
        for combination in itertools.combinations(sentence, 3):
            if (combination[0]+','+combination[1] in supps[1] and
                    combination[0]+','+combination[2] in supps[1] and
                    combination[1]+','+combination[2] in supps[1]):
                key = ','.join(combination)
                if key in supp:
                    supp[key] += 1
                else:
                    supp[key] = 1
    print "|C3| = " + str(len(supp))
    supps.append({k:v for k,v in supp.iteritems() if v >= SUPP_THRESHOLD})
    print "|L3| = " + str(len(supps[2]))
 
    return supps

SyntaxError: invalid syntax (<ipython-input-4-fbd278bd2db2>, line 13)

In [5]:
def measures(supp_ab, supp_a, supp_b, transaction_count):
    # Assumes A -> B, where A and B are sets.
    conf = float(supp_ab) / float(supp_a)
    s = float(supp_b) / float(transaction_count)
    lift = conf / s
    if conf == 1.0:
        conv = float('inf')
    else:
        conv = (1-s) / (1-conf)
    return [conf, lift, conv]

In [6]:
def generate_rules(measure, supps, transaction_count):
    rules = []
    CONF_THRESHOLD = 0.4
    LIFT_THRESHOLD = 20.0
    CONV_THRESHOLD = 5.0
    if measure == 'conf':
        for i in range(2, len(supps)+1):
            for k,v in supps[i-1].iteritems():
                k = k.split(',')
                for j in range(1, len(k)):
                    for a in itertools.combinations(k, j):
                        b = tuple([w for w in k if w not in a])
                        [conf, lift, conv] = measures(v,
                                supps[len(a)-1][','.join(a)],
                                supps[len(b)-1][','.join(b)],
                                transaction_count)
                        if conf >= CONF_THRESHOLD:
                            rules.append((a, b, conf, lift, conv))
            rules = sorted(rules, key=lambda x: (x[0], x[1]))
            rules = sorted(rules, key=lambda x: (x[2]), reverse=True)
    elif measure == 'lift':
        for i in range(2, len(supps)+1):
            for k,v in supps[i-1].iteritems():
                k = k.split(',')
                for j in range(1, len(k)):
                    for a in itertools.combinations(k, j):
                        b = tuple([w for w in k if w not in a])
                        [conf, lift, conv] = measures(v,
                                supps[len(a)-1][','.join(a)],
                                supps[len(b)-1][','.join(b)],
                                transaction_count)
                        if lift >= LIFT_THRESHOLD:
                            rules.append((a, b, conf, lift, conv))
            rules = sorted(rules, key=lambda x: (x[0], x[1]))
            rules = sorted(rules, key=lambda x: (x[3]), reverse=True)
    elif measure == 'conv':
        for i in range(2, len(supps)+1):
            for k,v in supps[i-1].iteritems():
                k = k.split(',')
                for j in range(1, len(k)):
                    for a in itertools.combinations(k, j):
                        b = tuple([w for w in k if w not in a])
                        [conf, lift, conv] = measures(v,
                                supps[len(a)-1][','.join(a)],
                                supps[len(b)-1][','.join(b)],
                                transaction_count)
                        if conv >= CONV_THRESHOLD:
                            rules.append((a, b, conf, lift, conv))
            rules = sorted(rules, key=lambda x: (x[0], x[1]))
            rules = sorted(rules, key=lambda x: (x[4]), reverse=True)
    else:
        for i in range(2, len(supps)+1):
            for k,v in supps[i-1].iteritems():
                k = k.split(',')
                for j in range(1, len(k)):
                    for a in itertools.combinations(k, j):
                        b = tuple([w for w in k if w not in a])
                        [conf, lift, conv] = measures(v,
                                supps[len(a)-1][','.join(a)],
                                supps[len(b)-1][','.join(b)],
                                transaction_count)
                        if (conf >= CONF_THRESHOLD and
                                lift >= LIFT_THRESHOLD and
                                conv >= CONV_THRESHOLD):
                            rules.append((a, b, conf, lift, conv))
            rules = sorted(rules, key=lambda x: (x[0], x[1]))
            rules = sorted(rules, key=lambda x: (x[2],x[3],x[4]), reverse=True)
    return rules
 

In [7]:
def main(file, measure): 
    sentences = tokenize(file)
 
    start_time = time.time()
    supps = frequent_itemsets(sentences)
    end_time = time.time()
    print "Time spent finding frequent itemsets = {:.2f} seconds.".format(
          end_time - start_time)
 
    start_time = time.time()
    rules = generate_rules(measure, supps, len(sentences))
    for rule in rules:
        print ("{{{}}} -> {{{}}}, "
               "conf = {:.2f}, lift = {:.2f}, conv = {:.2f}").format(
              ', '.join(rule[0]), ', '.join(rule[1]), rule[2], rule[3], rule[4])
    end_time = time.time()
    print "Time spent finding association rules = {:.2f} second.".format(
          end_time - start_time)
 
if __name__ == "__main__":
    measure = 'all'
    #measure could be all, conf, lift or conv
    file = "../data/groceries_mba.csv"
    #file = "/Users/bargavaraman/Downloads/mba_temp.txt"
    main(file, "conf")

SyntaxError: invalid syntax (<ipython-input-7-bbbec2e53025>, line 7)

**Exercise 1** Try changing the parameters (`measure` and `threshold`) and get rules