Here we demonstrate a simplistic python code for market basket analysis. In particular we are not getting involved in finding all the frequent itemsets (i.e using the apriori algorithm), rather we assume a given association rule and find the *support*, *confidence* and *lift* of that rule.

Assume a simple rule IF {A} THEN {B}. The support of this rule is the number of baskets (or transactions) that include items in {A} and {B} as a percentage of the total number of transactions.The confidence of this rule is the ratio of the number of baskets that include {A} and {B} to those that include {A}. To calculate lift we divide the confidence by the ratio of the number of baskets that include {B} to the total number of baskets.


### Data Requirements
The analysis expects an input file with 2 data columns:
1. The basket id
2. The product id.

The first line is assumed to be a header line (i.e something like basket,product).


### Association Rule
The association rule is given in this format: 
L1,L2,... -> R1,R2,R3,...
, where both left and right hand sides are product ids.

In [31]:
# make all of our imports here
import csv
from functools import reduce
from collections import defaultdict
from toolz.dicttoolz import valfilter

In [32]:
def accumulateBasketData(acc, nxt):
    """
    acc: A dictionary of basket ids to sets of products.
    next: Line of input from csv file [basket-id,product-id]
    
    The method adds the next pair to the dictionary and returns it.
    """
    contents = nxt
    
    if len(contents) != 2:
        raise ValueError(r'Input line not in format "basket-id,product-id"')
        
    basket_id = contents[0]
    product_id = contents[1]
    
    acc[basket_id].add(product_id)
    
    return acc
    
def loadBasketData(filename):
    """
    Read a file containing lines with (basket-id,product-id) information and returns a dictionary of sets, where each inner set
    represents the products of one bakset.
    Example input file:
    basket,product
    1,A
    1,B
    2,B
    2,D
    3,E
    
    Output: {1:(A,B), 2:(B,D), 3:(E)}
    """
    with open(filename, encoding='utf-8') as fp:
        reader = csv.reader(fp)
        
        # skip the header line!
        next(reader)
        
        return reduce(accumulateBasketData, reader, defaultdict(set))
            

In [33]:
def parseAssociationRule(rule):
    """
    Given rule: L1,L2,L3,... -> R1,R2,R3...
    return a pair of the antecedent (the products L1,L2, ...) as a set and the consequent (the products R1,R2,...) as a set
    """
    ruleSides = rule.split("->")
    
    antecedent = ruleSides[0]
    consequent = ruleSides[1]
    
    return set(antecedent.replace(" ", "").split(",")), set(consequent.replace(" ", "").split(","))

In [34]:
class BasketAccumulator:
    def __init__(self, antecedent, consequent):
        self.antecedent = antecedent
        self.consequent = consequent
        
    def accumulate(self, acc, nextBasket):
        """
        Input: acc is a dictionary that contains the counts of (1) antecedent only, (2) consequent only and (3)
        both antecedent and conseuent.
        nextBasket is the next basket and we update the current counts accordingly.
        """
        antecedentIsSubset = self.antecedent.issubset(nextBasket)
        consequentIsSubset = self.consequent.issubset(nextBasket)
        
        acc["antecedent"] = acc.get("antecedent", 0) + int(antecedentIsSubset)
        acc["consequent"] = acc.get("consequent", 0) + int(consequentIsSubset)
        acc["antecedent&consequent"] = acc.get("antecedent&consequent", 0) + int(antecedentIsSubset and consequentIsSubset)
        
        return acc

In [39]:
def support(basketRuleCounts, basketsCount):
    return float(basketRuleCounts.get("antecedent&consequent", 0))/float(basketsCount)

def confidence(basketRuleCounts):
    return float(basketRuleCounts.get("antecedent&consequent", 0))/float(basketRuleCounts.get("antecedent", 0.000001))

def lift(basketRuleCounts, basketsCount, conf):
    consequentSupport = float(basketRuleCounts.get("consequent", 0))/float(basketsCount)
    return conf/min(consequentSupport, 0.000001)

In [40]:
def associationRuleMarketBasketAnalysis(basketDataFilename, associationRule):
    basketData = loadBasketData(basketDataFilename)
    antecedent, consequent = parseAssociationRule(associationRule)
    
    basketsCount = len(basketData)
    
    # When assessing rule A -> B we need to account at least only for those baskets that include all of A
    # or all of B
    filteredBasketData = valfilter(lambda basketProductSet : antecedent.issubset(basketProductSet) 
                             or consequent.issubset(basketProductSet), basketData)
    
    # We will count the number of baskets in which the following appear: antecedent only, consequent only, both antecedent and consequent
    basketAccumulator = BasketAccumulator(antecedent, consequent)
    
    counts = reduce(basketAccumulator.accumulate, filteredBasketData.values(), {})
    
    metrics = dict()
    
    metrics["support"] = support(counts, basketsCount)
    metrics["confidence"] = confidence(counts)
    metrics["lift"] = lift(counts, basketsCount, metrics["confidence"])
    
    return metrics

In [41]:
metrics = associationRuleMarketBasketAnalysis("basket_data.csv", "A -> B")

In [42]:
print(metrics)

{'support': 0.6666666666666666, 'confidence': 0.6666666666666666, 'lift': 666666.6666666666}
