In [26]:
'''
Introduction

The problem of discovering association rules between itemsets in a sales transaction database (a set of baskets)
consists of the following two sub-problems [R. Agrawal and R. Srikant, VLDB '94]:
- Finding frequent itemsets with support at least s;
- Generating association rules with confidence at least c from the itemsets found in the first step.

Recall that an association rule is an implication X → Y, where X and Y are itemsets such that X ∩ Y = ∅.
The support of a rule X → Y is the number of transactions that contain X ∪ Y. The confidence of a rule X → Y is
the fraction of the transactions containing X that also contain X ∪ Y.

'''

'''
TASK 1
Solve the first sub-problem: implement the A-Priori algorithm for finding frequent itemsets
with support at least s in a dataset of sales transactions. Recall that the support of an itemset is the number
of transactions containing the itemset. To test and evaluate your implementation, write a program that uses
your A-Priori algorithm implementation to discover frequent itemsets with support at least s in a given dataset
of sales transactions.

TASK 2
Solve the second sub-problem, i.e., develop and implement an algorithm for generating association rules between
frequent itemsets discovered using the A-Priori algorithm in a dataset of sales transactions. The rules must have
the support of at least s and confidence of at least c, where s and c are given as input parameters.

'''


'''
Performance improvements:
- print frequent items and rules as they are found, so that the item and rule lists do not have to be iterated again
- use a vectorized approach for fast computation
'''

import csv
from TransactionCollection import TransactionCollection
from AssociationRules import AssociationRules
from APrioriAlgorithm import APrioriAlgorithm

In [10]:
# Read the transaction file once; the cells below reuse `baskets`.
transactionSource = TransactionCollection('T10I4D100K.dat')
baskets = transactionSource.getTransactions()

In [11]:
# Alternative small input for a quick trial run (left disabled):
#baskets = TransactionCollection('test.txt').getTransactions()

# TASK 1: find the frequent itemsets of the full basket collection.
# The task statement defines support as a number of transactions, so this
# threshold is presumably an absolute count -- confirm in APrioriAlgorithm.
minSupport = 1000
fullDataMiner = APrioriAlgorithm(baskets, minSupport)
frequentItems = fullDataMiner.runApriori()
print('frequent items:\n{}'.format(frequentItems))

frequent items:
[[frozenset({207.0}), frozenset({820.0}), frozenset({366.0}), frozenset({554.0}), frozenset({769.0}), frozenset({550.0}), frozenset({428.0}), frozenset({450.0}), frozenset({85.0}), frozenset({258.0}), frozenset({173.0}), frozenset({335.0}), frozenset({893.0}), frozenset({163.0}), frozenset({949.0}), frozenset({688.0}), frozenset({405.0}), frozenset({351.0}), frozenset({634.0}), frozenset({661.0}), frozenset({308.0}), frozenset({948.0}), frozenset({815.0}), frozenset({105.0}), frozenset({707.0}), frozenset({394.0}), frozenset({826.0}), frozenset({325.0}), frozenset({804.0}), frozenset({309.0}), frozenset({887.0}), frozenset({4.0}), frozenset({860.0}), frozenset({68.0}), frozenset({510.0}), frozenset({578.0}), frozenset({129.0}), frozenset({843.0}), frozenset({429.0}), frozenset({886.0}), frozenset({819.0}), frozenset({663.0}), frozenset({468.0}), frozenset({540.0}), frozenset({686.0}), frozenset({265.0}), frozenset({784.0}), frozenset({38.0}), frozenset({440.0}), frozens

Generating association rules over the whole dataset is too time-consuming, so we use only the first 1000 baskets (transactions) as an example.

In [22]:
# Work on a prefix of the data only: at most the first 1000 baskets.
baskets_test = baskets[:1000]

# TASK 1 again, this time on the sample.
# NOTE: this rebinds `minSupport` and `frequentItems`, replacing whatever
# values they held before this cell ran.
minSupport = 10
sampleMiner = APrioriAlgorithm(baskets_test, minSupport)
frequentItems = sampleMiner.runApriori()
print('frequent items:\n{}'.format(frequentItems))

frequent items:
[[frozenset({207.0}), frozenset({820.0}), frozenset({732.0}), frozenset({769.0}), frozenset({550.0}), frozenset({428.0}), frozenset({450.0}), frozenset({258.0}), frozenset({173.0}), frozenset({922.0}), frozenset({893.0}), frozenset({949.0}), frozenset({405.0}), frozenset({351.0}), frozenset({215.0}), frozenset({634.0}), frozenset({661.0}), frozenset({308.0}), frozenset({948.0}), frozenset({815.0}), frozenset({838.0}), frozenset({707.0}), frozenset({394.0}), frozenset({826.0}), frozenset({804.0}), frozenset({309.0}), frozenset({887.0}), frozenset({318.0}), frozenset({860.0}), frozenset({68.0}), frozenset({241.0}), frozenset({510.0}), frozenset({129.0}), frozenset({843.0}), frozenset({429.0}), frozenset({886.0}), frozenset({819.0}), frozenset({468.0}), frozenset({686.0}), frozenset({265.0}), frozenset({784.0}), frozenset({252.0}), frozenset({38.0}), frozenset({440.0}), frozenset({486.0}), frozenset({108.0}), frozenset({75.0}), frozenset({322.0}), frozenset({361.0}), froze

In [24]:
# TASK 2: derive association rules from the frequent itemsets.
# `frequentItems` is whatever the most recently executed A-Priori cell left
# behind; it should be the result mined from `baskets_test`, because those
# same baskets are handed to AssociationRules here.
confidence = 0.8
print('frequently bought together:')
ruleFinder = AssociationRules(frequentItems, confidence, baskets_test)
# The spelling `assocationRules` is kept as-is in case other cells refer to it.
assocationRules = ruleFinder.getAssociationRules()

frequently bought together:
frozenset({819.0}) frozenset({70.0}) 0.8333333333333334
frozenset({852.0}) frozenset({906.0}) 0.8
frozenset({852.0}) frozenset({236.0}) 0.8
frozenset({852.0}) frozenset({204.0}) 0.8
frozenset({515.0}) frozenset({217.0}) 0.8235294117647058
frozenset({709.0}) frozenset({310.0}) 0.9090909090909091
frozenset({709.0}) frozenset({970.0}) 0.8181818181818182
frozenset({709.0}) frozenset({970.0, 310.0}) 0.8181818181818182
frozenset({842.0}) frozenset({579.0}) 0.8
frozenset({842.0}) frozenset({411.0}) 0.8
frozenset({842.0}) frozenset({350.0}) 0.8
frozenset({801.0}) frozenset({862.0}) 1.0
frozenset({801.0}) frozenset({461.0}) 0.8333333333333334
frozenset({801.0}) frozenset({392.0}) 0.9166666666666666
frozenset({801.0}) frozenset({569.0}) 0.8333333333333334
frozenset({801.0}) frozenset({569.0, 461.0}) 0.8333333333333334
frozenset({801.0}) frozenset({569.0, 862.0}) 0.8333333333333334
frozenset({801.0}) frozenset({392.0, 862.0}) 0.9166666666666666
frozenset({801.0}) froze

frozenset({27.0, 357.0}) frozenset({480.0}) 0.9
frozenset({27.0, 357.0}) frozenset({480.0, 354.0}) 0.9
frozenset({27.0, 357.0}) frozenset({480.0, 58.0}) 0.9
frozenset({27.0, 357.0}) frozenset({752.0, 480.0}) 0.9
frozenset({27.0, 357.0}) frozenset({752.0, 354.0}) 1.0
frozenset({27.0, 357.0}) frozenset({752.0, 58.0}) 1.0
frozenset({27.0, 357.0}) frozenset({58.0, 354.0}) 1.0
frozenset({27.0, 357.0}) frozenset({752.0, 480.0, 58.0}) 0.9
frozenset({27.0, 357.0}) frozenset({752.0, 480.0, 354.0}) 0.9
frozenset({27.0, 357.0}) frozenset({480.0, 354.0, 58.0}) 0.9
frozenset({27.0, 357.0}) frozenset({752.0, 354.0, 58.0}) 1.0
frozenset({27.0, 357.0}) frozenset({752.0, 480.0, 354.0, 58.0}) 0.9
frozenset({354.0, 357.0}) frozenset({752.0}) 1.0
frozenset({354.0, 357.0}) frozenset({58.0}) 1.0
frozenset({354.0, 357.0}) frozenset({480.0}) 0.9
frozenset({354.0, 357.0}) frozenset({27.0}) 1.0
frozenset({354.0, 357.0}) frozenset({480.0, 58.0}) 0.9
frozenset({354.0, 357.0}) frozenset({752.0, 480.0}) 0.9
frozens

frozenset({392.0, 801.0}) frozenset({569.0, 461.0}) 0.8181818181818182
frozenset({392.0, 801.0}) frozenset({569.0, 862.0}) 0.8181818181818182
frozenset({392.0, 801.0}) frozenset({461.0, 862.0}) 0.8181818181818182
frozenset({392.0, 801.0}) frozenset({569.0, 461.0, 862.0}) 0.8181818181818182
frozenset({801.0, 461.0}) frozenset({862.0}) 1.0
frozenset({801.0, 461.0}) frozenset({392.0}) 0.9
frozenset({801.0, 461.0}) frozenset({569.0}) 1.0
frozenset({801.0, 461.0}) frozenset({569.0, 862.0}) 1.0
frozenset({801.0, 461.0}) frozenset({392.0, 862.0}) 0.9
frozenset({569.0, 862.0}) frozenset({801.0}) 0.9090909090909091
frozenset({569.0, 862.0}) frozenset({461.0}) 0.9090909090909091
frozenset({569.0, 862.0}) frozenset({392.0}) 0.8181818181818182
frozenset({569.0, 862.0}) frozenset({392.0, 801.0}) 0.8181818181818182
frozenset({569.0, 862.0}) frozenset({801.0, 461.0}) 0.9090909090909091
frozenset({461.0, 862.0}) frozenset({801.0}) 1.0
frozenset({461.0, 862.0}) frozenset({392.0}) 0.9
frozenset({461.0, 

frozenset({515.0, 217.0, 283.0}) frozenset({33.0}) 0.9230769230769231
frozenset({515.0, 217.0, 283.0}) frozenset({346.0}) 1.0
frozenset({515.0, 217.0, 283.0}) frozenset({33.0, 346.0}) 0.9230769230769231
frozenset({217.0, 346.0, 283.0}) frozenset({515.0}) 0.9285714285714286
frozenset({217.0, 346.0, 283.0}) frozenset({33.0}) 0.9285714285714286
frozenset({217.0, 346.0, 283.0}) frozenset({33.0, 515.0}) 0.8571428571428571
frozenset({617.0, 158.0, 583.0}) frozenset({354.0}) 0.8
frozenset({801.0, 461.0, 862.0}) frozenset({392.0}) 0.9
frozenset({801.0, 461.0, 862.0}) frozenset({569.0}) 1.0
frozenset({392.0, 801.0, 862.0}) frozenset({461.0}) 0.8181818181818182
frozenset({392.0, 801.0, 862.0}) frozenset({569.0}) 0.8181818181818182
frozenset({392.0, 801.0, 862.0}) frozenset({569.0, 461.0}) 0.8181818181818182
frozenset({801.0, 862.0, 569.0}) frozenset({461.0}) 1.0
frozenset({801.0, 862.0, 569.0}) frozenset({392.0}) 0.9
frozenset({801.0, 461.0, 569.0}) frozenset({862.0}) 1.0
frozenset({801.0, 461.0