# Apriori Algorithm

- a classic algorithm used in data mining for learning association rules
- it can be used to find items that are purchased together more frequently than others

## Useful tips

### Mapping through rows and columns in an array

Create an `np.vectorize` to perform mapping. Let's say we are given an array of items, and for each rows and columns, we want to just take the initials:

```python
items = np.array([['Mango', 'Onion', 'Nintendo', 'Key-chain', 'Eggs', 'Yo-yo'],
                  ['Doll', 'Onion', 'Nintendo', 'Key-chain', 'Eggs', 'Yo-yo'],
                  ['Mango', 'Apple', 'Key-chain', 'Eggs'],
                  ['Mango', 'Umbrella', 'Corn', 'Key-chain', 'Yo-yo'],
                  ['Corn', 'Onion', 'Onion', 'Key-chain', 'Ice-cream', 'Eggs']])

# Create a new np function that takes the first character from each
# items in the array (for simplification)
take_first = lambda x: x[0]
f = np.vectorize(take_first)

# Apply the function to the items
# Note that we also use frozenset to remove duplicates from each transaction
data = [frozenset(f(i)) for i in items]
```

Output:

```python
[frozenset({'E', 'K', 'M', 'N', 'O', 'Y'}),
 frozenset({'D', 'E', 'K', 'N', 'O', 'Y'}),
 frozenset({'A', 'E', 'K', 'M'}),
 frozenset({'C', 'K', 'M', 'U', 'Y'}),
 frozenset({'C', 'E', 'I', 'K', 'O'})]
```

In [223]:
import numpy as np
import pandas as pd

from functools import reduce
from itertools import combinations

from collections import Counter
from sklearn.preprocessing import LabelEncoder

In [224]:
# Every row is a transaction, and every column represent the item bought
# Note that in a single transaction, there can be similar items bought
items = np.array([['Mango', 'Onion', 'Nintendo', 'Key-chain', 'Eggs', 'Yo-yo'],
                  ['Doll', 'Onion', 'Nintendo', 'Key-chain', 'Eggs', 'Yo-yo'],
                  ['Mango', 'Apple', 'Key-chain', 'Eggs'],
                  ['Mango', 'Umbrella', 'Corn', 'Key-chain', 'Yo-yo'],
                  ['Corn', 'Onion', 'Onion', 'Key-chain', 'Ice-cream', 'Eggs']])



In [225]:
# Create a new label encoder to learn the mappings
le = LabelEncoder()

# Fit the mappings to learn them
le.fit(np.hstack(items))
# le.fit(list(set(np.hstack(items)))) # Will this be more performant?

print('Each index `i` will represent a class:\n')
for i, v in enumerate(le.classes_):
    print('{} => {}'.format(i, v))

# le.transform(['A', 'M'])

Each index `i` will represent a class:

0 => Apple
1 => Corn
2 => Doll
3 => Eggs
4 => Ice-cream
5 => Key-chain
6 => Mango
7 => Nintendo
8 => Onion
9 => Umbrella
10 => Yo-yo


In [226]:
# Now that we have learned the mappings, let's apply it to our dataset
# We will also remove duplicate items from the array using frozenset
# There is an added advantage of being able to check intersection too between sets using frozenset
encoded_items = [frozenset(le.transform(i)) for i in items]
encoded_items

[frozenset({3, 5, 6, 7, 8, 10}),
 frozenset({2, 3, 5, 7, 8, 10}),
 frozenset({0, 3, 5, 6}),
 frozenset({1, 5, 6, 9, 10}),
 frozenset({1, 3, 4, 5, 8})]

In [230]:
class Apriori:
    def __init__(self, data = None, min_support = 2):
        self.data = data
        self.min_support = min_support
        pass

    def score(self):
        out = {}
        recs = {}
        
        singles = reduce(lambda x, y: x + y, self.data)
        singles_counter = self.count(singles, self.min_support)
        
        out[1] = singles_counter
        
        for i in range(2, 10):
            pairs = self.combination(out[i - 1], i)
            pairs_support = self.check_exist(pairs)
            pairs_counter = self.count(pairs_support, self.min_support)
            if len(pairs_counter) == 0:
                print('no more items to recommend')
                break
            print('check:', i)
            
            out[i] = pairs_counter
            recs[i] = self.recommend(pairs_counter)
        return recs
    
    def check_exist(self, data):
        supports = []
        for i in data:
            i_fs = frozenset(i)
            for j in self.data:
                j_fs = frozenset(j)
                if i_fs.issubset(j_fs):
                    supports.append(i_fs)
                    continue
        return supports

    def flatten(self, data):
        if isinstance(data, list):
            if isinstance(data[0], list):
                return reduce(lambda x, y: x + y, data)
        return data

    def recommend(self, data):
        recs = []
        recs_tmp = []
        cnt = Counter()
        
        for i in data:
            i_fs = frozenset(i)
            for j in self.data:
                data_fs = frozenset(j)
                if i_fs.issubset(data_fs):
                    to_recommend = data_fs.difference(i_fs)
                    # The difference can be more than 2, but if we want
                    # to recommend only 1 item, we have to loop through it
                    for _, k in enumerate(list(to_recommend)):
                        union = i_fs.union([k])
                        if cnt[union] == 0:
                            recs_tmp.append((list(i), union))
                        cnt[union] += 1

        for i, to_recommend in recs_tmp:
            if cnt[to_recommend] > self.min_support:
                recs.append([i, list(to_recommend.difference(i))])
        return recs
        
    def count(self, data, min_support):
        cnt = Counter(data)
        support = {k: v for k, v in list(cnt.items())
                   if v > min_support}

        keys = list(support.keys())
        for i, v in enumerate(keys):
            if isinstance(v, frozenset):
                keys[i] = list(v)
        return keys
    
    def combination(self, data, r):
        '''Returns the unique combination of data'''
        # Flatten data if necessary to 1d
        if isinstance(data[0], list):
            data = list(set(reduce(lambda x, y: x + y, data)))
        combs = combinations(data, r = r)
        
        # We want an array, not tuples
        combs = [list(i) for i in combs]
        return combs
        
dataset = [list(i) for i in encoded_items]
recommendations = Apriori(dataset).score()
for i, v in recommendations.items():
    if len(v) > 0:
        for j in v:
            items_names = ', '.join(le.inverse_transform(j[0]))
            items_classes = ', '.join(le.inverse_transform(j[1]))
            print('If you buy {}, you might also like {}'.format(items_names, items_classes))

check: 2
check: 3
no more items to recommend
If you buy Eggs, Key-chain, you might also like Onion
If you buy Eggs, Key-chain, you might also like Yo-yo
If you buy Eggs, Key-chain, you might also like Mango
If you buy Key-chain, Mango, you might also like Yo-yo
If you buy Onion, Key-chain, you might also like Yo-yo
