In [1]:
import pandas as pd
from pli import (
    generate_pli, 
    pli_intersection_optimized
)

Make a toy dataset to play with

In [2]:
data = {
    'a': ['C', 'A', 'A', 'B', 'C'],
    'b': [3, 1, 1, 1, 1],
    'c': ['X', 'X', 'Y', 'X', 'X'],
    'd': [722, 289, 189, 289, 289],
    'e': [112, 553, 583, 513 ,553]
}

df = pd.DataFrame(data)
df

Unnamed: 0,a,b,c,d,e
0,C,3,X,722,112
1,A,1,X,289,553
2,A,1,Y,189,583
3,B,1,X,289,513
4,C,1,X,289,553


<div>
<img src="img/apriori_ucc.png" width="600"/>
</div>

In [3]:
def prepare_Fk(df):
    
    Fk = []
    for col in df.columns:
        pli = generate_pli(df[col])
        Fk.append((col, pli))

    return Fk

Fk = prepare_Fk(df)
Fk

[('a', {1: [0, 4], 2: [1, 2]}),
 ('b', {2: [1, 2, 3, 4]}),
 ('c', {1: [0, 1, 3, 4]}),
 ('d', {2: [1, 3, 4]}),
 ('e', {2: [1, 4]})]

In [59]:
def generate_candidates(Fk):
    """
    Generates candidates for UCC (unique column combination) 
    discovery in Apriori algorithm.

    Input: Fk = [(f1, pli1), (f2, pli2), ..., (fk, plik)]
    Output: Candidates for the next level in the same format as Fk.
    """
    E = []

    schema = [f[0] for f, _ in Fk]

    # traverse all pairs of non-unique column combinations 
    # (if column is unique, than all generated supersets or childs will be also unique!)
    # that share the same maximal prefix (they are in lexicographic order) 
    # and differ only in one last attribute

    for f1, pli1 in Fk:
        for f2, pli2 in Fk:

            # use lexicographic order
            if f1 < f2:

                # check if f1 and f2 share the same maximal prefix 
                # and differ only in the last attribute
                if f1[:-1] == f2[:-1]:

                    # make new CC with k+1 attributes 
                    # using a union of f1 and f2
                    f = f1 + f2[-1]

                    # and make an intersection of PLIs
                    pli = pli_intersection_optimized(pli1, pli2)

                    # if all subsets with k attributes are non-unique
                    if all((set(f) - set(i)).issubset(schema) for i in set(f)):
                        E.append((''.join(f), pli))             
                               
    return E

In [60]:
l1 = generate_candidates(Fk)
l1

[('ab', {(2, 2): [1, 2]}),
 ('ac', {(1, 1): [0, 4]}),
 ('ad', {}),
 ('ae', {}),
 ('bc', {(2, 1): [1, 3, 4]}),
 ('bd', {(2, 2): [1, 3, 4]}),
 ('be', {(2, 2): [1, 4]}),
 ('cd', {(1, 2): [1, 3, 4]}),
 ('ce', {(1, 2): [1, 4]}),
 ('de', {(2, 2): [1, 4]})]

In [62]:
l2 = generate_candidates(l1)
l2

[('abc', {}), ('abd', {}), ('acd', {}), ('bcd', {((2, 1), (2, 2)): [1, 3, 4]})]