# 第十二章: 使用FP-growth算法来高效发现频繁项集

# 12.2 构建FP树

In [1]:
import fpGrowth

In [2]:
# Create a single tree node labelled 'pyramid' with value 9.
# NOTE(review): the third argument (None) is presumably the parent link - confirm in fpGrowth.treeNode.
rootNode = fpGrowth.treeNode('pyramid', 9, None)

In [3]:
# Attach a node labelled 'eye' (value 13) to the root by storing it in the root's
# children mapping under its own label.
rootNode.children['eye'] = fpGrowth.treeNode('eye', 13, None)

In [4]:
# Print the tree: each node's label and value, with children indented under their parent.
rootNode.disp()

   pyramid   9
     eye   13


In [5]:
# Attach a second child, labelled 'phoenix' with value 3, to the same root node.
rootNode.children['phoenix'] = fpGrowth.treeNode('phoenix', 3, None)

In [6]:
# Print the tree again; both children are now listed beneath the root.
rootNode.disp()

   pyramid   9
     eye   13
     phoenix   3


In [7]:
# Fetch the sample transactions from fpGrowth.loadSimpDat() (a list of item lists)
# and echo them.
simpDat = fpGrowth.loadSimpDat()
simpDat

[['r', 'z', 'h', 'j', 'p'],
 ['z', 'y', 'x', 'w', 'v', 'u', 't', 's'],
 ['z'],
 ['r', 'x', 'n', 'o', 's'],
 ['y', 'r', 'x', 'z', 'q', 't', 'p'],
 ['y', 'z', 'x', 'e', 'q', 's', 't', 'm']]

In [8]:
# Convert the transaction list into a dict keyed by frozenset(transaction) whose
# values are counts (every count is 1 for this sample data), then echo it.
initSet = fpGrowth.createInitSet(simpDat)
initSet

{frozenset({'z'}): 1,
 frozenset({'h', 'j', 'p', 'r', 'z'}): 1,
 frozenset({'s', 't', 'u', 'v', 'w', 'x', 'y', 'z'}): 1,
 frozenset({'n', 'o', 'r', 's', 'x'}): 1,
 frozenset({'p', 'q', 'r', 't', 'x', 'y', 'z'}): 1,
 frozenset({'e', 'm', 'q', 's', 't', 'x', 'y', 'z'}): 1}

In [9]:
# Build the FP-tree and its header table from the transaction dict.
# NOTE(review): the second argument (3) is presumably the minimum support count -
# confirm in fpGrowth.createTree.
myFPTree, myHeaderTab = fpGrowth.createTree(initSet, 3)

In [10]:
# Print the FP-tree, one node per line with its count, indented by depth.
myFPTree.disp()

   Null Set   1
     z   5
       r   1
       x   3
         t   3
           y   2
             s   2
           r   1
             y   1
     x   1
       r   1
         s   1


## Practice

In [11]:
# Practice: count item occurrences by hand.
dataSet = initSet
headerTable = {}
# First of two passes over dataSet: add each transaction's count to the running
# total of every item it contains, printing the transaction, the item and the
# transaction count along the way.
for trans, transCount in dataSet.items():
    print(trans)
    for item in trans:
        print(item)
        print(transCount)
        headerTable[item] = transCount + headerTable.get(item, 0)

frozenset({'h', 'r', 'j', 'p', 'z'})
h
1
r
1
j
1
p
1
z
1
frozenset({'v', 'x', 'u', 't', 'y', 'w', 's', 'z'})
v
1
x
1
u
1
t
1
y
1
w
1
s
1
z
1
frozenset({'z'})
z
1
frozenset({'x', 'n', 'o', 'r', 's'})
x
1
n
1
o
1
r
1
s
1
frozenset({'x', 't', 'r', 'y', 'p', 'z', 'q'})
x
1
t
1
r
1
y
1
p
1
z
1
q
1
frozenset({'e', 'x', 't', 'y', 's', 'z', 'm', 'q'})
e
1
x
1
t
1
y
1
s
1
z
1
m
1
q
1


In [12]:
# Echo the per-item occurrence totals gathered by the counting loop.
headerTable

{'e': 1,
 'h': 1,
 'j': 1,
 'm': 1,
 'n': 1,
 'o': 1,
 'p': 2,
 'q': 2,
 'r': 3,
 's': 3,
 't': 3,
 'u': 1,
 'v': 1,
 'w': 1,
 'x': 4,
 'y': 3,
 'z': 5}

In [13]:
# Gather every item name that has a count into a set. No filtering is applied at
# this point, so all items are kept regardless of how rarely they occur.
freqItemSet = set(headerTable)

In [14]:
# Echo the set of item names.
freqItemSet

{'e',
 'h',
 'j',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z'}

In [15]:
# Replace each plain count with a two-slot list [count, None]; the second slot
# starts out empty. Only values of existing keys are reassigned, so updating the
# dict while iterating over it is safe.
for name, total in headerTable.items():
    headerTable[name] = [total, None]

In [16]:
# Echo the header table, whose values are now [count, None] lists.
headerTable

{'e': [1, None],
 'h': [1, None],
 'j': [1, None],
 'm': [1, None],
 'n': [1, None],
 'o': [1, None],
 'p': [2, None],
 'q': [2, None],
 'r': [3, None],
 's': [3, None],
 't': [3, None],
 'u': [1, None],
 'v': [1, None],
 'w': [1, None],
 'x': [4, None],
 'y': [3, None],
 'z': [5, None]}

In [17]:
# List the item names in ascending order of occurrence count.
# The key is the count alone (p[1][0]) rather than the whole [count, link] entry:
# comparing whole entries falls through to the second slot when counts tie, and
# that raises TypeError as soon as the slot holds values that cannot be ordered.
# sorted() is stable, so items with equal counts keep the dict's iteration order.
bigL = [v[0] for v in sorted(headerTable.items(), key=lambda p: p[1][0])]
bigL

['h',
 'j',
 'v',
 'u',
 'w',
 'n',
 'o',
 'e',
 'm',
 'p',
 'q',
 'r',
 't',
 'y',
 's',
 'x',
 'z']

In [18]:
# Echo the (frozenset, count) pairs held in dataSet.
dataSet.items()

dict_items([(frozenset({'h', 'r', 'j', 'p', 'z'}), 1), (frozenset({'v', 'x', 'u', 't', 'y', 'w', 's', 'z'}), 1), (frozenset({'z'}), 1), (frozenset({'x', 'n', 'o', 'r', 's'}), 1), (frozenset({'x', 't', 'r', 'y', 'p', 'z', 'q'}), 1), (frozenset({'e', 'x', 't', 'y', 's', 'z', 'm', 'q'}), 1)])

In [19]:
# Second pass over the data set: for each transaction, look up the count of each
# of its items and order the items from most to least frequent.
for tranSet, count in dataSet.items():  # count is not used in this practice cell
    localD = {}
    for item in tranSet:
        if item in freqItemSet:
            localD[item] = headerTable[item][0]
    # Sort once per transaction, after all of its items have been collected
    # (sorting inside the item loop would redo the work for every item).
    if localD:
        orderedItems = [v[0] for v in sorted(localD.items(), key=lambda p: p[1], reverse=True)]

In [20]:
# Echo the ordering computed for the last transaction the loop processed.
orderedItems

['z', 'x', 't', 'y', 's', 'q', 'e', 'm']

In [21]:
# Echo the item -> count mapping built for the last transaction the loop processed.
localD

{'e': 1, 'm': 1, 'q': 2, 's': 3, 't': 3, 'x': 4, 'y': 3, 'z': 5}

# 12.3 从一棵FP树中挖掘频繁项集

In [22]:
# Collect the prefix paths for item 'x', starting from the value stored at index 1
# of its header-table entry. The result maps each path (a frozenset) to a count.
# NOTE(review): index 1 is presumably the link to the first 'x' node - confirm in fpGrowth.createTree.
fpGrowth.findPrefixPath('x', myHeaderTab['x'][1])

{frozenset({'z'}): 3}

In [23]:
# Same query for item 'z'; the result for this tree is an empty dict.
fpGrowth.findPrefixPath('z', myHeaderTab['z'][1])

{}

In [24]:
# Same query for item 'r'; this tree yields three distinct paths for it.
fpGrowth.findPrefixPath('r', myHeaderTab['r'][1])

{frozenset({'z'}): 1, frozenset({'x'}): 1, frozenset({'t', 'x', 'z'}): 1}

In [25]:
# Mine the sample FP-tree, starting from an empty prefix set; mineTree fills the
# freqItems list that is passed in.
freqItems = []
fpGrowth.mineTree(myFPTree, myHeaderTab, 3, set(), freqItems)

In [26]:
# Echo the item sets collected by mineTree.
freqItems

[{'s'},
 {'s', 'x'},
 {'t'},
 {'t', 'z'},
 {'t', 'x'},
 {'t', 'x', 'z'},
 {'r'},
 {'y'},
 {'y', 'z'},
 {'t', 'y', 'z'},
 {'t', 'y'},
 {'x', 'y'},
 {'t', 'x', 'y'},
 {'x', 'y', 'z'},
 {'t', 'x', 'y', 'z'},
 {'x'},
 {'x', 'z'},
 {'z'}]

# 12.4 示例: 在Twitter源中发现一些共现词

# 12.5 示例: 从新闻网站点击流中挖掘

In [32]:
# Read the click-stream file and split every line into its whitespace-separated
# tokens. The with-block closes the file handle as soon as parsing finishes
# instead of leaving it open until garbage collection.
with open('kosarak.dat') as dataFile:
    parseDat = [line.split() for line in dataFile]

In [33]:
# Build the transaction dict for the parsed click-stream data.
# This rebinds initSet, which previously held the sample-data dict.
initSet = fpGrowth.createInitSet(parseDat)

In [34]:
# Build the FP-tree and header table for the click-stream data with threshold 88000.
# Note the tree is bound to myFPtree (lower-case "t"), a different name from the
# earlier myFPTree, while myHeaderTab is rebound and replaces the earlier header table.
# NOTE(review): 88000 is presumably the minimum support count - confirm in fpGrowth.createTree.
myFPtree, myHeaderTab = fpGrowth.createTree(initSet, 88000)

In [35]:
# Mine the click-stream FP-tree with the same 88000 threshold, starting from an
# empty prefix set; mineTree fills the myFreqList list that is passed in.
myFreqList = []
fpGrowth.mineTree(myFPtree, myHeaderTab, 88000, set(), myFreqList)

In [36]:
# Echo the item sets collected by mineTree for the click-stream data.
myFreqList

[{'1'},
 {'1', '6'},
 {'3'},
 {'11', '3'},
 {'11', '3', '6'},
 {'3', '6'},
 {'11'},
 {'11', '6'},
 {'6'}]