In [1]:
import pandas as pd
import time

In [2]:
# Load the transaction log from disk
data = pd.read_csv('BreadBasket_DMS.csv')
# Lower-case every item name
data = data.assign(Item=data['Item'].str.lower())
# Keep only the rows whose item is not the literal 'none'
data = data[data['Item'] != 'none']

In [3]:
# Association-rule mining with the efficient_apriori package
def rule1(min_support=0.02, min_confidence=0.5):
    """Mine frequent itemsets and association rules with ``efficient_apriori``.

    Reads the module-level ``data`` frame (columns ``Transaction`` and
    ``Item``), builds one set of items per transaction id, runs Apriori on
    those sets and prints the itemsets, the rules and the elapsed time.

    Args:
        min_support: Minimum support forwarded to ``apriori``.
        min_confidence: Minimum confidence forwarded to ``apriori``.

    Returns:
        None. Results are printed.
    """
    # Imported locally: rule2 imports a different function that is also
    # called ``apriori`` (from mlxtend), so a module-level import would clash.
    from efficient_apriori import apriori
    start = time.perf_counter()
    # Group items by transaction id. Keying a dict on the id (instead of
    # comparing against a "previous id" sentinel) works for any first id,
    # including 0, and merges rows of the same transaction even when they
    # are not adjacent. Dict insertion order keeps first-appearance order.
    baskets = {}
    for transaction_id, item in zip(data['Transaction'], data['Item']):
        baskets.setdefault(transaction_id, set()).add(item)
    transactions = list(baskets.values())

    # Mine frequent itemsets and association rules
    itemsets, rules = apriori(transactions, min_support=min_support, min_confidence=min_confidence)
    print('频繁项集: ', itemsets)
    print('关联规则: ', rules)
    end = time.perf_counter()
    print('用时: ',end-start)

In [4]:
def encode_units(x):
    """Binarise a purchase count into a 0/1 presence flag.

    Args:
        x: A numeric count.

    Returns:
        1 when ``x >= 1``, otherwise 0. Values between 0 and 1 and NaN
        also map to 0, so the function never falls through and returns None.
    """
    return 1 if x >= 1 else 0

In [5]:
# Association-rule mining with the mlxtend.frequent_patterns package
def rule2(min_support=0.02, min_confidence=0.5):
    """Mine frequent itemsets and association rules with ``mlxtend``.

    Reads the module-level ``data`` frame (columns ``Transaction`` and
    ``Item``), pivots it into a transaction-by-item boolean matrix, runs
    Apriori, derives rules and prints the itemsets, the rules whose lift is
    at least 1 and whose confidence is at least ``min_confidence``, and the
    elapsed time.

    Args:
        min_support: Minimum support forwarded to ``apriori``.
        min_confidence: Confidence cut-off applied to the printed rules.

    Returns:
        None. Results are printed.

    Side effects:
        Sets ``pd.options.display.max_columns`` to 100.
    """
    # Imported locally: rule1 imports a different function that is also
    # called ``apriori`` (from efficient_apriori).
    from mlxtend.frequent_patterns import apriori
    from mlxtend.frequent_patterns import association_rules
    pd.options.display.max_columns = 100
    start = time.perf_counter()
    # Rows are transactions, columns are items, cells are purchase counts;
    # item/transaction pairs that never occur are filled with 0.
    item_counts = data.groupby(['Transaction','Item'])['Item'].count().unstack(fill_value=0)
    # Vectorised binarisation to a boolean matrix, replacing the per-cell
    # ``applymap`` callback (deprecated in recent pandas).
    hot_encoded_df = item_counts >= 1
    frequent_itemsets = apriori(hot_encoded_df, min_support=min_support, use_colnames=True)
    rules = association_rules(frequent_itemsets, metric='lift', min_threshold=0.5)
    print("频繁项集: ", frequent_itemsets)
    print("关联规则: ", rules[ (rules['lift'] >= 1) & (rules['confidence'] >= min_confidence)])
    end = time.perf_counter()
    print("用时: ", end-start)

In [6]:
# Run the efficient_apriori version first
rule1()
# Print a line of 100 dashes between the two reports
print('-' * 100)
# Then run the mlxtend version
rule2()

频繁项集:  {1: {('alfajores',): 344, ('bread',): 3096, ('brownie',): 379, ('cake',): 983, ('coffee',): 4528, ('cookies',): 515, ('farm house',): 371, ('hot chocolate',): 552, ('juice',): 365, ('medialuna',): 585, ('muffin',): 364, ('pastry',): 815, ('sandwich',): 680, ('scandinavian',): 275, ('scone',): 327, ('soup',): 326, ('tea',): 1350, ('toast',): 318, ('truffles',): 192}, 2: {('bread', 'cake'): 221, ('bread', 'coffee'): 852, ('bread', 'pastry'): 276, ('bread', 'tea'): 266, ('cake', 'coffee'): 518, ('cake', 'tea'): 225, ('coffee', 'cookies'): 267, ('coffee', 'hot chocolate'): 280, ('coffee', 'juice'): 195, ('coffee', 'medialuna'): 333, ('coffee', 'pastry'): 450, ('coffee', 'sandwich'): 362, ('coffee', 'tea'): 472, ('coffee', 'toast'): 224}}
关联规则:  [{cake} -> {coffee}, {cookies} -> {coffee}, {hot chocolate} -> {coffee}, {juice} -> {coffee}, {medialuna} -> {coffee}, {pastry} -> {coffee}, {sandwich} -> {coffee}, {toast} -> {coffee}]
用时:  0.12666034698486328
-------------------------------