In [1]:
import pandas as pd
import time
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [2]:
# 数据加载
data = pd.read_csv('BreadBasket_DMS.csv')
# 统一小写
data['Item'] = data['Item'].str.lower()
# 去掉none项
data = data.drop(data[data.Item == 'none'].index)

In [3]:
def encode_units(x):
    if x <= 0:
        return 0
    if x >= 1:
        return 1

In [4]:
pd.options.display.max_columns = 100
start = time.time()
hot_encoded_df = data.groupby(['Transaction','Item'])['Item'].count().unstack().reset_index().fillna(0).set_index('Transaction')
hot_encoded_df = hot_encoded_df.applymap(encode_units)
frequent_itemsets = apriori(hot_encoded_df, min_support=0.02, use_colnames=True)
rules = association_rules(frequent_itemsets, metric='lift', min_threshold=0.5)
print("频繁项集: ", frequent_itemsets)
print("关联规则: ", rules[ (rules['lift'] >= 1) & (rules['confidence'] >= 0.5)])
# print(rules['confidence'])
end = time.time()
print("用时: ", end-start)

频繁项集:       support                 itemsets
0   0.036348              (alfajores)
1   0.327134                  (bread)
2   0.040046                (brownie)
3   0.103867                   (cake)
4   0.478445                 (coffee)
5   0.054417                (cookies)
6   0.039201             (farm house)
7   0.058326          (hot chocolate)
8   0.038567                  (juice)
9   0.061813              (medialuna)
10  0.038462                 (muffin)
11  0.086116                 (pastry)
12  0.071851               (sandwich)
13  0.029057           (scandinavian)
14  0.034552                  (scone)
15  0.034446                   (soup)
16  0.142646                    (tea)
17  0.033601                  (toast)
18  0.020287               (truffles)
19  0.023352            (bread, cake)
20  0.090025          (bread, coffee)
21  0.029163          (bread, pastry)
22  0.028107             (bread, tea)
23  0.054734           (cake, coffee)
24  0.023774              (tea, cake)
25  0

In [5]:
print(hot_encoded_df)

Item         adjustment  afternoon with the baker  alfajores  argentina night  \
Transaction                                                                     
2                     0                         0          0                0   
3                     0                         0          0                0   
4                     0                         0          0                0   
5                     0                         0          0                0   
6                     0                         0          0                0   
...                 ...                       ...        ...              ...   
9680                  0                         0          0                0   
9681                  0                         0          0                0   
9682                  0                         0          0                0   
9683                  0                         0          0                0   
9684                  0     

In [6]:
print(data)

Date      Time  Transaction           Item
1      2016/10/30  10:05:34            2   scandinavian
2      2016/10/30  10:05:34            2   scandinavian
3      2016/10/30  10:07:57            3  hot chocolate
4      2016/10/30  10:07:57            3            jam
5      2016/10/30  10:07:57            3        cookies
...           ...       ...          ...            ...
21288    2017/4/9  14:32:58         9682         coffee
21289    2017/4/9  14:32:58         9682            tea
21290    2017/4/9  14:57:06         9683         coffee
21291    2017/4/9  14:57:06         9683         pastry
21292    2017/4/9  15:04:24         9684      smoothies

[20506 rows x 4 columns]


In [5]:
hot_encoded_df = data.groupby(['Transaction','Item'])['Item'].count().unstack().reset_index().fillna(0).set_index('Transaction')
print(hot_encoded_df)

Item         adjustment  afternoon with the baker  alfajores  argentina night  \
Transaction                                                                     
2                   0.0                       0.0        0.0              0.0   
3                   0.0                       0.0        0.0              0.0   
4                   0.0                       0.0        0.0              0.0   
5                   0.0                       0.0        0.0              0.0   
6                   0.0                       0.0        0.0              0.0   
...                 ...                       ...        ...              ...   
9680                0.0                       0.0        0.0              0.0   
9681                0.0                       0.0        0.0              0.0   
9682                0.0                       0.0        0.0              0.0   
9683                0.0                       0.0        0.0              0.0   
9684                0.0     

In [12]:
data1 = data.groupby(['Transaction','Item'])['Item'].count().unstack().reset_index().fillna(0).set_index('Transaction')
print(data1)

Item         adjustment  afternoon with the baker  alfajores  argentina night  \
Transaction                                                                     
2                   0.0                       0.0        0.0              0.0   
3                   0.0                       0.0        0.0              0.0   
4                   0.0                       0.0        0.0              0.0   
5                   0.0                       0.0        0.0              0.0   
6                   0.0                       0.0        0.0              0.0   
...                 ...                       ...        ...              ...   
9680                0.0                       0.0        0.0              0.0   
9681                0.0                       0.0        0.0              0.0   
9682                0.0                       0.0        0.0              0.0   
9683                0.0                       0.0        0.0              0.0   
9684                0.0     

In [8]:
data2 = data.groupby(['Transaction','Item'])['Item'].count()
print(data2)

Transaction  Item         
2            scandinavian     2
3            cookies          1
             hot chocolate    1
             jam              1
4            muffin           1
                             ..
9682         tacos/fajita     1
             tea              1
9683         coffee           1
             pastry           1
9684         smoothies        1
Name: Item, Length: 18886, dtype: int64


In [9]:
data3 = data.groupby(['Transaction','Item'])['Item'].count().unstack()
print(data3)

Item         adjustment  afternoon with the baker  alfajores  argentina night  \
Transaction                                                                     
2                   NaN                       NaN        NaN              NaN   
3                   NaN                       NaN        NaN              NaN   
4                   NaN                       NaN        NaN              NaN   
5                   NaN                       NaN        NaN              NaN   
6                   NaN                       NaN        NaN              NaN   
...                 ...                       ...        ...              ...   
9680                NaN                       NaN        NaN              NaN   
9681                NaN                       NaN        NaN              NaN   
9682                NaN                       NaN        NaN              NaN   
9683                NaN                       NaN        NaN              NaN   
9684                NaN     

In [11]:
data4 = data.groupby(['Transaction','Item'])['Item'].count().unstack().reset_index().fillna(0)
print(data4)

Item  Transaction  adjustment  afternoon with the baker  alfajores  \
0               2         0.0                       0.0        0.0   
1               3         0.0                       0.0        0.0   
2               4         0.0                       0.0        0.0   
3               5         0.0                       0.0        0.0   
4               6         0.0                       0.0        0.0   
...           ...         ...                       ...        ...   
9459         9680         0.0                       0.0        0.0   
9460         9681         0.0                       0.0        0.0   
9461         9682         0.0                       0.0        0.0   
9462         9683         0.0                       0.0        0.0   
9463         9684         0.0                       0.0        0.0   

Item  argentina night  art tray  bacon  baguette  bakewell  bare popcorn  ...  \
0                 0.0       0.0    0.0       0.0       0.0           0.0  ... 