In [1]:
import pandas as pd # 读取数据表并进行基于DataFrame结构的操作
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn import preprocessing
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
import re
%matplotlib inline

import warnings
warnings.filterwarnings('ignore') # 不显示warning信息
pd.options.display.width = 900 # Dataframe 显示宽度设置

首先我们从文件中读取数据。可以看到，共有9835条数据，每个数据两个属性，ID和items。

In [2]:
df = pd.read_csv("Datas.csv")
print(df.shape)
df.head(7)

(9835, 2)


Unnamed: 0,ID,items
0,1,"{citrus fruit,semi-finished bread,margarine,re..."
1,2,"{tropical fruit,yogurt,coffee}"
2,3,{whole milk}
3,4,"{pip fruit,yogurt,cream cheese ,meat spreads}"
4,5,"{other vegetables,whole milk,condensed milk,lo..."
5,6,"{whole milk,butter,yogurt,rice,abrasive cleaner}"
6,7,{rolls/buns}


接下来我们需要进行数据预处理，将数据转化为我们能够操作的形式。

原始数据存在于DataFrame中，首先我们将数据提取出来作为一个字典

In [3]:
data={}
for index,items in df.groupby('ID')['items']:
    data[index] = frozenset(items.values)
data 

{1: frozenset({'{citrus fruit,semi-finished bread,margarine,ready soups}'}),
 2: frozenset({'{tropical fruit,yogurt,coffee}'}),
 3: frozenset({'{whole milk}'}),
 4: frozenset({'{pip fruit,yogurt,cream cheese ,meat spreads}'}),
 5: frozenset({'{other vegetables,whole milk,condensed milk,long life bakery product}'}),
 6: frozenset({'{whole milk,butter,yogurt,rice,abrasive cleaner}'}),
 7: frozenset({'{rolls/buns}'}),
 8: frozenset({'{other vegetables,UHT-milk,rolls/buns,bottled beer,liquor (appetizer)}'}),
 9: frozenset({'{pot plants}'}),
 10: frozenset({'{whole milk,cereals}'}),
 11: frozenset({'{tropical fruit,other vegetables,white bread,bottled water,chocolate}'}),
 12: frozenset({'{citrus fruit,tropical fruit,whole milk,butter,curd,yogurt,flour,bottled water,dishes}'}),
 13: frozenset({'{beef}'}),
 14: frozenset({'{frankfurter,rolls/buns,soda}'}),
 15: frozenset({'{chicken,tropical fruit}'}),
 16: frozenset({'{butter,sugar,fruit/vegetable juice,newspapers}'}),
 17: frozenset({'{frui

原始数据提取出之后发现字典的值是字符串格式，多个项合成了一个字符串。我们无法直接对一个字符串进行操作，因此继续对字典的值进行字符串切割，提取出其中的项。

考虑到后面我们需要对项数进行统计，这里使用了不可变集合数据结构，每一个元素在添加后是确定的，可以生成一个对应的哈希值作为字典的关键字，便于我们后续处理。

In [4]:
Data={}
# tmp=set()
for item in data:
    for record in data[item]:
        for x in re.split(r'[{}]',record):
            if x == '':
                continue
            else:
#                 print(re.split(r'[,]',x))
                Data[item]= frozenset(re.split(r'[,]',x))                 
Data

{1: frozenset({'citrus fruit',
            'margarine',
            'ready soups',
            'semi-finished bread'}),
 2: frozenset({'coffee', 'tropical fruit', 'yogurt'}),
 3: frozenset({'whole milk'}),
 4: frozenset({'cream cheese ', 'meat spreads', 'pip fruit', 'yogurt'}),
 5: frozenset({'condensed milk',
            'long life bakery product',
            'other vegetables',
            'whole milk'}),
 6: frozenset({'abrasive cleaner', 'butter', 'rice', 'whole milk', 'yogurt'}),
 7: frozenset({'rolls/buns'}),
 8: frozenset({'UHT-milk',
            'bottled beer',
            'liquor (appetizer)',
            'other vegetables',
            'rolls/buns'}),
 9: frozenset({'pot plants'}),
 10: frozenset({'cereals', 'whole milk'}),
 11: frozenset({'bottled water',
            'chocolate',
            'other vegetables',
            'tropical fruit',
            'white bread'}),
 12: frozenset({'bottled water',
            'butter',
            'citrus fruit',
            'curd',
   

In [5]:
freq_1={}
for item in Data:
    for record in Data[item]:
        if frozenset([record]) in freq_1:
            freq_1[frozenset([record])] += 1
        else:
            freq_1[frozenset([record])]  = 1

第一频繁项集的候选集大小为169

In [6]:
len(freq_1)

169

设置支持度为400，提取出第一频繁项集

In [7]:
min_supp=400
freq_1_supp={v:freq_1[v] for v in freq_1 if freq_1[v]>=min_supp}
freq_1_supp

{frozenset({'margarine'}): 576,
 frozenset({'citrus fruit'}): 814,
 frozenset({'coffee'}): 571,
 frozenset({'yogurt'}): 1372,
 frozenset({'tropical fruit'}): 1032,
 frozenset({'whole milk'}): 2513,
 frozenset({'pip fruit'}): 744,
 frozenset({'other vegetables'}): 1903,
 frozenset({'butter'}): 545,
 frozenset({'rolls/buns'}): 1809,
 frozenset({'bottled beer'}): 792,
 frozenset({'white bread'}): 414,
 frozenset({'chocolate'}): 488,
 frozenset({'bottled water'}): 1087,
 frozenset({'curd'}): 524,
 frozenset({'beef'}): 516,
 frozenset({'soda'}): 1715,
 frozenset({'frankfurter'}): 580,
 frozenset({'chicken'}): 422,
 frozenset({'newspapers'}): 785,
 frozenset({'fruit/vegetable juice'}): 711,
 frozenset({'pastry'}): 875,
 frozenset({'root vegetables'}): 1072,
 frozenset({'canned beer'}): 764,
 frozenset({'sausage'}): 924,
 frozenset({'brown bread'}): 638,
 frozenset({'shopping bags'}): 969,
 frozenset({'napkins'}): 515,
 frozenset({'pork'}): 567,
 frozenset({'whipped/sour cream'}): 705,
 froze

In [9]:
def getSubset(item, k):
    
    import itertools as its
    return [frozenset(item) for item in its.combinations(item, k)]

def getAllSubsets(item):
    
    subsets = []
    
    for i in range(len(item) - 1):
    
        subsets.extend(getSubset(item, i + 1))
                       
    return subsets

In [38]:
def get_frequent_item_list_with_spuu(frequent_k,min_supp,k,item_list):
    items = frequent_k.keys()
    condidate_items=[]
    current_k={}
    candidate_items=[m.union(n) for m in items for n in items if m!=n and len(m.union(n))==k] # 合并集合,连接，生成候选项集
    for record in item_list.items():
          for item in candidate_items:
            if item.issubset(record[1]):
                if item in current_k:
                    current_k[item]+=1
                else:
                    current_k[item]=1
    return {v:current_k[v] for v in current_k if current_k[v]>=min_supp}

        

In [40]:
k = 2
final_itemsets = []
final_itemsets.append(freq_1_supp)
min_conf=0.6
frequent_k_minus_one = freq_1_supp
while k<=6:
    print(len(frequent_k_minus_one))
    print(frequent_k_minus_one)
    print("")
    frequent_k=get_frequent_item_list_with_spuu(frequent_k_minus_one,min_supp,k,Data)
    final_itemsets.append(frequent_k)
    frequent_k_minus_one=frequent_k
    k+=1
    

32
{frozenset({'margarine'}): 576, frozenset({'citrus fruit'}): 814, frozenset({'coffee'}): 571, frozenset({'yogurt'}): 1372, frozenset({'tropical fruit'}): 1032, frozenset({'whole milk'}): 2513, frozenset({'pip fruit'}): 744, frozenset({'other vegetables'}): 1903, frozenset({'butter'}): 545, frozenset({'rolls/buns'}): 1809, frozenset({'bottled beer'}): 792, frozenset({'white bread'}): 414, frozenset({'chocolate'}): 488, frozenset({'bottled water'}): 1087, frozenset({'curd'}): 524, frozenset({'beef'}): 516, frozenset({'soda'}): 1715, frozenset({'frankfurter'}): 580, frozenset({'chicken'}): 422, frozenset({'newspapers'}): 785, frozenset({'fruit/vegetable juice'}): 711, frozenset({'pastry'}): 875, frozenset({'root vegetables'}): 1072, frozenset({'canned beer'}): 764, frozenset({'sausage'}): 924, frozenset({'brown bread'}): 638, frozenset({'shopping bags'}): 969, frozenset({'napkins'}): 515, frozenset({'pork'}): 567, frozenset({'whipped/sour cream'}): 705, frozenset({'domestic eggs'}): 62

In [41]:
final_itemset

[{frozenset({'margarine'}): 576,
  frozenset({'citrus fruit'}): 814,
  frozenset({'coffee'}): 571,
  frozenset({'yogurt'}): 1372,
  frozenset({'tropical fruit'}): 1032,
  frozenset({'whole milk'}): 2513,
  frozenset({'pip fruit'}): 744,
  frozenset({'other vegetables'}): 1903,
  frozenset({'butter'}): 545,
  frozenset({'rolls/buns'}): 1809,
  frozenset({'bottled beer'}): 792,
  frozenset({'white bread'}): 414,
  frozenset({'chocolate'}): 488,
  frozenset({'bottled water'}): 1087,
  frozenset({'curd'}): 524,
  frozenset({'beef'}): 516,
  frozenset({'soda'}): 1715,
  frozenset({'frankfurter'}): 580,
  frozenset({'chicken'}): 422,
  frozenset({'newspapers'}): 785,
  frozenset({'fruit/vegetable juice'}): 711,
  frozenset({'pastry'}): 875,
  frozenset({'root vegetables'}): 1072,
  frozenset({'canned beer'}): 764,
  frozenset({'sausage'}): 924,
  frozenset({'brown bread'}): 638,
  frozenset({'shopping bags'}): 969,
  frozenset({'napkins'}): 515,
  frozenset({'pork'}): 567,
  frozenset({'whip

In [44]:
association_rules=[]
for itemset in final_itemsets:# iter list
    for value in itemset:
        for condition in getAllSubsets(value):
            conclusion_items=[x for x in value if x not in condition]
            if len(conclusion_items)>0:
                confidence=float(final_itemsets[len(value)-1][value])/float(final_itemsets[len(condition)-1][condition])
                if confidence>min_conf:
                    association_rules.append([[condition,conclusion_items],[confidence]])

In [45]:
len(association_rules)

410

In [46]:
association_rules

[[[frozenset({'other vegetables'}), ['whole milk']], [0.7735155018392013]],
 [[frozenset({'yogurt'}), ['whole milk']], [0.8032069970845481]],
 [[frozenset({'butter'}), ['whole milk']], [0.9944954128440368]],
 [[frozenset({'tropical fruit'}), ['other vegetables']], [0.6841085271317829]],
 [[frozenset({'citrus fruit'}), ['whole milk']], [0.7371007371007371]],
 [[frozenset({'tropical fruit'}), ['whole milk']], [0.8062015503875969]],
 [[frozenset({'bottled water'}), ['whole milk']], [0.6218951241950322]],
 [[frozenset({'curd'}), ['whole milk']], [0.9809160305343512]],
 [[frozenset({'root vegetables'}), ['other vegetables']],
  [0.8694029850746269]],
 [[frozenset({'sausage'}), ['rolls/buns']], [0.6515151515151515]],
 [[frozenset({'root vegetables'}), ['whole milk']], [0.8973880597014925]],
 [[frozenset({'pork'}), ['whole milk']], [0.7689594356261023]],
 [[frozenset({'whipped/sour cream'}), ['whole milk']], [0.8992907801418439]],
 [[frozenset({'pork'}), ['other vegetables']], [0.751322751322