# 商务案例: 购物篮数据关联分析
# 1. 关联分析Python库 - mlxtend
**mlxtend** (http://rasbt.github.io/mlxtend/) 是一款高级的机器学习扩展库，可用于日常机器学习任务的主要工具，也可以作为sklearn的一个补充和辅助工具。 

In [3]:
import pandas as pd
# pip install mlxtend
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori

# 2. 针对购物篮数据开展关联分析
## 2.1. 构造示例数据集如下：

In [6]:
dataset = [['A', 'B', 'C'], ['A', 'D', 'E'], ['A', 'C'], ['D', 'E'], ['B', 'C'], ['A', 'C', 'D', 'E'], ['B', 'C'], ['A', 'C', 'E'], ['B', 'E']]


In [7]:
dataset

[['A', 'B', 'C'],
 ['A', 'D', 'E'],
 ['A', 'C'],
 ['D', 'E'],
 ['B', 'C'],
 ['A', 'C', 'D', 'E'],
 ['B', 'C'],
 ['A', 'C', 'E'],
 ['B', 'E']]

## 2.2. 将上述数据集转换为购物篮数据

In [8]:
te = TransactionEncoder()

In [9]:
te_ary = te.fit(dataset).transform(dataset)
te_ary

array([[ True,  True,  True, False, False],
       [ True, False, False,  True,  True],
       [ True, False,  True, False, False],
       [False, False, False,  True,  True],
       [False,  True,  True, False, False],
       [ True, False,  True,  True,  True],
       [False,  True,  True, False, False],
       [ True, False,  True, False,  True],
       [False,  True, False, False,  True]])

In [10]:
df = pd.DataFrame(te_ary, columns = te.columns_)
df

Unnamed: 0,A,B,C,D,E
0,True,True,True,False,False
1,True,False,False,True,True
2,True,False,True,False,False
3,False,False,False,True,True
4,False,True,True,False,False
5,True,False,True,True,True
6,False,True,True,False,False
7,True,False,True,False,True
8,False,True,False,False,True


## 2.3. 使用Apriori算法找出频繁项集

In [13]:
from mlxtend.frequent_patterns import apriori
freq = apriori(df, min_support = 0.4, use_colnames = True)

In [14]:
freq

Unnamed: 0,support,itemsets
0,0.555556,(A)
1,0.444444,(B)
2,0.666667,(C)
3,0.555556,(E)
4,0.444444,"(A, C)"


In [16]:
freq = apriori(df, min_support = 0.4, use_colnames = False)
freq

Unnamed: 0,support,itemsets
0,0.555556,(0)
1,0.444444,(1)
2,0.666667,(2)
3,0.555556,(4)
4,0.444444,"(0, 2)"


In [21]:
freq = apriori(df, min_support = 0.4, use_colnames = True)

## 2.4. 生成关联规则

In [22]:
from mlxtend.frequent_patterns import association_rules
Rules = association_rules(freq, metric = "confidence", min_threshold = 0.75)
Rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(A),(C),0.555556,0.666667,0.444444,0.8,1.2,0.074074,1.666667,0.375


In [14]:
type(Rules)

pandas.core.frame.DataFrame

In [15]:
Rules2 = Rules[['antecedents', 'consequents', 'support', 'confidence']]
Rules2

Unnamed: 0,antecedents,consequents,support,confidence
0,(1),(6),0.6,1.0
1,(1),(10),0.6,1.0
2,(2),(6),0.6,1.0
3,(9),(6),0.6,1.0
4,(10),(6),0.8,1.0
5,(6),(10),0.8,0.8
6,"(1, 10)",(6),0.6,1.0
7,"(1, 6)",(10),0.6,1.0
8,(1),"(10, 6)",0.6,1.0


In [16]:
Rules3 = Rules2[Rules2['confidence'] >= 0.9]
Rules3

Unnamed: 0,antecedents,consequents,support,confidence
0,(1),(6),0.6,1.0
1,(1),(10),0.6,1.0
2,(2),(6),0.6,1.0
3,(9),(6),0.6,1.0
4,(10),(6),0.8,1.0
6,"(1, 10)",(6),0.6,1.0
7,"(1, 6)",(10),0.6,1.0
8,(1),"(10, 6)",0.6,1.0


# 3. 从外部导入数据进行分析

In [17]:
import numpy as np
import matplotlib.pyplot as plt

In [18]:
# 读取外部数据
purchase_data = pd.read_csv('./PurchaseData.csv', header = None, keep_default_na = False)

In [19]:
purchase_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
1,burgers,meatballs,eggs,,,,,,,,,,,,,,,,,
2,chutney,,,,,,,,,,,,,,,,,,,
3,turkey,avocado,,,,,,,,,,,,,,,,,,
4,mineral water,milk,energy bar,whole wheat rice,green tea,,,,,,,,,,,,,,,


In [20]:
num_records = len(purchase_data)
num_records

7501

## 3.1. 数据预处理

In [21]:
records = []
for i in range(0, num_records):
    records.append([str(purchase_data.values[i, j]) for j in range(0, 20)])

In [22]:
records[2]
# 去除空值
for i in range(0, num_records):
     records[i] = [x for x in records[i] if x] 

In [23]:
records[1]

['burgers', 'meatballs', 'eggs']

In [24]:
te_ary = te.fit(records).transform(records)
te_ary[0]

array([False,  True,  True, False,  True, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False,  True, False, False,
       False, False, False,  True, False, False, False, False, False,
       False, False, False,  True, False, False, False, False,  True,
        True, False, False, False, False, False,  True, False, False,
       False, False,  True, False, False, False, False, False, False,
        True, False, False, False, False, False, False, False, False,
        True, False, False, False, False, False, False, False, False,
       False,  True,  True, False, False, False, False,  True, False,
       False, False, False,  True, False, False, False,  True, False,
       False, False, False,  True, False, False,  True, False, False,
        True, False, False])

In [25]:
df = pd.DataFrame(te_ary, columns = te.columns_)
df.head()

Unnamed: 0,asparagus,almonds,antioxydant juice,asparagus.1,avocado,babies food,bacon,barbecue sauce,black tea,blueberries,...,turkey,vegetables mix,water spray,white wine,whole weat flour,whole wheat pasta,whole wheat rice,yams,yogurt cake,zucchini
0,False,True,True,False,True,False,False,False,False,False,...,False,True,False,False,True,False,False,True,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,True,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False


## 3.2. 关联规则挖掘

In [26]:
# 生成频繁项集
freq = apriori(df, min_support = 0.05, use_colnames = True)

In [27]:
freq

Unnamed: 0,support,itemsets
0,0.087188,(burgers)
1,0.081056,(cake)
2,0.059992,(chicken)
3,0.163845,(chocolate)
4,0.080389,(cookies)
5,0.05106,(cooking oil)
6,0.179709,(eggs)
7,0.079323,(escalope)
8,0.170911,(french fries)
9,0.063325,(frozen smoothie)


In [28]:
# 计算关联规则
Rules = association_rules(freq, metric = "confidence", min_threshold = 0.2)
Rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(mineral water),(chocolate),0.238368,0.163845,0.05266,0.220917,1.348332,0.013604,1.073256
1,(chocolate),(mineral water),0.163845,0.238368,0.05266,0.3214,1.348332,0.013604,1.122357
2,(eggs),(mineral water),0.179709,0.238368,0.050927,0.283383,1.188845,0.00809,1.062815
3,(mineral water),(eggs),0.238368,0.179709,0.050927,0.213647,1.188845,0.00809,1.043158
4,(spaghetti),(mineral water),0.17411,0.238368,0.059725,0.343032,1.439085,0.018223,1.159314
5,(mineral water),(spaghetti),0.238368,0.17411,0.059725,0.250559,1.439085,0.018223,1.102008
