In [31]:
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [17]:
# Load the order line items from the CSV file
data = pd.read_csv('bike_data.csv')

# Preview the first five rows as a sanity check
data.head(n=5)

Unnamed: 0,OrderNumber,LineNumber,Model
0,cumid51178,1,山地英骑
1,cumid51178,2,山地车水壶架
2,cumid51178,3,运动水壶
3,cumid51184,1,山地英骑
4,cumid51184,2,hl山地外胎


In [19]:
# Collapse the line-item table to one row per order. Only the Model column is
# selected from each group, so LineNumber never reaches the result and no
# separate drop is needed. Skipping that drop also keeps this cell re-runnable:
# once `data` holds the grouped frame, LineNumber no longer exists and dropping
# it again would raise KeyError.
# Each order's models are concatenated into one string separated by '|'.
data = (
    data.groupby('OrderNumber')['Model']
    .apply(lambda models: models.str.cat(sep='|'))
    .reset_index()
)

# One-hot encode the joined strings. str.get_dummies() splits on '|' by
# default, which matches the separator used above. Indexing by OrderNumber
# first yields the order-by-product indicator matrix directly, without an
# in-place set_index on an intermediate frame.
data_oh = data.set_index('OrderNumber')['Model'].str.get_dummies()

# Show the matrix shape, then preview the first rows
print(data_oh.shape)
data_oh.head()

(21255, 37)


Unnamed: 0_level_0,Women's Mountain Shorts,hl公路外胎,hl山地外胎,ll公路车外胎,ll山地胎,ml公路外胎,ml山地外胎,万能自行车座,修补工具,公路车350,...,水壶包,洗车喷剂,短袖经典车衣,竞速公路车,竞速袜,经典背心,自行车帽,运动型头盔,运动水壶,长袖骑车衣
OrderNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
cumid51176,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
cumid51177,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
cumid51178,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
cumid51179,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
cumid51180,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,1


In [33]:
# Mine frequent itemsets: keep every product combination that occurs in at
# least 5% of orders (min_support=0.05). use_colnames=True labels the itemsets
# with product names rather than column positions.
# The 0/1 indicators are cast to bool because mlxtend warns that non-bool
# input is slower and may lose support; the mined itemsets are unchanged.
frequent_itemsets_data = apriori(data_oh.astype(bool), use_colnames=True, min_support=0.05)
frequent_itemsets_data



Unnamed: 0,support,itemsets
0,0.062621,(hl山地外胎)
1,0.050953,(ml山地外胎)
2,0.141614,(修补工具)
3,0.104258,(公路车内胎)
4,0.080075,(公路车水壶架)
5,0.064126,(半掌手套)
6,0.116537,(山地英骑)
7,0.136815,(山地车内胎)
8,0.094754,(山地车挡泥板)
9,0.09132,(山地车水壶架)


In [43]:
# Derive association rules whose confidence is at least 0.8, then list them
# with the highest lift first
rules_data = association_rules(
    frequent_itemsets_data,
    metric='confidence',
    min_threshold=0.8,
)
rules_data.sort_values('lift', ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(公路车水壶架),(运动水壶),0.080075,0.191767,0.071183,0.888954,4.635604,0.055827,7.278377,0.852546
1,(山地车水壶架),(运动水壶),0.09132,0.191767,0.076359,0.836167,4.360336,0.058846,4.933273,0.848109


In [39]:
# Derive association rules filtered on lift instead (lift of at least 0.9),
# then list them with the highest lift first. This replaces the rules_data
# produced by the confidence-based cell.
rules_data = association_rules(
    frequent_itemsets_data,
    metric='lift',
    min_threshold=0.9,
)
rules_data.sort_values('lift', ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(公路车水壶架),(运动水壶),0.080075,0.191767,0.071183,0.888954,4.635604,0.055827,7.278377,0.852546
1,(运动水壶),(公路车水壶架),0.191767,0.080075,0.071183,0.371197,4.635604,0.055827,1.462978,0.970361
4,(山地车水壶架),(运动水壶),0.09132,0.191767,0.076359,0.836167,4.360336,0.058846,4.933273,0.848109
5,(运动水壶),(山地车水壶架),0.191767,0.09132,0.076359,0.398184,4.360336,0.058846,1.509898,0.953512
2,(山地车内胎),(运动型头盔),0.136815,0.290332,0.058339,0.42641,1.468699,0.018618,1.237239,0.369707
3,(运动型头盔),(山地车内胎),0.290332,0.136815,0.058339,0.20094,1.468699,0.018618,1.080251,0.449682
