#### 연관분석
##### RunTest : H0 : binary 값이 임의적이다. H1 : binary 값이 임의적이지 않다. 즉 연관이 있다.
##### 장바구니 분석(무엇을 같이 사는지?), 서열분석(A를 구매하고 B를 구매하는지?)
##### 사건들 간의 규칙 발견 목적, 두 아이템 집합이 얼마나 빈번하게 발생?
##### 지지도(Support) = A와 B를 동시에 구매한 거래 수 / 전체 거래 수
##### 신뢰도(Confidence) = A와 B를 동시에 구매한 거래 수 / A를 구매한 거래 수
##### 향상도(Lift) = (A와 B를 동시에 구매한 거래 수 × 전체 거래 수) / (A를 구매한 거래 수 × B를 구매한 거래 수) = 신뢰도(A→B) / B의 지지도
##### 트랜잭션 데이터 변환 > 최소지지도 > 해석 > 효율적인 상품진열, 패키지 상품개발, 교차판매(실시간 상품추천, B를 추천)

In [1]:
#### Groceries 데이터셋으로 연관규칙분석을 실시하라

In [57]:
import pandas as pd

# 1. Preprocessing
# 1) The header row of the file is actually a transaction, so read_csv turned
#    it into the column name. Put it back as a data row, then give the single
#    column a proper name.
df = pd.read_csv('data/groceries.csv')
first_transaction = df.columns[0]
# Append at the next free row label instead of a hard-coded 9834, so that no
# existing row is overwritten (and no index gap appears) if the file length
# differs.
df.loc[len(df), first_transaction] = first_transaction
df.columns = ['purchase']
df

Unnamed: 0,purchase
0,"tropical fruit,yogurt,coffee"
1,whole milk
2,"pip fruit,yogurt,cream cheese,meat spreads"
3,"other vegetables,whole milk,condensed milk,lon..."
4,"whole milk,butter,yogurt,rice,abrasive cleaner"
...,...
9830,cooking chocolate
9831,"chicken,citrus fruit,other vegetables,butter,y..."
9832,"semi-finished bread,bottled water,soda,bottled..."
9833,"chicken,tropical fruit,other vegetables,vinega..."


In [58]:
# 2) Split each basket string on commas.
#    expand=True spreads the pieces of the single source column across
#    several columns.
df = df[df.columns[0]].str.split(pat=',', expand=True)

In [61]:
# 3) Convert the baskets to one-hot transaction data with TransactionEncoder
from mlxtend.preprocessing import TransactionEncoder
import pandas as pd

# Build one list of items per basket. The split frame pads short rows with
# None and yields NaN for missing input, so drop every missing or empty cell
# (a plain truthiness filter would let NaN through).
groceries = [
    [item for item in row if pd.notna(item) and item]
    for row in df.values
]

te = TransactionEncoder()
groceries_tr = te.fit(groceries).transform(groceries)
groceries_df = pd.DataFrame(groceries_tr, columns=te.columns_)

groceries_df

Unnamed: 0,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,baby food,bags,baking powder,bathroom cleaner,beef,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
4,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9830,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9831,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
9832,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9833,False,False,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False


In [63]:
# 2. Association analysis
from mlxtend.frequent_patterns import apriori

# 1) Find the frequent itemsets whose support is at least 1%
#    (min_support=0.01). use_colnames=True reports item names in the
#    itemsets rather than column indices.
groceries_ap = apriori(groceries_df, use_colnames=True, min_support=0.01)
groceries_ap

Unnamed: 0,support,itemsets
0,0.033452,(UHT-milk)
1,0.017692,(baking powder)
2,0.052466,(beef)
3,0.033249,(berries)
4,0.026029,(beverages)
...,...,...
328,0.011998,"(whole milk, root vegetables, tropical fruit)"
329,0.014540,"(yogurt, whole milk, root vegetables)"
330,0.010473,"(soda, whole milk, yogurt)"
331,0.015150,"(yogurt, whole milk, tropical fruit)"


In [65]:
# 2) Association rules: derive rules from the frequent itemsets, keeping
#    those whose confidence is at least 0.3
from mlxtend.frequent_patterns import association_rules

association_rules(groceries_ap, min_threshold=0.3, metric='confidence')

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(beef),(other vegetables),0.052466,0.193493,0.019725,0.375969,1.943066,0.009574,1.292416
1,(beef),(root vegetables),0.052466,0.108998,0.017387,0.331395,3.040367,0.011668,1.332628
2,(beef),(whole milk),0.052466,0.255516,0.021251,0.405039,1.585180,0.007845,1.251315
3,(berries),(other vegetables),0.033249,0.193493,0.010269,0.308869,1.596280,0.003836,1.166938
4,(berries),(whole milk),0.033249,0.255516,0.011795,0.354740,1.388328,0.003299,1.153774
...,...,...,...,...,...,...,...,...,...
120,"(soda, yogurt)",(whole milk),0.027351,0.255516,0.010473,0.382900,1.498535,0.003484,1.206423
121,"(yogurt, tropical fruit)",(whole milk),0.029283,0.255516,0.015150,0.517361,2.024770,0.007668,1.542528
122,"(whole milk, tropical fruit)",(yogurt),0.042298,0.139502,0.015150,0.358173,2.567516,0.009249,1.340701
123,"(yogurt, whipped/sour cream)",(whole milk),0.020742,0.255516,0.010880,0.524510,2.052747,0.005580,1.565719


In [68]:
# 2-1) Association rules: keep the rules whose lift is at least 3
a_rules = association_rules(groceries_ap, min_threshold=3, metric='lift')
# Record how many items sit on each side of every rule.
a_rules['ante_len'] = a_rules['antecedents'].apply(len)
a_rules['cons_len'] = a_rules['consequents'].apply(len)
a_rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,ante_len,cons_len
0,(beef),(root vegetables),0.052466,0.108998,0.017387,0.331395,3.040367,0.011668,1.332628,1,1
1,(root vegetables),(beef),0.108998,0.052466,0.017387,0.159515,3.040367,0.011668,1.127366,1,1
2,"(other vegetables, citrus fruit)",(root vegetables),0.028876,0.108998,0.010371,0.359155,3.295045,0.007224,1.390354,2,1
3,"(root vegetables, citrus fruit)",(other vegetables),0.017692,0.193493,0.010371,0.586207,3.029608,0.006948,1.949059,2,1
4,(other vegetables),"(root vegetables, citrus fruit)",0.193493,0.017692,0.010371,0.0536,3.029608,0.006948,1.037941,1,2
5,(root vegetables),"(other vegetables, citrus fruit)",0.108998,0.028876,0.010371,0.095149,3.295045,0.007224,1.073242,1,2
6,"(yogurt, whole milk)",(curd),0.056024,0.053279,0.010066,0.179673,3.372304,0.007081,1.154078,2,1
7,(curd),"(yogurt, whole milk)",0.053279,0.056024,0.010066,0.188931,3.372304,0.007081,1.163866,1,2
8,"(other vegetables, tropical fruit)",(root vegetables),0.035892,0.108998,0.012303,0.342776,3.14478,0.008391,1.355705,2,1
9,"(root vegetables, tropical fruit)",(other vegetables),0.021047,0.193493,0.012303,0.584541,3.020999,0.008231,1.941244,2,1


In [77]:
# Order the rules from the highest lift to the lowest.
a_rules = a_rules.sort_values('lift', ascending=False)
a_rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,ante_len,cons_len
6,"(yogurt, whole milk)",(curd),0.056024,0.053279,0.010066,0.179673,3.372304,0.007081,1.154078,2,1
7,(curd),"(yogurt, whole milk)",0.053279,0.056024,0.010066,0.188931,3.372304,0.007081,1.163866,1,2
5,(root vegetables),"(other vegetables, citrus fruit)",0.108998,0.028876,0.010371,0.095149,3.295045,0.007224,1.073242,1,2
2,"(other vegetables, citrus fruit)",(root vegetables),0.028876,0.108998,0.010371,0.359155,3.295045,0.007224,1.390354,2,1
12,"(yogurt, other vegetables)",(whipped/sour cream),0.043416,0.071683,0.010168,0.234192,3.267062,0.007056,1.212206,2,1
13,(whipped/sour cream),"(yogurt, other vegetables)",0.071683,0.043416,0.010168,0.141844,3.267062,0.007056,1.114697,1,2
11,(root vegetables),"(other vegetables, tropical fruit)",0.108998,0.035892,0.012303,0.112873,3.14478,0.008391,1.086776,1,2
8,"(other vegetables, tropical fruit)",(root vegetables),0.035892,0.108998,0.012303,0.342776,3.14478,0.008391,1.355705,2,1
1,(root vegetables),(beef),0.108998,0.052466,0.017387,0.159515,3.040367,0.011668,1.127366,1,1
0,(beef),(root vegetables),0.052466,0.108998,0.017387,0.331395,3.040367,0.011668,1.332628,1,1


##### antecedents와 consequents 간의 향상도가 3 이상인 규칙들이 있다.
##### 이는 antecedents를 구매한 고객이 consequents를 구매할 확률이, 전체 거래에서 consequents를 구매할 확률보다 3배 이상 높다는 뜻이다.
##### 구매 연관성이 높다는 의미이므로, 해당 상품군 간에는 결합상품 할인 쿠폰이나, 품목배치 변경을 제안할 수 있다.

In [72]:
# 2-2) Association rules: combine several criteria
#      (lift > 1, confidence > 0.4, at least one antecedent item)
a_rules.loc[
    (a_rules['lift'] > 1)
    & (a_rules['confidence'] > 0.4)
    & (a_rules['ante_len'] >= 1)
]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,ante_len,cons_len
3,"(root vegetables, citrus fruit)",(other vegetables),0.017692,0.193493,0.010371,0.586207,3.029608,0.006948,1.949059,2,1
9,"(root vegetables, tropical fruit)",(other vegetables),0.021047,0.193493,0.012303,0.584541,3.020999,0.008231,1.941244,2,1
