## - 불러올 패키지 목록

In [1]:
# 수학 함수 패키지
import math

# 데이터 핸들링을 위한 패키지
import numpy as np
import pandas as pd

# 사이킷런 패키지
from sklearn.preprocessing import *  # 데이터 전처리를 위한 패키지
from sklearn.model_selection import *  # 데이터 분리, 검증 및 파라미터 튜닝을 위한 패키지
from sklearn.metrics import *  # 모델에 대한 다양한 성능 평가를 위한 패키지
from sklearn.cluster import *  # 비지도 군집화 알고리즘 제공
from sklearn.tree import *  # 의사결정나무 알고리즘 제공
from sklearn.ensemble import *  # 앙상블 알고리즘 제공
from sklearn.neighbors import *  # kNN 알고리즘 제공
from sklearn.svm import *  # 서포트 벡터 머신 알고리즘 제공
from sklearn.mixture import *  # 혼합분포군집(GMM 등) 알고리즘 제공
from sklearn.decomposition import *  # 차원축소 알고리즘 제공

# 사이파이 패키지
from scipy.cluster.hierarchy import *  # 계층적 군집 분석을 위한 패키지

# mlxtend 패키지
from mlxtend.preprocessing import *  # 연관분석에 필요한 트랜잭션 전처리 알고리즘 포함
from mlxtend.frequent_patterns import *  # 연관분석에 사용되는 알고리즘 포함 
from mlxtend.plotting import plot_decision_regions

# missingno 패키지 (데이터 전처리 전 결측치 확인)
import missingno as msno

# label별 데이터 카운트를 위한 모듈
from collections import Counter  # https://docs.python.org/3/library/collections.html

# Warning control and plotting packages
import warnings
import seaborn as sb
import matplotlib.pyplot as plt
import matplotlib.cm as cm  # built-in colormaps and colormap handling utilities
from matplotlib.colors import ListedColormap  # colors and colormaps
from mlxtend.plotting import plot_decision_regions  # NOTE(review): duplicates the earlier import of the same name

# Ignore all warning messages
warnings.filterwarnings('ignore')

# Render matplotlib figures inline in the notebook
%matplotlib inline
# The purpose of %matplotlib inline is to render matplotlib figures even when plt.show() is not called.
# Recent Jupyter Notebook versions display matplotlib figures even without %matplotlib inline,
# so it is not strictly required; it is still conventionally recommended to keep the code explicit.

# Set the plot style
plt.style.use('default')

# Use a font that can render Korean (Hangul) text
plt.rc('font', family='Malgun Gothic')

# Prevent the minus sign on plot axes from rendering incorrectly
plt.rcParams['axes.unicode_minus'] = False

## [ 참고 ]
1. `apriori()`  
    API : http://rasbt.github.io/mlxtend/api_modules/mlxtend.frequent_patterns/apriori/   
    Example : http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/

2. `fpgrowth()`  
    API : http://rasbt.github.io/mlxtend/api_modules/mlxtend.frequent_patterns/fpgrowth/  
    Example : http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/fpgrowth/

3. `fpmax()`  
    API : http://rasbt.github.io/mlxtend/api_modules/mlxtend.frequent_patterns/fpmax/  
    Example : http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/fpmax/

4. `association_rules()`  
    API : http://rasbt.github.io/mlxtend/api_modules/mlxtend.frequent_patterns/association_rules/  
    Example : http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/

# Frequent Itemsets via Apriori Algorithm
- API : `apriori()`  http://rasbt.github.io/mlxtend/api_modules/mlxtend.frequent_patterns/apriori/

- Example : http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/

## Example 1 -- Generating Frequent Itemsets

In [2]:
# apriori() expects its input as a one-hot encoded pandas DataFrame.
# Start from the raw transaction data: one list of items per transaction.
dataset = [
    ['Milk', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
    ['Dill', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
    ['Milk', 'Apple', 'Kidney Beans', 'Eggs'],
    ['Milk', 'Unicorn', 'Corn', 'Kidney Beans', 'Yogurt'],
    ['Corn', 'Onion', 'Onion', 'Kidney Beans', 'Ice cream', 'Eggs'],
]
dataset

[['Milk', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
 ['Dill', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
 ['Milk', 'Apple', 'Kidney Beans', 'Eggs'],
 ['Milk', 'Unicorn', 'Corn', 'Kidney Beans', 'Yogurt'],
 ['Corn', 'Onion', 'Onion', 'Kidney Beans', 'Ice cream', 'Eggs']]

In [19]:
# Convert the transaction lists into the one-hot encoded DataFrame that
# apriori() expects, with one column per distinct item, via TransactionEncoder.
te = TransactionEncoder()
te_ary = te.fit(dataset).transform(dataset)

df = pd.DataFrame(te_ary, columns=te.columns_)
df

Unnamed: 0,Apple,Corn,Dill,Eggs,Ice cream,Kidney Beans,Milk,Nutmeg,Onion,Unicorn,Yogurt
0,False,False,False,True,False,True,True,True,True,False,True
1,False,False,True,True,False,True,False,True,True,False,True
2,True,False,False,True,False,True,True,False,False,False,False
3,False,True,False,False,False,True,True,False,False,True,True
4,False,True,False,True,True,True,False,False,True,False,False


In [26]:
# Mine the items and itemsets whose support is at least 60%.
# Without use_colnames, apriori() reports items by column index rather than by name,
# which can be handy for downstream steps such as association rule mining.
apriori(df, min_support=0.6)

Unnamed: 0,support,itemsets
0,0.8,(3)
1,1.0,(5)
2,0.6,(6)
3,0.6,(8)
4,0.6,(10)
5,0.8,"(3, 5)"
6,0.6,"(8, 3)"
7,0.6,"(5, 6)"
8,0.6,"(8, 5)"
9,0.6,"(10, 5)"


In [23]:
# Pass use_colnames=True to report item names instead of integer column indices.
apriori(df, min_support=0.6, use_colnames=True)

Unnamed: 0,support,itemsets
0,0.8,(Eggs)
1,1.0,(Kidney Beans)
2,0.6,(Milk)
3,0.6,(Onion)
4,0.6,(Yogurt)
5,0.8,"(Eggs, Kidney Beans)"
6,0.6,"(Onion, Eggs)"
7,0.6,"(Milk, Kidney Beans)"
8,0.6,"(Onion, Kidney Beans)"
9,0.6,"(Yogurt, Kidney Beans)"


## Example 2 -- Selecting and Filtering Results

In [27]:
# Because the result is a pandas DataFrame, it can be filtered with ordinary pandas operations.
# Goal: find itemsets of length 2 that have a support of at least 80 percent.
# First, mine the frequent itemsets and store the size of each itemset in a new column.
frequent_itemsets = apriori(df, min_support=0.6, use_colnames=True)

# len is passed directly; wrapping it in a lambda adds nothing.
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(len)
frequent_itemsets

Unnamed: 0,support,itemsets,length
0,0.8,(Eggs),1
1,1.0,(Kidney Beans),1
2,0.6,(Milk),1
3,0.6,(Onion),1
4,0.6,(Yogurt),1
5,0.8,"(Eggs, Kidney Beans)",2
6,0.6,"(Onion, Eggs)",2
7,0.6,"(Milk, Kidney Beans)",2
8,0.6,"(Onion, Kidney Beans)",2
9,0.6,"(Yogurt, Kidney Beans)",2


In [28]:
# Select the rows that are 2-item sets AND have a support of at least 0.8.
frequent_itemsets[
    frequent_itemsets['length'].eq(2)
    & frequent_itemsets['support'].ge(0.8)
]

Unnamed: 0,support,itemsets,length
5,0.8,"(Eggs, Kidney Beans)",2


In [9]:
# Rows can also be selected by the contents of the "itemsets" column.
# The comparison is against a set, so the order the items are written in does not matter.
frequent_itemsets[frequent_itemsets['itemsets'] == {'Onion', 'Eggs'}]

Unnamed: 0,support,itemsets,length
6,0.6,"(Onion, Eggs)",2


## Example 3 -- Working with Sparse Representations

In [31]:
# A sparse representation of the transaction data saves memory; this helps most
# when there are many distinct products but each transaction is small.
oht_ary = te.fit(dataset).transform(dataset, sparse=True)

sparse_df = pd.DataFrame.sparse.from_spmatrix(oht_ary, columns=te.columns_)
sparse_df

Unnamed: 0,Apple,Corn,Dill,Eggs,Ice cream,Kidney Beans,Milk,Nutmeg,Onion,Unicorn,Yogurt
0,0,0,0,1,0,True,1,1,1,0,1
1,0,0,1,1,0,True,0,1,1,0,1
2,1,0,0,1,0,True,1,0,0,0,0
3,0,1,0,0,0,True,1,0,0,1,1
4,0,1,0,1,1,True,0,0,1,0,0


In [18]:
# Run the same mining call on the sparse DataFrame; verbose=1 prints progress output.
apriori(sparse_df, min_support=0.6, use_colnames=True, verbose=1)

Processing 21 combinations | Sampling itemset size 3


Unnamed: 0,support,itemsets
0,0.8,(Eggs)
1,1.0,(Kidney Beans)
2,0.6,(Milk)
3,0.6,(Onion)
4,0.6,(Yogurt)
5,0.8,"(Eggs, Kidney Beans)"
6,0.6,"(Onion, Eggs)"
7,0.6,"(Milk, Kidney Beans)"
8,0.6,"(Onion, Kidney Beans)"
9,0.6,"(Yogurt, Kidney Beans)"


# Frequent Itemsets via the FP-Growth Algorithm
- `fpgrowth()`  
    API : http://rasbt.github.io/mlxtend/api_modules/mlxtend.frequent_patterns/fpgrowth/  
    Example : http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/fpgrowth/

# Maximal Itemsets via the FP-Max Algorithm
- `fpmax()`  
    API : http://rasbt.github.io/mlxtend/api_modules/mlxtend.frequent_patterns/fpmax/  
    Example : http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/fpmax/

# Association Rules Generation from Frequent Itemsets

- `association_rules()`  
    API : http://rasbt.github.io/mlxtend/api_modules/mlxtend.frequent_patterns/association_rules/  
    Example : http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/

## Example 1 -- Generating Association Rules from Frequent Itemsets

In [35]:
# association_rules() consumes a DataFrame of frequent itemsets such as the ones
# produced by apriori(), fpgrowth(), or fpmax() from mlxtend.frequent_patterns.
# Rebuild the one-hot encoded transaction DataFrame that those functions take as input.
dataset = [
    ['Milk', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
    ['Dill', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
    ['Milk', 'Apple', 'Kidney Beans', 'Eggs'],
    ['Milk', 'Unicorn', 'Corn', 'Kidney Beans', 'Yogurt'],
    ['Corn', 'Onion', 'Onion', 'Kidney Beans', 'Ice cream', 'Eggs'],
]

te = TransactionEncoder()
te_ary = te.fit(dataset).transform(dataset)
df = pd.DataFrame(te_ary, columns=te.columns_)

In [36]:
# Frequent itemsets with support of at least 0.6, mined with Apriori and reported by item name.
apriori(df, min_support=0.6, use_colnames=True)

Unnamed: 0,support,itemsets
0,0.8,(Eggs)
1,1.0,(Kidney Beans)
2,0.6,(Milk)
3,0.6,(Onion)
4,0.6,(Yogurt)
5,0.8,"(Eggs, Kidney Beans)"
6,0.6,"(Onion, Eggs)"
7,0.6,"(Milk, Kidney Beans)"
8,0.6,"(Onion, Kidney Beans)"
9,0.6,"(Yogurt, Kidney Beans)"


In [38]:
# fpmax() with the same support threshold; it reports the maximal frequent itemsets.
fpmax(df, min_support=0.6, use_colnames=True)

Unnamed: 0,support,itemsets
0,0.6,"(Milk, Kidney Beans)"
1,0.6,"(Onion, Eggs, Kidney Beans)"
2,0.6,"(Yogurt, Kidney Beans)"


In [37]:
# Mine the frequent itemsets with FP-Growth and keep them for rule generation.
frequent_itemsets = fpgrowth(df, min_support=0.6, use_colnames=True)

frequent_itemsets

Unnamed: 0,support,itemsets
0,1.0,(Kidney Beans)
1,0.8,(Eggs)
2,0.6,(Yogurt)
3,0.6,(Onion)
4,0.6,(Milk)
5,0.8,"(Eggs, Kidney Beans)"
6,0.6,"(Yogurt, Kidney Beans)"
7,0.6,"(Onion, Eggs)"
8,0.6,"(Onion, Kidney Beans)"
9,0.6,"(Onion, Eggs, Kidney Beans)"


In [39]:
# association_rules() takes (1) the metric of interest and (2) the threshold for that metric.
# Here: derive rules from the frequent itemsets only where the confidence
# reaches the 70 percent threshold (min_threshold=0.7).
association_rules(frequent_itemsets, metric="confidence", min_threshold=0.7)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Eggs),(Kidney Beans),0.8,1.0,0.8,1.0,1.0,0.0,inf
1,(Kidney Beans),(Eggs),1.0,0.8,0.8,0.8,1.0,0.0,1.0
2,(Yogurt),(Kidney Beans),0.6,1.0,0.6,1.0,1.0,0.0,inf
3,(Onion),(Eggs),0.6,0.8,0.6,1.0,1.25,0.12,inf
4,(Eggs),(Onion),0.8,0.6,0.6,0.75,1.25,0.12,1.6
5,(Onion),(Kidney Beans),0.6,1.0,0.6,1.0,1.0,0.0,inf
6,"(Onion, Eggs)",(Kidney Beans),0.6,1.0,0.6,1.0,1.0,0.0,inf
7,"(Onion, Kidney Beans)",(Eggs),0.6,0.8,0.6,1.0,1.25,0.12,inf
8,"(Eggs, Kidney Beans)",(Onion),0.8,0.6,0.6,0.75,1.25,0.12,1.6
9,(Onion),"(Eggs, Kidney Beans)",0.6,0.8,0.6,1.0,1.25,0.12,inf


## Example 2 -- Rule Generation and Selection Criteria

In [40]:
# To select rules by a different metric, adjust the `metric` and `min_threshold` arguments.
# Example: keep only the rules with a lift score of >= 1.2.
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.2)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Onion),(Eggs),0.6,0.8,0.6,1.0,1.25,0.12,inf
1,(Eggs),(Onion),0.8,0.6,0.6,0.75,1.25,0.12,1.6
2,"(Onion, Kidney Beans)",(Eggs),0.6,0.8,0.6,1.0,1.25,0.12,inf
3,"(Eggs, Kidney Beans)",(Onion),0.8,0.6,0.6,0.75,1.25,0.12,1.6
4,(Onion),"(Eggs, Kidney Beans)",0.6,0.8,0.6,1.0,1.25,0.12,inf
5,(Eggs),"(Onion, Kidney Beans)",0.8,0.6,0.6,0.75,1.25,0.12,1.6


In [41]:
# Pandas DataFrames make it easy to filter the results further.
# Let's say we are only interested in rules that satisfy the following criteria:
#    1. at least 2 antecedents
#    2. a confidence > 0.75
#    3. a lift score > 1.2
# First, record how many items each rule's antecedent contains.
# len is passed directly; wrapping it in a lambda adds nothing.
rules["antecedent_len"] = rules["antecedents"].apply(len)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len
0,(Onion),(Eggs),0.6,0.8,0.6,1.0,1.25,0.12,inf,1
1,(Eggs),(Onion),0.8,0.6,0.6,0.75,1.25,0.12,1.6,1
2,"(Onion, Kidney Beans)",(Eggs),0.6,0.8,0.6,1.0,1.25,0.12,inf,2
3,"(Eggs, Kidney Beans)",(Onion),0.8,0.6,0.6,0.75,1.25,0.12,1.6,2
4,(Onion),"(Eggs, Kidney Beans)",0.6,0.8,0.6,1.0,1.25,0.12,inf,1
5,(Eggs),"(Onion, Kidney Beans)",0.8,0.6,0.6,0.75,1.25,0.12,1.6,1


In [42]:
# Keep the rules with at least 2 antecedent items, confidence > 0.75, and lift > 1.2.
rules[
    rules['antecedent_len'].ge(2)
    & rules['confidence'].gt(0.75)
    & rules['lift'].gt(1.2)
]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len
2,"(Onion, Kidney Beans)",(Eggs),0.6,0.8,0.6,1.0,1.25,0.12,inf,2


In [43]:
# Rules can also be selected by the contents of the "antecedents" or "consequents" columns.
# The comparison is against a set, so the order the items are written in does not matter.
rules[rules['antecedents'] == {'Eggs', 'Kidney Beans'}]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len
3,"(Eggs, Kidney Beans)",(Onion),0.8,0.6,0.6,0.75,1.25,0.12,1.6,2


## Example 3 -- Frequent Itemsets with Incomplete Antecedent and Consequent Information

In [45]:
# Most metrics computed by association_rules() depend on the antecedent and consequent
# support scores of a rule, which must be provided in the frequent-itemset input DataFrame.
# Consider the following input, which lists itemsets and their supports
# but not the supports of the itemsets' subsets.
# The data is bound to `itemset_data` rather than `dict` so that the built-in
# `dict` type is not shadowed for the rest of the session.
itemset_data = {
    'itemsets': [
        ['177', '176'], ['177', '179'], ['176', '178'],
        ['176', '179'], ['93', '100'], ['177', '178'],
        ['177', '176', '178'],
    ],
    'support': [
        0.253623, 0.253623, 0.217391,
        0.217391, 0.181159, 0.108696,
        0.108696,
    ],
}

freq_itemsets = pd.DataFrame(itemset_data)
freq_itemsets

Unnamed: 0,itemsets,support
0,"[177, 176]",0.253623
1,"[177, 179]",0.253623
2,"[176, 178]",0.217391
3,"[176, 179]",0.217391
4,"[93, 100]",0.181159
5,"[177, 178]",0.108696
6,"[177, 176, 178]",0.108696


Note that this is a "cropped" DataFrame that doesn't contain the support values of the item subsets.
This can create problems if we want to compute the association rule metrics for, e.g., 176 => 177.

For example, the confidence is computed as

$$\text{confidence}(A \rightarrow C) = \frac{\text{support}(A \rightarrow C)}{\text{support}(A)}, \quad \text{range: } [0, 1]$$

But we do not have support(A).
All we know about "A"'s support is that it is at least 0.253623.

In these scenarios, where not all metrics can be computed because the input DataFrame is incomplete, you can use the `support_only = True` option, which computes only the support column of a given rule and therefore does not require as much information:

$$\text{support}(A \rightarrow C) = \text{support}(A \cup C), \quad \text{range: } [0, 1]$$

In [46]:
# With support_only=True only the "support" column is computed;
# every other metric column is filled with NaN.
res = association_rules(freq_itemsets, support_only=True, min_threshold=0.1)
res

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(177),(176),,,0.253623,,,,
1,(176),(177),,,0.253623,,,,
2,(177),(179),,,0.253623,,,,
3,(179),(177),,,0.253623,,,,
4,(178),(176),,,0.217391,,,,
5,(176),(178),,,0.217391,,,,
6,(179),(176),,,0.217391,,,,
7,(176),(179),,,0.217391,,,,
8,(100),(93),,,0.181159,,,,
9,(93),(100),,,0.181159,,,,


In [47]:
# Tidy the result by keeping only the rule columns and the support column.
res = res[['antecedents', 'consequents', 'support']]
res

Unnamed: 0,antecedents,consequents,support
0,(177),(176),0.253623
1,(176),(177),0.253623
2,(177),(179),0.253623
3,(179),(177),0.253623
4,(178),(176),0.217391
5,(176),(178),0.217391
6,(179),(176),0.217391
7,(176),(179),0.217391
8,(100),(93),0.181159
9,(93),(100),0.181159
