# 연관분석

## **1. 모듈 불러오기**

In [None]:
import numpy as np
import pandas as pd

# for visualizations
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')

# Google Colab only: ask the user to upload a file into the session.
# `myfile` keeps whatever files.upload() returns; it is not read again in this
# notebook. Presumably the uploaded file is the CSV loaded in the next cell --
# TODO confirm.
from google.colab import files
myfile = files.upload()

# for market basket analysis
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [None]:
# Load the basket file. header=None tells pandas there is no header row, so
# the first line is kept as data and the columns get integer labels.
data = pd.read_csv('Market_Basket_Optimisation.csv', header=None)

# info is a method: without the call parentheses the notebook only shows the
# bound-method object instead of the dtype / non-null summary.
data.info()

In [None]:
# Preview the first rows of the raw basket table.
data.head()

In [None]:
# Inspect 10 randomly sampled rows (the selection changes on every run).
data.sample(10)

In [None]:
# Per-column summary statistics of the raw basket table.
data.describe()

## **2. Data visualizations**

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

from wordcloud import WordCloud

# Word cloud of the first item (column 0) of every basket.
# str(data[0]) would hand WordCloud the *printed* Series: only a handful of
# rows, plus the index numbers and the "Name / Length / dtype" footer. Build
# the cloud from the real per-item counts instead, which also keeps multi-word
# item names intact. value_counts() leaves out missing values.
first_item_counts = data[0].value_counts().to_dict()

plt.figure(figsize=(15, 15))
wordcloud = WordCloud(background_color='white', width=1200, height=1200,
                      max_words=120).generate_from_frequencies(first_item_counts)
plt.imshow(wordcloud)
plt.axis('off')
plt.title('Most Popular Items', fontsize=20)
plt.show()

In [None]:
# Bar chart of item frequencies: the 40 most common values of column 0.
top_items = data[0].value_counts().head(40)

# Build the bar colours inside this cell so it does not depend on a `color`
# variable that is only assigned by a later cell (NameError on a
# top-to-bottom run). One colour per bar actually drawn.
color = plt.cm.copper(np.linspace(0, 1, len(top_items)))

plt.figure(figsize=(18, 7))
top_items.plot.bar(color=color)
plt.title('frequency of most popular items', fontsize=20)
plt.show()

In [None]:
# Tree map of the 50 most common values of column 0.
# Needs the squarify package (pip install squarify).
import squarify

y = data[0].value_counts().head(50).to_frame()

plt.figure(figsize=(20, 20))
color = plt.cm.cool(np.linspace(0, 1, 50))

# y.values is a 2-D (rows, 1) column. Flatten it so squarify receives one plain
# number per tile rather than one-element arrays, whose implicit conversion to
# float is deprecated in recent NumPy releases.
squarify.plot(sizes=y.values.ravel(), label=y.index, alpha=.8, color=color)
plt.title('Tree Map for Popular Items')
plt.show()


In [None]:
import networkx as nx

# Constant hub label: every row gets the value 'Food' in a new 'food' column,
# so the graph built below has a single source node.
data['food'] = 'Food'

# Keep the first 15 baskets. truncate(before=-1, after=15) is label-inclusive
# and kept rows 0..15, i.e. 16 rows rather than the intended 15.
food = data.head(15)

# One edge per row from the hub to the basket's first item (column 0).
# edge_attr=True stores the remaining columns as edge attributes.
food = nx.from_pandas_edgelist(food, source='food', target=0, edge_attr=True)

In [None]:
import warnings

# Silence every warning category from here on.
warnings.filterwarnings('ignore')

# Default canvas size for figures created from now on.
plt.rcParams.update({'figure.figsize': (20, 20)})

# np.linspace(0, 15, 1) is the single value 0.0, so `color` is one RGBA row
# from the Wistia colormap and every node is drawn in that same colour.
color = plt.cm.Wistia(np.linspace(0, 15, 1))

# Force-directed node positions for the `food` graph.
pos = nx.spring_layout(food)

# Nodes, then edges, then labels, all sharing the same layout.
nx.draw_networkx_nodes(food, pos, node_color=color, node_size=15000)
nx.draw_networkx_edges(food, pos, edge_color='black', alpha=0.6, width=3)
nx.draw_networkx_labels(food, pos, font_family='sans-serif', font_size=20)

plt.axis('off')
plt.grid()
plt.title('Top 15 First Choices', fontsize=40)
plt.show()

In [None]:
# Hub label for the second-item graph.
data['secondchoice'] = 'Second Choice'

# First 15 baskets: truncate(before=-1, after=15) is label-inclusive and kept
# 16 rows. Baskets without a second item are dropped so a missing value does
# not turn into a 'nan' node (assumes short baskets hold NaN in column 1 --
# TODO confirm against the CSV).
secondchoice = data.head(15).dropna(subset=[1])

# Use the hub column created in this cell as the edge source. With
# source='food' the centre node was labelled 'Food' and the 'secondchoice'
# column was never used.
secondchoice = nx.from_pandas_edgelist(secondchoice, source='secondchoice',
                                       target=1, edge_attr=True)

In [None]:
import warnings

# Silence every warning category from here on.
warnings.filterwarnings('ignore')

# Default canvas size for figures created from now on.
plt.rcParams.update({'figure.figsize': (20, 20)})

# np.linspace(0, 15, 1) is the single value 0.0, so `color` holds exactly one
# RGBA row from the Blues colormap, shared by all nodes.
color = plt.cm.Blues(np.linspace(0, 15, 1))

# Force-directed node positions for the `secondchoice` graph.
pos = nx.spring_layout(secondchoice)

nx.draw_networkx_nodes(secondchoice, pos, node_color=color, node_size=15000)
nx.draw_networkx_edges(secondchoice, pos, edge_color='brown', alpha=0.6, width=3)
nx.draw_networkx_labels(secondchoice, pos, font_family='sans-serif', font_size=20)

plt.axis('off')
plt.grid()
plt.title('Top 15 Second Choices', fontsize=40)
plt.show()

In [None]:
# Hub label for the third-item graph.
data['thirdchoice'] = 'Third Choice'

# First 10 baskets: truncate(before=-1, after=10) is label-inclusive and kept
# 11 rows. Baskets without a third item are dropped so a missing value does
# not turn into a 'nan' node (assumes short baskets hold NaN in column 2 --
# TODO confirm against the CSV).
# NOTE: the result is deliberately stored under the name `secondchoice`,
# because the drawing cell that follows reads the graph from that name.
secondchoice = data.head(10).dropna(subset=[2])

# Use the hub column created in this cell as the edge source. With
# source='food' the centre node was labelled 'Food' and the 'thirdchoice'
# column was never used.
secondchoice = nx.from_pandas_edgelist(secondchoice, source='thirdchoice',
                                       target=2, edge_attr=True)

In [None]:
import warnings

# Silence every warning category from here on.
warnings.filterwarnings('ignore')

# Default canvas size for figures created from now on.
plt.rcParams.update({'figure.figsize': (20, 20)})

# np.linspace(0, 15, 1) is the single value 0.0, so `color` holds exactly one
# RGBA row from the Reds colormap, shared by all nodes.
color = plt.cm.Reds(np.linspace(0, 15, 1))

# Force-directed node positions; the graph is read from `secondchoice`.
pos = nx.spring_layout(secondchoice)

nx.draw_networkx_nodes(secondchoice, pos, node_color=color, node_size=15000)
nx.draw_networkx_edges(secondchoice, pos, edge_color='pink', alpha=0.6, width=3)
nx.draw_networkx_labels(secondchoice, pos, font_family='sans-serif', font_size=20)

plt.axis('off')
plt.grid()
plt.title('Top 10 Third Choices', fontsize=40)
plt.show()

## **3. Data preprocessing**

In [None]:
# Turn every basket (row) into a fixed-width list of item-name strings.
# Only the first N_ITEM_COLUMNS columns are read; columns after them (for
# example label columns added to `data` by other cells) are skipped.
N_ITEM_COLUMNS = 20

# data.values is evaluated once here instead of once per cell, and every row
# of the frame is visited instead of a hard-coded 7501.
# str() maps a missing value to the literal string 'nan', which keeps all rows
# the same width.
trans = [[str(item) for item in row[:N_ITEM_COLUMNS]] for row in data.values]

# Rectangular lists of strings convert to a 2-D NumPy array.
trans = np.array(trans)

# (number of baskets, item slots per basket)
print(trans.shape)

In [None]:
# Display the transaction array built in the preprocessing step.
trans

## **4. Using Transaction encoder**

In [None]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder

# One-hot encode the transactions: one boolean column per distinct item name.
# NOTE: this rebinds `data` from the raw basket table to the encoded table.
te = TransactionEncoder()
data = te.fit_transform(trans)
data = pd.DataFrame(data, columns=te.columns_)

# Missing basket slots were stringified with str(), so they reach the encoder
# as the item 'nan'. It is padding, not a product, and would otherwise show up
# in the mined itemsets and rules. errors='ignore' makes this a no-op when no
# such column exists.
data = data.drop(columns='nan', errors='ignore')

# (number of baskets, number of distinct items)
data.shape

In [None]:
# Display the encoded basket table.
data

In [None]:
# List the column labels of the encoded table (taken from te.columns_).

data.columns

In [None]:
# Preview the first rows of the encoded table.
data.head()

## **5. Applying apriori**

- 1단계: 빈발 품목 집합(frequent itemset) 생성
- 2단계: 연관 규칙 생성

In [None]:
from mlxtend.frequent_patterns import apriori

# Items and itemsets with a support of at least 5%, highest support first.
apriori(data, use_colnames=True, min_support=0.05).sort_values('support', ascending=False)

In [None]:
# Mine the itemsets with support >= 5% and keep them for the following cells.
frequent_itemsets = apriori(data, use_colnames=True, min_support=0.05)

# Number of items in each itemset.
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(len)

# Show them from highest to lowest support.
frequent_itemsets.sort_values('support', ascending=False)

In [None]:
# Get the itemsets of length 2 with a support of at least 5%,
# highest support first.
# The filter previously tested length == 3, which contradicted the stated
# intent of listing pairs.

frequent_itemsets[(frequent_itemsets['length'] == 2) &
                  (frequent_itemsets['support'] >= 0.05)].sort_values(by='support', ascending=False)

## **6. Association Mining**

In [None]:
# Association rules whose confidence is at least 0.05 ...
rules = association_rules(frequent_itemsets, min_threshold=0.05, metric="confidence")

# ... ordered by lift, then confidence, then support, all descending.
rules = rules.sort_values(by=['lift', 'confidence', 'support'], ascending=False)
rules

In [None]:
# Association rules whose lift is at least 1 ...
rules = association_rules(frequent_itemsets, min_threshold=1, metric="lift")

# ... ordered from highest to lowest lift.
rules = rules.sort_values('lift', ascending=False)
rules

In [None]:
# Find what goes with a specific product: keep the rules whose antecedent is
# exactly the frozenset {'spaghetti'}, highest lift first.
rules.loc[rules['antecedents'] == frozenset(['spaghetti'])].sort_values('lift', ascending=False)

출처

실습
1. https://www.kaggle.com/roshansharma/market-basket-analysis/notebook

이론
1. https://zephyrus1111.tistory.com/119
