**PART 03** 제2유형: 데이터 분석

> **CHAPTER 04** 비지도학습
> >**01 군집분석을 이용한 문제 해결**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans     # K-Means 패키지 임포트

In [2]:
# 깃허브에 있는 csv 파일을 읽어와서 데이터프레임 df로 넣는다.
df = pd.read_csv("https://raw.githubusercontent.com/YoungjinBD/dataset/main/iris.csv")

In [3]:
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [5]:
# species 컬럼의 값을 0,1,2로 레이블인코딩 한다.
from sklearn.preprocessing import LabelEncoder
df["species"] = LabelEncoder().fit_transform(df["species"])
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [6]:
# 분석 데이터셋 준비
X = df[["sepal_length", "sepal_width", "petal_length", "petal_width"]]

In [7]:
# KMeans 객체 생성
cluster1 = KMeans(n_clusters=3, n_init=10, max_iter=500, random_state=42)

# 생성모델로 데이터 학습
cluster1.fit(X)

In [8]:
# 결과 값을 변수에 저장
cluster_center = cluster1.cluster_centers_  # 각 군집의 중심점 결과 저장
cluster_prediction = cluster1.predict(X)    # 각 예측군집 결과 저장
print(pd.DataFrame(cluster_center))
print(cluster_prediction)

          0         1         2         3
0  5.901613  2.748387  4.393548  1.433871
1  5.006000  3.428000  1.462000  0.246000
2  6.850000  3.073684  5.742105  2.071053
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 2 2 2 2 0 2 2 2 2
 2 2 0 0 2 2 2 2 0 2 0 2 0 2 2 0 0 2 2 2 2 2 0 2 2 2 2 0 2 2 2 0 2 2 2 0 2
 2 0]


In [9]:
# 기존 데이터에 예측된 군집 결과를 붙인다.
df["cluster"]=cluster_prediction
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,cluster
0,5.1,3.5,1.4,0.2,0,1
1,4.9,3.0,1.4,0.2,0,1
2,4.7,3.2,1.3,0.2,0,1
3,4.6,3.1,1.5,0.2,0,1
4,5.0,3.6,1.4,0.2,0,1
...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2,2
146,6.3,2.5,5.0,1.9,2,0
147,6.5,3.0,5.2,2.0,2,2
148,6.2,3.4,5.4,2.3,2,2


In [10]:
# 적정한 K에 대해 붓꽃 데이터프레임을 넣어 K에 따른 inertia를 비교
# 값(3)이 적합한 변화시점임을 알 수 있음
scope = range(1,10)
inertias = []
for k in scope:
    model = KMeans(n_clusters = k, n_init=10, max_iter=500, random_state=42)
    model.fit(X)
    inertias.append(model.inertia_)
    print(k, inertias[k-1]) # 리스트 인덱스는 0부터 시작하므로 k-1로 표현


1 681.3706
2 152.3479517603579
3 78.851441426146
4 57.22847321428572
5 46.446182051282065
6 39.03998724608726
7 34.46949589883801
8 30.1865551948052
9 28.28937085137085


>>**02 연관분석을 이용한 문제 해결**

In [1]:
import numpy as np
import pandas as pd

# apriori, association_rules 모듈 호출
from mlxtend.frequent_patterns import apriori, association_rules

In [2]:
# pandas의 read_csv 함수 통한 csv 데이터 로드
df = pd.read_csv('https://raw.githubusercontent.com/YoungjinBD/dataset/main/retail_dataset.csv', sep=',')

  and should_run_async(code)


In [3]:
df

  and should_run_async(code)


Unnamed: 0,0,1,2,3,4,5,6
0,Bread,Wine,Eggs,Meat,Cheese,Pencil,Diaper
1,Bread,Cheese,Meat,Diaper,Wine,Milk,Pencil
2,Cheese,Meat,Eggs,Milk,Wine,,
3,Cheese,Meat,Eggs,Milk,Wine,,
4,Meat,Pencil,Wine,,,,
...,...,...,...,...,...,...,...
310,Bread,Eggs,Cheese,,,,
311,Meat,Milk,Pencil,,,,
312,Bread,Cheese,Eggs,Meat,Pencil,Diaper,Wine
313,Meat,Cheese,,,,,


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 315 entries, 0 to 314
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       315 non-null    object
 1   1       285 non-null    object
 2   2       245 non-null    object
 3   3       187 non-null    object
 4   4       133 non-null    object
 5   5       71 non-null     object
 6   6       41 non-null     object
dtypes: object(7)
memory usage: 17.4+ KB


  and should_run_async(code)


In [5]:
# 장바구니 데이터 고유항목 구분 출력
items = set()
for col in df:
    items.update(df[col].unique())

# 장바구니 목록 값(텍스트)을 수치로 표현-각 항목당 매칭될 경우 1로, 아니면 0으로 표시(one-hot encoding)
itemset = set(items)
encoding = []
for index, row in df.iterrows():
    rowset = set(row)
    labels = {}
    dismatching = list(itemset - rowset)
    matching = list(itemset.intersection(rowset))
    for i in dismatching:
        labels[i] = 0
    for j in matching:
        labels[j] = 1
    encoding.append(labels)
encoding[0]
result = pd.DataFrame(encoding)

result

  and should_run_async(code)


Unnamed: 0,Bagel,Milk,NaN,Meat,Pencil,Wine,Diaper,Bread,Cheese,Eggs
0,0,0,0,1,1,1,1,1,1,1
1,0,1,0,1,1,1,1,1,1,0
2,0,1,1,1,0,1,0,0,1,1
3,0,1,1,1,0,1,0,0,1,1
4,0,0,1,1,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
310,0,0,1,0,0,0,0,1,1,1
311,0,1,1,1,1,0,0,0,0,0
312,0,0,0,1,1,1,1,1,1,1
313,0,0,1,1,0,0,0,0,1,0


In [6]:
#첫째 NaN 열항목 삭제
result = result.drop(result.columns[0], axis=1)

result

  and should_run_async(code)


Unnamed: 0,Milk,NaN,Meat,Pencil,Wine,Diaper,Bread,Cheese,Eggs
0,0,0,1,1,1,1,1,1,1
1,1,0,1,1,1,1,1,1,0
2,1,1,1,0,1,0,0,1,1
3,1,1,1,0,1,0,0,1,1
4,0,1,1,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...
310,0,1,0,0,0,0,1,1,1
311,1,1,1,1,0,0,0,0,0
312,0,0,1,1,1,1,1,1,1
313,0,1,1,0,0,0,0,1,0


In [7]:
# apriori 함수 적용
freq_items = apriori(result, min_support=0.2, use_colnames=True)

  and should_run_async(code)


In [8]:
# association_rules로 규칙 도출(신뢰도 임계치 0.6 기반)
rules = association_rules(freq_items, metric="confidence", min_threshold=0.6)
rules.head()

  and should_run_async(code)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Milk),(nan),0.501587,0.869841,0.409524,0.816456,0.938626,-0.026778,0.709141,-0.115976
1,(Cheese),(Milk),0.501587,0.501587,0.304762,0.607595,1.211344,0.053172,1.270148,0.350053
2,(Milk),(Cheese),0.501587,0.501587,0.304762,0.607595,1.211344,0.053172,1.270148,0.350053
3,(Meat),(nan),0.47619,0.869841,0.368254,0.773333,0.889051,-0.045956,0.57423,-0.192405
4,(Pencil),(nan),0.361905,0.869841,0.266667,0.736842,0.8471,-0.048133,0.494603,-0.220499
