## 붓꽃 데이터 예제를 이용한 K-평균 군집화

In [14]:
# Load the iris dataset from the local CSV file into a DataFrame.
import pandas as pd

iris = pd.read_csv(filepath_or_buffer='./Data/iris.csv')

In [15]:
# Preview the first five rows to sanity-check the load.
iris.head(n=5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [16]:
# Inspect the column names, non-null counts, and dtypes of the DataFrame.
iris.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   class         150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [17]:
# Split the iris table into a target series and a feature frame.
# Target (dependent variable): `class`, a nominal label.
# Features (independent variables): the remaining continuous columns.
# NOTE: in practice there can be several candidate targets, and deciding which
# one is most appropriate is a judgement call.
y_iris = iris['class']
x_iris = iris.drop(columns='class')

In [18]:
# Fit a 3-cluster k-means model on the feature columns.
from sklearn.cluster import KMeans

# n_init is pinned to 10 (the historical default). scikit-learn 1.4 changed the
# default to 'auto', which runs k-means++ initialisation only once, so leaving
# it implicit can change the clustering between library versions.
k_means_fit = KMeans(n_clusters=3, n_init=10, max_iter=300, random_state=42)
k_means_fit.fit(x_iris)

KMeans(n_clusters=3, random_state=42)

In [23]:
# Confusion-matrix-style table: true species (rows) vs. k-means cluster label
# (columns). Cluster ids are arbitrary, so matches need not sit on the diagonal.
pd.crosstab(
    index=y_iris,
    columns=k_means_fit.labels_,
    rownames=['Actual'],
    colnames=['Predicted'],
)

Predicted,0,1,2
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Iris-setosa,0,50,0
Iris-versicolor,48,0,2
Iris-virginica,14,0,36


In [25]:
# Mean silhouette coefficient of the fitted 3-cluster solution, using
# Euclidean distance between samples.
from sklearn.metrics import silhouette_score

silhouette_score(X=x_iris, labels=k_means_fit.labels_, metric='euclidean')

0.5525919445499757

In [31]:
# Compare silhouette scores for k = 2..9; a higher score means the clusters
# are more compact and better separated.
# NOTE(review): the original note claimed the optimal cluster count is "k + 1"
# for the best-scoring k. That is not a general silhouette rule — confirm the
# intended heuristic before relying on it.
for k in range(2, 10):
    # n_init pinned to 10 so results do not depend on the scikit-learn
    # version's default (changed to 'auto' in 1.4).
    k_means_fitk = KMeans(n_clusters=k, n_init=10, max_iter=300, random_state=42)
    k_means_fitk.fit(x_iris)
    score = silhouette_score(x_iris, k_means_fitk.labels_, metric='euclidean')
    print('For K value', k, ', silhouette-score: ', score)

For K value 2 , silhouette-score:  0.6808136202936816
For K value 3 , silhouette-score:  0.5525919445499757
For K value 4 , silhouette-score:  0.4978256901095472
For K value 5 , silhouette-score:  0.4885175508886279
For K value 6 , silhouette-score:  0.36820569682713084
For K value 7 , silhouette-score:  0.3553790560401653
For K value 8 , silhouette-score:  0.3651645360269737
For K value 9 , silhouette-score:  0.32883150368756536


In [32]:
# Candidate cluster counts: 1 through 9 inclusive.
K = range(1, 10)

In [37]:
# Fit one k-means model for every candidate cluster count in K.
# n_init is pinned to 10 (the historical default); scikit-learn 1.4 changed the
# default to 'auto', so leaving it implicit makes results version-dependent.
KM = [
    KMeans(n_clusters=k, n_init=10, max_iter=300, random_state=42).fit(x_iris)
    for k in K
]
KM

[KMeans(n_clusters=1, random_state=42),
 KMeans(n_clusters=2, random_state=42),
 KMeans(n_clusters=3, random_state=42),
 KMeans(n_clusters=4, random_state=42),
 KMeans(n_clusters=5, random_state=42),
 KMeans(n_clusters=6, random_state=42),
 KMeans(n_clusters=7, random_state=42),
 KMeans(random_state=42),
 KMeans(n_clusters=9, random_state=42)]

In [41]:
# Collect the centroid coordinates of every fitted model, in the same order as KM.
centroids = [model.cluster_centers_ for model in KM]

In [56]:
from scipy.spatial.distance import cdist, pdist

# For each model: Euclidean distance from every sample to every centroid.
D_k = [cdist(x_iris, centers, metric='euclidean') for centers in centroids]

In [58]:
import numpy as np

# For each model: index of the nearest centroid for every sample.
cIdx = [distances.argmin(axis=1) for distances in D_k]

In [60]:
# For each model: distance from every sample to its nearest centroid.
dist = [distances.min(axis=1) for distances in D_k]

In [61]:
# Per-k average of each sample's distance to its nearest centroid.
# NOTE: despite the "SS" in the name, the distances are summed unsquared.
avgWithinSS = [sum(nearest) / len(x_iris) for nearest in dist]