<a href="https://colab.research.google.com/github/aettikang/bigdata_analysis_basic/blob/main/%EB%B9%84%EA%B3%84%EC%B8%B5%EC%A0%81%EA%B5%B0%EC%A7%91%ED%99%94.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 샘플 데이터와 Non-Hierarchical Clustering 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# Seed NumPy's global random number generator with a fixed value.
np.random.seed(2021)

## 1. Data

### 1.1 Sample Data

In [None]:
from sklearn.datasets import make_blobs


# Draw 1500 synthetic points together with the label of the blob each came from.
data, label = make_blobs(
    random_state=170,
    n_samples=1500,
)

In [None]:
# Display the generated sample coordinates.
data

In [None]:
# Display the label returned by make_blobs for each sample.
label

In [None]:
# Scatter the first two columns of the samples, coloured by their true label.
plt.scatter(x=data[:, 0], y=data[:, 1], c=label)

## 2. K Means

### 2.1 정확한 군집의 개수를 맞춘 경우

In [None]:
from sklearn.cluster import KMeans

# K-Means with the true number of clusters (3).
# random_state is pinned, as in the later KMeans cells of this notebook, so the
# result is the same no matter which cells were executed before this one.
correct_kmeans = KMeans(n_clusters=3, random_state=2021)

In [None]:
# Fit the 3-cluster model on the sample data.
correct_kmeans.fit(data)

In [None]:
# Assign every sample to one of the fitted clusters.
correct_pred = correct_kmeans.predict(data)

In [None]:
# Check the dimensions of the sample array.
data.shape

In [None]:
# Display the predicted cluster index of each sample.
correct_pred

In [None]:
# Display the centroid coordinates of the fitted model.
correct_kmeans.cluster_centers_

In [None]:
# Keep the fitted centroids in a variable for plotting.
correct_center = correct_kmeans.cluster_centers_

In [None]:
# Samples coloured by predicted cluster.
plt.scatter(x=data[:, 0], y=data[:, 1], c=correct_pred)
# Centroids drawn afterwards, as red stars on top of the samples.
plt.scatter(
    x=correct_center[:, 0],
    y=correct_center[:, 1],
    color="red",
    marker="*",
    s=100,
)

### 2.2 군집의 개수를 틀린 경우

#### 2.2.1 적은 경우

In [None]:
# K-Means with too few clusters (2).
# random_state is pinned, as in the later KMeans cells of this notebook, so the
# result is the same no matter which cells were executed before this one.
small_kmeans = KMeans(n_clusters=2, random_state=2021)

In [None]:
# Fit the 2-cluster model on the sample data.
small_kmeans.fit(data)

In [None]:
# Assign every sample to one of the two fitted clusters.
small_pred = small_kmeans.predict(data)

In [None]:
# Keep the fitted centroids in a variable for plotting.
small_center = small_kmeans.cluster_centers_

In [None]:
# Samples coloured by predicted cluster.
plt.scatter(x=data[:, 0], y=data[:, 1], c=small_pred)
# Centroids drawn afterwards, as red stars on top of the samples.
plt.scatter(
    x=small_center[:, 0],
    y=small_center[:, 1],
    color="red",
    marker="*",
    s=100,
)

#### 2.2.2 큰 경우

In [None]:
# K-Means with too many clusters (4).
# random_state is pinned, as in the later KMeans cells of this notebook, so the
# result is the same no matter which cells were executed before this one.
large_kmeans = KMeans(n_clusters=4, random_state=2021)

In [None]:
# Fit the 4-cluster model on the sample data.
large_kmeans.fit(data)

In [None]:
# Assign every sample to one of the four fitted clusters.
large_pred = large_kmeans.predict(data)

In [None]:
# Display the predicted cluster index of each sample.
large_pred

In [None]:
# Keep the fitted centroids in a variable for plotting.
large_center = large_kmeans.cluster_centers_

In [None]:
# Samples coloured by predicted cluster.
plt.scatter(x=data[:, 0], y=data[:, 1], c=large_pred)
# Centroids drawn afterwards, as red stars on top of the samples.
plt.scatter(
    x=large_center[:, 0],
    y=large_center[:, 1],
    color="red",
    marker="*",
    s=100,
)

### 2.3 적절한 K를 찾기

In [None]:
# Fit K-Means for K = 1, 3, ..., 11 and record each model's inertia_ (the SSE
# plotted in the next cell).
sse_per_n = []

for n in range(1, 12, 2):
    # Fixed random_state, matching the seeded KMeans cells later in this
    # notebook, so the curve does not change with cell execution order.
    kmeans = KMeans(n_clusters=n, random_state=2021)
    kmeans.fit(data)
    sse = kmeans.inertia_
    sse_per_n.append(sse)

In [None]:
# Plot the recorded SSE against the K values that were tried (1, 3, ..., 11).
plt.plot(range(1, 12, 2), sse_per_n)
plt.title("Sum of Squared Error")

## 3. K Means의 한계

### 3.1 서로 다른 크기의 군집

In [None]:
# 1500 synthetic points; cluster_std supplies one standard deviation per blob.
size_data, size_label = make_blobs(
    random_state=170,
    cluster_std=[1.0, 2.5, 0.5],
    n_samples=1500,
)

In [None]:
# Display the generated sample coordinates.
size_data

In [None]:
# Build clusters of very different sizes by keeping at most 500 / 100 / 10
# points from clusters 0 / 1 / 2.
keep_per_cluster = {0: 500, 1: 100, 2: 10}

# Labels stay a NumPy array so the boolean masks below also work when this cell
# is executed again on its own output (a plain list would make
# `size_label == 0` a single False instead of a per-sample mask).
size_label = np.asarray(size_label)
subsets = [
    size_data[size_label == cluster][:keep]
    for cluster, keep in keep_per_cluster.items()
]
size_data = np.vstack(subsets)
# Derive the labels from the rows actually kept so both always have equal length.
size_label = np.repeat(list(keep_per_cluster), [len(rows) for rows in subsets])

In [None]:
# Check the dimensions of the sub-sampled array.
size_data.shape

In [None]:
# Scatter the sub-sampled points, coloured by their true label.
plt.scatter(x=size_data[:, 0], y=size_data[:, 1], c=size_label)

In [None]:
# 3-cluster K-Means with a fixed random_state.
size_kmeans = KMeans(n_clusters=3, random_state=2021)

In [None]:
# Check the dimensions of the array about to be clustered.
size_data.shape

In [None]:
# Fit on size_data and get each sample's cluster index in one call.
size_pred = size_kmeans.fit_predict(size_data)

In [None]:
# Display the clustered sample coordinates.
size_data

In [None]:
# Keep the fitted centroids in a variable for plotting.
size_center = size_kmeans.cluster_centers_

In [None]:
# Samples coloured by predicted cluster.
plt.scatter(x=size_data[:, 0], y=size_data[:, 1], c=size_pred)
# Centroids drawn afterwards, as red stars on top of the samples.
plt.scatter(
    x=size_center[:, 0],
    y=size_center[:, 1],
    color="red",
    marker="*",
    s=100,
)

### 3.2 서로 다른 밀도의 군집

In [None]:
# 1500 synthetic points; cluster_std supplies one standard deviation per blob.
density_data, density_label = make_blobs(
    random_state=170,
    cluster_std=[1.0, 2.5, 0.5],
    n_samples=1500,
)

In [None]:
# Scatter the points, coloured by their true label.
plt.scatter(x=density_data[:, 0], y=density_data[:, 1], c=density_label)

In [None]:
# 3-cluster K-Means with a fixed random_state.
density_kmeans = KMeans(n_clusters=3, random_state=2021)

In [None]:
# Fit on density_data and get each sample's cluster index in one call.
density_pred = density_kmeans.fit_predict(density_data)

In [None]:
# Keep the fitted centroids in a variable for plotting.
density_center = density_kmeans.cluster_centers_

In [None]:
# Samples coloured by predicted cluster.
plt.scatter(x=density_data[:, 0], y=density_data[:, 1], c=density_pred)
# Centroids drawn afterwards, as red stars on top of the samples.
plt.scatter(
    x=density_center[:, 0],
    y=density_center[:, 1],
    color="red",
    marker="*",
    s=100,
)

### 3.3 지역적 패턴이 있는 군집

In [None]:
# 2x2 linear map, written as a nested list; every sample row of `data` is
# right-multiplied by it.
transformation = [
    [0.60834549, -0.63667341],
    [-0.40887718, 0.85253229],
]
pattern_data = data.dot(transformation)

In [None]:
# Scatter the transformed points, coloured by the original true label.
plt.scatter(x=pattern_data[:, 0], y=pattern_data[:, 1], c=label)

In [None]:
# 3-cluster K-Means with a fixed random_state.
pattern_kmeans = KMeans(n_clusters=3, random_state=2021)

In [None]:
# Fit on pattern_data and get each sample's cluster index in one call.
pattern_pred = pattern_kmeans.fit_predict(pattern_data)

In [None]:
# Keep the fitted centroids in a variable for plotting.
pattern_center = pattern_kmeans.cluster_centers_

In [None]:
# Samples coloured by predicted cluster.
plt.scatter(x=pattern_data[:, 0], y=pattern_data[:, 1], c=pattern_pred)
# Centroids drawn afterwards, as red stars on top of the samples.
plt.scatter(
    x=pattern_center[:, 0],
    y=pattern_center[:, 1],
    color="red",
    marker="*",
    s=100,
)

## 4. DBSCAN

이번에는 K Means가 한계를 보였던 데이터에 DBSCAN을 적용해 보겠습니다.

In [None]:
from sklearn.cluster import DBSCAN

### 4.1 서로 다른 크기의 군집

In [None]:
# DBSCAN with eps set to 1.0; all other parameters are left at their defaults.
size_dbscan = DBSCAN(eps=1.0)

In [None]:
# Cluster size_data with DBSCAN and keep the per-sample labels.
size_db_pred = size_dbscan.fit_predict(size_data)

In [None]:
# Scatter the points, coloured by their DBSCAN label.
plt.scatter(x=size_data[:, 0], y=size_data[:, 1], c=size_db_pred)

### 4.2 서로 다른 밀도의 군집

In [None]:
# DBSCAN with every parameter left at its default.
density_dbscan = DBSCAN()

In [None]:
# Cluster density_data with DBSCAN and keep the per-sample labels.
density_db_pred = density_dbscan.fit_predict(density_data)

In [None]:
# Scatter the points, coloured by their DBSCAN label.
plt.scatter(x=density_data[:, 0], y=density_data[:, 1], c=density_db_pred)

### 4.3 지역적 패턴이 있는 군집

In [None]:
# DBSCAN with a neighbourhood radius (eps) of 0.3 and min_samples of 20.
pattern_db = DBSCAN(min_samples=20, eps=0.3)

In [None]:
# Cluster pattern_data with DBSCAN and keep the per-sample labels.
pattern_db_pred = pattern_db.fit_predict(pattern_data)

In [None]:
# Scatter the transformed points, coloured by their DBSCAN label.
plt.scatter(x=pattern_data[:, 0], y=pattern_data[:, 1], c=pattern_db_pred)