# 6. より高度な分析3：食べ物の好みで都道府県を分類する

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import seaborn as sns
import japanize_matplotlib 

### データの読み込み

In [None]:
# Load the meat data table from meat.csv. The file is decoded with the
# Shift_JIS codec ("sjis") given explicitly rather than pandas' default.
df = pd.read_csv("meat.csv", encoding="sjis")
# Bare expression: rendered as the cell output when run in a notebook.
df

### コード6.1：k-means(k=3)

In [None]:
# Feature matrix: the beef / pork / chicken columns of the table as a 2-D array.
meat_columns = ["牛肉", "豚肉", "鶏肉"]
X = df[meat_columns].to_numpy()

# Number of clusters used for this run.
k = 3

# Random centroid initialisation with a fixed seed so the result is repeatable.
# fit() returns the estimator itself, so the fitted model is bound directly.
kmeans = KMeans(n_clusters=k, init="random", random_state=24).fit(X)
y_pred = kmeans.predict(X)

# Store each row's cluster label alongside the original columns.
df["cluster"] = y_pred
# Bare expression: rendered as the cell output when run in a notebook.
df

### 表6.1：k-meansによるクラスタリングの結果

In [None]:
# List the members of every cluster, ten names per output line.
for cluster_id in range(k):
    print(f"[cluster {cluster_id}]")
    members = df.loc[df["cluster"] == cluster_id, "市"].tolist()
    for start in range(0, len(members), 10):
        row = members[start:start + 10]
        # Each name is followed by a single space; only a complete row of ten
        # is terminated with a newline here.
        print("".join(f"{name} " for name in row), end="\n" if len(row) == 10 else "")
    # Ends the (possibly partial) last row and leaves a blank separator line.
    print("\n")

### 図6.4～6.6：クラスタごとの牛肉，豚肉，鶏肉の購入量の比較

In [None]:
# Draw one box plot per meat type, with one box per cluster label.
# whis=[0, 100] is a percentile pair, so the whiskers span the full data range.
for column in ("牛肉", "豚肉", "鶏肉"):
    sns.boxplot(
        x="cluster",
        y=column,
        data=df,
        width=0.5,
        whis=[0, 100],
    )
    # Show each plot as its own figure before starting the next one.
    plt.show()

### コード6.2および図6.9：肉類の購入量データへのエルボー法の適用

In [None]:
# Elbow method: fit k-means for each candidate number of clusters and record
# the within-cluster sum of squared errors (KMeans.inertia_).
#
# The fits are done inside a list comprehension on purpose: its loop variable
# and the temporary models are local to the comprehension, so the notebook-level
# `k` and `kmeans` set by the k=3 clustering cell are NOT overwritten when this
# cell runs. (A plain `for k in ...` loop would leave k == max_k - 1 behind and
# break a later re-run of the cluster listing cell.)
max_k = 10
k_values = range(1, max_k)  # upper bound is exclusive: k = 1 .. max_k - 1
sse = [
    # Divide by 10^8 so the values match the scale stated in the y-axis label.
    KMeans(n_clusters=n_clusters, random_state=0).fit(X).inertia_ / 1e8
    for n_clusters in k_values
]

plt.plot(k_values, sse, marker='+')
plt.xlabel("$k$")
plt.ylabel(r"SSE $(\times 10^8)$")
plt.show()

### コード6.3および図6.11：ウォード法による階層的クラスタリングのデンドログラム

In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from scipy.spatial.distance import pdist

# Ward-linkage hierarchical clustering on the pairwise Euclidean distances
# between the rows of X.
distances = pdist(X, metric="euclidean")
result = linkage(distances, method="ward")

figure = plt.figure(figsize=(12, 5))

# Join the characters of every city name with newlines so that each leaf label
# is written vertically and can stay unrotated.
labels = list(map("\n".join, df["市"]))

dendrogram(result, labels=labels, leaf_rotation=0, leaf_font_size=14)
# The y-axis label is also stacked vertically and kept upright.
plt.ylabel("距\n離", labelpad=10, rotation=0)
plt.tight_layout()
plt.show()