<a href="https://colab.research.google.com/github/ailab-nda/ML/blob/main/chapter05.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 第５章の例

In [None]:
import matplotlib.pyplot as plt
# Use the serif family as matplotlib's default font.
plt.rcParams['font.family'] = 'serif'
# Put Times New Roman first in the serif fallback list. Any existing entry is
# dropped before prepending, so re-running this cell does not keep growing the
# list with duplicate names.
plt.rcParams['font.serif'] = ['Times New Roman'] + [
    name for name in plt.rcParams['font.serif'] if name != 'Times New Roman'
]

## K-means


### モジュールのインポート

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

### サンプルデータの読み込み

In [None]:
# Remote location of the clustering sample CSV (raw file served from GitHub).
url = "https://github.com/rinsaka/sample-data-sets/blob/master/clustering-sample.csv?raw=true"
# url = "clustering-sample.csv"  # use this instead to read from the current directory
df = pd.read_csv(url)
df  # last expression of the cell: display the loaded frame

### 散布図の作成

In [None]:
# Scatter plot of the raw sample points on a 6x6 inch figure.
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(df['x'], df['y'], alpha=0.5)
# Title, axis labels and fixed axis limits, applied in a single call.
ax.set(
    title="Sample data",
    xlabel='x',
    ylabel='y',
    xlim=(-1, 11),
    ylim=(-1, 11),
)
# plt.savefig('cluster_sample.png', dpi=300, facecolor='white')
plt.show()

### 学習

In [None]:
# Feature matrix for clustering: the 'x' and 'y' columns as an array.
xy = df.loc[:, ['x', 'y']].values

k = 3  # number of clusters to fit
clf = KMeans(n_clusters=k)  # configure the model
# clf = KMeans(n_clusters=k, random_state=1)  # use this instead for reproducible results
clf.fit(xy)  # run the clustering
pred = clf.predict(xy)  # cluster index assigned to each sample
df['cluster_id'] = pred

fig, ax = plt.subplots(1, 1, figsize=(6, 6))
colors = ['Red', 'Blue', 'Green']  # one colour per cluster; needs at least k entries

for cls in range(k):
    # Select the rows of this cluster once and reuse them for both coordinates.
    members = df.loc[df['cluster_id'] == cls]
    x = members['x']
    y = members['y']
    # Legend label spelling corrected ("cluseter" -> "cluster").
    ax.scatter(x, y, alpha=0.5, label=f"cluster {cls}", color=colors[cls])

ax.set_title("Clustering results")
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_xlim(-1, 11)
ax.set_ylim(-1, 11)
ax.legend(loc='upper left')
# plt.savefig('cluster_scatter.png', dpi=300, facecolor='white')
plt.show()

## 決定木
参考: <https://smart-hint.com/ml/tree/>

### 事前準備

In [None]:
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.tree import plot_tree

### データの読み取りと前処理

In [None]:
# Load the 'titanic' data set through seaborn's loader.
df = sns.load_dataset('titanic')

# Target variable: the 'survived' column.
df_y = df['survived']

# Explanatory variables: 'sex', 'pclass' and 'fare'. Note that it is these
# explanatory variables (not the target) that get dummy-encoded;
# drop_first=True omits the first level of each encoded column.
feature_columns = ['sex', 'pclass', 'fare']
df_x = pd.get_dummies(df[feature_columns], drop_first=True)

### 学習

In [None]:
# Split into training and test subsets (fixed seed so the split is repeatable).
train_x, test_x, train_y, test_y = train_test_split(df_x, df_y, random_state=1)

# Build and fit a shallow decision tree (depth limited to 2).
model = tree.DecisionTreeClassifier(max_depth=2, random_state=1)
model.fit(train_x, train_y)

# Visualise the fitted tree. The column labels are passed as a plain list
# because some scikit-learn releases reject a pandas Index for feature_names.
plot_tree(model, feature_names=list(train_x.columns), class_names=True, filled=True)

# Score of the fitted model on the held-out test set.
print("score: ", model.score(test_x, test_y))