# ここではscikit-learnを使用した機械学習モデルの基本的な使い方をまとめる

## 1. scikit-learnのデータ

### 糖尿病データ

In [2]:
# scikit-learn ships experimental datasets that can be loaded through its API.
# Use the diabetes data here.

from sklearn.datasets import load_diabetes
diabetes = load_diabetes()

In [16]:
# Inspect the loaded dataset: the container type first, then one labelled line per property.
print(type(diabetes))
for _label, _value in (
    ('説明変数のデータ数：', diabetes.data.shape),    # explanatory variables (features)
    ('説明変数のデータの種類：', type(diabetes.data)),
    ('目的変数のデータ数：', diabetes.target.shape),    # target variable
    ('目的変数のデータの種類：', type(diabetes.target)),
    ('カラム名：', diabetes.feature_names),    # column names
    ('カラム名の種類', type(diabetes.feature_names)),
):
    # print() joins the label and the value with a single space, as the separate calls did.
    print(_label, _value)

<class 'sklearn.utils._bunch.Bunch'>
説明変数のデータ数： (442, 10)
説明変数のデータの種類： <class 'numpy.ndarray'>
目的変数のデータ数： (442,)
目的変数のデータの種類： <class 'numpy.ndarray'>
カラム名： ['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']
カラム名の種類 <class 'list'>


In [20]:
# All of the data is in numpy.ndarray form, which is hard to read, so convert it to a DataFrame.
import pandas as pd

# Use the dataset's feature names as the column labels.
df_diabetes = pd.DataFrame(data=diabetes.data, columns=diabetes.feature_names)
df_diabetes

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204
2,0.085299,0.050680,0.044451,-0.005670,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.025930
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641
...,...,...,...,...,...,...,...,...,...,...
437,0.041708,0.050680,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207
438,-0.005515,0.050680,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018114,0.044485
439,0.041708,0.050680,-0.015906,0.017293,-0.037344,-0.013840,-0.024993,-0.011080,-0.046883,0.015491
440,-0.045472,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.026560,0.044529,-0.025930


In [21]:
# Append the target values to the DataFrame as a new 'target' column.
df_diabetes['target'] = diabetes.target
df_diabetes

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204,75.0
2,0.085299,0.050680,0.044451,-0.005670,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.025930,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641,135.0
...,...,...,...,...,...,...,...,...,...,...,...
437,0.041708,0.050680,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207,178.0
438,-0.005515,0.050680,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018114,0.044485,104.0
439,0.041708,0.050680,-0.015906,0.017293,-0.037344,-0.013840,-0.024993,-0.011080,-0.046883,0.015491,132.0
440,-0.045472,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.026560,0.044529,-0.025930,220.0


In [24]:
# Show the first 8 rows.
df_diabetes.head(8)

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204,75.0
2,0.085299,0.05068,0.044451,-0.00567,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.02593,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641,135.0
5,-0.092695,-0.044642,-0.040696,-0.019442,-0.068991,-0.079288,0.041277,-0.076395,-0.041176,-0.096346,97.0
6,-0.045472,0.05068,-0.047163,-0.015999,-0.040096,-0.0248,0.000779,-0.039493,-0.062917,-0.038357,138.0
7,0.063504,0.05068,-0.001895,0.066629,0.09062,0.108914,0.022869,0.017703,-0.035816,0.003064,63.0


### ワインの化学的特徴のデータセット

In [26]:
# Load the wine dataset bundled with scikit-learn.
from sklearn.datasets import load_wine
wine = load_wine()

In [28]:
# Convert to a DataFrame (feature names become the columns), then append the target column.
df_wine = pd.DataFrame(data = wine.data, columns=wine.feature_names)
df_wine['target'] = wine.target
df_wine

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127.0,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.20,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.50,16.8,113.0,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740.0,2
174,13.40,3.91,2.48,23.0,102.0,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750.0,2
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835.0,2
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840.0,2


In [29]:
# Take a closer look: summary statistics for every column.
df_wine.describe()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
count,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0
mean,13.000618,2.336348,2.366517,19.494944,99.741573,2.295112,2.02927,0.361854,1.590899,5.05809,0.957449,2.611685,746.893258,0.938202
std,0.811827,1.117146,0.274344,3.339564,14.282484,0.625851,0.998859,0.124453,0.572359,2.318286,0.228572,0.70999,314.907474,0.775035
min,11.03,0.74,1.36,10.6,70.0,0.98,0.34,0.13,0.41,1.28,0.48,1.27,278.0,0.0
25%,12.3625,1.6025,2.21,17.2,88.0,1.7425,1.205,0.27,1.25,3.22,0.7825,1.9375,500.5,0.0
50%,13.05,1.865,2.36,19.5,98.0,2.355,2.135,0.34,1.555,4.69,0.965,2.78,673.5,1.0
75%,13.6775,3.0825,2.5575,21.5,107.0,2.8,2.875,0.4375,1.95,6.2,1.12,3.17,985.0,2.0
max,14.83,5.8,3.23,30.0,162.0,3.88,5.08,0.66,3.58,13.0,1.71,4.0,1680.0,2.0


In [38]:
# Check which distinct target values exist and how often each occurs; there appear to be three.
print(df_wine['target'].value_counts())

target
1    71
0    59
2    48
Name: count, dtype: int64


## 2. データの前処理

### 標準化

In [39]:
# Standardization: rescale so that the mean is 0 and the standard deviation is 1.
# In the neural-network field, skipping standardization can lower model accuracy,
# because the machine cannot take the units of the numbers into account.
# Standardize total_phenols (total phenol content) of df_wine.
df_wine.describe()['total_phenols']    # mean is about 2.29 and standard deviation about 0.62


count    178.000000
mean       2.295112
std        0.625851
min        0.980000
25%        1.742500
50%        2.355000
75%        2.800000
max        3.880000
Name: total_phenols, dtype: float64

In [41]:
# Standardize the data.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(df_wine) # fit the scaler to the data
# NOTE(review): df_wine still contains the 'target' column here, so the class labels are
# standardized together with the features — confirm this is intended before df_wine_sc is
# used for modeling.
df_wine_sc = pd.DataFrame(sc.transform(df_wine), columns = df_wine.columns)    # sc.transform alone returns a numpy.ndarray
# describe() reports std of about 1.0028 rather than exactly 1: pandas uses the sample
# standard deviation (ddof=1) while StandardScaler scales by the population one (ddof=0).
df_wine_sc.describe()['total_phenols']    # looks properly standardized

count    178.000000
mean       0.000000
std        1.002821
min       -2.107246
25%       -0.885468
50%        0.095960
75%        0.808997
max        2.539515
Name: total_phenols, dtype: float64

In [37]:
# The raw result of transform is a numpy array without column labels.
sc.transform(df_wine)

array([[ 1.51861254, -0.5622498 ,  0.23205254, ...,  1.84791957,
         1.01300893, -1.21394365],
       [ 0.24628963, -0.49941338, -0.82799632, ...,  1.1134493 ,
         0.96524152, -1.21394365],
       [ 0.19687903,  0.02123125,  1.10933436, ...,  0.78858745,
         1.39514818, -1.21394365],
       ...,
       [ 0.33275817,  1.74474449, -0.38935541, ..., -1.48544548,
         0.28057537,  1.37386437],
       [ 0.20923168,  0.22769377,  0.01273209, ..., -1.40069891,
         0.29649784,  1.37386437],
       [ 1.39508604,  1.58316512,  1.36520822, ..., -1.42894777,
        -0.59516041,  1.37386437]])

### 正規化

In [42]:
# データの正規化
# 正規化：特徴量の値の範囲を一定の範囲に収めるスケーリングをする
# ワインのデータを用いて正規化を行う

In [44]:
# Check the minimum and maximum values of the wine data.
df_wine.describe().loc[['min', 'max']]

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
min,11.03,0.74,1.36,10.6,70.0,0.98,0.34,0.13,0.41,1.28,0.48,1.27,278.0,0.0
max,14.83,5.8,3.23,30.0,162.0,3.88,5.08,0.66,3.58,13.0,1.71,4.0,1680.0,2.0


In [52]:
# Display the wine data before normalization.
df_wine

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127.0,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.20,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.50,16.8,113.0,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740.0,2
174,13.40,3.91,2.48,23.0,102.0,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750.0,2
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835.0,2
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840.0,2


In [54]:
from sklearn.preprocessing import MinMaxScaler
# Rescale every column into the range -1 to 1.
ms = MinMaxScaler((-1, 1))
ms.fit(df_wine)
# NOTE(review): 'target' is still a column of df_wine, so the class labels are rescaled as
# well — confirm this is intended before df_wine_ms is used for modeling.
df_wine_ms = pd.DataFrame(ms.transform(df_wine), columns=df_wine.columns)
df_wine_ms.describe().loc[['min','max']]    # every column is indeed adjusted to the range -1 to 1

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
min,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### ラベルエンコーディング

In [79]:
# Label encoding: encode a categorical variable as numbers.
# Sample data, one (vegetable name, price) pair per row; the prices are stored as strings.
df = pd.DataFrame(
    [
        ('キャベツ', '100'),
        ('レタス', '140'),
        ('ネギ', '120'),
        ('ネギ', '200'),
        ('ネギ', '80'),
        ('キャベツ', '50'),
    ],
    columns=['野菜名', '値段'],
)
df

Unnamed: 0,野菜名,値段
0,キャベツ,100
1,レタス,140
2,ネギ,120
3,ネギ,200
4,ネギ,80
5,キャベツ,50


In [80]:
# Label encoding: fit an encoder on the vegetable-name column.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(df['野菜名'])

In [81]:
# Replace the vegetable names with the integer codes produced by the fitted encoder.
df['野菜名'] = le.transform(df['野菜名'])
df

Unnamed: 0,野菜名,値段
0,0,100
1,2,140
2,1,120
3,1,200
4,1,80
5,0,50


In [82]:
# classes_ shows the unique class labels contained in the training data and their dtype.
le.classes_

array(['キャベツ', 'ネギ', 'レタス'], dtype=object)

In [83]:
# Try label-encoding the price column as well.
# The prices in df are strings, so each distinct string is treated as one category.
le = LabelEncoder()
le.fit(df['値段'])
df['値段'] = le.transform(df['値段'])
df

Unnamed: 0,野菜名,値段
0,0,0
1,2,2
2,1,1
3,1,3
4,1,5
5,0,4


In [85]:
# The classes are sorted, and the codes follow that order. Because the prices are strings the
# order is lexicographic ('100' < '120' < '140' < '200' < '50' < '80'), not numeric.
le.classes_

array(['100', '120', '140', '200', '50', '80'], dtype=object)

### ワンホットエンコーディング

In [102]:
# One-hot encoding: represent a categorical variable as a multi-dimensional vector of 0s and 1s.
df = pd.DataFrame(
    ['キャベツ', 'レタス', 'ネギ', 'ネギ', 'ネギ', 'キャベツ'],
    columns=['野菜名'],
)
df

Unnamed: 0,野菜名
0,キャベツ
1,レタス
2,ネギ
3,ネギ
4,ネギ
5,キャベツ


In [103]:
from sklearn.preprocessing import OneHotEncoder
# With sparse_output=True (the default) a sparse matrix is returned: a structure that stores a
# large, mostly-zero matrix efficiently by keeping only the non-zero elements and their positions.
# With sparse_output=False a plain numpy.ndarray is returned.
# `sparse_output` replaces the former `sparse` keyword, which was deprecated in
# scikit-learn 1.2 and removed in 1.4.
enc = OneHotEncoder(sparse_output=False)

In [104]:
enc.fit_transform(df)
# The result is a plain numpy array.



array([[1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.]])

In [106]:
# The labels used for the encoding can be read from the instance's categories_ attribute.
enc.categories_

[array(['キャベツ', 'ネギ', 'レタス'], dtype=object)]

In [107]:
# Rebuild df from the one-hot matrix, using the category labels as column names.
# enc.categories_ is a list with one array per input column; passing the list itself as
# `columns` makes pandas build a one-level MultiIndex, so take the array for the single
# input column to get an ordinary flat column index.
# (fit_transform is evaluated before categories_ is read, so the labels are up to date.)
df = pd.DataFrame(enc.fit_transform(df), columns=enc.categories_[0])
df



Unnamed: 0,キャベツ,ネギ,レタス
0,1.0,0.0,0.0
1,0.0,0.0,1.0
2,0.0,1.0,0.0
3,0.0,1.0,0.0
4,0.0,1.0,0.0
5,1.0,0.0,0.0


### データ分割