# Sprint 機械学習スクラッチ入門

## 【問題1】train_test_splitのスクラッチ
スクラッチの練習として、scikit-learnのtrain_test_splitを自作してみます。以下の雛形をベースとして関数を完成させてください。


[sklearn.model_selection.train_test_split — scikit-learn 0.21.3 documentation](https://scikit-learn.org/0.21/modules/generated/sklearn.model_selection.train_test_split.html)


なお、作成した関数がscikit-learnのtrain_test_splitと同じ動作をしているか必ず確認をするようにしましょう。

In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [34]:
def scratch_train_test_split(X, y, test_size=0.2, random_state=0):
    """
    Shuffle X and y together and split them into train and test subsets.

    A single random permutation of the sample indices is drawn and applied to
    both X and y, so every row of X stays paired with its label. The first
    ``ceil(n_samples * test_size)`` shuffled samples form the test set and the
    rest form the training set, which is the same layout scikit-learn's
    ``train_test_split`` uses for a float ``test_size``.

    Parameters
    ----------
    X : array-like, shape (n_samples, ...)
      Feature data. DataFrames, Series and lists are converted to ndarray.
    y : array-like, shape (n_samples, ...)
      Target values, same length as X.
    test_size : float (0 < test_size < 1)
      Fraction of the samples to put in the test set.
    random_state : int or None, default 0
      Seed of the private random generator used for shuffling. The default
      reproduces the shuffle of the earlier version, which always seeded
      with 0. ``None`` gives a different shuffle on every call.

    Returns
    ----------
    X_train : ndarray, shape (n_train, ...)
      Training features.
    X_test : ndarray, shape (n_test, ...)
      Test features.
    y_train : ndarray, shape (n_train, ...)
      Training targets.
    y_test : ndarray, shape (n_test, ...)
      Test targets.

    Raises
    ------
    ValueError
      If X and y have different lengths, or test_size is not in (0, 1).
    """
    if len(X) != len(y):
        raise ValueError(
            "Found input variables with inconsistent numbers of samples: [{}, {}]".format(len(X), len(y))
        )

    if not 0 < test_size < 1:
        raise ValueError(
            "test_size must be a float in the (0, 1) range, got {}".format(test_size)
        )

    # Convert once to ndarray so that positional indexing works for
    # DataFrame / Series / list inputs alike (a Series would otherwise be
    # indexed by label).
    X = np.asarray(X)
    y = np.asarray(y)

    n_samples = X.shape[0]

    # Round the test count up, like scikit-learn does for a float test_size.
    n_test = int(np.ceil(n_samples * test_size))

    # Use a private generator instead of np.random.seed(): the caller's
    # global random state is left untouched.
    rng = np.random.RandomState(random_state)
    shuffled = rng.permutation(n_samples)

    # The same index arrays are applied to X and y, so pairs stay aligned.
    test_idx = shuffled[:n_test]
    train_idx = shuffled[n_test:]

    return X[train_idx], X[test_idx], y[train_idx], y[test_idx]

In [35]:
# Build a reproducible toy dataset used to compare the two split implementations.
import numpy as np
np.random.seed(seed=0)  # fix the global NumPy RNG so f0/f1 are the same on every run

f0 = np.random.randint(1, 100, 100)  # 100 random integers drawn from [1, 100)
f1 = np.random.random(size=100)  # 100 random floats drawn from [0, 1)

X = np.stack([f0, f1], axis=1)  # put the two features side by side -> shape (100, 2)
display(X.shape)
y = np.concatenate((np.ones(50), np.zeros(50)))  # first 50 labels are 1, last 50 are 0
display(y.shape)

(100, 2)

(100,)

In [36]:
from sklearn.model_selection import train_test_split

# Reference split produced by scikit-learn; random_state=0 fixes the shuffle
# so the result can be compared with the scratch implementation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Print every part together with its shape.
print("---sklearn---")
print("X_train:{} shape:{}".format(X_train, X_train.shape))
print("X_test:{} shape:{}".format(X_test, X_test.shape))
print("y_train:{} shape:{}".format(y_train, y_train.shape))
print("y_test:{} shape:{}".format(y_test, y_test.shape))


---sklearn---
X_train:[[5.80000000e+01 6.38921076e-01]
 [4.30000000e+01 4.70132189e-01]
 [6.80000000e+01 6.22846096e-01]
 [4.70000000e+01 5.51816259e-01]
 [3.20000000e+01 6.80055569e-01]
 [3.60000000e+01 1.16201910e-01]
 [8.40000000e+01 8.78193471e-01]
 [4.20000000e+01 3.65496106e-01]
 [7.00000000e+00 9.95690891e-02]
 [5.40000000e+01 1.41263905e-01]
 [8.00000000e+01 6.18808565e-01]
 [7.60000000e+01 5.37022521e-01]
 [2.10000000e+01 7.79051020e-01]
 [5.80000000e+01 1.02863359e-01]
 [2.90000000e+01 6.75439081e-01]
 [8.10000000e+01 8.42342080e-01]
 [4.70000000e+01 2.16822138e-01]
 [5.40000000e+01 6.59668412e-01]
 [5.90000000e+01 7.16074531e-01]
 [1.00000000e+00 3.20997241e-01]
 [4.80000000e+01 6.62526867e-01]
 [5.00000000e+00 4.74867515e-01]
 [1.00000000e+01 8.96038388e-01]
 [6.60000000e+01 3.96098275e-01]
 [6.80000000e+01 6.73659631e-01]
 [6.60000000e+01 6.88661183e-01]
 [8.80000000e+01 9.18235466e-01]
 [3.30000000e+01 7.56106694e-01]
 [1.50000000e+01 2.88476437e-01]
 [1.00000000e+01 9.71

In [37]:
# Split the same X, y with the scratch function (default test_size=0.2).
# The *_scr names keep the scikit-learn results above available for comparison.
X_train_scr, X_test_scr, y_train_scr, y_test_scr = scratch_train_test_split(X, y)

# Print every part together with its shape, in the same layout as the sklearn cell.
print("---scratch---")
print("X_train_scr:{} shape:{}".format(X_train_scr, X_train_scr.shape))
print("X_test_scr:{} shape:{}".format(X_test_scr, X_test_scr.shape))
print("y_train_scr:{} shape:{}".format(y_train_scr, y_train_scr.shape))
print("y_test_scr:{} shape:{}".format(y_test_scr, y_test_scr.shape))


---scratch---
X_train_scr:[[5.80000000e+01 6.38921076e-01]
 [4.30000000e+01 4.70132189e-01]
 [6.80000000e+01 6.22846096e-01]
 [4.70000000e+01 5.51816259e-01]
 [3.20000000e+01 6.80055569e-01]
 [3.60000000e+01 1.16201910e-01]
 [8.40000000e+01 8.78193471e-01]
 [4.20000000e+01 3.65496106e-01]
 [7.00000000e+00 9.95690891e-02]
 [5.40000000e+01 1.41263905e-01]
 [8.00000000e+01 6.18808565e-01]
 [7.60000000e+01 5.37022521e-01]
 [2.10000000e+01 7.79051020e-01]
 [5.80000000e+01 1.02863359e-01]
 [2.90000000e+01 6.75439081e-01]
 [8.10000000e+01 8.42342080e-01]
 [4.70000000e+01 2.16822138e-01]
 [5.40000000e+01 6.59668412e-01]
 [5.90000000e+01 7.16074531e-01]
 [1.00000000e+00 3.20997241e-01]
 [4.80000000e+01 6.62526867e-01]
 [5.00000000e+00 4.74867515e-01]
 [1.00000000e+01 8.96038388e-01]
 [6.60000000e+01 3.96098275e-01]
 [6.80000000e+01 6.73659631e-01]
 [6.60000000e+01 6.88661183e-01]
 [8.80000000e+01 9.18235466e-01]
 [3.30000000e+01 7.56106694e-01]
 [1.50000000e+01 2.88476437e-01]
 [1.00000000e+01 

## 分割自体はしっかりできたのだが、個々のarrayの値がscikit-learnの結果と異なる現象が起きた。seed値の問題と思われるが、考えられる箇所を修正しても正しくならないため、いったん提出し、メンターからアドバイスをもらう。

## 【問題2】 分類問題を解くコードの作成
上記3種類の手法で3種類のデータセットを学習・推定するコードを作成してください。

In [38]:
# Iris data
from sklearn.datasets import load_iris

iris_data = load_iris()
# Keep only the rows from index 50 onward.
# NOTE(review): presumably this leaves two classes (the confusion matrices
# printed below are 2x2) — confirm against the dataset description.
X = iris_data.data[50:]
y = iris_data.target[50:]

In [39]:
# Train three classifiers on the current X, y and print their metrics.
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
# NOTE(review): confusion_matrix is imported twice (here and on the next line).
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Split with the scratch function
X_train, X_test, y_train, y_test = scratch_train_test_split(X, y)

# Logistic regression (SGD with the log loss)
# NOTE(review): newer scikit-learn releases reportedly spell this loss
# "log_loss" — confirm against the installed version.
lr = SGDClassifier(loss="log")
lr.fit(X_train, y_train)
lr_result = lr.predict(X_test)

# SVM
svm = SVC()
svm.fit(X_train, y_train)
svm_result = svm.predict(X_test)

# Decision tree
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
dt_result = dt.predict(X_test)

# Map a display name (logistic regression / SVM / decision tree) to each
# model's predictions on the test set.
model_dic = {"ロジスティック回帰": lr_result, "SVM": svm_result, "決定木": dt_result}

# Print confusion matrix, accuracy, precision, recall and F1 for each model.
# The metric functions are called with their default arguments.
for model_name, model_result in model_dic.items():
    print("------"+model_name+"------")
    print("混同行列: {}\n accuracy: {:.2f}\n precision:{:.2f}\n recall:{:.2f}\n f1:{:.2f}"
          .format(confusion_matrix(y_test, model_result),
                  accuracy_score(y_test, model_result),
                  precision_score(y_test, model_result),
                  recall_score(y_test, model_result),
                  f1_score(y_test, model_result),
                 ))
    print()

------ロジスティック回帰------
混同行列: [[ 0 10]
 [ 0 10]]
 accuracy: 0.50
 precision:0.00
 recall:0.00
 f1:0.00

------SVM------
混同行列: [[ 9  1]
 [ 0 10]]
 accuracy: 0.95
 precision:1.00
 recall:0.90
 f1:0.95

------決定木------
混同行列: [[ 8  2]
 [ 0 10]]
 accuracy: 0.90
 precision:1.00
 recall:0.80
 f1:0.89



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


## 正答率が低いのは問題1の関数が原因か……

In [40]:
# Simple dataset 1: two Gaussian clusters labelled +1 and -1
import numpy as np

np.random.seed(seed=0)  # fix the global RNG so the dataset is reproducible
n_samples = 500
n_per_class = n_samples // 2
cov = [[1.0, 0.8], [0.8, 1.0]]  # covariance shared by both clusters

# Draw each class around its own mean. The order of these RNG calls
# determines the generated values, so it is kept as it was.
f0 = np.random.multivariate_normal([-1, 2], cov, n_per_class)
f1 = np.random.multivariate_normal([2, -1], cov, n_per_class)
X = np.concatenate((f0, f1))

# Labels: +1 for the first cluster, -1 for the second.
# The builtin int is used because the np.int alias was removed in NumPy 1.24.
y = np.concatenate((np.ones(n_per_class), -np.ones(n_per_class))).astype(int)

# Shuffle samples and labels with one shared index permutation so that
# every row of X stays paired with its label.
random_index = np.random.permutation(np.arange(n_samples))
X = X[random_index]
y = y[random_index]

# Split with the scratch function
X_train, X_test, y_train, y_test = scratch_train_test_split(X, y)

# Print the shape of each part to check the split sizes.
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(400, 2)
(100, 2)
(400,)
(100,)


## 分割までは問題なし

In [41]:
# Train the same three classifiers on the current train/test split.

# Logistic regression (SGD with the log loss)
# NOTE(review): newer scikit-learn releases reportedly spell this loss
# "log_loss" — confirm against the installed version.
lr = SGDClassifier(loss="log")
lr.fit(X_train, y_train)
lr_result = lr.predict(X_test)

# SVM
svm = SVC()
svm.fit(X_train, y_train)
svm_result = svm.predict(X_test)

# Decision tree
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
dt_result = dt.predict(X_test)

# Map a display name (logistic regression / SVM / decision tree) to each
# model's predictions on the test set.
model_dic = {"ロジスティック回帰": lr_result, "SVM": svm_result, "決定木": dt_result}

# Print confusion matrix, accuracy, precision, recall and F1 for each model.
# The metric functions are called with their default arguments.
for model_name, model_result in model_dic.items():
    print("------"+model_name+"------")
    print("混同行列: {}\n accuracy: {:.2f}\n precision:{:.2f}\n recall:{:.2f}\n f1:{:.2f}"
          .format(confusion_matrix(y_test, model_result),
                  accuracy_score(y_test, model_result),
                  precision_score(y_test, model_result),
                  recall_score(y_test, model_result),
                  f1_score(y_test, model_result),
                 ))
    print()

------ロジスティック回帰------
混同行列: [[40  0]
 [ 0 60]]
 accuracy: 1.00
 precision:1.00
 recall:1.00
 f1:1.00

------SVM------
混同行列: [[40  0]
 [ 0 60]]
 accuracy: 1.00
 precision:1.00
 recall:1.00
 f1:1.00

------決定木------
混同行列: [[39  1]
 [ 0 60]]
 accuracy: 0.99
 precision:0.98
 recall:1.00
 f1:0.99





In [42]:
# Simple dataset 2: 40 hard-coded 2-feature samples
X = np.array([[-0.44699 , -2.8073  ],[-1.4621  , -2.4586  ],
       [ 0.10645 ,  1.9242  ],[-3.5944  , -4.0112  ],
       [-0.9888  ,  4.5718  ],[-3.1625  , -3.9606  ],
       [ 0.56421 ,  0.72888 ],[-0.60216 ,  8.4636  ],
       [-0.61251 , -0.75345 ],[-0.73535 , -2.2718  ],
       [-0.80647 , -2.2135  ],[ 0.86291 ,  2.3946  ],
       [-3.1108  ,  0.15394 ],[-2.9362  ,  2.5462  ],
       [-0.57242 , -2.9915  ],[ 1.4771  ,  3.4896  ],
       [ 0.58619 ,  0.37158 ],[ 0.6017  ,  4.3439  ],
       [-2.1086  ,  8.3428  ],[-4.1013  , -4.353   ],
       [-1.9948  , -1.3927  ],[ 0.35084 , -0.031994],
       [ 0.96765 ,  7.8929  ],[-1.281   , 15.6824  ],
       [ 0.96765 , 10.083   ],[ 1.3763  ,  1.3347  ],
       [-2.234   , -2.5323  ],[-2.9452  , -1.8219  ],
       [ 0.14654 , -0.28733 ],[ 0.5461  ,  5.8245  ],
       [-0.65259 ,  9.3444  ],[ 0.59912 ,  5.3524  ],
       [ 0.50214 , -0.31818 ],[-3.0603  , -3.6461  ],
       [-6.6797  ,  0.67661 ],[-2.353   , -0.72261 ],
       [ 1.1319  ,  2.4023  ],[-0.12243 ,  9.0162  ],
       [-2.5677  , 13.1779  ],[ 0.057313,  5.4681  ]])
# Labels: the first 20 samples are class 0, the last 20 are class 1.
y = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

# Split with the scratch function
X_train, X_test, y_train, y_test = scratch_train_test_split(X, y)

# Print the shape of each part to check the split sizes.
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(32, 2)
(8, 2)
(32,)
(8,)


In [43]:
# Train the same three classifiers on the current train/test split.

# Logistic regression (SGD with the log loss)
# NOTE(review): newer scikit-learn releases reportedly spell this loss
# "log_loss" — confirm against the installed version.
lr = SGDClassifier(loss="log")
lr.fit(X_train, y_train)
lr_result = lr.predict(X_test)

# SVM
svm = SVC()
svm.fit(X_train, y_train)
svm_result = svm.predict(X_test)

# Decision tree
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
dt_result = dt.predict(X_test)

# Map a display name (logistic regression / SVM / decision tree) to each
# model's predictions on the test set.
model_dic = {"ロジスティック回帰": lr_result, "SVM": svm_result, "決定木": dt_result}

# Print confusion matrix, accuracy, precision, recall and F1 for each model.
# The metric functions are called with their default arguments.
for model_name, model_result in model_dic.items():
    print("------"+model_name+"------")
    print("混同行列: {}\n accuracy: {:.2f}\n precision:{:.2f}\n recall:{:.2f}\n f1:{:.2f}"
          .format(confusion_matrix(y_test, model_result),
                  accuracy_score(y_test, model_result),
                  precision_score(y_test, model_result),
                  recall_score(y_test, model_result),
                  f1_score(y_test, model_result),
                 ))
    print()

------ロジスティック回帰------
混同行列: [[1 3]
 [2 2]]
 accuracy: 0.38
 precision:0.40
 recall:0.50
 f1:0.44

------SVM------
混同行列: [[2 2]
 [2 2]]
 accuracy: 0.50
 precision:0.50
 recall:0.50
 f1:0.50

------決定木------
混同行列: [[3 1]
 [1 3]]
 accuracy: 0.75
 precision:0.75
 recall:0.75
 f1:0.75





## ひとまず全データセットで動くには動いたが、各指標値が低いので、分割時（スクラッチ関数内）に何か原因がありそう。

## 【問題3】 回帰問題を解くコードの作成
線形回帰でHouse Pricesデータセットを学習・推定するコードを作成してください。

In [44]:
# Load the training CSV and use its "Id" column as the row index.
df = pd.read_csv("../Week4/train.csv")
df = df.set_index("Id")
df.head()  # preview the first rows

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [45]:
# Use two feature columns for X and SalePrice for y
X = df.loc[:, ["GrLivArea", "YearBuilt"]]
y = df.loc[:, ["SalePrice"]]  # list selector keeps y a one-column DataFrame (2-D)

# Split with the scratch function
X_train, X_test, y_train, y_test = scratch_train_test_split(X, y)

# Print the shape of each part to check the split sizes.
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(1168, 2)
(292, 2)
(1168, 1)
(292, 1)


In [46]:
# Standardization
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

scaler.fit(X_train)   # compute the mean and standard deviation used later for scaling (from the training data only)
X_train_scaler = scaler.transform(X_train)   # standardize the training data by centering and scaling
X_test_scaler = scaler.transform(X_test)     # standardize the test data with the same fitted scaler



In [47]:
# Linear regression
from sklearn.linear_model import LinearRegression

# Fit on the standardized training features and predict on the standardized test features.
# Note: this rebinds the name `lr`, which earlier cells used for the SGD classifier.
lr = LinearRegression()
lr.fit(X_train_scaler, y_train)
lr_pred = lr.predict(X_test_scaler)

In [48]:
# Evaluation
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

# Compare the predictions with the held-out targets.
mse = mean_squared_error(y_test, lr_pred)
mae = mean_absolute_error(y_test, lr_pred)
r2s = r2_score(y_test, lr_pred)

# Print MSE, MAE and R2 (coefficient of determination) with two decimals.
print("MSE:{:.2f}\nMAE:{:.2f}\nR2(決定係数):{:.2f}".format(mse, mae, r2s))

MSE:2942066921.67
MAE:32711.08
R2(決定係数):0.57


## 決定係数が0となり、回帰では説明がつかないという結果になった。
やはりスクラッチ関数内に問題があると思われる。

## メンターからのアドバイスで修正、分割がしっかり行われたため各指標値が全て良くなった。
Xとyで同インデックスのものをtrain・testに分割するということがコードに組み込まれていなかった（私も作成時に気づけていなかった）

```python
np.random.seed(0)
X_shuffle = np.random.permutation(X)
np.random.seed(0)
y_shuffle = np.random.permutation(y)
```

## 参考
permutationについて　http://kaisk.hatenadiary.com/entry/2014/10/30/170522

## 作業時間
6h
修正で1h