[課題のURL](https://diver.diveintocode.jp/curriculums/1643)

# Sprint機械学習スクラッチ入門

# 【問題1】train_test_splitのスクラッチ

In [1]:
import math
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.animation as animation

from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [2]:
X = np.arange(10).reshape(5, 2)
y = np.arange(5).reshape(5, 1)

In [3]:
X.shape, y.shape

((5, 2), (5, 1))

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print(X_train)
print(X_test)
print(y_train)
print(y_test)

(4, 2) (1, 2) (4, 1) (1, 1)
[[8 9]
 [4 5]
 [0 1]
 [6 7]]
[[2 3]]
[[4]
 [2]
 [0]
 [3]]
[[1]]


In [6]:
def scratch_train_test_split(X, y, train_size=0.8,):
    """
    検証データを分割する。

    Parameters
    ----------
    X : 次の形のndarray, shape (n_samples, n_features)
      訓練データ
    y : 次の形のndarray, shape (n_samples, )
      正解値
    train_size : float (0<train_size<1)
      何割をtrainとするか指定

    Returns
    ----------
    X_train : 次の形のndarray, shape (n_samples, n_features)
      訓練データ
    X_test : 次の形のndarray, shape (n_samples, n_features)
      検証データ
    y_train : 次の形のndarray, shape (n_samples, )
      訓練データの正解値
    y_test : 次の形のndarray, shape (n_samples, )
      検証データの正解値
    """
    # エラー判定用の変数
    judge = 1
    
    # train_sizeの確認
    if (train_size > 1) or (train_size < 0):
        print("should be either positive and smaller than the number of samples 10 or a float in the (0, 1) range")
        judge = 0
        
    # データのサイズの確認
    if X.shape[0] != y.shape[0]:
        print("Found input variables ")
        judge = 0
    
    if judge == 1:
        n_row = X.shape[0]

        # 分割後のデータ数を求める
        n_train_size = math.ceil(n_row * train_size)
        n_test_size = n_row - n_train_size

        # Xとyを結合し各行をランダムに並び替える
        Xy = np.c_[X, y]
        np.random.seed(seed=0)
        np.random.shuffle(Xy)

        # Xとyを分離する
        X = Xy[:, 0:(-1)]
        y = Xy[:,-1]

        # 任意のデータ数に分ける
        X_test = X[:n_test_size]
        X_train = X[n_test_size:(n_test_size + n_train_size)]
        y_test = y[:n_test_size]
        y_train = y[n_test_size:(n_test_size + n_train_size)]
    
    return X_train, X_test, y_train.reshape(-1,1), y_test.reshape(-1,1)


In [7]:
X_train, X_test, y_train, y_test = scratch_train_test_split(X, y, train_size=0.8)

In [8]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print(X_train)
print(X_test)
print(y_train)
print(y_test)

(4, 2) (1, 2) (4, 1) (1, 1)
[[0 1]
 [2 3]
 [6 7]
 [8 9]]
[[4 5]]
[[0]
 [1]
 [3]
 [4]]
[[2]]


# 【問題2】 分類問題を解くコードの作成

下記, 3種類の手法で3種類のデータセットを学習・推定するコードを作成してください。

分類は3種類の手法をスクラッチします。

- ロジスティック回帰  
- SVM  
- 決定木  

データセットは3種類用意します。
- irisデータセット  
-- 2値分類としたいため、2つの目的変数(virgicolorとvirginica)のみ利用します。特徴量は4種類全て使います。
- シンプルデータセット1
- シンプルデータセット2

## 1.irisデータセット

In [9]:
# データの読み込み
data_iris = load_iris()

#X = pd.DataFrame(data_iris.data, columns=["sepal_length","sepal_width","petal_length","petal_width"])
X = pd.DataFrame(data_iris.data, columns=["sepal_length","sepal_width","petal_length","petal_width"])
y = pd.DataFrame(data_iris.target, columns=["Species"])

df = pd.concat([X, y], axis=1)

In [10]:
# データフレームから"virgicolor"と"virginica"を抜き出してください。
# 参考情報:setosa:0, virgiclor:1, virginica:2
# virgiclor → 0, virginica → 1に変更する

df_temp1 = df[ df["Species"] == 1 ]
df_temp1 = df_temp1.replace({'Species': {1: 0}})

df_temp2 = df[ df["Species"] == 2 ]
df_temp2 = df_temp2.replace({'Species': {2: 1}})
df2 = pd.concat([df_temp1, df_temp2], axis=0)

In [11]:
display(df2.head(3))
display(df2.tail(3))

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,Species
50,7.0,3.2,4.7,1.4,0
51,6.4,3.2,4.5,1.5,0
52,6.9,3.1,4.9,1.5,0


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,Species
147,6.5,3.0,5.2,2.0,1
148,6.2,3.4,5.4,2.3,1
149,5.9,3.0,5.1,1.8,1


In [12]:
# 訓練データの分割
# 特徴量（説明変数）をX、正解（目的変数）をyというndarrayに格納
X_ndarray = df2[["sepal_length", "sepal_width", "petal_length", "petal_width"]].values
y_ndarray = df2[["Species"]].values

# X_train, X_test, y_train, y_test = train_test_split(X_ndarray, y_ndarray, test_size=0.25, random_state=0)
X_train, X_test, y_train, y_test = scratch_train_test_split(X_ndarray, y_ndarray, train_size=0.75)

In [13]:
# 前処理・標準化
scaler = StandardScaler()
scaler.fit(X_train)

X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)

### ロジスティック回帰

In [14]:
# ロジスティック回帰
clf = linear_model.SGDClassifier(loss="log").fit(X_train_std, y_train.reshape(75,))
y_pred = clf.predict(X_test_std)

print("Accuracy（正解率）", accuracy_score(y_test, y_pred))
print("Precision（適合率）", precision_score(y_test, y_pred))
print("Recall（再現率）", recall_score(y_test, y_pred))
print("F値",f1_score(y_test, y_pred))

Accuracy（正解率） 0.96
Precision（適合率） 0.9230769230769231
Recall（再現率） 1.0
F値 0.9600000000000001


### SVM

In [15]:
# SVM
from sklearn.svm import SVC

clf = SVC().fit(X_train_std, y_train.reshape(75,))

y_pred = clf.predict(X_test_std)

print("Accuracy（正解率）", accuracy_score(y_test, y_pred))
print("Precision（適合率）", precision_score(y_test, y_pred))
print("Recall（再現率）", recall_score(y_test, y_pred))
print("F値",f1_score(y_test, y_pred))

Accuracy（正解率） 0.92
Precision（適合率） 0.8571428571428571
Recall（再現率） 1.0
F値 0.923076923076923


### 決定木

In [17]:
from sklearn import tree
clf = tree.DecisionTreeClassifier(max_depth=1)
clf = clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print("Accuracy（正解率）", accuracy_score(y_test, y_pred))
print("Precision（適合率）", precision_score(y_test, y_pred))
print("Recall（再現率）", recall_score(y_test, y_pred))
print("F値",f1_score(y_test, y_pred))

Accuracy（正解率） 0.84
Precision（適合率） 0.75
Recall（再現率） 1.0
F値 0.8571428571428571


## 2.シンプルデータセット1

In [26]:
import numpy as np
np.random.seed(seed=0)
n_samples = 500
f0 = [-1, 2]
f1 = [2, -1]
cov = [[1.0,0.8], [0.8, 1.0]]
f0 = np.random.multivariate_normal(f0, cov, int(n_samples/2))
f1 = np.random.multivariate_normal(f1, cov, int(n_samples/2))
X = np.concatenate((f0, f1))
y = np.concatenate((np.ones((int(n_samples/2))), np.ones((int(n_samples/2))) *(-1))).astype(np.int)
random_index = np.random.permutation(np.arange(n_samples))
X = X[random_index]
y = y[random_index]

In [27]:
# X_train, X_test, y_train, y_test = train_test_split(X_ndarray, y_ndarray, test_size=0.25, random_state=0)
X_train, X_test, y_train, y_test = scratch_train_test_split(X, y, train_size=0.75)

In [28]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(375, 2) (125, 2) (375, 1) (125, 1)


### ロジスティック回帰

In [29]:
clf = linear_model.SGDClassifier(loss="log").fit(X_train, y_train)
y_pred = clf.predict(X_test)

print("Accuracy（正解率）", accuracy_score(y_test, y_pred))
print("Precision（適合率）", precision_score(y_test, y_pred))
print("Recall（再現率）", recall_score(y_test, y_pred))
print("F値",f1_score(y_test, y_pred))

Accuracy（正解率） 1.0
Precision（適合率） 1.0
Recall（再現率） 1.0
F値 1.0


  y = column_or_1d(y, warn=True)


### SVM

In [31]:
from sklearn.svm import SVC

clf = SVC().fit(X_train, y_train)

y_pred = clf.predict(X_test)

print("Accuracy（正解率）", accuracy_score(y_test, y_pred))
print("Precision（適合率）", precision_score(y_test, y_pred))
print("Recall（再現率）", recall_score(y_test, y_pred))
print("F値",f1_score(y_test, y_pred))

Accuracy（正解率） 1.0
Precision（適合率） 1.0
Recall（再現率） 1.0
F値 1.0


  y = column_or_1d(y, warn=True)


### 決定木

In [33]:
from sklearn import tree
clf = tree.DecisionTreeClassifier(max_depth=1)
clf = clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print("Accuracy（正解率）", accuracy_score(y_test, y_pred))
print("Precision（適合率）", precision_score(y_test, y_pred))
print("Recall（再現率）", recall_score(y_test, y_pred))
print("F値",f1_score(y_test, y_pred))

Accuracy（正解率） 0.96
Precision（適合率） 0.948051948051948
Recall（再現率） 0.9864864864864865
F値 0.9668874172185431


## 3.シンプルデータセット2

In [34]:
X = np.array([[-0.44699 , -2.8073  ],[-1.4621  , -2.4586  ],
       [ 0.10645 ,  1.9242  ],[-3.5944  , -4.0112  ],
       [-0.9888  ,  4.5718  ],[-3.1625  , -3.9606  ],
       [ 0.56421 ,  0.72888 ],[-0.60216 ,  8.4636  ],
       [-0.61251 , -0.75345 ],[-0.73535 , -2.2718  ],
       [-0.80647 , -2.2135  ],[ 0.86291 ,  2.3946  ],
       [-3.1108  ,  0.15394 ],[-2.9362  ,  2.5462  ],
       [-0.57242 , -2.9915  ],[ 1.4771  ,  3.4896  ],
       [ 0.58619 ,  0.37158 ],[ 0.6017  ,  4.3439  ],
       [-2.1086  ,  8.3428  ],[-4.1013  , -4.353   ],
       [-1.9948  , -1.3927  ],[ 0.35084 , -0.031994],
       [ 0.96765 ,  7.8929  ],[-1.281   , 15.6824  ],
       [ 0.96765 , 10.083   ],[ 1.3763  ,  1.3347  ],
       [-2.234   , -2.5323  ],[-2.9452  , -1.8219  ],
       [ 0.14654 , -0.28733 ],[ 0.5461  ,  5.8245  ],
       [-0.65259 ,  9.3444  ],[ 0.59912 ,  5.3524  ],
       [ 0.50214 , -0.31818 ],[-3.0603  , -3.6461  ],
       [-6.6797  ,  0.67661 ],[-2.353   , -0.72261 ],
       [ 1.1319  ,  2.4023  ],[-0.12243 ,  9.0162  ],
       [-2.5677  , 13.1779  ],[ 0.057313,  5.4681  ]])
y = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [35]:
X_train, X_test, y_train, y_test = scratch_train_test_split(X, y, train_size=0.75)

In [36]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(30, 2) (10, 2) (30, 1) (10, 1)


### ロジスティック回帰

In [37]:
clf = linear_model.SGDClassifier(loss="log").fit(X_train, y_train)
y_pred = clf.predict(X_test)

print("Accuracy（正解率）", accuracy_score(y_test, y_pred))
print("Precision（適合率）", precision_score(y_test, y_pred))
print("Recall（再現率）", recall_score(y_test, y_pred))
print("F値",f1_score(y_test, y_pred))

Accuracy（正解率） 0.5
Precision（適合率） 0.5
Recall（再現率） 0.4
F値 0.4444444444444445


  y = column_or_1d(y, warn=True)


### SVM

In [38]:
from sklearn.svm import SVC

clf = SVC().fit(X_train, y_train)

y_pred = clf.predict(X_test)

print("Accuracy（正解率）", accuracy_score(y_test, y_pred))
print("Precision（適合率）", precision_score(y_test, y_pred))
print("Recall（再現率）", recall_score(y_test, y_pred))
print("F値",f1_score(y_test, y_pred))

Accuracy（正解率） 0.6
Precision（適合率） 0.6666666666666666
Recall（再現率） 0.4
F値 0.5


  y = column_or_1d(y, warn=True)


### 決定木

In [39]:
from sklearn import tree
clf = tree.DecisionTreeClassifier(max_depth=1)
clf = clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print("Accuracy（正解率）", accuracy_score(y_test, y_pred))
print("Precision（適合率）", precision_score(y_test, y_pred))
print("Recall（再現率）", recall_score(y_test, y_pred))
print("F値",f1_score(y_test, y_pred))

Accuracy（正解率） 0.6
Precision（適合率） 0.6666666666666666
Recall（再現率） 0.4
F値 0.5


# 【問題3】 回帰問題を解くコードの作成

線形回帰でHouse Pricesデータセットを学習・推定するコードを作成してください。    
線形回帰は勾配降下法を用いて計算するSGDRegressorクラスを利用してください。  

In [40]:
import numpy as np
import pandas as pd
import seaborn as sns
import missingno as msno
import scipy
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

In [41]:
df = pd.read_csv('train.csv', index_col=0)

In [42]:
# GrLivAreaとYearBuiltを抜き出す。
X = df[["GrLivArea", "YearBuilt"]]
y = df["SalePrice"]

In [46]:
# データの前処理
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error

# データを分割
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
X_train, X_test, y_train, y_test = scratch_train_test_split(X, y, train_size=0.75)

In [47]:
# 学習
clf =  SGDRegressor()
clf.fit(X_train, y_train)
        
# 推定
y_pred = clf.predict(X_test)

print("平均二乗誤差（標準偏差、ばらつき、MSE）", mean_squared_error(y_test, y_pred))

平均二乗誤差（標準偏差、ばらつき、MSE） 1.3079014150157336e+29


  y = column_or_1d(y, warn=True)
