In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

## 【問題1】train_test_splitのスクラッチ
スクラッチの練習として、scikit-learnのtrain_test_splitを自作してみます。

In [2]:
def scratch_train_test_split(X, y, train_size=0.8, random_state=None, shuffle=False, stratify=None):
    """
    検証用データを分割する。

    Parameters
    ----------
    X : 次の形のndarray, shape (n_samples, n_features)
      学習データ
    y : 次の形のndarray, shape (n_samples, )
      正解値
    train_size : float (0<train_size<1)
      何割をtrainとするか指定

    Returns
    ----------
    X_train : 次の形のndarray, shape (n_samples, n_features)
      学習データ
    X_test : 次の形のndarray, shape (n_samples, n_features)
      検証データ
    y_train : 次の形のndarray, shape (n_samples, )
      学習データの正解値
    y_test : 次の形のndarray, shape (n_samples, )
      検証データの正解値
    """
    #ここにコードを書く
    import numpy as np
    import pandas as pd
    
    # 乱数シードの作成
    np.random.seed(random_state)
    
    # Xとyが列方向に結合できない時、Errorと出力してNoneを返す
    if X.shape[0] != len(y):
        print('Error')
        return None
    
    # X, yの結合
    data = np.concatenate([X, y.reshape((-1, 1))], axis=1)
    # 返り値の作成
    row, col = X.shape
    X_train = np.empty((0, col), float)
    y_train = np.empty((0,), float)
    X_test = np.empty((0, col), float)
    y_test = np.empty((0,), float)
    
    # 変数shuffleがTrueのとき、dataをシャッフルする
    if shuffle:
        np.random.shuffle(data)
    
    # stratifyに値が与えられたとき、それぞれの値ごとに上からtrain, testに分割
    if stratify is not None:
        unique_value, unique_count = np.unique(stratify, return_counts=True)
        odd_count = 0
        for value, count in zip(unique_value, unique_count):
            tmp_data = data[data[:,-1]==value]
            if count%2 == 1:
                odd_count += 1
                if odd_count%2 == 1:
                    size = int(count*train_size) + 1
                else:
                    size = int(count*train_size)
            else:
                size = int(count*train_size)
            
            X_train = np.concatenate([X_train, tmp_data[:size, :col]], axis=0)
            y_train = np.concatenate([y_train, tmp_data[:size, -1]], axis=0)
            X_test = np.concatenate([X_test, tmp_data[size:, :col]], axis=0)
            y_test = np.concatenate([y_test, tmp_data[size:, -1]], axis=0)
            
    # stratify==Noneのとき、dataを上からtrain, testに分割
    else:
        size = int(row*train_size)
        X_train = data[:size, :-1]
        y_train = data[:size, -1]
        X_test = data[size:, :-1]
        y_test = data[size:, -1]
        
    return X_train, X_test, y_train, y_test

## 【問題2】分類問題を解くコードの作成
ロジスティック回帰、SVM、決定木の3種類の手法でデータセットを学習・推定するコードを作成してください。

#### LogisticRegression

In [3]:
from sklearn.datasets import load_iris
data = pd.DataFrame(load_iris().data, 
        columns=['sepal_length', 'sepal_width', 'petal_length', 'petal_width'])
target = pd.DataFrame(load_iris().target, columns=['Species'])
df = pd.concat([data, target], axis=1)
df = df[df['Species'].isin([1, 2])]

In [4]:
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,Species
50,7.0,3.2,4.7,1.4,1
51,6.4,3.2,4.5,1.5,1
52,6.9,3.1,4.9,1.5,1
53,5.5,2.3,4.0,1.3,1
54,6.5,2.8,4.6,1.5,1


In [5]:
X = df.drop(columns=['Species']).values
y = df['Species'].values

In [6]:
score = []
for _ in range(100):
    X_train, X_test, y_train, y_test = scratch_train_test_split(X, y, shuffle=True, train_size=0.75, stratify=y)
    sgd = SGDClassifier(loss='log')
    sgd.fit(X_train, y_train)
    score.append(sgd.score(X_test, y_test))
np.mean(score)

0.8546153846153848

In [7]:
score = []
for _ in range(100):
    X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, train_size=0.75, stratify=y)
    sgd = SGDClassifier(loss='log')
    sgd.fit(X_train, y_train)
    score.append(sgd.score(X_test, y_test))
np.mean(score)

0.8375999999999999

#### SVM

In [8]:
np.random.seed(seed=0)
n_samples = 500
f0 = [-1, 2]
f1 = [2, -1]
cov = [[1.0,0.8], [0.8, 1.0]]

f0 = np.random.multivariate_normal(f0, cov, int(n_samples/2))
f1 = np.random.multivariate_normal(f1, cov, int(n_samples/2))

X = np.concatenate((f0, f1))
y = np.concatenate((np.ones((int(n_samples/2))), np.ones((int(n_samples/2))) *(-1))).astype(np.int)

random_index = np.random.permutation(np.arange(n_samples))
X = X[random_index]
y = y[random_index]

In [9]:
score = []
for _ in range(3):
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, shuffle=True, stratify=y)
    svc = SVC(gamma='scale')
    svc.fit(X_train, y_train)
    score.append(svc.score(X_test, y_test))
np.mean(score)

1.0

In [10]:
score = []
for _ in range(3):
    X_train, X_test, y_train, y_test = scratch_train_test_split(X, y, train_size=0.75, shuffle=True, stratify=y)
    svc = SVC(gamma='scale')
    svc.fit(X_train, y_train)
    score.append(svc.score(X_test, y_test))
np.mean(score)

1.0

#### 決定木

In [11]:
X = np.array([[-0.44699 , -2.8073  ],[-1.4621  , -2.4586  ],
       [ 0.10645 ,  1.9242  ],[-3.5944  , -4.0112  ],
       [-0.9888  ,  4.5718  ],[-3.1625  , -3.9606  ],
       [ 0.56421 ,  0.72888 ],[-0.60216 ,  8.4636  ],
       [-0.61251 , -0.75345 ],[-0.73535 , -2.2718  ],
       [-0.80647 , -2.2135  ],[ 0.86291 ,  2.3946  ],
       [-3.1108  ,  0.15394 ],[-2.9362  ,  2.5462  ],
       [-0.57242 , -2.9915  ],[ 1.4771  ,  3.4896  ],
       [ 0.58619 ,  0.37158 ],[ 0.6017  ,  4.3439  ],
       [-2.1086  ,  8.3428  ],[-4.1013  , -4.353   ],
       [-1.9948  , -1.3927  ],[ 0.35084 , -0.031994],
       [ 0.96765 ,  7.8929  ],[-1.281   , 15.6824  ],
       [ 0.96765 , 10.083   ],[ 1.3763  ,  1.3347  ],
       [-2.234   , -2.5323  ],[-2.9452  , -1.8219  ],
       [ 0.14654 , -0.28733 ],[ 0.5461  ,  5.8245  ],
       [-0.65259 ,  9.3444  ],[ 0.59912 ,  5.3524  ],
       [ 0.50214 , -0.31818 ],[-3.0603  , -3.6461  ],
       [-6.6797  ,  0.67661 ],[-2.353   , -0.72261 ],
       [ 1.1319  ,  2.4023  ],[-0.12243 ,  9.0162  ],
       [-2.5677  , 13.1779  ],[ 0.057313,  5.4681  ]])
y = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [12]:
score = []
for _ in range(100):
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, shuffle=True, stratify=y)
    tree = DecisionTreeClassifier()
    tree.fit(X_train, y_train)
    score.append(tree.score(X_test, y_test))
np.mean(score)

0.6389999999999999

In [13]:
score = []
for _ in range(100):
    X_train, X_test, y_train, y_test = scratch_train_test_split(X, y, train_size=0.75, shuffle=True, stratify=y)
    tree = DecisionTreeClassifier()
    tree.fit(X_train, y_train)
    score.append(tree.score(X_test, y_test))
np.mean(score)

0.628

## 【問題3】回帰問題を解くコードの作成
線形回帰でHouse Pricesデータセットを学習・推定するコードを作成してください。

In [14]:
train = pd.read_csv('../week3/train.csv')

In [15]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [16]:
X = train[['GrLivArea', 'YearBuilt']].values
y = train['SalePrice'].values

In [21]:
score = []
for _ in range(10):
    X_train, X_test, y_train, y_test = scratch_train_test_split(X, y, shuffle=True, train_size=0.75)
    sgd = SGDClassifier(loss='log')
    sgd.fit(X_train, y_train)
    score.append(sgd.score(X_test, y_test))
np.mean(score)

0.002191780821917808

In [22]:
score = []
for _ in range(10):
    X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, train_size=0.75)
    sgd = SGDClassifier(loss='log')
    sgd.fit(X_train, y_train)
    score.append(sgd.score(X_test, y_test))
np.mean(score)

0.0027397260273972603