In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_csv('../week3/train.csv')
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
X = df[['GrLivArea', 'YearBuilt']].values
y = df[['SalePrice']].values
xscaler = StandardScaler()
X_scaled = xscaler.fit_transform(X)
yscaler = StandardScaler()
y_scaled = yscaler.fit_transform(y).ravel()
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_scaled, test_size=0.2)

## 【問題1】ブレンディングのスクラッチ実装
ブレンディングをスクラッチ実装し、単一モデルより精度があがる例を 最低3つ 示してください。  
精度があがるとは、検証用データに対する平均二乗誤差（MSE）が小さくなることを指します。

In [4]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

In [5]:
class Blending():
    """
    各モデルの出力の平均を返す
    """
    def __init__(self, *model):
        self.model_list = [*model]
        self.num_model = len(model)
        
        
    def fit(self, X_train, y_train):
        for i in range(self.num_model):
            self.model_list[i].fit(X_train, y_train)
        
        
    def predict(self, X_test, weight=None):
        y_pred = np.zeros((len(X_test), self.num_model))
        if not weight:
            weight = np.array([1/self.num_model]*self.num_model)
        else:
            weight = np.array(weight)
        for i in range(self.num_model):
            y_pred[:,i] = self.model_list[i].predict(X_test)
        return np.sum(y_pred*weight, axis=1)

In [6]:
linear = LinearRegression()
linear.fit(X_train, y_train)
y_pred = linear.predict(X_test)
print(f'linear regression: {mse(y_pred, y_test):.4f}')

svr = SVR(gamma='auto')
svr.fit(X_train, y_train)
y_pred = svr.predict(X_test)
print(f'SVR: {mse(y_pred, y_test):.4f}')

tree = DecisionTreeRegressor(max_depth=5)
tree.fit(X_train, y_train)
y_pred = tree.predict(X_test)
print(f'decisiontreeregressor: {mse(y_pred, y_test):.4f}')

linear regression: 0.3650
SVR: 0.3252
decisiontreeregressor: 0.3206


In [7]:
linear = LinearRegression()
svr = SVR(gamma='auto')
tree = DecisionTreeRegressor(max_depth=5)

b = Blending(linear, svr, tree)
b.fit(X_train, y_train)
y_pred = b.predict(X_test, weight=[1/3, 1/3, 1/3])
mse(y_pred, y_test)

0.30574468789172277

In [8]:
svr = SVR(gamma='auto')
tree = DecisionTreeRegressor(max_depth=5)

b = Blending(svr, tree)
b.fit(X_train, y_train)
y_pred = b.predict(X_test, weight=[2/3, 1/3])
mse(y_pred, y_test)

0.30170700842677417

In [9]:
linear = LinearRegression()
svr = SVR(C=0.5, gamma='auto')
tree = DecisionTreeRegressor(max_depth=5)

b = Blending(linear, svr, tree)
b.fit(X_train, y_train)
y_pred = b.predict(X_test, weight=[1/5, 2/5, 2/5])
mse(y_pred, y_test)

0.3052890014081833

In [10]:
linear = LinearRegression()
tree = DecisionTreeRegressor(max_depth=5)

b = Blending(tree, linear)
b.fit(X_train, y_train)
y_pred = b.predict(X_test, weight=[1/6, 5/6])
mse(y_pred, y_test)

0.33709299418406935

## 【問題2】バギングのスクラッチ実装
バギングをスックラッチ実装し、単一モデルより精度が上がる例を最低一つ示してください。

In [11]:
class Bagging():
    def __init__(self, model, n=10, bs=0.8):
        self.model_list = [model]*n
        self.n = n
        self.bs = bs
        
        
    def fit(self, X, y):
        for i in range(self.n):
            X_train, _, y_train, _ = train_test_split(X, y, train_size=self.bs, shuffle=True)
            self.model_list[i].fit(X_train, y_train)

            
    def predict(self, X_test):
        y = 0
        for i in range(self.n):
            y += self.model_list[i].predict(X_test)
        return y / self.n

In [12]:
tree = DecisionTreeRegressor(max_depth=5)
b = Bagging(tree, n=1000, bs=0.8)
b.fit(X_train, y_train)
y_pred = b.predict(X_test)
mse(y_pred, y_test)

0.30048340985757094

##  【問題3】スタッキングのスクラッチ実装
スタッキングをスクラッチ実装し、単一モデルより精度が上がる例を最低一つ示してください。

In [13]:
from sklearn.model_selection import KFold

class Stacking():
    def __init__(self, *model, n_splits=2, last=None):
        """
        行方向は使用するモデルの種類、列方向はfitさせるデータの種類が異なる
        model_listの配列を使用する
        """
        self.model_list = np.repeat(np.array(model).reshape(-1,1), n_splits, axis=1)
        self.n_model = len(model)
        self.n_splits = n_splits
        self.last = last
        self.blend = None
        
        
    def fit(self, X, y):
        kf = KFold(n_splits=self.n_splits, shuffle=True)
        self.blend = np.zeros((len(X), self.n_model))
        for i, idx in enumerate(kf.split(X, y)):
            X_train, y_train = X[idx[0]], y[idx[0]]
            X_val, y_val = X[idx[1]], y[idx[1]]
            for j in range(self.n_model):
                self.model_list[j,i].fit(X_train, y_train)
                self.blend[idx[1],j] = self.model_list[j,i].predict(X_val)     
        
        self.last.fit(self.blend, y)
        
        
    def predict(self, X_test):
        y_blend = np.zeros((len(X_test), self.n_model))
        for i in range(self.n_model):
            tmp = np.zeros((len(X_test), self.n_splits))
            for j in range(self.n_splits):
                tmp[:,j] = self.model_list[i,j].predict(X_test)
            y_blend[:,i] = np.mean(tmp, axis=1)
            
        return self.last.predict(y_blend)

In [14]:
linear = LinearRegression()
tree = DecisionTreeRegressor()
svr = SVR(gamma='auto')

stack = Stacking(svr, tree, n_splits=5, last=linear)
stack.fit(X_train, y_train)

In [15]:
y_pred = stack.predict(X_test)
mse(y_pred, y_test)

0.2935390975071535