# Dataset and Dependencies

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn import linear_model
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

## [Problem 1]

In [2]:
data = pd.read_csv("Housing_train.csv")
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
X = data[["GrLivArea", "YearBuilt"]]
X.head()

Unnamed: 0,GrLivArea,YearBuilt
0,1710,2003
1,1262,1976
2,1786,2001
3,1717,1915
4,2198,2000


In [4]:
y = data["SalePrice"]
y.head()

0    208500
1    181500
2    223500
3    140000
4    250000
Name: SalePrice, dtype: int64

## Blending

In [5]:
(X_train, X_test, y_train, y_test) = train_test_split(X, y, test_size=0.3, random_state=0, shuffle=True)
X_train.head()

Unnamed: 0,GrLivArea,YearBuilt
64,2034,1997
682,1291,1996
960,858,1958
1384,1258,1939
1100,438,1920


In [6]:
LM= linear_model.LinearRegression()
LM.fit(X_train, y_train)
LM_pred = LM.predict(X_test)
print("MSE(Linear Model): ",mean_squared_error(y_test, LM_pred))

scaler = StandardScaler()
X_test_std = scaler.fit_transform(X_test)
X_train_std = scaler.fit_transform(X_train)
svr= SVR(kernel='rbf')
svr.fit(X_train_std, y_train)
svr_pred = svr.predict(X_test_std)
print("MSE(SVR): ",mean_squared_error(y_test, svr_pred))

dTree = DecisionTreeRegressor(max_depth=3)
dTree.fit(X_train, y_train)
dt_pred = dTree.predict(X_test)
print("MSE(Decision Tree Regressor): ",mean_squared_error(y_test, dt_pred))

MSE(Linear Model):  2690647926.377604
MSE(SVR):  7119069689.683064
MSE(Decision Tree Regressor):  2553443455.30812


In [7]:
mix_pred = np.vstack((LM_pred, svr_pred, dt_pred))
print(mix_pred.shape)

mean_pred = LM_pred*0.4 + svr_pred*0.1 + dt_pred*0.5
print("MSE(mean): ",mean_squared_error(y_test, mean_pred))

(3, 438)
MSE(mean):  2463697221.7955885


In [8]:
lass= Lasso(alpha=0.1)
lass.fit(X_train, y_train)
lass_pred = lass.predict(X_test)
print("MSE(Lasso): ",mean_squared_error(y_test, lass_pred))

scaler = StandardScaler()
X_test_std = scaler.fit_transform(X_test)
X_train_std = scaler.fit_transform(X_train)
svr= SVR(kernel='rbf',C=0.5)
svr.fit(X_train_std, y_train)
svr_pred = svr.predict(X_test_std)
print("MSE(SVR): ",mean_squared_error(y_test, svr_pred))

dTree = DecisionTreeRegressor(max_depth=3)
dTree.fit(X_train, y_train)
dt_pred = dTree.predict(X_test)
print("MSE(Decision Tree Regressor): ",mean_squared_error(y_test, dt_pred))

MSE(Lasso):  2690647942.4335785
MSE(SVR):  7128920068.0851145
MSE(Decision Tree Regressor):  2553443455.30812


In [9]:
mix_pred = np.vstack((lass_pred, svr_pred, dt_pred))
print(mix_pred.shape)

mean_pred = lass_pred*0.45 + svr_pred*0.05 + dt_pred*0.5
print("MSE(mean): ",mean_squared_error(y_test, mean_pred))

(3, 438)
MSE(mean):  2407972049.7210937


In [10]:
rid= Ridge(alpha=0.1)
rid.fit(X_train, y_train)
rid_pred = rid.predict(X_test)
print("MSE(Ridge): ",mean_squared_error(y_test, rid_pred))

scaler = StandardScaler()
X_test_std = scaler.fit_transform(X_test)
X_train_std = scaler.fit_transform(X_train)
svr= SVR(kernel='linear', C=100)
svr.fit(X_train_std, y_train)
svr_pred = svr.predict(X_test_std)
print("MSE(SVR): ",mean_squared_error(y_test, svr_pred))

Dtree = DecisionTreeRegressor(max_depth=9)
Dtree.fit(X_train, y_train)
Dtree_pred = Dtree.predict(X_test)
print("MSE(Decision Tree Regressor): ",mean_squared_error(y_test, Dtree_pred))


MSE(Ridge):  2690647942.0877924
MSE(SVR):  3389628361.3601527
MSE(Decision Tree Regressor):  1930281043.8730376


In [11]:
mix_pred = np.vstack((rid_pred, svr_pred, Dtree_pred))
print(mix_pred.shape)

mean_pred = rid_pred*0.25 + svr_pred*0.05 + Dtree_pred*0.7
print("MSE(mean): ",mean_squared_error(y_test, mean_pred))

(3, 438)
MSE(mean):  1834615062.0433598


## [Problem 2] Scratch mounting of bagging

In [12]:
class Bagging():

    def fit(self, models, X, y):
        self.model_list = []
        for j, model in enumerate(models):
            np.random.seed(j)
            self.rand_index = np.random.choice(X.index,X.shape[0], replace=True)
            self.X_rand = X.loc[self.rand_index]
            self.y_rand = y.loc[self.rand_index]
            self.model_list.append(model.fit(self.X_rand, self.y_rand))

    def predict(self, X, y):
        print(X.shape[0])
        print(len(self.model_list))
        self.pred_data = np.zeros((X.shape[0],len(self.model_list)))
        for i, model in enumerate(self.model_list):
            self.pred = model.predict(X)
            self.pred_data[:,i] = self.pred
        self.final_pred = np.mean(self.pred_data, axis=1)
        print("MSE(mean): ",mean_squared_error(y, self.final_pred))
        return self.final_pred

In [13]:
np.random.seed(1)
rand_index = np.random.choice(X_train.index,1022, replace=True)
np.random.seed(1)
rand_index = np.random.choice(X_train.index,X_train.shape[0], replace=True)
X_train1 = X_train.loc[rand_index]
y_train1 = y_train.loc[rand_index]

np.random.seed(2)
rand_index = np.random.choice(X_train.index,X_train.shape[0], replace=True)
X_train2 = X_train.loc[rand_index]
y_train2 = y_train.loc[rand_index]

np.random.seed(3)
rand_index = np.random.choice(X_train.index,X_train.shape[0], replace=True).tolist()
X_train3 = X_train.loc[rand_index]
y_train3 = y_train.loc[rand_index]

In [14]:
Ridge().get_params()

{'alpha': 1.0,
 'copy_X': True,
 'fit_intercept': True,
 'max_iter': None,
 'positive': False,
 'random_state': None,
 'solver': 'auto',
 'tol': 0.0001}

In [28]:
reg1 = GradientBoostingRegressor(n_estimators=100,max_depth=5).fit(X_train1, y_train1)
y_pred1 =reg1.predict(X_test)
print("MSE_1: ",mean_squared_error(y_test, y_pred1))

reg2 = GradientBoostingRegressor(n_estimators=500,max_depth=3).fit(X_train2, y_train2)
y_pred2 =reg2.predict(X_test)
print("MSE_2: ",mean_squared_error(y_test, y_pred2))

reg3 = GradientBoostingRegressor(n_estimators=250,max_depth=5).fit(X_train3, y_train3)
y_pred3 =reg3.predict(X_test)
print("MSE_3: ",mean_squared_error(y_test, y_pred3))

y_pred_mean = (y_pred1 + y_pred2 + y_pred3)/3
print("MSE(mean): ",mean_squared_error(y_test, y_pred_mean))



MSE_1:  2165256532.8107986




MSE_2:  1896394810.811034
MSE_3:  1671034045.7823763
MSE(mean):  1593862676.7987442




In [29]:
reg1 = xgb.XGBRegressor(n_estimators=100,max_depth=5).fit(X_train1, y_train1)
y_pred1 =reg1.predict(X_test)
print("MSE_1: ",mean_squared_error(y_test, y_pred1))

reg2 = xgb.XGBRegressor(n_estimators=500,max_depth=3).fit(X_train2, y_train2)
y_pred2 =reg2.predict(X_test)
print("MSE_2: ",mean_squared_error(y_test, y_pred2))

reg3 = xgb.XGBRegressor(n_estimators=250,max_depth=5).fit(X_train3, y_train3)
y_pred3 =reg3.predict(X_test)
print("MSE_3: ",mean_squared_error(y_test, y_pred3))

y_pred_mean = (y_pred1 + y_pred2 + y_pred3)/3
print("MSE(mean): ",mean_squared_error(y_test, y_pred_mean))

MSE_1:  2659526947.602061
MSE_2:  1973867137.7285433
MSE_3:  1899874320.2893667
MSE(mean):  1747102157.269015


In [15]:
Dtree1 = DecisionTreeRegressor(max_depth=3).fit(X_train1, y_train1)
y_pred1 = Dtree1.predict(X_test)
print("MSE: ",mean_squared_error(y_test, y_pred1))

Dtree2 = DecisionTreeRegressor(max_depth=5).fit(X_train2, y_train2)
y_pred2 =Dtree2.predict(X_test)
print("MSE: ",mean_squared_error(y_test, y_pred2))

Dtree3 = DecisionTreeRegressor(max_depth=9).fit(X_train3, y_train3)
y_pred3 =Dtree3.predict(X_test)
print("MSE: ",mean_squared_error(y_test, y_pred3))

y_pred_mean = (y_pred1 + y_pred2 + y_pred3)/3
print("MSE(mean): ",mean_squared_error(y_test, y_pred_mean))

MSE:  2664699338.9402046
MSE:  2208194456.6989045
MSE:  2193007582.7991037
MSE(mean):  1808190527.5706503


In [16]:
rid1= Ridge(alpha=0.1).fit(X_train1, y_train1)
y_pred1 =rid1.predict(X_test)
print("MSE: ",mean_squared_error(y_test, y_pred1))

rid2= Ridge(alpha=1.0, tol=0.001).fit(X_train2, y_train2)
y_pred2 =rid2.predict(X_test)
print("MSE: ",mean_squared_error(y_test, y_pred2))

rid3= Ridge(alpha=1.0, tol=0.005).fit(X_train3, y_train3)
y_pred3 =rid3.predict(X_test)
print("MSE: ",mean_squared_error(y_test, y_pred3))

y_pred_mean = (y_pred1 + y_pred2 + y_pred3)/3
print("MSE(mean): ",mean_squared_error(y_test, y_pred_mean))

MSE:  2686029105.2011023
MSE:  2668867039.9868913
MSE:  2716211753.264745
MSE(mean):  2681223886.5419602


In [17]:
lgb1 = lgb.LGBMRegressor(n_estimators=100,max_depth=5).fit(X_train1, y_train1)
y_pred1 =lgb1.predict(X_test)
print("MSE: ",mean_squared_error(y_test, y_pred1))

lgb2 = lgb.LGBMRegressor(n_estimators=500,max_depth=3).fit(X_train2, y_train2)
y_pred2 =lgb2.predict(X_test)
print("MSE: ",mean_squared_error(y_test, y_pred2))

lgb3 = lgb.LGBMRegressor(n_estimators=250,max_depth=5).fit(X_train3, y_train3)
y_pred3 =lgb3.predict(X_test)
print("MSE: ",mean_squared_error(y_test, y_pred3))

y_pred_mean = (y_pred1 + y_pred2 + y_pred3)/3
print("MSE(mean): ",mean_squared_error(y_test, y_pred_mean))
y_pred3[1]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000167 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 338
[LightGBM] [Info] Number of data points in the train set: 1022, number of used features: 2
[LightGBM] [Info] Start training from score 181552.910959
MSE:  1996989149.286145
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000071 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 339
[LightGBM] [Info] Number of data points in the train set: 1022, number of used features: 2
[LightGBM] [Info] Start training from score 181238.730920
MSE:  2217785612.142733
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000072 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 340
[LightGBM] [Info] Number of data points in the train set: 1022, numbe

149261.18481853406

In [31]:
models = [xgb.XGBRegressor(n_estimators=100,max_depth=5), GradientBoostingRegressor(n_estimators=100,max_depth=5), lgb.LGBMRegressor(n_estimators=100,max_depth=5)]
bg = Bagging()
bg.fit(models=models, X = X_train, y = y_train)
bg.predict(X_test, y_test)[:10]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000149 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 339
[LightGBM] [Info] Number of data points in the train set: 1022, number of used features: 2
[LightGBM] [Info] Start training from score 181238.730920
438
3
MSE(mean):  1814631973.8013737


array([218055.89613302, 154910.58515447, 130638.67269985, 166969.34172012,
       116097.58711692,  94462.82505701, 203757.6657332 , 124846.08936305,
       565684.79964423, 153076.47330579])

In [32]:
models = [xgb.XGBRegressor(n_estimators=100,max_depth=5), xgb.XGBRegressor(n_estimators=100,max_depth=5), xgb.XGBRegressor(n_estimators=100,max_depth=5)]
bg = Bagging()
bg.fit(models=models, X = X_train, y = y_train)
bg.predict(X_test, y_test)[:10]

438
3
MSE(mean):  1871564578.070478


array([224069.70833333, 149611.39583333, 131918.59375   , 167035.140625  ,
       115791.65625   ,  89797.33854167, 213509.984375  , 117679.70052083,
       560267.73958333, 151523.21875   ])

In [33]:
models = [lgb.LGBMRegressor(n_estimators=100,max_depth=5), lgb.LGBMRegressor(n_estimators=100,max_depth=5), lgb.LGBMRegressor(n_estimators=100,max_depth=5)]
bg = Bagging()
bg.fit(models=models, X = X_train, y = y_train)
bg.predict(X_test, y_test)[:10]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000088 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 341
[LightGBM] [Info] Number of data points in the train set: 1022, number of used features: 2
[LightGBM] [Info] Start training from score 183311.007828
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000055 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 338
[LightGBM] [Info] Number of data points in the train set: 1022, number of used features: 2
[LightGBM] [Info] Start training from score 181552.910959
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000215 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 339
[LightGBM] [Info] Number of data points in the train set: 1022, number of used features: 2
[LightGBM] [Info] Start tr

array([186616.01381421, 154654.02881744, 128076.41901312, 184057.70727728,
       128618.50371924,  91991.24366953, 202164.25872405, 126932.11921891,
       441678.27135167, 146946.96392326])

In [18]:
models = [DecisionTreeRegressor(max_depth=3), DecisionTreeRegressor(max_depth=5), DecisionTreeRegressor(max_depth=9)]
bg = Bagging()
bg.fit(models=models, X = X_train, y = y_train)
bg.predict(X_test, y_test)[:10]

438
3
MSE(mean):  2082229343.4478562


array([195193.94905274, 157110.48739213, 124787.49062646, 187879.01518027,
       131999.57030482, 106066.27635264, 198676.36175465, 120975.92384017,
       517266.01388889, 151956.47769667])

In [19]:
models = [Ridge(alpha=0.1), Ridge(alpha=1.0, tol=0.001), Ridge(alpha=1.0, tol=0.005)]
bg = Bagging()
bg.fit(models=models, X = X_train, y = y_train)
bg.predict(X_test, y_test)[:10]

438
3
MSE(mean):  2689812086.8563147


array([264112.64410977, 155652.55610252, 127886.33564298, 235903.28577607,
       138286.3105352 ,  69456.63737513, 211451.27681943, 131612.95223843,
       496702.73759219, 172779.36738094])

In [20]:
models = [Ridge(alpha=0.1), lgb.LGBMRegressor(n_estimators=100,max_depth=5), DecisionTreeRegressor(max_depth=9)]
bg = Bagging()
bg.fit(models=models, X = X_train, y = y_train)
bg.predict(X_test, y_test)[:10]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000129 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 338
[LightGBM] [Info] Number of data points in the train set: 1022, number of used features: 2
[LightGBM] [Info] Start training from score 181552.910959
438
3
MSE(mean):  2036815718.3806436


array([210476.95894213, 156890.54157627, 125680.79404234, 197150.55713645,
       138542.24963586,  85672.01381959, 213653.13741677, 123972.69742348,
       481443.71176392, 158152.00156221])

## [Problem 3] Stacking scratch mounting

In [27]:
X1_train, X2_train, X3_train  = np.split(X_train, [340, 681])
y1_train, y2_train, y3_train  = np.split(y_train, [340, 681])
X_train1 = np.vstack((X2_train, X3_train))
y_train1 = np.hstack((y2_train, y3_train))

reg1_Gr = GradientBoostingRegressor(n_estimators=100,max_depth=5).fit(X_train1, y_train1)
y_pred1_reg1_Gr =reg1_Gr.predict(X1_train)
print("MSE_1: ",mean_squared_error(y1_train, y_pred1_reg1_Gr))

X_train2 = np.vstack((X1_train, X3_train))
y_train2 = np.hstack((y1_train, y3_train))
reg2_Gr = GradientBoostingRegressor(n_estimators=500,max_depth=3).fit(X_train2, y_train2)
y_pred2_reg2_Gr =reg2_Gr.predict(X2_train)
print("MSE_2: ",mean_squared_error(y2_train, y_pred2_reg2_Gr))

X_train3 = np.vstack((X1_train, X2_train))
y_train3 = np.hstack((y1_train, y2_train))
reg3_Gr = GradientBoostingRegressor(n_estimators=250,max_depth=5).fit(X_train3, y_train3)
y_pred3_reg3_Gr =reg3_Gr.predict(X3_train)
print("MSE_3: ",mean_squared_error(y3_train, y_pred3_reg3_Gr))
y_train_Gr = np.hstack((y_pred1_reg1_Gr, y_pred2_reg2_Gr, y_pred2_reg2_Gr))



MSE_1:  2047547875.312971




MSE_2:  2292128931.2118974
MSE_3:  1440499776.3230839




In [22]:
X_train1 = np.vstack((X2_train, X3_train))
y_train1 = np.hstack((y2_train, y3_train))
reg1_LG = lgb.LGBMRegressor(n_estimators=100,max_depth=5).fit(X_train1, y_train1)
y_pred1_reg1_LG =reg1_LG.predict(X1_train)
print("MSE_1: ",mean_squared_error(y1_train, y_pred1_reg1_LG))

X_train2 = np.vstack((X1_train, X3_train))
y_train2 = np.hstack((y1_train, y3_train))
reg2_LG = lgb.LGBMRegressor(n_estimators=500,max_depth=3).fit(X_train2, y_train2)
y_pred2_reg2_LG =reg2_LG.predict(X2_train)
print("MSE_2: ",mean_squared_error(y2_train, y_pred2_reg2_LG))

X_train3 = np.vstack((X1_train, X2_train))
y_train3 = np.hstack((y1_train, y2_train))
reg3_LG = lgb.LGBMRegressor(n_estimators=250,max_depth=5).fit(X_train3, y_train3)
y_pred3_reg3_LG =reg3_LG.predict(X3_train)
print("MSE_3: ",mean_squared_error(y3_train, y_pred3_reg3_LG))
y_train_LG = np.hstack((y_pred1_reg1_LG, y_pred2_reg2_LG, y_pred2_reg2_LG))

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000046 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 311
[LightGBM] [Info] Number of data points in the train set: 682, number of used features: 2
[LightGBM] [Info] Start training from score 180268.259531
MSE_1:  1789291655.0779407
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000043 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 310
[LightGBM] [Info] Number of data points in the train set: 681, number of used features: 2
[LightGBM] [Info] Start training from score 176699.019090
MSE_2:  2088962682.1700788
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000055 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 314
[LightGBM] [Info] Number of data points in the train set: 681, nu

In [23]:
X_train1 = np.vstack((X2_train, X3_train))
y_train1 = np.hstack((y2_train, y3_train))
reg1_RF = RandomForestRegressor(n_estimators=100,max_depth=5).fit(X_train1, y_train1)
y_pred1_reg1_RF =reg1_RF.predict(X1_train)
print("MSE_1: ",mean_squared_error(y1_train, y_pred1_reg1_RF))

X_train2 = np.vstack((X1_train, X3_train))
y_train2 = np.hstack((y1_train, y3_train))
reg2_RF = RandomForestRegressor(n_estimators=500,max_depth=3).fit(X_train2, y_train2)
y_pred2_reg2_RF =reg2_RF.predict(X2_train)
print("MSE_2: ",mean_squared_error(y2_train, y_pred2_reg2_RF))

X_train3 = np.vstack((X1_train, X2_train))
y_train3 = np.hstack((y1_train, y2_train))
reg3_RF = RandomForestRegressor(n_estimators=250,max_depth=5).fit(X_train3, y_train3)
y_pred3_reg3_RF =reg3_RF.predict(X3_train)
print("MSE_3)",mean_squared_error(y3_train, y_pred3_reg3_RF))
y_train_RF = np.hstack((y_pred1_reg1_RF, y_pred2_reg2_RF, y_pred2_reg2_RF))



MSE_1:  1970509968.506887




MSE_2:  2421858114.9768577
MSE_3) 1182802514.6276407




In [24]:
train_mix = np.vstack((y_train_Gr, y_train_LG, y_train_RF)).T

DecisionTreeRegressor
reg_end = DecisionTreeRegressor(max_depth=9).fit(train_mix, y_train)

X_test1_Gr =reg1_Gr.predict(X_test)
X_test2_Gr =reg2_Gr.predict(X_test)
X_test3_Gr = reg3_Gr.predict(X_test)
y_test_Gr = (X_test1_Gr + X_test2_Gr + X_test3_Gr) /3

X_test1_LG =reg1_LG.predict(X_test)
X_test2_LG =reg2_LG.predict(X_test)
X_test3_LG = reg3_LG.predict(X_test)
y_test_LG = (X_test1_LG + X_test2_LG + X_test3_LG) /3

X_test1_RF =reg1_RF.predict(X_test)
X_test2_RF =reg2_RF.predict(X_test)
X_test3_RF = reg3_RF.predict(X_test)
y_test_RF = (X_test1_RF + X_test2_RF + X_test3_RF) /3





In [25]:

class Stacking():

    def __init__(self,models,end_model):
        self.models = models
        self.end_model = end_model

    def fit(self,X,y,K,seed):
        self.K = K
        KF = KFold(n_splits=K, random_state=seed, shuffle=True)
        self.pred_data = np.array([])
        X = np.array(X)
        y = np.array(y)
        self.preds = np.array([])
        self.model_list = []
        for i, model in enumerate(self.models):
            for train_index, test_index in KF.split(X):
                self.model_list.append(model.fit(X[train_index],y[train_index]))
                y_pred = model.predict(X[test_index])
                self.preds = np.append(self.preds, y_pred)
        self.preds = self.preds.reshape(len(self.models),X.shape[0]).T
        self.end_model.fit(self.preds, y)

    def predict(self,X, y):
        X = np.array(X)
        y = np.array(y)
        self.test_array = np.array([])
        self.final_test = np.zeros((X.shape[0],self.K))
        for model in self.model_list:
            y_pred_test = model.predict(X)
            self.test_array = np.append(self.test_array, y_pred_test)
        self.test_array=self.test_array.reshape(len(self.models)*self.K, X.shape[0]).T
        print("Test-->", self.test_array.shape)
        for j, i in enumerate(range(0,len(self.models)*self.K-self.K, self.K)):
            self.mean_pred = np.mean(self.test_array[:,i:i+self.K], axis=1)
            self.final_test[:,j] = self.mean_pred
        self.final_pred = self.end_model.predict(self.final_test)
        print("Last stage-->", self.final_test.shape)
        return self.final_pred

In [38]:
models = [SVR(kernel='linear', C=100), LinearRegression(), lgb.LGBMRegressor(n_estimators=100,max_depth=5)]
St = Stacking(models,end_model=RandomForestRegressor(n_estimators=100,max_depth=5))
St.fit(X=X_train, y=y_train,K=3,seed=0)
y_pred_end = St.predict(X_test, y_test)
print("MSE(Stacking): ",mean_squared_error(y_test, y_pred_end))

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000051 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 312
[LightGBM] [Info] Number of data points in the train set: 681, number of used features: 2
[LightGBM] [Info] Start training from score 180474.484581
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000052 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 309
[LightGBM] [Info] Number of data points in the train set: 681, number of used features: 2
[LightGBM] [Info] Start training from score 180849.674009
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000063 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 310
[LightGBM] [Info] Number of data points in the train set: 682, number of used features: 2
[LightGBM] [Info] Start train

In [34]:
models = [xgb.XGBRegressor(n_estimators=100,max_depth=5), GradientBoostingRegressor(n_estimators=100,max_depth=5), lgb.LGBMRegressor(n_estimators=100,max_depth=5)]
St = Stacking(models,end_model=RandomForestRegressor(n_estimators=100,max_depth=5))
St.fit(X=X_train, y=y_train,K=3,seed=0)
y_pred_end = St.predict(X_test, y_test)
print("MSE(Stacking): ",mean_squared_error(y_test, y_pred_end))

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000058 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 312
[LightGBM] [Info] Number of data points in the train set: 681, number of used features: 2
[LightGBM] [Info] Start training from score 180474.484581
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000054 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 309
[LightGBM] [Info] Number of data points in the train set: 681, number of used features: 2
[LightGBM] [Info] Start training from score 180849.674009
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000030 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 310
[LightGBM] [Info] Number of data points in the train se

In [39]:
models = [SVR(kernel='linear', C=100), LinearRegression(), xgb.XGBRegressor(n_estimators=100,max_depth=5), GradientBoostingRegressor(n_estimators=100,max_depth=5), lgb.LGBMRegressor(n_estimators=100,max_depth=5)]
St = Stacking(models,end_model=RandomForestRegressor(n_estimators=100,max_depth=5))
St.fit(X=X_train, y=y_train,K=5,seed=0)
y_pred_end = St.predict(X_test, y_test)
print("MSE(Stacking): ",mean_squared_error(y_test, y_pred_end))

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000056 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 343
[LightGBM] [Info] Number of data points in the train set: 817, number of used features: 2
[LightGBM] [Info] Start training from score 179914.347613
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000069 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 341
[LightGBM] [Info] Number of data points in the train set: 817, number of used features: 2
[LightGBM] [Info] Start training from score 181066.908201
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000055 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 340
[LightGBM] [Info] Number of data points in the train set: 818, number of used features: 2
[LightGBM] [Info] Start train