In [2]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
import random

In [3]:
df = pd.read_csv("./train.csv",
                 usecols=["GrLivArea", "YearBuilt", "SalePrice"])
df.head()

Unnamed: 0,YearBuilt,GrLivArea,SalePrice
0,2003,1710,208500
1,1976,1262,181500
2,2001,1786,223500
3,1915,1717,140000
4,2000,2198,250000


In [5]:
# Logarithmic transformation
df = np.log(df)

# Split
X = df[["YearBuilt", "GrLivArea"]]
y = df[["SalePrice"]]
# ndarray conversion
X_array = X.values
y_array = y.values
# split, shuffle True
X_train, X_valid, y_train, y_valid = train_test_split(
     X_array, y_array, test_size=0.2, random_state=0, shuffle=True)

# Standardize each feature
# Separate X and y and instantiate
scaler_X = StandardScaler()
scaler_y = StandardScaler()
X_train = scaler_X.fit_transform(X_train)
X_valid = scaler_X.transform(X_valid)
y_train = scaler_y.fit_transform(y_train).ravel()
y_valid = scaler_y.transform(y_valid).ravel()

# y_train = y_train.ravel()
# y_valid = y_valid.ravel()

In [6]:
# LinearRegression
reg_b = LinearRegression().fit(X_train, y_train)
pred_reg_b = reg_b.predict(X_valid)
print("MSE : LinearRegression")
print('{:.1e}'.format(mean_squared_error(y_valid, pred_reg_b)))  

MSE : LinearRegression
2.7e-01


In [7]:
# SVR
svr_b = SVR().fit(X_train, y_train)
pred_svr_b = svr_b.predict(X_valid)
print("MSE : SVR")
print('{:.1e}'.format(mean_squared_error(y_valid, pred_svr_b)))  

MSE : SVR
2.3e-01


In [8]:
# DecisionTreeRegressor
dt_b = DecisionTreeRegressor().fit(X_train, y_train)
pred_dt_b = dt_b.predict(X_valid)
print("MSE : DecisionTreeRegressor")
print('{:.1e}'.format(mean_squared_error(y_valid, pred_dt_b))) 

MSE : DecisionTreeRegressor
4.6e-01


In [10]:
# blending (average)

# LinearRegression + SVR
pred_reg_svr = (pred_reg_b + pred_svr_b)/2
# LinearRegression + DecisionTreeRegressor
pred_reg_dt = (pred_reg_b + pred_dt_b)/2
# SVR + DecisionTreeRegressor
pred_svr_dt = (pred_svr_b + pred_dt_b)/3

print("MSE")
print("LinearRegression + SVR : ")
print('{:.1e}\n'.format(mean_squared_error(y_valid, pred_reg_svr)))
print("LinearRegression + DecisionTreeRegressor : ")
print('{:.1e}\n'.format(mean_squared_error(y_valid, pred_reg_dt)))
print("SVR + DecisionTreeRegressor : ")
print('{:.1e}'.format(mean_squared_error(y_valid, pred_svr_dt)))

MSE
LinearRegression + SVR : 
2.4e-01

LinearRegression + DecisionTreeRegressor : 
2.9e-01

SVR + DecisionTreeRegressor : 
3.1e-01


In [12]:
X_train.shape[0]

1168

In [13]:
# Random sampling with duplicates from the sample index 500
index_list = [i for i in range(X_train.shape[0])]

# Initialize predictor
pred = 0
# repeat 10 times
for n in range(10):
     index = random.choices(index_list, k=500)
     # DecisionTreeRegressor
     dt = DecisionTreeRegressor().fit(X_train[index], y_train[index])
     pred_dt = dt.predict(X_valid)
     # total
     pred += pred_dt

# output the average
print("MSE : DecisionTreeRegressor / bagging")
print('{:.1e}'.format(mean_squared_error(y_valid, pred/10)))

MSE : DecisionTreeRegressor / bagging
2.7e-01


In [14]:
# Cross-validation learning
# Scratch every K-fold

# Division number
N = 4
# Number of samples after division
num = X_train.shape[0]//N

# list of learning models
model_list = [LinearRegression, SVR, DecisionTreeRegressor]
# list for retrieving trained models
model_para=[]
model_para_all = []
# Empty data to store the blend data
blend_data = np.zeros([y_train.shape[0], 3])

for i, model in enumerate(model_list):
     # Initialize per-model list
     model_para = []
     for n in range(N):
         # No shuffling here, as train_test_split is shuffling
         # get index after split
         index_valid = [a for a in range(num*n, num*(n+1))]
         index_train = [i for i in index_list if i not in index_valid]
         # study
         model_reg = model().fit(X_train[index_train], y_train[index_train])
         # predict
         blend_data[index_valid, i] = model_reg.predict(X_train[index_valid])
         # get model
         model_para.append(model_reg)
     model_para_all.append(model_para)

print(blend_data.shape)

(1168, 3)


In [15]:
svr_st = SVR(C=1.0, epsilon=0.2, kernel="linear").fit(blend_data, y_train)

In [20]:
# cross-validation prediction

# Initialize predictor
blend_pred = np.zeros([y_valid.shape[0], 3], dtype=np.float)
# predict
for n, models in enumerate(model_para_all):
     # Initialize predictions for each model
     pred = np.zeros_like(y_valid)
     for model in models:
        pred = pred + model.predict(X_valid)
     blend_pred[:, n] = pred/3

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  after removing the cwd from sys.path.


In [21]:
# Prediction on blended data
print("MSE : stacking")
print('{:.1e}'.format(mean_squared_error(y_valid, svr_st.predict(blend_pred))))

MSE : stacking
2.9e-01
