# Kaggle Competition Ames Housing Prices: Team Integreat

In [None]:
import pandas as pd
import numpy as np
import plotly
import plotly.plotly as py
from scipy import stats
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

### Load and combine test and train

In [None]:
# Read the two Kaggle files.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# `pop` hands back the 'Id' column and removes it from the frame in one step;
# the ids are kept for the submission files but are not usable as a feature.
train_ID = train.pop('Id')
test_ID = test.pop('Id')

target = ['SalePrice']

# Stack train on top of test so preprocessing is applied to both at once.
all_data = pd.concat([train, test], ignore_index=True)
all_data.head()

**Initial EDA**

In [None]:
# Initial EDA: summary statistics of the combined frame.
all_data.describe()

# Investigate relationships between a handful of numeric columns and the target.
train.describe().columns
cols = ['LotArea','OverallQual','OverallCond','TotalBsmtSF','GarageArea', 'SalePrice']
sns.pairplot(train[cols], size=2)

# Observations from the pair plot:
# - SalePrice is right-skewed -> log or Box-Cox transformation
# - TotalBsmtSF and GarageArea look roughly normally distributed
# - SalePrice correlates with TotalBsmtSF, GarageArea and OverallQual
# - Multicollinearity: OverallQual/TotalBsmtSF and TotalBsmtSF/GarageArea

# Correlation matrix with one variable per column (rowvar=False).
cor = np.corrcoef(train[cols].values, rowvar=False)
sns.set(font_scale=1.5)
heatmap_options = {
    'cbar': True,
    'annot': True,
    'square': True,
    'fmt': '.2f',
    'annot_kws': {'size': 15},
    'yticklabels': cols,
    'xticklabels': cols,
}
heat = sns.heatmap(cor, **heatmap_options)

# SalePrice correlates with: GarageArea, TotalSF

In [None]:
# Investigate skewness of the numeric columns.
plt.rcParams['figure.figsize'] = (20, 20)
# Skewness is only defined for numeric data, so non-numeric columns are dropped
# explicitly instead of relying on pandas to ignore them.
numeric_skew = all_data.select_dtypes(include=[np.number]).skew()
numeric_skew.plot(kind = "barh")
plt.title("Skewness of the Continuous Numerical Features in the Data")
plt.show()

**Feature engineering**

In [None]:
from preprocess import impute, Encoder, Skewness, dummify

# Apply the project's preprocessing helpers in sequence; each takes the frame and
# returns the transformed frame:
#   dummify  - make dummy variables for missing values
#   impute   - impute missing values
#   Encoder  - encode categorical variables
#   Skewness - fix skewness of selected variables
for preprocessing_step in (dummify, impute, Encoder, Skewness):
    all_data = preprocessing_step(all_data)
all_data.head()

**split dataset again**

In [None]:
# Split the preprocessed frame back into the original train and test rows.
all_data_nomiss = all_data.copy()
trainset = len(train)  # train rows were stacked first, so they occupy the first `trainset` rows
# Explicit copies so later column assignments never write into a slice of all_data_nomiss.
train = all_data_nomiss[:trainset].copy()
test = all_data_nomiss[trainset:].copy()

# Keep the frame's own column order. A set difference has no defined order, so the
# feature matrix (and anything that samples columns) could differ between kernel runs.
features = [column for column in all_data_nomiss.columns if column not in target]
# Models are trained on log1p(SalePrice); predictions must be passed through np.expm1.
Y_train = np.log1p(train['SalePrice'])
X_train = train[features]
X_test = test[features]

#if validation within test set is wanted
#train = train.copy()
#train['is_train'] = np.random.uniform(0, 1, len(train)) <= .75
#Train, Validate = train[train['is_train']==True], train[train['is_train']==False]

#x_train = Train[list(features)].values
#y_train = Train["SalePrice"].values

#x_validate = Validate[list(features)].values
#y_validate = Validate["SalePrice"].values

#x_test=test[list(features)].values

## **Models**

**simple models**

In [None]:
#linear regression
from sklearn.linear_model import LinearRegression
linear = LinearRegression()

linear.fit(X_train, Y_train)
# Printed explicitly: a bare expression in the middle of a cell is never displayed.
print('Training R^2:', linear.score(X_train, Y_train))

# Y_train is log1p(SalePrice), so predictions are mapped back to prices with expm1
# before they are written to the submission file.
pred = np.expm1(linear.predict(X_test))

submission = pd.DataFrame({'Id': test_ID, 'SalePrice': pred})
submission.to_csv('linear_no1.csv', index=False)

In [None]:
#ElasticNet
from sklearn.linear_model import ElasticNet
elastic = ElasticNet(alpha = 1, l1_ratio = 0.5)

elastic.fit(X_train, Y_train)
# Printed explicitly: a bare expression in the middle of a cell is never displayed.
print('Training R^2:', elastic.score(X_train, Y_train))

# Y_train is log1p(SalePrice), so predictions are mapped back to prices with expm1
# before they are written to the submission file.
pred = np.expm1(elastic.predict(X_test))

submission = pd.DataFrame({'Id': test_ID, 'SalePrice': pred})
submission.to_csv('elastic_no1.csv', index=False)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

# Lasso selects the features; an ordinary least-squares model is fitted on the survivors.
lasso_reg = Lasso(alpha=0.001, tol=0.01, random_state=1)

Lasso_linear = make_pipeline(SelectFromModel(lasso_reg, prefit=False, threshold=None), LinearRegression())

Lasso_linear.fit(X_train, Y_train)
# In-sample predictions: the RMSE and the diagnostic plot need predictions with the
# same length as Y_train, so test-set predictions cannot be used for them.
pred_train = Lasso_linear.predict(X_train)
# Test-set predictions, still on the log1p scale of Y_train.
pred = Lasso_linear.predict(X_test)

print('The R^2 is:', Lasso_linear.score(X_train, Y_train))
print('CV score is:',np.mean(cross_val_score(estimator = Lasso_linear, X = X_train, y = Y_train, cv=10, n_jobs=-1)))
# Computed inline so the cell does not rely on a helper defined further down the notebook.
print('The RMSE is:', np.sqrt(mean_squared_error(Y_train, pred_train)))

sns.regplot(x=Y_train, y=pred_train, fit_reg=True, color='red')

# Reverse the log1p applied to the target before writing the submission.
final_status = np.expm1(pred)
submission = pd.DataFrame({'Id': test_ID, 'SalePrice':final_status})
submission.to_csv('lass0_linear.csv', index=False)

In [None]:
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

# Standardise before PCA: the principal components follow variance, so without
# scaling the columns with the largest raw units would dominate them.
Scaler_PCA_linear = make_pipeline(StandardScaler(), PCA(n_components = 59), LinearRegression())

Scaler_PCA_linear.fit(X_train, Y_train)
Scaler_PCA_linear.score(X_train, Y_train)

In [None]:
# DecisionTree
from sklearn.tree import DecisionTreeRegressor

tree = DecisionTreeRegressor()
tree.fit(X_train, Y_train)
# Printed explicitly: a bare expression in the middle of a cell is never displayed.
print('Training R^2:', tree.score(X_train, Y_train))

# Y_train is log1p(SalePrice); expm1 maps predictions back to prices.
pred = np.expm1(tree.predict(X_test))

submission = pd.DataFrame({'Id': test_ID, 'SalePrice':pred})
submission.to_csv('tree.csv', index=False)

In [None]:
# Rank features by random-forest importance and select the above-average ones.
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from matplotlib import cm

feat_labels = X_train.columns[0:]
n_features = X_train.shape[1]

# Y_train is a continuous target (log price), so a regressor is required;
# a classifier rejects continuous targets.
forest = RandomForestRegressor(n_estimators = 500, random_state=1)
forest.fit(X_train, Y_train)

# Importances and the column positions that sort them in ascending order.
importances = forest.feature_importances_
indices = np.argsort(importances)

# Print every column with its importance, least important first.
for feature in range(n_features):
    print("%2d) %-*s %f" % (feature + 1, 30, feat_labels[indices[feature]],
                                 importances[indices[feature]]))

# Visualize the result with all columns: one colour per feature.
color = cm.inferno_r(np.linspace(.4,.8, n_features))

plt.figure(figsize=(20,20))
plt.title('Feature Importance')

x = plt.barh(range(n_features), importances[indices], align='center', color=color)
# Labels are reordered with the same `indices` as the bars so each bar keeps its own name.
x = plt.yticks(range(n_features), feat_labels[indices], rotation = 0, size=12)
x = plt.ylim([-1, n_features])

# Use the mean importance as threshold (SelectFromModel's default when threshold=None
# for this estimator) to pick features.
importances2 = forest.feature_importances_
indices2 = np.argsort(importances2)[::-1]

sfm = SelectFromModel(forest, prefit=True, threshold=None)
X_selected = sfm.transform(X_train)

# Print the selected features, most important first.
for feature in range(X_selected.shape[1]):
    print("%2d) %-*s %f" % (feature + 1, 30, feat_labels[indices2[feature]],
                                 importances2[indices2[feature]]))

In [None]:
#Pipeline: ElasticNet-based feature selection feeding a support vector regressor
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVR

# `elastic` is the ElasticNet created in the ElasticNet cell above; with prefit=False
# SelectFromModel refits it when the pipeline is fitted.
pipe_tree = make_pipeline(SelectFromModel(elastic, prefit=False, threshold=None), SVR())
pipe_tree.fit(X_train, Y_train)
# Printed explicitly: a bare expression in the middle of a cell is never displayed.
print('Training R^2:', pipe_tree.score(X_train, Y_train))

# Y_train is log1p(SalePrice); expm1 maps predictions back to prices.
pred = np.expm1(pipe_tree.predict(X_test))

submission = pd.DataFrame({'Id': test_ID, 'SalePrice':pred})
submission.to_csv('elastic_svm.csv', index=False)

## **GridSearch with RF and Boost models**

In [None]:
from models import modelfitRF
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Baseline random forest, evaluated with the project's modelfitRF helper.
rf = RandomForestRegressor(n_estimators=500, random_state=1, oob_score=True, n_jobs=-1)

# The helper imported from `models` is called modelfitRF.
modelfitRF(rf, X_train, features)

In [None]:
#tuning n_estimators
# Imported in this cell so it does not depend on an import cell further down;
# sklearn.model_selection is the module the rest of the notebook already uses.
from sklearn.model_selection import GridSearchCV

param_test1 = {'n_estimators':[50,70,80,90,100,200,500,1000]}
# `iid` is not passed: the argument was removed from GridSearchCV in scikit-learn 0.24.
gsearch1 = GridSearchCV(estimator = RandomForestRegressor(oob_score=True),
                        param_grid = param_test1, n_jobs=-1, cv=5)

gsearch1.fit(X_train,Y_train)
# `cv_results_` replaces `grid_scores_`, which was removed in scikit-learn 0.20.
gsearch1.cv_results_['mean_test_score'], gsearch1.best_params_, gsearch1.best_score_

In [None]:
# Random forest with 100 trees and out-of-bag scoring, evaluated with modelfitRF.
rf1_settings = {'oob_score': True, 'n_estimators': 100}
rf1 = RandomForestRegressor(**rf1_settings)

modelfitRF(rf1, X_train, features)

In [None]:
# tuning max_depth
# Imported in this cell so it does not depend on an import cell further down.
from sklearn.model_selection import GridSearchCV

param_test2 = {'max_depth':[3,5,7,9]}

# `iid` is not passed: the argument was removed from GridSearchCV in scikit-learn 0.24.
gsearch2 = GridSearchCV(estimator = RandomForestRegressor(oob_score=True, n_estimators = 100),
                        param_grid = param_test2, n_jobs=-1, cv=5)

gsearch2.fit(X_train,Y_train)
# `cv_results_` replaces `grid_scores_`, which was removed in scikit-learn 0.20.
gsearch2.cv_results_['mean_test_score'], gsearch2.best_params_, gsearch2.best_score_

In [None]:
#zooming into max_depth
# Imported in this cell so it does not depend on an import cell further down.
from sklearn.model_selection import GridSearchCV

param_test3 = {'max_depth':[8,9,10,11,12,15]}

# `iid` is not passed: the argument was removed from GridSearchCV in scikit-learn 0.24.
gsearch3 = GridSearchCV(estimator = RandomForestRegressor(oob_score=True, n_estimators = 100),
                        param_grid = param_test3, n_jobs=-1, cv=5)

gsearch3.fit(X_train,Y_train)
# `cv_results_` replaces `grid_scores_`, which was removed in scikit-learn 0.20.
gsearch3.cv_results_['mean_test_score'], gsearch3.best_params_, gsearch3.best_score_

In [None]:
# tuning min_samples_leaf and min_samples_split
# Imported in this cell so it does not depend on an import cell further down.
from sklearn.model_selection import GridSearchCV

param_test4 = {'min_samples_split':[2,10,30,50],
               'min_samples_leaf':[20,50,10,200,400]}

# `iid` is not passed: the argument was removed from GridSearchCV in scikit-learn 0.24.
gsearch4 = GridSearchCV(estimator = RandomForestRegressor(oob_score=True, n_estimators = 100, max_depth=10),
                        param_grid = param_test4, n_jobs=-1, cv=5)

gsearch4.fit(X_train,Y_train)
# `cv_results_` replaces `grid_scores_`, which was removed in scikit-learn 0.20.
gsearch4.cv_results_['mean_test_score'], gsearch4.best_params_, gsearch4.best_score_

In [None]:
# Evaluate the refitted best estimator from the min_samples_* grid search.
best_rf_from_gsearch4 = gsearch4.best_estimator_
modelfitRF(best_rf_from_gsearch4, X_train, features)

In [None]:
# tuning max_features
# Imported in this cell so it does not depend on an import cell further down.
from sklearn.model_selection import GridSearchCV

param_test5 = {'max_features':[10,20,30,40,50,60,70,80]}

# `iid` is not passed: the argument was removed from GridSearchCV in scikit-learn 0.24.
gsearch5 = GridSearchCV(estimator = RandomForestRegressor(oob_score=True, n_estimators = 100, max_depth=9,
                                                         min_samples_leaf=10, min_samples_split=10),
                        param_grid = param_test5, n_jobs=-1, cv=5)

gsearch5.fit(X_train,Y_train)
# `cv_results_` replaces `grid_scores_`, which was removed in scikit-learn 0.20.
gsearch5.cv_results_['mean_test_score'], gsearch5.best_params_, gsearch5.best_score_

**Final Random Forest**

In [None]:
# Final random forest with the tuned hyper-parameters written out explicitly.
# The estimator is bound to a name and passed to modelfitRF, mirroring the final
# XGBoost and GradientBoost cells; previously it was built and immediately discarded.
rf_final = RandomForestRegressor(oob_score=True,
                                 n_estimators = 100,
                                 max_depth=9,
                                 max_features=40,
                                 min_samples_leaf=10,
                                 min_samples_split=10)

modelfitRF(rf_final, X_train, features)

## **XGBoost**

In [None]:
import xgboost as xgb
from xgboost.sklearn import XGBRegressor
from sklearn import metrics   #Additional scklearn functions
from sklearn.grid_search import GridSearchCV   #Perforing grid search

import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 20, 4

from models import modelfitxgb

In [None]:
# Starting XGBoost configuration; xgb1 is also the base estimator of the first
# grid searches below.
xgb1_settings = dict(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='reg:linear',
    n_jobs=-1,
    scale_pos_weight=1,
    seed=27,
)
xgb1 = XGBRegressor(**xgb1_settings)

modelfitxgb(xgb1, X_train)

In [None]:
#tuning step 1: max_depth and min_child_weight
# Imported in this cell so the search always uses the model_selection implementation.
from sklearn.model_selection import GridSearchCV

param_test1 = {
 'max_depth':[3,5,7,9],
 'min_child_weight':[1,3,5]
}
# `iid` is not passed: the argument was removed from GridSearchCV in scikit-learn 0.24.
gsearch1 = GridSearchCV(estimator = xgb1,param_grid = param_test1,n_jobs=-1, cv=5)
gsearch1.fit(X_train,Y_train)

# `cv_results_` replaces `grid_scores_`, which was removed in scikit-learn 0.20.
gsearch1.cv_results_['mean_test_score'], gsearch1.best_params_, gsearch1.best_score_

In [None]:
#tuning step 2: zooming in on max_depth
# Imported in this cell so the search always uses the model_selection implementation.
from sklearn.model_selection import GridSearchCV

param_test2 = {
 'max_depth':[8,9,10],
 'min_child_weight':[1]
}
# `iid` is not passed: the argument was removed from GridSearchCV in scikit-learn 0.24.
gsearch2 = GridSearchCV(estimator = xgb1,param_grid = param_test2,n_jobs=-1, cv=5)
gsearch2.fit(X_train,Y_train)

# `cv_results_` replaces `grid_scores_`, which was removed in scikit-learn 0.20.
gsearch2.cv_results_['mean_test_score'], gsearch2.best_params_, gsearch2.best_score_

In [None]:
#tuning step 3: zooming in on min_child_weight
# Imported in this cell so the search always uses the model_selection implementation.
from sklearn.model_selection import GridSearchCV

param_test3 = {
 'max_depth':[3],
 'min_child_weight':[4,5,6]
}
# `iid` is not passed: the argument was removed from GridSearchCV in scikit-learn 0.24.
gsearch3 = GridSearchCV(estimator = xgb1,param_grid = param_test3,n_jobs=-1, cv=5)
gsearch3.fit(X_train,Y_train)

# `cv_results_` replaces `grid_scores_`, which was removed in scikit-learn 0.20.
gsearch3.cv_results_['mean_test_score'], gsearch3.best_params_, gsearch3.best_score_

In [None]:
# tuning gamma over 0.0, 0.1, 0.2, 0.3, 0.4
# Imported in this cell so the search always uses the model_selection implementation.
from sklearn.model_selection import GridSearchCV

param_test4 = {
 'gamma':[i/10.0 for i in range(0,5)]
}

# `iid` is not passed: the argument was removed from GridSearchCV in scikit-learn 0.24.
gsearch4 = GridSearchCV(estimator = XGBRegressor(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=5,
 min_child_weight=4,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'reg:linear',
 n_jobs=-1,
 scale_pos_weight=1,
 seed=27),param_grid = param_test4,n_jobs=-1, cv=5)

gsearch4.fit(X_train,Y_train)
# `cv_results_` replaces `grid_scores_`, which was removed in scikit-learn 0.20.
gsearch4.cv_results_['mean_test_score'], gsearch4.best_params_, gsearch4.best_score_

In [None]:
# XGBoost with the values chosen so far:
#   max_depth: 5, min_child_weight: 4, gamma: 0
xgb2_settings = dict(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=4,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='reg:linear',
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)
xgb2 = XGBRegressor(**xgb2_settings)

modelfitxgb(xgb2, X_train)

In [None]:
# tuning subsample and colsample_bytree
# take values 0.6,0.7,0.8,0.9 for both to start with
# Imported in this cell so the search always uses the model_selection implementation.
from sklearn.model_selection import GridSearchCV

param_test5 = {
 'subsample':[i/10.0 for i in range(6,10)],
 'colsample_bytree':[i/10.0 for i in range(6,10)]
}

# `iid` is not passed: the argument was removed from GridSearchCV in scikit-learn 0.24.
gsearch5 = GridSearchCV(estimator = XGBRegressor(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=5,
 min_child_weight=4,
 gamma=0,
 objective= 'reg:linear',
 n_jobs=-1,
 scale_pos_weight=1,
 seed=27),param_grid = param_test5,n_jobs=-1, cv=5)

gsearch5.fit(X_train,Y_train)
# `cv_results_` replaces `grid_scores_`, which was removed in scikit-learn 0.20.
gsearch5.cv_results_['mean_test_score'], gsearch5.best_params_, gsearch5.best_score_

In [None]:
# tuning subsmaple and colsample
# go in 0.05 increments around 0.8 or 0.6 respectively
param_test6 = {
 'subsample':[i/100.0 for i in range(75,85,5)],
 'colsample_bytree':[i/100.0 for i in range(55,65,5)]
}

gsearch6 = GridSearchCV(estimator = XGBRegressor(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=5,
 min_child_weight=4,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'reg:linear',
 n_jobs=-1,
 scale_pos_weight=1,
 seed=27),param_grid = param_test6,n_jobs=-1,iid=False, cv=5) 
                        
                        
%timeit gsearch6.fit(X_train,Y_train)
gsearch6.grid_scores_, gsearch6.best_params_, gsearch6.best_score_

In [None]:
param_test7 = {
 'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]
}
gsearch7 = GridSearchCV(estimator = XGBRegressor(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=5,
 min_child_weight=4,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.55,
 objective= 'reg:linear',
 n_jobs=-1,
 scale_pos_weight=1,
 seed=27),param_grid = param_test7,n_jobs=-1,iid=False, cv=5) 


%timeit gsearch7.fit(X_train,Y_train)
gsearch7.grid_scores_, gsearch7.best_params_, gsearch7.best_score_

**Final XGBoost**

In [None]:
# Final XGBoost model with the tuned values:
#   max_depth: 5, min_child_weight: 4, gamma: 0,
#   subsample: 0.8, colsample_bytree: 0.55, reg_alpha: 0.01
xgb3_settings = dict(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=4,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.55,
    objective='reg:linear',
    nthread=4,
    scale_pos_weight=1,
    reg_alpha=0.01,
    seed=27,
)
xgb3 = XGBRegressor(**xgb3_settings)

modelfitxgb(xgb3, X_train)

## **GradientBoost**

In [None]:
from sklearn.ensemble import GradientBoostingRegressor  #GBM algorithm
from sklearn import cross_validation, metrics   #Additional scklearn functions
from sklearn.grid_search import GridSearchCV   #Perforing grid searchc
from models import modelfitGB

import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 20, 4

In [None]:
# Baseline: gradient boosting with library defaults, seeded so the run can be repeated.
baseline_seed = 10
gbm0 = GradientBoostingRegressor(random_state=baseline_seed)

modelfitGB(gbm0, X_train, features)

In [None]:
# tuning n_estimators
# Imported in this cell so the search always uses the model_selection implementation.
from sklearn.model_selection import GridSearchCV

param_test1 = {'n_estimators':[70,80,90,100]}
# `iid` is not passed: the argument was removed from GridSearchCV in scikit-learn 0.24.
gsearch1 = GridSearchCV(estimator = GradientBoostingRegressor(learning_rate=0.1,
                                                              min_samples_split=500,
                                                              min_samples_leaf=50,
                                                              max_depth=8,
                                                              max_features='sqrt',
                                                              subsample=0.8,
                                                              random_state=10),
                        param_grid = param_test1, scoring='neg_mean_squared_error',n_jobs=-1, cv=5)
gsearch1.fit(X_train,Y_train)

# Scores are negated mean squared errors, so values closer to zero are better.
# `cv_results_` replaces `grid_scores_`, which was removed in scikit-learn 0.20.
gsearch1.cv_results_['mean_test_score'], gsearch1.best_params_, gsearch1.best_score_

In [None]:
# tuning max_depth and min_samples_split
# Imported in this cell so the search always uses the model_selection implementation.
from sklearn.model_selection import GridSearchCV

param_test2 = {'max_depth':[5, 7, 9, 11, 13, 15], 'min_samples_split':[200, 400, 600, 800, 1000]}
# `iid` is not passed: the argument was removed from GridSearchCV in scikit-learn 0.24.
gsearch2 = GridSearchCV(estimator = GradientBoostingRegressor(n_estimators=100,
                                                              learning_rate=0.1,
                                                              min_samples_split=500,
                                                              min_samples_leaf=50,
                                                              max_depth=8,
                                                              max_features='sqrt',
                                                              subsample=0.8,
                                                              random_state=10),
                        param_grid = param_test2, scoring='neg_mean_squared_error',n_jobs=-1, cv=5)

gsearch2.fit(X_train, Y_train)
# Scores are negated mean squared errors, so values closer to zero are better.
# `cv_results_` replaces `grid_scores_`, which was removed in scikit-learn 0.20.
gsearch2.cv_results_['mean_test_score'], gsearch2.best_params_, gsearch2.best_score_


In [None]:
# tuning min_samples_split and min_samples_leaf
# Imported in this cell so the search always uses the model_selection implementation.
from sklearn.model_selection import GridSearchCV

param_test3 = {'min_samples_split':[50,150,200,250,300], 'min_samples_leaf':[2,5,10,20,30,40,50,60,70]}

# `iid` is not passed: the argument was removed from GridSearchCV in scikit-learn 0.24.
gsearch3 = GridSearchCV(estimator = GradientBoostingRegressor(n_estimators=100,
                                                              learning_rate=0.1,
                                                              min_samples_split=200,
                                                              min_samples_leaf=50,
                                                              max_depth=7,
                                                              max_features='sqrt',
                                                              subsample=0.8,
                                                              random_state=10),
                        param_grid = param_test3, scoring='neg_mean_squared_error',n_jobs=-1, cv=5)
gsearch3.fit(X_train,Y_train)
# Scores are negated mean squared errors, so values closer to zero are better.
# `cv_results_` replaces `grid_scores_`, which was removed in scikit-learn 0.20.
gsearch3.cv_results_['mean_test_score'], gsearch3.best_params_, gsearch3.best_score_

In [None]:
# Evaluate the refitted best estimator from the min_samples_* grid search.
best_gbm_from_gsearch3 = gsearch3.best_estimator_
modelfitGB(best_gbm_from_gsearch3, X_train, features)

In [None]:
# tuning max_features (coarse grid)
# Imported in this cell so the search always uses the model_selection implementation.
from sklearn.model_selection import GridSearchCV

param_test4 = {'max_features':[5, 10,20,30,40,50,60,70,80]}

# `iid` is not passed: the argument was removed from GridSearchCV in scikit-learn 0.24.
gsearch4 = GridSearchCV(estimator = GradientBoostingRegressor(n_estimators=100,
                                                              learning_rate=0.1,
                                                              min_samples_split=150,
                                                              min_samples_leaf=2,
                                                              max_depth=7,
                                                              max_features='sqrt',
                                                              subsample=0.8,
                                                              random_state=10),
                        param_grid = param_test4, scoring='neg_mean_squared_error',n_jobs=-1, cv=5)
gsearch4.fit(X_train,Y_train)
# Scores are negated mean squared errors, so values closer to zero are better.
# `cv_results_` replaces `grid_scores_`, which was removed in scikit-learn 0.20.
gsearch4.cv_results_['mean_test_score'], gsearch4.best_params_, gsearch4.best_score_

# Values recorded after this step:
#n_estimators=100
#min_samples_split: 150
#min_samples_leaf: 2
#max_depth: 7
#max_features: 40

In [None]:
# tuning max_features (fine grid, 5 to 13)
# Imported in this cell so the search always uses the model_selection implementation.
from sklearn.model_selection import GridSearchCV

param_test4_1 = {'max_features':[5, 6,7,8,9,10,11,12,13]}

# `iid` is not passed: the argument was removed from GridSearchCV in scikit-learn 0.24.
gsearch4_1 = GridSearchCV(estimator = GradientBoostingRegressor(n_estimators=100,
                                                              learning_rate=0.1,
                                                              min_samples_split=150,
                                                              min_samples_leaf=2,
                                                              max_depth=7,
                                                              max_features='sqrt',
                                                              subsample=0.8,
                                                              random_state=10),
                          param_grid = param_test4_1, scoring='neg_mean_squared_error',n_jobs=-1, cv=5)
gsearch4_1.fit(X_train,Y_train)
# Scores are negated mean squared errors, so values closer to zero are better.
# `cv_results_` replaces `grid_scores_`, which was removed in scikit-learn 0.20.
gsearch4_1.cv_results_['mean_test_score'], gsearch4_1.best_params_, gsearch4_1.best_score_

# Values carried into the following cells:
#n_estimators=100
#min_samples_split: 150
#min_samples_leaf: 2
#max_depth: 7
#max_features: 9

In [None]:
# tuning subsample
# Imported in this cell so the search always uses the model_selection implementation.
from sklearn.model_selection import GridSearchCV

param_test5 = {'subsample':[0.6,0.7,0.75,0.8,0.85,0.9]}

# `iid` is not passed: the argument was removed from GridSearchCV in scikit-learn 0.24.
gsearch5 = GridSearchCV(estimator = GradientBoostingRegressor(n_estimators=100,
                                                              learning_rate=0.1,
                                                              min_samples_split=150,
                                                              min_samples_leaf=2,
                                                              max_depth=7,
                                                              max_features=9,
                                                              subsample=0.8,
                                                              random_state=10),
                        param_grid = param_test5, scoring='neg_mean_squared_error',n_jobs=-1, cv=5)

gsearch5.fit(X_train,Y_train)
# Scores are negated mean squared errors, so values closer to zero are better.
# `cv_results_` replaces `grid_scores_`, which was removed in scikit-learn 0.20.
gsearch5.cv_results_['mean_test_score'], gsearch5.best_params_, gsearch5.best_score_

**Final GradientBoost**

In [None]:
# Final gradient-boosting model: the tuned tree settings with the learning rate
# halved to 0.05 and the number of trees doubled to 200.
gbm_tuned_settings = {
    'n_estimators': 200,
    'learning_rate': 0.05,
    'min_samples_split': 150,
    'min_samples_leaf': 2,
    'max_depth': 7,
    'max_features': 9,
    'subsample': 0.8,
    'random_state': 10,
}
gbm_tuned_1 = GradientBoostingRegressor(**gbm_tuned_settings)
modelfitGB(gbm_tuned_1, X_train, features)

In [None]:
# LightGBM experiment (the original author noted they were not able to install LightGBM).
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

# Uses the notebook's X_train / Y_train; the lower-case x_train / y_train names only
# exist in commented-out code.
train_data=lgb.Dataset(X_train,label=Y_train)
params = {'learning_rate':0.001}
model= lgb.train(params, train_data, 100)

# The Kaggle test set has no labels, so only the in-sample error can be computed.
lgb_train_pred = model.predict(X_train)
# Stored under its own name so it does not rebind the notebook's `rmse` helper.
lgb_rmse = mean_squared_error(Y_train, lgb_train_pred)**0.5
lgb_rmse

## KNN

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from models import modelfitKNN

In [None]:
# Baseline k-nearest-neighbours regressor with scikit-learn's default settings,
# evaluated with the project's modelfitKNN helper (imported from `models`).
knn = KNeighborsRegressor()

modelfitKNN(knn, X_train)

In [None]:
# tuning n_neighbors and the neighbour weighting scheme
# Imported in this cell so the search always uses the model_selection implementation.
from sklearn.model_selection import GridSearchCV

param_test1 = { 'n_neighbors': [3, 5, 7, 9,11,13,17],
                'weights' : ['uniform','distance']}

# `iid` is not passed: the argument was removed from GridSearchCV in scikit-learn 0.24.
gsearch1 = GridSearchCV(estimator = KNeighborsRegressor(),
                        param_grid = param_test1,n_jobs=-1, cv=5)

gsearch1.fit(X_train,Y_train)
# `cv_results_` replaces `grid_scores_`, which was removed in scikit-learn 0.20.
gsearch1.cv_results_['mean_test_score'], gsearch1.best_params_, gsearch1.best_score_

## **Stacking**

In [None]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.base import clone


def _apply_transform(values, func=None):
    """Return ``func(values)``, or ``values`` unchanged when ``func`` is None."""
    if func is None:
        return values
    return func(values)


def stacking_regression(models, meta_model, X_train, y_train, X_test,
             transform_target=None, transform_pred=None,
             metric=None, n_folds=3, average_fold=True,
             shuffle=False, random_state=0, verbose=1):
    """Two-level stacking: out-of-fold predictions of `models` feed `meta_model`.

    Parameters
    ----------
    models : list of estimators
        First-level models. Each is cloned per fold; the originals are only
        fitted when ``average_fold`` is False.
    meta_model : estimator
        Second-level model; it is fitted in place on the out-of-fold predictions.
    X_train, y_train, X_test : array-like
        Training features, training target and test features. DataFrames and
        Series are converted to NumPy arrays so rows are selected by position.
    transform_target : callable or None
        Applied to the target before every fit (first and second level).
    transform_pred : callable or None
        Applied to every prediction (first and second level).
    metric : callable or None
        ``metric(y_true, y_pred)`` used for the progress report; defaults to
        ``mean_squared_error``.
    n_folds : int
        Number of cross-validation folds.
    average_fold : bool
        True: test features are the mean of the per-fold test predictions.
        False: each model is refitted on the full training set instead.
    shuffle, random_state
        Passed to ``KFold``; ``random_state`` is only used when ``shuffle`` is True.
    verbose : int
        0 silent, 1 per-model summary, 2 additionally per-fold scores.

    Returns
    -------
    numpy.ndarray
        ``meta_model`` predictions for ``X_test`` after ``transform_pred``.
    """
    if metric is None:
        metric = mean_squared_error

    if verbose > 0:
        print('metric: [%s]\n' % metric.__name__)

    # KFold rejects a random_state when shuffling is off, so only pass it along
    # when it has an effect.
    kf = KFold(n_splits = n_folds, shuffle = shuffle,
               random_state = random_state if shuffle else None)

    # Positional fold indices require plain arrays; indexing a pandas object with
    # them would be label-based.
    X_train = np.asarray(X_train)
    X_test = np.asarray(X_test)
    y_train = np.asarray(y_train)

    # One column of stacking features per first-level model.
    S_train = np.zeros((X_train.shape[0], len(models)))
    S_test = np.zeros((X_test.shape[0], len(models)))

    for model_counter, model in enumerate(models):
        if verbose > 0:
            print('model %d: [%s]' % (model_counter, model.__class__.__name__))

        # Test-set predictions made by the model trained in each fold.
        S_test_temp = np.zeros((X_test.shape[0], n_folds))

        for fold_counter, (tr_index, te_index) in enumerate(kf.split(X_train, y_train)):
            X_tr = X_train[tr_index]
            y_tr = y_train[tr_index]
            X_te = X_train[te_index]
            y_te = y_train[te_index]

            # Fit a fresh clone so the caller's model is not mutated per fold.
            instance = clone(model)
            instance.fit(X_tr, _apply_transform(y_tr, func = transform_target))

            # Out-of-fold predictions become the meta-model's training features.
            S_train[te_index, model_counter] = _apply_transform(instance.predict(X_te), func = transform_pred)

            # Predictions for the full test set from this fold's model.
            S_test_temp[:, fold_counter] = _apply_transform(instance.predict(X_test), func = transform_pred)

            # Release the temporary model.
            del instance

            if verbose > 1:
                print('    fold %d: [%.8f]' % (fold_counter, metric(y_te, S_train[te_index, model_counter])))

        if average_fold:
            # Average the per-fold test predictions.
            S_test[:, model_counter] = np.mean(S_test_temp, axis = 1)
        else:
            # Refit on all training rows; this fits the caller's model in place.
            model.fit(X_train, _apply_transform(y_train, func = transform_target))
            S_test[:, model_counter] = _apply_transform(model.predict(X_test), func = transform_pred)

        if verbose > 0:
            print('    ----')
            print('    MEAN:   [%.8f]\n' % (metric(y_train, S_train[:, model_counter])))

    # Fit the second-level model on the stacked out-of-fold predictions.
    meta_model.fit(S_train, _apply_transform(y_train, func = transform_target))

    # Final prediction for the test set.
    stacking_prediction = _apply_transform(meta_model.predict(S_test), func = transform_pred)

    return stacking_prediction

In [None]:
# First-level models for the stack: the tuned XGBoost and GradientBoost configurations.
models = [
    # KNN

    # RandomForest
    #RandomForestRegressor(oob_score=True,
    #                      n_estimators = 500,
    #                      max_depth=12,
    #                      max_features=40,
    #                      min_samples_split=3),

    # XGBoost
    XGBRegressor(learning_rate =0.1,
                 n_estimators=1000,
                 max_depth=5,
                 min_child_weight=4,
                 gamma=0,
                 subsample=0.8,
                 colsample_bytree=0.55,
                 objective= 'reg:linear',
                 nthread=4,
                 scale_pos_weight=1,
                 reg_alpha=0.01,
                 seed=27),

    # GradientBoost
    GradientBoostingRegressor(n_estimators=200,
                             learning_rate=0.05,
                             min_samples_split=150,
                             min_samples_leaf=2,
                             max_depth=7,
                              max_features=9,
                             subsample=0.8,
                             random_state=10),

    # Light GBoost
    #gbr(random_state = 0, learning_rate = 0.01, max_features='sqrt',
    #    min_samples_leaf=10, min_samples_split=5,
    #    n_estimators = 1000, max_depth = 9)
    ]

# Second-level model. `normalize` is not passed: the argument was removed from
# LinearRegression in scikit-learn 1.2, and for an unregularised least-squares fit
# rescaling the inputs does not change the predictions.
meta_model = LinearRegression()

In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt

def rmse(Y_train, Y_pred):
    """Return the root mean squared error between `Y_train` and `Y_pred` as a float."""
    return sqrt(mean_squared_error(Y_train, Y_pred))

def rmse_cv(X_train_preprocessed, Y_train, model=None, cv=5):
    """Return the per-fold cross-validated RMSE of `model`.

    Parameters
    ----------
    X_train_preprocessed : array-like
        Feature matrix.
    Y_train : array-like
        Target values.
    model : estimator
        Model to evaluate. Must be supplied; the default of None only keeps the
        two-argument call signature importable.
    cv : int
        Number of cross-validation folds (default 5).

    Raises
    ------
    ValueError
        If `model` is None.
    """
    # Imported here so the helper does not depend on an import made in another cell.
    from sklearn.model_selection import cross_val_score

    if model is None:
        raise ValueError("rmse_cv needs an estimator: pass it as model=...")
    # The scorer returns negated MSE, hence the sign flip before the square root.
    return np.sqrt(-cross_val_score(model, X_train_preprocessed, Y_train,
                                    scoring = 'neg_mean_squared_error', cv = cv))

def rmsle(y, y_pred):
    """Return the RMSE of `y` vs `y_pred`.

    No logarithm is taken here, so the result is a root mean squared *log* error
    only when both arguments are already on the log scale.
    """
    return np.sqrt(mean_squared_error(y, y_pred))

In [None]:
%%time
final_prediction = stacking_regression(models, meta_model, X_train, Y_train, X_test,
                               transform_target=None, transform_pred = np.expm1, 
                               metric=rmsle, verbose=1)

In [None]:
# Pair each test id with its stacked prediction and write the Kaggle submission file.
stack_columns = {'Id': test_ID, 'SalePrice': final_prediction}
submission = pd.DataFrame(stack_columns)
submission.to_csv('stack.csv', index=False)