In [78]:
import numpy as np

import pandas as pd

import seaborn as sns

import matplotlib.pyplot as plt

%matplotlib inline

path = 'D:/CS/ML/Kaggle/HousePrices/all/'

train = pd.read_csv('%s%s' %(path, 'train.csv'))

test = pd.read_csv('%s%s' %(path, 'test.csv'))

In [79]:
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.preprocessing import LabelEncoder
from scipy.stats import skew
from sklearn.preprocessing import RobustScaler

In [80]:
from numpy.core.umath_tests import inner1d
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.svm import SVR, LinearSVR
from sklearn.linear_model import ElasticNet, SGDRegressor, BayesianRidge
from sklearn.kernel_ridge import KernelRidge
from xgboost import XGBRegressor

# 数据清洗

In [81]:
# 删除异常点
train = train.drop(train[(train['GrLivArea']>4000) & (train['SalePrice']<300000)].index)

# 切分点记录
ntrain = train.shape[0]
ntest = test.shape[0]

# 合并测试集训练集
test['SalePrice']= None
train_test = pd.concat((train, test)).reset_index(drop=True)

# 该特征值基本都是同一特征，删除
train_test = train_test.drop(['Utilities'], axis=1)

# 用none填补缺失值
none_col = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 
            'GarageQual', 'GarageFinish', 'GarageCond',
            'GarageType', 'BsmtExposure', 'BsmtCond', 'BsmtQual', 
            'BsmtFinType2', 'BsmtFinType1', 'MasVnrType' ] #'GarageYrBlt',
for nnone in none_col:
    train_test[nnone].fillna('None', inplace=True)
    
# 用0填补缺失值
zero_col = ['MasVnrArea', 'GarageYrBlt','BsmtFullBath', 'BsmtHalfBath','GarageArea', 'BsmtFinSF2', 'TotalBsmtSF', 'GarageCars', 'BsmtUnfSF', 'BsmtFinSF1']
for zzero in zero_col:
    train_test[zzero].fillna(0, inplace=True)
    
# 用众数填补缺失值
all_col = ["MSZoning", "Functional", "Electrical", "KitchenQual", "SaleType","Exterior1st", "Exterior2nd"]#, "Utilities", "BsmtFullBath", "BsmtHalfBath",
for aall in all_col:
    train_test[aall].fillna(train_test[aall].mode()[0], inplace=True)
    
# 中位数填补
#train_test["LotAreaCut"] = pd.qcut(train_test.LotArea,10)
#train_test['LotFrontage']=train_test.groupby(['LotAreaCut', 'Neighborhood'])['LotFrontage'].transform(lambda x: x.fillna(x.median()))
#train_test['LotFrontage']=train_test.groupby(['LotAreaCut'])['LotFrontage'].transform(lambda x: x.fillna(x.median()))
train_test["LotFrontage"] = train_test.groupby("Neighborhood")["LotFrontage"].transform(
    lambda x: x.fillna(x.median()))

# 删除无用标签和售价
#train_test.drop("LotAreaCut",axis=1,inplace=True)
train_test.drop(['SalePrice'],axis=1,inplace=True)
train_test.drop(['Id'],axis=1,inplace=True)

train_test['MSZoning'] = train_test['MSZoning'].fillna(train_test['MSZoning'].mode()[0])

# 特征工程

In [82]:
#MSSubClass=The building class
train_test['MSSubClass'] = train_test['MSSubClass'].apply(str)


#Changing OverallCond into a categorical variable
train_test['OverallCond'] = train_test['OverallCond'].astype(str)


#Year and month sold are transformed into categorical features.
train_test['YrSold'] = train_test['YrSold'].astype(str)
train_test['MoSold'] = train_test['MoSold'].astype(str)

cols = ('FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond', 
        'ExterQual', 'ExterCond','HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1', 
        'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope',
        'LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass', 'OverallCond', 
        'YrSold', 'MoSold')
# process columns, apply LabelEncoder to categorical features
for c in cols:
    lbl = LabelEncoder() 
    lbl.fit(list(train_test[c].values)) 
    train_test[c] = lbl.transform(list(train_test[c].values))

# shape        
print('Shape all_data: {}'.format(train_test.shape))

Shape all_data: (2917, 78)


In [83]:
# Adding total sqfootage feature 
train_test['TotalSF'] = train_test['TotalBsmtSF'] + train_test['1stFlrSF'] + train_test['2ndFlrSF']

In [84]:

numeric_feats = train_test.dtypes[train_test.dtypes != "object"].index

# Check the skew of all numerical features
skewed_feats = train_test[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
print("\nSkew in numerical features: \n")
skewness = pd.DataFrame({'Skew' :skewed_feats})
skewness.head(10)




Skew in numerical features: 



Unnamed: 0,Skew
MiscVal,21.939672
PoolArea,17.688664
LotArea,13.109495
LowQualFinSF,12.084539
3SsnPorch,11.37208
LandSlope,4.973254
KitchenAbvGr,4.30055
BsmtFinSF2,4.144503
EnclosedPorch,4.002344
ScreenPorch,3.945101


In [85]:
skewness = skewness[abs(skewness) > 0.75]
print("There are {} skewed numerical features to Box Cox transform".format(skewness.shape[0]))

from scipy.special import boxcox1p
skewed_features = skewness.index
lam = 0.15
for feat in skewed_features:
    #all_data[feat] += 1
    train_test[feat] = boxcox1p(train_test[feat], lam)

'''
X_numeric=train_test.select_dtypes(exclude=["object"])
skewness = X_numeric.apply(lambda x: skew(x.dropna))
skewness_features = skewness[abs(skewness) >= 0.75].index
from scipy.special import boxcox1p
lam = 0.15
#for feat in skewed_features:
train_test[skewness_features] = boxcox1p(train_test[skewness_features], lam)
#train_test.drop(['Id'],axis=1,inplace=True)
'''

There are 59 skewed numerical features to Box Cox transform


'\nX_numeric=train_test.select_dtypes(exclude=["object"])\nskewness = X_numeric.apply(lambda x: skew(x.dropna))\nskewness_features = skewness[abs(skewness) >= 0.75].index\nfrom scipy.special import boxcox1p\nlam = 0.15\n#for feat in skewed_features:\ntrain_test[skewness_features] = boxcox1p(train_test[skewness_features], lam)\n#train_test.drop([\'Id\'],axis=1,inplace=True)\n'

In [86]:
train_test = pd.get_dummies(train_test)
print(train_test.shape)

(2917, 220)


In [87]:
train1 = train_test[:ntrain]
test1 = train_test[ntrain:]

from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
scaler.fit(train1)
train_x_sd1 = scaler.transform(train1)
test_x_sd1 = scaler.transform(test1)
y_log = np.log1p(train.SalePrice)
y_log = y_log.values.reshape(-1,1).ravel()
#y_log = train["SalePrice"]


In [88]:
y_log

array([12.24769912, 12.10901644, 12.31717117, ..., 12.49313327,
       11.86446927, 11.90159023])

In [89]:
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb

In [90]:
def rmse_cv(model,X,y):
    rmse = np.sqrt(-cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=5))
    return rmse

In [91]:
models = [LinearRegression(),Ridge(),Lasso(alpha =0.0005, random_state=1),RandomForestRegressor(),GradientBoostingRegressor(),SVR(),LinearSVR(),
          ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3),SGDRegressor(max_iter=1000,tol=1e-3),BayesianRidge(),KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5),
          ExtraTreesRegressor(),XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=3, 
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, silent=1,
                             random_state =7, nthread = -1)]

In [92]:
names = ["LR", "Ridge", "Lasso", "RF", "GBR", "SVR", "LinSVR", "Ela","SGD","Bay","Ker","Extra","Xgb"]
for name, model in zip(names, models):
    score = rmse_cv(model, train_x_sd1,y_log)
    print("{}: {:.6f}, {:.4f}".format(name,score.mean(),score.std())) # format为格式化，使输出按照指定样式排列

LR: 0.123794, 0.0109
Ridge: 0.118211, 0.0098
Lasso: 0.111549, 0.0074
RF: 0.146921, 0.0052
GBR: 0.121499, 0.0063
SVR: 0.130264, 0.0095
LinSVR: 0.161140, 0.0185
Ela: 0.111565, 0.0074
SGD: 2617527930101.291504, 376262205831.4435
Bay: 0.114207, 0.0079
Ker: 0.118147, 0.0091
Extra: 0.135089, 0.0098
Xgb: 0.116494, 0.0058


In [93]:
lasso= Lasso(alpha =0.0005, random_state=1)
ela = ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3)

bay = BayesianRidge()
#gbr = GradientBoostingRegressor(alpha = 0.5, learning_rate = 0.1, n_estimators = 180)
ker = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)
xgbt = XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=3, 
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, silent=1,
                             random_state =7, nthread = -1)

# votiong

In [94]:
class AverageWeight(BaseEstimator, RegressorMixin):
    def __init__(self,mod,weight):
        self.mod = mod
        self.weight = weight
        
    def fit(self,X,y):
        self.models_ = [clone(x) for x in self.mod]
        for model in self.models_:
            model.fit(X,y)
        return self
    
    def predict(self,X):
        w = list()
        pred = np.array([model.predict(X) for model in self.models_])
        # for every data point, single model prediction times weight, then add them together
        for data in range(pred.shape[1]):
            single = [pred[model,data]*weight for model,weight in zip(range(pred.shape[0]),self.weight)]
            w.append(np.sum(single))
        return w

In [95]:
w1 = 0.3
w2 = 0.3
w3 = 0.2
w4 = 0.05
w5 = 0.15

In [96]:
weight_avg = AverageWeight(mod = [lasso,ela,bay,ker,xgbt],weight=[w1,w2,w3,w4,w5])

In [97]:
rmse_cv(weight_avg,train_x_sd1,y_log).mean()

0.11025052578129857

In [98]:
weight_avg2  = AverageWeight(mod = [lasso,ela],weight=[0.5,0.5])
rmse_cv(weight_avg,train_x_sd1,y_log).mean()

0.11025052578129857

In [100]:
weight_avg.fit(train_x_sd1,y_log)
pred = np.exp(weight_avg.predict(test_x_sd1))
result_vot=pd.DataFrame({'Id':test.Id, 'SalePrice':pred})
result_vot.to_csv("D:/CS/ML/Kaggle/HousePrices/v1.0/submission_vote1.51.csv",index=False)

# stacking

In [69]:
class StackingAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, base_models, meta_model, n_folds=5):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds
   
    # We again fit the data on clones of the original models
    def fit(self, X, y):
        self.base_models_ = [list() for x in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=156)
        
        # Train cloned base models then create out-of-fold predictions
        # that are needed to train the cloned meta-model
        out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))
        for i, model in enumerate(self.base_models):
            for train_index, holdout_index in kfold.split(X, y):
                instance = clone(model)
                self.base_models_[i].append(instance)
                instance.fit(X[train_index], y[train_index])
                y_pred = instance.predict(X[holdout_index])
                out_of_fold_predictions[holdout_index, i] = y_pred
                
        # Now train the cloned  meta-model using the out-of-fold predictions as new feature
        self.meta_model_.fit(out_of_fold_predictions, y)
        return self
   
    #Do the predictions of all base models on the test data and use the averaged predictions as 
    #meta-features for the final prediction which is done by the meta-model
    def predict(self, X):
        meta_features = np.column_stack([
            np.column_stack([model.predict(X) for model in base_models]).mean(axis=1)
            for base_models in self.base_models_ ])
        return self.meta_model_.predict(meta_features)

In [71]:
stack_model = StackingAveragedModels(base_models=[lasso,ela,bay,ker,xgbt],meta_model=lasso)

In [72]:
rmse_cv(stack_model,train_x_sd1,y_log).mean()

0.10993316740785986

In [70]:
stack_model.fit(train_x_sd1,y_log)
stacked_pred = np.expm1(stack_model.predict(test_x_sd1))
result_vot=pd.DataFrame({'Id':test.Id, 'SalePrice':stacked_pred})
result_vot.to_csv("D:/CS/ML/Kaggle/HousePrices/v1.0/submission_stacking1.51.csv",index=False)

In [143]:
lasso.fit(train_x_sd1,y_log)
pred_lasso = np.expm1(lasso.predict(test_x_sd1))
ela.fit(train_x_sd1,y_log)
pred_ela = np.expm1(ela.predict(test_x_sd1))

In [144]:
pred_mix = stacked_pred *0.7+0.15*pred_ela+0.15*pred_lasso
result=pd.DataFrame({'Id':test.Id, 'SalePrice':pred_mix})
result.to_csv("D:/CS/ML/Kaggle/HousePrices/v1.0/submission_mix1.1.csv",index=False)