In [1]:
import numpy as np 
import pandas as pd 

In [2]:
# 忽略警告信息
import warnings
warnings.filterwarnings("ignore")

## 数据集的准备

In [3]:
from sklearn.model_selection import train_test_split
train=pd.read_csv('datas/house_data.csv')
y=train['SalePrice']
train1=train.drop(['Id','SalePrice'],axis=1)
X=pd.get_dummies(train1).reset_index(drop=True)
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=123)

In [4]:
# tmp=train.isnull().sum()
# tmp[tmp>0] 

#### 模型测评

In [4]:
from sklearn.metrics import mean_squared_error
def benchmark(model,testset,label):
    pred=model.predict(testset)
    if pred[pred<0].shape[0]>0:
        print('Neg Value')
    rmse=np.sqrt(mean_squared_error(label,pred))
    lrmse=np.sqrt(mean_squared_error(np.log(label),np.log(pred)))

    print('RMSE:',rmse)
    print('LRMSE:',lrmse)
    return lrmse

## 基础模型训练

### ElasticNet

In [5]:
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model import ElasticNetCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold

In [6]:
kfolds=KFold(n_splits=10, shuffle=True, random_state=123)
e_l1ratio=[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.85,0.9,0.95]
e_alphas=np.logspace(-10,2.8,150)

In [7]:
def elastic_train_test(alpha,l1ratio):
    e_model=make_pipeline(RobustScaler(),ElasticNetCV(alphas=[alpha],l1_ratio=[l1ratio]))
    e_model.fit(X_train,y_train)
    lrmse=benchmark(e_model,X_test,y_test)
    return lrmse

In [8]:
elastic_train_test(50,0.5) 

RMSE: 64803.88956616406
LRMSE: 0.3056812482960621


0.3056812482960621

In [9]:
elastic_model=make_pipeline(RobustScaler(), ElasticNetCV(alphas=e_alphas, l1_ratio=e_l1ratio)).fit(X_train,y_train)

In [10]:
benchmark(elastic_model,X_test,y_test)

RMSE: 25991.07955736571
LRMSE: 0.12567210233778722


0.12567210233778722

In [11]:
elastic_model.steps[1][1].alpha_

0.3432183268134919

In [12]:
elastic_model.steps[1][1].l1_ratio_

0.9

### XGBoost训练

In [13]:
import xgboost as xgb
xg_reg=xgb.XGBRegressor(objective='reg:linear',colsample_bytree=0.7,learning_rate=0.01,max_depth=3,n_estimators=3400,subsample=0.7,nthread=6,seed=123)

In [14]:
xg_reg.fit(X_train,y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bytree=0.7, gamma=0, importance_type='gain',
             learning_rate=0.01, max_delta_step=0, max_depth=3,
             min_child_weight=1, missing=None, n_estimators=3400, n_jobs=1,
             nthread=6, objective='reg:linear', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, seed=123, silent=True,
             subsample=0.7)

In [15]:
benchmark(xg_reg,X_test,y_test)

RMSE: 22926.489730019464
LRMSE: 0.10024704840338212


0.10024704840338212

<br><br>

## Stacking集成算法

### 底层算法

In [16]:
from mlxtend.regressor import StackingCVRegressor

In [17]:
alphas_alt=np.logspace(-10,2.8,150)

In [18]:
ridge=make_pipeline(RobustScaler(),RidgeCV(alphas=alphas_alt,cv=kfolds))

In [19]:
lasso=make_pipeline(RobustScaler(),LassoCV(alphas=alphas_alt,cv=kfolds))

In [20]:
elasticnet=make_pipeline(RobustScaler(),ElasticNetCV(alphas=e_alphas,cv=kfolds, l1_ratio=e_l1ratio))

In [None]:
xgboost=make_pipeline(RobustScaler(),xgb.XGBRegressor(objective='reg:linear',colsample_bytree=0.7,learning_rate=0.01,max_depth=3,n_estimators=3460,subsample=0.7,reg_alpha=0.00006,gamma=0,nthread=6,scale_pos_weight=1,seed=27))

### 上层算法

In [None]:
# 是否使用原训练集中的feature
stack_alg=StackingCVRegressor(regressors=(ridge, lasso, elasticnet, xgboost),
                              meta_regressor=xgboost, use_features_in_secondary=True)

In [23]:
stackX=np.array(X_train)

In [24]:
stacky=np.array(y_train)

In [None]:
stack_alg.fit(stackX, stacky)

In [None]:
p

In [None]:
benchmark(stack_alg, X_test, y_test)