In [47]:
from sklearn.cross_validation import KFold
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, ExtraTreesRegressor
import pandas as pd
import numpy as np
import xgboost as xgb

In [48]:
train_X = pd.read_csv('../input/X_train.csv')
train_y = pd.read_csv('../input/y_train.csv')
test_X = pd.read_csv('../input/X_test.csv')
id_test = pd.read_csv('../input/id_test.csv')

In [49]:
# Some useful parameters which will come in handy later on
ntrain = train_X.shape[0]
ntest = test_X.shape[0]
SEED = 0 # for reproducibility
NFOLDS = 5 # set folds for out-of-fold prediction
kf = KFold(ntrain, n_folds= NFOLDS, random_state=SEED)

In [50]:
# Class to extend the Sklearn classifier
class SklearnHelper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)
    
    def feature_importances(self,x,y):
        print(self.clf.fit(x,y).feature_importances_)
    
    def get_clf(self):
        return self.clf

In [51]:
# 返回训练集和测试集训练后得到的结果
def get_oof(clf, x_train, y_train, x_test):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    for i, (train_index, test_index) in enumerate(kf):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return clf, oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [52]:
def get_oof_xgb(model, x_train, x_test):
    dtrain = xgb.DMatrix(x_train)
    dtest = xgb.DMatrix(x_test)
    oof_train = np.zeros((ntrain, ))
    oof_test = np.zeros((ntest, ))
    
    oof_train = model.predict(dtrain)
    oof_test = model.predict(dtest)
    oof_train = (np.exp(oof_train)-1).reshape(-1,1)
    oof_test = (np.exp(oof_test)-1).reshape(-1,1)
    return oof_train, oof_test

In [53]:
# Put in our parameters for said classifiers
# Random Forest parameters
rf_params = {
    'n_jobs': -1,
#     'n_estimators': 500,
     'warm_start': True, 
     #'max_features': 0.2,
    'max_depth': 6,
    'min_samples_leaf': 2,
    'max_features' : 'sqrt',
    'verbose': 0
}

# Extra Trees Parameters
et_params = {
    'n_jobs': -1,
#     'n_estimators':500,
    #'max_features': 0.5,
    'max_depth': 8,
    'min_samples_leaf': 2,
    'verbose': 0
}

# AdaBoost parameters
ada_params = {
#     'n_estimators': 500,
    'learning_rate' : 0.75
}

# Gradient Boosting parameters
gb_params = {
#     'n_estimators': 500,
     #'max_features': 0.2,
    'max_depth': 5,
    'min_samples_leaf': 2,
    'verbose': 0
}

In [54]:
# Create 4 objects that represent our 4 models
rf = SklearnHelper(clf=RandomForestRegressor, seed=SEED, params=rf_params)
et = SklearnHelper(clf=ExtraTreesRegressor, seed=SEED, params=et_params)
ada = SklearnHelper(clf=AdaBoostRegressor, seed=SEED, params=ada_params)
gb = SklearnHelper(clf=GradientBoostingRegressor, seed=SEED, params=gb_params)

In [55]:
X_train = train_X.values
y_train = train_y.values.ravel()
X_test = test_X.values

In [56]:
xgb_model = xgb.Booster({'nthread':4}) #init model
xgb_model.load_model('../model/xgb_model.model') # load data
xgb_oof_train, xgb_oof_test = get_oof_xgb(xgb_model, X_train, X_test)

In [57]:
naive_model = xgb.Booster({'nthread':4})
naive_model.load_model('../model/naivexgb.model')
naive_oof_train, naive_oof_test = get_oof_xgb(xgb_model, X_train, X_test)
print naive_oof_train

[[  4375667. ]
 [  4982616.5]
 [  4484587.5]
 ..., 
 [  5573623. ]
 [ 10069627. ]
 [  5265151.5]]


In [58]:
# Create our OOF train and test predictions. These base results will be used as new features
et_model, et_oof_train, et_oof_test = get_oof(et, X_train, y_train, X_test) # Extra Trees
rf_model, rf_oof_train, rf_oof_test = get_oof(rf,X_train, y_train, X_test) # Random Forest
ada_model, ada_oof_train, ada_oof_test = get_oof(ada, X_train, y_train, X_test) # AdaBoost 
gd_Model, gb_oof_train, gb_oof_test = get_oof(gb,X_train, y_train, X_test) # Gradient Boost

KeyboardInterrupt: 

In [59]:
gd_Model, gb_oof_train, gb_oof_test = get_oof(gb,X_train, y_train, X_test) # Gradient Boost

In [60]:
from sklearn.externals import joblib
joblib.dump(et_model.get_clf(), '../model/et_model.pkl')
joblib.dump(rf_model.get_clf(), '../model/rf_model.pkl')
joblib.dump(ada_model.get_clf(), '../model/ada_model.pkl')
joblib.dump(gd_Model.get_clf(), '../model/gd_Model.pkl')

['../model/gd_Model.pkl']

In [None]:
# joblib.load('../model/et_model.pkl')

In [63]:
base_predictions_train = pd.DataFrame( {'RandomForest': rf_oof_train.ravel(),
     'ExtraTrees': et_oof_train.ravel(),
     'AdaBoost': ada_oof_train.ravel(),
      'GradientBoost': gb_oof_train.ravel(),
      'Xgboost': xgb_oof_train.ravel(),
    })
base_predictions_train.head()

Unnamed: 0,AdaBoost,ExtraTrees,GradientBoost,RandomForest,Xgboost
0,7075219.0,5892461.0,5519803.0,6667069.0,4375667.0
1,6938046.0,5959944.0,5579825.0,7209080.0,4982616.5
2,7131336.0,5926861.0,4720377.0,6565405.0,4484587.5
3,11385940.0,10980070.0,11890450.0,9900888.0,10504041.0
4,11927070.0,13631410.0,14906440.0,14767260.0,11556605.0


In [64]:
x_train = np.concatenate(( et_oof_train, rf_oof_train, ada_oof_train, gb_oof_train, xgb_oof_train), axis=1)
x_test = np.concatenate(( et_oof_test, rf_oof_test, ada_oof_test, gb_oof_test, xgb_oof_test), axis=1)

In [None]:
xgb_params = {
    'eta': 0.05,
    'max_depth': 6,
#     'subsample': 1.0,
#     'colsample_bytree': 0.7,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 1,
    'objective': 'reg:linear',
}

In [None]:
dtrain = xgb.DMatrix(x_train, label = np.log(y_train))
dtest = xgb.DMatrix(x_test)

res = xgb.cv(xgb_params, dtrain, num_boost_round=1000, nfold=3)

In [None]:
predictions = xgb.train(xgb_params, dtrain, 152)
result = predictions.predict(dtest)

In [None]:
df_sub = pd.DataFrame({'id': id_test.values.ravel(), 'price_doc': np.exp(result)})
df_sub.to_csv('output/stacking.csv', index=False)

In [None]:
id_test

In [None]:
ans = pd.read_csv('output/sub3.csv')

NameError: name 'df_sub' is not defined