### Stacking models after standardizing all features

In [1]:
from sklearn.model_selection import KFold
import pandas as pd
import numpy as np
from scipy import sparse
import xgboost
import lightgbm

from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier,ExtraTreesClassifier
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor,GradientBoostingRegressor,ExtraTreesRegressor
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.svm import LinearSVC,SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import log_loss,mean_absolute_error,mean_squared_error
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler,Normalizer,StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer,HashingVectorizer
from sklearn.naive_bayes import MultinomialNB,GaussianNB
from collections import OrderedDict, defaultdict
import re, string, time
from nltk.stem.snowball import SnowballStemmer
from keras.utils import np_utils

Using TensorFlow backend.


First, stack models for classification.

In [2]:
def stacking(clf,train_x,train_y,test_x,clf_name,class_num=3,cv=None):
    """Build out-of-fold class-score features for one base classifier.

    For every fold the model is fitted on the training part, the held-out
    part is scored (these scores fill the matching rows of the returned
    training matrix) and ``test_x`` is scored as well; the per-fold test
    scores are averaged at the end.

    Parameters
    ----------
    clf : estimator or module
        A scikit-learn style estimator for the ``predict_proba`` /
        ``decision_function`` branches, the ``xgboost`` / ``lightgbm`` module
        for ``"xgb"`` / ``"lgb"``; ignored for ``"nn"`` (the network is built
        here).
    train_x, train_y, test_x : array-like
        Indexed with integer index arrays, so NumPy arrays are expected.
        For ``"nn"`` the caller passes one-hot encoded labels.
    clf_name : str
        Selects the training branch (``"rf"``, ``"xgb"``, ``"nn"``, ...).
    class_num : int
        Number of classes, i.e. number of columns produced.
    cv : cross-validation splitter, optional
        Object exposing ``split`` and ``get_n_splits``. When omitted the
        notebook-level ``kf`` is used, as before.

    Returns
    -------
    (train, test) : tuple of ndarray
        Shapes ``(len(train_x), class_num)`` and ``(len(test_x), class_num)``.

    Raises
    ------
    IOError
        If ``clf_name`` does not match any known branch.

    Side effects: prints the running fold scores and appends them to
    ``score.txt`` in the working directory.
    """
    splitter = kf if cv is None else cv
    # Ask the splitter for its fold count instead of relying on a second global.
    n_folds = splitter.get_n_splits(train_x, train_y)

    train=np.zeros((train_x.shape[0],class_num))
    test=np.zeros((test_x.shape[0],class_num))
    test_pre=np.empty((n_folds,test_x.shape[0],class_num))
    cv_scores=[]
    for i,(train_index,test_index) in enumerate(splitter.split(train_x, train_y)):
        tr_x=train_x[train_index]
        tr_y=train_y[train_index]
        te_x=train_x[test_index]
        te_y = train_y[test_index]
        if clf_name in ["rf","ada","gb","et","lr","knn","mnb","ovr","gnb"]:
            clf.fit(tr_x,tr_y)
            pre=clf.predict_proba(te_x)
            train[test_index]=pre
            test_pre[i,:]=clf.predict_proba(test_x)
            cv_scores.append(log_loss(te_y, pre))
        elif clf_name in ["lsvc"]:
            clf.fit(tr_x,tr_y)
            pre=clf.decision_function(te_x)
            # The raw margins are what gets stacked.
            train[test_index]=pre
            test_pre[i,:]=clf.decision_function(test_x)
            # log_loss expects probabilities; margins can be negative, so they
            # are pushed through a softmax for the reported score only.
            shifted = pre - pre.max(axis=1, keepdims=True)
            proba = np.exp(shifted)
            proba /= proba.sum(axis=1, keepdims=True)
            cv_scores.append(log_loss(te_y, proba))
        elif clf_name in ["xgb"]:
            # NOTE(review): missing=-1 treats every value equal to -1 as
            # missing, also after standardization -- confirm this is intended.
            train_matrix = clf.DMatrix(tr_x, label=tr_y, missing=-1)
            test_matrix = clf.DMatrix(te_x, label=te_y, missing=-1)
            # Prediction-only matrix: it must not carry the fold labels, whose
            # length does not match test_x.
            z = clf.DMatrix(test_x, missing=-1)
            params = {'booster': 'gbtree',
                      'objective': 'multi:softprob',
                      'eval_metric': 'mlogloss',
                      'gamma': 1,
                      'min_child_weight': 1.5,
                      'max_depth': 5,
                      'lambda': 10,
                      'subsample': 0.7,
                      'colsample_bytree': 0.7,
                      'colsample_bylevel': 0.7,
                      'eta': 0.03,
                      'tree_method': 'exact',
                      'seed': 2017,
                      'nthread': 12,
                      "num_class": class_num
                      }

            num_round = 10000
            early_stopping_rounds = 100
            # The last entry of the watchlist drives early stopping.
            watchlist = [(train_matrix, 'train'),
                         (test_matrix, 'eval')
                         ]
            model = clf.train(params, train_matrix, num_boost_round=num_round, evals=watchlist,
                              verbose_eval=100, early_stopping_rounds=early_stopping_rounds
                              )
            pre= model.predict(test_matrix,ntree_limit=model.best_ntree_limit)
            train[test_index]=pre
            test_pre[i, :]= model.predict(z, ntree_limit=model.best_ntree_limit)
            cv_scores.append(log_loss(te_y, pre))
        elif clf_name in ["lgb"]:
            train_matrix = clf.Dataset(tr_x, label=tr_y)
            test_matrix = clf.Dataset(te_x, label=te_y)
            params = {
                      'boosting_type': 'gbdt',
                      #'boosting_type': 'dart',
                      'objective': 'multiclass',
                      'metric': 'multi_logloss',
                      'num_leaves': 2**5,
                      'lambda_l2': 10,
                      'feature_fraction': 0.7,
                      'bagging_fraction': 0.7,
                      'learning_rate': 0.03,
                      'seed': 2017,
                      'nthread': 4,
                      "num_class": class_num,
                      }
            num_round = 10000
            early_stopping_rounds = 100
            model = clf.train(params, train_matrix,num_round, valid_sets=test_matrix,
                              verbose_eval=100, early_stopping_rounds=early_stopping_rounds
                              )
            pre= model.predict(te_x,num_iteration=model.best_iteration)
            train[test_index]=pre
            test_pre[i, :]= model.predict(test_x, num_iteration=model.best_iteration)
            cv_scores.append(log_loss(te_y, pre))
        elif clf_name in ["nn"]:
            # Keras is imported lazily so the other branches work without it.
            from keras.layers import Dense
            from keras.callbacks import EarlyStopping, ReduceLROnPlateau
            from keras.regularizers import l2
            from keras.models import Sequential
            # A fresh network per fold; the `clf` argument is not used here.
            net = Sequential()
            net.add(Dense(64, input_dim=tr_x.shape[1],activation="relu", W_regularizer=l2()))
            net.add(Dense(64,activation="relu",W_regularizer=l2()))
            net.add(Dense(class_num, activation="softmax"))
            early_stopping = EarlyStopping(monitor='val_loss', patience=20)
            reduce_lr = ReduceLROnPlateau(min_lr=0.0002,factor=0.05)
            net.compile(optimizer="rmsprop", loss="categorical_crossentropy")
            net.fit(tr_x, tr_y,
                    batch_size=640,
                    nb_epoch=1000,
                    validation_data=(te_x, te_y),
                    callbacks=[early_stopping,reduce_lr],
                    verbose=0)
            pre=net.predict_proba(te_x)
            train[test_index]=pre
            test_pre[i,:]=net.predict_proba(test_x)
            cv_scores.append(log_loss(te_y, pre))
        else:
            raise IOError("Please add new clf.")
        print("%s now score is:"%clf_name,cv_scores)
        with open("score.txt","a") as f:
            f.write("%s now score is:"%clf_name+str(cv_scores)+"\n")
    # Average the per-fold predictions made on the test set.
    test[:]=test_pre.mean(axis=0)
    print("%s_score_list:"%clf_name,cv_scores)
    print("%s_score_mean:"%clf_name,np.mean(cv_scores))
    with open("score.txt", "a") as f:
        f.write("%s_score_mean:"%clf_name+str(np.mean(cv_scores))+"\n")
    return train.reshape(-1,class_num),test.reshape(-1,class_num)

def rf(x_train, y_train, x_valid):
    """Stack a random-forest classifier.

    Returns the out-of-fold class probabilities for ``x_train``, the
    fold-averaged probabilities for ``x_valid`` and the tag ``"rf"``.
    """
    # "sqrt" is what the legacy "auto" value meant for classifiers; "auto"
    # is no longer accepted by recent scikit-learn releases.
    randomforest = RandomForestClassifier(n_estimators=1200, max_depth=20, n_jobs=-1, random_state=2017, max_features="sqrt",verbose=1)
    rf_train, rf_test = stacking(randomforest, x_train, y_train, x_valid,"rf")
    return rf_train, rf_test,"rf"

def ada(x_train, y_train, x_valid):
    """Stack an AdaBoost classifier.

    Returns the out-of-fold probabilities, the fold-averaged test
    probabilities and the tag ``"ada"``.
    """
    booster = AdaBoostClassifier(
        n_estimators=50,
        random_state=2017,
        learning_rate=0.01,
    )
    return stacking(booster, x_train, y_train, x_valid, "ada") + ("ada",)

def gb(x_train, y_train, x_valid):
    """Stack a scikit-learn gradient-boosting classifier.

    Returns the out-of-fold probabilities, the fold-averaged test
    probabilities and the tag ``"gb"``.
    """
    booster = GradientBoostingClassifier(
        learning_rate=0.04,
        n_estimators=100,
        subsample=0.8,
        random_state=2017,
        max_depth=5,
        verbose=1,
    )
    oof_pred, test_pred = stacking(booster, x_train, y_train, x_valid, "gb")
    return oof_pred, test_pred, "gb"

def et(x_train, y_train, x_valid):
    """Stack an extra-trees classifier.

    Returns the out-of-fold class probabilities for ``x_train``, the
    fold-averaged probabilities for ``x_valid`` and the tag ``"et"``.
    """
    # "sqrt" is what the legacy "auto" value meant for classifiers; "auto"
    # is no longer accepted by recent scikit-learn releases.
    extratree = ExtraTreesClassifier(n_estimators=1200, max_depth=35, max_features="sqrt", n_jobs=-1, random_state=2017,verbose=1)
    et_train, et_test = stacking(extratree, x_train, y_train, x_valid,"et")
    return et_train, et_test,"et"

def ovr(x_train, y_train, x_valid):
    """Stack a one-vs-rest ensemble of random forests.

    Returns the out-of-fold class probabilities for ``x_train``, the
    fold-averaged probabilities for ``x_valid`` and the tag ``"ovr"``.
    """
    # "sqrt" is what the legacy "auto" value meant for classifiers; "auto"
    # is no longer accepted by recent scikit-learn releases.
    est=RandomForestClassifier(n_estimators=400, max_depth=16, n_jobs=-1, random_state=2017, max_features="sqrt",
                               verbose=1)
    # Named differently from the function so the wrapper does not shadow itself.
    one_vs_rest = OneVsRestClassifier(est,n_jobs=-1)
    ovr_train, ovr_test = stacking(one_vs_rest, x_train, y_train, x_valid,"ovr")
    return ovr_train, ovr_test,"ovr"

def xgb(x_train, y_train, x_valid):
    """Stack an XGBoost multi-class model.

    The ``xgboost`` module itself is handed to ``stacking``, which builds the
    matrices and trains the booster. Returns the out-of-fold probabilities,
    the fold-averaged test probabilities and the tag ``"xgb"``.
    """
    return stacking(xgboost, x_train, y_train, x_valid, "xgb") + ("xgb",)

def lgb(x_train, y_train, x_valid):
    """Stack a LightGBM multi-class model.

    The ``lightgbm`` module itself is handed to ``stacking``, which builds the
    datasets and trains the booster. Returns the out-of-fold probabilities,
    the fold-averaged test probabilities and the tag ``"lgb"``.
    """
    lgb_train, lgb_test = stacking(lightgbm, x_train, y_train, x_valid, "lgb")
    return lgb_train, lgb_test, "lgb"

def gnb(x_train, y_train, x_valid):
    """Stack a Gaussian naive Bayes classifier.

    Returns the out-of-fold probabilities, the fold-averaged test
    probabilities and the tag ``"gnb"``.
    """
    return stacking(GaussianNB(), x_train, y_train, x_valid, "gnb") + ("gnb",)

def lr(x_train, y_train, x_valid):
    """Stack a logistic-regression classifier.

    Returns the out-of-fold probabilities, the fold-averaged test
    probabilities and the tag ``"lr"``.
    """
    model = LogisticRegression(
        n_jobs=-1,
        random_state=2017,
        C=0.1,
        max_iter=200,
    )
    oof_pred, test_pred = stacking(model, x_train, y_train, x_valid, "lr")
    return oof_pred, test_pred, "lr"

def fm(x_train, y_train, x_valid):
    """Unimplemented placeholder that shares the wrappers' signature.

    Presumably reserved for a factorization-machine model (judging by the
    name only). It ignores its arguments and returns ``None``, so unlike the
    other wrappers its result cannot be unpacked into three values.
    """
    return None


def lsvc(x_train, y_train, x_valid):
    """Stack a linear support-vector classifier.

    ``stacking`` handles the ``"lsvc"`` tag through ``decision_function``, so
    the returned matrices hold margins rather than probabilities. Returns the
    out-of-fold scores, the fold-averaged test scores and the tag ``"lsvc"``.
    """
    # Alternative that was tried and left disabled:
    #   SVC(probability=True, kernel="linear", random_state=2017, verbose=1)
    svm = LinearSVC(random_state=2017)
    return stacking(svm, x_train, y_train, x_valid, "lsvc") + ("lsvc",)

def knn(x_train, y_train, x_valid):
    """Stack a k-nearest-neighbours classifier (k = 200).

    Returns the out-of-fold probabilities, the fold-averaged test
    probabilities and the tag ``"knn"``.
    """
    # A PCA reduction to 10 components was tried here and left disabled:
    #   pca = PCA(n_components=10)
    #   pca.fit(x_train)
    #   x_train = pca.transform(x_train)
    #   x_valid = pca.transform(x_valid)
    neighbours = KNeighborsClassifier(n_neighbors=200, n_jobs=-1)
    oof_pred, test_pred = stacking(neighbours, x_train, y_train, x_valid, "knn")
    return oof_pred, test_pred, "knn"

def nn(x_train, y_train, x_valid):
    """Stack a small Keras feed-forward classifier.

    The labels are one-hot encoded first. An empty string stands in for the
    estimator argument because ``stacking`` builds the network itself for the
    ``"nn"`` tag. Returns the out-of-fold probabilities, the fold-averaged
    test probabilities and the tag ``"nn"``.
    """
    one_hot_labels = np_utils.to_categorical(y_train)
    return stacking("", x_train, one_hot_labels, x_valid, "nn") + ("nn",)

Second, add stacking models for regression (the class label is treated as a numeric target).

In [3]:
def stacking_reg(clf,train_x,train_y,test_x,clf_name,cv=None):
    """Build a single out-of-fold prediction column for one base regressor.

    For every fold the model is fitted on the training part, the held-out
    part is predicted (these values fill the matching rows of the returned
    training column) and ``test_x`` is predicted as well; the per-fold test
    predictions are averaged at the end.

    Parameters
    ----------
    clf : estimator or module
        A scikit-learn style regressor for the ``predict`` branch, the
        ``xgboost`` / ``lightgbm`` module for ``"xgb"`` / ``"lgb"``; ignored
        for ``"nn"`` (the network is built here).
    train_x, train_y, test_x : array-like
        Indexed with integer index arrays, so NumPy arrays are expected.
    clf_name : str
        Selects the training branch (``"rf"``, ``"xgb"``, ``"nn"``, ...).
    cv : cross-validation splitter, optional
        Object exposing ``split`` and ``get_n_splits``. When omitted the
        notebook-level ``kf`` is used, as before.

    Returns
    -------
    (train, test) : tuple of ndarray
        Shapes ``(len(train_x), 1)`` and ``(len(test_x), 1)``.

    Raises
    ------
    IOError
        If ``clf_name`` does not match any known branch.

    Side effects: prints the running fold MSE values and appends them to
    ``score.txt`` in the working directory.
    """
    splitter = kf if cv is None else cv
    # Ask the splitter for its fold count instead of relying on a second global.
    n_folds = splitter.get_n_splits(train_x, train_y)

    train=np.zeros((train_x.shape[0],1))
    test=np.zeros((test_x.shape[0],1))
    test_pre=np.empty((n_folds,test_x.shape[0],1))
    cv_scores=[]
    for i,(train_index,test_index) in enumerate(splitter.split(train_x, train_y)):
        tr_x=train_x[train_index]
        tr_y=train_y[train_index]
        te_x=train_x[test_index]
        te_y = train_y[test_index]
        if clf_name in ["rf","ada","gb","et","lr","lsvc","knn"]:
            clf.fit(tr_x,tr_y)
            pre=clf.predict(te_x).reshape(-1,1)
            train[test_index]=pre
            test_pre[i,:]=clf.predict(test_x).reshape(-1,1)
            cv_scores.append(mean_squared_error(te_y, pre))
        elif clf_name in ["xgb"]:
            # NOTE(review): missing=-1 treats every value equal to -1 as
            # missing, also after standardization -- confirm this is intended.
            train_matrix = clf.DMatrix(tr_x, label=tr_y, missing=-1)
            test_matrix = clf.DMatrix(te_x, label=te_y, missing=-1)
            # Prediction-only matrix: it must not carry the fold labels, whose
            # length does not match test_x.
            z = clf.DMatrix(test_x, missing=-1)
            # No 'objective' key: the library's default objective is used.
            params = {'booster': 'gbtree',
                      'eval_metric': 'rmse',
                      'gamma': 1,
                      'min_child_weight': 1.5,
                      'max_depth': 5,
                      'lambda': 10,
                      'subsample': 0.7,
                      'colsample_bytree': 0.7,
                      'colsample_bylevel': 0.7,
                      'eta': 0.03,
                      'tree_method': 'exact',
                      'seed': 2017,
                      'nthread': 12
                      }
            num_round = 10000
            early_stopping_rounds = 100
            # The last entry of the watchlist drives early stopping.
            watchlist = [(train_matrix, 'train'),
                         (test_matrix, 'eval')
                         ]
            model = clf.train(params, train_matrix, num_boost_round=num_round,evals=watchlist,
                              verbose_eval=100, early_stopping_rounds=early_stopping_rounds
                              )
            pre= model.predict(test_matrix,ntree_limit=model.best_ntree_limit).reshape(-1,1)
            train[test_index]=pre
            test_pre[i, :]= model.predict(z, ntree_limit=model.best_ntree_limit).reshape(-1,1)
            cv_scores.append(mean_squared_error(te_y, pre))

        elif clf_name in ["lgb"]:
            train_matrix = clf.Dataset(tr_x, label=tr_y)
            test_matrix = clf.Dataset(te_x, label=te_y)
            params = {
                      'boosting_type': 'gbdt',
                      'objective': 'regression_l2',
                      'metric': 'mse',
                      'num_leaves': 2**5,
                      'lambda_l2': 10,
                      'feature_fraction': 0.7,
                      'bagging_fraction': 0.7,
                      'learning_rate': 0.03,
                      'seed': 2017,
                      'nthread': 4,
                      }
            num_round = 10000
            early_stopping_rounds = 100
            model = clf.train(params, train_matrix,num_round,valid_sets=test_matrix,
                              verbose_eval=100, early_stopping_rounds=early_stopping_rounds
                              )
            pre= model.predict(te_x,num_iteration=model.best_iteration).reshape(-1,1)
            train[test_index]=pre
            test_pre[i, :]= model.predict(test_x, num_iteration=model.best_iteration).reshape(-1,1)
            cv_scores.append(mean_squared_error(te_y, pre))

        elif clf_name in ["nn"]:
            # Keras is imported lazily so the other branches work without it.
            from keras.layers import Dense
            from keras.callbacks import EarlyStopping, ReduceLROnPlateau
            from keras.regularizers import l2
            from keras.models import Sequential
            # A fresh network per fold; the `clf` argument is not used here.
            net = Sequential()
            net.add(Dense(64, input_dim=tr_x.shape[1], activation="relu", W_regularizer=l2()))
            net.add(Dense(64, activation="relu", W_regularizer=l2()))
            net.add(Dense(1))
            early_stopping = EarlyStopping(monitor='val_loss', patience=20)
            reduce_lr = ReduceLROnPlateau(min_lr=0.0002,factor=0.05)
            net.compile(optimizer="rmsprop", loss="mse")
            net.fit(tr_x, tr_y,
                    batch_size=640,
                    nb_epoch=5000,
                    validation_data=(te_x, te_y),
                    callbacks=[early_stopping, reduce_lr],
                    verbose=0)
            pre=net.predict(te_x).reshape(-1,1)
            train[test_index]=pre
            test_pre[i,:]=net.predict(test_x).reshape(-1,1)
            cv_scores.append(mean_squared_error(te_y, pre))
        else:
            raise IOError("Please add new clf.")
        print("%s now score is:"%clf_name,cv_scores)
        with open("score.txt","a") as f:
            f.write("%s now score is:"%clf_name+str(cv_scores)+"\n")
    # Average the per-fold predictions made on the test set.
    test[:]=test_pre.mean(axis=0)
    print("%s_score_list:"%clf_name,cv_scores)
    print("%s_score_mean:"%clf_name,np.mean(cv_scores))
    with open("score.txt", "a") as f:
        f.write("%s_score_mean:"%clf_name+str(np.mean(cv_scores))+"\n")
    return train.reshape(-1,1),test.reshape(-1,1)

def rf_reg(x_train, y_train, x_valid):
    """Stack a random-forest regressor.

    Returns the out-of-fold predictions for ``x_train``, the fold-averaged
    predictions for ``x_valid`` and the tag ``"rf_reg"``.
    """
    # For regressors the legacy "auto" value meant "use every feature", which
    # is what None selects; "auto" is rejected by recent scikit-learn releases.
    randomforest = RandomForestRegressor(n_estimators=600, max_depth=20, n_jobs=-1, random_state=2017, max_features=None,verbose=1)
    rf_train, rf_test = stacking_reg(randomforest, x_train, y_train, x_valid,"rf")
    return rf_train, rf_test,"rf_reg"

def ada_reg(x_train, y_train, x_valid):
    """Stack an AdaBoost regressor.

    Returns the out-of-fold predictions, the fold-averaged test predictions
    and the tag ``"ada_reg"``.
    """
    booster = AdaBoostRegressor(
        n_estimators=30,
        random_state=2017,
        learning_rate=0.01,
    )
    return stacking_reg(booster, x_train, y_train, x_valid, "ada") + ("ada_reg",)

def gb_reg(x_train, y_train, x_valid):
    """Stack a scikit-learn gradient-boosting regressor.

    Returns the out-of-fold predictions, the fold-averaged test predictions
    and the tag ``"gb_reg"``.
    """
    booster = GradientBoostingRegressor(
        learning_rate=0.04,
        n_estimators=100,
        subsample=0.8,
        random_state=2017,
        max_depth=5,
        verbose=1,
    )
    oof_pred, test_pred = stacking_reg(booster, x_train, y_train, x_valid, "gb")
    return oof_pred, test_pred, "gb_reg"

def et_reg(x_train, y_train, x_valid):
    """Stack an extra-trees regressor.

    Returns the out-of-fold predictions for ``x_train``, the fold-averaged
    predictions for ``x_valid`` and the tag ``"et_reg"``.
    """
    # For regressors the legacy "auto" value meant "use every feature", which
    # is what None selects; "auto" is rejected by recent scikit-learn releases.
    extratree = ExtraTreesRegressor(n_estimators=600, max_depth=35, max_features=None, n_jobs=-1, random_state=2017,verbose=1)
    et_train, et_test = stacking_reg(extratree, x_train, y_train, x_valid,"et")
    return et_train, et_test,"et_reg"

def lr_reg(x_train, y_train, x_valid):
    """Stack an ordinary least-squares linear regression.

    Returns the out-of-fold predictions, the fold-averaged test predictions
    and the tag ``"lr_reg"``.
    """
    linear_model = LinearRegression(n_jobs=-1)
    return stacking_reg(linear_model, x_train, y_train, x_valid, "lr") + ("lr_reg",)

def xgb_reg(x_train, y_train, x_valid):
    """Stack an XGBoost regression model.

    The ``xgboost`` module itself is handed to ``stacking_reg``, which builds
    the matrices and trains the booster. Returns the out-of-fold predictions,
    the fold-averaged test predictions and the tag ``"xgb_reg"``.
    """
    return stacking_reg(xgboost, x_train, y_train, x_valid, "xgb") + ("xgb_reg",)

def lgb_reg(x_train, y_train, x_valid):
    """Stack a LightGBM regression model.

    The ``lightgbm`` module itself is handed to ``stacking_reg``, which builds
    the datasets and trains the booster. Returns the out-of-fold predictions,
    the fold-averaged test predictions and the tag ``"lgb_reg"``.
    """
    return stacking_reg(lightgbm, x_train, y_train, x_valid, "lgb") + ("lgb_reg",)

def nn_reg(x_train, y_train, x_valid):
    """Stack a small Keras feed-forward regressor.

    An empty string stands in for the estimator argument because
    ``stacking_reg`` builds the network itself for the ``"nn"`` tag. Returns
    the out-of-fold predictions, the fold-averaged test predictions and the
    tag ``"nn_reg"``.
    """
    oof_pred, test_pred = stacking_reg("", x_train, y_train, x_valid, "nn")
    return oof_pred, test_pred, "nn_reg"

Custom functions for loading and preprocessing data.

In [4]:
def most_freq_vects(docs, max_feature=None, percent=None, token_pattern=r'(?u)\b\w\w+\b'):
    """Return term counts for the most frequent tokens of ``docs``.

    Parameters
    ----------
    docs : pandas.Series
        One document per row; values are cast to unicode before counting.
    max_feature : int, optional
        Number of most frequent tokens to keep. A falsy value means
        "derive it from ``percent``" or, failing that, keep every token.
    percent : float, optional
        Fraction of the vocabulary to keep when ``max_feature`` is not given.
    token_pattern : str
        Regular expression handed to ``CountVectorizer``. The default is a
        raw string so that ``\\b`` is a word boundary, not a backspace.

    Returns
    -------
    pandas.DataFrame
        One row per document (default integer index) and one column per kept
        token, ordered by decreasing total count.
    """
    vect = CountVectorizer(token_pattern=token_pattern)
    feat_sparse = vect.fit_transform(docs.values.astype('U'))

    # vocabulary_ maps token -> column index; sorting by index gives the
    # column names without depending on get_feature_names(), which newer
    # scikit-learn releases no longer provide.
    vocabulary = vect.vocabulary_
    feature_names = sorted(vocabulary, key=vocabulary.get)
    counts = np.asarray(feat_sparse.sum(axis=0)).ravel()

    freq_table = list(zip(feature_names, counts))
    freq_table = pd.DataFrame(freq_table, columns=['feature', 'count']).sort_values('count', ascending=False)
    if not max_feature:
        if percent:
            max_feature = int(percent * len(feature_names))
        else:
            max_feature = len(feature_names)

    names = list(freq_table['feature'][:max_feature])
    # Densify only the selected columns instead of the whole vocabulary.
    top_columns = np.asarray([vocabulary[name] for name in names], dtype=int)
    return pd.DataFrame(feat_sparse[:, top_columns].toarray(), columns=names)


def load_data():
    """Load the feature files and assemble the train/test design matrices.

    Reads three files from ``../feat_input``, label-encodes the string-typed
    categorical columns, merges everything on ``listing_id``, turns the
    ``features`` token lists into a document-term matrix of the 100 most
    frequent tokens and splits the rows on ``interest_level`` (rows with
    ``-1`` form the test set).

    Returns
    -------
    x_train : pandas.DataFrame
    y_train : pandas.Series
        The ``interest_level`` column of the training rows.
    x_test : pandas.DataFrame
    columns : numpy.ndarray
        Column names of ``x_test``.
    listing_id : pandas.Series
        ``listing_id`` of the test rows.
    """
    print('Loading features files')
    basic_feat = pd.read_json('../feat_input/basic_feat.json')
    longtime_feat = pd.read_csv('../feat_input/longtime_feat.csv')
    encoded_feat = pd.read_csv('../feat_input/feat_stats_encoding.csv')

    # apply ordinal encoding to categorical feature
    print('Ordinal encoding')
    # Strip a trailing carriage return before encoding the address columns.
    for address_col in ("display_address", "street_address"):
        basic_feat[address_col] = basic_feat[address_col].replace(r'\r$', '', regex=True)
    categorical = ["display_address", "manager_id", "building_id", "street_address"]
    for f in categorical:
        if basic_feat[f].dtype == 'object':
            lbl = preprocessing.LabelEncoder()
            basic_feat[f] = lbl.fit_transform(list(basic_feat[f].values))

    all_feat = basic_feat.merge(longtime_feat, on='listing_id')
    all_feat = all_feat.merge(encoded_feat, on='listing_id')

    print("Features document-term matrix")
    stemmer = SnowballStemmer('english')
    # Remove digits and all punctuation except the apostrophe and the hyphen.
    punct = re.sub("'|-", "", string.punctuation)
    # The punctuation is escaped before it goes into the character class;
    # unescaped, the sequence "\]" is read as an escaped bracket, so the
    # backslash itself would never be removed.
    pattern = re.compile(r"[0-9]|[{}]".format(re.escape(punct)))

    def _to_document(raw_features):
        # One pass per listing: clean, stem, prefix, then join into one string.
        cleaned = [pattern.sub("", item) for item in raw_features]
        stemmed = [stemmer.stem(item) for item in cleaned]
        tokens = ['_'.join(['feature'] + item.split()) for item in stemmed]
        return ' '.join(tokens)

    all_feat['features'] = all_feat['features'].apply(_to_document)
    vect_df = most_freq_vects(all_feat['features'], max_feature=100, token_pattern=r"[^ ]+")

    all_feat = pd.concat([all_feat, vect_df], axis=1)
    train = all_feat[all_feat['interest_level'] != -1].copy()
    test = all_feat[all_feat['interest_level'] == -1].copy()
    y_train = train["interest_level"]

    x_train = train.drop(["interest_level", "features"], axis=1)
    x_test = test.drop(["interest_level", "features"], axis=1)

    return x_train, y_train, x_test, x_test.columns.values, x_test["listing_id"]

def _preprocess(dtrain, dtest):
    # replace np.inf to np.nan
    dtrain = dtrain.replace([np.inf, -np.inf], np.nan)
    dtest = dtest.replace([np.inf, -np.inf], np.nan)

    # impute np.nan
    dtrain_col_mean = dtrain.mean(axis=0)
    dtrain, dtest = dtrain.fillna(dtrain_col_mean), dtest.fillna(dtrain_col_mean)

    # perform standardization
    dtrain_col_mean, dtrain_col_std = dtrain.mean(axis=0), dtrain.std(axis=0)
    dtrain, dtest = map(lambda x: (x - dtrain_col_mean) / dtrain_col_std, (dtrain, dtest))

    return dtrain, dtest


def _preprocess_log(dtrain, dtest):
    # replace np.inf to np.nan
    dtrain = dtrain.replace([np.inf, -np.inf], np.nan)
    dtest = dtest.replace([np.inf, -np.inf], np.nan)

    # impute np.nan
    dtrain_col_mean = dtrain.mean(axis=0)
    dtrain, dtest = dtrain.fillna(dtrain_col_mean), dtest.fillna(dtrain_col_mean)

    # log transform of min-zero columns
    dtrain_col_min = dtrain.min(axis=0)
    zero_min_index = dtrain_col_min[dtrain_col_min >= 0].index

    dtrain[zero_min_index] = np.log10(dtrain[zero_min_index] + 1.0)
    dtest[zero_min_index] = np.log10(dtest[zero_min_index] + 1.0)

    # perform standardization
    dtrain_col_mean, dtrain_col_std = dtrain.mean(axis=0), dtrain.std(axis=0)
    dtrain, dtest = map(lambda x: (x - dtrain_col_mean) / dtrain_col_std, (dtrain, dtest))

    return dtrain, dtest

Feature selection with scikit-learn's `SelectFromModel`.

In [5]:
from sklearn.feature_selection import SelectFromModel
def select_feature(clf,x_train,x_valid,y=None):
    """Keep only the features whose importance is above the mean.

    ``clf`` is fitted on ``x_train`` and then wrapped in ``SelectFromModel``
    with ``threshold="mean"``; the same column selection is applied to both
    inputs. The shape of ``x_train`` is printed before and after.

    Parameters
    ----------
    clf : estimator
        Passed to ``SelectFromModel`` after fitting.
    x_train, x_valid : array-like
        Feature matrices with the same columns.
    y : array-like, optional
        Training labels. When omitted, the notebook-level ``y_train`` is
        used, which is what this function always did before.

    Returns
    -------
    (x_train, x_valid) : the two inputs reduced to the selected columns.
    """
    if y is None:
        # Backward-compatible fallback to the global defined by the main cell.
        y = y_train
    clf.fit(x_train, y)
    model = SelectFromModel(clf, prefit=True, threshold="mean")

    print(x_train.shape)
    x_train = model.transform(x_train)
    x_valid = model.transform(x_valid)
    print(x_train.shape)

    return x_train,x_valid

Main routine: run every base model through stacking and save the stacked features.

In [6]:
if __name__=="__main__":
    # Driver: load the data, standardize it, run every base model through
    # K-fold stacking and write the stacked features to two CSV files.
    np.random.seed(1)
    x_train, y_train, x_valid, _, _ = load_data()

    # Keep the raw ids now: _preprocess standardizes every column it is given.
    train_listing = x_train["listing_id"].values
    test_listing = x_valid["listing_id"].values

    # preprocessing with standardization
    x_train, x_valid = _preprocess(x_train, x_valid)

    # Optional feature selection (disabled): select_feature(clf, x_train, x_valid)
    # with e.g. a GradientBoostingClassifier can be applied at this point.

    # The stacking helpers index with integer arrays, so plain NumPy arrays
    # are needed. `.values` replaces `as_matrix()`, which pandas has removed.
    x_train, y_train, x_valid = x_train.values, y_train.values, x_valid.values

    # `folds` and `kf` are read as globals by the stacking helpers.
    folds = 5
    seed = 1
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)

    # models for stacking
    clf_list = [xgb,nn,lgb,knn,gb,rf,et,lr,ada_reg,rf_reg,gb_reg,et_reg,xgb_reg,nn_reg,lgb_reg]
    column_list = []
    train_data_list=[]
    test_data_list=[]
    for clf in clf_list:
        print('Stacking model: {}'.format(clf.__name__))
        train_data,test_data,clf_name=clf(x_train,y_train,x_valid)
        train_data_list.append(train_data)
        test_data_list.append(test_data)
        # One column name per produced column (3 for the classifiers, 1 for
        # the regressors), taken from the actual output width.
        for ind in range(train_data.shape[1]):
            column_list.append("standardscaler_%s_%s" % (clf_name, ind))

    train = np.concatenate(train_data_list, axis=1)
    test = np.concatenate(test_data_list, axis=1)

    train = pd.DataFrame(train)
    train.columns = column_list
    train["level"] = pd.Series(y_train)
    train["listing_id"] = train_listing

    test = pd.DataFrame(test)
    test.columns = column_list
    test["listing_id"] = test_listing

    train.to_csv("stacking_train_stdscale.csv", index=False)
    test.to_csv("stacking_test_stdscale.csv", index=False)

Loading features files
Ordinal encoding
Features document-term matrix
Stacking model: xgb
[0]	train-mlogloss:1.0791	eval-mlogloss:1.0792
Multiple eval metrics have been passed: 'eval-mlogloss' will be used for early stopping.

Will train until eval-mlogloss hasn't improved in 100 rounds.
[100]	train-mlogloss:0.57008	eval-mlogloss:0.582863
[200]	train-mlogloss:0.516291	eval-mlogloss:0.541685
[300]	train-mlogloss:0.489328	eval-mlogloss:0.527523
[400]	train-mlogloss:0.46961	eval-mlogloss:0.520219
[500]	train-mlogloss:0.454066	eval-mlogloss:0.516173
[600]	train-mlogloss:0.440205	eval-mlogloss:0.512935
[700]	train-mlogloss:0.427082	eval-mlogloss:0.510214
[800]	train-mlogloss:0.415615	eval-mlogloss:0.508478
[900]	train-mlogloss:0.404596	eval-mlogloss:0.50727
[1000]	train-mlogloss:0.394071	eval-mlogloss:0.506423
[1100]	train-mlogloss:0.384257	eval-mlogloss:0.505408
[1200]	train-mlogloss:0.375146	eval-mlogloss:0.505035
[1300]	train-mlogloss:0.36607	eval-mlogloss:0.504746
[1400]	train-mlogloss:



nn_score_list: [0.55528223652869657, 0.56476227084721509, 0.57247125971210078, 0.55504485355804023, 0.56215258669868262]
nn_score_mean: 0.561942641469
Stacking model: lgb
Train until valid scores didn't improve in 100 rounds.
[100]	valid_0's multi_logloss: 0.574384
[200]	valid_0's multi_logloss: 0.531415
[300]	valid_0's multi_logloss: 0.518784
[400]	valid_0's multi_logloss: 0.512339
[500]	valid_0's multi_logloss: 0.508763
[600]	valid_0's multi_logloss: 0.506915
[700]	valid_0's multi_logloss: 0.506045
[800]	valid_0's multi_logloss: 0.505313
[900]	valid_0's multi_logloss: 0.504877
[1000]	valid_0's multi_logloss: 0.504617
[1100]	valid_0's multi_logloss: 0.50441
Early stopping, best iteration is:
[1080]	valid_0's multi_logloss: 0.504375
lgb now score is: [0.50437531079273012]
Train until valid scores didn't improve in 100 rounds.
[100]	valid_0's multi_logloss: 0.573847
[200]	valid_0's multi_logloss: 0.531158
[300]	valid_0's multi_logloss: 0.518733
[400]	valid_0's multi_logloss: 0.513385
[5

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   15.5s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:   27.7s
[Parallel(n_jobs=-1)]: Done 1200 out of 1200 | elapsed:   42.0s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    0.6s
[Parallel(n_jobs=4)]: Done 1200 out of 1200 | elapsed:    1.0s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    1.0s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    2.3s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    4.0s
[Parallel(n_jobs=4)]: Done 1200 out of 1200 | elapsed:    6.1s finished


rf now score is: [0.54770750711211125]


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    7.2s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   16.4s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:   29.4s
[Parallel(n_jobs=-1)]: Done 1200 out of 1200 | elapsed:   44.5s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    0.6s
[Parallel(n_jobs=4)]: Done 1200 out of 1200 | elapsed:    0.9s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    1.0s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    2.3s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    4.0s
[Parallel(n_jobs=4)]: Done 1200 out of 1200 | elapsed:    6.0s finished


rf now score is: [0.54770750711211125, 0.5520213835254556]


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    6.7s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   15.6s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:   27.8s
[Parallel(n_jobs=-1)]: Done 1200 out of 1200 | elapsed:   42.1s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    0.6s
[Parallel(n_jobs=4)]: Done 1200 out of 1200 | elapsed:    1.0s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    1.0s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    2.3s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    3.9s
[Parallel(n_jobs=4)]: Done 1200 out of 1200 | elapsed:    6.0s finished


rf now score is: [0.54770750711211125, 0.5520213835254556, 0.55770510360942338]


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    6.7s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   15.5s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:   27.6s
[Parallel(n_jobs=-1)]: Done 1200 out of 1200 | elapsed:   41.9s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    0.6s
[Parallel(n_jobs=4)]: Done 1200 out of 1200 | elapsed:    0.9s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    1.0s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    2.3s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    4.1s
[Parallel(n_jobs=4)]: Done 1200 out of 1200 | elapsed:    6.1s finished


rf now score is: [0.54770750711211125, 0.5520213835254556, 0.55770510360942338, 0.54814620289581673]


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    6.7s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   15.5s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:   27.6s
[Parallel(n_jobs=-1)]: Done 1200 out of 1200 | elapsed:   41.9s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    0.7s
[Parallel(n_jobs=4)]: Done 1200 out of 1200 | elapsed:    1.0s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    1.0s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    2.2s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    3.9s
[Parallel(n_jobs=4)]: Done 1200 out of 1200 | elapsed:    5.9s finished


rf now score is: [0.54770750711211125, 0.5520213835254556, 0.55770510360942338, 0.54814620289581673, 0.54932857275534286]
rf_score_list: [0.54770750711211125, 0.5520213835254556, 0.55770510360942338, 0.54814620289581673, 0.54932857275534286]
rf_score_mean: 0.55098175398
Stacking model: et


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    5.6s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   12.9s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:   23.0s
[Parallel(n_jobs=-1)]: Done 1200 out of 1200 | elapsed:   34.8s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    0.8s
[Parallel(n_jobs=4)]: Done 1200 out of 1200 | elapsed:    1.3s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    1.4s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    3.2s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    5.5s
[Parallel(n_jobs=4)]: Done 1200 out of 1200 | elapsed:    8.3s finished


et now score is: [0.57408780817254501]


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    5.6s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   12.9s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:   22.9s
[Parallel(n_jobs=-1)]: Done 1200 out of 1200 | elapsed:   34.6s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    0.9s
[Parallel(n_jobs=4)]: Done 1200 out of 1200 | elapsed:    1.3s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    1.4s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    3.0s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    5.4s
[Parallel(n_jobs=4)]: Done 1200 out of 1200 | elapsed:    8.1s finished


et now score is: [0.57408780817254501, 0.57874948714619023]


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   13.1s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:   23.5s
[Parallel(n_jobs=-1)]: Done 1200 out of 1200 | elapsed:   35.4s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    0.8s
[Parallel(n_jobs=4)]: Done 1200 out of 1200 | elapsed:    1.3s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    1.4s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    3.2s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    5.4s
[Parallel(n_jobs=4)]: Done 1200 out of 1200 | elapsed:    8.3s finished


et now score is: [0.57408780817254501, 0.57874948714619023, 0.58318979029242501]


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    5.6s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   13.0s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:   23.2s
[Parallel(n_jobs=-1)]: Done 1200 out of 1200 | elapsed:   35.1s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    0.8s
[Parallel(n_jobs=4)]: Done 1200 out of 1200 | elapsed:    1.2s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    1.4s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    3.2s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    5.6s
[Parallel(n_jobs=4)]: Done 1200 out of 1200 | elapsed:    8.3s finished


et now score is: [0.57408780817254501, 0.57874948714619023, 0.58318979029242501, 0.57942974296646077]


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    5.6s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   13.0s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:   23.3s
[Parallel(n_jobs=-1)]: Done 1200 out of 1200 | elapsed:   35.7s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    0.9s
[Parallel(n_jobs=4)]: Done 1200 out of 1200 | elapsed:    1.3s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    1.4s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    3.2s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    5.5s
[Parallel(n_jobs=4)]: Done 1200 out of 1200 | elapsed:    8.2s finished


et now score is: [0.57408780817254501, 0.57874948714619023, 0.58318979029242501, 0.57942974296646077, 0.57690817695760643]
et_score_list: [0.57408780817254501, 0.57874948714619023, 0.58318979029242501, 0.57942974296646077, 0.57690817695760643]
et_score_mean: 0.578473001107
Stacking model: lr


  np.exp(prob, prob)


lr now score is: [0.58318936436249758]
lr now score is: [0.58318936436249758, 0.58978812208627596]
lr now score is: [0.58318936436249758, 0.58978812208627596, 0.60055621503372958]
lr now score is: [0.58318936436249758, 0.58978812208627596, 0.60055621503372958, 0.58285861216844537]
lr now score is: [0.58318936436249758, 0.58978812208627596, 0.60055621503372958, 0.58285861216844537, 0.58801050470240979]
lr_score_list: [0.58318936436249758, 0.58978812208627596, 0.60055621503372958, 0.58285861216844537, 0.58801050470240979]
lr_score_mean: 0.588880563671
Stacking model: ada_reg
ada now score is: [0.29177049713541114]
ada now score is: [0.29177049713541114, 0.28928966627874614]
ada now score is: [0.29177049713541114, 0.28928966627874614, 0.2947229185980848]
ada now score is: [0.29177049713541114, 0.28928966627874614, 0.2947229185980848, 0.28541868843252599]
ada now score is: [0.29177049713541114, 0.28928966627874614, 0.2947229185980848, 0.28541868843252599, 0.29591269700516049]
ada_score_lis

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   18.2s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:  4.2min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 600 out of 600 | elapsed:    0.3s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.8s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    1.7s
[Parallel(n_jobs=4)]: Done 600 out of 600 | elapsed:    2.3s finished


rf now score is: [0.22257941246539983]


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   18.2s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:  4.2min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 600 out of 600 | elapsed:    0.3s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.8s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    1.7s
[Parallel(n_jobs=4)]: Done 600 out of 600 | elapsed:    2.3s finished


rf now score is: [0.22257941246539983, 0.22673131908933916]


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   18.2s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:  4.1min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 600 out of 600 | elapsed:    0.3s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.8s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    1.7s
[Parallel(n_jobs=4)]: Done 600 out of 600 | elapsed:    2.3s finished


rf now score is: [0.22257941246539983, 0.22673131908933916, 0.22460845004853705]


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   18.0s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:  4.1min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 600 out of 600 | elapsed:    0.3s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.8s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    1.7s
[Parallel(n_jobs=4)]: Done 600 out of 600 | elapsed:    2.3s finished


rf now score is: [0.22257941246539983, 0.22673131908933916, 0.22460845004853705, 0.22198636745993897]


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   18.1s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:  4.1min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 600 out of 600 | elapsed:    0.3s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.8s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    1.7s
[Parallel(n_jobs=4)]: Done 600 out of 600 | elapsed:    2.4s finished


rf now score is: [0.22257941246539983, 0.22673131908933916, 0.22460845004853705, 0.22198636745993897, 0.22162406564617623]
rf_score_list: [0.22257941246539983, 0.22673131908933916, 0.22460845004853705, 0.22198636745993897, 0.22162406564617623]
rf_score_mean: 0.223505922942
Stacking model: gb_reg
      Iter       Train Loss      OOB Improve   Remaining Time 
         1           0.3822           0.0092            1.17m
         2           0.3738           0.0090            1.16m
         3           0.3656           0.0083            1.15m
         4           0.3560           0.0078            1.14m
         5           0.3494           0.0065            1.13m
         6           0.3427           0.0065            1.12m
         7           0.3346           0.0062            1.10m
         8           0.3290           0.0059            1.09m
         9           0.3233           0.0055            1.08m
        10           0.3182           0.0050            1.07m
        20          

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    9.9s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   44.2s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:  2.3min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 600 out of 600 | elapsed:    0.4s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    1.0s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    2.2s
[Parallel(n_jobs=4)]: Done 600 out of 600 | elapsed:    3.0s finished


et now score is: [0.22351990000742863]


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   10.1s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   44.9s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:  2.3min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 600 out of 600 | elapsed:    0.4s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    1.0s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    2.2s
[Parallel(n_jobs=4)]: Done 600 out of 600 | elapsed:    3.0s finished


et now score is: [0.22351990000742863, 0.22656312837080539]


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   10.0s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   44.5s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:  2.3min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 600 out of 600 | elapsed:    0.4s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    1.0s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    2.2s
[Parallel(n_jobs=4)]: Done 600 out of 600 | elapsed:    3.0s finished


et now score is: [0.22351990000742863, 0.22656312837080539, 0.22372750968940733]


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    9.8s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   43.8s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:  2.3min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 600 out of 600 | elapsed:    0.4s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    1.0s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    2.2s
[Parallel(n_jobs=4)]: Done 600 out of 600 | elapsed:    3.0s finished


et now score is: [0.22351990000742863, 0.22656312837080539, 0.22372750968940733, 0.22190639691658776]


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    9.9s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   43.9s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:  2.3min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 600 out of 600 | elapsed:    0.4s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    1.0s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    2.3s
[Parallel(n_jobs=4)]: Done 600 out of 600 | elapsed:    3.1s finished


et now score is: [0.22351990000742863, 0.22656312837080539, 0.22372750968940733, 0.22190639691658776, 0.21968379012482411]
et_score_list: [0.22351990000742863, 0.22656312837080539, 0.22372750968940733, 0.22190639691658776, 0.21968379012482411]
et_score_mean: 0.223080145022
Stacking model: xgb_reg
[0]	train-rmse:0.63094	eval-rmse:0.632719
Multiple eval metrics have been passed: 'eval-rmse' will be used for early stopping.

Will train until eval-rmse hasn't improved in 100 rounds.
[100]	train-rmse:0.471085	eval-rmse:0.480355
[200]	train-rmse:0.450726	eval-rmse:0.466281
[300]	train-rmse:0.439264	eval-rmse:0.46145
[400]	train-rmse:0.430489	eval-rmse:0.457981
[500]	train-rmse:0.423524	eval-rmse:0.456037
[600]	train-rmse:0.417267	eval-rmse:0.454485
[700]	train-rmse:0.411462	eval-rmse:0.452929
[800]	train-rmse:0.406383	eval-rmse:0.452088
[900]	train-rmse:0.401447	eval-rmse:0.451326
[1000]	train-rmse:0.397233	eval-rmse:0.450841
[1100]	train-rmse:0.392593	eval-rmse:0.450386
[1200]	train-rmse:0.



nn now score is: [0.23283661694037869]
nn now score is: [0.23283661694037869, 0.24430250877933213]
nn now score is: [0.23283661694037869, 0.24430250877933213, 0.30225855924917983]
nn now score is: [0.23283661694037869, 0.24430250877933213, 0.30225855924917983, 0.23130604364609064]
nn now score is: [0.23283661694037869, 0.24430250877933213, 0.30225855924917983, 0.23130604364609064, 0.23413943821984715]
nn_score_list: [0.23283661694037869, 0.24430250877933213, 0.30225855924917983, 0.23130604364609064, 0.23413943821984715]
nn_score_mean: 0.248968633367
Stacking model: lgb_reg
Train until valid scores didn't improve in 100 rounds.
[100]	valid_0's l2: 0.225109
[200]	valid_0's l2: 0.211089
[300]	valid_0's l2: 0.206932
[400]	valid_0's l2: 0.204858
[500]	valid_0's l2: 0.203247
[600]	valid_0's l2: 0.202138
[700]	valid_0's l2: 0.201324
[800]	valid_0's l2: 0.200705
[900]	valid_0's l2: 0.200211
[1000]	valid_0's l2: 0.199739
[1100]	valid_0's l2: 0.199433
[1200]	valid_0's l2: 0.199257
[1300]	valid_0

In [7]:
def clf_level2():
    """Train every second-level model on the stacked first-level features.

    Reads ``stacking_train_stdscale.csv`` (which carries the ``level`` target
    column) and ``stacking_test_stdscale.csv``, passes both frames through
    ``_preprocess`` and then calls each model wrapper in ``clf_list`` as
    ``clf(x_train, y_train, x_valid)``.  Each wrapper must return a
    ``(train_data, test_data, clf_name)`` triple.

    The per-model outputs and their column labels are gathered in local lists
    but are neither returned nor written to disk: the function returns
    ``None``.

    NOTE(review): a later cell defines another ``clf_level2`` (single-model
    variant that writes a submission file), which replaces this one once run.
    """
    np.random.seed(1)
    x_train = pd.read_csv("stacking_train_stdscale.csv")
    y_train = x_train['level']
    x_train = x_train.drop(['level'], axis=1)
    x_valid = pd.read_csv("stacking_test_stdscale.csv")
    x_train, x_valid = _preprocess(x_train, x_valid)

    # `.values` instead of `.as_matrix()`: the latter was deprecated in
    # pandas 0.23 and removed in pandas 1.0.
    x_train, y_train, x_valid = x_train.values, y_train.values, x_valid.values

    # No KFold is built here: a function-local splitter is invisible to
    # `stacking`, which looks up `kf` and `folds` as module-level names.
    # NOTE(review): presumably the model wrappers go through `stacking` and
    # therefore use that module-level CV configuration -- confirm.

    # models for stacking
    clf_list = [xgb, nn, knn, lr, lgb]
    column_list = []
    train_data_list = []
    test_data_list = []
    for clf in clf_list:
        print('Train 2nd-level model: {}'.format(clf.__name__))
        train_data, test_data, clf_name = clf(x_train, y_train, x_valid)
        train_data_list.append(train_data)
        test_data_list.append(test_data)
        # Wrappers whose name contains "reg" contribute one column; all
        # others contribute three (one per class).
        ind_num = 1 if "reg" in clf_name else 3
        for ind in range(ind_num):
            column_list.append("standardscaler_%s_%s" % (clf_name, ind))

In [8]:
# Train every model in clf_list on the stacked features; progress is printed below.
clf_level2()

Train 2nd-level model: xgb
[0]	train-mlogloss:1.07418	eval-mlogloss:1.07418
Multiple eval metrics have been passed: 'eval-mlogloss' will be used for early stopping.

Will train until eval-mlogloss hasn't improved in 100 rounds.
[100]	train-mlogloss:0.505043	eval-mlogloss:0.516752
[200]	train-mlogloss:0.475361	eval-mlogloss:0.498135
[300]	train-mlogloss:0.462942	eval-mlogloss:0.496802
[400]	train-mlogloss:0.452458	eval-mlogloss:0.497149
Stopping. Best iteration:
[330]	train-mlogloss:0.459927	eval-mlogloss:0.496714

xgb now score is: [0.49671411228775153]
[0]	train-mlogloss:1.07418	eval-mlogloss:1.07429
Multiple eval metrics have been passed: 'eval-mlogloss' will be used for early stopping.

Will train until eval-mlogloss hasn't improved in 100 rounds.
[100]	train-mlogloss:0.504197	eval-mlogloss:0.519002
[200]	train-mlogloss:0.474539	eval-mlogloss:0.500735
[300]	train-mlogloss:0.462414	eval-mlogloss:0.499789
Stopping. Best iteration:
[280]	train-mlogloss:0.464471	eval-mlogloss:0.499727





nn_score_list: [0.49890317622216107, 0.5015991311414667, 0.50523793931982997, 0.49882312196414452, 0.49976897593151226]
nn_score_mean: 0.500866468916
Train 2nd-level model: knn
knn now score is: [0.52549721271571614]
knn now score is: [0.52549721271571614, 0.52891857330449588]
knn now score is: [0.52549721271571614, 0.52891857330449588, 0.53616795245930882]
knn now score is: [0.52549721271571614, 0.52891857330449588, 0.53616795245930882, 0.50850059757150456]
knn now score is: [0.52549721271571614, 0.52891857330449588, 0.53616795245930882, 0.50850059757150456, 0.52616853056040847]
knn_score_list: [0.52549721271571614, 0.52891857330449588, 0.53616795245930882, 0.50850059757150456, 0.52616853056040847]
knn_score_mean: 0.525050573322
Train 2nd-level model: lr
lr now score is: [0.50946311761200624]
lr now score is: [0.50946311761200624, 0.51563969115456865]
lr now score is: [0.50946311761200624, 0.51563969115456865, 0.51852308077387321]
lr now score is: [0.50946311761200624, 0.5156396911545

In [9]:
def clf_level2(clf=xgb, submission_path=None):
    """Fit one second-level model on the stacked features and write a submission CSV.

    NOTE(review): this reuses the name of the multi-model ``clf_level2``
    defined in an earlier cell and replaces it once this cell has run.

    Parameters
    ----------
    clf : callable
        Model wrapper called as ``clf(x_train, y_train, x_valid)``; it must
        return a 3-tuple whose second element holds the test-set predictions
        with three columns.
    submission_path : str, optional
        Where to write the submission.  Defaults to
        ``"submission_stacking_<clf.__name__>.csv"`` so that running a
        different model no longer overwrites the xgb submission file.

    Returns
    -------
    None
        The result is the CSV written to ``submission_path``.
    """
    np.random.seed(1)
    x_train = pd.read_csv("stacking_train_stdscale.csv")
    y_train = x_train['level']
    x_train = x_train.drop(['level'], axis=1)
    x_valid = pd.read_csv("stacking_test_stdscale.csv")
    # Grab the ids before `_preprocess` gets a chance to alter the frame.
    test_listing = x_valid.listing_id.values
    x_train, x_valid = _preprocess(x_train, x_valid)

    # `.values` instead of `.as_matrix()`: the latter was deprecated in
    # pandas 0.23 and removed in pandas 1.0.
    x_train, y_train, x_valid = x_train.values, y_train.values, x_valid.values

    # No KFold is built here: a function-local splitter is invisible to
    # `stacking`, which looks up `kf` and `folds` as module-level names.
    # NOTE(review): presumably `clf` goes through `stacking` and therefore
    # uses that module-level CV configuration -- confirm.

    print('Train 2nd-level model: {}'.format(clf.__name__))
    _, test_data, _ = clf(x_train, y_train, x_valid)

    # NOTE(review): assumes the prediction columns are ordered
    # low / medium / high -- verify against how `level` was encoded.
    test = pd.DataFrame(np.asarray(test_data), columns=['low', 'medium', 'high'])
    test["listing_id"] = test_listing

    if submission_path is None:
        submission_path = "submission_stacking_{}.csv".format(clf.__name__)
    test.to_csv(submission_path, index=False)

In [10]:
# Fit the default second-level model (xgb) and write submission_stacking_xgb.csv.
clf_level2()

Train 2nd-level model: xgb
[0]	train-mlogloss:1.07418	eval-mlogloss:1.07418
Multiple eval metrics have been passed: 'eval-mlogloss' will be used for early stopping.

Will train until eval-mlogloss hasn't improved in 100 rounds.
[100]	train-mlogloss:0.505043	eval-mlogloss:0.516752
[200]	train-mlogloss:0.475361	eval-mlogloss:0.498135
[300]	train-mlogloss:0.462942	eval-mlogloss:0.496802
[400]	train-mlogloss:0.452458	eval-mlogloss:0.497149
Stopping. Best iteration:
[330]	train-mlogloss:0.459927	eval-mlogloss:0.496714

xgb now score is: [0.49671411228775153]
[0]	train-mlogloss:1.07418	eval-mlogloss:1.07429
Multiple eval metrics have been passed: 'eval-mlogloss' will be used for early stopping.

Will train until eval-mlogloss hasn't improved in 100 rounds.
[100]	train-mlogloss:0.504197	eval-mlogloss:0.519002
[200]	train-mlogloss:0.474539	eval-mlogloss:0.500735
[300]	train-mlogloss:0.462414	eval-mlogloss:0.499789
Stopping. Best iteration:
[280]	train-mlogloss:0.464471	eval-mlogloss:0.499727



The final submission, using xgboost on the stacked dataset, scores 0.50372 on the Kaggle private leaderboard.