# Stacked Model定义

In [16]:
import argparse
import numpy as np
import pickle
import random
import pandas as pd
import  os
import xgboost as xgb
import time
import operator
from sklearn.svm import  LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import  LogisticRegressionCV
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
from sklearn.pipeline import  Pipeline, FeatureUnion
from sklearn.svm import  LinearSVC, SVC
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
from sklearn.utils import check_array
import pickle
from sklearn.preprocessing import  MinMaxScaler
from sklearn.decomposition import FastICA

class StackingEstimator(BaseEstimator, TransformerMixin):
    """Wrap an estimator so its predictions can be used as pipeline features.

    ``transform`` emits the wrapped estimator's class prediction as the first
    column, followed by its ``predict_proba`` output (or, if that method is
    missing, its ``decision_function`` output) when available, and finally the
    original features when ``mix_data`` is true.

    Parameters
    ----------
    estimator : object
        Estimator exposing ``fit`` and ``predict``; ``predict_proba`` or
        ``decision_function`` are used when present.
    mix_data : bool, default False
        If true, the input features are appended after the synthetic
        prediction columns; otherwise only the prediction columns are
        returned.
    """

    def __init__(self, estimator, mix_data=False):
        self.estimator = estimator
        self.mix_data = mix_data

    def fit(self, X, y=None, **fit_params):
        """Fit the wrapped estimator on ``X``/``y`` and return ``self``."""
        self.estimator.fit(X, y, **fit_params)
        return self

    def _score_columns(self, X):
        """Return the estimator's scores as a 2-D array, or None if it has none.

        ``predict_proba`` takes precedence over ``decision_function``.
        """
        for method_name in ('predict_proba', 'decision_function'):
            if hasattr(self.estimator, method_name):
                scores = getattr(self.estimator, method_name)(X)
                # Binary decision functions come back 1-D; make them a column.
                if scores.ndim == 1:
                    scores = scores.reshape(-1, 1)
                return scores
        return None

    def transform(self, X):
        """Build the stacked feature matrix for ``X``.

        Column order is: predicted class, then score columns (if the
        estimator provides any), then the original features (only when
        ``mix_data`` is true).
        """
        X = check_array(X)
        scores = self._score_columns(X)

        # Class prediction is always added as the leading synthetic feature.
        columns = [np.reshape(self.estimator.predict(X), (-1, 1))]
        if scores is not None:
            columns.append(scores)
        if self.mix_data:
            columns.append(X)
        # An estimator with neither score method and mix_data=False previously
        # hit an undefined variable here; it now yields just the class column.
        return np.hstack(columns)

    def predict(self, X):
        """Return the wrapped estimator's ``predict_proba`` output for ``X``.

        Note that this returns probabilities rather than class labels, so the
        wrapped estimator must implement ``predict_proba``.
        """
        return self.estimator.predict_proba(X)

    
def train_stacked_model():
    """Train the stacked pipeline on the merged table and save the results.

    Steps:
      1. Read ``merged_table_ac_11.csv`` and derive a single ``label`` column
         as the weighted sum ``1*MPS03 + 2*MPS04 + 3*MPF01``.
      2. Shuffle the rows, split 80/20 into train/test and write both splits
         to ``train.csv`` / ``test.csv``.
      3. Fit the multi-level stacking pipeline on the training split.
      4. Predict on the test split, pickle the fitted pipeline together with
         the predictions and labels to ``train_stacked_model.bin``, and write
         a label/prediction table to ``train_stacked_model.csv``.

    The shuffle is not seeded, so the split differs between runs.
    """
    allDS = 'merged_table_ac_11.csv'
    all_df = pd.read_csv(allDS)
    print("Start Training!")
    # Columns removed from the feature matrix (raw label flags, IDs, label).
    del_feature_name_list = ['MPS03', 'MPS04', 'MPF01', 'LOT_ID_PRODUCT', 'GLASS_ID_PRODUCT', 'label']
    # Label columns
    label_name_list = ['MPS03', 'MPS04', 'MPF01']
    last_label_name = 'label'

    num_boost_rounds = 10000

    # Combine the flag columns into one label: each column contributes its
    # 1-based position in label_name_list as the weight.
    label_column = all_df[label_name_list[0]] * 0
    for weight, name in enumerate(label_name_list, start=1):
        label_column += all_df[name] * weight
    all_df[last_label_name] = label_column

    shuffel_df = all_df.sample(frac=1).reset_index(drop=True)
    row_count = shuffel_df[last_label_name].count()
    train_row = int(row_count * 0.8)

    train_df = shuffel_df[:train_row]
    test_df = shuffel_df[train_row:]
    test_df.to_csv("test.csv", index=False)
    train_df.to_csv("train.csv", index=False)

    train_data = train_df.drop(del_feature_name_list, axis=1).values
    train_label = train_df[last_label_name].values.astype(np.int32)

    test_data = test_df.drop(del_feature_name_list, axis=1).values
    test_label = test_df[last_label_name].values.astype(np.int32)

    xgb_params = {
        'eta': 0.005,
        'max_depth': 4,
        'subsample': 0.95,
        'objective': 'multi:softmax',
        'silent': 0,
        'n_estimators': num_boost_rounds,
        'n_jobs': 100,
    }
    xgb_estimator = StackingEstimator(xgb.XGBClassifier(**xgb_params), mix_data=True)

    svc1 = StackingEstimator(LinearSVC(penalty='l1', dual=False))
    svc2 = StackingEstimator(LinearSVC(penalty='l2', dual=False))
    gbt = StackingEstimator(GradientBoostingClassifier(n_estimators=1000, learning_rate=0.1), mix_data=True)
    rf = StackingEstimator(RandomForestClassifier(n_estimators=1000, n_jobs=100), mix_data=True)

    logistic1 = StackingEstimator(LogisticRegressionCV(cv=4, n_jobs=40, penalty='l1', solver='liblinear'))
    logistic2 = StackingEstimator(LogisticRegressionCV(cv=4, n_jobs=40, penalty='l2', solver='liblinear'))

    # Linear models are stacked on top of an ICA projection of the features.
    linear_models = FeatureUnion(transformer_list=[
        ('svc1', svc1),
        ('svc2', svc2),
        ('level2_logistic', logistic1),
        ('level3_logistic', logistic2),
    ])
    ica_branch = Pipeline([
        ('ica', FastICA(n_components=200)),
        ('linear_svc', linear_models),
    ])
    pipline = Pipeline([
        ('preprocess', MinMaxScaler()),
        ('level1', FeatureUnion(transformer_list=[('gbt', gbt), ('pipline', ica_branch)])),
        ('level4_rf', rf),
        ('level5_xgb', xgb_estimator),
    ])

    pipline.fit(train_data, train_label)
    print("Start Predict")
    # The final step is a StackingEstimator, whose predict() returns
    # predict_proba output, so `prediction` is a 2-D probability array.
    prediction = pipline.predict(test_data)

    save_data = {'model': pipline, 'pred': prediction, 'label': test_label}
    # pickle writes bytes: the file must be opened in binary mode, and the
    # context manager guarantees the handle is flushed and closed.
    with open('train_stacked_model.bin', 'wb') as model_file:
        pickle.dump(save_data, model_file)
    print("Write result")
    result = pd.DataFrame({'label': test_label, 'pred': prediction[:, 1]})
    result.to_csv("train_stacked_model.csv", index=False)

In [17]:
# Run the full train / predict / save workflow of train_stacked_model.
train_stacked_model()

Start Training!


ValueError: labels [None] not contained in axis