# stacking模型融合

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
import pickle
import warnings

from sklearn.model_selection import train_test_split

# NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
# Feature matrix; the original author's note says it was standardized upstream.
# encoding='gbk' is forwarded to pickle for decoding legacy byte strings
# (presumably the file was written with GBK-encoded text - confirm).
with open('29_features.pkl', 'rb') as f:
    X = pickle.load(f, encoding='gbk')

# Target labels.
with open('new_label.pkl', 'rb') as f:
    y = pickle.load(f)

# Hold out 30% of the rows as the test split; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=2018,
)

# Suppress all warnings for the remainder of the notebook.
warnings.filterwarnings('ignore')

## 简化版模型评估

In [3]:
from sklearn.metrics import accuracy_score, roc_auc_score

def model_metrics(clf, X_train, X_test, y_train, y_test):
    """Print accuracy and ROC AUC of a fitted classifier on both data splits.

    Parameters
    ----------
    clf : fitted classifier exposing ``predict`` and ``predict_proba``.
    X_train, X_test : feature sets handed to ``clf`` for prediction.
    y_train, y_test : true labels for the corresponding feature sets.

    Returns
    -------
    None. Two lines are written to stdout: one with train/test accuracy and
    one with train/test AUC, each value formatted to four decimals.
    """
    # Predicted class labels are what the accuracy figures are computed from.
    train_labels = clf.predict(X_train)
    test_labels = clf.predict(X_test)

    # Column 1 of predict_proba is used as the ranking score for AUC
    # (assumes a binary problem where column 1 is the positive class).
    train_scores = clf.predict_proba(X_train)[:, 1]
    test_scores = clf.predict_proba(X_test)[:, 1]

    # Accuracy line.
    train_acc = '%.4f' % accuracy_score(y_train, train_labels)
    test_acc = '%.4f' % accuracy_score(y_test, test_labels)
    print('[准确率]', '训练集：', train_acc, '测试集：', test_acc)

    # AUC line, computed with roc_auc_score on the probability scores.
    train_auc = '%.4f' % roc_auc_score(y_train, train_scores)
    test_auc = '%.4f' % roc_auc_score(y_test, test_scores)
    print('[auc值]', '训练集：', train_auc, '测试集：', test_auc)

## 模型融合-stacking
选择简单的（默认参数）lr 作为次级学习器；svm、dt、rf、xgb 以及调参后的 lr 作为初级学习器
- 训练基学习器
- 选择lr为次级学习器进行基学习器权重学习

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import StackingClassifier

#### 训练基学习器

In [5]:
# Base learner: logistic regression with an L2 penalty and C=0.01.
# fit() returns the estimator itself, so construction and training are chained.
lr = LogisticRegression(
    penalty='l2',
    C=0.01,
).fit(X_train, y_train)
model_metrics(lr, X_train, X_test, y_train, y_test)

[准确率] 训练集： 0.8028 测试集： 0.7835
[auc值] 训练集： 0.8038 测试集： 0.7840


In [6]:
# Base learner: linear-kernel SVM with C=0.01.
# probability=True is required because model_metrics calls predict_proba.
svm_linear = svm.SVC(
    kernel='linear',
    C=0.01,
    probability=True,
).fit(X_train, y_train)
model_metrics(svm_linear, X_train, X_test, y_train, y_test)

[准确率] 训练集： 0.7926 测试集： 0.7744
[auc值] 训练集： 0.8082 测试集： 0.7884


In [7]:
# Base learner: a shallow decision tree with a fixed seed.
dt = DecisionTreeClassifier(
    max_depth=5,
    min_samples_split=140,
    min_samples_leaf=60,
    max_features='sqrt',
    random_state=2018,
).fit(X_train, y_train)
model_metrics(dt, X_train, X_test, y_train, y_test)

[准确率] 训练集： 0.7890 测试集： 0.7687
[auc值] 训练集： 0.7885 测试集： 0.7310


In [8]:
# Base learner: random forest of 120 trees with a fixed seed.
# oob_score=True makes the out-of-bag score available after fitting.
rf = RandomForestClassifier(
    n_estimators=120,
    max_depth=8,
    min_samples_split=30,
    min_samples_leaf=20,
    max_features='sqrt',
    oob_score=True,
    random_state=2018,
).fit(X_train, y_train)

# Report the out-of-bag score before the train/test metrics.
print('袋外分数：', rf.oob_score_)
model_metrics(rf, X_train, X_test, y_train, y_test)

袋外分数： 0.7935076645626691
[准确率] 训练集： 0.8206 测试集： 0.7821
[auc值] 训练集： 0.8829 测试集： 0.7782


In [9]:
# Base learner: gradient-boosted trees (40 rounds, depth 3) with a binary logistic objective.
xgb = XGBClassifier(
    objective='binary:logistic',
    learning_rate=0.1,
    n_estimators=40,
    max_depth=3,
    min_child_weight=11,
    gamma=0.0,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=1e-05,
    scale_pos_weight=1,
    nthread=4,
    seed=27,
)
xgb.fit(X_train, y_train)

model_metrics(xgb, X_train, X_test, y_train, y_test)

[准确率] 训练集： 0.8224 测试集： 0.7877
[auc值] 训练集： 0.8490 测试集： 0.7801


### 开始融合


In [10]:
# Logistic regression with default parameters, used as the stacking
# meta-learner (second-level model), not as a base learner.
LR = LogisticRegression()

In [12]:
from mlxtend.classifier import StackingCVClassifier

# Stack the five base learners defined above under the default-parameter LR.
# use_probas is not passed, so mlxtend's default applies (False per its docs):
# the meta-classifier is trained on predicted class labels, not probabilities.
sclf = StackingCVClassifier(
    classifiers=[svm_linear, xgb, dt, lr, rf],
    meta_classifier=LR,
)
sclf.fit(X_train, y_train)
model_metrics(sclf, X_train, X_test, y_train, y_test)

[准确率] 训练集： 0.8164 测试集： 0.7849
[auc值] 训练集： 0.7157 测试集： 0.6679


In [13]:
from mlxtend.classifier import StackingCVClassifier

# Same stack as the previous cell, but use_probas=True feeds the base learners'
# predicted probabilities to the meta-classifier instead of class labels.
# Note: this rebinds the name `sclf`, replacing the earlier stacked model.
sclf = StackingCVClassifier(
    classifiers=[svm_linear, xgb, dt, lr, rf],
    meta_classifier=LR,
    use_probas=True,
)
sclf.fit(X_train, y_train)
model_metrics(sclf, X_train, X_test, y_train, y_test)

[准确率] 训练集： 0.8124 测试集： 0.7856
[auc值] 训练集： 0.8342 测试集： 0.7878


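融合之前，测试集 AUC 最高的单模型是 svm_linear（测试集 准确率/AUC：0.7744/0.7884）；使用 use_probas=True 进行 stacking 融合之后，测试集 准确率/AUC 为 0.7856/0.7878，即准确率有所提升，AUC 基本持平（略降 0.0006）。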