多样性度量是用于度量集成中个体分类器的多样性,即估算个体学习器的多样化程度.

In [1]:
import numpy as np
import numpy.ma as ma
import matplotlib.pyplot as plt

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# 预处理
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

import sys
import warnings

warnings.simplefilter('ignore')

In [2]:
def load_data():
    data = load_breast_cancer()
    return data.data,data.target

def load_data_split():
    X,y = load_data()
    X_train,X_test,y_train,y_test = train_test_split(X,y)
    return X_train,X_test,y_train,y_test

In [3]:
def standardize(X_):
    X_ = StandardScaler().fit_transform(X_)
    return X_

In [4]:
def load_data_split_standarlize():
    X_train,X_test,y_train,y_test = load_data_split()
    return standardize(X_train),standardize(X_test),y_train,y_test

In [5]:
def contigency_table(y_h_i,y_h_j):
    '''
    用于二分类结果
    @param y_h_i 分类器i的预测结果
    @param y_h_j 分类器j的预测结果
    '''
    y_h_i_1 = y_h_i==1
    y_h_i_2 = y_h_i==-1
    y_h_j_1 = y_h_j==1
    y_h_j_2 = y_h_j==-1
    a = np.sum(y_h_i_1 == y_h_j_1) # hi +1 , hj +1
    b = np.sum(y_h_i_1) + np.sum(y_h_j_2) # hi +1 , hj -1
    c = np.sum(y_h_i_2) + np.sum(y_h_j_1) # hi -1 , hj +1
    d = np.sum(y_h_i_2 == y_h_j_2) # hi -1 , hk -1
    return a,b,c,d

In [6]:
def disagreement_measure(y_h_i,y_h_j):
    '''
    不合度量
    值域为[0,1]
    值越大则多样性越大
    '''
    a,b,c,d = contigency_table(y_h_i,y_h_j)
    m = a+b+c+d
    return (b+c)/m

In [7]:
 def correlation_coefficient(y_h_i,y_h_j):
    '''
    相关系数
    值域为[-1,1]
    若无关,则值为0,若hi与hj正相关则值为正,否则为负
    '''
    a,b,c,d = contigency_table(y_h_i,y_h_j)
    rho = a*d - b*c
    rho /= np.sqrt((a+b) * (a+c) * (c+d) * (b+d))
    return rho

In [8]:
def q_statistic(y_h_i,y_h_j):
    '''
    Q统计量
    Q与相关系数符号相同且
    |Q| >= cc
    '''
    a,b,c,d = contigency_table(y_h_i,y_h_j)
    Q = a*d-b*c
    Q /= a*d + b*c
    return Q

In [9]:
def k_statistic(y_h_i,y_h_j):
    '''
    卡方统计量
    '''
    a,b,c,d = contigency_table(y_h_i,y_h_j)
    m = a+b+c+d
    p1 = (a + d) / m # 两个分类器取得一致的概率
    p2 = ((a + b) *(a+c) + (c+d)*(b+d)) / (m **2) # 两个分类器偶然达成一致的概率
    return (p1-p2) / (1 - p2)

In [13]:
X_train,X_test,y_train,y_test = load_data_split_standarlize()

lor = LogisticRegression().fit(X_train,y_train)
dt = DecisionTreeClassifier().fit(X_train,y_train)
svc = SVC().fit(X_train,y_train)

lor_predict = lor.predict(X_test) 
dt_predict = dt.predict(X_test)
svc_predict = svc.predict(X_test)

predictions = [lor_predict,dt_predict,svc_predict]
model_names = [lor.__class__.__name__,dt.__class__.__name__,svc.__class__.__name__]

model_prediction_map = dict(zip(model_names,predictions))

for m1 in model_prediction_map:
    for m2 in model_prediction_map:
        m1_p,m2_p = model_prediction_map.get(m1),model_prediction_map.get(m2)
        mat = "{:20}\t{:35}\t{:20}\t{:10}"
        dm = round(disagreement_measure(m1_p,m2_p),3)
        cc = round(correlation_coefficient(m1_p,m2_p),3)
        qs = round(q_statistic(m1_p,m2_p),3)
        ks = round(k_statistic(m1_p,m2_p),3)
        print('_'*100)
        print(mat.format(m1,'disagreement_measure',m2,dm))
        print(mat.format(m1,'correlation_coefficient',m2,cc))
        print(mat.format(m1,'q_statistic',m2,qs))
        print(mat.format(m1,'k_statistic',m2,ks))
        print('_'*100)

____________________________________________________________________________________________________
LogisticRegression  	disagreement_measure               	LogisticRegression  	     0.384
LogisticRegression  	correlation_coefficient            	LogisticRegression  	     0.233
LogisticRegression  	q_statistic                        	LogisticRegression  	     0.442
LogisticRegression  	k_statistic                        	LogisticRegression  	     0.233
____________________________________________________________________________________________________
____________________________________________________________________________________________________
LogisticRegression  	disagreement_measure               	DecisionTreeClassifier	     0.386
LogisticRegression  	correlation_coefficient            	DecisionTreeClassifier	     0.228
LogisticRegression  	q_statistic                        	DecisionTreeClassifier	     0.433
LogisticRegression  	k_statistic                        	DecisionTre