In [1]:
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from collections import Counter
import pandas as pd
import numpy as np

''' Summary of the file.

    功能：
        实现特征的混合分类方法：
        构造的公式是对随机森林分类后概率矩阵的相加比例：
            proba = proba1*(P) + proba2*(1-P)
        上面，proba1是由GIST特征算出来的分类概率矩阵，proba2是由lbp或dense sift算出来的分类概率矩阵。
        这个权重P是一个不确定的值，不同的混合方法会有不同的最优值。
        最后根据合成的这个概率矩阵proba获得每个样本的分类情况

        读取特征值文件和标签文件，通过交叉验证获得训练集和测试集，以随机森林方法划分并获得准确率，通过混淆矩阵标示结果。

    输出：
        结果的混淆矩阵，每次划分的准确率，N次的平均准确率
'''


' Summary of the file.\n\n    功能：\n        实现特征的混合分类方法：\n        构造的公式是对随机森林分类后概率矩阵的相加比例：\n            proba = proba1*(P) + proba2*(1-P)\n        上面，proba1是由GIST特征算出来的分类概率矩阵，proba2是由lbp或dense sift算出来的分类概率矩阵。\n        这个权重P是一个不确定的值，不同的混合方法会有不同的最优值。\n        最后根据合成的这个概率矩阵proba获得每个样本的分类情况\n\n        读取特征值文件和标签文件，通过交叉验证获得训练集和测试集，以随机森林方法划分并获得准确率，通过混淆矩阵标示结果。\n\n    输出：\n        结果的混淆矩阵，每次划分的准确率，N次的平均准确率\n'

In [2]:
def getCopySplit(subfeatures, labels, X_train, X_test, y_train, y_test):
    """
    Reproduce an existing train/test split on a second feature set.

    The rows are selected by index label, so the returned pieces contain
    exactly the samples (in the same order) that the existing split contains.

    :param subfeatures: feature table of the second feature set
    :param labels: labels to be split alongside ``subfeatures``
    :param X_train: training features of the existing split (only its index is used)
    :param X_test: test features of the existing split (only its index is used)
    :param y_train: training labels of the existing split (only its index is used)
    :param y_test: test labels of the existing split (only its index is used)
    :return: tuple ``(X_train2, X_test2, y_train2, y_test2)`` taken from
             ``subfeatures`` and ``labels`` at the index labels of the existing split
    """
    # Features first (train, test), then labels (train, test) -- the same
    # ordering that train_test_split returns.
    feature_parts = tuple(subfeatures.loc[part.index] for part in (X_train, X_test))
    label_parts = tuple(labels.loc[part.index] for part in (y_train, y_test))
    return feature_parts + label_parts


# # GIST features
# NOTE(review): absolute, machine-specific Windows paths; the notebook only runs where these files exist.
subfeatures = pd.read_csv(r'F:\virtus_test\Simhash_Gist\gist_f_train_full.csv',header=None)
labels = pd.read_csv(r'F:\virtus_test\Simhash_Gist\CNO_full.txt',header=None)
# Put features and labels side by side so that dropping a row removes the
# sample's features and its label together. ignore_index=True renumbers the
# columns, which leaves the label as the last column.
result = pd.concat([subfeatures, labels], axis=1,ignore_index=True)
# print(result)
# Drop columns that are entirely NaN, then drop every row that still has a NaN.
result = result.dropna(how='all', axis=1).dropna(how='any', axis=0)
# The original row labels are deliberately kept (no reset_index): getCopySplit
# later picks the matching rows of the second feature set by these labels.
# A former `result = result.reindex()` call was removed here; without arguments
# it only returned an unchanged copy.
# print(result)
subfeatures=result.iloc[:,:-1]
labels=result.iloc[:,-1]

# # LBP features
subfeatures2=pd.read_csv(r'F:\virtus_test\Simhash_LBP\feature_train_full.csv',header=None)
# NOTE(review): labels2 is not used anywhere in the visible cells -- confirm it is still needed.
labels2 = pd.read_csv(r'F:\virtus_test\Simhash_LBP\CNO_full.txt',header=None)


In [3]:
# m = [11,12,30,31]       # 新数据集的家族更改方案
# first = 0
# N = 0

# for i in range(len(labels)):
#     if(labels[0].get(i) in m) and (labels[0].get(i) != first):
#         N = N + 1
#         first = labels[0].get(i)
#     labels.ix[i] = labels.ix[i] - N

# Reset the accumulated accuracy; set the GIST blend weight and the number of experiments
avgscore = 0    # running sum of the per-round accuracies
P = 0.5         # weight of the first (GIST) forest's probabilities; the second forest gets 1 - P
N = 100         # number of random train/test splits to evaluate

# The print options are the same for every round, so set them once before the loop.
np.set_printoptions(threshold=10000, precision=2)

# Run N random-forest experiments
for i in range(N):
    # Hold out a random 10% of the samples as the test set
    X_train, X_test, y_train, y_test = train_test_split(subfeatures, labels, test_size=0.1)
    # Select the same rows from the second feature set. The same `labels` series is
    # passed for both feature sets, so both forests are trained on identical targets.
    X_train2, X_test2, y_train2, y_test2 = getCopySplit(subfeatures2, labels, X_train, X_test, y_train, y_test)
    print("ROUND:",i)

    # Train
    print('train...')
    # One random forest per feature set: 30 trees each, n_jobs=-1 (use all processors)
    srf = RF(n_estimators=30, n_jobs=-1)
    srf.fit(X_train, y_train)
    srf2 = RF(n_estimators=30, n_jobs=-1)
    srf2.fit(X_train2, y_train2)

    # Predict
    print("test...")
    # Build the blended class-probability matrix
    proba1 = srf.predict_proba(X_test)
    proba2 = srf2.predict_proba(X_test2)
    # The columns of predict_proba follow each forest's classes_; adding the two
    # matrices is only meaningful when both forests saw the same set of classes.
    assert np.array_equal(srf.classes_, srf2.classes_), \
        "the two forests were fitted on different class sets"
    proba = proba1*(P) + proba2*(1-P)

    # Most probable class per sample. The column index is mapped through classes_
    # instead of assuming the labels are exactly 1..K in column order. On an exact
    # tie the first maximal column is chosen.
    predictions = srf.classes_[proba.argmax(axis=1)]

    # Accuracy of this round
    score = accuracy_score(y_test, predictions,sample_weight=None)
    print(score)

    # Accumulate the score
    avgscore = avgscore + score

    # Show the result as a confusion matrix
    cm = confusion_matrix(y_test, predictions)
    print('Confusion matrix, without normalization')
    print(str(cm))




ROUND: 0
train...
test...
0.9432387312186978
Confusion matrix, without normalization
[[ 53   7   3   0   0   0   0   0   2   6]
 [  0 129   0   0   0   0   2   0   2   0]
 [  6   4  89   0   0   0   1   2   0   0]
 [  0   0   0 158   0   0   0   0   0   0]
 [  0   0   0   0 137  16   1   0   0   0]
 [  0   0   0   0   0 140   0   0   0   0]
 [  1   4   1   0   0   0 116   0   2   0]
 [  0   0   0   0   0   0   0  48   0   0]
 [  2   1   2   0   0   0   2   0 121   0]
 [  1   0   0   0   0   0   0   0   0 139]]
ROUND: 1
train...
test...
0.9449081803005008
Confusion matrix, without normalization
[[ 55   4   0   1   2   0   0   1   3   6]
 [  5 157   0   0   0   0   0   1   2   0]
 [  6   1  63   0   0   0   2   0   0   0]
 [  0   0   0 141   0   0   0   0   0   0]
 [  0   0   0   0 145  14   0   0   0   0]
 [  0   0   0   0   1 158   0   0   0   0]
 [  1   2   2   0   0   0 137   0   3   0]
 [  0   0   0   0   0   0   0  41   0   0]
 [  1   3   1   0   0   0   4   0 101   0]
 [  0   0   

test...
0.9398998330550918
Confusion matrix, without normalization
[[ 29   2   7   2   0   0   0   0   3   4]
 [  1 113   0   0   0   0   0   0   0   0]
 [  4   2  78   0   0   0   1   0   3   5]
 [  0   0   0 165   0   0   0   0   0   0]
 [  0   0   0   0 132  12   0   0   0   0]
 [  0   0   1   0   0 155   0   0   0   0]
 [  2   2   4   0   0   0 153   0   1   0]
 [  0   0   0   0   0   0   0  39   0   0]
 [  4   0   3   0   0   0   2   0 128   0]
 [  6   0   0   0   0   0   0   0   1 134]]
ROUND: 17
train...
test...
0.9390651085141903
Confusion matrix, without normalization
[[ 56   5  10   0   0   0   0   0   6   3]
 [  2 139   2   0   0   0   0   1   0   0]
 [  7   3  83   0   0   0   1   0   1   3]
 [  0   0   0 146   0   0   0   0   0   0]
 [  0   0   0   0 128  13   0   0   0   0]
 [  0   0   0   0   0 139   0   0   0   0]
 [  1   1   2   0   0   0 131   0   2   0]
 [  0   0   0   0   0   0   0  42   0   0]
 [  1   0   0   0   0   0   3   0 121   1]
 [  2   0   1   0   0   0   0

test...
0.9398998330550918
Confusion matrix, without normalization
[[ 41  10   5   1   1   0   3   0   2   3]
 [  2 138   1   0   0   0   0   0   0   0]
 [  8   2  84   0   0   0   1   2   3   1]
 [  0   0   0 134   0   0   0   0   0   0]
 [  0   0   0   0 142   9   0   0   0   0]
 [  0   0   1   0   0 153   0   0   0   0]
 [  0   1   3   0   0   0 132   0   0   0]
 [  0   1   0   0   0   0   0  47   0   0]
 [  3   0   0   0   0   0   4   0 121   0]
 [  3   0   1   0   0   0   0   0   1 134]]
ROUND: 33
train...
test...
0.9373956594323873
Confusion matrix, without normalization
[[ 52   8   8   0   0   0   1   1   2   4]
 [  1 140   0   0   0   0   2   1   0   0]
 [  6   2  75   0   0   0   2   0   3   0]
 [  0   0   0 136   0   0   0   0   0   0]
 [  0   0   1   0 115  13   0   0   0   0]
 [  0   0   1   0   0 157   0   0   0   0]
 [  1   1   4   0   0   0 151   0   2   0]
 [  0   1   0   0   0   0   0  63   0   0]
 [  1   2   1   0   0   0   3   0 126   1]
 [  2   0   0   0   0   0   0

test...
0.9348914858096828
Confusion matrix, without normalization
[[ 48   4   7   2   0   0   1   1   6   4]
 [  0 114   0   0   0   0   1   0   0   0]
 [  7   2  89   0   0   0   2   0   4   4]
 [  1   0   0 149   0   0   0   0   0   0]
 [  0   0   0   0 127  10   0   0   0   0]
 [  0   0   0   0   0 146   0   0   0   0]
 [  4   1   0   0   0   0 139   0   2   0]
 [  0   2   0   0   0   0   1  45   0   0]
 [  3   0   1   0   0   0   5   0 130   0]
 [  1   0   1   1   0   0   0   0   0 133]]
ROUND: 49
train...
test...
0.9298831385642737
Confusion matrix, without normalization
[[ 46   3   3   0   0   0   4   0   4   1]
 [  0 128   1   0   0   0   3   0   0   0]
 [  6   2  69   0   0   0   4   1   0   4]
 [  0   0   0 141   0   0   0   0   0   0]
 [  0   0   0   0 131  14   0   0   0   0]
 [  1   0   2   0   0 150   0   0   0   0]
 [  3   3   4   0   0   0 137   0   1   0]
 [  0   1   1   0   0   0   0  49   1   0]
 [  1   0   2   0   0   0  11   0 122   0]
 [  2   0   1   0   0   0   0

test...
0.9390651085141903
Confusion matrix, without normalization
[[ 60   5   4   0   0   0   3   0   2   0]
 [  1 106   0   0   0   0   0   1   0   0]
 [  7   4  80   0   0   0   2   0   3   4]
 [  0   0   0 155   0   0   0   0   0   0]
 [  0   0   1   0 136  13   0   0   0   0]
 [  0   0   1   0   0 155   0   0   0   0]
 [  1   0   0   0   0   0 140   0   2   0]
 [  0   1   0   0   0   0   0  39   0   0]
 [  5   0   0   0   0   0   3   0 125   1]
 [  5   0   2   0   0   0   0   0   2 129]]
ROUND: 65
train...
test...
0.9424040066777963
Confusion matrix, without normalization
[[ 44   4   8   0   0   0   1   1   4   4]
 [  1 115   1   0   0   0   0   3   0   0]
 [  7   1  89   0   0   0   3   1   1   0]
 [  0   0   0 136   0   0   0   0   0   0]
 [  0   0   0   0 127  12   0   0   1   0]
 [  1   0   0   0   0 155   0   0   0   0]
 [  1   1   1   0   0   0 146   0   1   0]
 [  0   0   0   0   0   0   0  57   0   0]
 [  2   0   2   0   0   0   1   0 121   0]
 [  6   0   0   0   0   0   0

test...
0.9398998330550918
Confusion matrix, without normalization
[[ 39   7   2   0   0   0   0   0   4   6]
 [  1 141   0   0   0   0   2   0   0   0]
 [  6   3  76   0   0   0   0   1   1   2]
 [  0   0   0 145   0   0   0   0   0   0]
 [  0   0   0   0 133  11   0   0   0   0]
 [  1   0   1   0   0 154   0   0   0   0]
 [  2   0   4   0   0   0 137   0   4   0]
 [  0   1   0   0   0   0   1  43   0   0]
 [  2   0   2   0   0   0   5   0 115   0]
 [  3   0   0   0   0   0   0   0   0 143]]
ROUND: 81
train...
test...
0.9507512520868113
Confusion matrix, without normalization
[[ 43   5   8   1   2   1   0   0   0   5]
 [  2 141   1   0   0   0   0   0   1   0]
 [  6   3  66   0   0   0   2   0   1   1]
 [  0   0   0 160   0   0   0   0   0   0]
 [  0   0   0   0 132   7   0   0   0   0]
 [  1   0   1   0   0 156   0   0   0   0]
 [  1   1   0   0   0   0 137   0   2   0]
 [  0   0   0   0   0   0   0  49   0   0]
 [  2   0   0   0   0   0   2   0 139   0]
 [  3   0   0   0   0   0   0

test...
0.9616026711185309
Confusion matrix, without normalization
[[ 49   4   3   1   0   0   1   0   3   3]
 [  1 145   0   0   0   0   1   0   0   0]
 [  8   0  80   0   0   0   0   0   2   2]
 [  0   0   0 134   0   0   0   0   0   0]
 [  0   0   0   0 148   5   0   0   0   0]
 [  0   0   0   0   0 148   0   0   0   0]
 [  2   1   2   0   0   0 150   0   0   0]
 [  0   0   0   0   0   0   0  47   1   0]
 [  4   0   0   0   0   0   0   0 126   0]
 [  2   0   0   0   0   0   0   0   0 125]]
ROUND: 97
train...
test...
0.9424040066777963
Confusion matrix, without normalization
[[ 61   1   7   1   1   0   1   0   3   3]
 [  1 131   0   0   0   0   0   1   0   0]
 [  7   3  81   0   0   0   2   1   1   1]
 [  0   0   0 154   0   0   0   0   0   0]
 [  0   0   0   0 140  12   0   0   1   0]
 [  0   0   1   0   0 147   0   0   0   0]
 [  1   2   4   0   0   0 137   0   2   0]
 [  0   0   0   0   0   0   0  45   0   0]
 [  1   1   2   0   0   0   3   0 113   1]
 [  1   0   2   1   0   0   0

In [4]:
# Print the mean accuracy over the N rounds.
# avgscore holds the sum of the per-round accuracies when this cell starts and
# is overwritten with the mean, so running this cell again without re-running
# the experiment loop divides by N a second time.
avgscore /= N
print('avgscore....', avgscore, sep='\n')

avgscore....
0.9394991652754586
