In [1]:
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from collections import Counter
import pandas as pd
import numpy as np

''' Summary of the file.

    功能：
        实现特征的混合分类方法：
        构造的公式是对随机森林分类后概率矩阵的相加比例：
            proba = proba1*(P) + proba2*(1-P)
        上面，proba1是由GIST特征算出来的分类概率矩阵，proba2是由lbp或dense sift算出来的分类概率矩阵。
        这个权重P是一个不确定的值，不同的混合方法会有不同的最优值。
        最后根据合成的这个概率矩阵proba获得每个样本的分类情况

        读取特征值文件和标签文件，通过交叉验证获得训练集和测试集，以随机森林方法划分并获得准确率，通过混淆矩阵标示结果。

    输出：
        结果的混淆矩阵，每次划分的准确率，N次的平均准确率
'''

' Summary of the file.\n\n    功能：\n        实现特征的混合分类方法：\n        构造的公式是对随机森林分类后概率矩阵的相加比例：\n            proba = proba1*(P) + proba2*(1-P)\n        上面，proba1是由GIST特征算出来的分类概率矩阵，proba2是由lbp或dense sift算出来的分类概率矩阵。\n        这个权重P是一个不确定的值，不同的混合方法会有不同的最优值。\n        最后根据合成的这个概率矩阵proba获得每个样本的分类情况\n\n        读取特征值文件和标签文件，通过交叉验证获得训练集和测试集，以随机森林方法划分并获得准确率，通过混淆矩阵标示结果。\n\n    输出：\n        结果的混淆矩阵，每次划分的准确率，N次的平均准确率\n'

In [2]:
# # GIST特征值
subfeatures = pd.read_csv(r'F:\virtus_test\Simhash_Gist\gist_f_train_full.csv',header=None)
labels = pd.read_csv(r'F:\virtus_test\Simhash_Gist\CNO_full.txt',header=None)
# 对全NaN列行进行清除
result = pd.concat([subfeatures, labels], axis=1,ignore_index=True)
# print(result)
result=result.dropna(how='all',axis=1)
result=result.dropna(how='any',axis=0)
result=result.reindex()
# print(result)
subfeatures=result.iloc[:,:-1]
labels=result.iloc[:,-1]


# # LBP特征值
subfeatures2=pd.read_csv(r'F:\virtus_test\Simhash_LBP\feature_train_full.csv',header=None)
labels2 = pd.read_csv(r'F:\virtus_test\Simhash_LBP\CNO_full.txt',header=None)


# 合并两条特征
subfeatures2.columns = range(len(subfeatures.columns),len(subfeatures.columns)+len(subfeatures2.columns))
subfeatures[-1] = subfeatures.index;
subfeatures2[-1] = subfeatures2.index;
subfeatures = pd.merge(subfeatures,subfeatures2,on=-1)
subfeatures = subfeatures.drop(-1,axis=1)



In [3]:
print(subfeatures)

            0         1         2         3         4         5         6    \
0      0.000011  0.000129  0.000012  0.000013  0.000190  0.011638  0.000046   
1      0.085982  0.040875  0.021009  0.007505  0.091562  0.046841  0.023623   
2      0.047440  0.016571  0.000198  0.000226  0.029116  0.015923  0.000131   
3      0.048793  0.000449  0.000058  0.000162  0.046696  0.007090  0.000088   
4      0.090414  0.069236  0.015272  0.027969  0.108240  0.064971  0.028188   
5      0.105230  0.055067  0.045520  0.043418  0.146050  0.082876  0.059477   
6      0.106120  0.052204  0.025907  0.030567  0.106140  0.064704  0.054260   
7      0.021522  0.000341  0.000092  0.005975  0.018221  0.012244  0.000068   
8      0.000052  0.000663  0.000042  0.000023  0.000109  0.011774  0.000069   
9      0.073294  0.031559  0.000907  0.001161  0.033077  0.018886  0.000125   
10     0.034921  0.000640  0.008101  0.006527  0.045931  0.007843  0.000342   
11     0.054843  0.017804  0.021770  0.000366  0.026

In [4]:
# 平均准确率归零 实验次数
avgscore = 0
N = 100

# 进行N次随机森林测试
for i in range(N):
    # 以10%的比例进行交叉验证
    X_train, X_test, y_train, y_test = train_test_split(subfeatures, labels, test_size=0.1)
    print("ROUND:", i)
    print("train")

    # 合并两条相对应的特征


    # 进行训练
    print('train...')
    # 进行随机森林训练,30课树，不限制进程数，为两个混合的特征集各自产生一个随机森林
    srf = RF(n_estimators=30, n_jobs=-1)
    srf.fit(X_train, y_train)


    # 预试
    print("test...")
    c_test = srf.predict(X_test)

    # 计算预测划分准确率
    score = srf.score(X_test, y_test)
    print(score)

    # 累加分数
    avgscore = avgscore + score
    print(" initial test  labels size")

    # 通过混淆矩阵进行结果标示
    cm = confusion_matrix(y_test, c_test)
    np.set_printoptions(threshold=10000)
    np.set_printoptions(precision=2)
    print('Confusion matrix, without normalization')
    print(str(cm))




ROUND: 0
train
train...
test...
0.9248747913188647
 initial test  labels size
Confusion matrix, without normalization
[[ 57   5   5   0   1   0   2   1   2   1]
 [  4 118   2   0   0   0   0   1   0   0]
 [  8   1  90   0   0   0   2   1   5   2]
 [  0   0   0 159   0   0   0   0   0   0]
 [  0   0   0   0 139  18   1   0   0   0]
 [  0   0   1   0   0 136   0   0   0   0]
 [  1   0   0   0   0   0 145   0   4   1]
 [  0   0   0   0   0   0   0  42   0   0]
 [  4   1   2   1   0   0   3   0  97   0]
 [  7   0   1   0   2   0   0   0   0 125]]
ROUND: 1
train
train...
test...
0.9440734557595993
 initial test  labels size
Confusion matrix, without normalization
[[ 54   6   2   1   0   1   1   1   4   0]
 [  1 121   1   0   0   0   0   0   0   0]
 [  5   1  73   0   0   0   1   0   5   1]
 [  0   0   0 144   0   0   0   0   0   0]
 [  0   0   0   0 141  14   0   0   0   0]
 [  0   0   2   0   0 155   0   0   0   1]
 [  2   1   0   0   0   0 139   0   1   0]
 [  0   0   0   0   0   0   0  4

test...
0.9415692821368948
 initial test  labels size
Confusion matrix, without normalization
[[ 42   4  10   0   0   0   0   0   2   6]
 [  3 120   1   0   0   0   0   1   1   0]
 [  3   2  77   0   0   0   3   1   3   1]
 [  0   0   0 164   0   0   0   0   0   0]
 [  0   0   0   0 138   8   0   0   0   0]
 [  0   0   0   0   0 141   0   0   0   0]
 [  1   0   2   0   1   0 142   0   4   0]
 [  0   0   0   0   0   0   0  49   0   0]
 [  3   0   1   0   0   0   4   0 134   1]
 [  2   0   1   0   0   0   0   0   1 121]]
ROUND: 16
train
train...
test...
0.9382303839732888
 initial test  labels size
Confusion matrix, without normalization
[[ 53   4   8   0   0   1   0   0   1   4]
 [  3 126   0   0   0   0   0   0   0   0]
 [  7   2  77   0   0   0   2   0   3   3]
 [  0   0   0 126   0   0   0   0   0   0]
 [  0   0   0   0 137   9   0   0   0   0]
 [  1   0   0   0   0 174   0   0   0   0]
 [  3   4   3   0   0   0 148   0   3   0]
 [  0   0   0   0   0   0   0  30   0   0]
 [  1   1   

test...
0.9307178631051753
 initial test  labels size
Confusion matrix, without normalization
[[ 49   3  12   0   0   0   0   0   3   7]
 [  2 130   1   0   0   0   2   0   0   0]
 [  5   2  91   0   0   0   1   1   1   2]
 [  0   0   0 143   0   0   0   0   0   0]
 [  0   0   1   0 148  11   0   0   0   0]
 [  0   0   1   0   0 131   0   0   0   0]
 [  3   4   1   0   0   0 139   0   4   0]
 [  0   1   0   0   0   0   0  46   1   0]
 [  5   0   0   0   0   0   3   0 114   1]
 [  3   0   1   0   0   0   1   0   0 124]]
ROUND: 31
train
train...
test...
0.9398998330550918
 initial test  labels size
Confusion matrix, without normalization
[[ 49   3   8   0   0   0   1   0   4   5]
 [  5 108   1   0   0   0   0   1   0   0]
 [  8   3  76   0   0   0   2   1   0   1]
 [  0   0   0 159   0   0   0   0   0   0]
 [  0   0   0   0 136   3   0   0   0   0]
 [  0   0   0   0   2 162   0   0   0   0]
 [  2   0   1   0   0   0 137   0   1   0]
 [  0   1   0   0   0   0   0  47   0   0]
 [  3   1   

test...
0.9357262103505843
 initial test  labels size
Confusion matrix, without normalization
[[ 46   2   9   0   1   0   0   1   2   5]
 [  0 129   1   0   0   0   2   3   1   0]
 [  8   1  77   0   0   0   2   0   0   3]
 [  1   0   0 149   0   0   0   0   0   0]
 [  0   0   0   0 134  11   0   0   0   0]
 [  0   0   2   0   1 166   0   0   0   0]
 [  1   0   3   0   0   0 142   0   2   1]
 [  0   0   0   0   0   0   0  43   0   0]
 [  3   1   3   0   0   0   4   0 117   0]
 [  2   0   1   0   0   0   0   0   0 118]]
ROUND: 46
train
train...
test...
0.9415692821368948
 initial test  labels size
Confusion matrix, without normalization
[[ 57   7   8   1   0   0   2   0   1   3]
 [  1 129   0   0   0   0   0   3   0   0]
 [  7   0  74   0   1   1   0   0   0   0]
 [  0   0   0 137   0   0   0   0   0   0]
 [  0   0   0   0 135  12   0   0   0   0]
 [  0   0   1   0   1 136   0   0   0   0]
 [  2   2   1   0   0   0 131   0   1   0]
 [  0   0   0   0   0   0   1  51   0   0]
 [  4   0   

test...
0.9407345575959933
 initial test  labels size
Confusion matrix, without normalization
[[ 49   3   3   1   0   1   3   0   3   4]
 [  1 127   0   0   0   0   1   0   0   0]
 [  7   0  76   1   0   0   2   0   2   3]
 [  0   0   0 137   0   0   0   0   0   0]
 [  0   0   0   0 129  15   0   0   0   0]
 [  0   0   1   0   0 150   0   0   0   0]
 [  2   0   3   0   0   0 158   0   2   1]
 [  0   0   0   0   0   0   0  45   0   0]
 [  1   1   5   0   0   0   2   0 133   1]
 [  2   0   0   0   0   0   0   0   0 123]]
ROUND: 61
train
train...
test...
0.9398998330550918
 initial test  labels size
Confusion matrix, without normalization
[[ 49  10   3   0   0   0   0   0   4   2]
 [  2 113   1   0   0   0   0   0   0   0]
 [ 10   2  76   0   0   0   1   1   0   2]
 [  0   0   0 148   0   0   0   0   0   0]
 [  0   0   0   0 148   8   0   0   0   0]
 [  0   0   0   0   0 138   0   0   0   0]
 [  2   0   3   0   0   0 142   0   1   1]
 [  0   0   0   0   0   0   0  59   0   0]
 [  4   0   

test...
0.9373956594323873
 initial test  labels size
Confusion matrix, without normalization
[[ 41   6   5   0   0   0   1   2   6   4]
 [  0 136   0   0   0   0   1   0   1   0]
 [  4   3  79   0   0   1   0   2   3   3]
 [  0   0   0 150   0   0   0   0   0   0]
 [  0   0   0   0 122  11   0   0   0   0]
 [  0   0   0   0   1 133   0   0   0   0]
 [  2   2   0   0   0   0 155   0   3   0]
 [  0   0   0   0   0   0   0  48   0   0]
 [  3   1   0   0   0   0   5   0 129   2]
 [  2   0   1   0   0   0   0   0   0 130]]
ROUND: 76
train
train...
test...
0.9365609348914858
 initial test  labels size
Confusion matrix, without normalization
[[ 54   5   5   0   0   1   2   1   3   4]
 [  1 135   0   0   0   0   2   0   0   0]
 [  4   1  86   0   0   0   1   2   2   2]
 [  0   0   0 125   0   0   0   0   0   0]
 [  0   0   0   0 142  13   0   0   0   0]
 [  0   0   1   0   0 149   0   0   0   0]
 [  2   2   1   0   0   0 128   0   5   0]
 [  0   0   0   0   0   0   1  43   0   0]
 [  4   0   

test...
0.9457429048414023
 initial test  labels size
Confusion matrix, without normalization
[[ 60   5   6   0   0   1   0   0   2   2]
 [  2 129   1   0   0   0   0   0   1   0]
 [  5   2  82   0   0   0   0   0   2   2]
 [  0   0   0 141   0   0   0   0   0   0]
 [  0   0   0   0 143  13   0   0   0   0]
 [  0   0   1   0   0 143   0   0   0   0]
 [  2   1   0   0   0   0 140   0   1   0]
 [  0   1   0   0   0   0   0  49   0   0]
 [  4   0   2   0   0   0   5   0 130   0]
 [  3   0   1   0   0   0   0   0   0 116]]
ROUND: 91
train
train...
test...
0.9373956594323873
 initial test  labels size
Confusion matrix, without normalization
[[ 49   3   4   1   0   1   2   0   3   5]
 [  1 117   0   0   0   0   0   0   2   0]
 [  4   1  73   0   1   0   2   1   3   2]
 [  0   0   0 133   0   0   0   0   0   0]
 [  0   0   1   0 152  11   0   0   0   0]
 [  0   0   0   0   0 167   0   0   0   0]
 [  1   1   1   0   0   0 124   0   5   0]
 [  0   1   0   0   0   0   0  51   1   0]
 [  2   1   

In [5]:
# 输出N次的平均准确率
avgscore = avgscore / N
print('avgscore....')
print(avgscore)

avgscore....
0.9377963272120197
