In [1]:
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report,precision_score,recall_score
from collections import Counter
import pandas as pd
import numpy as np

''' Summary of the file.
    
    功能：
        读取特征值文件和标签文件，通过交叉验证获得训练集和测试集，以随机森林方法划分并获得准确率，通过混淆矩阵标示结果。
    
    输出：
        结果的混淆矩阵，每次划分的准确率，十次的平均准确率
'''


' Summary of the file.\n    \n    功能：\n        读取特征值文件和标签文件，通过交叉验证获得训练集和测试集，以随机森林方法划分并获得准确率，通过混淆矩阵标示结果。\n    \n    输出：\n        结果的混淆矩阵，每次划分的准确率，十次的平均准确率\n'

In [2]:
# GIST特征值
subfeatures = pd.read_csv(r'F:\virtus_test\Simhash_Gist\gist_f_train_full.csv',header=None)
labels = pd.read_csv(r'F:\virtus_test\Simhash_Gist\CNO_full.txt',header=None)


In [3]:
# 对全NaN列行进行清除
result = pd.concat([subfeatures, labels], axis=1,ignore_index=True)
# print(result)
result=result.dropna(how='all',axis=1)
result=result.dropna(how='any',axis=0)
result=result.reindex()
# print(result)
subfeatures=result.iloc[:,:-1]
labels=result.iloc[:,-1]
# print(subfeatures,'\n',labels)

In [4]:
# 平均准确率归零
avgscore = 0
recallscore = 0
precisionscore = 0
N = 10

In [5]:
# 进行十次随机森林测试
for i in range(N):
    # 以10%的比例进行交叉验证
    # X_train, X_test, y_train, y_test = cross_validation.train_test_split(subfeatures,features_labels,test_size=0.1)
    X_train, X_test, y_train, y_test = train_test_split(subfeatures, labels, test_size=0.1)

    # 进行训练
    print('train...')
    # 进行随机森林训练,30课树，不限制进程数
    srf = RF(n_estimators=30, n_jobs=-1)
    srf.fit(X_train, y_train)

    # 预试
    print("test...")
    c_test = srf.predict(X_test)


    # 计算预测划分准确率
    score = srf.score(X_test, y_test)
    print(score)
    print(classification_report(y_test,c_test))
    # print("c_test")
    # print(c_test)
    # print('y_test')
    # print(y_test)

    avgscore = avgscore + score
    recallscore = recallscore + recall_score(y_test,c_test,average="macro")
    precisionscore = precisionscore + precision_score(y_test,c_test,average="macro")
    print(" initial test  labels size")

    # 通过混淆矩阵进行结果标示
    cm = confusion_matrix(y_test, c_test)
    np.set_printoptions(threshold=10000)
    np.set_printoptions(precision=2)
    print('Confusion matrix, without normalization')
    print(str(cm))



train...
test...
0.9382303839732888
              precision    recall  f1-score   support

           1       0.81      0.72      0.76        76
           2       0.96      0.96      0.96       143
           3       0.86      0.86      0.86        86
           4       1.00      1.00      1.00       147
           5       1.00      0.89      0.94       162
           6       0.89      0.99      0.94       149
           7       0.97      0.95      0.96       152
           8       0.98      0.98      0.98        46
           9       0.90      0.95      0.92       129
          10       0.93      0.99      0.96       108

   micro avg       0.94      0.94      0.94      1198
   macro avg       0.93      0.93      0.93      1198
weighted avg       0.94      0.94      0.94      1198

 initial test  labels size
Confusion matrix, without normalization
[[ 55   3   7   0   0   0   1   0   6   4]
 [  5 137   0   0   0   0   0   1   0   0]
 [  2   0  74   0   0   0   2   0   5   3]
 [  0   0

test...
0.9382303839732888
              precision    recall  f1-score   support

           1       0.76      0.72      0.74        79
           2       0.92      0.95      0.93       125
           3       0.92      0.76      0.83       105
           4       0.99      1.00      0.99       160
           5       0.98      0.95      0.97       136
           6       0.96      0.99      0.97       156
           7       0.97      0.96      0.96       138
           8       0.91      0.98      0.94        43
           9       0.90      0.98      0.94       119
          10       0.96      0.97      0.96       137

   micro avg       0.94      0.94      0.94      1198
   macro avg       0.93      0.93      0.93      1198
weighted avg       0.94      0.94      0.94      1198

 initial test  labels size
Confusion matrix, without normalization
[[ 57   8   5   1   1   0   1   0   3   3]
 [  1 119   0   0   0   0   1   3   1   0]
 [ 12   2  80   0   0   0   1   1   6   3]
 [  0   0   0 160 

In [6]:
# 输出N次的平均准确率
avgscore = avgscore / N
recallscore = recallscore / N
precisionscore = precisionscore / N

print('avgscore....')
print(avgscore)
print('False positive rate')
print(1-precisionscore)
print('false negative rate')
print(1-recallscore)


avgscore....
0.9389816360601001
False positive rate
0.0730382907721916
false negative rate
0.07193824417541106
