In [1]:
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report,precision_score,recall_score
from collections import Counter
import pandas as pd
import numpy as np

''' Summary of the file.
    
    功能：
        读取特征值文件和标签文件，通过交叉验证获得训练集和测试集，以随机森林方法划分并获得准确率，通过混淆矩阵标示结果。
    
    输出：
        结果的混淆矩阵，每次划分的准确率，十次的平均准确率
'''


' Summary of the file.\n    \n    功能：\n        读取特征值文件和标签文件，通过交叉验证获得训练集和测试集，以随机森林方法划分并获得准确率，通过混淆矩阵标示结果。\n    \n    输出：\n        结果的混淆矩阵，每次划分的准确率，十次的平均准确率\n'

In [2]:
# Load the GIST feature values and the label file.
# Neither file has a header row, so columns get integer names.
subfeatures = pd.read_csv(
    r'E:\test\process_gist2\gist_f_train_full.csv',
    header=None,
)
labels = pd.read_csv(
    r'E:\test\process_gist2\CNO_full.txt',
    header=None,
)

In [3]:
# Remove columns that are entirely NaN and rows that contain any NaN.
# Features and labels are joined side by side first, so dropping a row
# removes a sample together with its label.
result = (
    pd.concat([subfeatures, labels], axis=1, ignore_index=True)
    .dropna(how='all', axis=1)   # columns with no data at all
    .dropna(how='any', axis=0)   # samples with at least one missing value
    # reindex() without arguments leaves the index as it is; reset_index
    # renumbers the surviving rows 0..n-1 after the row drop.
    .reset_index(drop=True)
)

# The labels were concatenated last, so the final column is the label and
# everything before it is a feature.
subfeatures = result.iloc[:, :-1]
labels = result.iloc[:, -1]

In [4]:
# Running totals (accuracy, recall, precision) start at zero
avgscore = recallscore = precisionscore = 0
# Number of train/test rounds
N = 10

In [5]:
# Run N random-forest train/test rounds and accumulate the scores.

# Reset the running totals in this cell so that re-running it does not add a
# second batch of scores on top of the previous one.
avgscore = 0
recallscore = 0
precisionscore = 0

# The print options are the same for every round, so set them once.
np.set_printoptions(threshold=10000, precision=2)

for i in range(N):
    # Random hold-out split: 10% of the samples are kept for testing.
    # Seeding with the round number makes each round reproducible while
    # still giving every round a different split.
    X_train, X_test, y_train, y_test = train_test_split(
        subfeatures, labels, test_size=0.1, random_state=i)

    # Train a random forest with 30 trees; n_jobs=-1 uses all processors.
    print('train...')
    srf = RF(n_estimators=30, n_jobs=-1, random_state=i)
    srf.fit(X_train, y_train)

    # Predict the held-out samples.
    print("test...")
    c_test = srf.predict(X_test)

    # Accuracy computed from the predictions already made; this is the same
    # value srf.score(X_test, y_test) returns, without predicting twice.
    score = metrics.accuracy_score(y_test, c_test)
    print(score)
    print(classification_report(y_test, c_test))

    # Accumulate accuracy and macro-averaged recall / precision.
    avgscore = avgscore + score
    recallscore = recallscore + recall_score(y_test, c_test, average="macro")
    precisionscore = precisionscore + precision_score(y_test, c_test, average="macro")
    # NOTE(review): this prints a caption only; no size value follows it.
    print(" initial test  labels size")

    # Show the raw (un-normalized) confusion matrix for this round.
    cm = confusion_matrix(y_test, c_test)
    print('Confusion matrix, without normalization')
    print(str(cm))



train...
test...
0.8272800645682001
              precision    recall  f1-score   support

           1       0.40      0.60      0.48        99
           2       0.98      0.84      0.91       133
           3       0.92      0.65      0.76       110
           4       0.56      0.82      0.67       153
           5       0.99      0.99      0.99       144
           6       0.99      0.98      0.98        96
           7       0.97      0.84      0.90       147
           8       0.97      0.97      0.97        71
           9       0.94      0.82      0.87       146
          10       0.96      0.79      0.86       140

   micro avg       0.83      0.83      0.83      1239
   macro avg       0.87      0.83      0.84      1239
weighted avg       0.87      0.83      0.84      1239

 initial test  labels size
Confusion matrix, without normalization
[[ 59   1   4  32   0   0   0   0   2   1]
 [  4 112   0  13   0   0   1   2   0   1]
 [ 17   0  71  20   0   0   0   0   1   1]
 [ 20   0

test...
0.807909604519774
              precision    recall  f1-score   support

           1       0.50      0.61      0.55       132
           2       0.98      0.82      0.90       135
           3       0.72      0.68      0.70       101
           4       0.56      0.81      0.66       141
           5       1.00      0.99      1.00       125
           6       1.00      0.95      0.97        92
           7       0.99      0.80      0.88       154
           8       0.99      1.00      0.99        67
           9       0.88      0.78      0.83       146
          10       0.84      0.76      0.80       146

   micro avg       0.81      0.81      0.81      1239
   macro avg       0.85      0.82      0.83      1239
weighted avg       0.84      0.81      0.82      1239

 initial test  labels size
Confusion matrix, without normalization
[[ 81   1  12  26   0   0   0   1   2   9]
 [  8 111   2  14   0   0   0   0   0   0]
 [ 14   0  69   9   0   0   1   0   5   3]
 [ 19   0   2 114  

In [6]:
# Report the scores averaged over the N rounds.
# NOTE: the running totals are replaced by their means here, so this cell
# must be run exactly once after the training loop; running it again divides
# the values by N a second time.
avgscore = avgscore / N
recallscore = recallscore / N
precisionscore = precisionscore / N

print('avgscore....')
print(avgscore)
# 1 - precision is the false discovery rate FP / (FP + TP); the false
# positive rate FP / (FP + TN) cannot be derived from precision alone.
print('False discovery rate')
print(1-precisionscore)
# 1 - recall is the false negative rate FN / (FN + TP).
print('false negative rate')
print(1-recallscore)


avgscore....
0.8198547215496367
False positive rate
0.13912149620781522
false negative rate
0.17229023701977053
