In [1]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, recall_score, precision_score
from collections import Counter
import pandas as pd
import numpy as np
''' Summary of the file.

    功能：
        读取特征值文件和标签文件，通过交叉验证获得训练集和测试集，以KNN方法划分并获得准确率，通过混淆矩阵标示结果。

    输出：
        结果的混淆矩阵，每次划分的准确率，十次的平均准确率
'''



' Summary of the file.\n\n    功能：\n        读取特征值文件和标签文件，通过交叉验证获得训练集和测试集，以KNN方法划分并获得准确率，通过混淆矩阵标示结果。\n\n    输出：\n        结果的混淆矩阵，每次划分的准确率，十次的平均准确率\n'

In [2]:
# 实验室的15000条恶意样本，分为十个家族

# # GIST特征值
subfeatures = pd.read_csv(r'F:\virtus_test\Simhash_Gist\gist_f_train_full.csv',header=None)
labels = pd.read_csv(r'F:\virtus_test\Simhash_Gist\CNO_full.txt',header=None)

# # text段代码提取二进制代码，生成图像的LBP特征
# subfeatures=pd.read_csv('/home/stack/Data/Output/lbp_only_text/feature_train_full.csv',header=None)
# labels = pd.read_csv('/home/stack/Data/Output/lbp_only_text/CNO_full.txt',header=None)

# text段 GIST特征
# #subfeatures=pd.read_csv(r'E:\test\process_gist\gist_f_train_full.csv',header=None)
# #labels = pd.read_csv(r'E:\test\process_gist\CNO_full.txt',header=None)


In [3]:
# 对全NaN列行进行清除
result = pd.concat([subfeatures, labels], axis=1,ignore_index=True)
# print(result)
result=result.dropna(how='all',axis=1)
result=result.dropna(how='any',axis=0)
result=result.reindex()
# print(result)
subfeatures=result.iloc[:,:-1]
labels=result.iloc[:,-1]
print(subfeatures,'\n',labels)

            0         1         2         3         4         5         6    \
1      0.000011  0.000129  0.000012  0.000013  0.000190  0.011638  0.000046   
3      0.085982  0.040875  0.021009  0.007505  0.091562  0.046841  0.023623   
5      0.047440  0.016571  0.000198  0.000226  0.029116  0.015923  0.000131   
8      0.048793  0.000449  0.000058  0.000162  0.046696  0.007090  0.000088   
9      0.090414  0.069236  0.015272  0.027969  0.108240  0.064971  0.028188   
10     0.105230  0.055067  0.045520  0.043418  0.146050  0.082876  0.059477   
12     0.106120  0.052204  0.025907  0.030567  0.106140  0.064704  0.054260   
14     0.021522  0.000341  0.000092  0.005975  0.018221  0.012244  0.000068   
17     0.000052  0.000663  0.000042  0.000023  0.000109  0.011774  0.000069   
21     0.073294  0.031559  0.000907  0.001161  0.033077  0.018886  0.000125   
23     0.034921  0.000640  0.008101  0.006527  0.045931  0.007843  0.000342   
24     0.054843  0.017804  0.021770  0.000366  0.026

In [4]:
# 平均准确率归零
avgscore = 0
recallscore = 0
precisionscore = 0
N = 10

# 进行十次KNN测试
for i in range(N):
    # 以10%的比例进行交叉验证
    #X_train, X_test, y_train, y_test = cross_validation.train_test_split(subfeatures, features_labels, test_size=0.1)
    X_train, X_test, y_train, y_test = train_test_split(subfeatures, labels, test_size=0.1)

    # 进行训练
    print('train...')
    # 进行KNN训练,距离为1
    neigh = KNeighborsClassifier(n_neighbors=1)
    neigh.fit(X_train, y_train)

    # 预试
    print('test...')
    c_test = neigh.predict(X_test)

    # print(y_test)
    # print(c_test)

    # 计算预测划分准确率
    print('accurary...')
    score = neigh.score(X_test, y_test)
    print(classification_report(y_test,c_test))
    avgscore = avgscore + score
    recallscore = recallscore + recall_score(y_test,c_test,average="macro")
    precisionscore = precisionscore + precision_score(y_test,c_test,average="macro")
    print(score)

    # 通过混淆矩阵进行结果标示
    cm = confusion_matrix(y_test, c_test)
    np.set_printoptions(threshold=10000)
    np.set_printoptions(precision=2)
    print('Confusion matrix, without normalization')
    print(str(cm))


train...
test...
accurary...
              precision    recall  f1-score   support

           1       0.75      0.71      0.73        63
           2       0.94      0.96      0.95       125
           3       0.88      0.82      0.85        97
           4       0.99      1.00      1.00       168
           5       1.00      0.93      0.97       152
           6       0.94      1.00      0.97       152
           7       0.94      0.93      0.94       128
           8       0.91      1.00      0.95        48
           9       0.95      0.97      0.96       130
          10       0.97      0.98      0.97       135

   micro avg       0.94      0.94      0.94      1198
   macro avg       0.93      0.93      0.93      1198
weighted avg       0.94      0.94      0.94      1198

0.9449081803005008
Confusion matrix, without normalization
[[ 45   5   5   0   0   0   1   3   2   2]
 [  3 120   1   0   0   0   1   0   0   0]
 [  7   1  80   1   0   0   3   1   2   2]
 [  0   0   0 168   0   

test...
accurary...
              precision    recall  f1-score   support

           1       0.79      0.76      0.77        87
           2       0.95      0.94      0.94       125
           3       0.87      0.92      0.89        83
           4       0.99      1.00      1.00       169
           5       0.99      0.92      0.96       154
           6       0.90      1.00      0.95       121
           7       0.93      0.95      0.94       149
           8       1.00      1.00      1.00        46
           9       0.95      0.93      0.94       140
          10       0.97      0.95      0.96       124

   micro avg       0.94      0.94      0.94      1198
   macro avg       0.93      0.94      0.93      1198
weighted avg       0.94      0.94      0.94      1198

0.9398998330550918
Confusion matrix, without normalization
[[ 66   6   5   0   0   1   1   0   5   3]
 [  6 117   1   0   0   0   1   0   0   0]
 [  4   0  76   1   0   0   0   0   1   1]
 [  0   0   0 169   0   0   0   0

In [5]:
# 输出N次的平均准确率
avgscore = avgscore / N
recallscore = recallscore / N
precisionscore = precisionscore / N

print('avgscore....')
print(avgscore)
print('False positive rate')
print(1-precisionscore)
print('false negative rate')
print(1-recallscore)


avgscore....
0.9356427378964941
False positive rate
0.0739616897089459
false negative rate
0.07296649520946463
