In [1]:
#-*-coding:utf-8-*-
import itertools as iters
from time import time
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.cross_validation import KFold
from sklearn.feature_selection import f_classif
from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC
import matplotlib.pylab as plt
from math import log2 
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import accuracy_score
from sklearn import metrics

def entropy(data):
    """Return the Shannon entropy, in bits, of the values in ``data``.

    Parameters
    ----------
    data : sized iterable of hashable values
        The observations (e.g. class labels) whose empirical distribution
        is measured.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the distinct values of ``data``.
        An empty input yields ``0``.
    """
    # Local import keeps this block self-contained; Counter replaces the
    # former try/bare-except counting idiom, which hid unrelated errors.
    from collections import Counter

    length = len(data)
    counts = Counter(data)
    # Counter keeps first-seen order, so terms are summed in the same order
    # as a hand-built dict would give.
    return sum([-d/length*log2(d/length) for d in counts.values()])

def informationgain(data,label):
    """Compute, for every column of ``data``, the best binary-split information gain.

    For each feature every distinct value ``a`` is tried as a threshold; the
    samples are split into ``feature >= a`` and the rest, and the
    size-weighted entropy of the labels in the two groups is computed.  The
    gain of the feature is the label entropy minus the smallest such
    conditional entropy over all thresholds.

    Parameters
    ----------
    data : 2-D array indexable as ``data[:, j]`` with a ``shape`` attribute
        One row per sample, one column per feature.
    label : sequence
        Class label of each row of ``data`` (same length as the rows).

    Returns
    -------
    list of float
        One information-gain value per column, in column order.

    Notes
    -----
    The entropy of ``label`` is printed as a side effect.
    """
    gains = []
    la = entropy(label)
    print(la)
    total = len(label)
    for j in range(data.shape[1]):
        feature = data[:,j]
        # Track the minimum over ALL thresholds of this feature.  Starting
        # from the unsplit entropy means a feature with no useful split
        # (or no samples at all) gets a gain of exactly 0.
        best = la
        for a in set(feature):
            op = []
            ne = []
            for value, lab in zip(feature, label):
                if value >= a:
                    op.append(lab)
                else:
                    ne.append(lab)
            if len(op) == 0 or len(ne) == 0:
                # Degenerate split: nothing is learned, entropy stays at `la`.
                continue
            conditional = len(op)*entropy(op)/total + len(ne)*entropy(ne)/total
            if conditional < best:
                best = conditional
        gains.append(la-best)
    return gains

## Compute gapped-dipeptide composition features
def statisPsi(seqs, protein, gap):
    """Return the normalised gapped-dipeptide frequency vector of ``protein``.

    A gapped dipeptide is the pair ``protein[i] + protein[i + gap + 1]``.
    Every such pair is counted in the slot given by its position in ``seqs``
    and the counts are divided by their total.

    Parameters
    ----------
    seqs : sequence of str
        The dipeptide vocabulary; the output has one entry per element.
    protein : str
        The residue sequence to analyse.
    gap : int
        Number of residues skipped between the two members of a pair.

    Returns
    -------
    numpy.ndarray
        Float vector of length ``len(seqs)``.  It sums to 1 when at least one
        pair was counted; it is all zeros when ``protein`` is too short to
        contain any pair (instead of the NaN vector a 0/0 division gives).

    Raises
    ------
    ValueError
        If a pair found in ``protein`` is not present in ``seqs``.
    """
    # Build the lookup once instead of scanning `seqs` for every pair.
    # setdefault keeps the first position of a repeated entry, which is what
    # list.index would return.
    position = {}
    for idx, name in enumerate(seqs):
        position.setdefault(name, idx)

    psi = np.zeros(len(seqs))
    offset = gap + 1
    for start in range(len(protein) - offset):
        dipeptide = protein[start] + protein[start + offset]
        try:
            psi[position[dipeptide]] += 1
        except KeyError:
            # Same exception type callers got from list.index.
            raise ValueError('%r is not in list' % dipeptide) from None

    total = psi.sum()
    if total == 0:
        # No pairs were counted: avoid 0/0 and return the zero vector.
        return psi
    return psi / total


# Build the gapped-dipeptide feature (psi) matrix, one row per sequence
def all_psi(dataset,gap):
    """Stack the gapped-dipeptide vectors of all sequences into one matrix.

    Parameters
    ----------
    dataset : sequence of str
        The sequences to encode.
    gap : int
        Gap forwarded unchanged to ``statisPsi``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(dataset), len(DIPEPTIDE))``; row ``i`` is
        ``statisPsi(DIPEPTIDE, dataset[i], gap)``.

    Notes
    -----
    Reads the module-level ``DIPEPTIDE`` vocabulary, which must be defined
    before this function is called.
    """
    n_rows = len(dataset)
    n_cols = len(DIPEPTIDE)
    matrix = np.zeros((n_rows, n_cols))
    row = 0
    while row < n_rows:
        matrix[row] = statisPsi(DIPEPTIDE, dataset[row], gap)
        row += 1
    return matrix

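# 10-fold cross-validation of an RBF-kernel SVM whose C and gamma are chosen by grid search on each training fold; prints and returns accuracy, per-class recall and MCC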
def SVM_10fold(data,label):
    """Evaluate an RBF-kernel SVM with 10-fold cross-validation.

    The features are standardised, the samples are split into 10 shuffled
    folds, and for every fold a grid search (5-fold CV on the training part)
    picks ``C`` and ``gamma``; the refitted best model then predicts the
    held-out fold.  A summary is printed and the metrics are returned.

    Parameters
    ----------
    data : numpy.ndarray, 2-D
        Feature matrix, one row per sample.
    label : numpy.ndarray, 1-D
        Class labels; ``1`` is counted as the positive class and ``0`` as
        the negative class.

    Returns
    -------
    tuple
        ``(accuracy, positive_recall, negative_recall, MCC)`` where
        ``accuracy`` is the mean of the per-fold accuracies, the recalls are
        pooled over all folds, and ``MCC`` is the Matthews correlation
        coefficient of the pooled test predictions.  A recall is ``nan``
        when its class does not occur in ``label``.
    """
    n_folds = 10

    # Standardise every column.
    # NOTE(review): the scaler is fitted on ALL samples before splitting, so
    # held-out folds influence the scaling - confirm this is acceptable.
    scaler = preprocessing.StandardScaler().fit(data)
    data = scaler.transform(data)
    # No random_state is passed, so the fold assignment differs on every call.
    kf = KFold(data.shape[0],n_folds=n_folds,shuffle = True)

    C_range = np.logspace(15, 5, 11, base=2)
    gamma_range = np.logspace(-15, -25, 11, base=2)
    param_grid = dict(gamma=gamma_range, C=C_range)
    # Grid search with an inner 5-fold cross-validation.
    grid = GridSearchCV(SVC(kernel='rbf'), param_grid=param_grid, cv=5)

    precision_sum = 0.0
    TPCount = 0
    TNCount = 0
    test = []
    test_true = []

    for train_index, test_index in kf:
        grid = grid.fit(data[train_index],label[train_index])
        # With the default refit=True the search already holds the best model
        # refitted on the whole training fold; no second fit is needed.
        clf = grid.best_estimator_
        pred_test = np.asarray(clf.predict(data[test_index]))
        testLabel = np.asarray(label[test_index])

        test = test + pred_test.tolist()
        test_true = test_true + testLabel.tolist()

        # Correct positives / negatives on the held-out fold.
        TP = int(np.sum((pred_test == 1) & (testLabel == 1)))
        TN = int(np.sum((pred_test == 0) & (testLabel == 0)))
        TPCount = TPCount + TP
        TNCount = TNCount + TN

        precision_sum = precision_sum + (TP + TN) * 1.0 / len(testLabel)

    precision_average = precision_sum / n_folds

    # Class sizes come from the labels themselves instead of being hard-coded
    # for one particular dataset.
    all_labels = np.asarray(label)
    n_positive = int(np.sum(all_labels == 1))
    n_negative = int(np.sum(all_labels == 0))
    positiveReca1 = TPCount * 1.0 / n_positive if n_positive else float('nan')
    negtiveReca1 = TNCount * 1.0 / n_negative if n_negative else float('nan')
    MCC = matthews_corrcoef(test_true,test)


    print (u'SVM的测试集结果：')
    print (TPCount,TNCount)
    print (u'准确率为' + str(precision_average))
    print (u'正类召回率为' + str(positiveReca1))
    print (u'负类召回率为' + str(negtiveReca1))
    print(u'MCC为' + str(MCC))
    print(accuracy_score(test_true,test))

    return precision_average,positiveReca1,negtiveReca1,MCC

if __name__ == '__main__':
    # Script section: read the two FASTA-style files, encode every sequence
    # as gapped-dipeptide frequencies, rank the features by information gain
    # and repeatedly evaluate an SVM on the top-ranked ones.
    # t0 = time()
    path = r''

    # Tunables that were previously bare literals.
    N_TOP_FEATURES = 189   # number of best-ranked features fed to the SVM
    N_REPEATS = 200        # number of independent 10-fold CV runs

    # Keep every non-header, non-blank line as one sequence.  Blank lines are
    # skipped because an empty sequence has no dipeptides to count.
    # NOTE(review): this assumes one sequence per line - confirm the files
    # do not wrap sequences over several lines.
    with open('virion.txt', "r") as file:
        train_tdata = [line.strip() for line in file
                       if line.strip() and not line.startswith('>')]
    with open( 'non-virion.txt', "r") as file:
        train_fdata = [line.strip() for line in file
                       if line.strip() and not line.startswith('>')]

    SAA = ('ACDEFGHIKLMNPQRSTVWY')
    print('1')
    # All ordered pairs of the 20 residue letters (400 dipeptides).
    DIPEPTIDE = [''.join(dipeptide) for dipeptide in iters.product(SAA, repeat=2)]

    # 1 for every sequence of the first file, 0 for every one of the second.
    # Built directly as an array: Series.as_matrix() no longer exists in
    # current pandas.
    label = np.array([1] * len(train_tdata) + [0] * len(train_fdata))

    ## Turn the sequences into feature vectors
    gap = 1

    gap_T = all_psi(train_tdata,gap)
    gap_F = all_psi(train_fdata,gap)

    # Stack positives above negatives, matching the order of `label`.
    # np.vstack is used because np.row_stack was removed in NumPy 2.0.
    dataAll = np.vstack((gap_T, gap_F))
    assert dataAll.shape[0] == len(label)
    # print(data.shape)
    ## Save the features so they can be reloaded directly next time
    # np.savetxt('E:\Python Program\Protein\data\gapAll\\virion_gap0_dipe2.csv', data, delimiter=',')
    ## Load previously saved features
    # dataAll = pd.read_csv(path = '\\virion_gap0_dipe2.csv',header=None).as_matrix()


    print(dataAll.shape)
    ## Score every feature (information gain)
    f = informationgain(dataAll, label)

    # Sparse features can produce NaN scores; count them, then replace by 0.
    a = np.array(f)
    nan_count = np.sum(a != a)
    print(nan_count)
    f1 = np.nan_to_num(a)
    # Rank features from highest to lowest score.  The NaN-free scores must
    # be used here: argsort places NaN last, so after the reversal NaN
    # features would otherwise be ranked FIRST.
    f_order = np.argsort(f1).tolist()[::-1]

    # Keep the top-ranked features and evaluate the model repeatedly.
    data = dataAll[:,f_order[0:N_TOP_FEATURES]]
    acc = []
    pr = []
    nr = []
    MCC = []
    for cishu in range(N_REPEATS):
        presion,positiveRecall,negtiveRecall,matthews = SVM_10fold(data,label)
        acc.append(presion)
        pr.append(positiveRecall)
        nr.append(negtiveRecall)
        MCC.append(matthews)

    ### To pick the best feature subset one normally sweeps the subset size;
    ### running, saving and plotting that sweep looks like this:
    #     p = []
    #     for i in range(1,dataAll.shape[1]+1):
    #         data = dataAll[:,f_order[0:i]]
    #         print(data.shape)
    #         presion, positiveRecall, negtiveRecall, matthews = SVM_10fold(data, label)
    #         p.append(presion)
    #     # print(SVM_10fold(data,label))
    #     np.savetxt(str(gap)+'informationgainstatisPsi.csv',p,delimiter=',')
    #     plt.plot(range(len(p)),p)
    #     plt.savefig(str(gap)+'informationgainstatisPsi.pdf')
    #     plt.close()



1
(307, 400)
0.9070532598287184
0
SVM的测试集结果：
70 186
准确率为0.8341935483870969
正类召回率为0.7070707070707071
负类召回率为0.8942307692307693
MCC为0.613519528877
0.833876221498
SVM的测试集结果：
69 187
准确率为0.8336559139784946
正类召回率为0.696969696969697
负类召回率为0.8990384615384616
MCC为0.611998486045
0.833876221498
SVM的测试集结果：
68 187
准确率为0.8305376344086021
正类召回率为0.6868686868686869
负类召回率为0.8990384615384616
MCC为0.603607755688
0.830618892508
SVM的测试集结果：
70 188
准确率为0.8404301075268817
正类召回率为0.7070707070707071
负类召回率为0.9038461538461539
MCC为0.627307174647
0.840390879479
SVM的测试集结果：
63 185
准确率为0.8082795698924732
正类召回率为0.6363636363636364
负类召回率为0.8894230769230769
MCC为0.547284955116
0.807817589577
SVM的测试集结果：
68 185
准确率为0.8239784946236559
正类召回率为0.6868686868686869
负类召回率为0.8894230769230769
MCC为0.58985342547
0.824104234528
SVM的测试集结果：
73 183
准确率为0.833763440860215
正类召回率为0.7373737373737373
负类召回率为0.8798076923076923
MCC为0.618836518278
0.833876221498
SVM的测试集结果：
68 185
准确率为0.8243010752688174
正类召回率为0.6868686868686869
负类召回率为0.8894230769230769
MCC

SVM的测试集结果：
63 184
准确率为0.8046236559139786
正类召回率为0.6363636363636364
负类召回率为0.8846153846153846
MCC为0.54037908716
0.804560260586
SVM的测试集结果：
68 188
准确率为0.8336559139784946
正类召回率为0.6868686868686869
负类召回率为0.9038461538461539
MCC为0.610609776037
0.833876221498
SVM的测试集结果：
65 187
准确率为0.8208602150537635
正类召回率为0.6565656565656566
负类召回率为0.8990384615384616
MCC为0.578321526875
0.820846905537
SVM的测试集结果：
66 185
准确率为0.817741935483871
正类召回率为0.6666666666666666
负类召回率为0.8894230769230769
MCC为0.57288954914
0.817589576547
SVM的测试集结果：
71 186
准确率为0.8370967741935484
正类召回率为0.7171717171717171
负类召回率为0.8942307692307693
MCC为0.621910808035
0.837133550489
SVM的测试集结果：
65 185
准确率为0.8147311827956989
正类召回率为0.6565656565656566
负类召回率为0.8894230769230769
MCC为0.564377319301
0.814332247557
SVM的测试集结果：
72 187
准确率为0.8435483870967742
正类召回率为0.7272727272727273
负类召回率为0.8990384615384616
MCC为0.637075749899
0.843648208469
SVM的测试集结果：
68 183
准确率为0.8178494623655913
正类召回率为0.6868686868686869
负类召回率为0.8798076923076923
MCC为0.576415982443
0.817589576547
SVM

SVM的测试集结果：
68 184
准确率为0.8204301075268818
正类召回率为0.6868686868686869
负类召回率为0.8846153846153846
MCC为0.583096253903
0.820846905537
SVM的测试集结果：
66 187
准确率为0.8240860215053762
正类召回率为0.6666666666666666
负类召回率为0.8990384615384616
MCC为0.586770692238
0.824104234528
SVM的测试集结果：
72 186
准确率为0.8402150537634409
正类召回率为0.7272727272727273
负类召回率为0.8942307692307693
MCC为0.630288107142
0.840390879479
SVM的测试集结果：
65 187
准确率为0.8207526881720429
正类召回率为0.6565656565656566
负类召回率为0.8990384615384616
MCC为0.578321526875
0.820846905537
SVM的测试集结果：
65 186
准确率为0.8178494623655915
正类召回率为0.6565656565656566
负类召回率为0.8942307692307693
MCC为0.571306823879
0.817589576547
SVM的测试集结果：
69 184
准确率为0.8240860215053762
正类召回率为0.696969696969697
负类召回率为0.8846153846153846
MCC为0.591580924307
0.824104234528
SVM的测试集结果：
69 185
准确率为0.8273118279569893
正类召回率为0.696969696969697
负类召回率为0.8894230769230769
MCC为0.59830789139
0.827361563518
SVM的测试集结果：
70 185
准确率为0.8305376344086023
正类召回率为0.7070707070707071
负类召回率为0.8894230769230769
MCC为0.606745866171
0.830618892508
SVM

In [12]:
# Show the per-run metrics collected by the evaluation loop and persist the
# MCC values as a single-column CSV.
import os

print(acc)
print(pr)
print(nr)
print(MCC)
a = np.array(MCC)

# np.savetxt does not create missing directories, so make sure the target
# folder exists before writing.
os.makedirs('./2informationgain', exist_ok=True)
np.savetxt('./2informationgain/MCC.csv',a, delimiter = ',')

[0.8341935483870969, 0.8336559139784946, 0.8305376344086021, 0.8404301075268817, 0.8082795698924732, 0.8239784946236559, 0.833763440860215, 0.8243010752688174, 0.8344086021505376, 0.8275268817204303, 0.8206451612903226, 0.8369892473118279, 0.8238709677419356, 0.8393548387096775, 0.8108602150537635, 0.8373118279569892, 0.797741935483871, 0.8206451612903226, 0.8367741935483872, 0.8208602150537635, 0.8300000000000001, 0.8275268817204301, 0.8241935483870968, 0.8304301075268817, 0.8274193548387098, 0.8170967741935484, 0.8306451612903226, 0.8241935483870968, 0.8208602150537635, 0.8178494623655913, 0.8311827956989248, 0.8178494623655915, 0.8275268817204301, 0.8110752688172044, 0.8208602150537635, 0.7945161290322581, 0.8434408602150537, 0.8281720430107526, 0.8181720430107526, 0.8206451612903226, 0.8244086021505377, 0.8112903225806452, 0.8241935483870968, 0.8308602150537636, 0.8213978494623657, 0.8274193548387098, 0.8207526881720429, 0.8208602150537635, 0.8178494623655913, 0.8208602150537635, 0