In [2]:
#-*-coding:utf-8-*-
import itertools as iters
from time import time
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.cross_validation import KFold
from sklearn.feature_selection import f_classif
from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC
import matplotlib.pylab as plt
from sklearn.metrics import matthews_corrcoef

##计算二肽特征
def statisPsi(seqs, protein, gap):
    """Return the normalized gapped-dipeptide composition of ``protein``.

    For every valid position ``start`` the pair formed by ``protein[start]``
    and ``protein[start + gap + 1]`` is looked up in ``seqs`` and counted.
    The counts are then divided by their total so that the vector sums to 1.

    Parameters
    ----------
    seqs : sequence of str
        Ordered dipeptide vocabulary; element ``i`` of the result belongs to
        ``seqs[i]``.
    protein : str
        Sequence to analyse.
    gap : int
        Number of residues skipped between the two members of a pair
        (0 means directly adjacent residues).

    Returns
    -------
    numpy.ndarray
        Float vector of length ``len(seqs)``. It is all zeros when
        ``protein`` is too short to contain a single pair.

    Raises
    ------
    ValueError
        If a pair found in ``protein`` does not occur in ``seqs``.
    """
    # Build the lookup table once instead of a linear ``seqs.index`` scan per
    # residue; ``setdefault`` keeps the first position, like ``list.index``.
    position = {}
    for idx, peptide in enumerate(seqs):
        position.setdefault(peptide, idx)

    psi = np.zeros(len(seqs))
    for start in range(len(protein) - gap - 1):
        dipeptide = protein[start] + protein[start + gap + 1]
        if dipeptide not in position:
            raise ValueError('%r is not in seqs' % (dipeptide,))
        psi[position[dipeptide]] += 1

    total = psi.sum()
    if total == 0:
        # No pair could be formed: return the zero vector rather than the
        # all-NaN result of 0 / 0.
        return psi
    return psi / total

##计算三肽特征
def statisPsi_3(seqs,protein,gap):
    """Return the normalized gapped-tripeptide composition of ``protein``.

    For every valid position ``start`` the triple formed by the two adjacent
    residues ``protein[start:start + 2]`` and the residue at
    ``protein[start + 2 + gap]`` is looked up in ``seqs`` and counted. The
    counts are then divided by their total so that the vector sums to 1.

    Parameters
    ----------
    seqs : sequence of str
        Ordered tripeptide vocabulary; element ``i`` of the result belongs to
        ``seqs[i]``.
    protein : str
        Sequence to analyse.
    gap : int
        Number of residues skipped between the leading residue pair and the
        third residue (0 means three consecutive residues).

    Returns
    -------
    numpy.ndarray
        Float vector of length ``len(seqs)``. It is all zeros when
        ``protein`` is too short to contain a single triple.

    Raises
    ------
    ValueError
        If a triple found in ``protein`` does not occur in ``seqs``.
    """
    # Build the lookup table once instead of a linear ``seqs.index`` scan per
    # residue; ``setdefault`` keeps the first position, like ``list.index``.
    position = {}
    for idx, peptide in enumerate(seqs):
        position.setdefault(peptide, idx)

    psi = np.zeros(len(seqs))
    for start in range(len(protein) - gap - 2):
        tripeptide = protein[start:(start + 2)] + protein[start + 2 + gap]
        if tripeptide not in position:
            raise ValueError('%r is not in seqs' % (tripeptide,))
        psi[position[tripeptide]] += 1

    total = psi.sum()
    if total == 0:
        # No triple could be formed: return the zero vector rather than the
        # all-NaN result of 0 / 0.
        return psi
    return psi / total

# Build the gapped-dipeptide frequency (psi) matrix for a whole dataset.
def all_psi(dataset,gap):
    """Stack the gapped-dipeptide composition vectors of all sequences.

    Row ``i`` of the result is ``statisPsi(DIPEPTIDE, dataset[i], gap)``.
    ``DIPEPTIDE`` is read from module scope when the function is called, so
    it has to be defined before the first call.

    Parameters
    ----------
    dataset : indexable collection of sequences
        Sequences to featurize; must support ``len`` and integer indexing.
    gap : int
        Gap forwarded unchanged to ``statisPsi``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(dataset), len(DIPEPTIDE))``.
    """
    n_sequences = len(dataset)
    feature_matrix = np.zeros((n_sequences, len(DIPEPTIDE)))
    for row in range(n_sequences):
        feature_matrix[row, :] = statisPsi(DIPEPTIDE, dataset[row], gap)
    return feature_matrix

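# Run a shuffled 10-fold cross-validation of an RBF-kernel SVM (C and gamma tuned by an inner grid search) and return accuracy, positive-class recall, negative-class recall and MCC.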
def SVM_10fold(data,label):
    """Evaluate an RBF-kernel SVM with shuffled 10-fold cross-validation.

    The feature matrix is standardized column-wise. For every outer fold,
    ``C`` and ``gamma`` are then tuned by a 5-fold grid search on the training
    part, and the best model predicts the held-out part. A summary is printed
    and the aggregated metrics are returned.

    Parameters
    ----------
    data : numpy.ndarray
        Feature matrix, one row per sample.
    label : array-like
        Class labels aligned with the rows of ``data``; 1 marks the positive
        class and 0 the negative class.

    Returns
    -------
    tuple of float
        ``(accuracy, positive_recall, negative_recall, mcc)`` where
        ``accuracy`` is the mean of the per-fold accuracies, the recalls are
        computed over the pooled held-out predictions, and ``mcc`` is the
        Matthews correlation coefficient of those pooled predictions.
    """
    n_folds = 10
    label = np.asarray(label)

    # Column-wise standardization.
    # NOTE(review): the scaler is fitted on ALL samples, including the ones
    # later held out for testing. Fit it per fold if a leakage-free estimate
    # is required.
    scaler = preprocessing.StandardScaler().fit(data)
    data = scaler.transform(data)

    # The shuffle is unseeded, so every call uses a different split; pass a
    # random_state to make the folds reproducible.
    # NOTE(review): this is the legacy sklearn.cross_validation KFold
    # signature; newer scikit-learn releases expose KFold from
    # sklearn.model_selection with a different API -- confirm the target
    # version.
    kf = KFold(data.shape[0],n_folds=n_folds,shuffle = True)

    # Search grid: C in 2^15 .. 2^5 and gamma in 2^-15 .. 2^-25, 11 values each.
    C_range = np.logspace(15, 5, 11, base=2)
    gamma_range = np.logspace(-15, -25, 11, base=2)
    param_grid = dict(gamma=gamma_range, C=C_range)
    grid = GridSearchCV(SVC(kernel='rbf'), param_grid=param_grid, cv=5)

    precision_average = 0.0
    TPCount = 0
    TNCount = 0
    test = []        # pooled held-out predictions
    test_true = []   # pooled held-out ground truth

    for train_index, test_index in kf:
        grid = grid.fit(data[train_index],label[train_index])
        # GridSearchCV refits the best parameter combination on the whole
        # training fold by default, so that estimator can be used directly.
        clf = grid.best_estimator_

        pred_test = clf.predict(data[test_index])
        testLabel = label[test_index]

        test.extend(pred_test.tolist())
        test_true.extend(testLabel.tolist())

        # Cast to plain int so the accumulated metrics stay Python floats,
        # as they were with the original hand-written counting loop.
        TP = int(np.sum((pred_test == 1) & (testLabel == 1)))
        TN = int(np.sum((pred_test == 0) & (testLabel == 0)))
        TPCount = TPCount + TP
        TNCount = TNCount + TN

        precision = (TP + TN) * 1.0 / len(testLabel)
        precision_average = precision_average + precision

    precision_average = precision_average / n_folds

    # Derive the class sizes from the labels instead of hard-coding them, so
    # the recalls stay correct for any dataset.
    positive_total = int(np.sum(label == 1))
    negative_total = int(np.sum(label == 0))
    positiveReca1 = TPCount * 1.0 / positive_total
    negtiveReca1 = TNCount * 1.0 / negative_total
    MCC = matthews_corrcoef(test_true,test)

    print (u'SVM的测试集结果：')
    print (TPCount,TNCount)
    print (u'准确率为' + str(precision_average))
    print (u'正类召回率为' + str(positiveReca1))
    print (u'负类召回率为' + str(negtiveReca1))
    print(u'MCC为' + str(MCC))

    return precision_average,positiveReca1,negtiveReca1,MCC

if(__name__=='__main__'):
    # Pipeline: read both sequence files, turn every sequence into a
    # gapped-dipeptide composition vector, rank the features by ANOVA F-value,
    # keep the top-ranked ones and repeat the cross-validated SVM evaluation.

    N_TOP_FEATURES = 159   # number of top-ranked features passed to the SVM
    N_REPEATS = 200        # repetitions of the shuffled 10-fold evaluation

    path = r''

    # Lines starting with '>' are skipped (presumably FASTA headers). Blank
    # lines are skipped too, so they do not become empty sequences.
    with open('virion.txt', "r") as file:
        train_tdata = [line.strip() for line in file
                       if line.strip() and not line.startswith('>')]
    with open( 'non-virion.txt', "r") as file:
        train_fdata = [line.strip() for line in file
                       if line.strip() and not line.startswith('>')]

    # The 20 single-letter residue codes and all 400 ordered pairs of them.
    SAA = ('ACDEFGHIKLMNPQRSTVWY')
    DIPEPTIDE = [''.join(pair) for pair in iters.product(SAA, repeat=2)]

    # 1 for sequences from virion.txt, 0 for sequences from non-virion.txt.
    # A plain NumPy array replaces Series.as_matrix(), which no longer exists
    # in pandas 1.0 and later.
    label = np.array([1] * len(train_tdata) + [0] * len(train_fdata))

    # Convert the sequences into feature vectors.
    gap=1
    gap_T = all_psi(train_tdata,gap)
    gap_F = all_psi(train_fdata,gap)

    # Stack row-wise: positives first, then negatives (same order as `label`).
    # np.vstack is the canonical name for the deprecated alias np.row_stack.
    dataAll = np.vstack((gap_T, gap_F))
    # Optional: cache `dataAll` with np.savetxt(..., delimiter=',') and reload
    # it later to skip the feature computation.

    print(dataAll.shape)

    # ANOVA F-value of every feature column.
    f = f_classif(dataAll, label)[0]

    # Sparse feature sets (e.g. tripeptides) can produce NaN F-values for some
    # columns; report how many there are and treat them as 0.
    a = np.array(f)
    nan_count = np.sum(np.isnan(a))
    print(nan_count)
    f1 = np.nan_to_num(a)

    # Column indices sorted by decreasing F-value.
    f_order = np.argsort(f1).tolist()[::-1]
    print(f_order)

    # Keep the top-ranked features and evaluate the model repeatedly; every
    # repetition draws a new random 10-fold split.
    data = dataAll[:,f_order[0:N_TOP_FEATURES]]
    acc = []
    pr = []
    nr = []
    MCC = []
    for cishu in range(N_REPEATS):
        presion,positiveRecall,negtiveRecall,matthews = SVM_10fold(data,label)
        acc.append(presion)
        pr.append(positiveRecall)
        nr.append(negtiveRecall)
        MCC.append(matthews)
        print(cishu)


#     ###为了选取最优特征子集，一般需要遍历，结果运行、保存、展示如下
#     p = []
#     for i in range(1,dataAll.shape[1]+1):
#         data = dataAll[:, f_order[0:i]]
#         print(data.shape)
#         presion, positiveRecall, negtiveRecall, matthews = SVM_10fold(data, label)
#         p.append(presion)
#     # print(SVM_10fold(data,label))
#     np.savetxt(str(gap)+'statisPsi.csv',p,delimiter=',')
#     plt.plot(range(len(p)),p)
#     plt.savefig(str(gap)+'statisPsi.pdf')
#     plt.show()
#     plt.close()

(307, 400)
0
[5, 16, 12, 316, 300, 69, 340, 335, 356, 100, 169, 105, 163, 305, 345, 347, 63, 68, 325, 85, 113, 166, 66, 171, 183, 143, 179, 174, 48, 397, 240, 117, 383, 306, 317, 61, 357, 145, 49, 15, 188, 62, 83, 162, 167, 28, 283, 222, 228, 64, 170, 276, 43, 17, 157, 288, 237, 337, 336, 181, 349, 79, 236, 116, 6, 13, 82, 194, 203, 137, 331, 44, 107, 67, 148, 11, 262, 294, 320, 386, 399, 363, 77, 27, 339, 156, 56, 74, 172, 140, 128, 154, 368, 268, 1, 7, 270, 286, 126, 197, 131, 178, 248, 199, 329, 52, 176, 388, 161, 88, 257, 57, 328, 0, 71, 241, 31, 139, 332, 65, 220, 385, 104, 146, 8, 355, 229, 301, 396, 122, 291, 23, 292, 376, 362, 334, 217, 214, 173, 269, 25, 182, 109, 189, 136, 210, 218, 168, 394, 303, 274, 201, 380, 264, 398, 297, 129, 120, 158, 54, 232, 223, 4, 72, 123, 351, 280, 200, 165, 365, 258, 361, 38, 132, 287, 370, 206, 389, 254, 234, 70, 86, 124, 346, 3, 375, 133, 333, 205, 10, 213, 22, 318, 33, 101, 125, 233, 282, 338, 55, 41, 314, 390, 184, 235, 26, 138, 21, 152, 215,

SVM的测试集结果：
75 183
准确率为0.8401075268817205
正类召回率为0.7575757575757576
负类召回率为0.8798076923076923
MCC为0.635718535982
58
SVM的测试集结果：
71 185
准确率为0.8338709677419354
正类召回率为0.7171717171717171
负类召回率为0.8894230769230769
MCC为0.615168678412
59
SVM的测试集结果：
70 181
准确率为0.8181720430107529
正类召回率为0.7070707070707071
负类召回率为0.8701923076923077
MCC为0.58040010646
60
SVM的测试集结果：
72 182
准确率为0.8272043010752688
正类召回率为0.7272727272727273
负类召回率为0.875
MCC为0.603887835368
61
SVM的测试集结果：
77 183
准确率为0.846774193548387
正类召回率为0.7777777777777778
负类召回率为0.8798076923076923
MCC为0.652566021749
62
SVM的测试集结果：
70 182
准确率为0.8205376344086022
正类召回率为0.7070707070707071
负类召回率为0.875
MCC为0.586878452389
63
SVM的测试集结果：
70 181
准确率为0.8172043010752688
正类召回率为0.7070707070707071
负类召回率为0.8701923076923077
MCC为0.58040010646
64
SVM的测试集结果：
74 184
准确率为0.8406451612903227
正类召回率为0.7474747474747475
负类召回率为0.8846153846153846
MCC为0.633785201189
65
SVM的测试集结果：
72 183
准确率为0.8306451612903227
正类召回率为0.7272727272727273
负类召回率为0.8798076923076923
MCC为0.610379551683
66
SVM的测试集结果：
7

SVM的测试集结果：
76 183
准确率为0.8427956989247312
正类召回率为0.7676767676767676
负类召回率为0.8798076923076923
MCC为0.644146004432
132
SVM的测试集结果：
71 181
准确率为0.8203225806451613
正类召回率为0.7171717171717171
负类召回率为0.8701923076923077
MCC为0.588939152457
133
SVM的测试集结果：
72 182
准确率为0.8275268817204301
正类召回率为0.7272727272727273
负类召回率为0.875
MCC为0.603887835368
134
SVM的测试集结果：
71 187
准确率为0.8403225806451614
正类召回率为0.7171717171717171
负类召回率为0.8990384615384616
MCC为0.628731166364
135
SVM的测试集结果：
72 183
准确率为0.8301075268817204
正类召回率为0.7272727272727273
负类召回率为0.8798076923076923
MCC为0.610379551683
136
SVM的测试集结果：
70 184
准确率为0.8275268817204301
正类召回率为0.7070707070707071
负类召回率为0.8846153846153846
MCC为0.600049249681
137
SVM的测试集结果：
71 183
准确率为0.8269892473118279
正类召回率为0.7171717171717171
负类召回率为0.8798076923076923
MCC为0.601910296614
138
SVM的测试集结果：
74 183
准确率为0.8379569892473118
正类召回率为0.7474747474747475
负类召回率为0.8798076923076923
MCC为0.627282439782
139
SVM的测试集结果：
67 184
准确率为0.8178494623655913
正类召回率为0.6767676767676768
负类召回率为0.8846153846153846
MCC为0.5745

In [7]:
import os

# Persist the MCC values collected across the repeated evaluations.
a = np.array(MCC)

# np.savetxt does not create missing directories, so make sure the output
# folder exists before writing.
os.makedirs('./2anova', exist_ok=True)
np.savetxt('./2anova/MCC.csv',a, delimiter = ',')