In [7]:
#-*-coding:utf-8-*-
import itertools as iters
from time import time
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.cross_validation import KFold
from sklearn.feature_selection import f_classif
from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC
import matplotlib.pylab as plt
from math import log2 
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import accuracy_score

def entropy(data):
    length,dataDict=len(data),{} 
    for b in data:  
        try:dataDict[b]+=1  
        except:dataDict[b]=1  
    entropy=sum([-d/length*log2(d/length) for d in list(dataDict.values())])
    return entropy

def informationgain(data,label):
    informationgain = []
    la = entropy(label)
    print(la)
    for j in range(data.shape[1] ):
        feature = data[:,j]
        for a in set(feature):
            ent = []
            op = []
            ne = []
            for k in range(len(feature)):
                if feature[k] >= a:
                    op.append(label[k])
                else:
                    ne.append(label[k])
            if len(op) == 0 or len(ne) == 0:
                ent.append(la)
            else:
                ent.append(len(op)*entropy(op)/len(label) + len(ne)*entropy(ne)/len(label))
        informationgain.append(la-min(ent))
    return informationgain

##计算三肽特征
def statisPsi_3(seqs,protein,gap1,gap2):
    psi = np.zeros(len(seqs))
    loops = len(protein) - gap1 - gap2 - 2
    for start in range(loops):
        dipeptide = protein[start] + protein[start + gap1 + 1] + protein[start + 2 + gap1 + gap2]
        index = seqs.index(dipeptide)
        psi[index] += 1
    psi = np.array(psi)
    psi = psi / sum(psi)
    return psi

# get gap dipeptide features psi matrix",
def all_psi(dataset,gap1,gap2):
    gap_psi = np.zeros((len(dataset), len(DIPEPTIDE)))
    for idx in range(len(dataset)):
        gap_psi[idx] = statisPsi_3(DIPEPTIDE, dataset[idx], gap1,gap2)
    return gap_psi

#输入核函数名称和参数gamma值，返回SVM训练十折交叉验证的准确率
def SVM_10fold(data,label):
    ##数据归一化（按列处理，必须做）
    scaler = preprocessing.StandardScaler().fit(data)
    data = scaler.transform(data)
    kf = KFold(data.shape[0],n_folds=10,shuffle = True)  #固定随机数可固定分组
    precision_average = 0.0
    TPCount = 0
    TNCount = 0
    train = []
    test = []
    test_true = []

    C_range = np.logspace(15, 5, 11, base=2)
    gamma_range = np.logspace(-15, -25, 11, base=2)
    param_grid = dict(gamma=gamma_range, C=C_range)
    grid = GridSearchCV(SVC(kernel='rbf'), param_grid=param_grid, cv=5)  # 基于交叉验证的网格搜索

    for train_index, test_index in kf:
        grid = grid.fit(data[train_index],label[train_index])
        clf = SVC(C = grid.best_params_['C'],gamma=grid.best_params_['gamma'])
        clf.fit(data[train_index],label[train_index])
        pred_train = clf.predict(data[train_index]).tolist()
        pred_test = clf.predict(data[test_index]).tolist()

        train.append(pred_train)
        test = test + pred_test
        test_true = test_true + label[test_index].tolist()

        testLabel = label[test_index]

        # print(metrics.confusion_matrix(pred_train,trainLabel))
        TP = 0
        TN = 0
        #同时输出svm的结果
        #查看测试集的结果
        for i in range(len(pred_test)):
            if pred_test[i] == 1 and (testLabel[i] == 1):
                TP = TP + 1
            elif pred_test[i] == 0 and (testLabel[i] == 0):
                TN = TN + 1
        TPCount = TPCount + TP
        TNCount = TNCount + TN

        precision = (TP + TN) * 1.0 / len(testLabel)
        precision_average = precision_average + precision


    precision_average = precision_average / 10
    positiveReca1 = TPCount * 1.0 / 99
    negtiveReca1 = TNCount * 1.0 / 208
    MCC = matthews_corrcoef(test_true,test)


    print (u'SVM的测试集结果：')
    print (TPCount,TNCount)
    print (u'准确率为' + str(precision_average))
    print (u'正类召回率为' + str(positiveReca1))
    print (u'负类召回率为' + str(negtiveReca1))
    print(u'MCC为' + str(matthews_corrcoef(test_true,test)))
    print(accuracy_score(test_true,test))


    # ##保存最优模型的训练、测试集结果
    # with open('C:\\Users\Administrator\Desktop\data\\result\\result_4gapTC_svm_train.txt','w') as file:
    #     for i in train:
    #         file.write(str(i) + '\n')
    # with open('C:\\Users\Administrator\Desktop\data\\result\\result_4gapTC_svm_test.txt','w') as file:
    #     for i in test:
    #         file.write(str(i) + '\n')


    return precision_average,positiveReca1,negtiveReca1,MCC

if(__name__=='__main__'):
    # t0 = time()
    path = r''
    with open('virion.txt', "r") as file:
        train_tdata = [line.strip() for line in file if '>' != line[0]]
    with open( 'non-virion.txt', "r") as file:
        train_fdata = [line.strip() for line in file if '>' != line[0]]

    SAA = ('ACDEFGHIKLMNPQRSTVWY')
    DIPEPTIDE = []

    for dipeptide in iters.product(SAA, repeat=3):
        DIPEPTIDE.append(''.join(dipeptide))

    label = pd.Series([1 for i in range(len(train_tdata))]+ [0 for i in range(len(train_fdata))])
    label = label.as_matrix()

    ##将序列处理为特征向量
    ##gap1 和 gap2 的数值是任意选取的，一般是从0到10遍历寻优的，但是那样太浪费时间了，你可以先直接指定一个值先看效果

    gap1 = 2
    gap2 = 1
    gap_T = all_psi(train_tdata,gap1,gap2)
    gap_F = all_psi(train_fdata,gap1,gap2)

    dataAll = np.row_stack((gap_T, gap_F))  # 矩阵按行合并
    # print(data.shape)
    ##将特征结果保存，方便下次直接读取
    # np.savetxt('E:\Python Program\Protein\data\gapAll\\virion_gap0_dipe2.csv', data, delimiter=',')
    ##直接读入特征数据
    # dataAll = pd.read_csv(path = '\\virion_gap0_dipe2.csv',header=None).as_matrix()


    print(dataAll.shape)
    ##计算F值
    f = informationgain(dataAll,label)

    ##因为三肽特征较为稀疏，会有一些全为nan的列，要对这些列进行处理
    a = np.array(f)
    nan_count = np.sum(a != a)
    print(nan_count)
    ##将nan替换为0
    f1 = np.nan_to_num(a)
    ##将F值从大到小排序，获得相应的位置序号
    f_order = np.argsort(f1).tolist()[::-1]
    #选取所需的特征维度，构造模型，如这里是前100维特征，这个数值按照需要也可以更改
    p = []
    n = 771
    data = dataAll[:,f_order[0:n]]

    for cishu in range(103):
        presion,positiveRecall,negtiveRecall,matthews = SVM_10fold(data,label)
        acc.append(presion)
        pr.append(positiveRecall)
        nr.append(negtiveRecall)
        MCC.append(matthews)
        
#             p.append(presion)
#         np.savetxt('./新增/'+ str(gap1) + str(gap2)+'informationgain.csv',p,delimiter=',')

    ###为了选取最优特征子集，一般需要遍历，结果运行、保存、展示如下
#     p = []
#     for i in range(1,800,10):
#         data = dataAll[:, f_order[0:i]]
#         print(data.shape)
#         presion, positiveRecall, negtiveRecall = SVM_10fold(data, label)
#         p.append(presion)
#     # print(SVM_10fold(data,label))
#     np.savetxt(str(gap1) + 'and' + str(gap2) + 'informationgainstatisPsi_3.csv',p,delimiter=',')
#     plt.plot(range(len(p)),p)
#     plt.savefig(str(gap1) +'and' + str(gap2) + 'informationgainstatisPsi_3.pdf')
#     plt.close()


(307, 8000)
0.9070532598287184
0
SVM的测试集结果：
89 208
准确率为0.9670967741935484
正类召回率为0.898989898989899
负类召回率为1.0
MCC为0.926148924433
0.967426710098
SVM的测试集结果：
89 208
准确率为0.9675268817204301
正类召回率为0.898989898989899
负类召回率为1.0
MCC为0.926148924433
0.967426710098
SVM的测试集结果：
89 208
准确率为0.9676344086021504
正类召回率为0.898989898989899
负类召回率为1.0
MCC为0.926148924433
0.967426710098
SVM的测试集结果：
88 208
准确率为0.9638709677419355
正类召回率为0.8888888888888888
负类召回率为1.0
MCC为0.918826154273
0.964169381107
SVM的测试集结果：
87 207
准确率为0.9575268817204302
正类召回率为0.8787878787878788
负类召回率为0.9951923076923077
MCC为0.903415335361
0.957654723127
SVM的测试集结果：
88 207
准确率为0.960752688172043
正类召回率为0.8888888888888888
负类召回率为0.9951923076923077
MCC为0.91078982116
0.960912052117
SVM的测试集结果：
88 206
准确率为0.957741935483871
正类召回率为0.8888888888888888
负类召回率为0.9903846153846154
MCC为0.902863569488
0.957654723127
SVM的测试集结果：
86 208
准确率为0.9575268817204302
正类召回率为0.8686868686868687
负类召回率为1.0
MCC为0.904205530351
0.957654723127
SVM的测试集结果：
89 208
准确率为0.9675268817204301
正类召回率为0

SVM的测试集结果：
87 208
准确率为0.9608602150537635
正类召回率为0.8787878787878788
负类召回率为1.0
MCC为0.911511927783
0.960912052117
SVM的测试集结果：
89 208
准确率为0.9675268817204301
正类召回率为0.898989898989899
负类召回率为1.0
MCC为0.926148924433
0.967426710098
SVM的测试集结果：
89 208
准确率为0.9674193548387097
正类召回率为0.898989898989899
负类召回率为1.0
MCC为0.926148924433
0.967426710098
SVM的测试集结果：
84 208
准确率为0.9511827956989247
正类召回率为0.8484848484848485
负类召回率为1.0
MCC为0.889613329831
0.951140065147
SVM的测试集结果：
91 208
准确率为0.9740860215053763
正类召回率为0.9191919191919192
负类召回率为1.0
MCC为0.940822923847
0.973941368078
SVM的测试集结果：
89 208
准确率为0.9675268817204301
正类召回率为0.898989898989899
负类召回率为1.0
MCC为0.926148924433
0.967426710098
SVM的测试集结果：
87 207
准确率为0.9573118279569893
正类召回率为0.8787878787878788
负类召回率为0.9951923076923077
MCC为0.903415335361
0.957654723127
SVM的测试集结果：
89 207
准确率为0.9643010752688171
正类召回率为0.898989898989899
负类召回率为0.9951923076923077
MCC为0.91817225809
0.964169381107
SVM的测试集结果：
90 208
准确率为0.9706451612903226
正类召回率为0.9090909090909091
负类召回率为1.0
MCC为0.933480946693


In [2]:
#-*-coding:utf-8-*-
import itertools as iters
from time import time
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.cross_validation import KFold
from sklearn.feature_selection import f_classif
from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC
import matplotlib.pylab as plt
from math import log2 
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import accuracy_score
from sklearn.externals import joblib

def entropy(data):
    length,dataDict=len(data),{} 
    for b in data:  
        try:dataDict[b]+=1  
        except:dataDict[b]=1  
    entropy=sum([-d/length*log2(d/length) for d in list(dataDict.values())])
    return entropy

def informationgain(data,label):
    informationgain = []
    la = entropy(label)
    print(la)
    for j in range(data.shape[1] ):
        feature = data[:,j]
        for a in set(feature):
            ent = []
            op = []
            ne = []
            for k in range(len(feature)):
                if feature[k] >= a:
                    op.append(label[k])
                else:
                    ne.append(label[k])
            if len(op) == 0 or len(ne) == 0:
                ent.append(la)
            else:
                ent.append(len(op)*entropy(op)/len(label) + len(ne)*entropy(ne)/len(label))
        informationgain.append(la-min(ent))
    return informationgain

##计算三肽特征
def statisPsi_3(seqs,protein,gap1,gap2):
    psi = np.zeros(len(seqs))
    loops = len(protein) - gap1 - gap2 - 2
    for start in range(loops):
        dipeptide = protein[start] + protein[start + gap1 + 1] + protein[start + 2 + gap1 + gap2]
        index = seqs.index(dipeptide)
        psi[index] += 1
    psi = np.array(psi)
    psi = psi / sum(psi)
    return psi

# get gap dipeptide features psi matrix",
def all_psi(dataset,gap1,gap2):
    gap_psi = np.zeros((len(dataset), len(DIPEPTIDE)))
    for idx in range(len(dataset)):
        gap_psi[idx] = statisPsi_3(DIPEPTIDE, dataset[idx], gap1,gap2)
    return gap_psi

#输入核函数名称和参数gamma值，返回SVM训练十折交叉验证的准确率
def SVM_10fold(data,label):
    ##数据归一化（按列处理，必须做）
    scaler = preprocessing.StandardScaler().fit(data)
    data = scaler.transform(data)
    kf = KFold(data.shape[0],n_folds=10,shuffle = True,random_state=100)  #固定随机数可固定分组
    precision_average = 0.0
    TPCount = 0
    TNCount = 0
    train = []
    test = []
    test_true = []

    C_range = np.logspace(15, 5, 11, base=2)
    gamma_range = np.logspace(-15, -25, 11, base=2)
    param_grid = dict(gamma=gamma_range, C=C_range)
    grid = GridSearchCV(SVC(kernel='rbf'), param_grid=param_grid, cv=5)  # 基于交叉验证的网格搜索

    for train_index, test_index in kf:
        grid = grid.fit(data[train_index],label[train_index])
        clf = SVC(C = grid.best_params_['C'],gamma=grid.best_params_['gamma'],probability=True)
        clf.fit(data[train_index],label[train_index])
        pred_train = clf.predict(data[train_index]).tolist()
        pred_test = clf.predict(data[test_index]).tolist()

        train.append(pred_train)
        test = test + pred_test
        test_true = test_true + label[test_index].tolist()

        testLabel = label[test_index]

        # print(metrics.confusion_matrix(pred_train,trainLabel))
        TP = 0
        TN = 0
        #同时输出svm的结果
        #查看测试集的结果
        for i in range(len(pred_test)):
            if pred_test[i] == 1 and (testLabel[i] == 1):
                TP = TP + 1
            elif pred_test[i] == 0 and (testLabel[i] == 0):
                TN = TN + 1
        TPCount = TPCount + TP
        TNCount = TNCount + TN

        precision = (TP + TN) * 1.0 / len(testLabel)
        precision_average = precision_average + precision


    precision_average = precision_average / 10
    positiveReca1 = TPCount * 1.0 / 99
    negtiveReca1 = TNCount * 1.0 / 208


    print (u'SVM的测试集结果：')
    print (TPCount,TNCount)
    print (u'准确率为' + str(precision_average))
    print (u'正类召回率为' + str(positiveReca1))
    print (u'负类召回率为' + str(negtiveReca1))
    print(u'MCC为' + str(matthews_corrcoef(test_true,test)))
    print(accuracy_score(test_true,test))


    # ##保存最优模型的训练、测试集结果
    # with open('C:\\Users\Administrator\Desktop\data\\result\\result_4gapTC_svm_train.txt','w') as file:
    #     for i in train:
    #         file.write(str(i) + '\n')
    # with open('C:\\Users\Administrator\Desktop\data\\result\\result_4gapTC_svm_test.txt','w') as file:
    #     for i in test:
    #         file.write(str(i) + '\n')


    return precision_average,positiveReca1,negtiveReca1

if(__name__=='__main__'):
    # t0 = time()
    path = r''
    with open('virion.txt', "r") as file:
        train_tdata = [line.strip() for line in file if '>' != line[0]]
    with open( 'non-virion.txt', "r") as file:
        train_fdata = [line.strip() for line in file if '>' != line[0]]

    SAA = ('ACDEFGHIKLMNPQRSTVWY')
    DIPEPTIDE = []

    for dipeptide in iters.product(SAA, repeat=3):
        DIPEPTIDE.append(''.join(dipeptide))

    label = pd.Series([1 for i in range(len(train_tdata))]+ [0 for i in range(len(train_fdata))])
    label = label.as_matrix()

    ##将序列处理为特征向量
    ##gap1 和 gap2 的数值是任意选取的，一般是从0到10遍历寻优的，但是那样太浪费时间了，你可以先直接指定一个值先看效果

    gap1 = 2
    gap2 = 1
    gap_T = all_psi(train_tdata,gap1,gap2)
    gap_F = all_psi(train_fdata,gap1,gap2)

    dataAll = np.row_stack((gap_T, gap_F))  # 矩阵按行合并
    # print(data.shape)
    ##将特征结果保存，方便下次直接读取
    # np.savetxt('E:\Python Program\Protein\data\gapAll\\virion_gap0_dipe2.csv', data, delimiter=',')
    ##直接读入特征数据
    # dataAll = pd.read_csv(path = '\\virion_gap0_dipe2.csv',header=None).as_matrix()


    print(dataAll.shape)
    ##计算F值
    f = informationgain(dataAll,label)

    ##因为三肽特征较为稀疏，会有一些全为nan的列，要对这些列进行处理
    a = np.array(f)
    nan_count = np.sum(a != a)
    print(nan_count)
    ##将nan替换为0
    f1 = np.nan_to_num(a)
    ##将F值从大到小排序，获得相应的位置序号
    f_order = np.argsort(f1).tolist()[::-1]
    #选取所需的特征维度，构造模型，如这里是前100维特征，这个数值按照需要也可以更改
    p = []
    n = 771
    data = dataAll[:,f_order[0:n]]
    
    scaler = preprocessing.StandardScaler().fit(data)
    data = scaler.transform(data)
    C_range = np.logspace(15, 5, 11, base=2)
    gamma_range = np.logspace(-15, -25, 11, base=2)
    param_grid = dict(gamma=gamma_range, C=C_range)
    grid = GridSearchCV(SVC(kernel='rbf'), param_grid=param_grid, cv=5)  # 基于交叉验证的网格搜索
    grid = grid.fit(data,label)
    clf = SVC(C = grid.best_params_['C'],gamma=grid.best_params_['gamma'],probability=True)
    clf.fit(data,label)
    joblib.dump(clf, "model.m")
#             p.append(presion)
#         np.savetxt('./新增/'+ str(gap1) + str(gap2)+'informationgain.csv',p,delimiter=',')

        ###为了选取最优特征子集，一般需要遍历，结果运行、保存、展示如下
    #     p = []
    #     for i in range(1,800,10):
    #         data = dataAll[:, f_order[0:i]]
    #         print(data.shape)
    #         presion, positiveRecall, negtiveRecall = SVM_10fold(data, label)
    #         p.append(presion)
    #     # print(SVM_10fold(data,label))
    #     np.savetxt(str(gap1) + 'and' + str(gap2) + 'informationgainstatisPsi_3.csv',p,delimiter=',')
    #     plt.plot(range(len(p)),p)
    #     plt.savefig(str(gap1) +'and' + str(gap2) + 'informationgainstatisPsi_3.pdf')
    #     plt.close()


(307, 8000)
0.9070532598287184
0


In [12]:
print(len(MCC))
print(np.mean(MCC))
a = np.array(MCC)

np.savetxt('./3informationgain/MCC.csv',a, delimiter = ',')

200
0.916173587602


In [26]:
np.savetxt('f_order.txt',f_order)
print(f_order)

[1105, 336, 5383, 5500, 325, 1269, 2056, 1871, 6234, 2316, 6563, 7157, 1268, 4951, 5432, 6651, 100, 149, 6872, 3394, 6958, 5189, 1673, 6032, 2312, 3208, 1377, 3603, 5280, 4667, 6792, 2164, 6323, 6067, 3266, 2862, 16, 6405, 5833, 4804, 6623, 287, 3342, 7791, 1504, 1542, 2919, 7762, 1288, 4469, 3668, 5536, 6592, 6735, 1535, 7140, 2397, 6785, 6495, 1374, 3369, 3, 2307, 7923, 6589, 2083, 2054, 2063, 3455, 5737, 6248, 3289, 3911, 2553, 456, 6666, 3151, 4730, 4681, 1165, 4460, 4240, 2651, 1330, 6729, 500, 3592, 6775, 3242, 7687, 3322, 5064, 973, 327, 5169, 3648, 3743, 2255, 6740, 5795, 6983, 1494, 3360, 4720, 5357, 2319, 1157, 2283, 2339, 1295, 6282, 1568, 2097, 2962, 5663, 6382, 6682, 7248, 3432, 4443, 1250, 6971, 5883, 5897, 3211, 3869, 3119, 3754, 7794, 6019, 2072, 5482, 2309, 6332, 4008, 3024, 355, 3617, 4644, 3875, 6637, 1180, 7615, 1429, 6112, 357, 192, 6117, 1262, 6702, 2105, 56, 4505, 3269, 3705, 7927, 7933, 4989, 1595, 4883, 5489, 1028, 1547, 3575, 1071, 3187, 5619, 1845, 7743, 50, 