In [1]:
using Knet;

In [2]:
using Knet, Plots, NBInclude;
nbinclude("deepppiutils.ipynb");

## Features Dictionary Construction

This function constructs the protein features dictionary. Protein features for the training data set are stored in the file "yeast_feature_all.csv". 
The function returns a dictionary mapping a protein UniProt ID to its 1164-feature vector.
* ```featurefilepath```    : The path of the file containing the features data

In [3]:
# Build the protein features dictionary from a comma-separated features file.
#
# The first line is a header; its column count fixes the number of features `d`
# read from every following row. The first column of each row is the protein ID,
# the next `d` columns are parsed as Float32.
#
# Rows are skipped when any of their `d` feature fields is empty (missing data)
# or when they have fewer than `d + 1` fields (e.g. blank lines). An empty file
# yields an empty dictionary.
#
# Returns a Dict mapping protein ID => Vector{Float32} of length `d`.
function constructFeatDict(featurefilepath)
    # open(readlines, path) closes the file even if reading throws
    lines = open(readlines, featurefilepath)
    featuresDict = Dict{String,Any}()
    isempty(lines) && return featuresDict

    d = length(split(lines[1], ",")) - 1 # number of features per protein
    for row in lines[2:end]
        fields = split(row, ",")
        # blank or truncated row: nothing usable to parse
        length(fields) < d + 1 && continue
        vals = fields[2:d+1]
        # to avoid NaN values due to missing data
        any(isempty, vals) && continue
        featuresDict[String(fields[1])] = parse.(Float32, vals) # fields[1] is the protein ID
    end
    return featuresDict
end;

## Loading Data

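This function loads the training data with its labels 
* ```featuresfilepath```    : The path of the features file; it is passed to ```constructFeatDict``` to build the features dictionary which maps the protein ID to its features vector
* ```pairsfilepath```    : The path of the file listing the protein pairs
* ```nopos```, ```noneg```    : The number of positive and negative protein pairs to read from the pairs file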

The function returns two arrays, the data and the labels. 
In order to construct the data matrix, the following steps are executed:
* Protein pairs (represented by their UniProt IDs) are read from the file "yeast_protein_pair.csv".
* A feature vector for each partner in the protein pair is extracted from ```featuresDict``` 
* The resulting two feature vectors are concatenated to form a new combined 2328-elements features vector
* Combined vectors for all pairs in the dataset are concatenated vertically to form the 65852x2328 data matrix ```concatAB```. 
* Labels for the protein pairs are read and returned as ```ygold```


In [4]:
# Build one concatenated feature vector per protein pair (alines[j], blines[j]).
# The protein ID is the first comma-separated field of each line. Pairs for which
# either ID is absent from `featuresDict` are dropped and counted.
# Returns (vectors, number_of_dropped_pairs).
function _pairfeaturevectors(featuresDict, alines, blines)
    vectors = Any[]
    missed = 0
    for (aline, bline) in zip(alines, blines)
        ida = String(split(aline, ",")[1])
        idb = String(split(bline, ",")[1])
        if haskey(featuresDict, ida) && haskey(featuresDict, idb)
            push!(vectors, vcat(featuresDict[ida], featuresDict[idb]))
        else
            missed += 1
        end
    end
    return vectors, missed
end

# Load the pair data set and its labels.
#
# featuresfilepath : features file, parsed by constructFeatDict
# pairsfilepath    : pairs file; after one header line it must contain, in order,
#                    `nopos` lines for the first partner of each positive pair,
#                    `nopos` lines for the second partner, then `noneg` + `noneg`
#                    lines laid out the same way for the negative pairs
# nopos, noneg     : number of positive / negative pairs to read
# Atype            : array type the data matrix is converted to
#
# Returns (concatAB, ygold): concatAB has one row per kept pair (features of
# partner A followed by features of partner B); ygold holds 0x02 for positive and
# 0x01 for negative pairs. Pairs with a protein missing from the features
# dictionary are dropped, and ygold always has exactly one entry per row of
# concatAB.
function loaddata(featuresfilepath, pairsfilepath, nopos, noneg; Atype=gpu() >= 0 ? KnetArray{Float32} : Array{Float32})
    pairslines = open(readlines, pairsfilepath)
    featuresDict = constructFeatDict(featuresfilepath)

    pairslines = pairslines[2:end] # drop the header line
    needed = 2 * nopos + 2 * noneg
    length(pairslines) >= needed ||
        throw(ArgumentError(string("pairs file has ", length(pairslines),
                                   " data lines but ", needed, " are required")))

    pos_a = pairslines[1:nopos]
    pos_b = pairslines[nopos+1:2*nopos]
    neg_a = pairslines[(2*nopos+1):(2*nopos+noneg)]
    neg_b = pairslines[(2*nopos+noneg+1):(2*nopos+2*noneg)]

    posvecs, pos_miss = _pairfeaturevectors(featuresDict, pos_a, pos_b)
    negvecs, neg_miss = _pairfeaturevectors(featuresDict, neg_a, neg_b)
    println("Num of dropped positive sample: ", pos_miss, " , Num of dropped negative sample: ", neg_miss)

    # Fill one host matrix and convert it once, instead of converting every row
    # to Atype and splatting thousands of arrays into vcat.
    vectors = vcat(posvecs, negvecs)
    width = isempty(vectors) ? 0 : length(vectors[1])
    rows = zeros(Float32, length(vectors), width)
    for (r, v) in enumerate(vectors)
        rows[r, :] = v
    end
    concatAB = Atype(rows)

    # Labels are sized by the pairs actually kept, so no uninitialised tail remains
    # when samples are dropped.
    ygold = vcat(fill(0x02, length(posvecs)), fill(0x01, length(negvecs)))
    return concatAB, ygold
end;

In [5]:
# Randomly split the rows of `data` (and the matching entries of `ygold`) into a
# training part and a test part, and wrap each part with `minibatchi`.
#
# The first floor(trnper * nrows) rows of a random permutation form the training
# part; all remaining rows form the test part. `tstper` is accepted but not read.
# Each part is transposed before being handed to `minibatchi`.
function trn_tst_split(data, ygold, trnper, tstper; batchsize=64)
    total = size(data, 1)
    ntrain = Int(floor(trnper * total))
    order = randperm(total)

    trainidx = order[1:ntrain]
    testidx = order[ntrain+1:total]

    dtrn = minibatchi(data[trainidx, :]', ygold[trainidx], batchsize)
    dtst = minibatchi(data[testidx, :]', ygold[testidx], batchsize)
    return dtrn, dtst
end;

# Yeast Dataset

In [6]:
# Input files for the yeast data set: per-protein features and the protein-pair list.
yeast_feature = "test_datasets/Yeast/yeast_feature.csv"
yeast_protein = "test_datasets/Yeast/yeast_protein.csv"

# Read 5594 positive and 5594 negative pairs.
yeast_dataset, yeast_label = loaddata(yeast_feature, yeast_protein, 5594, 5594);

Num of dropped positive sample: 0 , Num of dropped negative sample: 0


In [14]:
# Layer sizes handed to winit: HIDDENSSEP is used for each of the two separate
# weight sets, HIDDENSMER for the merged one.
# NOINPUTS, NOOUTPUTS, trnper and tstper are not defined in this notebook section;
# presumably they come from deepppiutils.ipynb (TODO confirm).
HIDDENSSEP = Any[NOINPUTS, 512, 256, 128];
HIDDENSMER = Any[256, 128, NOOUTPUTS]
NOEPOCH = 30;
BATCHSIZE = 64;
PDROP = (0, 0.2);

# One entry is appended per run; the score-writing cell reads these afterwards.
accuracy_ = []
precision = []
npv = []
recall = []
specifity = []
mcc = []
f1 = []
accuracyt = 0.0
recalli = 0.0
specifityi = 0.0
precisioni = 0.0
mcci = 0.0

for fold in 1:5
    # fresh weights for every run
    w = vcat(winit(HIDDENSSEP...), winit(HIDDENSSEP...), winit(HIDDENSMER...))

    # new random train/test split for every run
    dtrn, dtst = trn_tst_split(yeast_dataset, yeast_label, trnper, tstper; batchsize=BATCHSIZE)

    optims = params(w; optim="Momentum", lr=0.05, gamma=0.9)
    @time trainSep!(w, optims, dtrn, predictSep, dtst; pdrop=PDROP, epochs=NOEPOCH)

    println("Yeast Dataset, fold: ", fold)
    println("Training: min. loss =", loss(w, dtrn, predictSep), ", min. error =", zeroone(w, dtrn, predictSep))
    println("Test: min. loss =", loss(w, dtst, predictSep), ", min. error =", zeroone(w, dtst, predictSep))

    scores = modelevaluation(w, dtst, predictSep; p=true)
    accuracyt, recalli, specifityi, precisioni, mcci, f1i, npvi = scores
    for (store, value) in zip((accuracy_, recall, specifity, precision, mcc, f1, npv), scores)
        push!(store, value)
    end
end

(:epoch, 10, :trn, 0.9734848484848485, :tst, 0.9207149621212121)
(:epoch, 20, :trn, 0.993844696969697, :tst, 0.9366950757575757)
(:epoch, 30, :trn, 0.9969223484848485, :tst, 0.9339094065656565)
 79.755301 seconds (7.55 M allocations: 39.367 GiB, 5.22% gc time)
Yeast Dataset, fold: 1
Training: min. loss =0.009232941, min. error =0.0030776515151514916
Test: min. loss =0.47611618, min. error =0.06609059343434354
TP: 1245 , TN: 1368 , FP: 134 , FN: 50
Model evaluation:
Accuracy : 0.9342152302702127
Precision : 0.9028281356759766
NPV : 0.9647390684310725
Sensitivity / Recall : 0.9613899606475752
Specifity : 0.9107856185680521
MCC : 0.8698683412357969
F1 : 0.9311892292703107
(:epoch, 10, :trn, 0.9533617424242424, :tst, 0.9210148358585858)
(:epoch, 20, :trn, 0.9918323863636364, :tst, 0.9409011994949494)
(:epoch, 30, :trn, 0.9957386363636364, :tst, 0.9372001262626263)
 83.289743 seconds (7.55 M allocations: 39.367 GiB, 5.15% gc time)
Yeast Dataset, fold: 2
Training: min. loss =0.010866349, min

In [8]:
# Write the per-run metrics and their averages as a tab-separated table.
open("DeepPPI-YeastDataset_Scores.txt", "w") do f
    rule = "__________________________________________________________________________________________________________________________________\n"
    # column order of every row below: accuracy, precision, npv, recall, specifity, mcc
    columns = (accuracy_, precision, npv, recall, specifity, mcc)

    write(f, "Dataset \tAccuracy\t\t\tPrecision\t\t\tnpv      \t\t\tRecall   \t\t\tSpecifity\t\t\tMCC\n")
    write(f, rule)
    for i in 1:5
        cells = map(col -> string(col[i]), columns)
        write(f, "dataset" * string(i) * "\t" * join(cells, "\t") * "\n")
    end
    write(f, rule)
    averages = map(col -> string(mean(col)), columns)
    write(f, "Average" * "\t\t" * join(averages, "\t") * "\n")
end;
## DeepPPI: acc=0.948159, precision=0.972388, npv=0.925875, sensitivity=0.923459, specificity=0.973304, mcc=0.897513

# Pylori Dataset

In [9]:
# Input files for the pylori data set: per-protein features and the protein-pair list.
pylori_feature = "test_datasets/pylori/pylori_feature.csv"
pylori_protein = "test_datasets/pylori/pylori_protein.csv"

# Read 1458 positive and 1458 negative pairs.
pylori_dataset, pylori_label = loaddata(pylori_feature, pylori_protein, 1458, 1458);

Num of dropped positive sample: 0 , Num of dropped negative sample: 34


In [12]:
# Layer sizes handed to winit: HIDDENSSEP is used for each of the two separate
# weight sets, HIDDENSMER for the merged one.
# NOINPUTS, NOOUTPUTS, trnper and tstper are not defined in this notebook section;
# presumably they come from deepppiutils.ipynb (TODO confirm).
HIDDENSSEP = Any[NOINPUTS, 512, 256, 128]; 
HIDDENSMER = Any[256, 128, NOOUTPUTS]
NOEPOCH = 30;
BATCHSIZE = 64;
PDROP = (0, 0.2);

# One entry is appended per run; the score-writing cell reads these afterwards.
accuracy_ =[]
recall=[]
specifity=[]
precision= []
mcc=[]
f1=[]
npv=[]
accuracyt= recalli=specifityi=precisioni=mcci = 0.0
for i in 1:5
    # fresh weights for every run
    wa = winit(HIDDENSSEP...);
    wb = winit(HIDDENSSEP...);
    wMerged = winit(HIDDENSMER...);
    w = vcat(wa, wb, wMerged);
    
    # new random train/test split for every run
    dtrn, dtst = trn_tst_split(pylori_dataset,pylori_label, trnper, tstper; batchsize= BATCHSIZE);
    
    optims = params(w; optim="Momentum", lr=0.05, gamma=0.9);
    @time trainSep!(w, optims, dtrn, predictSep, dtst; pdrop=PDROP, epochs=NOEPOCH) 
    
    # This cell trains on the pylori data, so the log line must say so
    # (it previously printed "Yeast Dataset", copied from the yeast cell).
    println("Pylori Dataset, fold: ", i)
    println("Training: min. loss =",loss(w,dtrn,predictSep),", min. error =",zeroone(w,dtrn,predictSep))  
    println("Test: min. loss =",loss(w,dtst,predictSep),", min. error =",zeroone(w,dtst,predictSep))  
    
    accuracyt,recalli,specifityi,precisioni,mcci,f1i,npvi = modelevaluation(w, dtst, predictSep; p=true);
    push!(accuracy_, accuracyt)
    push!(recall, recalli)
    push!(specifity, specifityi)
    push!(precision, precisioni)
    push!(mcc, mcci)
    push!(f1, f1i)
    push!(npv, npvi)
    
end

(:epoch, 10, :trn, 0.4970175570228092, :tst, 0.4891237745098039)
(:epoch, 20, :trn, 0.8444252701080431, :tst, 0.7635569852941176)
(:epoch, 30, :trn, 0.9935661764705882, :tst, 0.8403799019607843)
 20.148107 seconds (1.95 M allocations: 10.144 GiB, 5.99% gc time)
Yeast Dataset, fold: 1
Training: min. loss =0.02283014, min. error =0.006433823529411797
Test: min. loss =0.8716205, min. error =0.15962009803921573
TP: 317 , TN: 298 , FP: 56 , FN: 50
Model evaluation:
Accuracy : 0.8529819683037698
Precision : 0.8498659494641664
NPV : 0.8563218366197648
Sensitivity / Recall : 0.8637602156300812
Specifity : 0.8418079072265313
MCC : 0.7058778912028615
F1 : 0.8567567555989773
(:epoch, 10, :trn, 0.4923844537815126, :tst, 0.4854473039215686)
(:epoch, 20, :trn, 0.4923844537815126, :tst, 0.4854473039215686)
(:epoch, 30, :trn, 0.9814769657863145, :tst, 0.8283547794117646)
 21.655956 seconds (1.95 M allocations: 10.144 GiB, 5.48% gc time)
Yeast Dataset, fold: 2
Training: min. loss =0.052388802, min. err

In [13]:
# Write the per-run metrics and their averages as a tab-separated table.
open("DeepPPI-PyloriDataset_Scores.txt", "w") do f
    rule = "__________________________________________________________________________________________________________________________________\n"
    # column order of every row below: accuracy, precision, npv, recall, specifity, mcc
    columns = (accuracy_, precision, npv, recall, specifity, mcc)

    write(f, "Dataset \tAccuracy\t\t\tPrecision\t\t\tnpv      \t\t\tRecall   \t\t\tSpecifity\t\t\tMCC\n")
    write(f, rule)
    for i in 1:5
        cells = map(col -> string(col[i]), columns)
        write(f, "dataset" * string(i) * "\t" * join(cells, "\t") * "\n")
    end
    write(f, rule)
    averages = map(col -> string(mean(col)), columns)
    write(f, "Average" * "\t\t" * join(averages, "\t") * "\n")
end;

# Human Dataset

In [17]:
# Input files for the human data set: per-protein features and the protein-pair list.
human_feature = "test_datasets/Human/human_feature.csv"
human_protein = "test_datasets/Human/human_protein.csv"

# Read 3899 positive and 4262 negative pairs.
human_dataset, human_label = loaddata(human_feature, human_protein, 3899, 4262);

Num of dropped positive sample: 0 , Num of dropped negative sample: 0


In [18]:
# Layer sizes handed to winit: HIDDENSSEP is used for each of the two separate
# weight sets, HIDDENSMER for the merged one.
# NOINPUTS, NOOUTPUTS, trnper and tstper are not defined in this notebook section;
# presumably they come from deepppiutils.ipynb (TODO confirm).
HIDDENSSEP = Any[NOINPUTS, 512, 256, 128];
HIDDENSMER = Any[256, 128, NOOUTPUTS]
NOEPOCH = 30;
BATCHSIZE = 64;
PDROP = (0, 0.2);

# One entry is appended per run; the score-writing cell reads these afterwards.
accuracy_ = []
precision = []
npv = []
recall = []
specifity = []
mcc = []
f1 = []
accuracyt = 0.0
recalli = 0.0
specifityi = 0.0
precisioni = 0.0
mcci = 0.0

for fold in 1:5
    # fresh weights for every run
    w = vcat(winit(HIDDENSSEP...), winit(HIDDENSSEP...), winit(HIDDENSMER...))

    # new random train/test split for every run
    dtrn, dtst = trn_tst_split(human_dataset, human_label, trnper, tstper; batchsize=BATCHSIZE)

    optims = params(w; optim="Momentum", lr=0.05, gamma=0.9)
    @time trainSep!(w, optims, dtrn, predictSep, dtst; pdrop=PDROP, epochs=NOEPOCH)

    println("Human Dataset, fold: ", fold)
    println("Training: min. loss =", loss(w, dtrn, predictSep), ", min. error =", zeroone(w, dtrn, predictSep))
    println("Test: min. loss =", loss(w, dtst, predictSep), ", min. error =", zeroone(w, dtst, predictSep))

    scores = modelevaluation(w, dtst, predictSep; p=true)
    accuracyt, recalli, specifityi, precisioni, mcci, f1i, npvi = scores
    for (store, value) in zip((accuracy_, recall, specifity, precision, mcc, f1, npv), scores)
        push!(store, value)
    end
end

(:epoch, 10, :trn, 0.9628580729166667, :tst, 0.9411663925438597)
(:epoch, 20, :trn, 0.9988606770833334, :tst, 0.9754660087719298)
(:epoch, 30, :trn, 0.9991861979166666, :tst, 0.9706431606359649)
 62.519110 seconds (5.49 M allocations: 28.666 GiB, 5.33% gc time)
Human Dataset, fold: 1
Training: min. loss =0.0027028145, min. error =0.0008138020833333703
Test: min. loss =0.1807733, min. error =0.029356839364035103
TP: 936 , TN: 1045 , FP: 47 , FN: 13
Model evaluation:
Accuracy : 0.9706026452863289
Precision : 0.9521871811269713
NPV : 0.9877126644728613
Sensitivity / Recall : 0.9863013688237078
Specifity : 0.9569597060833701
MCC : 0.9415789623067146
F1 : 0.9689440988773581
(:epoch, 10, :trn, 0.9585611979166666, :tst, 0.9456208881578947)
(:epoch, 20, :trn, 0.9986979166666666, :tst, 0.9754060444078947)
(:epoch, 30, :trn, 0.9998372395833334, :tst, 0.9755259731359649)
 60.155320 seconds (5.49 M allocations: 28.666 GiB, 5.34% gc time)
Human Dataset, fold: 2
Training: min. loss =0.0010411941, mi

In [19]:
# Write the per-run metrics and their averages as a tab-separated table.
open("DeepPPI-HumanDataset_Scores.txt", "w") do f
    rule = "__________________________________________________________________________________________________________________________________\n"
    # column order of every row below: accuracy, precision, npv, recall, specifity, mcc
    columns = (accuracy_, precision, npv, recall, specifity, mcc)

    write(f, "Dataset \tAccuracy\t\t\tPrecision\t\t\tnpv      \t\t\tRecall   \t\t\tSpecifity\t\t\tMCC\n")
    write(f, rule)
    for i in 1:5
        cells = map(col -> string(col[i]), columns)
        write(f, "dataset" * string(i) * "\t" * join(cells, "\t") * "\n")
    end
    write(f, rule)
    averages = map(col -> string(mean(col)), columns)
    write(f, "Average" * "\t\t" * join(averages, "\t") * "\n")
end;