In [1]:
for p in ("Knet", "Plots", "NBInclude")
    Pkg.installed(p) == nothing && Pkg.add(p);
end

In [2]:
using Knet, Plots, NBInclude;

In [12]:
function constructFeatDict()
    # read features for all proteins
    f = open("yeast_feature_all.csv");
    lines = readlines(f);
    close(f);
    numberOfProteins = length(lines) 
    featureNames = String.(split(lines[1],",")); 
    d = length(featureNames) - 1 # number of features per protein
    proteins = lines[2:numberOfProteins];
    featuresDict = Dict{String,Any}()
    for p in proteins
        featureVect = String.(split(p, ","));
        featuresDict[featureVect[1]] = parse.(Float32, featureVect[2:d+1]) # featureVect[1] is the protein UniProt ID 
    end
    f = open("test_datasets/Negatome_features.csv");
    lines = readlines(f);
    close(f);
    numberOfProteins = length(lines) 
    featureNames = String.(split(lines[1],",")); 
    d = length(featureNames) - 1 # number of features per protein
    proteins = lines[2:numberOfProteins];
    for p in proteins
        featureVect = String.(split(p, ","));
        if("" in featureVect[2:d+1])
            continue
        end
        featuresDict[featureVect[1]] = parse.(Float32, featureVect[2:d+1]) # featureVect[1] is the protein UniProt ID 
    end
    return featuresDict;
end;

In [40]:
function loaddata(featuresDict)
    f = open("yeast_protein_pair.csv")
    lines = readlines(f);
    close(f)
    f = open("test_datasets/Negatome_pairs.csv")
    negsamples = readlines(f);
    close(f)
    n = length(lines); # number of samples/ protein pairs
    samples = lines[2:end - 1];
    possamples = samples[1:17257]
    concatAB1 = []
    ygold = Array{UInt8,1}(length(possamples) + length(negsamples));
    i = 1;
    for s in possamples
        s = String.(split(s, ","));
        push!(concatAB1, hcat(reshape(mat(featuresDict[s[1]]), 1, 1164), reshape(mat(featuresDict[s[2]]), 1, 1164)))
        ygold[i] = 0x02;
        i += 1;
    end
    
    
    n = length(negsamples); # number of samples/ protein pairs
    for s in negsamples
        s = String.(split(s, "\t"));
        if s[1] in keys(featuresDict) && s[2] in keys(featuresDict) 
            push!(concatAB1, hcat(reshape(mat(featuresDict[s[1]]), 1, 1164), reshape(mat(featuresDict[s[2]]), 1, 1164)))
            ygold[i] = 0x01;
            i += 1;
        end
    end
    
    concatAB = vcat(map(Atype, concatAB1)...)
    return concatAB, ygold
end;

In [10]:
function winit(h...)
    w = Any[]
    for i=2:length(h)
        push!(w, 0.01*randn(h[i],h[i-1]))
        push!(w, zeros(h[i],1))
    end
    map(Atype, w)
end;

In [45]:
function dividedataset(data, ygold, trnper, devper, tstper; batchsize=64, dev=true) # 0.58, 0.17, 0.25
    nopos = 17257 # number of total positive samples
    noneg = 6222
    # construct a 1:1 ratio of positive and negative samples as the data set
    posdata = data[1 : nopos, :];
    posygold = ygold[1 : nopos];
    negdata = data[nopos + 1 : end, :];
    negygold = ygold[nopos + 1 : end];
    
    # pick noneg positive samples randomly
    indneg = randperm(noneg)
    data = vcat(posdata[indneg[1:end],:], negdata)
    ygold = vcat(posygold[indneg[1:end]], negygold)
    println(summary(data))
    nosamples = size(data,1)
    notst = Int(floor(tstper * nosamples))
    notrn = Int(floor(trnper * nosamples))
    nodev = nosamples - notrn - notst
    ind = randperm(nosamples)
    
    xtrn = data[ind[1:notrn],:];
    ytrn = ygold[ind[1:notrn]];
    
    xtst = data[ind[notrn+1:notrn+notst], :];
    ytst = ygold[ind[notrn+1:notrn+notst]];
    
    dtrn = minibatchi(xtrn',ytrn,batchsize);
    dtst = minibatchi(xtst',ytst,batchsize);
    
    if (dev)
        xdev = data[ind[notrn+1:notrn+nodev], :];
        ydev = ygold[ind[notrn+1:notrn+nodev]];
        ddev = minibatchi(xdev',ydev,batchsize);
        return dtrn, ddev, dtst
    else
        return dtrn, dtst
    end
    
end;

In [24]:
function predictSep(w, x; pdrop=(0,0))
    wa = w[1:6]
    wb = w[7:12]
    wm = w[13:end]
    xa = x[1:1164,:]
    xb = x[1165:end,:]
    
    for i=1:2:length(wa)
        xa = dropout(xa, pdrop[i==1?1:2])
        xa = wa[i]*xa .+ wa[i+1]
        xa = relu.(xa)                      
    end
    
    for i=1:2:length(wb)
        xb = dropout(xb, pdrop[i==1?1:2])
        xb = wb[i]*xb .+ wb[i+1]
        xb = relu.(xb)                         
    end
    
    xm = vcat(xa, xb)
    
    for i=1:2:length(wm)
        xm = dropout(xm, pdrop[i==1?2:1])
        xm = wm[i]*xm .+ wm[i+1]
        if i<length(wm)-1
            xm = relu.(xm)   ## apply RELU to all but the final layer's output                        
        end
    end
    return xm
end;

In [28]:
# Train model(w) with SGD and return a list containing w for every epoch
function trainSep!(w, optims,data,predict, ddev; epochs=100,lr=.01,o...) #, decay=1.0
    #trnloss = Any[loss(w,data,predict)]
    #trnerr = Any[zeroone(w,data,predict)]
    #devloss = Any[loss(w,ddev,predict)]
    #deverr = Any[zeroone(w,ddev,predict)]
    for epoch in 1:epochs
        for (x,y) in data
            dw = lossgradient(w,x,y, predict; o...)
            #@show (map(vecnorm, dw))
            update!(w, dw, optims)
        end
        #push!(trnloss, loss(w,data,predict))
        #push!(trnerr, zeroone(w,data,predict))
        #push!(devloss, loss(w,ddev,predict))
        #push!(deverr, zeroone(w,ddev,predict))
        if(epoch % 10 == 0)
            println((:epoch,epoch,:trn,accuracyi(w,data,predict),:tst,accuracyi(w,ddev,predict)));
        end

    end
    #return trnloss, trnerr, devloss, deverr
end;

In [26]:
loss(w,x,ygold, predict; o...) = nll(predict(w,x; o...),ygold);
loss(w, data, predict; o...) = mean(loss(w,x,y,predict; o...) for (x,y) in data);
zeroone(w,data,predict; o...) = 1 - accuracyi(w,data,predict);
lossgradient = grad(loss);
report(epoch)=println((:epoch,epoch,:trn,accuracyi(w,dtrn,predict),:dev,accuracyi(w,ddev,predict)));

In [30]:
function params(ws; optim="Sgd", lr=0.01, gamma=0.95, eps=1e-6, rho=0.9, beta1=0.9, beta2=0.95)
    prms = Any[]

    for i=1:length(ws)
        w = ws[i]
        if optim == "Sgd"
            prm = Sgd(;lr=lr)
        elseif optim == "Momentum"
            prm = Momentum(lr=lr, gamma=gamma)
        elseif optim == "Nesterov"
            prm = Nesterov(lr=lr, gamma=gamma)
        elseif optim == "Adagrad"
            prm = Adagrad(lr=lr, eps=eps)
        elseif optim == "Adadelta"
            prm = Adadelta(lr=lr, rho=rho, eps=eps)
        elseif optim == "Rmsprop"
            prm = Rmsprop(lr=lr, rho=rho, eps=eps)
        elseif optim == "Adam"
            prm = Adam(lr=lr, beta1=beta1, beta2=beta2, eps=eps)
        else
            error("Unknown optimization method!")
        end
        push!(prms, prm)
    end

    return prms
end;

In [31]:
function accuracyi(ypred, ygold)
    count = 0
    for i in 1:size(ypred, 2)
        if((ypred[1,i] >= ypred[2,i] && ygold[i]==1) || (ypred[1,i] <= ypred[2,i] && ygold[i]==2))
            count +=1
        end
    end
    return count/size(ypred, 2);
end;

In [33]:
function accuracyi(w, data, predict)
    acc = 0;
    for (x, y) in data
        ypred = predict(w,x)
        acc += accuracyi(ypred, y) 
    end
    return acc/length(data)
end;

In [34]:
function modelevaluation(w, data, pred; p=false)
    tp=tn=fp=fn= 0    

    for (x,y) in data
        ypred = pred(w, x)
        for i in 1:size(ypred, 2)
            if(ypred[1,i] >= ypred[2,i] && y[i]==1)
                tn += 1;
            elseif(ypred[1,i] <= ypred[2,i] && y[i]==2)
                tp += 1
            elseif(ypred[1,i] >= ypred[2,i] && y[i]==2)
                fp += 1
            else
                fn += 1
            end
        end
    end
    accuracy = (tp + tn) / (tp + tn + fn + fp+1e-06)
    recall = tp / (tp + fn +1e-06)
    specifity = tn / (tn + fp+1e-06)
    precision = tp / (tp + fp +1e-06)
    mcc = (tp*tn-fp*fn)/(sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn))+1e-06)
    f1 = (tp*2)/(tp*2+fp+fn+1e-06)
    npv = (tn)/(tn + fn + 1e-06)
    if (p)
        println("TP: ", tp, " , TN: ", tn, " , FP: ", fp, " , FN: ", fn);
        println("Model evaluation:");
        println("Accuracy : ", accuracy);
        println("Precision : ", precision);
        println("NPV : ", npv);
        println("Sensitivity / Recall : ", recall);
        println("Specifity : ", specifity);
        println("MCC : ", mcc);
        println("F1 : ", f1);
    end
    
    return accuracy,recall,specifity,precision,mcc,f1,npv;
end;

In [36]:
# Input X matrix and gold labels Y
# Output list of minibatches (x, y)
function minibatchi(X, Y, batchsize)
    data = Any[] 
    meanx = mean(X,2);
    if(gpu() < 0)
        # CPU
        stdx=std(X,2);
    else
        # for GPU, manually calculate std for Knet arrays
        stdx= sqrt.(sum(abs2.(X.-meanx), 2)/(size(X,2) - 1));
    end
    X = (X.-meanx) ./ stdx;

    for i = 1:batchsize:size(X, 2)
        bl = min(i + batchsize - 1, size(X, 2))
        push!(data, (X[:, i:bl], Y[i:bl]))
    end
    return data
end;

In [50]:
function train!(w, optims,data,predict, ddev; epochs=100,lr=.5,o...)
    #trnloss = Any[loss(w,data,predict)]
    #trnerr = Any[zeroone(w,data,predict)]
    #devloss = Any[loss(w,ddev,predict)]
    #deverr = Any[zeroone(w,ddev,predict)]
    for epoch in 1:epochs
        if(epoch % 10 == 0)
            println((:epoch,epoch,:trn,accuracyi(w,data,predict),:tst,accuracyi(w,ddev,predict)));
        end
        for (x,y) in data
            dw = lossgradient(w,x,y, predict; o...)
            #@show (map(vecnorm, dw))
            update!(w, dw, optims)
#             for i in 1:length(w)
#                w[i] -= lr * dw[i]
#             end
        end
        #push!(trnloss, loss(w,data,predict))
        #push!(trnerr, zeroone(w,data,predict))
        #push!(devloss, loss(w,ddev,predict))
        #push!(deverr, zeroone(w,ddev,predict))
    end
    #return trnloss, trnerr , devloss, deverr
end;

In [51]:
function predict(w,x; pdrop=(0,0))
    for i=1:2:length(w)
        x = dropout(x, pdrop[i==1 || i == length(w)-1?1:2])
        x = w[i]*x .+ w[i+1]
        if i < length(w)-1
            x = relu.(x)   ## apply RELU to all but the final layer's output                        
        end
    end
    return x
end;

In [42]:
Atype = gpu() >= 0 ? KnetArray{Float32} : Array{Float32};
setseed(1);

# number of input features per protein
NOINPUTS = 1164;
# number of input features for the protein pair
NOCONCAT = NOINPUTS * 2;
# output is a one-hot-vector 10 -> not interacting, 01 -> intracting
NOOUTPUTS = 2;

# the percentages used for evaluationg models in the paper
trnper = 0.75;
tstper = 0.25;
devper = 1 - trnper - tstper;

In [43]:
featuresDict = constructFeatDict();
concatAB, ygold = loaddata(featuresDict);

(Float32[7.5 1.90476 … 0.002937 0.001369; 4.14971 1.4646 … 0.000999 -0.001443; … ; 5.74713 0.574713 … 0.006669 0.00223; 9.77444 3.00752 … 0.006374 0.00367], UInt8[0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02  …  0x38, 0x2c, 0x30, 0x2e, 0x30, 0x30, 0x30, 0x32, 0x30, 0x33])

In [46]:
BATCHSIZE = 64;
dtrn, ddev, dtst = dividedataset(concatAB, ygold, trnper, devper, tstper; batchsize= BATCHSIZE);

12444×2328 Array{Float32,2}


In [47]:
# the number of hidden units in the hidden layers of the DeepPPI-CON model
HIDDENSSEP = Any[NOINPUTS, 512, 256, 128]; 
HIDDENSMER = Any[256, 128, NOOUTPUTS]
NOEPOCH = 30;
BATCHSIZE = 64;
PDROP = (0, 0.2);

In [48]:
accuracy =[]
recall=[]
specifity=[]
precision= []
mcc=[]
f1=[]
npv=[]
accuracyt= recalli=specifityi=precisioni=mcci = 0.0
for i in 1:5
    #setseed(i);
    wa = winit(HIDDENSSEP...);
    wb = winit(HIDDENSSEP...);
    wMerged = winit(HIDDENSMER...);
    w = vcat(wa, wb, wMerged);
    
    #dtrn, ddev, dtst = dividedataset(concatAB, ygold, trnper, devper, tstper; batchsize= BATCHSIZE);
    dtrn, dtst = dividedataset(concatAB, ygold, trnper, devper, tstper; batchsize= BATCHSIZE, dev=false);
    
    optims = params(w; optim="Momentum", lr=0.01, gamma=0.9);
    #@time trnloss, trnerr, tstloss, tsterr=trainSep!(w, optims, dtrn, predictSep, ddev; pdrop=PDROP, epochs=NOEPOCH) 
    @time trainSep!(w, optims, dtrn, predictSep, dtst; pdrop=PDROP, epochs=NOEPOCH) 
    
    println("Dataset", i)
    println("Training: min. loss =",loss(w,dtrn,predictSep),", min. error =",zeroone(w,dtrn,predictSep))  
    println("Test: min. loss =",loss(w,dtst,predictSep),", min. error =",zeroone(w,dtst,predictSep))  
    
    accuracyt,recalli,specifityi,precisioni,mcci,f1i,npvi = modelevaluation(w, dtst, predictSep; p=true);
    push!(accuracy, accuracyt)
    push!(recall, recalli)
    push!(specifity, specifityi)
    push!(precision, precisioni)
    push!(mcc, mcci)
    push!(f1, f1i)
    push!(npv, npvi)
    
    writedlm("DeepPPI_SepModel"*string(i)*".csv", map(Array, w))
end

12444×2328 Array{Float32,2}
(:epoch, 10, :trn, 0.9476407017317136, :tst, 0.9344256933542647)
(:epoch, 20, :trn, 0.995291095890411, :tst, 0.962805795395081)
(:epoch, 30, :trn, 0.9972174657534246, :tst, 0.9633290816326531)
 99.946530 seconds (10.43 M allocations: 43.746 GiB, 5.29% gc time)
Dataset1
Training: min. loss =0.008437758, min. error =0.0027825342465753744
Test: min. loss =0.17190474, min. error =0.03667091836734693
TP: 1471 , TN: 1525 , FP: 67 , FN: 48
Model evaluation:
Accuracy : 0.9630343937759452
Precision : 0.9564369304574531
NPV : 0.9694850597778225
Sensitivity / Recall : 0.968400262693614
Specifity : 0.9579145722626164
MCC : 0.9261183930042733
F1 : 0.9623814193776966
12444×2328 Array{Float32,2}
(:epoch, 10, :trn, 0.9167965721116569, :tst, 0.9038625065410779)
(:epoch, 20, :trn, 0.9955899457224089, :tst, 0.9665423861852432)
(:epoch, 30, :trn, 0.9973022744895321, :tst, 0.9641957744636316)
112.256977 seconds (8.35 M allocations: 43.642 GiB, 4.85% gc time)
Dataset2
Training: m

In [49]:
#summary(accuracy)
open("DeepPPI-Sep_Scores_wNegatome.txt", "w") do f
    write(f, "Dataset \tAccuracy\t\t\tPrecision\t\t\tnpv      \t\t\tRecall   \t\t\tSpecifity\t\t\tMCC\n")
    write(f, "__________________________________________________________________________________________________________________________________\n")
    for i in 1:5
        write(f, "dataset"*string(i)*"\t"*string(accuracy[i]) *"\t"* string(precision[i]) *"\t"* string(npv[i]) *"\t"* string(recall[i]) *"\t"*  string(specifity[i]) *"\t"*  string(mcc[i]) *"\n")
    end
    write(f, "__________________________________________________________________________________________________________________________________\n")
    write(f, "Average"*"\t\t"*string(mean(accuracy)) *"\t"* string(mean(precision))  *"\t"* string(mean(npv)) *"\t"* string(mean(recall)) *"\t"*  string(mean(specifity)) *"\t"*  string(mean(mcc)) *"\n")
end;
#println("Accuracy", "   Precision", "   npv"," recall", "    specifity", "     mcc", "       f1")
#(hcat(accuracy, precision, npv, recall, specifity, mcc, f1))