In [None]:
from google.colab import drive

# Mount Google Drive (Colab only); the cached pickles used below are read from paths under /content/drive/MyDrive.
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:

class DefaultConfig(object):
    """Static hyper-parameters and cache locations shared by the notebook cells."""

    # 20-slot zero template and letter -> column lookup for the amino-acid alphabet.
    acid_one_hot = [0] * 20
    acid_idex = dict(zip("ACDEFGHIKLMNPQRSTVWY", range(20)))

    # Feature caches live below the Drive copy of the project.
    BASE_PATH = "/content/drive/MyDrive/DeepPPISP-master"
    sequence_path = "{0}/data_cache/sequence_data".format(BASE_PATH)
    pssm_path = "{0}/data_cache/pssm_data".format(BASE_PATH)
    dssp_path = "{0}/data_cache/dssp_data".format(BASE_PATH)

    # Global (whole-protein) length cap and half-width of the local window.
    max_sequence_length = 500
    windows_size = 3

    batch_size = 32

    # Per-residue feature widths.
    seq_dim, dssp_dim, pssm_dim = 20, 9, 20

    # Convolution kernel heights, dropout rate and train/validation split ratio.
    kernels = [13, 15, 17]
    dropout = 0.2
    splite_rate = 0.9



In [None]:

import os
import time
import pickle
import torch as t
import numpy as np
from torch.utils import data


class dataSet(data.Dataset):
    """Per-residue dataset yielding whole-protein ("global") and windowed ("local") features.

    Each item is the tuple
    ``(all_seq_features, all_pssm_features, all_dssp_features, local_features, label)``:

    * the three ``all_*`` arrays have shape ``(1, max_seq_len, width)`` with widths
      20 / 20 / 9, truncated or zero-padded to ``max_seq_len`` rows;
    * ``local_features`` is a flat vector of ``(2 * window_size + 1) * 49`` values
      (one-hot, PSSM row, DSSP row per window position; positions outside the
      protein are all zeros);
    * ``label`` is a 0-d ``float32`` array taken at the window centre.

    NOTE(review): the pickles are assumed to hold per-protein lists (sequence as
    integer residue indices, PSSM/DSSP as rows of 20/9 numbers) -- confirm against
    the cache-building script.
    """

    _SEQ_DIM = 20
    _PSSM_DIM = 20
    _DSSP_DIM = 9

    def __init__(self,window_size,sequences_file=None,pssm_file=None, dssp_file=None, label_file=None, protein_list_file=None):
        """Load and concatenate the per-dataset pickles.

        Args:
            window_size: half-width of the local window around each residue.
            sequences_file, pssm_file, dssp_file, label_file: iterables of pickle
                paths; the unpickled lists are concatenated in the given order.
            protein_list_file: pickle path of the residue index list; each entry
                unpacks to ``(count, id_idx, ii, dset, protein_id, seq_length)``.
        """
        super(dataSet,self).__init__()

        self.all_sequences = self._load_concatenated(sequences_file)
        self.all_pssm = self._load_concatenated(pssm_file)
        self.all_dssp = self._load_concatenated(dssp_file)
        self.all_label = self._load_concatenated(label_file)

        with open(protein_list_file, "rb") as list_label:
            self.protein_list = pickle.load(list_label)

        self.Config = DefaultConfig()
        self.max_seq_len = self.Config.max_sequence_length
        self.window_size = window_size

    @staticmethod
    def _load_concatenated(paths):
        """Unpickle every file in ``paths`` and chain the results into one list."""
        merged = []
        for path in paths:
            with open(path, "rb") as fp:
                merged.extend(pickle.load(fp))
        return merged

    @staticmethod
    def _one_hot(idx, size=20):
        """Return a ``size``-long list of zeros with a 1 at position ``idx``."""
        vec = [0] * size
        vec[idx] = 1
        return vec

    @staticmethod
    def _pad_rows(rows, target_len, width):
        """Copy ``rows`` into a new list and append zero rows until it has ``target_len`` entries."""
        padded = list(rows)
        padded.extend([0] * width for _ in range(target_len - len(padded)))
        return padded

    def __getitem__(self,index):
        """Build the feature tuple for the residue described by ``protein_list[index]``."""
        count,id_idx,ii,dset,protein_id,seq_length = self.protein_list[index]
        id_idx = int(id_idx)
        ii = int(ii)
        seq_length = int(seq_length)
        win_start = ii - self.window_size
        win_end = ii + self.window_size

        sequence = self.all_sequences[id_idx]
        pssm = self.all_pssm[id_idx]
        dssp = self.all_dssp[id_idx]

        # Whole-protein features, truncated / zero-padded to max_seq_len rows.
        all_seq_features = self._pad_rows(
            [self._one_hot(idx, self._SEQ_DIM) for idx in sequence[:self.max_seq_len]],
            self.max_seq_len, self._SEQ_DIM)
        all_pssm_features = self._pad_rows(
            pssm[:self.max_seq_len], self.max_seq_len, self._PSSM_DIM)
        all_dssp_features = self._pad_rows(
            dssp[:self.max_seq_len], self.max_seq_len, self._DSSP_DIM)

        # Local window: positions before the first or after the last residue are zero blocks.
        residue_width = self._SEQ_DIM + self._PSSM_DIM + self._DSSP_DIM
        valid_end = min(win_end,seq_length-1)
        local_features = []
        for pos in range(win_start, win_end + 1):
            if pos < 0 or pos > valid_end:
                local_features.extend([0] * residue_width)
                continue
            local_features.extend(self._one_hot(sequence[pos], self._SEQ_DIM))
            local_features.extend(pssm[pos])
            try:
                dssp_val = dssp[pos]
            except IndexError:
                # DSSP track shorter than the sequence: fall back to zeros
                # (was a bare `except:` that also hid unrelated errors).
                dssp_val = [0] * self._DSSP_DIM
            local_features.extend(dssp_val)

        # The label belongs to the window centre, i.e. residue ``ii``.
        label = np.array(self.all_label[id_idx][ii],dtype=np.float32)

        all_seq_features = np.stack(all_seq_features)[np.newaxis,:,:]
        all_pssm_features = np.stack(all_pssm_features)[np.newaxis,:,:]
        all_dssp_features = np.stack(all_dssp_features)[np.newaxis,:,:]
        local_features = np.stack(local_features)

        return all_seq_features,all_pssm_features,all_dssp_features,local_features,label

    def __len__(self):
        """Number of residues listed in ``protein_list``."""
        return len(self.protein_list)

In [None]:

import torch as t
import time

class BasicModule(t.nn.Module):
    """``torch.nn.Module`` base class that adds state-dict load/save helpers."""

    def __init__(self):
        super().__init__()
        # String form of the concrete class, kept as an identifier for the model.
        self.model_name = str(type(self))

    def load(self, path):
        """Restore parameters from the state dict stored at ``path``."""
        state = t.load(path)
        self.load_state_dict(state)

    def save(self, name=None):
        """Write the state dict to ``name`` and return the path used.

        When ``name`` is None a timestamped file name
        (``yymmdd_HH:MM:SS.pth``) in the working directory is generated.
        """
        target = name
        if target is None:
            target = time.strftime("%y%m%d_%H:%M:%S.pth")
        t.save(self.state_dict(), target)
        return target

In [None]:
#-*- encoding:utf8 -*-

import os
import time
import sys

import torch as t
from torch import nn
from torch.autograd import Variable


# Adds the parent directory to the import path; presumably a leftover from the
# script layout where DefaultConfig was imported from utils.config (see below).
sys.path.append("../")
# from utils.config import DefaultConfig
# Notebook-wide config instance read by ConvsLayer and read/mutated by DeepPPI.
configs = DefaultConfig()


class ConvsLayer(BasicModule):
    """Three parallel conv -> activation -> max-pool branches over the global feature map.

    Every branch convolves the single-channel input with a kernel spanning the
    full feature width (seq + dssp + pssm), pads the height so it is preserved for
    odd kernel sizes, and max-pools over ``max_sequence_length`` rows. Sizes are
    read from the module-level ``configs`` at construction time, so
    ``configs.cnn_chanel`` must already be set.
    """

    def __init__(self,):
        super(ConvsLayer,self).__init__()

        self.kernels = configs.kernels
        out_channels = configs.cnn_chanel
        pooled_rows = configs.max_sequence_length
        feature_width = configs.seq_dim + configs.dssp_dim + configs.pssm_dim

        # (attribute/conv name, pooling name, activation) per branch. Branch 1
        # uses PReLU while the other two use ReLU; every activation is registered
        # under the name "ReLU", which keeps state_dict keys unchanged.
        branch_specs = (
            ("conv1", "pooling1", nn.PReLU),
            ("conv2", "pooling2", nn.ReLU),
            ("conv3", "pooling3", nn.ReLU),
        )
        kernel_heights = [self.kernels[i] for i in range(3)]

        for (conv_name, pool_name, activation), height in zip(branch_specs, kernel_heights):
            branch = nn.Sequential()
            branch.add_module(
                conv_name,
                nn.Conv2d(1, out_channels,
                          padding=((height - 1) // 2, 0),
                          kernel_size=(height, feature_width)))
            branch.add_module("ReLU", activation())
            branch.add_module(pool_name, nn.MaxPool2d(kernel_size=(pooled_rows, 1), stride=1))
            setattr(self, conv_name, branch)

    def forward(self,x):
        """Run the three branches on ``x`` and return their outputs concatenated and flattened per sample."""
        pooled = [branch(x) for branch in (self.conv1, self.conv2, self.conv3)]
        merged = t.cat(pooled, 1)
        batch, channels, height, width = merged.shape
        return merged.view(batch, channels * height * width)






class DeepPPI(BasicModule):
    """Network combining a convolutional summary of the whole protein with local window features.

    Args:
        class_nums: width of the sigmoid output layer.
        window_size: half-width of the local window; fixes the local feature size
            ``(2 * window_size + 1) * (pssm_dim + dssp_dim + seq_dim)``.
        ratio: optional ``(global, local)`` pair. When truthy it sets
            ``configs.cnn_chanel`` so the CNN output width is roughly
            ``local_dim * global / local``. When falsy, ``configs.cnn_chanel`` must
            already exist on the module-level ``configs``.

    Side effects: construction overwrites ``configs.kernels`` and
    ``configs.dropout`` (and ``configs.cnn_chanel`` when ``ratio`` is given).
    ``self.dropout`` is stored, but no dropout layer is applied in ``forward``.
    """

    def __init__(self,class_nums,window_size,ratio=None):
        super(DeepPPI,self).__init__()
        configs.kernels = [13, 15, 17]
        configs.dropout = 0.2
        self.dropout = 0.2

        # Dense "embedding" over the flattened one-hot sequence (same width in and out).
        flat_seq_dim = configs.seq_dim*configs.max_sequence_length
        self.seq_layers = nn.Sequential()
        self.seq_layers.add_module("seq_embedding_layer", nn.Linear(flat_seq_dim, flat_seq_dim))
        self.seq_layers.add_module("seq_embedding_ReLU", nn.ReLU())

        residue_width = configs.pssm_dim + configs.dssp_dim + configs.seq_dim
        local_dim = (window_size*2+1)*residue_width
        if ratio:
            configs.cnn_chanel = (local_dim*int(ratio[0]))//(int(ratio[1])*3)
        input_dim = configs.cnn_chanel*3+local_dim

        # ConvsLayer reads configs.cnn_chanel, so it must be built after the ratio handling.
        self.multi_CNN = nn.Sequential()
        self.multi_CNN.add_module("layer_convs", ConvsLayer())

        self.DNN1 = nn.Sequential()
        self.DNN1.add_module("DNN_layer1", nn.Linear(input_dim,1024))
        self.DNN1.add_module("ReLU1", nn.ReLU())

        self.DNN2 = nn.Sequential()
        self.DNN2.add_module("DNN_layer2", nn.Linear(1024,256))
        self.DNN2.add_module("ReLU2", nn.ReLU())

        self.outLayer = nn.Sequential(
            nn.Linear(256, class_nums),
            nn.Sigmoid())

    def forward(self,seq,dssp,pssm,local_features):
        """Return sigmoid scores of shape ``(batch, class_nums)``.

        Note the argument order: ``dssp`` comes before ``pssm``.
        """
        batch, channels, rows, cols = seq.shape

        # Embed the flattened sequence, then restore the original 4-D layout.
        embedded = self.seq_layers(seq.view(batch, channels*rows*cols))
        embedded = embedded.view(batch, channels, rows, cols)

        # Global feature map: widths stacked along the last axis, then CNN-pooled.
        global_map = t.cat((embedded, dssp, pssm), 3)
        hidden = self.multi_CNN(global_map)

        hidden = t.cat((hidden, local_features), 1)
        hidden = self.DNN1(hidden)
        hidden = self.DNN2(hidden)
        return self.outLayer(hidden)

In [None]:

from __future__ import print_function
from __future__ import absolute_import

import os
import numpy as np
from collections import deque
import pickle

from sklearn.metrics import roc_curve, auc, matthews_corrcoef, precision_recall_curve,accuracy_score


def compute_roc(preds, labels):
    """Return the area under the ROC curve of flattened ``preds`` scored against flattened ``labels``."""
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(labels.flatten(), preds.flatten())
    return auc(false_pos_rate, true_pos_rate)


def compute_aupr(preds, labels):
    """Return the area under the precision-recall curve (recall on the x axis)."""
    precision, recall, _thresholds = precision_recall_curve(labels.flatten(), preds.flatten())
    return auc(recall, precision)


def compute_mcc(preds, labels, threshold=0.5):
    """Return the Matthews correlation coefficient of ``preds`` against ``labels``.

    Both arrays are cast to float64 and flattened. ``threshold`` is accepted but
    not used: ``preds`` is expected to be binarised already.
    """
    y_pred = preds.astype(np.float64).flatten()
    y_true = labels.astype(np.float64).flatten()
    return matthews_corrcoef(y_true, y_pred)


def compute_performance(preds, labels):
    """Scan thresholds 0.01 .. 0.99 and return the operating point with the best F-score.

    Returns:
        ``(f_max, p_max, r_max, t_max, predictions_max)``. If no threshold yields a
        usable score (e.g. no true positive anywhere) the result is
        ``(0, 0, 0, 0, None)``.
    """
    f_max, p_max, r_max, t_max = 0, 0, 0, 0
    predictions_max = None

    for step in range(1, 100):
        threshold = step / 100.0
        predictions = (preds > threshold).astype(np.int32)

        tp = np.sum(predictions * labels)
        fp = np.sum(predictions) - tp
        fn = np.sum(labels) - tp

        # Without a true positive neither precision nor recall is scored.
        if tp == 0:
            continue

        precision = 0.0 + tp / (1.0 * (tp + fp))
        recall = 0.0 + tp / (1.0 * (tp + fn))
        if not (precision + recall > 0):
            continue

        f_score = 2 * precision * recall / (precision + recall)
        # Strict comparison: on ties the lowest threshold wins.
        if f_max < f_score:
            f_max, p_max, r_max = f_score, precision, recall
            t_max = threshold
            predictions_max = predictions

    return f_max, p_max, r_max, t_max, predictions_max


def micro_score(output, label):
    """Micro-averaged precision, recall and F-score of binary ``output`` against ``label``.

    Returns:
        ``(MiP, MiR, MiF, predicted_positive_rate, actual_positive_rate)``; the
        two rates are the positive counts divided by ``len(output)``.
    """
    sample_count = len(output)
    predicted_pos = np.sum(output)
    actual_pos = np.sum(label)
    true_pos = float(np.sum(output * label))

    # The tiny floor avoids a zero denominator when nothing is predicted / positive.
    MiP = true_pos / max(predicted_pos, 1e-12)
    MiR = true_pos / max(actual_pos, 1e-12)
    MiF = 0 if true_pos == 0 else 2 * MiP * MiR / (MiP + MiR)

    return MiP, MiR, MiF, predicted_pos / sample_count, actual_pos / sample_count

def acc_score(output,label):
    """Return the accuracy of flattened binary ``output`` against flattened ``label``."""
    return accuracy_score(label.flatten(), output.flatten())

# Script entry-point guard with an empty body; nothing is executed by this cell.
if __name__ == '__main__':
    pass

In [None]:
#-*- encoding:utf8 -*-

import os
import time
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, matthews_corrcoef, confusion_matrix
import pickle
import numpy as np
import torch
from torch.optim import lr_scheduler
from torch.nn.init import xavier_normal,xavier_normal_
from torch import nn
import torch.utils.data.sampler as sampler
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score
from imblearn.over_sampling import SMOTENC,SMOTE
from sklearn.datasets import make_classification
from torch.utils.data import DataLoader, TensorDataset
import gc


# Rebinds the notebook-wide `configs` to a fresh DefaultConfig instance.
configs = DefaultConfig()
# Module-level decision threshold; save_data declares it `global`.
THREADHOLD = 0.2


def _flatten_features(sample):
    """Concatenate every component of a dataset item except the trailing label into one 1-D array."""
    return np.concatenate([
        comp.numpy().ravel() if hasattr(comp, 'numpy') else comp.ravel()
        for comp in sample[:-1]
    ])


def save_data(class_tag, train_data_set, save,
          train_file=None, seed=None,
          test_list_file='/content/drive/MyDrive/DeepPPISP-master/data_cache/testing_list.pkl',
          max_negatives=32000,
          test_out_file="test_data.pickle",
          train_out_file="train_data.pickle"):
    """Flatten dataset items into feature vectors and pickle a test and a training set.

    The test set holds one ``(features, label)`` pair per index in
    ``test_list_file``. The training set is built from the indices in
    ``train_file`` visited in random order: at most ``max_negatives`` samples with
    label 0 are kept, and every sample with label 1 is stored twice (simple
    over-sampling of the positive class).

    Args:
        class_tag: unused; kept for signature compatibility.
        train_data_set: indexable dataset whose items end with the label.
        save: unused; kept for signature compatibility.
        train_file: pickle path of the training index list.
        seed: optional seed; makes the visiting order (and therefore which
            negatives are kept) reproducible. Also seeds torch, as before.
        test_list_file: pickle path of the test index list.
        max_negatives: cap on the number of label-0 training samples.
        test_out_file, train_out_file: output pickle paths.
    """
    if seed is not None:
        torch.manual_seed(seed)
        rng = np.random.RandomState(seed)
    else:
        # Fall back to the global NumPy RNG so a notebook-level seed still applies.
        rng = np.random

    with open(train_file,"rb") as fp:
        train_list = pickle.load(fp)
    with open(test_list_file,"rb") as fp:
        test_indices = pickle.load(fp)

    # ---- test split -------------------------------------------------------
    test_features = []
    test_labels = []
    for i in test_indices:
        sample = train_data_set[i]  # fetch once: building an item is expensive
        test_features.append(_flatten_features(sample))
        # Labels go to their own list (they used to be appended to test_features).
        test_labels.append(sample[-1])

    with open(test_out_file, 'wb') as f:
        pickle.dump({'test_features': test_features,
                     'test_labels': test_labels}, f)
    print("Data saved to", test_out_file)

    # ---- training split ---------------------------------------------------
    train_features = []
    train_labels = []
    negatives_kept = 0
    for i in rng.permutation(train_list):
        sample = train_data_set[i]
        label = sample[-1]
        if label == 0:
            if negatives_kept < max_negatives:
                negatives_kept += 1
                train_features.append(_flatten_features(sample))
                train_labels.append(0)
        elif label == 1:
            flat = _flatten_features(sample)
            train_features.extend((flat, flat))
            train_labels.extend((1, 1))

    with open(train_out_file, 'wb') as f:
        pickle.dump({'train_features': train_features,
                     'train_labels': train_labels}, f)
    print("Data saved to", train_out_file)


def save_data_demo(train_data,save=None, train_num = 1,
    ratio=None,window_size=3,splite_rate = 0.1, efficient=True,
              epochs=10, seed=None,pretrained_result=None):
    """Build the combined dataset for ``train_data`` and export it with ``save_data``.

    Only ``train_data`` (iterable of dataset keys), ``save`` and ``window_size``
    are used; the remaining parameters are accepted but ignored here.
    """
    def per_dataset(template):
        # One cache file per dataset key, in the order given by the caller.
        return [template.format(key) for key in train_data]

    train_sequences_file = per_dataset('/content/drive/MyDrive/DeepPPISP-master/data_cache/{0}_sequence_data.pkl')
    train_dssp_file = per_dataset('/content/drive/MyDrive/DeepPPISP-master/data_cache/{0}_dssp_data.pkl')
    train_pssm_file = per_dataset('/content/drive/MyDrive/DeepPPISP-master/data_cache/{0}_pssm_data.pkl')
    train_label_file = per_dataset('/content/drive/MyDrive/DeepPPISP-master/data_cache/{0}_label.pkl')
    all_list_file = '/content/drive/MyDrive/DeepPPISP-master/data_cache/all_dset_list.pkl'
    train_list_file = '/content/drive/MyDrive/DeepPPISP-master/data_cache/training_list.pkl'

    # The dataset indexes every residue of all datasets; the list files select subsets.
    train_dataSet = dataSet(
        window_size,
        train_sequences_file,
        train_pssm_file,
        train_dssp_file,
        train_label_file,
        all_list_file,
    )

    save_data(train_data, train_data_set=train_dataSet, save=save,
          train_file=train_list_file)
    print('Done!')

# Driver: export the flattened train/test pickles for the three benchmark datasets.
if __name__ == '__main__':

    ratio_list = (2,1)  # global:local ratio; forwarded to save_data_demo, which ignores it
    path_dir = "./checkpoints/deep_ppi_saved_models"
    # Dataset keys; each is expanded into "<key>_sequence_data.pkl" etc. by save_data_demo.
    train_data = ["dset186","dset164","dset72"]
    if not os.path.exists(path_dir):
        os.makedirs(path_dir)

    # Single run; `ii` is passed as save_data_demo's train_num argument.
    for ii in range(1):
        save_data_demo(train_data,path_dir,ii,ratio_list)

/content/drive/MyDrive/DeepPPISP-master/data_cache/testing_list.pkl
Started
Data saved to train_data.pickle


ValueError: Please reshape the input data into 2-dimensional matrix.

In [None]:
import xgboost as xgb
import pickle
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, matthews_corrcoef, confusion_matrix,roc_auc_score, average_precision_score
from imblearn.over_sampling import SMOTE,SMOTENC,BorderlineSMOTE,ADASYN

# Load the data array
# Load the flattened training set written by save_data.
data_file = 'train_data.pickle'
with open(data_file, 'rb') as fp:
    train_data = pickle.load(fp)


train_features = train_data["train_features"]
train_labels = train_data["train_labels"]

# Layout of one flattened feature vector (concatenation order used by save_data):
#   [ seq one-hot 500x20 | pssm 500x20 | dssp 500x9 | local window 7 x (20 + 20 + 9) ]
# The global part is stored block-wise, NOT as 507 interleaved rows of 49, so
# the categorical columns have to be derived per block.
max_len, seq_dim, pssm_dim, dssp_dim, window = 500, 20, 20, 9, 3
residue_width = seq_dim + pssm_dim + dssp_dim
seq_end = max_len * seq_dim
dssp_start = seq_end + max_len * pssm_dim
local_start = dssp_start + max_len * dssp_dim
expected_width = local_start + (2 * window + 1) * residue_width

if len(train_features) > 0 and len(train_features[0]) != expected_width:
    raise ValueError(
        "unexpected feature width {0}, expected {1}".format(
            len(train_features[0]), expected_width))

# One-hot sequence and DSSP columns are treated as categorical (as in the
# original intent); PSSM columns stay continuous.
category_indices = list(range(0, seq_end))
category_indices.extend(range(dssp_start, local_start))
for position in range(2 * window + 1):
    base = local_start + position * residue_width
    category_indices.extend(range(base, base + seq_dim))
    category_indices.extend(range(base + seq_dim + pssm_dim, base + residue_width))

smotenc = SMOTENC(sampling_strategy='auto', categorical_features=category_indices, random_state=42)
features_resampled, labels_resampled = smotenc.fit_resample(train_features,train_labels)


In [None]:
# Persist the SMOTENC-resampled training set under the same keys used by train_data.pickle.
file_path = "smote_data.pickle"
data_dict={"train_features":features_resampled,
           "train_labels":labels_resampled}
# Dump the data to a pickle file
with open(file_path, 'wb') as f:
    pickle.dump(data_dict, f)

In [None]:
#-*- encoding:utf8 -*-

import os
import time


import pickle
import numpy as np
import torch
from torch.optim import lr_scheduler
from torch.nn.init import xavier_normal,xavier_normal_
from torch import nn
import torch.utils.data.sampler as sampler
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score
from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification
from torch.utils.data import DataLoader, TensorDataset
import gc


# Rebinds the notebook-wide `configs` to a fresh DefaultConfig instance.
configs = DefaultConfig()
# Module-level decision threshold: read by train_epoch and updated by train()
# whenever a new best F-score is found.
THREADHOLD = 0.2

class AverageMeter(object):
    """Keeps the latest value together with a running weighted sum, count and mean."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero all statistics."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` observed ``n`` times and refresh the running mean."""
        self.val = val
        self.sum = self.sum + val * n
        self.count = self.count + n
        self.avg = self.sum / self.count

def weight_init(m):
    """Xavier-normal initialise the weights of Conv2d and Linear modules (for use with ``Module.apply``).

    Biases and all other module types are left untouched.
    """
    if isinstance(m, (nn.Conv2d, nn.Linear)):
        xavier_normal_(m.weight.data)


def train_epoch(model, loader, optimizer, epoch, all_epochs, print_freq=100):
    """Train ``model`` for one pass over ``loader``.

    Args:
        model: called as ``model(seq, dssp, pssm, local)``; must return
            probabilities (the loss is binary cross-entropy on raw outputs).
        loader: yields ``(seq, pssm, dssp, local, label)`` batches.
        optimizer: optimizer stepping the model parameters.
        epoch, all_epochs: zero-based epoch index and total, used for logging.
        print_freq: log every ``print_freq`` batches.

    Returns:
        ``(average batch time, average loss)`` for the epoch.
    """
    batch_time = AverageMeter()
    losses = AverageMeter()

    # Model on train mode
    model.train()

    end = time.time()
    for batch_idx, (seq_data, pssm_data, dssp_data, local_data, label) in enumerate(loader):
        # Inputs are plain tensors; no Variable wrapper is needed.
        seq_var = seq_data.float()
        pssm_var = pssm_data.float()
        dssp_var = dssp_data.float()
        local_var = local_data.float()
        target_var = label.float()

        # compute output (note the model's argument order: dssp before pssm)
        output = model(seq_var, dssp_var, pssm_var, local_var).view(-1)
        loss = torch.nn.functional.binary_cross_entropy(output, target_var)

        # record loss
        losses.update(loss.item(), label.size(0))

        # compute gradient and do the optimizer step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        # print stats
        if batch_idx % print_freq == 0:
            # Batch-level micro scores at the current global threshold; only
            # needed for the log line, so computed here.
            pred_out = output.ge(THREADHOLD)
            MiP, MiR, MiF, PNum, RNum = micro_score(pred_out.detach().cpu().numpy(),
                                                    target_var.detach().cpu().numpy())
            res = '\t'.join([
                'Epoch: [%d/%d]' % (epoch + 1, all_epochs),
                'Iter: [%d/%d]' % (batch_idx + 1, len(loader)),
                'Time %.3f (%.3f)' % (batch_time.val, batch_time.avg),
                'Loss %.4f (%.4f)' % (losses.val, losses.avg),
                # Labels now name the values actually printed (they used to
                # read f_max / p_max / r_max / t_max).
                'MiP:%.6f' % (MiP),
                'MiR:%.6f' % (MiR),
                'MiF:%.6f' % (MiF),
                'PNum:%.2f' % (PNum)])
            print(res)

    return batch_time.avg, losses.avg


def eval_epoch(model, loader, print_freq=10, is_test=True):
    """Evaluate ``model`` on ``loader`` and compute threshold-swept metrics.

    Args:
        model: called as ``model(seq, dssp, pssm, local)``; must return probabilities.
        loader: yields ``(seq, pssm, dssp, local, label)`` batches.
        print_freq: log every ``print_freq`` batches.
        is_test: only selects the log prefix ('Test' or 'Valid').

    Returns:
        ``(avg batch time, avg loss, accuracy, f_max, p_max, r_max, auc, aupr, t_max, mcc)``
        where accuracy and mcc are computed at the best-F threshold ``t_max``.
    """
    batch_time = AverageMeter()
    losses = AverageMeter()

    # Model on eval mode
    model.eval()

    all_trues = []
    all_preds = []
    end = time.time()
    for batch_idx, (seq_data, pssm_data, dssp_data, local_data, label) in enumerate(loader):

        # The whole forward pass runs without autograd: nothing is back-propagated
        # during evaluation, so building graphs only wastes memory.
        with torch.no_grad():
            output = model(seq_data.float(), dssp_data.float(),
                           pssm_data.float(), local_data.float()).view(-1)
            loss = torch.nn.functional.binary_cross_entropy(output, label.float())

        # record loss
        losses.update(loss.item(), label.size(0))

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        # print stats
        if batch_idx % print_freq == 0:
            res = '\t'.join([
                'Test' if is_test else 'Valid',
                'Iter: [%d/%d]' % (batch_idx + 1, len(loader)),
                'Time %.3f (%.3f)' % (batch_time.val, batch_time.avg),
                'Loss %.4f (%.4f)' % (losses.val, losses.avg),
            ])
            print(res)
        all_trues.append(label.numpy())
        all_preds.append(output.cpu().numpy())

    all_trues = np.concatenate(all_trues, axis=0)
    all_preds = np.concatenate(all_preds, axis=0)

    roc_auc = compute_roc(all_preds, all_trues)
    aupr = compute_aupr(all_preds, all_trues)
    f_max, p_max, r_max, t_max, predictions_max = compute_performance(all_preds,all_trues)
    if predictions_max is None:
        # No threshold produced a true positive; fall back to the current global
        # threshold so accuracy/MCC stay defined instead of crashing on None.
        predictions_max = (all_preds > THREADHOLD).astype(np.int32)
    acc_val = acc_score(predictions_max,all_trues)
    mcc = compute_mcc(predictions_max, all_trues)
    return batch_time.avg, losses.avg, acc_val, f_max, p_max, r_max, roc_auc, aupr,t_max, mcc


def train(class_tag,model, train_data_set, save, n_epochs=3,
          batch_size=64, lr=0.001, wd=0.0001, momentum=0.9, seed=None, num=1,
          train_file=None):
    """Train ``model`` on the SMOTE-resampled data and evaluate after every epoch.

    Training samples are rebuilt from ``smote_data.pickle`` (flattened vectors are
    split back into seq / pssm / dssp / local parts). After each epoch the model
    is evaluated on the residues listed in ``testing_list.pkl``; the best-F
    checkpoint is written to ``<save>/DeepPPI_model.dat`` and one row per epoch
    to ``<save>/DeepPPI_results.csv``. Training stops after 5 consecutive epochs
    without an F-score improvement.

    Args:
        class_tag: dataset key or iterable of keys (e.g. ``["dset186", ...]``)
            whose cache files back the evaluation dataset.
        model: network to optimise.
        save: output directory for the CSV log and checkpoint.
        n_epochs: maximum number of epochs.
        lr: Adam learning rate.
        seed: optional torch seed.
        train_data_set, batch_size, wd, momentum, num, train_file: accepted for
            signature compatibility but not used (the batch size always comes
            from ``configs.batch_size``).

    NOTE(review): model selection, early stopping and the decision threshold are
    all driven by the *testing* list, so the reported metrics are optimistic; no
    held-out validation split is used.
    """
    global THREADHOLD

    # The dataset keys arrive through `class_tag`. Previously a notebook global
    # named `train_data` was read here, which other cells rebind to a dict.
    dataset_keys = [class_tag] if isinstance(class_tag, str) else list(class_tag)

    if seed is not None:
        torch.manual_seed(seed)

    # ---- training samples from the SMOTE pickle ---------------------------
    with open("smote_data.pickle","rb") as fp:
        train_data_smote = pickle.load(fp)
    features_resampled = train_data_smote["train_features"]
    labels_resampled = train_data_smote["train_labels"]

    window_size = 3
    max_len = configs.max_sequence_length
    seq_dim, pssm_dim, dssp_dim = configs.seq_dim, configs.pssm_dim, configs.dssp_dim
    local_dim = (2 * window_size + 1) * (seq_dim + pssm_dim + dssp_dim)
    # Offsets of the concatenated blocks: seq | pssm | dssp | local.
    seq_end = max_len * seq_dim
    pssm_end = seq_end + max_len * pssm_dim
    dssp_end = pssm_end + max_len * dssp_dim

    smote_features = []
    for flat, label in zip(features_resampled, labels_resampled):
        flat = np.asarray(flat)
        smote_features.append((
            flat[:seq_end].reshape(1, max_len, seq_dim),
            flat[seq_end:pssm_end].reshape(1, max_len, pssm_dim),
            flat[pssm_end:dssp_end].reshape(1, max_len, dssp_dim),
            flat[dssp_end:].reshape(local_dim),
            label,
        ))

    # ---- evaluation dataset ----------------------------------------------
    test_sequences_file = ['/content/drive/MyDrive/DeepPPISP-master/data_cache/{0}_sequence_data.pkl'.format(key) for key in dataset_keys]
    test_dssp_file = ['/content/drive/MyDrive/DeepPPISP-master/data_cache/{0}_dssp_data.pkl'.format(key) for key in dataset_keys]
    test_pssm_file = ['/content/drive/MyDrive/DeepPPISP-master/data_cache/{0}_pssm_data.pkl'.format(key) for key in dataset_keys]
    test_label_file = ['/content/drive/MyDrive/DeepPPISP-master/data_cache/{0}_label.pkl'.format(key) for key in dataset_keys]
    all_list_file = '/content/drive/MyDrive/DeepPPISP-master/data_cache/all_dset_list.pkl'
    test_list_file = '/content/drive/MyDrive/DeepPPISP-master/data_cache/testing_list.pkl'

    batch_size = configs.batch_size

    print(test_list_file)

    test_dataSet = dataSet(window_size, test_sequences_file, test_pssm_file, test_dssp_file, test_label_file,
                                             all_list_file)

    with open(test_list_file,"rb") as fp:
        test_list = pickle.load(fp)

    print(len(test_list))
    test_samples = sampler.SubsetRandomSampler(test_list)
    # SubsetRandomSampler reshuffles on every epoch, so no pre-shuffle is needed.
    train_samples = sampler.SubsetRandomSampler(list(range(len(smote_features))))

    train_loader = torch.utils.data.DataLoader(smote_features, batch_size=batch_size,
                                               sampler=train_samples,num_workers=5, drop_last=False)
    valid_loader = torch.utils.data.DataLoader(test_dataSet, batch_size=batch_size,
                                               sampler=test_samples,num_workers=5, drop_last=False)

    # Optimizer (honours the `lr` argument; its default matches the old constant)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # Start log
    with open(os.path.join(save, 'DeepPPI_results.csv'), 'w') as f:
        f.write('epoch,loss,acc,F_value, precision,recall,auc,aupr,mcc,threadhold\n')

        best_F = 0
        stale_epochs = 0
        for epoch in range(n_epochs):
            _, train_loss = train_epoch(
                model=model,
                loader=train_loader,
                optimizer=optimizer,
                epoch=epoch,
                all_epochs=n_epochs,
            )
            _, valid_loss, acc, f_max, p_max, r_max, roc_auc, aupr,t_max,mcc= eval_epoch(
                model=model,
                loader=valid_loader,
                is_test=(not valid_loader)
            )

            print(
            'epoch:%03d,valid_loss:%0.5f\nacc:%0.6f,F_value:%0.6f, precision:%0.6f,recall:%0.6f,auc:%0.6f,aupr:%0.6f,mcc:%0.6f,threadhold:%0.6f\n' % ((epoch + 1), valid_loss, acc, f_max, p_max, r_max,roc_auc, aupr,mcc,t_max))

            # Log first so the epoch that triggers early stopping is recorded too.
            f.write('%03d,%0.6f,%0.6f,%0.6f,%0.6f,%0.6f,%0.6f,%0.6f,%0.6f,%0.6f\n' % ((epoch + 1), valid_loss, acc, f_max, p_max, r_max, roc_auc, aupr,mcc,t_max))

            if f_max > best_F:
                stale_epochs = 0
                best_F = f_max
                THREADHOLD = t_max
                print("new best F_value:{0}(threadhold:{1})".format(f_max, THREADHOLD))
                torch.save(model.state_dict(), os.path.join(save, 'DeepPPI_model.dat'))
            else:
                stale_epochs += 1
                if stale_epochs >= 5:
                    break



def demo(train_data,save=None, train_num = 1,
    ratio=None,window_size=3,splite_rate = 0.1, efficient=True,
              epochs=10, seed=None,pretrained_result=None):
    """Build the dataset and a freshly initialised DeepPPI model, then run ``train``.

    Args:
        train_data: iterable of dataset keys (expanded into cache file names).
        save: output directory handed to ``train``.
        train_num: forwarded to ``train`` as ``num``.
        ratio: ``(global, local)`` pair forwarded to ``DeepPPI``.
        window_size: half-width of the local window.
        epochs, seed: forwarded to ``train``.
        splite_rate, efficient, pretrained_result: accepted but unused.
    """
    def per_dataset(template):
        # One cache file per dataset key, in caller order.
        return [template.format(key) for key in train_data]

    train_sequences_file = per_dataset('/content/drive/MyDrive/DeepPPISP-master/data_cache/{0}_sequence_data.pkl')
    train_dssp_file = per_dataset('/content/drive/MyDrive/DeepPPISP-master/data_cache/{0}_dssp_data.pkl')
    train_pssm_file = per_dataset('/content/drive/MyDrive/DeepPPISP-master/data_cache/{0}_pssm_data.pkl')
    train_label_file = per_dataset('/content/drive/MyDrive/DeepPPISP-master/data_cache/{0}_label.pkl')
    all_list_file = '/content/drive/MyDrive/DeepPPISP-master/data_cache/all_dset_list.pkl'
    train_list_file = '/content/drive/MyDrive/DeepPPISP-master/data_cache/training_list.pkl'

    batch_size = configs.batch_size

    train_dataSet = dataSet(
        window_size,
        train_sequences_file,
        train_pssm_file,
        train_dssp_file,
        train_label_file,
        all_list_file,
    )

    # Single-output model (one sigmoid unit), Xavier-initialised.
    model = DeepPPI(1, window_size, ratio)
    model.apply(weight_init)

    train(train_data, model=model, train_data_set=train_dataSet, save=save,
          n_epochs=epochs, batch_size=batch_size, seed=seed, num=train_num,
          train_file=train_list_file)
    print('Done!')

# Driver: train DeepPPI once on the three benchmark datasets.
if __name__ == '__main__':

    ratio_list = (2,1)  # global:local feature ratio handed to DeepPPI via demo()
    path_dir = "./checkpoints/deep_ppi_saved_models"
    # Dataset keys; also read as a notebook global inside train().
    train_data = ["dset186","dset164","dset72"]
    if not os.path.exists(path_dir):
        os.makedirs(path_dir)

    # Single run; `ii` is passed as demo's train_num argument.
    for ii in range(1):
        demo(train_data,path_dir,ii,ratio_list)

/content/drive/MyDrive/DeepPPISP-master/data_cache/testing_list.pkl
11791
Epoch: [1/10]	Iter: [1/2045]	Time 9.403 (9.403)	Loss 0.7997 (0.7997)	f_max:0.666667	p_max:0.666667	r_max:0.666667	t_max:0.47
Epoch: [1/10]	Iter: [101/2045]	Time 0.917 (1.036)	Loss 0.6934 (0.8831)	f_max:0.448276	p_max:1.000000	r_max:0.619048	t_max:0.91
Epoch: [1/10]	Iter: [201/2045]	Time 0.876 (0.980)	Loss 0.6695 (0.7771)	f_max:0.437500	p_max:1.000000	r_max:0.608696	t_max:1.00
Epoch: [1/10]	Iter: [301/2045]	Time 0.879 (0.965)	Loss 0.6644 (0.7386)	f_max:0.500000	p_max:1.000000	r_max:0.666667	t_max:1.00
Epoch: [1/10]	Iter: [401/2045]	Time 0.924 (0.956)	Loss 0.5739 (0.7122)	f_max:0.629630	p_max:1.000000	r_max:0.772727	t_max:0.84
Epoch: [1/10]	Iter: [501/2045]	Time 0.882 (0.950)	Loss 0.6332 (0.6947)	f_max:0.451613	p_max:1.000000	r_max:0.622222	t_max:0.97
Epoch: [1/10]	Iter: [601/2045]	Time 0.923 (0.948)	Loss 0.6659 (0.6811)	f_max:0.548387	p_max:0.944444	r_max:0.693878	t_max:0.97
Epoch: [1/10]	Iter: [701/2045]	Time 1.1

KeyboardInterrupt: 

In [None]:
#XG BOOST
import xgboost as xgb
import pickle
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, matthews_corrcoef, confusion_matrix,roc_auc_score, average_precision_score
from imblearn.over_sampling import SMOTE,SMOTENC,BorderlineSMOTE,ADASYN

# Load the data array
from sklearn.metrics import f1_score

# Load the SMOTE-resampled training set
data_file = 'smote_data.pickle'
with open(data_file, 'rb') as fp:
    train_data = pickle.load(fp)

# Load the held-out test set
with open('test_data.pickle', 'rb') as fp:
    test_data = pickle.load(fp)

train_features = train_data["train_features"]
train_labels = train_data["train_labels"]

# Extract features and labels for the test set as proper arrays
test_features = np.asarray(test_data["test_features"])
test_labels = np.asarray(test_data["test_labels"]).ravel()
if test_features.ndim != 2 or len(test_features) != len(test_labels):
    raise ValueError(
        "test_data.pickle is malformed: expected a 2-D feature matrix with one "
        "label per row; regenerate it with save_data")

# Convert data to xgboost DMatrix (separate names keep the raw arrays available)
dtrain = xgb.DMatrix(train_features, label=train_labels)
dtest = xgb.DMatrix(test_features, label=test_labels)

# Set XGBoost parameters
params = {
    'objective': 'binary:logistic',  # for binary classification
    'max_depth': 9,
    'learning_rate': 0.1,
    'min_child_weight':5,
    'eval_metric': 'logloss'
}

num_round = 80  # Number of boosting rounds

# Train the XGBoost model
XGBModel = xgb.train(params, dtrain, num_round)

# Make predictions on the test set (probabilities), then binarise at 0.5
y_pred = XGBModel.predict(dtest)
y_pred_binary = (y_pred > 0.5).astype(int)

# Evaluate the model
accuracy = accuracy_score(test_labels, y_pred_binary)
precision = precision_score(test_labels, y_pred_binary)
recall = recall_score(test_labels, y_pred_binary)
mcc = matthews_corrcoef(test_labels, y_pred_binary)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"MCC: {mcc}")

# Print confusion matrix
conf_matrix = confusion_matrix(test_labels, y_pred_binary)
print("Confusion Matrix:")
print(conf_matrix)

# Threshold-free metrics on the predicted probabilities. Stored under
# `roc_auc` so the global `auc` function used by compute_roc / compute_aupr
# is not overwritten.
roc_auc = roc_auc_score(test_labels, y_pred)
aupr = average_precision_score(test_labels, y_pred)
print(f"AUC: {roc_auc}")
print(f"AUPR: {aupr}")

# Calculate F1-score
f1 = f1_score(test_labels, y_pred_binary)

print(f"F1-score: {f1}")



Accuracy: 0.7355610211178017
Precision: 0.3417874396135266
Recall: 0.36406518010291594
MCC: 0.1867780145723312
Confusion Matrix:
[[7824 1635]
 [1483  849]]
F1-score: 0.35257475083056483
