In [None]:
from Bio import SeqIO
import os
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, AutoTokenizer,BertModel, AutoModel 
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
def getFastaData(fasta_path):
    """Read a FASTA file and return its sequences and integer labels.

    Each record's id is converted with ``int`` and used as the label; each
    record's sequence is rewritten with a single space between consecutive
    characters (e.g. ``"ACD"`` becomes ``"A C D"``).

    Args:
        fasta_path: Path (or handle) passed straight to ``SeqIO.parse``.

    Returns:
        A ``(sequences, labels)`` pair of parallel lists, in file order.

    Raises:
        ValueError: If a record id cannot be parsed as an integer.
    """
    spaced_sequences = []
    numeric_labels = []

    # Single pass over the file: label and spaced sequence are derived per record.
    for entry in SeqIO.parse(fasta_path, "fasta"):
        residues = str(entry.seq)
        numeric_labels.append(int(entry.id))
        spaced_sequences.append(' '.join(residues))

    return spaced_sequences, numeric_labels
    

In [None]:
# Load the training set: space-separated sequences and their integer labels.
# NOTE: the `trian_labels_first` spelling is kept because the cross-validation
# cell further down reads that exact name.
train_features_first, trian_labels_first = getFastaData(r'./data/upTrain.fasta')

In [None]:
from Bio.SeqUtils.ProtParam import ProteinAnalysis

def analyze_protein_sequences(sequence_list):
    """Compute a flat physicochemical feature vector for every sequence.

    Spaces are stripped from each sequence before it is handed to
    ``ProteinAnalysis``. The per-sequence vector is laid out as:

        [molecular weight, isoelectric point,
         *amino-acid composition values,
         *secondary-structure fractions,
         GRAVY hydrophobicity, net charge at pH 7.0]

    Args:
        sequence_list: Iterable of sequence strings (spaces are allowed and
            ignored).

    Returns:
        A list with one feature list per input sequence, in input order.
        NOTE(review): presumably 27 values per sequence (2 + 20 + 3 + 2) with
        Biopython's standard alphabet — confirm against the fitted scaler.
    """
    feature_rows = []

    for spaced in sequence_list:
        analysis = ProteinAnalysis(spaced.replace(" ", ""))

        # Build the row incrementally, in the same order the scaler expects.
        row = [analysis.molecular_weight(), analysis.isoelectric_point()]
        row.extend(analysis.get_amino_acids_percent().values())
        row.extend(analysis.secondary_structure_fraction())
        row.append(analysis.gravy())
        row.append(analysis.charge_at_pH(7.0))

        feature_rows.append(row)

    return feature_rows


In [None]:

class Attention(nn.Module):
    """Scale the input by softmax-normalised scalar scores.

    A single linear layer maps the last dimension of ``x`` to one score; the
    scores are normalised with softmax over dimension 1 and multiplied back
    onto ``x`` (no summation, so the output has the same shape as ``x``).

    NOTE(review): for a 2-D input of shape (batch, input_dim) the score tensor
    is (batch, 1), so the softmax over dim 1 is always 1.0 and this layer
    returns ``x`` unchanged.
    """

    def __init__(self, input_dim):
        super().__init__()
        # Attribute name `W` is part of the state_dict layout; keep it.
        self.W = nn.Linear(input_dim, 1)

    def forward(self, x):
        scores = self.W(x)
        weights = scores.softmax(dim=1)
        return x * weights


class myModel(torch.nn.Module):
    """Binary classifier on top of a transformer encoder plus extra features.

    The encoder's first-token hidden state is concatenated with a
    per-sequence feature vector and passed through a small MLP
    (1051 -> 32 -> 8 -> 1) ending in a sigmoid, so the output is a
    probability suitable for ``nn.BCELoss``.

    Args:
        esm2: Encoder module. It is called as ``esm2(**x)`` and its result
            must expose ``last_hidden_state``.
    """

    def __init__(self,esm2):
        super(myModel,self).__init__()
        self.esm2 = esm2
        # NOTE(review): 1051 is presumably the encoder hidden size (1024) plus
        # 27 handcrafted features — confirm if either side changes.
        self.fc1 = torch.nn.Linear(1051,32)
        self.fc2 = torch.nn.Linear(32,8)
        self.fc3 = torch.nn.Linear(8,1)

        self.att1 = Attention(32)

        self.sigmoid = nn.Sigmoid()
        self.relu = nn.ReLU()
        # Not used in forward(); retained so the module's parameter and
        # state_dict layout stays the same as before.
        self.bn1 = torch.nn.BatchNorm1d(64)
        self.dropout1 = nn.Dropout(p=0.1)
        self.dropout2 = nn.Dropout(p=0.3)

    def forward(self,x,fe):
        """Return per-sample probabilities of shape (batch, 1).

        Args:
            x: Mapping of keyword arguments for the encoder (e.g. a tokenizer
                output); it is unpacked with ``**``.
            fe: Float tensor of extra features, concatenated to the encoder
                embedding along dim 1.
        """
        # Use the batch handed in by the caller. The previous code unpacked a
        # module-level global named `inputs`, which made the model depend on
        # hidden notebook state and ignore its own argument.
        outputs_ = self.esm2(**x)

        # First-token hidden state as the sequence-level embedding.
        hidden = outputs_.last_hidden_state[:, 0, :]
        hidden = torch.cat((hidden, fe), dim=1)
        hidden = self.dropout1(hidden)
        hidden = self.fc1(hidden)
        hidden = self.relu(hidden)
        hidden = self.att1(hidden)
        hidden = self.dropout2(hidden)

        hidden = self.fc2(hidden)
        hidden = self.relu(hidden)
        hidden = self.dropout2(hidden)
        hidden = self.fc3(hidden)
        return self.sigmoid(hidden)
        
        

In [None]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, matthews_corrcoef, recall_score, precision_score
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

def comp_result(y_test, y_pred, y_proba):
    """Compute, print and return six binary-classification metrics.

    Args:
        y_test: Ground-truth labels.
        y_pred: Hard predicted labels (used for every metric except AUC).
        y_proba: Predicted scores/probabilities (used for ROC AUC only).

    Returns:
        Tuple ``(accuracy, f1, auc, mcc, recall, precision)``.
    """
    # All metrics are evaluated before anything is printed.
    named_scores = (
        ("Accuracy", accuracy_score(y_test, y_pred)),
        ("F1 Score", f1_score(y_test, y_pred)),
        ("AUC", roc_auc_score(y_test, y_proba)),
        ("MCC", matthews_corrcoef(y_test, y_pred)),
        ("Recall", recall_score(y_test, y_pred)),
        ("Precision", precision_score(y_test, y_pred)),
    )

    for title, score in named_scores:
        print(f"{title}: {score:.4f}")

    return tuple(score for _, score in named_scores)

In [None]:
from sklearn.model_selection import KFold
import numpy as np
import copy as cp
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import joblib

# ---------------------------------------------------------------------------
# 5-fold cross-validation.
# For every fold a fresh, truncated ProtBert encoder plus the classification
# head is trained for `num_epochs` epochs; after each epoch the held-out fold
# is scored and the best-accuracy snapshot is kept and finally saved to
# ./result/model_base_BBK_<fold>.pth.
# NOTE(review): the epoch is selected on the same held-out fold whose metrics
# are reported, so the printed "best result" is an optimistic estimate.
# ---------------------------------------------------------------------------
kf = KFold(n_splits=5, shuffle=True, random_state=42)
kf_in = 0
feature = np.array(train_features_first)
label = np.array(trian_labels_first)
# NOTE(review): joblib.load unpickles the file — only load a scaler you produced yourself.
loaded_scaler = joblib.load('./scaler_model.pkl')

# --- Fold-invariant setup (hoisted out of the fold loop) --------------------
# Prefer GPU index 2 as before, but fall back to the first GPU when the host
# has fewer than three devices instead of crashing on `.to(device)`.
if torch.cuda.is_available():
    device = torch.device("cuda:2" if torch.cuda.device_count() > 2 else "cuda:0")
else:
    device = torch.device("cpu")

cache_directory = r"/home/hqzhang/neuropeptide prediction/model"

# The tokenizer does not depend on the fold, so load it once.
tokenizer_ = AutoTokenizer.from_pretrained("Rostlab/prot_bert", cache_dir=cache_directory)

# Make sure the checkpoint directory exists up front, so a missing folder is
# not discovered only after a whole fold has been trained.
os.makedirs('./result', exist_ok=True)

for train_index, test_index in kf.split(feature):

    # Per-fold "best epoch" bookkeeping. Starting below any reachable accuracy
    # guarantees the first epoch is always snapshotted, so `save_model` is
    # never None when it is written to disk.
    save_model = None
    best_accuracy = float('-inf')
    best_f1 = 0
    best_auc = 0
    best_mcc = 0
    best_recall = 0
    best_precision = 0
    kf_in += 1

    print('kf',kf_in)
    train_features, test_features = list(feature[train_index]), list(feature[test_index])
    train_labels, test_labels = list(label[train_index]), list(label[test_index])

    # Fresh pretrained encoder for every fold; keep only the first 4 encoder layers.
    esm2 = AutoModel.from_pretrained("Rostlab/prot_bert", cache_dir=cache_directory)
    del esm2.encoder.layer[4:]
    model = myModel(esm2)
    model = model.to(device)

    criterion = nn.BCELoss()
    optimizer = optim.SGD(model.parameters(), lr=0.003)


    batch_size = 4
    data_loader = DataLoader(list(zip(train_features, train_labels)), batch_size=batch_size, shuffle=True)

    test_data_loader = DataLoader(list(zip(test_features, test_labels)), batch_size=batch_size, shuffle=False)

    num_epochs = 30
    for epoch in range(num_epochs):
        # ------------------------------ training ---------------------------
        total_loss = 0
        for batch_input, batch_labels in data_loader:

            # Handcrafted features, scaled with the pre-fitted scaler.
            nol_feature = analyze_protein_sequences(batch_input)
            normalized_nol_feature = loaded_scaler.transform(nol_feature)
            normalized_nol_feature = torch.tensor(normalized_nol_feature).float()
            normalized_nol_feature = normalized_nol_feature.to(device)
            inputs = tokenizer_(batch_input, return_tensors='pt', padding=True, truncation=True)
            inputs = inputs.to(device)

            outputs = model(inputs,normalized_nol_feature)
            outputs = outputs.view(-1)
            # Labels stay on CPU, so the loss is evaluated there.
            loss = criterion(outputs.to('cpu'), batch_labels.to(torch.float32))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        # ----------------------------- evaluation --------------------------
        model.eval()
        test_loss = 0
        correct_predictions = 0


        test_labels_com = []
        predict_labels_com = []
        test_outputs_com = []


        with torch.no_grad():
            for test_batch_input, test_batch_labels in test_data_loader:

                nol_feature = analyze_protein_sequences(test_batch_input)
                normalized_nol_feature = loaded_scaler.transform(nol_feature)
                normalized_nol_feature = torch.tensor(normalized_nol_feature).float()
                normalized_nol_feature = normalized_nol_feature.to(device)

                inputs = tokenizer_(test_batch_input, return_tensors='pt', padding=True, truncation=True)
                inputs = inputs.to(device)

                test_outputs = model(inputs,normalized_nol_feature)
                test_outputs = test_outputs.view(-1)
                test_loss += criterion(test_outputs.to('cpu'), test_batch_labels.to(torch.float32)).item()
                # Threshold the sigmoid output at 0.5 for hard labels.
                predictions = (test_outputs > 0.5).float()
                correct_predictions += (predictions.to('cpu') == test_batch_labels).sum().item()

                predict_labels_com.extend(predictions.tolist())
                test_labels_com.extend(test_batch_labels.tolist())
                test_outputs_com.extend(test_outputs.tolist())

        average_test_loss = test_loss / len(test_data_loader)
        accuracy = correct_predictions / len(test_features)

        predict_labels_com = np.array(predict_labels_com)
        test_labels_com = np.array(test_labels_com)
        test_outputs_com = np.array(test_outputs_com)
        temp_accuracy_test,temp_f1,temp_auc,temp_mcc,temp_recall,temp_precision = comp_result(test_labels_com,predict_labels_com,test_outputs_com)
        # Keep the snapshot with the highest held-out accuracy (first one wins ties).
        if temp_accuracy_test > best_accuracy:
            best_accuracy = temp_accuracy_test
            best_f1 = temp_f1
            best_auc = temp_auc
            best_mcc = temp_mcc
            best_recall = temp_recall
            best_precision = temp_precision
            save_model = cp.deepcopy(model)
        average_train_loss = total_loss / len(data_loader)
        print('Epoch [{}/{}], Train Loss: {:.4f}, Test Loss: {:.4f}, Accuracy: {:.2f}%'.format(
            epoch + 1, num_epochs, average_train_loss, average_test_loss, accuracy * 100))
        # Back to training mode (re-enables dropout) for the next epoch.
        model.train()
    # Persist the best snapshot of this fold (whole pickled module, as before).
    torch.save(save_model, r'./result/model_base_BBK_%d.pth' % kf_in)
    print('=====================================')
    print('kf',kf_in,"best result:")
    print(best_accuracy)
    print(best_f1)
    print(best_auc)
    print(best_mcc)
    print(best_recall)
    print(best_precision)
    print('=====================================')