In [14]:
# Make the project root the current working directory. The relative paths used
# by later cells (e.g. 'data/train.tsv', 'features_label/...') resolve against it.
# NOTE(review): this is a hard-coded absolute drive-letter path; it must be
# adjusted when the notebook is run on a different machine.
import os
os.chdir('F:/One/ACP')

### ESM feature

In [15]:
def esm_embeddings(peptide_sequence_list, batch_size=None):
    """Embed peptides with ESM-2 (esm2_t6_8M_UR50D) and mean-pool per sequence.

    Parameters
    ----------
    peptide_sequence_list : iterable of (name, sequence) tuples
        Input in the format expected by the ESM batch converter.
    batch_size : int or None, optional
        Number of sequences sent through the model per forward pass.
        ``None`` (default) processes everything in one batch, as before.
        ESM embedding is RAM-hungry, so use a smaller value when the data
        set is large or the sequences are long.

    Returns
    -------
    pandas.DataFrame
        One row per input sequence (in input order) and one column per
        embedding dimension. An empty DataFrame is returned for empty input.

    Raises
    ------
    ValueError
        If ``batch_size`` is given but smaller than 1.
    """
    import torch
    import esm
    import pandas as pd

    pairs = list(peptide_sequence_list)
    if not pairs:
        # Nothing to embed; avoid loading the model and avoid the failure the
        # batch converter would hit on an empty list.
        return pd.DataFrame()
    if batch_size is None:
        batch_size = len(pairs)
    elif batch_size < 1:
        raise ValueError("batch_size must be a positive integer or None")

    # Load the model.
    # NOTICE: if the weights are not cached locally they are downloaded automatically.
    model, alphabet = esm.pretrained.esm2_t6_8M_UR50D()
    batch_converter = alphabet.get_batch_converter()
    model.eval()  # disables dropout for deterministic results

    rows = []
    for start in range(0, len(pairs), batch_size):
        chunk = pairs[start:start + batch_size]
        _, _, batch_tokens = batch_converter(chunk)
        # Number of non-padding tokens per sequence (includes the special tokens).
        batch_lens = (batch_tokens != alphabet.padding_idx).sum(1)

        # Extract per-residue representations (on CPU).
        with torch.no_grad():
            # 'esm2_t6_8M_UR50D' has 6 layers, so layer 6 is the final one.
            # Contact maps are never used here, so they are not requested:
            # computing them only costs extra memory and time.
            results = model(batch_tokens, repr_layers=[6], return_contacts=False)
        token_representations = results["representations"][6]

        # Per-sequence representation = mean over residues.
        # Token 0 is always the beginning-of-sequence token, so the first
        # residue is token 1; the last non-padding token is skipped as well.
        for i, tokens_len in enumerate(batch_lens):
            pooled = token_representations[i, 1:tokens_len - 1].mean(0)
            # .tolist() yields plain Python floats, one per embedding dimension.
            rows.append(pooled.tolist())

    return pd.DataFrame(rows)

#### 加载序列

In [19]:
import numpy as np
import pandas as pd


# Read the training split from a tab-separated file. na_filter=False turns off
# pandas' missing-value detection, so NA-like cell text is kept as-is.
dataset = pd.read_csv('data/train.tsv', na_filter=False, sep='\t')
# Sequences (model input) and the labels used as y for model development.
sequence_list, label = dataset['text_a'], dataset['label']

# Persist the labels as a bare CSV: no index column, no header row.
label.to_csv('features_label/train_label.csv', header=None, index=False)

# Sanity check: both series should report the same number of rows.
print(sequence_list.shape, label.shape, sep='\n')


(1378,)
(1378,)


#### 提取ESM-2特征

In [20]:
# The ESM batch converter takes (name, sequence) pairs; here every peptide is
# simply named after its own sequence string.
peptide_sequence_list = [(seq, seq) for seq in sequence_list]

# Embed all peptides with ESM-2 and store the matrix as a bare CSV
# (no index column, no header row).
embeddings_results = esm_embeddings(peptide_sequence_list)
embeddings_results.to_csv('features_label/train.csv', header=None, index=False)

### ProtT5 feature

In [None]:
# define function
def PortT5_embedding(sequence_list, batch_size=None):
    """Embed protein sequences with ProtT5 and mean-pool to one vector each.

    Uses the 'Rostlab/prot_t5_xl_half_uniref50-enc' encoder. The model runs in
    half precision on GPU and in full (float32) precision on CPU.

    Parameters
    ----------
    sequence_list : iterable of str
        Amino-acid sequences (one-letter codes, no separators).
    batch_size : int or None, optional
        Number of sequences per forward pass. ``None`` (default) sends all
        sequences through the model in a single batch, as before.

    Returns
    -------
    list of torch.Tensor
        One 1-D tensor per input sequence (in input order): the mean of the
        residue embeddings, with padding and special tokens excluded.
        An empty list is returned for empty input.

    Raises
    ------
    ValueError
        If ``batch_size`` is given but smaller than 1.
    """
    from transformers import T5Tokenizer, T5EncoderModel
    import torch
    import re

    sequences = list(sequence_list)
    if not sequences:
        # Nothing to embed; skip loading the (large) model.
        return []
    if batch_size is None:
        batch_size = len(sequences)
    elif batch_size < 1:
        raise ValueError("batch_size must be a positive integer or None")

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # Load the tokenizer
    tokenizer = T5Tokenizer.from_pretrained('Rostlab/prot_t5_xl_half_uniref50-enc', do_lower_case=False)

    # Load the model
    model = T5EncoderModel.from_pretrained("Rostlab/prot_t5_xl_half_uniref50-enc").to(device)

    # Half precision is only used on GPU; on CPU run in full precision
    # (much slower). `device` is a torch.device, so test its `.type` rather
    # than comparing it with a string, and use nn.Module.float() -- there is
    # no `.full()` method.
    if device.type == 'cpu':
        model.float()
    else:
        model.half()

    per_protein_embeddings = []
    for start in range(0, len(sequences), batch_size):
        chunk = sequences[start:start + batch_size]

        # Residue count per sequence; the substitution below is one-to-one,
        # so these lengths stay valid for the tokenized sequences.
        chunk_lengths = [len(sequence) for sequence in chunk]

        # Replace rare/ambiguous amino acids by X and put a space between residues.
        spaced = [" ".join(list(re.sub(r"[UZOB]", "X", sequence))) for sequence in chunk]

        # Tokenize and pad up to the longest sequence in this batch.
        ids = tokenizer(spaced, add_special_tokens=True, padding="longest")
        input_ids = torch.tensor(ids['input_ids']).to(device)
        attention_mask = torch.tensor(ids['attention_mask']).to(device)

        # Generate embeddings
        with torch.no_grad():
            embedding_repr = model(input_ids=input_ids, attention_mask=attention_mask)

        # Keep only the first `length` positions of each row (drops the padded
        # and special tokens), then average them into a per-protein embedding.
        for i, length in enumerate(chunk_lengths):
            residue_embeddings = embedding_repr.last_hidden_state[i, :length]
            per_protein_embeddings.append(residue_embeddings.mean(dim=0))

    return per_protein_embeddings

#### 加载序列数据

In [None]:
import numpy as np
import pandas as pd

# Load the data set from the Excel workbook. na_filter=False turns off pandas'
# missing-value detection (guards against the "NA sequence" problem).
dataset = pd.read_excel('1030_dataset.xlsx', na_filter=False)
sequence_list = dataset['sequences']
# Labels (y) for model development, converted to a NumPy array for the CNN model.
y = np.array(dataset['label'])

# Verification: show what was loaded.
print(sequence_list)
print(y)

#### 提取ProtT5特征

In [None]:
# Embed every sequence with the ProtT5 model, turn each returned tensor into a
# list of plain Python floats, and assemble the rows into a DataFrame
# (one row per sequence).
embeddings_results = pd.DataFrame(
    [vector.tolist() for vector in PortT5_embedding(sequence_list)]
)

# Export the DataFrame as a CSV file (default options: index and header are written).
embeddings_results.to_csv('PortT5/PortT5_dataset.csv')

### 结尾