In [1]:
import esm
from esm import pretrained
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForMaskedLM
import pickle
from Bio import SeqIO
from torch.cuda.amp import autocast
import os
import random

def infer_sequence(model, tokenizer, sequence, device):
    """Embed one sequence as the position-0 vector of the model's last hidden state.

    Args:
        model: Callable that accepts the tokenizer output as keyword arguments
            together with ``output_hidden_states=True`` and returns an object
            exposing ``hidden_states``.
        tokenizer: Callable that converts ``sequence`` into a mapping of tensors.
        sequence: The sequence string to embed.
        device: ``torch.device`` or device string the inputs are moved to.

    Returns:
        numpy.ndarray: ``hidden_states[-1][:, 0, :]`` with a size-1 leading
        axis squeezed away (presumably the CLS/BOS token embedding -- confirm
        against the tokenizer in use).
    """
    # NOTE(review): with a tokenizer that defines no maximum length,
    # truncation=True is a no-op and only triggers a warning, so callers must
    # cap the sequence length themselves before calling this function.
    encoded_inputs = tokenizer(sequence, return_tensors='pt', padding=True, truncation=True)
    encoded_inputs = {k: v.to(device) for k, v in encoded_inputs.items()}

    # torch.cuda.amp.autocast() is deprecated; torch.autocast is its
    # replacement. Mixed precision is only enabled when running on CUDA.
    use_amp = torch.device(device).type == "cuda"
    amp_device_type = "cuda" if use_amp else "cpu"
    with torch.no_grad():
        with torch.autocast(device_type=amp_device_type, enabled=use_amp):
            outputs = model(**encoded_inputs, output_hidden_states=True)

    representations = outputs.hidden_states[-1]
    last_hidden_state = representations[:, 0, :]
    # Release cached allocator blocks between sequences to limit GPU memory use.
    torch.cuda.empty_cache()
    # Drop the leading batch axis: (1, hidden) -> (hidden,).
    last_hidden_state = last_hidden_state.squeeze(0)
    return last_hidden_state.cpu().numpy()

def read_fasta(file_path):
    """Return the sequence of every FASTA record in ``file_path`` as a string.

    Records are parsed with ``SeqIO.parse(file_path, "fasta")`` and returned in
    the order the parser yields them.
    """
    return [str(entry.seq) for entry in SeqIO.parse(file_path, "fasta")]

def main(model_weights_path, model_path, positive_path, negative_path, rdict_path,
         max_length=4000):
    """Embed every sequence from two FASTA files and pickle the results.

    Args:
        model_weights_path: Path of the state dict loaded on top of the
            pretrained model (non-strictly).
        model_path: Directory or identifier passed to ``from_pretrained`` for
            both the tokenizer and the masked-LM model.
        positive_path: FASTA file whose sequences are embedded first.
        negative_path: FASTA file whose sequences are embedded second.
        rdict_path: Destination of the pickled ``{sequence: embedding}`` dict.
        max_length: Sequences longer than this are cut to their first
            ``max_length`` characters before embedding (default 4000).

    Side effects:
        Writes the pickle to ``rdict_path`` and prints progress information,
        including one entry per embedded sequence.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForMaskedLM.from_pretrained(model_path)

    # map_location="cpu" lets a checkpoint saved from a GPU load on CPU-only
    # hosts; weights_only=True restricts unpickling to tensors/containers.
    state_dict = torch.load(model_weights_path, map_location="cpu", weights_only=True)
    load_result = model.load_state_dict(state_dict, strict=False)
    # strict=False silently ignores mismatches, so surface them explicitly.
    if load_result.missing_keys or load_result.unexpected_keys:
        print(
            f"Warning: non-strict weight load left {len(load_result.missing_keys)} "
            f"missing and {len(load_result.unexpected_keys)} unexpected keys"
        )
    model = model.to(device)
    model.eval()

    # Read the sequences from both FASTA files and process them together.
    positive_sequences = read_fasta(positive_path)
    negative_sequences = read_fasta(negative_path)
    sequences = positive_sequences + negative_sequences

    # NOTE(review): keys are the (possibly truncated) sequences, so sequences
    # sharing the same first ``max_length`` characters map to a single entry.
    result_dict = {}
    skipped = 0
    for sequence in sequences:
        if len(sequence) > max_length:
            sequence = sequence[:max_length]
        if sequence in result_dict:
            # Already embedded; recomputing would only overwrite the same key.
            continue
        try:
            result_dict[sequence] = infer_sequence(model, tokenizer, sequence, device)
        except torch.cuda.OutOfMemoryError:
            print(f"Out of memory error for sequence length: {len(sequence)}")
            torch.cuda.empty_cache()
            skipped += 1
            continue

    if skipped:
        print(f"Skipped {skipped} sequence(s) because of out-of-memory errors")

    # Persist the results.
    with open(rdict_path, 'wb') as file:
        pickle.dump(result_dict, file)
    print(f"字典已成功保存到: {rdict_path}")

    for seq, tensor in result_dict.items():
        print(f"Sequence: {seq}")
        print(f"Inferred Tensor: {tensor.shape}")
        print("---")

if __name__ == "__main__":
    # Inputs: fine-tuned weights file and the base model directory.
    model_weights_path = "/public/home/kngll/Mambaphase/data/esm2_t36_3B_UR50D_mlm_finetuned.pth"
    model_path = "/public/home/kngll/llps/data/esm2_t36_3B_UR50D"
    # Inputs: the two FASTA files whose sequences are embedded.
    positive_path = "/public/home/kngll/Mambaphase/data/drllps_client_clstr_Homo_sapiens.fasta"
    negative_path = "/public/home/kngll/Mambaphase/data/drllps_nonllps_clstr_Homo_sapiens.fasta"
    # Output: pickle file receiving the result dictionary.
    rdict_path = "/public/home/kngll/Mambaphase/data/client_result_dict.pkl"
    main(
        model_weights_path=model_weights_path,
        model_path=model_path,
        positive_path=positive_path,
        negative_path=negative_path,
        rdict_path=rdict_path,
    )

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  model.load_state_dict(torch.load(model_weights_path), strict=False)
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
  with autocast():  # 使用混合精度


字典已成功保存到: /public/home/kngll/Mambaphase/data/client_result_dict.pkl
Sequence: MGDQPCASGRSTLPPGNAREAKPPKKRCLLAPRWDYPEGTPNGGSTTLPSAPPPASAGLKSHPPPPEK
Inferred Tensor: (2560,)
---
Sequence: MTAEDSTAAMSSDSAAGSSAKVPEGVAGAPNEAALLALMERTGYSMVQENGQRKYGGPPPGWEGPHPQRGCEVFVGKIPRDVYEDELVPVFEAVGRIYELRLMMDFDGKNRGYAFVMYCHKHEAKRAVRELNNYEIRPGRLLGVCCSVDNCRLFIGGIPKMKKREEILEEIAKVTEGVLDVIVYASAADKMKNRGFAFVEYESHRAAAMARRKLMPGRIQLWGHQIAVDWAEPEIDVDEDVMETVKILYVRNLMIETTEDTIKKSFGQFNPGCVERVKKIRDYAFVHFTSREDAVHAMNNLNGTELEGSCLEVTLAKPVDKEQYSRYQKAARGGGAAEAAQQPSYVYSCDPYTLAYYGYPYNALIGPNRDYFVKAGSIRGRGRGAAGNRAPGPRGSYLGGYSAGRGIYSRYHEGKGKQQEKGYELVPNLEIPTVNPVAIKPGTVAIPAIGAQYSMFPAAPAPKMIEDGKIHTVEHMISPIAVQPDPASAAAAAAAAAAAAAAVIPTVSTPPPFQGRPITPVYTVAPNVQRIPTAGIYGASYVPFAAPATATIATLQKNAAAAAAMYGGYAGYIPQAFPAAAIQVPIPDVYQTY
Inferred Tensor: (2560,)
---
Sequence: MTANRDAALSSHRHPGCAQRPRTPTFASSSQRRSAFGFDDGNFPGLGERSHAPGSRLGARRRAKTARGLRGHRQRGAGAGLSRPGSARAPSPPRPGGPENPGGVLSVELPGLLAQLARSFALLLPVYALGYLGLSFSWVLLALALLAWCRRSRGLKALRLCRALALLEDEERVVRLGVRAC

In [None]:
import esm
from esm import pretrained
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForMaskedLM
import pickle
from Bio import SeqIO
from torch.cuda.amp import autocast
import os
import random

def infer_sequence(model, tokenizer, sequence, device):
    """Embed one sequence as the position-0 vector of the model's last hidden state.

    Args:
        model: Callable that accepts the tokenizer output as keyword arguments
            together with ``output_hidden_states=True`` and returns an object
            exposing ``hidden_states``.
        tokenizer: Callable that converts ``sequence`` into a mapping of tensors.
        sequence: The sequence string to embed.
        device: ``torch.device`` or device string the inputs are moved to.

    Returns:
        numpy.ndarray: ``hidden_states[-1][:, 0, :]`` with a size-1 leading
        axis squeezed away (presumably the CLS/BOS token embedding -- confirm
        against the tokenizer in use).
    """
    # NOTE(review): with a tokenizer that defines no maximum length,
    # truncation=True is a no-op and only triggers a warning, so callers must
    # cap the sequence length themselves before calling this function.
    encoded_inputs = tokenizer(sequence, return_tensors='pt', padding=True, truncation=True)
    encoded_inputs = {k: v.to(device) for k, v in encoded_inputs.items()}

    # torch.cuda.amp.autocast() is deprecated; torch.autocast is its
    # replacement. Mixed precision is only enabled when running on CUDA.
    use_amp = torch.device(device).type == "cuda"
    amp_device_type = "cuda" if use_amp else "cpu"
    with torch.no_grad():
        with torch.autocast(device_type=amp_device_type, enabled=use_amp):
            outputs = model(**encoded_inputs, output_hidden_states=True)

    representations = outputs.hidden_states[-1]
    last_hidden_state = representations[:, 0, :]
    # Release cached allocator blocks between sequences to limit GPU memory use.
    torch.cuda.empty_cache()
    # Drop the leading batch axis: (1, hidden) -> (hidden,).
    last_hidden_state = last_hidden_state.squeeze(0)
    return last_hidden_state.cpu().numpy()

def read_fasta(file_path):
    """Return the sequence of every FASTA record in ``file_path`` as a string.

    Records are parsed with ``SeqIO.parse(file_path, "fasta")`` and returned in
    the order the parser yields them.
    """
    return [str(entry.seq) for entry in SeqIO.parse(file_path, "fasta")]

def main(model_weights_path, model_path, positive_path, negative_path, rdict_path,
         max_length=4000):
    """Embed every sequence from two FASTA files and pickle the results.

    Args:
        model_weights_path: Path of the state dict loaded on top of the
            pretrained model (non-strictly).
        model_path: Directory or identifier passed to ``from_pretrained`` for
            both the tokenizer and the masked-LM model.
        positive_path: FASTA file whose sequences are embedded first.
        negative_path: FASTA file whose sequences are embedded second.
        rdict_path: Destination of the pickled ``{sequence: embedding}`` dict.
        max_length: Sequences longer than this are cut to their first
            ``max_length`` characters before embedding (default 4000).

    Side effects:
        Writes the pickle to ``rdict_path`` and prints progress information,
        including one entry per embedded sequence.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForMaskedLM.from_pretrained(model_path)

    # map_location="cpu" lets a checkpoint saved from a GPU load on CPU-only
    # hosts; weights_only=True restricts unpickling to tensors/containers.
    state_dict = torch.load(model_weights_path, map_location="cpu", weights_only=True)
    load_result = model.load_state_dict(state_dict, strict=False)
    # strict=False silently ignores mismatches, so surface them explicitly.
    if load_result.missing_keys or load_result.unexpected_keys:
        print(
            f"Warning: non-strict weight load left {len(load_result.missing_keys)} "
            f"missing and {len(load_result.unexpected_keys)} unexpected keys"
        )
    model = model.to(device)
    model.eval()

    # Read the sequences from both FASTA files and process them together.
    positive_sequences = read_fasta(positive_path)
    negative_sequences = read_fasta(negative_path)
    sequences = positive_sequences + negative_sequences

    # NOTE(review): keys are the (possibly truncated) sequences, so sequences
    # sharing the same first ``max_length`` characters map to a single entry.
    result_dict = {}
    skipped = 0
    for sequence in sequences:
        if len(sequence) > max_length:
            sequence = sequence[:max_length]
        if sequence in result_dict:
            # Already embedded; recomputing would only overwrite the same key.
            continue
        try:
            result_dict[sequence] = infer_sequence(model, tokenizer, sequence, device)
        except torch.cuda.OutOfMemoryError:
            print(f"Out of memory error for sequence length: {len(sequence)}")
            torch.cuda.empty_cache()
            skipped += 1
            continue

    if skipped:
        print(f"Skipped {skipped} sequence(s) because of out-of-memory errors")

    # Persist the results.
    with open(rdict_path, 'wb') as file:
        pickle.dump(result_dict, file)
    print(f"字典已成功保存到: {rdict_path}")

    for seq, tensor in result_dict.items():
        print(f"Sequence: {seq}")
        print(f"Inferred Tensor: {tensor.shape}")
        print("---")

if __name__ == "__main__":
    # Inputs: fine-tuned weights file and the base model directory.
    model_weights_path = "/public/home/kngll/Mambaphase/data/esm2_t36_3B_UR50D_mlm_finetuned.pth"
    model_path = "/public/home/kngll/llps/data/esm2_t36_3B_UR50D"
    # Inputs: the two FASTA files whose sequences are embedded.
    positive_path = "/public/home/kngll/Mambaphase/data/drllps_client_clstr_Homo_sapiens.fasta"
    negative_path = "/public/home/kngll/Mambaphase/data/drllps_nonllps_clstr_Homo_sapiens.fasta"
    # Output: pickle file receiving the result dictionary.
    rdict_path = "/public/home/kngll/Mambaphase/data/client_result_dict.pkl"
    main(
        model_weights_path=model_weights_path,
        model_path=model_path,
        positive_path=positive_path,
        negative_path=negative_path,
        rdict_path=rdict_path,
    )

In [2]:
# Merge the dictionaries stored in total_rdict.pkl and phscaffoldsalt.pkl
# into a single pickle file.

import pickle

# File paths
total_rdict_path = "/public/home/kngll/Mambaphase/model/total_rdict.pkl"
phscaffoldsalt_path = "/public/home/kngll/Mambaphase/data/phscaffoldsalt.pkl"
output_path = "/public/home/kngll/Mambaphase/model/merged_dict.pkl"

# SECURITY: pickle.load can execute arbitrary code -- only load trusted files.
# Load the first file
with open(total_rdict_path, 'rb') as f:
    total_rdict = pickle.load(f)

# Load the second file
with open(phscaffoldsalt_path, 'rb') as f:
    phscaffoldsalt = pickle.load(f)

# The merge below lets entries from the second dictionary replace entries of
# the first on key collisions, so report collisions instead of assuming that
# there are none.
overlapping_keys = set(total_rdict) & set(phscaffoldsalt)
if overlapping_keys:
    print(
        f"Warning: {len(overlapping_keys)} key(s) exist in both dictionaries; "
        "values from phscaffoldsalt override those from total_rdict"
    )

# Merge the two dictionaries (second one wins on duplicate keys)
merged_dict = {**total_rdict, **phscaffoldsalt}

# Save the merged dictionary to a new file
with open(output_path, 'wb') as f:
    pickle.dump(merged_dict, f)

print("合并完成，结果已保存到:", output_path)

合并完成，结果已保存到: /public/home/kngll/Mambaphase/model/merged_dict.pkl


TypeError: SequenceToVectorModel.__init__() got an unexpected keyword argument 'n_heads'