In [1]:
import os
import pandas as pd
import iFeatureOmegaCLI
import torch
import numpy as np
from torch_geometric.data import Data
import warnings
warnings.filterwarnings("ignore")
import pickle
from tqdm import tqdm
import multiprocessing
import logging
from torch_geometric.utils import add_self_loops
import contextlib
import io
import esm

# Load the pretrained ESM-1b model together with its alphabet.
model, alphabet = esm.pretrained.esm1b_t33_650M_UR50S()
# Switch to evaluation mode so that train-only layers (e.g. dropout) are
# disabled and the extracted embeddings are deterministic.
model.eval()
# Callable that turns a list of (label, sequence) pairs into model inputs.
batch_converter = alphabet.get_batch_converter()

In [2]:
# Input / output locations for the protein processed in this notebook.
# Directory and file name are passed to os.path.join as separate components.
cm_path = os.path.join(
  "/home/bli/homology/colabfold_cm_371/",
  "AF-P24482-F1-model_v4.cm",
)
fasta_path = os.path.join(
  "/home/bli/homology/colabfold_fasta_371/",
  "AF-P24482-F1-model_v4.fasta",
)
# NOTE(review): the pickle file name repeats "-model_v4" twice, unlike the
# .cm and .fasta names above. It is kept unchanged here; confirm whether the
# duplication is intentional.
pkl_path = os.path.join(
  "/home/bli/homology/colabfold_pkl_371/",
  "AF-P24482-F1-model_v4-model_v4.pkl",
)

# ESM features: tokenise a single (label, sequence) pair and run the model on it.
esm_label = "AF-P24482-F1"
esm_sequence = "MFGSGNVLPVKIQPPLLRPLAYRVLSRKYGLSIKSDGLSALAEFVGTNIGANWRQGPATIKFLEQFAAVWKQQERGLFIDQSGVKEVIQEMKEREKVEWSHEHPIQHEENILGRTDDDENNSDDEMPIAADSSLQNVSLSSPMRQPTERDEYKQPFKPESSKALDWRDYFKVINASQQQRFSYNPHKMQFIFVPNKKQNGLGGIAGFLPDIEDKVQMFLTRYYLTNDRVMRNENFQNSDMFNPLSSMVSLQNELSNTNRQQQSSSMSITPIKNLLGRDAQNFLLLGLLNKNFKGNWSLEDPSGSVEIDISQTIPTQGHYYVPGCMVLVEGIYYSVGNKFHVTSMTLPPGERREITLETIGNLDLLGIHGISNNNFIARLDKDLKIRLHLLEKELTDHKFVILGANLFLDDLKIMTALSKILQKLNDDPPTLLIWQGSFTSVPVFASMSSRNISSSTQFKNNFDALATLLSRFDNLTENTTMIFIPGPNDLWGSMVSLGASGTLPQDPIPSAFTKKINKVCKNVVWSSNPTRIAYLSQEIVIFRDDLSGRFKRHRLEFPFNESEDVYTENDNMMSKDTDIVPIDELVKEPDQLPQKVQETRKLVKTILDQGHLSPFLDSLRPISWDLDHTLTLCPIPSTMVLCDTTSAQFDLTYNGCKVINPGSFIHNRRARYMEYVPSSKKTIQEEIYI"
batch_labels, batch_strs, batch_tokens = batch_converter([(esm_label, esm_sequence)])

# Forward pass with autograd disabled; layer-33 representations and contact
# predictions are requested from the model.
with torch.no_grad():
  results = model(batch_tokens, repr_layers=[33], return_contacts=True)

In [3]:
# Display the resolved FASTA path as a quick sanity check.
fasta_path

'/home/bli/homology/colabfold_fasta_371/AF-P24482-F1-model_v4.fasta'

In [4]:
# Build an iFeatureOmega protein object from the FASTA file.
protein = iFeatureOmegaCLI.iProtein(fasta_path)

# In-memory text buffer used as a sink for stdout.
null_file = io.StringIO()

# Anything import_parameters prints to stdout is redirected into the buffer.
with contextlib.redirect_stdout(null_file):
  protein.import_parameters('/home/bli/homology/feature_extract/Protein_parameters_setting.json')

# Compute the BLOSUM62 descriptor; the result is read from protein.encodings below.
protein.get_descriptor("BLOSUM62")

# Reshape the encoding values to 20 columns per row, then convert to a float tensor.
blosum_matrix = protein.encodings.values.reshape(-1, 20)
node_feature = torch.from_numpy(blosum_matrix).float()


In [5]:
# Take the layer-33 output for the first batch entry, drop the first and last
# token positions, and lay the result out as rows of 1280 values.
# NOTE(review): the dropped positions are presumably the special start/end
# tokens added by the batch converter — confirm.
layer33_output = results['representations'][33]
node_feature1 = layer33_output[0, 1:-1].reshape(-1, 1280)

In [6]:
# Display the ESM feature tensor for visual inspection.
node_feature1

tensor([[ 0.0113,  0.0082,  0.1367,  ..., -0.1121, -0.0857, -0.2022],
        [ 0.1863, -0.0565, -0.0434,  ...,  0.0451, -0.2975,  0.1975],
        [ 0.3125,  0.0436, -0.0959,  ...,  0.0688, -0.1689,  0.0623],
        ...,
        [-0.1508,  0.3432, -0.1703,  ..., -0.0777, -0.3324,  0.0533],
        [ 0.0189,  0.3239, -0.3733,  ..., -0.0380, -0.3489, -0.4201],
        [-0.0009,  0.2701, -0.3858,  ..., -0.1513, -0.5010, -0.1553]])

In [7]:
# Join the BLOSUM62 features and the ESM features along the column axis
# (dim 1), so each row carries both feature sets.
node_features = torch.cat([node_feature, node_feature1], dim=1)

In [8]:
# Display the shape of the combined feature tensor.
node_features.shape

torch.Size([689, 1300])

In [9]:
# Read the raw contact-map text; parsing happens after the file is closed.
with open(cm_path, 'r') as f:
  content = f.read()

# Parse every non-blank line as a comma-separated row of numbers. Lines that
# hold only whitespace (e.g. a stray "\r" or trailing spaces) are skipped
# instead of crashing float().
edge_rows = [
  [float(field) for field in line.split(',')]
  for line in content.splitlines()
  if line.strip()
]

# An empty file yields an explicit (0, 2) array so that the transpose below
# still produces a (2, 0) edge_index rather than a 1-D tensor.
edge_array = np.array(edge_rows, dtype=np.float64) if edge_rows else np.empty((0, 2))
if edge_array.ndim != 2 or edge_array.shape[1] != 2:
  raise ValueError(
    f"{cm_path}: expected two comma-separated values per line, "
    f"got an array of shape {edge_array.shape}"
  )

# Convert the parsed pairs to an integer (long) tensor.
edges = torch.from_numpy(edge_array).type(torch.LongTensor)

num_nodes = node_features.shape[0]

# Transpose to a (2, num_edges) layout and shift every index down by one.
# NOTE(review): the "- 1" suggests the file stores 1-based indices — confirm.
edge_index = edges.t().contiguous() - 1

# Fail early on indices that do not refer to a row of node_features instead of
# silently pickling an invalid graph.
if edge_index.numel() > 0:
  lowest, highest = int(edge_index.min()), int(edge_index.max())
  if lowest < 0 or highest >= num_nodes:
    raise ValueError(
      f"{cm_path}: edge indices span [{lowest}, {highest}] after the -1 shift, "
      f"but only {num_nodes} nodes are available"
    )

# NOTE(review): the target value 0.05 is hard-coded; its origin is not shown here.
label = torch.tensor(0.05).reshape(1,)

# Assemble the graph object and add one self-loop per node.
data = Data(x=node_features, edge_index=edge_index, y=label)
data.edge_index, _ = add_self_loops(data.edge_index, num_nodes=num_nodes)

# Persist the graph to disk with pickle.
with open(pkl_path, 'wb') as fpkl:
  pickle.dump(data, fpkl)

