### 对mf-esm2数据的查看和分析

In [1]:
import pandas as pd
import torch as th
import numpy as np
# import dgl


def get_data(df, features_dict, terms_dict, features_length, features_column):
    """
    Convert a dataframe of protein records into PyTorch feature/label tensors.

    Args:
        df: DataFrame with one protein per row. It must provide a
            ``prop_annotations`` column (iterable of GO ids) and the column
            named by ``features_column``.
        features_dict: Maps a feature id to its column index in the feature
            matrix. Only consulted for the ``'interpros'`` and
            ``'prop_annotations'`` feature types.
        terms_dict: Maps a GO id to its column index in the label matrix.
        features_length: Number of columns of the feature matrix.
        features_column: ``'esm2'``, ``'interpros'``, ``'mf_preds'`` or
            ``'prop_annotations'``. Any other value leaves the feature matrix
            filled with zeros.

    Returns:
        Tuple ``(data, labels)`` of float32 tensors shaped
        ``(len(df), features_length)`` and ``(len(df), len(terms_dict))``.
    """
    data = th.zeros((len(df), features_length), dtype=th.float32)
    labels = th.zeros((len(df), len(terms_dict)), dtype=th.float32)
    for i, row in enumerate(df.itertuples()):
        # Data vector
        if features_column == 'esm2':
            data[i, :] = th.as_tensor(row.esm2, dtype=th.float32)
        elif features_column == 'interpros':
            # Multi-hot encoding of the known InterPro ids of this protein.
            for feat in row.interpros:
                if feat in features_dict:
                    # The assignment is required: a bare subscript expression
                    # would leave the whole feature matrix at zero.
                    data[i, features_dict[feat]] = 1
        elif features_column == 'mf_preds':
            data[i, :] = th.as_tensor(row.mf_preds, dtype=th.float32)
        elif features_column == 'prop_annotations':
            # Multi-hot encoding of the annotations used as input features.
            for feat in row.prop_annotations:
                if feat in features_dict:
                    data[i, features_dict[feat]] = 1
        # Labels vector: multi-hot over the GO terms present in terms_dict;
        # annotations outside the vocabulary are ignored.
        for go_id in row.prop_annotations:
            if go_id in terms_dict:
                labels[i, terms_dict[go_id]] = 1
    return data, labels

def load_data(
        data_root, ont, terms_file, features_length=2560,
        features_column='esm2', test_data_file='test_data.pkl'):
    """
    Load the term vocabulary, the InterPro vocabulary and the three data
    splits of one ontology, and convert each split to tensors.

    Args:
        data_root: Directory that contains one sub-directory per ontology.
        ont: Ontology sub-directory name (the notebook uses 'bp', 'mf', 'cc').
        terms_file: Pickled dataframe with a ``gos`` column.
        features_length: Width of the feature matrix; replaced by the size of
            the InterPro vocabulary when ``features_column == 'interpros'``.
        features_column: Feature type forwarded to ``get_data``.
        test_data_file: File name of the test split inside the ontology
            directory.

    Returns:
        ``(iprs_dict, terms_dict, train_data, valid_data, test_data, test_df)``
        where each ``*_data`` is the ``(features, labels)`` pair returned by
        ``get_data`` and ``test_df`` is the raw test dataframe.
    """
    # NOTE: pd.read_pickle runs the pickle machinery; only use trusted files.
    go_terms = pd.read_pickle(terms_file)['gos'].values.flatten()
    terms_dict = dict(zip(go_terms, range(len(go_terms))))
    print('Terms', len(go_terms))

    ont_dir = f'{data_root}/{ont}'
    interpro_ids = pd.read_pickle(f'{ont_dir}/interpros.pkl')['interpros'].values
    iprs_dict = {ipr: idx for idx, ipr in enumerate(interpro_ids)}
    if features_column == 'interpros':
        features_length = len(iprs_dict)

    # Read the splits in train / valid / test order before converting any.
    train_df, valid_df, test_df = (
        pd.read_pickle(f'{ont_dir}/{split_file}')
        for split_file in ('train_data.pkl', 'valid_data.pkl', test_data_file)
    )

    def to_tensors(frame):
        return get_data(
            frame, iprs_dict, terms_dict, features_length, features_column)

    train_data = to_tensors(train_df)
    valid_data = to_tensors(valid_df)
    test_data = to_tensors(test_df)

    return iprs_dict, terms_dict, train_data, valid_data, test_data, test_df

In [2]:
import click as ck
import pandas as pd
import torch as th
import numpy as np
from torch import nn
from torch.nn import functional as F
from torch import optim
from torch.optim.lr_scheduler import MultiStepLR
from sklearn.metrics import roc_curve, auc, matthews_corrcoef
import copy
from torch.utils.data import DataLoader, IterableDataset, TensorDataset
from itertools import cycle
import math
from deepgo.torch_utils import FastTensorDataLoader
from deepgo.utils import Ontology, propagate_annots
from multiprocessing import Pool
from functools import partial

In [3]:
# Configuration for the biological-process ('bp') ontology, followed by the
# actual load. Every name bound here is a kernel-global read by later cells.
data_root = 'data'
ont = 'bp'
terms_file = f'{data_root}/{ont}/terms.pkl'
# Matches the default features_length of load_data.
features_length = 2560
features_column = 'esm2'
test_data_file = 'test_data.pkl'
# terms_dict holds the bp vocabulary from here on; each *_data value is the
# (features, labels) tensor pair returned by get_data.
iprs_dict, terms_dict, train_data, valid_data, test_data, test_df = load_data(
        data_root, ont, terms_file, features_length, features_column, test_data_file)

Terms 21356


In [5]:
test_df.head(3)

Unnamed: 0,index,proteins,accessions,genes,sequences,annotations,string_ids,orgs,interpros,exp_annotations,prop_annotations,cafa_target,esm,esm2,mf_preds
58593,430345,RT26_YEAST,P47141; D6VWS0;,853565,MLVFKRGIHVVPKLPNSKALLQNGVPNILSSSGFKTVWFDYQRYLC...,"[GO:0005763|IDA, GO:0005739|HDA, GO:0046872|IE...",[4932.YJR101W],559292,"[IPR019832, IPR036324, IPR036314]","[GO:0005763, GO:0005739, GO:0003735, GO:0032543]","[GO:1901566, GO:1901576, GO:0044271, GO:004323...",True,"[0.06308255, 0.0822591, -0.024374967, 0.066503...","[-0.05179443, -0.0027887435, -0.06578646, -0.0...","[5.51395669390331e-06, 0.0015262124652508646, ..."
62724,448369,SODF3_ARATH,Q9FMX0; O81240; Q8LCD9;,832395,MSSCVVTTSCFYTISDSSIRLKSPKLLNLSNQQRRRSLRSRGGLKV...,"[GO:0009507|IDA, GO:0042644|IDA, GO:0009534|IE...",[3702.AT5G23310.1],3702,"[IPR001189, IPR019833, IPR019832, IPR019831, I...","[GO:0009507, GO:0042644, GO:0042646, GO:000957...","[GO:0005737, GO:0051716, GO:0043227, GO:000030...",True,"[-0.09370262, 0.19858178, -0.018441962, 0.0563...","[-0.055271477, -0.022739667, -0.029521158, -0....","[5.719591513297928e-05, 0.008652074378915131, ..."
62741,448515,SODM_CUPMC,P17550; Q5NUZ9; Q93JN0;,60825782,MLYEMKPLGCEPAKLTGLSEKLIFSHYENNYGGAVKRLNAITATLA...,"[GO:0046872|IEA, GO:0004784|IDA, GO:0046687|IEA]",[],266264,"[IPR001189, IPR019832, IPR036324, IPR036314]",[GO:0004784],"[GO:0051716, GO:0000305, GO:0019430, GO:001672...",False,"[-0.0472369, 0.04435269, -6.680077e-05, 0.0321...","[-0.068125404, -0.08426394, -0.1095703, -0.034...","[2.7134405627293745e-05, 0.012835425324738026,..."


In [6]:
train_feature,labels = train_data
labels[0].shape

torch.Size([21356])

In [8]:
# Load the 'mf' and then the 'cc' ontology to obtain their term dictionaries.
# NOTE(review): both calls rebind ont, terms_file, iprs_dict, train_data,
# valid_data, test_data and test_df. After this cell those names hold the
# 'cc' values, while `terms_dict` still holds the 'bp' vocabulary. Cells that
# combine `terms_dict` with these splits therefore mix ontologies.
ont = 'mf'
terms_file = f'{data_root}/{ont}/terms.pkl'
iprs_dict, terms_dict_mf, train_data, valid_data, test_data, test_df = load_data(
        data_root, ont, terms_file, features_length, features_column, test_data_file)
ont = 'cc'
terms_file = f'{data_root}/{ont}/terms.pkl'
iprs_dict, terms_dict_cc, train_data, valid_data, test_data, test_df = load_data(
        data_root, ont, terms_file, features_length, features_column, test_data_file)

Terms 6851
Terms 2829


In [9]:
# Preview each term dictionary: print entries until the one mapped to index 2
# is reached, then report the dictionary size.
for title, mapping, length_label in (
        ('terms_dict-bp:', terms_dict, "the length of terms_dict: "),
        ("terms_dict_mf:", terms_dict_mf, 'the length of terms_dict_mf: '),
        ("terms_dict_cc:", terms_dict_cc, 'the length of terms_dict_cc: '),
):
    print(title)
    for k, v in mapping.items():
        print(k, v)
        if v == 2:
            break
    print(length_label, len(mapping))

# train_data is whatever split was loaded last (features, labels).
train_feature, train_lable = train_data
for what, tensor in (('label', train_lable), ('feature', train_feature)):
    print(f'the length of train {what}: ', len(tensor))

terms_dict-bp:
GO:0032504 0
GO:0048608 1
GO:0048856 2
the length of terms_dict:  21356
terms_dict_mf:
GO:0045735 0
GO:0097367 1
GO:0097159 2
the length of terms_dict_mf:  6851
terms_dict_cc:
GO:0110165 0
GO:0033643 1
GO:0030430 2
the length of terms_dict_cc:  2829
the length of train label:  52072
the length of train feature:  52072


In [11]:
# Turn the keys of the bp / mf / cc term dictionaries into sets and check
# whether any GO term appears in more than one ontology.
terms_dict_set = set(terms_dict.keys())
terms_dict_mf_set = set(terms_dict_mf.keys())
terms_dict_cc_set = set(terms_dict_cc.keys())
# Report the overlaps that involve cc as well, so that all three pairs are
# actually checked and the cc set is not built for nothing.
print('bp & cc shared terms:', len(terms_dict_set & terms_dict_cc_set))
print('mf & cc shared terms:', len(terms_dict_mf_set & terms_dict_cc_set))
# Cell output: number of GO terms shared by bp and mf.
len(terms_dict_set & terms_dict_mf_set)

0

### 特征查看和总结

In [7]:
# Print a preview of iprs_dict, terms_dict, train_data, valid_data, test_data
# and test_df. The dictionary loops stop at the entry mapped to index 2 and
# the tensor previews slice [:3], so three items are shown for each.
# NOTE(review): the recorded execution count (7) is lower than that of cells
# placed above it; on a top-to-bottom run the *_data globals may hold a
# different ontology than the recorded output shows.
print('iprs_dict:')
for k, v in iprs_dict.items():
    print(k, v)
    if v == 2:
        break
print("the length of iprs_dict: ", len(iprs_dict))
print('terms_dict:')
for k, v in terms_dict.items():
    print(k, v)
    if v == 2:
        break
print("the length of terms_dict: ", len(terms_dict))
# Index 0 of each *_data pair is the feature tensor, index 1 the label tensor.
print("the shape of train_data: ", train_data[0].shape)
print('valid_data:')
print(valid_data[0][:3])
print("the shape of valid_data: ", valid_data[0].shape)
print("valid_data_labels:")
print(valid_data[1][:3])
print("the shape of valid_data_labels: ", valid_data[1].shape)
print("test_data:")
print(test_data[0][:3])
print("the shape of test_data: ", test_data[0].shape)
print("the shape of test_data_labels: ", test_data[1].shape)

print("test_df:")
print("test_df-shape: ",test_df.shape)
# # Save the first five rows of test_df to a CSV file (disabled)
# pd.DataFrame(test_df.head()).to_csv("test_df.csv", index=False)
test_df.head(3)

iprs_dict:
IPR022379 0
IPR006044 1
IPR006045 2
the length of iprs_dict:  26406
terms_dict:
GO:0032504 0
GO:0048608 1
GO:0048856 2
the length of terms_dict:  21356
the shape of train_data:  torch.Size([52584, 2560])
valid_data:
tensor([[ 0.0195,  0.0484,  0.0353,  ..., -0.0280, -0.1314, -0.0181],
        [ 0.0662, -0.0373, -0.0047,  ...,  0.0496,  0.0596, -0.0065],
        [ 0.0379, -0.0409, -0.0049,  ..., -0.0007, -0.0422, -0.0101]])
the shape of valid_data:  torch.Size([2870, 2560])
valid_data_labels:
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
the shape of valid_data_labels:  torch.Size([2870, 21356])
test_data:
tensor([[-0.0518, -0.0028, -0.0658,  ...,  0.0699,  0.0365,  0.0777],
        [-0.0553, -0.0227, -0.0295,  ...,  0.0328,  0.0206,  0.0156],
        [-0.0681, -0.0843, -0.1096,  ...,  0.0600,  0.0646, -0.0385]])
the shape of test_data:  torch.Size([3275, 2560])
the shape of test_data_labels:  torch.Si

Unnamed: 0,index,proteins,accessions,genes,sequences,annotations,string_ids,orgs,interpros,exp_annotations,prop_annotations,cafa_target,esm,esm2,mf_preds
58593,430345,RT26_YEAST,P47141; D6VWS0;,853565,MLVFKRGIHVVPKLPNSKALLQNGVPNILSSSGFKTVWFDYQRYLC...,"[GO:0005763|IDA, GO:0005739|HDA, GO:0046872|IE...",[4932.YJR101W],559292,"[IPR019832, IPR036324, IPR036314]","[GO:0005763, GO:0005739, GO:0003735, GO:0032543]","[GO:1901566, GO:1901576, GO:0044271, GO:004323...",True,"[0.06308255, 0.0822591, -0.024374967, 0.066503...","[-0.05179443, -0.0027887435, -0.06578646, -0.0...","[5.51395669390331e-06, 0.0015262124652508646, ..."
62724,448369,SODF3_ARATH,Q9FMX0; O81240; Q8LCD9;,832395,MSSCVVTTSCFYTISDSSIRLKSPKLLNLSNQQRRRSLRSRGGLKV...,"[GO:0009507|IDA, GO:0042644|IDA, GO:0009534|IE...",[3702.AT5G23310.1],3702,"[IPR001189, IPR019833, IPR019832, IPR019831, I...","[GO:0009507, GO:0042644, GO:0042646, GO:000957...","[GO:0005737, GO:0051716, GO:0043227, GO:000030...",True,"[-0.09370262, 0.19858178, -0.018441962, 0.0563...","[-0.055271477, -0.022739667, -0.029521158, -0....","[5.719591513297928e-05, 0.008652074378915131, ..."
62741,448515,SODM_CUPMC,P17550; Q5NUZ9; Q93JN0;,60825782,MLYEMKPLGCEPAKLTGLSEKLIFSHYENNYGGAVKRLNAITATLA...,"[GO:0046872|IEA, GO:0004784|IDA, GO:0046687|IEA]",[],266264,"[IPR001189, IPR019832, IPR036324, IPR036314]",[GO:0004784],"[GO:0051716, GO:0000305, GO:0019430, GO:001672...",False,"[-0.0472369, 0.04435269, -6.680077e-05, 0.0321...","[-0.068125404, -0.08426394, -0.1095703, -0.034...","[2.7134405627293745e-05, 0.012835425324738026,..."


In [None]:
valid_features, valid_labels = valid_data

In [None]:
# Inspect what itertuples() yields for the first rows of the test dataframe.
# `df` keeps the default head() slice and is reused by the following cell.
df=test_df.head()
for i, row in enumerate(df.itertuples()):
    print("row:", row)
    print("type(row):", type(row))
    print("row.prop_annotations:", row.prop_annotations)
    print("row.interpros:", row.interpros)
    # Stop after the third row (i counts from 0).
    if i == 2:
        break

In [None]:

# Build the feature/label tensors for the preview rows in `df` with the
# shared helper instead of repeating its loop inline, so this cell cannot
# drift away from what load_data actually produces.
# NOTE(review): `terms_dict` and `df` should come from the same ontology;
# confirm which load_data call ran last before reading the labels.
data, labels = get_data(df, iprs_dict, terms_dict, features_length, features_column)

In [None]:
# Load the raw training dataframe of the ontology currently stored in `ont`
# (the value left by the most recently executed cell that assigned it).
# NOTE: read_pickle runs the pickle machinery; only load trusted files.
train_df = pd.read_pickle(f'{data_root}/{ont}/train_data.pkl')
# Print all column names
print(train_df.columns)
train_df.head(3)

In [None]:
# Same import as in the notebook's import cell; repeated so that this cell
# can be run on its own.
from deepgo.utils import Ontology, propagate_annots
go_file = 'data/go.obo'
# Load Gene Ontology and Normalized axioms
go = Ontology(go_file, with_rels=True)
# Dump several attributes of the Ontology object for inspection.
# NOTE(review): presumably ic / ic_norm are information-content tables and
# ancestors is a lookup of parent terms; confirm against deepgo.utils.
# These prints may be very large for a full ontology.
print("go.ic:",go.ic)
print("go.ic_norm:",go.ic_norm)
print('go.ancestors:', go.ancestors)
go.ont

### 尝试加载gpt2激活数据来源

In [19]:
import pickle
from datasets import Dataset
import pandas as pd
# NOTE(review): the variable is called valid_df but holds the bp *test*
# split. The name is kept because the following cells read it.
# NOTE: read_pickle runs the pickle machinery; only load trusted files.
valid_df = pd.read_pickle('./data/bp/test_data.pkl')
# Check whether the proteins column contains duplicate entries; the count is
# the cell output. (A head() preview placed before this line would be
# discarded, since only the last expression of a cell is displayed.)
valid_df['proteins'].duplicated().sum()

0

In [20]:
# Keep only the protein identifier and its sequence, exposed under the column
# names Entry and Sequence (in that order).
valid_df = valid_df.loc[:, ['proteins', 'sequences']].rename(
    columns={'proteins': 'Entry', 'sequences': 'Sequence'})

print("shape of df: ", valid_df.shape)
valid_df.head(3)

shape of df:  (3275, 2)


Unnamed: 0,Entry,Sequence
58593,RT26_YEAST,MLVFKRGIHVVPKLPNSKALLQNGVPNILSSSGFKTVWFDYQRYLC...
62724,SODF3_ARATH,MSSCVVTTSCFYTISDSSIRLKSPKLLNLSNQQRRRSLRSRGGLKV...
62741,SODM_CUPMC,MLYEMKPLGCEPAKLTGLSEKLIFSHYENNYGGAVKRLNAITATLA...


In [21]:
# Save the Entry/Sequence table as CSV (the TSV export below is disabled).
# NOTE(review): the path is relative to the notebook's working directory and
# the target folder must already exist.
valid_df.to_csv('../Get_activate/ctrlprot_dataset/process/test0.csv', index=False)
# valid_df.to_csv('../Get_activate/ctrlprot_dataset/function/0.tsv', sep='\t', index=False)

# Convert the pandas dataframe into a Hugging Face Dataset.
# The dataframe index is carried along as the '__index_level_0__' feature.
dataset = Dataset.from_pandas(valid_df)

# Show the dataset summary
print(dataset)

Dataset({
    features: ['Entry', 'Sequence', '__index_level_0__'],
    num_rows: 3275
})


### 看一下mf和cc有无重合的蛋白质

In [23]:
import pickle
from datasets import Dataset
import pandas as pd
# Read the cc and mf training splits and keep only their protein identifiers;
# despite the *_df names, both variables end up holding a pandas Series.
cc_train_df = pd.read_pickle('./data/cc/train_data.pkl')['proteins']
mf_train_df = pd.read_pickle('./data/mf/train_data.pkl')['proteins']
cc_train_df.head(3)

39691    MRT2_CAEEL
47190    PCNA_YEAST
47188    PCNA_SCHPO
Name: proteins, dtype: object

In [24]:
#将cc_train_df和mf_train_df的数据保存到集合中
cc_train_set = set(cc_train_df)
mf_train_set = set(mf_train_df)

In [25]:
# 看这两个集合中是否有相同的元素，输出相同元素数目
len(cc_train_set & mf_train_set)

25985

In [None]:

bp_train_df = pd.read_pickle('./data/bp/train_data.pkl')
bp_train_df = bp_train_df['proteins']
bp_train_set = set(bp_train_df)

34424

In [None]:

print(len(mf_train_set & bp_train_set))
print(len(cc_train_set & bp_train_set))

30758
34424


### 进行类别合并和转换

In [None]:
import pandas as pd
import torch as th
bp_test_data = pd.read_pickle('./data/bp/test_data.pkl')
cc_test_data = pd.read_pickle('./data/cc/test_data.pkl')
mf_test_data = pd.read_pickle('./data/mf/test_data.pkl')
cc_test_data.head(3)

In [None]:
mf_test_data.head(3)

In [None]:
terms_file = './data/bp/terms.pkl'
terms_df = pd.read_pickle(terms_file)
terms_df


In [None]:
# Rebuild the GO-term -> column-index mapping from the terms dataframe.
terms = terms_df['gos'].values.flatten()
terms_dict = {v: i for i, v in enumerate(terms)}
# Display only the first few entries; echoing the whole mapping would flood
# the cell output with one line per term.
dict(list(terms_dict.items())[:5])

In [11]:
import torch
import torch.nn.functional as F

# Simulated model output: a probability as produced by a sigmoid. It is
# named pred_prob rather than `input` so that the builtin input() is not
# shadowed for the rest of the kernel session.
pred_prob = torch.tensor([0], dtype=torch.float32)

# Ground-truth label
target = torch.tensor([1], dtype=torch.float32)

# Binary cross-entropy. Predicting exactly 0 for a positive label would be
# log(0) = -inf, but PyTorch clamps the log terms at -100, so the loss is 100.
loss = F.binary_cross_entropy(pred_prob, target)

print(f'Binary Cross Entropy Loss: {loss.item()}')

Binary Cross Entropy Loss: 100.0
