In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# dataset stored on Drive can be read by the following cells.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Copy the dataset's contents from Drive into /content/mywork.
# Creating the target first and copying "<src>/." keeps this cell idempotent:
# a plain `cp -r src dst` nests src inside dst when dst already exists
# (e.g. when the cell is run a second time in the same session).
!mkdir -p /content/mywork && cp -r /content/drive/MyDrive/TargetP2_data/. /content/mywork/

In [3]:
import os
# Work from the copied dataset directory so later cells can refer to the data
# files by their bare names.
os.chdir('/content/mywork')
current_path = os.getcwd()
# Last expression of the cell: the directory listing is shown as the cell output.
os.listdir(current_path)

['targetp.fasta', 'targetp_data.npz']

In [26]:
import pandas as pd

def fasta_to_dataframe(file_path):
    """Parse a FASTA file into a two-column pandas DataFrame.

    Every line starting with ">" opens a new record; the text after the ">"
    (with surrounding whitespace stripped) becomes that record's ``Entry``.
    All following lines, up to the next header, are stripped and concatenated
    into the record's ``Sequence``. Lines that appear before the first header
    are discarded.

    Args:
        file_path: Path of the FASTA file to read (opened in text mode).

    Returns:
        A DataFrame with the columns ``Entry`` and ``Sequence``, holding one
        row per record in file order. An input without any header yields an
        empty DataFrame with those two columns.
    """
    records = []
    header = None
    chunks = []

    with open(file_path, 'r') as handle:
        for raw in handle:
            text = raw.strip()
            if not text.startswith(">"):
                # Sequence (or blank) line: collect it for the current record.
                chunks.append(text)
                continue
            # A new header closes the record collected so far, if there is one.
            if header is not None:
                records.append([header, ''.join(chunks)])
            header = text[1:]
            chunks = []

    # The final record has no following header to close it, so flush it here.
    if header is not None:
        records.append([header, ''.join(chunks)])

    return pd.DataFrame(records, columns=['Entry', 'Sequence'])

# Parse the FASTA file (path is relative to the current working directory)
# into a DataFrame of Entry/Sequence rows and print a preview.
file_path = 'targetp.fasta'
df1 = fasta_to_dataframe(file_path)
print(df1)



        Entry                                           Sequence
0      P92192  MKFLIVFVALFAMAVARPNLAEIVRQVSDVEPEKWSSDVETSDGTS...
1      P30042  MAAVRALVASRLAAASAFTSLSPGGRTPSQRAALHLSVPRPAARVA...
2      Q9D666  MDFSRLHTYTPPQCVPENTGYTYALSSSYSSDALDFETEHKLEPVF...
3      Q9BUL9  MENFRKVRSEEAPAGCGAEGGGPGSGPFADLAPGAVHMRVKEGSKI...
4      U6A629  MLAEYLLLPLLASYASAVTISVAKSGGNVTTGLQYGAMEEEINHCG...
...       ...                                                ...
13000  Q38799  MLGILRQRAIDGASTLRRTRFALVSARSYAAGAKEMTVRDALNSAI...
13001  Q8BMA5  MLLPSDVARLVLGYLQQENLTSTCQTFILESSNLKEYAEHCTDEGF...
13002  Q2YHJ8  MRTLWIMAVLLVGVEGSVIQLGKMILQETGKNPVKYYGAYGCNCGP...
13003  Q9LVG4  MCFNNIETGDEVETERQVFGSSEEDEFRVEDTARNTNNVQISQQQQ...
13004  P60266  MNSLLMITACLALVGTVWAKEGYLVNSYTGCKFECFKLGDNDYCLR...

[13005 rows x 2 columns]


In [27]:
import numpy as np

def load_npz_file(file_path):
    """Load the fold, ID and type-label arrays from an ``.npz`` archive.

    Reads the ``fold``, ``ids`` and ``y_type`` arrays, converts the integer
    ``y_type`` codes into text labels, prints a summary of the three arrays
    and returns them.

    Args:
        file_path: Path to an ``.npz`` file containing the arrays ``fold``,
            ``ids`` and ``y_type``.

    Returns:
        A tuple ``(fold, ids, y_type_text)``; ``y_type_text`` is a NumPy array
        holding the text label for each element of ``y_type``.

    Raises:
        KeyError: If one of the required arrays is missing from the archive,
            or if ``y_type`` contains a code outside 0-4.
    """
    # Mapping from the integer y_type codes to their text labels.
    y_type_labels = {0: 'noTP', 1: 'SP', 2: 'mTP', 3: 'cTP', 4: 'luTP'}

    # The context manager closes the archive even when a lookup raises.
    # The arrays are materialised on access, so they stay usable afterwards.
    with np.load(file_path) as data:
        fold = data['fold']
        ids = data['ids']
        y_type = data['y_type']

    # Replace every numeric code with its text label.
    y_type_text = np.array([y_type_labels[y] for y in y_type])

    # Print a summary of the arrays.
    # NOTE(review): the "EC number" label is printed for the y_type labels and
    # looks like a misnomer; the text is kept unchanged so existing output
    # stays the same — confirm before renaming.
    print("Fold:", fold)
    print("Entry:", ids)
    print("EC number:", y_type_text)

    return fold, ids, y_type_text

# Load the fold, ID and label arrays from the npz file (path is relative to
# the current working directory).
file_path = 'targetp_data.npz'
folds, ids, y_type_text = load_npz_file(file_path)


Fold: [0 0 0 ... 4 3 0]
Entry: ['P10719' 'Q38786' 'P15289' ... 'Q07800' 'O14228' 'Q9VUL1']
EC number: ['mTP' 'cTP' 'SP' ... 'noTP' 'noTP' 'noTP']


In [28]:
# Assemble the loaded arrays into a pandas DataFrame, one column per array.
# NOTE(review): the 'EC Number' column holds the y_type text labels
# (noTP/SP/mTP/...), so the name looks like a misnomer. It is left unchanged
# because it becomes a column header of the exported table — confirm before
# renaming.
data = {
    'Fold': folds,
    'Entry': ids,
    'EC Number': y_type_text
}
df2 = pd.DataFrame(data)
print(df2)

       Fold   Entry EC Number
0         0  P10719       mTP
1         0  Q38786       cTP
2         0  P15289        SP
3         1  P20933        SP
4         1  Q9UII2       mTP
...     ...     ...       ...
13000     4  P35998      noTP
13001     1  Q9EST4      noTP
13002     4  Q07800      noTP
13003     3  O14228      noTP
13004     0  Q9VUL1      noTP

[13005 rows x 3 columns]


In [30]:
# Join the sequences (df1) with the fold/label table (df2) on the shared
# 'Entry' key. validate='one_to_one' makes pandas raise MergeError when an
# Entry is duplicated on either side instead of silently multiplying rows.
merged_df = pd.merge(df1, df2, on='Entry', how='inner', validate='one_to_one')

# An inner join silently drops entries that are missing from either frame;
# report it (without failing) when that happened.
if len(merged_df) != len(df1) or len(merged_df) != len(df2):
    print(f"Warning: merge kept {len(merged_df)} rows (df1: {len(df1)}, df2: {len(df2)})")

# Show the merged DataFrame.
print(merged_df)

        Entry                                           Sequence  Fold  \
0      P92192  MKFLIVFVALFAMAVARPNLAEIVRQVSDVEPEKWSSDVETSDGTS...     0   
1      P30042  MAAVRALVASRLAAASAFTSLSPGGRTPSQRAALHLSVPRPAARVA...     2   
2      Q9D666  MDFSRLHTYTPPQCVPENTGYTYALSSSYSSDALDFETEHKLEPVF...     3   
3      Q9BUL9  MENFRKVRSEEAPAGCGAEGGGPGSGPFADLAPGAVHMRVKEGSKI...     4   
4      U6A629  MLAEYLLLPLLASYASAVTISVAKSGGNVTTGLQYGAMEEEINHCG...     4   
...       ...                                                ...   ...   
13000  Q38799  MLGILRQRAIDGASTLRRTRFALVSARSYAAGAKEMTVRDALNSAI...     0   
13001  Q8BMA5  MLLPSDVARLVLGYLQQENLTSTCQTFILESSNLKEYAEHCTDEGF...     4   
13002  Q2YHJ8  MRTLWIMAVLLVGVEGSVIQLGKMILQETGKNPVKYYGAYGCNCGP...     1   
13003  Q9LVG4  MCFNNIETGDEVETERQVFGSSEEDEFRVEDTARNTNNVQISQQQQ...     1   
13004  P60266  MNSLLMITACLALVGTVWAKEGYLVNSYTGCKFECFKLGDNDYCLR...     1   

      EC Number  
0            SP  
1           mTP  
2          noTP  
3          noTP  
4            SP  
...

In [31]:
# Write the merged table to result.csv in the current working directory,
# omitting the DataFrame index column.
merged_df.to_csv('result.csv', index=False)