#### InterPro signatures were used to generate binary protein representation vectors, where a value of 1 indicates the presence of a specific InterPro signature in a protein.

In [1]:
from pathlib import Path
import pandas as pd



base_path = Path('path_to_data_directory')


bp_train_path = base_path / 'bp/train_data.pkl'
bp_valid_path = base_path / 'bp/valid_data.pkl'
bp_test_path = base_path / 'bp/test_data.pkl'

cc_train_path = base_path / 'cc/train_data.pkl'
cc_valid_path = base_path / 'cc/valid_data.pkl'
cc_test_path = base_path / 'cc/test_data.pkl'

mf_train_path = base_path / 'mf/train_data.pkl'
mf_valid_path = base_path / 'mf/valid_data.pkl'
mf_test_path = base_path / 'mf/test_data.pkl'


def preprocess(data_path, data_type, ont):
    data = pd.read_pickle(data_path)
    data.rename(columns={'prop_annotations': 'term'}, inplace=True)
    data = data[['proteins', 'sequences', 'term']].rename(columns={'proteins': 'protein_name'})
    data['Set'] = data_type
    data['aspect'] = ont
    return data


bp_train = preprocess(bp_train_path, "Train", "BPO")
cc_train = preprocess(cc_train_path, "Train", "CCO")
mf_train = preprocess(mf_train_path, "Train", "MFO")

bp_valid = preprocess(bp_valid_path, "Valid", "BPO")
cc_valid = preprocess(cc_valid_path, "Valid", "CCO")
mf_valid = preprocess(mf_valid_path, "Valid", "MFO")

bp_test = preprocess(bp_test_path, "Test", "BPO")
cc_test = preprocess(cc_test_path, "Test", "CCO")
mf_test = preprocess(mf_test_path, "Test", "MFO")

# Concatenate 
mf = pd.concat([mf_train, mf_valid, mf_test], ignore_index=True)
cc = pd.concat([cc_train, cc_valid, cc_test], ignore_index=True)
bp = pd.concat([bp_train, bp_valid, bp_test], ignore_index=True)

data = pd.concat([bp, cc, mf], ignore_index=True)


In [3]:
seq = data[['protein_name', 'sequences']].drop_duplicates().reset_index(drop = True)
seq.head()

Unnamed: 0,protein_name,sequences
0,VGFR2_MOUSE,MESKALLAVALWFCVETRAASVGLPGDFLHPPKLSTQKDILTILAN...
1,VGFR2_RAT,MESRALLAVALWFCVETRAASVGLPGDSLHPPKLSTQKDILTILAN...
2,VGFR2_HUMAN,MQSKVLLAVALWLCVETRAASVGLPSVSLDLPRLSIQKDILTIKAN...
3,VGFR2_DANRE,MAKTSYALLLLDILLTFNVAKAIELRFVPDPPTLNITEKTIKINAS...
4,KIT_MOUSE,MRGARGAWDLLCVLLVLLRGQTATSQPSASPGEPSPPSIHPAQSEL...


### Write sequences into fasta file and run InterProScan software
#### ./interproscan.sh -i ../sequences.fasta -b ../

In [3]:
fasta_file = 'zerogo_sequences.fasta'
with open(fasta_file, 'w') as f:
    for index, row in data.iterrows():
        protein_name = row['protein_name']
        sequence = row['sequences']
        f.write(f'>{protein_name}\n{sequence}\n')

### Load output of InterProScan 

In [5]:
import pandas as pd


file_path = "path_to_fasta_file/zerogo_sequences.fasta.tsv"
interpro = pd.read_csv(file_path, sep='\t', header=None)
column_names = [
    "protein_name",
    "Sequence MD5 Digest",
    "Sequence Length",
    "Analysis",
    "Signature Accession",
    "Signature Description",
    "Start Location",
    "Stop Location",
    "Score",
    "Status",
    "Date",
    "accession",
    "InterPro Annotation Description",
    "GO Annotations",
    "Pathways annotations"
]

interpro.columns = column_names


pro = interpro[["protein_name","accession" ]].drop_duplicates()
pro.reset_index(drop=True, inplace=True)
pro = pro.replace('-', pd.NA).dropna()

interpro.head()

Unnamed: 0,protein_name,Sequence MD5 Digest,Sequence Length,Analysis,Signature Accession,Signature Description,Start Location,Stop Location,Score,Status,Date,accession,InterPro Annotation Description,GO Annotations,Pathways annotations
0,LSP1_MOUSE,8df114eda8d502a39d58e7eb4288789f,330,MobiDBLite,mobidb-lite,consensus disorder prediction,112,163,-,T,13-06-2024,-,-,-,-
1,LSP1_MOUSE,8df114eda8d502a39d58e7eb4288789f,330,Coils,Coil,Coil,29,49,-,T,13-06-2024,-,-,-,-
2,LSP1_MOUSE,8df114eda8d502a39d58e7eb4288789f,330,MobiDBLite,mobidb-lite,consensus disorder prediction,95,111,-,T,13-06-2024,-,-,-,-
3,LSP1_MOUSE,8df114eda8d502a39d58e7eb4288789f,330,PRINTS,PR01083,Lymphocyte-specific protein signature,306,327,2.1E-123,T,13-06-2024,IPR002211,Lymphocyte-specific protein,-,-
4,LSP1_MOUSE,8df114eda8d502a39d58e7eb4288789f,330,PRINTS,PR01083,Lymphocyte-specific protein signature,131,147,2.1E-123,T,13-06-2024,IPR002211,Lymphocyte-specific protein,-,-


In [8]:
# Proteins and their corresponding signatures
pro.head()

Unnamed: 0,protein_name,accession
1,LSP1_MOUSE,IPR002211
2,LSP1_MOUSE,IPR006018
3,ALG5_SCHPO,IPR029044
4,ALG5_SCHPO,IPR035518
6,ALG5_SCHPO,IPR001173


In [10]:
#cluster based on signitures
unique_accessions = pro.groupby('accession')['protein_name'].apply(list).reset_index()
unique_accessions.head()

Unnamed: 0,accession,protein_name
0,IPR000001,"[KREM1_MOUSE, NETR_MOUSE, PLMN_RAT, KREM2_MOUS..."
1,IPR000003,"[RXRAB_DANRE, RXRGA_DANRE, RXRG_RAT, RXRA_MOUS..."
2,IPR000006,"[MT2_DANRE, MT1H_HUMAN, MT1B_HUMAN, MT1L_HUMAN..."
3,IPR000007,"[TLP10_ARATH, TLP6_ARATH, TUB_RAT, TLP1_ARATH,..."
4,IPR000008,"[SYGP1_MOUSE, ABR_HUMAN, C2D2A_HUMAN, SYTL1_MO..."


In [10]:
len(unique_accessions['protein_name'][0])

45

In [11]:
import itertools

protein_pairs = []

for proteins in unique_accessions['protein_name']:
    
    if len(proteins) > 1:
        pairs = list(itertools.combinations(proteins, 2))
        protein_pairs.extend(pairs)


network= pd.DataFrame(protein_pairs, columns=['protein1', 'protein2'])

In [12]:
network.head()

Unnamed: 0,protein1,protein2
0,KREM1_MOUSE,NETR_MOUSE
1,KREM1_MOUSE,PLMN_RAT
2,KREM1_MOUSE,KREM2_MOUSE
3,KREM1_MOUSE,HGFL_HUMAN
4,KREM1_MOUSE,UROK_RAT


In [13]:
network.to_pickle('zerogo_network.pkl')

In [13]:
unique_proteins = pro['protein_name'].unique()
unique_accessions = pro['accession'].unique()


matrix = pd.DataFrame(0, index=unique_proteins, columns=unique_accessions)

for _, row in pro.iterrows():
    matrix.loc[row['protein_name'], row['accession']] = 1

# Convert the matrix to DataFrame
result_df = pd.DataFrame(matrix)
result_df.reset_index(inplace=True)
result_df.rename(columns={'index': 'protein_name'}, inplace=True)
result_df.set_index('protein_name', inplace=True)
result_df.head()

Unnamed: 0_level_0,IPR002211,IPR006018,IPR029044,IPR035518,IPR001173,IPR014729,IPR003694,IPR022310,IPR022926,IPR029460,...,IPR052055,IPR040244,IPR021480,IPR029709,IPR053066,IPR047938,IPR052907,IPR026673,IPR048744,IPR049071
protein_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
LSP1_MOUSE,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ALG5_SCHPO,0,0,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
NADE_ECOLI,0,0,0,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
DNAE2_CORGL,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
BTSS_ECOLI,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
result_df['binary_vector'] = result_df.apply(lambda row: row.tolist(), axis=1)
binary_df =result_df[['binary_vector']]
binary_df.head()

Unnamed: 0_level_0,binary_vector
protein_name,Unnamed: 1_level_1
LSP1_MOUSE,"[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
ALG5_SCHPO,"[0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
NADE_ECOLI,"[0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ..."
DNAE2_CORGL,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, ..."
BTSS_ECOLI,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [None]:
binary_df.to_pickle('binary_zerogo.pkl')