In [2]:
import pandas as pd
import re
import requests
from tqdm import tqdm

In [5]:
def parse_fasta(filepath):
    records = []
    with open(filepath, 'r') as f:
        header = None
        seq_lines = []
        for line in f:
            line = line.strip()
            if line.startswith('>'):
                if header:
                    # Parse header fields
                    parts = header.split(' ', 1)
                    ids = parts[0].split('|')
                    db = ids[0]
                    accession = ids[1]
                    entry_name = ids[2]
                    rest = parts[1] if len(parts) > 1 else ''
                    kv_pattern = r'(\w+)=([^\s]+)'
                    kv_pairs = dict(re.findall(kv_pattern, rest))
                    first_kv = re.search(kv_pattern, rest)
                    protein_name = rest[:first_kv.start()].strip() if first_kv else rest.strip()
                    records.append({
                        'database': db,
                        'accession_number': accession,
                        'entry_name': entry_name,
                        'protein_desc': protein_name,
                        'organism_species': kv_pairs.get('OS', ''),
                        'organism_identifier': kv_pairs.get('OX', ''),
                        'gene_name': kv_pairs.get('GN', ''),
                        'protein_existence_level': kv_pairs.get('PE', ''),
                        'sequence_version': kv_pairs.get('SV', ''),
                        'sequence': ''.join(seq_lines)
                    })
                header = line[1:]  # remove '>'
                seq_lines = []
            else:
                seq_lines.append(line)
        # Add the last record
        if header:
            parts = header.split(' ', 1)
            ids = parts[0].split('|')
            db = ids[0]
            accession = ids[1]
            entry_name = ids[2]
            rest = parts[1] if len(parts) > 1 else ''
            kv_pattern = r'(\w+)=([^\s]+)'
            kv_pairs = dict(re.findall(kv_pattern, rest))
            first_kv = re.search(kv_pattern, rest)
            protein_name = rest[:first_kv.start()].strip() if first_kv else rest.strip()
            records.append({
                'database': db,
                'accession_number': accession,
                'entry_name': entry_name,
                'protein_desc': protein_name,
                'organism_species': kv_pairs.get('OS', ''),
                'organism_identifier': kv_pairs.get('OX', ''),
                'gene_name': kv_pairs.get('GN', ''),
                'protein_existence_level': kv_pairs.get('PE', ''),
                'sequence_version': kv_pairs.get('SV', ''),
                'sequence': ''.join(seq_lines)
            })
    return pd.DataFrame(records)

In [6]:
df = parse_fasta("../data/swiss.fasta")
df[['protein_id', 'organism_id']] = df['entry_name'].str.split('_', n=1, expand=True)
df = df.drop(columns=['entry_name'])
df

Unnamed: 0,database,accession_number,protein_desc,organism_species,organism_identifier,gene_name,protein_existence_level,sequence_version,sequence,protein_id,organism_id
0,sp,Q6GZX4,Putative transcription factor 001R,Frog,654924,FV3-001R,4,1,MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQV...,001R,FRG3G
1,sp,Q6GZX3,Uncharacterized protein 002L,Frog,654924,FV3-002L,4,1,MSIIGATRLQNDKSDTYSAGPCYAGGCSAFTPRGTCGKDWDLGEQT...,002L,FRG3G
2,sp,Q197F8,Uncharacterized protein 002R,Invertebrate,345201,IIV3-002R,4,1,MASNTVSAQGGSNRPVRDFSNIQDVAQFLLFDPIWNEQPGSIVPWK...,002R,IIV3
3,sp,Q197F7,Uncharacterized protein 003L,Invertebrate,345201,IIV3-003L,4,1,MYQAINPCPQSWYGSPQLEREIVCKMSGAPHYPNYYPVHPNALGGA...,003L,IIV3
4,sp,Q6GZX2,Uncharacterized protein 3R,Frog,654924,FV3-003R,3,1,MARPLLGKTSSVRRRLESLSACSIFFFLRKFCQKMASLVFLNSPVY...,003R,FRG3G
...,...,...,...,...,...,...,...,...,...,...,...
573656,sp,Q6UY62,RING finger protein Z,Sabia,3052299,Z,1,1,MGNSKSKSKLSANQYEQQTVNSTKQVAILKRQAEPSLYGRHNCRCC...,Z,SABVB
573657,sp,P08105,Putative uncharacterized protein Z,Ovis,9940,,4,1,MSSSLEITSFYSFIWTPHIGPLLFGIGLWFSMFKEPSHFCPCQHPH...,Z,SHEEP
573658,sp,Q88470,RING finger protein Z,Tacaribe,928313,Z,1,3,MGNCNRTQKPSSSSNNLEKPPQAAEFRRTAEPSLYGRYNCKCCWFA...,Z,TACVF
573659,sp,A9JR22,RING finger protein Z,Tamiami,3052329,Z,3,1,MGLRYSKEVRDRHGDKDPEGRIPITQTMPQTLYGRYNCKSCWFANK...,Z,TAMVU


In [9]:
def fetch_interpro_annotations(accession):
    url = f"https://www.ebi.ac.uk/interpro/api/entry/interpro/protein/uniprot/{accession}/"
    headers = {"Accept": "application/json"}
    response = requests.get(url, headers=headers)

    if response.status_code != 200:
        return []

    data = response.json().get("results", [])
    print(data)
    annotations = []
    for entry in data:
        meta = entry.get("metadata", {})
        interpro_id = meta.get("accession", "")
        interpro_name = meta.get("name", "")
        source_database = meta.get("source_database", "")
        interpro_type = meta.get("type", "")
        integrated = meta.get("integrated", None)
        member_databases = meta.get("member_databases", {})
        go_terms = meta.get("go_terms", None)

        for protein in entry.get("proteins", []):
            for loc in protein.get("entry_protein_locations", []):
                for fragment in loc.get("fragments", []):
                    annotations.append({
                        "interpro_id": interpro_id,
                        "interpro_name": interpro_name,
                        "source_database": source_database,
                        "interpro_type": interpro_type,
                        "integrated": integrated,
                        "member_databases": member_databases,
                        "go_terms": go_terms,
                        "start": fragment.get("start"),
                        "end": fragment.get("end")
                    })
    return annotations

def get_interpro_dataframe_from_fasta(df):
    expanded_rows = []
    i = 0
    for _, row in tqdm(df.iterrows()):
        accession = row['accession_number']
        annotations = fetch_interpro_annotations(accession)
        for ann in annotations:
            expanded_rows.append({
                "accession_number": accession,
                "protein_id": row['protein_id'],
                "organism_id": row['organism_id'],
                **ann
            })
        i += 1
        if i == 10:
            break

    return pd.DataFrame(expanded_rows)

enriched_df = get_interpro_dataframe_from_fasta(df)

1it [00:10, 10.02s/it]

[{'metadata': {'accession': 'IPR007031', 'name': 'Poxvirus VLTF3, late transcription factor', 'source_database': 'interpro', 'type': 'family', 'integrated': None, 'member_databases': {'pfam': {'PF04947': 'Poxvirus Late Transcription Factor VLTF3 like'}}, 'go_terms': [{'identifier': 'GO:0046782', 'name': 'regulation of viral transcription', 'category': {'code': 'P', 'name': 'biological_process'}}]}, 'proteins': [{'accession': 'q6gzx4', 'protein_length': 256, 'source_database': 'reviewed', 'organism': '654924', 'in_alphafold': False, 'in_bfvd': False, 'entry_protein_locations': [{'fragments': [{'start': 81, 'end': 253, 'dc-status': 'CONTINUOUS'}], 'representative': False, 'model': None, 'score': None}]}]}]


2it [00:10,  4.62s/it]

[{'metadata': {'accession': 'IPR004251', 'name': 'Pox virus entry-fusion-complex G9/A16', 'source_database': 'interpro', 'type': 'family', 'integrated': None, 'member_databases': {'pfam': {'PF03003': 'Pox virus entry-fusion-complex G9/A16'}}, 'go_terms': None}, 'proteins': [{'accession': 'q6gzx3', 'protein_length': 320, 'source_database': 'reviewed', 'organism': '654924', 'in_alphafold': False, 'in_bfvd': False, 'entry_protein_locations': [{'fragments': [{'start': 169, 'end': 247, 'dc-status': 'CONTINUOUS'}], 'representative': False, 'model': None, 'score': None}]}]}]


8it [00:16,  1.17s/it]

[{'metadata': {'accession': 'IPR003360', 'name': 'US22-like', 'source_database': 'interpro', 'type': 'family', 'integrated': None, 'member_databases': {'pfam': {'PF02393': 'US22 like'}}, 'go_terms': None}, 'proteins': [{'accession': 'q6gzx0', 'protein_length': 204, 'source_database': 'reviewed', 'organism': '654924', 'in_alphafold': False, 'in_bfvd': False, 'entry_protein_locations': [{'fragments': [{'start': 37, 'end': 150, 'dc-status': 'CONTINUOUS'}], 'representative': False, 'model': None, 'score': None}]}]}]


9it [00:17,  1.07s/it]

[{'metadata': {'accession': 'IPR017880', 'name': 'KilA, N-terminal', 'source_database': 'interpro', 'type': 'domain', 'integrated': None, 'member_databases': {'profile': {'PS51301': 'KilA-N domain profile'}}, 'go_terms': None}, 'proteins': [{'accession': 'q91g88', 'protein_length': 352, 'source_database': 'reviewed', 'organism': '176652', 'in_alphafold': False, 'in_bfvd': False, 'entry_protein_locations': [{'fragments': [{'start': 15, 'end': 123, 'dc-status': 'CONTINUOUS'}], 'representative': False, 'model': None, 'score': None}]}]}, {'metadata': {'accession': 'IPR018004', 'name': 'KilA/APSES-type HTH, DNA-binding', 'source_database': 'interpro', 'type': 'domain', 'integrated': None, 'member_databases': {'smart': {'SM01252': 'KilA-N'}, 'pfam': {'PF04383': 'KilA-N domain'}}, 'go_terms': None}, 'proteins': [{'accession': 'q91g88', 'protein_length': 352, 'source_database': 'reviewed', 'organism': '176652', 'in_alphafold': False, 'in_bfvd': False, 'entry_protein_locations': [{'fragments': 

9it [00:18,  2.06s/it]


In [10]:
enriched_df

Unnamed: 0,accession_number,protein_id,organism_id,interpro_id,interpro_name,source_database,interpro_type,integrated,member_databases,go_terms,start,end
0,Q6GZX4,001R,FRG3G,IPR007031,"Poxvirus VLTF3, late transcription factor",interpro,family,,{'pfam': {'PF04947': 'Poxvirus Late Transcript...,"[{'identifier': 'GO:0046782', 'name': 'regulat...",81,253
1,Q6GZX3,002L,FRG3G,IPR004251,Pox virus entry-fusion-complex G9/A16,interpro,family,,{'pfam': {'PF03003': 'Pox virus entry-fusion-c...,,169,247
2,Q6GZX0,005R,FRG3G,IPR003360,US22-like,interpro,family,,{'pfam': {'PF02393': 'US22 like'}},,37,150
3,Q91G88,006L,IIV6,IPR017880,"KilA, N-terminal",interpro,domain,,{'profile': {'PS51301': 'KilA-N domain profile'}},,15,123
4,Q91G88,006L,IIV6,IPR018004,"KilA/APSES-type HTH, DNA-binding",interpro,domain,,"{'smart': {'SM01252': 'KilA-N'}, 'pfam': {'PF0...",,21,123
5,Q91G88,006L,IIV6,IPR022549,Domain of unknown function DUF3627,interpro,domain,,{'pfam': {'PF12299': 'Protein of unknown funct...,,231,322


In [13]:
# merge dataframes
merged_df = pd.merge(
    enriched_df,
    df,
    on=['accession_number'],
    how='left'  # or 'inner' if you only want rows with matches in both
)

In [14]:
merged_df

Unnamed: 0,accession_number,protein_id_x,organism_id_x,interpro_id,interpro_name,source_database,interpro_type,integrated,member_databases,go_terms,...,database,protein_desc,organism_species,organism_identifier,gene_name,protein_existence_level,sequence_version,sequence,protein_id_y,organism_id_y
0,Q6GZX4,001R,FRG3G,IPR007031,"Poxvirus VLTF3, late transcription factor",interpro,family,,{'pfam': {'PF04947': 'Poxvirus Late Transcript...,"[{'identifier': 'GO:0046782', 'name': 'regulat...",...,sp,Putative transcription factor 001R,Frog,654924,FV3-001R,4,1,MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQV...,001R,FRG3G
1,Q6GZX3,002L,FRG3G,IPR004251,Pox virus entry-fusion-complex G9/A16,interpro,family,,{'pfam': {'PF03003': 'Pox virus entry-fusion-c...,,...,sp,Uncharacterized protein 002L,Frog,654924,FV3-002L,4,1,MSIIGATRLQNDKSDTYSAGPCYAGGCSAFTPRGTCGKDWDLGEQT...,002L,FRG3G
2,Q6GZX0,005R,FRG3G,IPR003360,US22-like,interpro,family,,{'pfam': {'PF02393': 'US22 like'}},,...,sp,Uncharacterized protein 005R,Frog,654924,FV3-005R,4,1,MQNPLPEVMSPEHDKRTTTPMSKEANKFIRELDKKPGDLAVVSDFV...,005R,FRG3G
3,Q91G88,006L,IIV6,IPR017880,"KilA, N-terminal",interpro,domain,,{'profile': {'PS51301': 'KilA-N domain profile'}},,...,sp,Putative KilA-N domain-containing protein 006L,Invertebrate,176652,IIV6-006L,3,1,MDSLNEVCYEQIKGTFYKGLFGDFPLIVDKKTGCFNATKLCVLGGK...,006L,IIV6
4,Q91G88,006L,IIV6,IPR018004,"KilA/APSES-type HTH, DNA-binding",interpro,domain,,"{'smart': {'SM01252': 'KilA-N'}, 'pfam': {'PF0...",,...,sp,Putative KilA-N domain-containing protein 006L,Invertebrate,176652,IIV6-006L,3,1,MDSLNEVCYEQIKGTFYKGLFGDFPLIVDKKTGCFNATKLCVLGGK...,006L,IIV6
5,Q91G88,006L,IIV6,IPR022549,Domain of unknown function DUF3627,interpro,domain,,{'pfam': {'PF12299': 'Protein of unknown funct...,,...,sp,Putative KilA-N domain-containing protein 006L,Invertebrate,176652,IIV6-006L,3,1,MDSLNEVCYEQIKGTFYKGLFGDFPLIVDKKTGCFNATKLCVLGGK...,006L,IIV6
