In [1]:
import pandas as pd
import re
import requests
from tqdm import tqdm

In [2]:
def parse_fasta(filepath):
    records = []
    with open(filepath, 'r') as f:
        header = None
        seq_lines = []
        for line in f:
            line = line.strip()
            if line.startswith('>'):
                if header:
                    # Parse header fields
                    parts = header.split(' ', 1)
                    ids = parts[0].split('|')
                    db = ids[0]
                    accession = ids[1]
                    entry_name = ids[2]
                    rest = parts[1] if len(parts) > 1 else ''
                    kv_pattern = r'(\w+)=([^\s]+)'
                    kv_pairs = dict(re.findall(kv_pattern, rest))
                    first_kv = re.search(kv_pattern, rest)
                    protein_name = rest[:first_kv.start()].strip() if first_kv else rest.strip()
                    records.append({
                        'database': db,
                        'accession_number': accession,
                        'entry_name': entry_name,
                        'protein_desc': protein_name,
                        'organism_species': kv_pairs.get('OS', ''),
                        'organism_identifier': kv_pairs.get('OX', ''),
                        'gene_name': kv_pairs.get('GN', ''),
                        'protein_existence_level': kv_pairs.get('PE', ''),
                        'sequence_version': kv_pairs.get('SV', ''),
                        'sequence': ''.join(seq_lines)
                    })
                header = line[1:]  # remove '>'
                seq_lines = []
            else:
                seq_lines.append(line)
        # Add the last record
        if header:
            parts = header.split(' ', 1)
            ids = parts[0].split('|')
            db = ids[0]
            accession = ids[1]
            entry_name = ids[2]
            rest = parts[1] if len(parts) > 1 else ''
            kv_pattern = r'(\w+)=([^\s]+)'
            kv_pairs = dict(re.findall(kv_pattern, rest))
            first_kv = re.search(kv_pattern, rest)
            protein_name = rest[:first_kv.start()].strip() if first_kv else rest.strip()
            records.append({
                'database': db,
                'accession_number': accession,
                'entry_name': entry_name,
                'protein_desc': protein_name,
                'organism_species': kv_pairs.get('OS', ''),
                'organism_identifier': kv_pairs.get('OX', ''),
                'gene_name': kv_pairs.get('GN', ''),
                'protein_existence_level': kv_pairs.get('PE', ''),
                'sequence_version': kv_pairs.get('SV', ''),
                'sequence': ''.join(seq_lines)
            })
    return pd.DataFrame(records)

In [3]:
df = parse_fasta("../data/swiss.fasta")
df[['protein_id', 'organism_id']] = df['entry_name'].str.split('_', n=1, expand=True)
df = df.drop(columns=['entry_name'])
df

Unnamed: 0,database,accession_number,protein_desc,organism_species,organism_identifier,gene_name,protein_existence_level,sequence_version,sequence,protein_id,organism_id
0,sp,Q6GZX4,Putative transcription factor 001R,Frog,654924,FV3-001R,4,1,MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQV...,001R,FRG3G
1,sp,Q6GZX3,Uncharacterized protein 002L,Frog,654924,FV3-002L,4,1,MSIIGATRLQNDKSDTYSAGPCYAGGCSAFTPRGTCGKDWDLGEQT...,002L,FRG3G
2,sp,Q197F8,Uncharacterized protein 002R,Invertebrate,345201,IIV3-002R,4,1,MASNTVSAQGGSNRPVRDFSNIQDVAQFLLFDPIWNEQPGSIVPWK...,002R,IIV3
3,sp,Q197F7,Uncharacterized protein 003L,Invertebrate,345201,IIV3-003L,4,1,MYQAINPCPQSWYGSPQLEREIVCKMSGAPHYPNYYPVHPNALGGA...,003L,IIV3
4,sp,Q6GZX2,Uncharacterized protein 3R,Frog,654924,FV3-003R,3,1,MARPLLGKTSSVRRRLESLSACSIFFFLRKFCQKMASLVFLNSPVY...,003R,FRG3G
...,...,...,...,...,...,...,...,...,...,...,...
573656,sp,Q6UY62,RING finger protein Z,Sabia,3052299,Z,1,1,MGNSKSKSKLSANQYEQQTVNSTKQVAILKRQAEPSLYGRHNCRCC...,Z,SABVB
573657,sp,P08105,Putative uncharacterized protein Z,Ovis,9940,,4,1,MSSSLEITSFYSFIWTPHIGPLLFGIGLWFSMFKEPSHFCPCQHPH...,Z,SHEEP
573658,sp,Q88470,RING finger protein Z,Tacaribe,928313,Z,1,3,MGNCNRTQKPSSSSNNLEKPPQAAEFRRTAEPSLYGRYNCKCCWFA...,Z,TACVF
573659,sp,A9JR22,RING finger protein Z,Tamiami,3052329,Z,3,1,MGLRYSKEVRDRHGDKDPEGRIPITQTMPQTLYGRYNCKSCWFANK...,Z,TAMVU


In [4]:
def fetch_interpro_annotations(accession):
    url = f"https://www.ebi.ac.uk/interpro/api/entry/interpro/protein/uniprot/{accession}/"
    headers = {"Accept": "application/json"}
    response = requests.get(url, headers=headers)

    if response.status_code != 200:
        return []

    data = response.json().get("results", [])
    print(data)
    annotations = []
    for entry in data:
        meta = entry.get("metadata", {})
        interpro_id = meta.get("accession", "")
        interpro_name = meta.get("name", "")
        source_database = meta.get("source_database", "")
        interpro_type = meta.get("type", "")
        integrated = meta.get("integrated", None)
        member_databases = meta.get("member_databases", {})
        go_terms = meta.get("go_terms", None)

        for protein in entry.get("proteins", []):
            for loc in protein.get("entry_protein_locations", []):
                for fragment in loc.get("fragments", []):
                    annotations.append({
                        "interpro_id": interpro_id,
                        "interpro_name": interpro_name,
                        "source_database": source_database,
                        "interpro_type": interpro_type,
                        "integrated": integrated,
                        "member_databases": member_databases,
                        "go_terms": go_terms,
                        "start": fragment.get("start"),
                        "end": fragment.get("end")
                    })
    return annotations

def get_interpro_dataframe_from_fasta(df):
    expanded_rows = []
    i = 0
    for _, row in tqdm(df.iterrows()):
        accession = row['accession_number']
        annotations = fetch_interpro_annotations(accession)
        for ann in annotations:
            expanded_rows.append({
                "accession_number": accession,
                "protein_id": row['protein_id'],
                "organism_id": row['organism_id'],
                **ann
            })
        i += 1
        if i == 10:
            break

    return pd.DataFrame(expanded_rows)

enriched_df = get_interpro_dataframe_from_fasta(df)

1it [00:01,  1.01s/it]

[{'metadata': {'accession': 'IPR007031', 'name': 'Poxvirus VLTF3, late transcription factor', 'source_database': 'interpro', 'type': 'family', 'integrated': None, 'member_databases': {'pfam': {'PF04947': 'Poxvirus Late Transcription Factor VLTF3 like'}}, 'go_terms': [{'identifier': 'GO:0046782', 'name': 'regulation of viral transcription', 'category': {'code': 'P', 'name': 'biological_process'}}]}, 'proteins': [{'accession': 'q6gzx4', 'protein_length': 256, 'source_database': 'reviewed', 'organism': '654924', 'in_alphafold': False, 'in_bfvd': False, 'entry_protein_locations': [{'fragments': [{'start': 81, 'end': 253, 'dc-status': 'CONTINUOUS'}], 'representative': False, 'model': None, 'score': None}]}]}]


2it [00:01,  1.27it/s]

[{'metadata': {'accession': 'IPR004251', 'name': 'Pox virus entry-fusion-complex G9/A16', 'source_database': 'interpro', 'type': 'family', 'integrated': None, 'member_databases': {'pfam': {'PF03003': 'Pox virus entry-fusion-complex G9/A16'}}, 'go_terms': None}, 'proteins': [{'accession': 'q6gzx3', 'protein_length': 320, 'source_database': 'reviewed', 'organism': '654924', 'in_alphafold': False, 'in_bfvd': False, 'entry_protein_locations': [{'fragments': [{'start': 169, 'end': 247, 'dc-status': 'CONTINUOUS'}], 'representative': False, 'model': None, 'score': None}]}]}]


8it [00:05,  1.57it/s]

[{'metadata': {'accession': 'IPR003360', 'name': 'US22-like', 'source_database': 'interpro', 'type': 'family', 'integrated': None, 'member_databases': {'pfam': {'PF02393': 'US22 like'}}, 'go_terms': None}, 'proteins': [{'accession': 'q6gzx0', 'protein_length': 204, 'source_database': 'reviewed', 'organism': '654924', 'in_alphafold': False, 'in_bfvd': False, 'entry_protein_locations': [{'fragments': [{'start': 37, 'end': 150, 'dc-status': 'CONTINUOUS'}], 'representative': False, 'model': None, 'score': None}]}]}]


9it [00:06,  1.58it/s]

[{'metadata': {'accession': 'IPR017880', 'name': 'KilA, N-terminal', 'source_database': 'interpro', 'type': 'domain', 'integrated': None, 'member_databases': {'profile': {'PS51301': 'KilA-N domain profile'}}, 'go_terms': None}, 'proteins': [{'accession': 'q91g88', 'protein_length': 352, 'source_database': 'reviewed', 'organism': '176652', 'in_alphafold': False, 'in_bfvd': False, 'entry_protein_locations': [{'fragments': [{'start': 15, 'end': 123, 'dc-status': 'CONTINUOUS'}], 'representative': False, 'model': None, 'score': None}]}]}, {'metadata': {'accession': 'IPR018004', 'name': 'KilA/APSES-type HTH, DNA-binding', 'source_database': 'interpro', 'type': 'domain', 'integrated': None, 'member_databases': {'smart': {'SM01252': 'KilA-N'}, 'pfam': {'PF04383': 'KilA-N domain'}}, 'go_terms': None}, 'proteins': [{'accession': 'q91g88', 'protein_length': 352, 'source_database': 'reviewed', 'organism': '176652', 'in_alphafold': False, 'in_bfvd': False, 'entry_protein_locations': [{'fragments': 

9it [00:06,  1.35it/s]


In [6]:
enriched_df

Unnamed: 0,accession_number,protein_id,organism_id,interpro_id,interpro_name,source_database,interpro_type,integrated,member_databases,go_terms,start,end
0,Q6GZX4,001R,FRG3G,IPR007031,"Poxvirus VLTF3, late transcription factor",interpro,family,,{'pfam': {'PF04947': 'Poxvirus Late Transcript...,"[{'identifier': 'GO:0046782', 'name': 'regulat...",81,253
1,Q6GZX3,002L,FRG3G,IPR004251,Pox virus entry-fusion-complex G9/A16,interpro,family,,{'pfam': {'PF03003': 'Pox virus entry-fusion-c...,,169,247
2,Q6GZX0,005R,FRG3G,IPR003360,US22-like,interpro,family,,{'pfam': {'PF02393': 'US22 like'}},,37,150
3,Q91G88,006L,IIV6,IPR017880,"KilA, N-terminal",interpro,domain,,{'profile': {'PS51301': 'KilA-N domain profile'}},,15,123
4,Q91G88,006L,IIV6,IPR018004,"KilA/APSES-type HTH, DNA-binding",interpro,domain,,"{'smart': {'SM01252': 'KilA-N'}, 'pfam': {'PF0...",,21,123
5,Q91G88,006L,IIV6,IPR022549,Domain of unknown function DUF3627,interpro,domain,,{'pfam': {'PF12299': 'Protein of unknown funct...,,231,322


In [7]:
# merge dataframes
merged_df = pd.merge(
    enriched_df,
    df,
    on=['accession_number'],
    how='left'  # or 'inner' if you only want rows with matches in both
)

In [8]:
merged_df

Unnamed: 0,accession_number,protein_id_x,organism_id_x,interpro_id,interpro_name,source_database,interpro_type,integrated,member_databases,go_terms,...,database,protein_desc,organism_species,organism_identifier,gene_name,protein_existence_level,sequence_version,sequence,protein_id_y,organism_id_y
0,Q6GZX4,001R,FRG3G,IPR007031,"Poxvirus VLTF3, late transcription factor",interpro,family,,{'pfam': {'PF04947': 'Poxvirus Late Transcript...,"[{'identifier': 'GO:0046782', 'name': 'regulat...",...,sp,Putative transcription factor 001R,Frog,654924,FV3-001R,4,1,MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQV...,001R,FRG3G
1,Q6GZX3,002L,FRG3G,IPR004251,Pox virus entry-fusion-complex G9/A16,interpro,family,,{'pfam': {'PF03003': 'Pox virus entry-fusion-c...,,...,sp,Uncharacterized protein 002L,Frog,654924,FV3-002L,4,1,MSIIGATRLQNDKSDTYSAGPCYAGGCSAFTPRGTCGKDWDLGEQT...,002L,FRG3G
2,Q6GZX0,005R,FRG3G,IPR003360,US22-like,interpro,family,,{'pfam': {'PF02393': 'US22 like'}},,...,sp,Uncharacterized protein 005R,Frog,654924,FV3-005R,4,1,MQNPLPEVMSPEHDKRTTTPMSKEANKFIRELDKKPGDLAVVSDFV...,005R,FRG3G
3,Q91G88,006L,IIV6,IPR017880,"KilA, N-terminal",interpro,domain,,{'profile': {'PS51301': 'KilA-N domain profile'}},,...,sp,Putative KilA-N domain-containing protein 006L,Invertebrate,176652,IIV6-006L,3,1,MDSLNEVCYEQIKGTFYKGLFGDFPLIVDKKTGCFNATKLCVLGGK...,006L,IIV6
4,Q91G88,006L,IIV6,IPR018004,"KilA/APSES-type HTH, DNA-binding",interpro,domain,,"{'smart': {'SM01252': 'KilA-N'}, 'pfam': {'PF0...",,...,sp,Putative KilA-N domain-containing protein 006L,Invertebrate,176652,IIV6-006L,3,1,MDSLNEVCYEQIKGTFYKGLFGDFPLIVDKKTGCFNATKLCVLGGK...,006L,IIV6
5,Q91G88,006L,IIV6,IPR022549,Domain of unknown function DUF3627,interpro,domain,,{'pfam': {'PF12299': 'Protein of unknown funct...,,...,sp,Putative KilA-N domain-containing protein 006L,Invertebrate,176652,IIV6-006L,3,1,MDSLNEVCYEQIKGTFYKGLFGDFPLIVDKKTGCFNATKLCVLGGK...,006L,IIV6


In [10]:
!pip install modlamp

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [49]:
! pip install modlamp

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [59]:
from typing import Optional, Dict
from Bio.SeqUtils.ProtParam import ProteinAnalysis


# Fix ambiguous amino acids
AA_FIXES = {
    'B': 'D', 'Z': 'E', 'J': 'L',
    'U': 'C', 'O': 'K', 'X': 'G',
}
VALID_AAS = set("ACDEFGHIKLMNPQRSTVWY")


def clean_sequence(seq: str) -> str:
    """Replace ambiguous amino acids with nearest valid equivalents."""
    return ''.join(AA_FIXES.get(aa, aa) for aa in seq if aa in VALID_AAS or aa in AA_FIXES)


def compute_biopython_features(seq: str, verbose: bool = False) -> Optional[Dict[str, float]]:
    """
    Compute amino acid features using Biopython and optionally modlamp.

    Args:
        seq (str): Original amino acid sequence.
        verbose (bool): If True, print error messages.

    Returns:
        dict or None: Feature dictionary or None if failed.
    """
    cleaned_seq = clean_sequence(seq)
    
    if len(cleaned_seq) < 3:
        if verbose:
            print(f"[Too short after cleaning] {seq}")
        return None

    try:
        analysis = ProteinAnalysis(cleaned_seq)
        aa_counts = analysis.count_amino_acids()
        aa_percents = analysis.amino_acids_percent
        sec_frac = analysis.secondary_structure_fraction()


        features = {
            "Sequence": seq,
            "Cleaned_Sequence": cleaned_seq,
            "length": len(cleaned_seq),
            "mol_weight": analysis.molecular_weight(),
            "iso_point": analysis.isoelectric_point(),
            "aromaticity": analysis.aromaticity(),
            "instability_index": analysis.instability_index(),
            "gravy": analysis.gravy(),
            "helix_frac": sec_frac[0],
            "turn_frac": sec_frac[1],
            "sheet_frac": sec_frac[2],
        }

        for aa in aa_counts:
            features[f"count_{aa}"] = aa_counts[aa]
            features[f"percent_{aa}"] = round(aa_percents[aa], 3)

    except Exception as e:
        if verbose:
            print(f"[Biopython Error] {seq}: {e}")
        return None

    return features


In [69]:
bio_features = merged_df["sequence"].apply(compute_biopython_features)
bio_features[0] # sample

{'Sequence': 'MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQVECPKAPVEWNNPPSEKGLIVGHFSGIKYKGEKAQASEVDVNKMCCWVSKFKDAMRRYQGIQTCKIPGKVLSDLDAKIKAYNLTVEGVEGFVRYSRVTKQHVAAFLKELRHSKQYENVNLIHYILTDKRVDIQHLEKDLVKDFKALVESAHRMRQGHMINVKYILYQLLKKHGHGPDGPDILTVKTGSKGVLYDDSFRKIYTDLGWKFTPL',
 'Cleaned_Sequence': 'MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQVECPKAPVEWNNPPSEKGLIVGHFSGIKYKGEKAQASEVDVNKMCCWVSKFKDAMRRYQGIQTCKIPGKVLSDLDAKIKAYNLTVEGVEGFVRYSRVTKQHVAAFLKELRHSKQYENVNLIHYILTDKRVDIQHLEKDLVKDFKALVESAHRMRQGHMINVKYILYQLLKKHGHGPDGPDILTVKTGSKGVLYDDSFRKIYTDLGWKFTPL',
 'length': 256,
 'mol_weight': 29735.10070000003,
 'iso_point': 9.370173454284668,
 'aromaticity': 0.1015625,
 'instability_index': 31.205078124999996,
 'gravy': -0.5386718750000006,
 'helix_frac': 0.34375,
 'turn_frac': 0.25,
 'sheet_frac': 0.359375,
 'count_A': 13,
 'percent_A': 5.078,
 'count_C': 4,
 'percent_C': 1.562,
 'count_D': 17,
 'percent_D': 6.641,
 'count_E': 15,
 'percent_E': 5.859,
 'count_F': 8,
 'percent_F': 3.125,
 'count_G':

In [61]:
import re
from logging import getLogger
from typing import Dict, List, Optional, Set, Tuple

import pandas as pd

logger = getLogger(__name__)


def process_binary_feature(
    column_data: str, column_name: str, seq_len: int, current_index: int
) -> Tuple[List[bool], int]:
    """
    Process binary features from UniProt annotations (i.e. Helix, Turn)
    """
    indices = [False] * seq_len
    if pd.isna(column_data):
        return indices, current_index
    entries = column_data.split(f"{column_name} ")[1:]
    for entry in entries:
        index = entry.split(";")[0]
        try:
            if ":" in index or "?" in index:
                continue
            if ".." in index:
                start, end = index.split("..")
                start = start.strip("<")
                end = end.strip(">")
                start, end = int(start), int(end)
                for i in range(start - 1, min(end, seq_len)):
                    indices[i] = current_index
            else:
                idx = int(index) - 1
                if 0 <= idx < seq_len:
                    indices[idx] = current_index

            current_index += 1
        except Exception as e:
            print(f"Error processing binary column {column_name}: {e}")
            print(f"Column data: {column_data}, Index: {index}")
    return indices, current_index


def process_interaction_feature(
    column_data: str, column_name: str, seq_len: int
) -> Tuple[List[bool], List[Optional[int]]]:
    """
    Process interaction features (i.e. disulfide bonds) from UniProt annotations.
    """
    indices = [False] * seq_len
    pairs = [None] * seq_len
    if pd.isna(column_data):
        return indices, pairs
    n_pairs = 0
    entries = column_data.split(f"{column_name} ")[1:]
    for entry in entries:
        index = entry.split(";")[0]
        try:
            if ":" in index or "?" in index:
                continue
            if ".." in index:
                start, end = index.split("..")
                start = int(start.strip("<"))
                end = int(end.strip(">"))

                if 0 <= start - 1 < seq_len:
                    indices[start - 1] = True
                    pairs[start - 1] = n_pairs
                if 0 <= end - 1 < seq_len:
                    indices[end - 1] = True
                    pairs[end - 1] = n_pairs
                n_pairs += 1
            else:
                idx = int(index) - 1
                if 0 <= idx < seq_len:
                    indices[idx] = True
        except Exception as e:
            print(f"Error processing interaction column {column_name}: {e}")
            print(f"Column data: {column_data}, Index: {index}")
    return indices, pairs


def process_categorical_feature(
    column_data: str,
    column_name: str,
    category_options: Set[str],
    seq_len: int,
    current_index: Dict[str, int],
) -> tuple[list, dict]:
    """
    Process categorical features (i.e. Domain has multiple sub-categories) from UniProt annotations.

    Args:
        column_data: Raw feature data from UniProt
        column_name: Name of the feature column
        category_options: Set of valid category names
        seq_len: Length of the protein sequence
        current_index: Dictionary tracking current index for each category

    Returns:
        Tuple of (list of category indices vectors, updated current_index)
    """
    category_indices = {
        category_name: [False] * seq_len for category_name in category_options
    }
    if pd.isna(column_data):
        return [
            category_indices[category] for category in category_options
        ], current_index

    entries = column_data.split(f"{column_name} ")[1:]
    for entry in entries:
        positions_in_entry = entry.split(";")[0]
        entry_category = re.search(r'/note="([^"]+)"', entry)
        if entry_category:
            entry_category = entry_category.group(1).split(";")[0]

            if entry_category not in category_options:
                # skip this one
                continue

            indices_in_entry = []
            try:
                # Skip the undefined cases
                if ":" in positions_in_entry or "?" in positions_in_entry:
                    continue

                # Case where the position is a range
                if ".." in positions_in_entry:
                    start, end = positions_in_entry.split("..")
                    start = start.strip("<")
                    end = end.strip(">")
                    indices_in_entry = range(int(start), int(end) + 1)
                else:
                    indices_in_entry = [int(positions_in_entry)]
                for index in indices_in_entry:
                    if 0 <= index - 1 < seq_len:
                        if entry_category in category_options:
                            category_indices[entry_category][index - 1] = current_index[
                                entry_category
                            ]
                        category_indices["any"][index - 1] = current_index["any"]

                current_index[entry_category] += 1
                current_index["any"] += 1

            except Exception as e:
                print(
                    f"Error processing binary column index {positions_in_entry} with seq length {seq_len}"
                )
    return [category_indices[category] for category in category_options], current_index


def analyze_categorical_features(
    df: pd.DataFrame, category: str, category_name: str, separator_name: str = "note"
) -> Tuple[int, pd.Series, List[int], pd.Series]:
    """
    Helper function for analyzing categorical features to find common categories and their statistics.

    Returns:
        Tuple of (number of proteins, occurrences per protein, lengths, note counts)
    """
    non_na = df[category].dropna()
    n_per_prot = non_na.apply(lambda x: x.count(category_name)).value_counts()

    all_notes = []
    all_lengths = []

    for row in non_na:
        entries = row.split(category_name)[1:]
        for entry in entries:
            # Extract note
            note_start = entry.find(separator_name)
            note = entry[note_start + len(separator_name) + 1 :]
            note_end = note.find(";")
            note = note[:note_end].strip('"')

            if "/evidence" not in note:
                all_notes.append(note)

            try:
                # Extract length
                location = entry[1:]
                end_of_loc = location.find(";")
                location = location[:end_of_loc] if end_of_loc != -1 else location

                if ":" in location or "?" in location:
                    continue

                if ".." in location:
                    start, end = location.split("..")
                    start = int(start.strip("<"))
                    end = int(end.strip(">"))
                    all_lengths.append(end - start)
                else:
                    all_lengths.append(1)

            except Exception as e:
                logger.warning(f"Error analyzing feature length: {e}")

    return (
        len(non_na),
        n_per_prot.sort_values(ascending=False),
        all_lengths,
        pd.Series(all_notes).value_counts(),
    )