In [10]:
from pathlib import Path

import h5py
import pandas as pd

from taxonomy_fetcher import TaxonomyFetcher

In [None]:
# Contact address handed to TaxonomyFetcher when it is constructed further down.
email = "tobias.senoner@tum.de"

# Input locations, relative to the notebook's working directory.
raw_data = Path("../raw_data")
csv_file = raw_data.joinpath("noise.csv")
csv_file2 = raw_data.joinpath("toxins.tsv")
h5_file = raw_data.joinpath("mature_seqs_prot_t5_xl_uniref50.h5")

# Output locations.
data_dir = Path("../processed_data")
csv_toxins_out = data_dir.joinpath("toxins_all.csv")
csv_hymenoptera_out = data_dir.joinpath("hymenoptera.csv")
# Same folder and stem as the Hymenoptera CSV, with an .h5 extension.
output_hymenoptera_h5 = csv_hymenoptera_out.with_suffix(".h5")

In [12]:
# Read both source tables: the first is comma-separated, the second tab-separated.
df = pd.read_csv(csv_file)
df2 = pd.read_csv(csv_file2, sep="\t")

# Left join on the shared 'Entry Name' key so every row of the first table is
# kept; only the 'Entry' column is pulled in from the second table.
df = pd.merge(
    df,
    df2.loc[:, ['Entry Name', 'Entry']],
    how='left',
    on='Entry Name',
)
df.columns

Index(['Entry Name', 'Protein names', 'Organism', 'Protein families',
       'Sequence', 'Signal peptide', 'Sequence_noSP', 'blast_labels',
       'embeddings_labels', 'labels', 'new_protein_family', 'shortened_label',
       'Entry'],
      dtype='object')

## Process CSV -> Extract `Hymenoptera`

In [13]:
# The text before the first space of each 'Organism' value is used as the
# lookup name; duplicates are collapsed before querying.
organisms = df['Organism'].map(
    lambda organism: organism.split(' ')[0].strip()
).unique()

# Resolve the names to taxon ids, then fetch one taxonomy record per id.
fetcher = TaxonomyFetcher(email=email)
taxon_ids = fetcher.resolve_taxon_names(organisms)
taxon_records = fetcher.fetch_taxonomy_records(taxon_ids)

No taxon ID found for name 'Tliltocatl'


In [14]:
def get_taxonomic_rank(record, target_rank):
    """Return the lineage name recorded for ``target_rank``, or ``None``.

    ``record.ranks`` and ``record.lineage`` are walked in parallel and rank
    labels are compared case-insensitively; the first match wins. ``None`` is
    returned when the record lacks either attribute or when no rank matches.
    """
    if not (hasattr(record, 'ranks') and hasattr(record, 'lineage')):
        return None

    matching_names = (
        name
        for rank, name in zip(record.ranks, record.lineage)
        if rank.lower() == target_rank.lower()
    )
    return next(matching_names, None)

# Index the fetched records by the first word of their scientific name; a later
# record with the same first word replaces the earlier entry.
taxonomy_map = {}
for record in taxon_records:
    genus_prefix = record.scientific_name.partition(' ')[0]
    taxonomy_map[genus_prefix] = {
        column: get_taxonomic_rank(record, column.lower())
        for column in ('Order', 'Family', 'Genus')
    }

# Add taxonomy columns to dataframe
def get_taxonomy(organism):
    """Look up Order/Family/Genus for one organism string.

    The text before the first space (stripped) is the key into the
    module-level ``taxonomy_map``. Keys that are not in the map yield a
    Series whose three entries are all ``None``.
    """
    genus = organism.partition(' ')[0].strip()
    if genus in taxonomy_map:
        ranks = taxonomy_map[genus]
    else:
        ranks = {'Order': None, 'Family': None, 'Genus': None}
    return pd.Series(ranks)

# Attach Order/Family/Genus to every row, looked up from the 'Organism' text.
taxonomy_df = df['Organism'].apply(get_taxonomy)
df[['Order', 'Family', 'Genus']] = taxonomy_df

# Columns kept in the exported tables.
columns = ['Entry', 'Order', 'Family', 'Genus', 'Protein families',
           'Protein names', 'embeddings_labels', 'blast_labels']

# 'Entry' is exported under the name 'identifier'.
result_df = df[columns].rename(columns={'Entry': 'identifier'})

# Create the output folder up front: to_csv raises when the target directory
# does not exist, which is the case on a fresh checkout.
csv_toxins_out.parent.mkdir(parents=True, exist_ok=True)
result_df.to_csv(csv_toxins_out, index=False)

# Hymenoptera subset. The mask is built from result_df itself so the filter does
# not depend on df and result_df sharing the same index.
hymenoptera_df = result_df[result_df['Order'] == 'Hymenoptera'].copy()
csv_hymenoptera_out.parent.mkdir(parents=True, exist_ok=True)
hymenoptera_df.to_csv(csv_hymenoptera_out, index=False)

In [15]:
# Show the distinct values of the 'Order' column. None appears for rows where no
# order was resolved (get_taxonomy falls back to None for unknown keys).
result_df["Order"].unique()

array(['Neogastropoda', 'Scorpiones', 'Araneae', 'Squamata',
       'Blenniiformes', 'Hymenoptera', 'Scolopendromorpha',
       'Trebouxiales', 'Diptera', 'Valvatida', 'Perciformes',
       'Batrachoidiformes', 'Lepidoptera', 'Monotremata', 'Hemiptera',
       'Myliobatiformes', 'Nectiopoda', 'Scutigeromorpha', None],
      dtype=object)

## Filter H5 file to only `Hymenoptera`

In [16]:
hymenoptera_ids = hymenoptera_df["identifier"].to_list()
# Set copy for constant-time membership tests inside the loop; the list above is
# left untouched in case later cells rely on it.
hymenoptera_id_set = set(hymenoptera_ids)

# Opening the output in 'w' mode fails if its folder is missing.
output_hymenoptera_h5.parent.mkdir(parents=True, exist_ok=True)

with h5py.File(h5_file, 'r') as h5_in, h5py.File(output_hymenoptera_h5, 'w') as h5_out:
    # Walk the top-level names of the input file and copy every object whose
    # name is one of the Hymenoptera identifiers into the output file.
    for dataset_name in h5_in.keys():
        if dataset_name in hymenoptera_id_set:
            h5_in.copy(dataset_name, h5_out)

## Map `protein families`

In [17]:
import pandas as pd
import numpy as np
import re

In [27]:
# Reload the exported toxin table from disk (the same location csv_toxins_out
# points to). This rebinds df, replacing the merged frame built earlier.
df = pd.read_csv("../processed_data/toxins_all.csv")

In [28]:
def create_protein_family_mapping():
    """
    Builds the table used to sort protein family names into categories.

    Returns:
    dict: Maps each category name to a list of regular-expression patterns
        (written in lower case). Lectins, growth factors and neurotoxins are
        split into separate categories.

    The insertion order of the dictionary is significant for callers that stop
    at the first matching category (as classify_protein_family does): an earlier
    category takes precedence over a later one when both match.
    """
    return {
        # Combined conotoxin groups
        'conotoxin': [
            r'conotoxin.*superfamily',
            r'conotoxin.*family',
            r'cono.*peptide'
        ],

        # Scorpion toxin groups
        'scorpion_ktx': [
            r'.*ktx.*subfamily',
            r'.*potassium channel inhibitor.*subfamily'
        ],
        'scorpion_long_toxin': [
            r'long.*scorpion toxin.*subfamily',
            r'long \([34] c-c\) scorpion toxin'
        ],
        'scorpion_short_toxin': [
            r'short.*scorpion toxin',
        ],

        # Combined three-finger toxin group
        'three_finger_toxin': [
            r'snake three-finger toxin.*subfamily'
        ],

        # neurotoxin groups
        'neurotoxin': [
            r'neurotoxin.*family',
            r'neurotoxin.*subfamily',
            # r'.*toxin.*subfamily'
        ],

        # Enzyme groups
        'phospholipase_a2': [
            r'phospholipase a2.*family'
        ],
        'phospholipase_other': [
            # Negative lookahead = "anything except the token a2". The previous
            # character class [^a2] only excluded a single 'a' or '2', so names
            # such as "phospholipase a1 ..." were never matched.
            r'phospholipase (?!a2\b).*family'
        ],
        'metalloproteinase': [
            r'.*metalloproteinase.*family'
        ],
        'peptidase': [
            r'peptidase.*family'
        ],

        # Peptide groups
        'antimicrobial_peptide': [
            r'.*antimicrobial peptide.*family',
            r'.*defensin.*family'
        ],
        'cationic_peptide': [
            r'cationic peptide.*family'
        ],
        'bradykinin_related': [
            r'bradykinin.*family'
        ],

        # Specialized toxin groups
        'scoloptoxin': [
            r'scoloptoxin.*family'
        ],
        'hainantoxin': [
            r'hainantoxin.*family'
        ],
        'teretoxin': [
            r'teretoxin.*superfamily'
        ],

        # Venom protein groups
        'venom_kunitz': [
            r'venom kunitz.*family'
        ],
        'venom_protein_other': [
            r'venom protein.*family'
        ],

        # Split growth factors
        'ngf': [
            r'.*\bngf\b.*family',
            r'.*nerve growth factor.*family'
        ],
        'vegf': [
            r'.*\bvegf\b.*family',
            r'.*vascular endothelial growth factor.*family'
        ],
        'egf': [
            r'.*\begf\b.*family',
            r'.*epidermal growth factor.*family'
        ],

        # Hormone related
        'hormone_related': [
            r'.*hormone.*family',
            r'insulin family',
            r'glucagon family'
        ],

        # Split lectin groups
        'snaclec': [
            r'snaclec family',
            r'snake.*lectin.*family'
        ],
        'ficolin': [
            r'ficolin.*family'
        ],

        # Other specific groups
        'crisp_related': [
            r'crisp family',
            r'crisp.*subfamily'
        ],
        'disintegrin': [
            r'disintegrin.*family'
        ],
        'mcd_related': [
            r'mcd family',
            r'mcd.*subfamily'
        ]
    }

In [29]:
def classify_protein_family(family_name, categories):
    """
    Classifies a protein family name into a category based on regex patterns.

    Matching is case-insensitive. Categories are tried in the dictionary's
    iteration order and the first category with any matching pattern wins.

    Parameters:
    family_name (str): Original protein family name; values that pd.isna
        reports as missing are not classified
    categories (dict): Maps category name to a list of regex patterns

    Returns:
    str or pd.NA: Category name, or pd.NA if the name is missing or no
        pattern matches
    """
    if pd.isna(family_name):
        return pd.NA

    family_name = str(family_name).lower()

    for category, patterns in categories.items():
        # Use IGNORECASE rather than lower-casing the pattern text: lower-casing
        # a regex also rewrites escapes such as \S, \W, \D or \B into \s, \w,
        # \d or \b, which silently inverts their meaning.
        if any(re.search(pattern, family_name, re.IGNORECASE) for pattern in patterns):
            return category

    return pd.NA

def map_protein_families(df, family_column):
    """
    Adds a 'protein_category' column derived from a family-name column.

    Parameters:
    df (pandas.DataFrame): Frame holding the protein family information;
        it is modified in place
    family_column (str): Label of the column with the protein family names

    Returns:
    pandas.DataFrame: The same frame that was passed in, now carrying the
        'protein_category' column
    """
    # Build the pattern table once and reuse it for every row.
    categories = create_protein_family_mapping()

    def _categorise(family_name):
        return classify_protein_family(family_name, categories)

    df['protein_category'] = df[family_column].apply(_categorise)
    return df

In [30]:
# Classify every row (this also adds 'protein_category' to df itself), then
# narrow to the columns written to toxins.csv, in this order.
df_new = map_protein_families(df, "Protein families").loc[
    :,
    ['identifier', 'Order', 'Family', 'Genus', 'protein_category', 'Protein families'],
]
df_new.to_csv("../processed_data/toxins.csv", index=False)

In [24]:
# Inspect the column labels of df_new.
# NOTE(review): the saved output below carries execution count 24 and lists
# columns that the selection in the preceding cell (execution count 30) no
# longer keeps; re-run this cell to refresh it.
df_new.columns

Index(['identifier', 'Order', 'Family', 'Genus', 'Protein families',
       'Protein names', 'embeddings_labels', 'blast_labels',
       'protein_category'],
      dtype='object')

In [63]:
# https://plotly.com/python/marker-style/
from plotly.validators.scatter.marker import SymbolValidator

def extract_proper_strings(input_list):
    """Keep only the string items that are not made up solely of digits.

    Non-string items (for example integers) and strings for which
    ``str.isdigit()`` is true are dropped; the remaining items keep their
    original relative order.
    """
    kept = []
    for item in input_list:
        if not isinstance(item, str):
            continue
        if item.isdigit():
            continue
        kept.append(item)
    return kept

# Take the values exposed by the symbol validator and keep just the named
# (non-numeric string) entries.
raw_symbols = extract_proper_strings(SymbolValidator().values)
# Disabled alternative: collapse variants to the part before the first "-".
# raw_symbols = list(set([i.split("-")[0] for i in raw_symbols]))
sorted(raw_symbols)

['arrow',
 'arrow-bar-down',
 'arrow-bar-down-open',
 'arrow-bar-left',
 'arrow-bar-left-open',
 'arrow-bar-right',
 'arrow-bar-right-open',
 'arrow-bar-up',
 'arrow-bar-up-open',
 'arrow-down',
 'arrow-down-open',
 'arrow-left',
 'arrow-left-open',
 'arrow-open',
 'arrow-right',
 'arrow-right-open',
 'arrow-up',
 'arrow-up-open',
 'arrow-wide',
 'arrow-wide-open',
 'asterisk',
 'asterisk-open',
 'bowtie',
 'bowtie-open',
 'circle',
 'circle-cross',
 'circle-cross-open',
 'circle-dot',
 'circle-open',
 'circle-open-dot',
 'circle-x',
 'circle-x-open',
 'cross',
 'cross-dot',
 'cross-open',
 'cross-open-dot',
 'cross-thin',
 'cross-thin-open',
 'diamond',
 'diamond-cross',
 'diamond-cross-open',
 'diamond-dot',
 'diamond-open',
 'diamond-open-dot',
 'diamond-tall',
 'diamond-tall-dot',
 'diamond-tall-open',
 'diamond-tall-open-dot',
 'diamond-wide',
 'diamond-wide-dot',
 'diamond-wide-open',
 'diamond-wide-open-dot',
 'diamond-x',
 'diamond-x-open',
 'hash',
 'hash-dot',
 'hash-open',
 

In [9]:
# List the distinct raw values of the 'Protein families' column.
# NOTE(review): this cell's execution count (9) is lower than that of the cell
# which defines result_df (14), so the saved output comes from an earlier
# run; confirm it still reflects the current data.
result_df["Protein families"].unique()

array([nan,
       'Short scorpion toxin superfamily, Potassium channel inhibitor kappa-KTx family, Kappa-KTx 2 subfamily',
       'Helical arthropod-neuropeptide-derived (HAND) family',
       'Short scorpion toxin superfamily, Potassium channel inhibitor family, Alpha-KTx 09 subfamily',
       'Nucleotide pyrophosphatase/phosphodiesterase family',
       'Conotoxin M superfamily', 'Venom complement C3 homolog family',
       'Conotoxin D superfamily',
       'Conotoxin C superfamily, Consomatin family',
       'Cationic peptide 03 (latarcin) family, 01 subfamily',
       'Endothelin/sarafotoxin family',
       'Opioid neuropeptide precursor family',
       'FARP (FMRFamide related peptide) family',
       'MCD family, Mastoparan subfamily',
       'Neurotoxin 30 (phrixotoxin) family',
       'Long chain scorpion toxin family, Class 2 subfamily',
       'Conotoxin A superfamily', 'Crotamine-myotoxin family',
       'Bradykinin-related peptide family',
       'O2 superfamily, Contrypha

In [7]:
# Display df as the cell's last expression.
# NOTE(review): the saved output (execution count 7) shows the merged table with
# raw columns such as 'Sequence'. In a top-to-bottom run df has by now been
# rebound to the frame read from toxins_all.csv, so a fresh run will differ.
df

Unnamed: 0,Entry Name,Protein names,Organism,Protein families,Sequence,Signal peptide,Sequence_noSP,blast_labels,embeddings_labels,labels,new_protein_family,shortened_label,Entry,Order,Family,Genus
0,TE53_HASHE,Augerpeptide hhe53,Hastula hectica (Sea snail) (Impages hectica),,GLSQSGCQAFTGRWCVGCERLRSRVVWECSPKRVVNSI,,GLSQSGCQAFTGRWCVGCERLRSRVVWECSPKRVVNSI,-1,-1,-1.0,,,P0CI21,Neogastropoda,Terebridae,Hastula
1,KKX25_OPICY,Potassium channel toxin kappa-KTx 2.5 (OcyC8) ...,Opisthacanthus cayaporum (South American scorp...,"Short scorpion toxin superfamily, Potassium ch...",MESSRKSYVLMLFLAFVIMNVCSVSGEPKDGEIAGFEMEEARYDAC...,"SIGNAL 1..26; /evidence=""ECO:0000255""",EPKDGEIAGFEMEEARYDACVNACLEHHPNVRECEEACKNPVPP,4,-1,,,,P86110,Scorpiones,Hemiscorpiidae,Opisthacanthus
2,TXI2_ERAAG,U1-agatoxin-Ta1b (U1-AGTX-Ta1b) (Insecticidal ...,Eratigena agrestis (Hobo spider) (Tegenaria ag...,Helical arthropod-neuropeptide-derived (HAND) ...,MKLQLMICLVLLPCFFCEPDEICRARMTNKEFTYKSNVCNNCGDQV...,"SIGNAL 1..17; /evidence=""ECO:0000269|PubMed:95...",EPDEICRARMTNKEFTYKSNVCNNCGDQVAACEAECFRNDVYTACH...,4,-1,,,,O46167,Araneae,Agelenidae,Eratigena
3,KAX96_HOTJU,Potassium channel toxin alpha-KTx 9.6 (BjTx-1),Hottentotta judaicus (Black scorpion) (Buthotu...,"Short scorpion toxin superfamily, Potassium ch...",VGCEECPAHCKGKNAKPTCDDGVCNCNV,,VGCEECPAHCKGKNAKPTCDDGVCNCNV,4,-1,,,,P83405,Scorpiones,Buthidae,Hottentotta
4,PDE1_CROAD,Venom phosphodiesterase 1 (PDE) (EC 3.6.1.-),Crotalus adamanteus (Eastern diamondback rattl...,Nucleotide pyrophosphatase/phosphodiesterase f...,MIQQKVLFISLVAVTLGLGLGLGLKESVQPQVSCRYRCNETFSKMA...,"SIGNAL 1..23; /evidence=""ECO:0000255""",LKESVQPQVSCRYRCNETFSKMASGCSCDDKCTERQACCSDYEDTC...,4,-1,,,,J3SEZ3,Squamata,Viperidae,Crotalus
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5176,H3F01_CYRHA,Mu-theraphotoxin-Hhn2n (Mu-TRTX-Hhn2n) (Hainan...,Cyriopagopus hainanus (Chinese bird spider) (H...,"Neurotoxin 10 (Hwtx-1) family, 15 (Hntx-3) sub...",MKASMYLALAGLVLLFVVGYASESEEKEFPRELLSKIFAVDDFKGE...,"SIGNAL 1..21; /evidence=""ECO:0000255""",SESEEKEFPRELLSKIFAVDDFKGEERGCKGFSDSCTPGKNECCPN...,31,22,,Neurotoxin 10 (Hwtx-1) family,HWTX1,D2Y1Y9,Araneae,Theraphosidae,Cyriopagopus
5177,H3A02_CYRHA,Hainantoxin-III 2 (HnTx-III) (Hainantoxin-3.2)...,Cyriopagopus hainanus (Chinese bird spider) (H...,"Neurotoxin 10 (Hwtx-1) family, 15 (Hntx-3) sub...",MKASMYLALAGLVLLFVVGYASESEEKEFPRELLSKIFAVDDFKGE...,"SIGNAL 1..21; /evidence=""ECO:0000255""",SESEEKEFPRELLSKIFAVDDFKGEERGCKGFGDSCTPGKNECCPN...,31,22,,Neurotoxin 10 (Hwtx-1) family,HWTX1,D2Y1Y0,Araneae,Theraphosidae,Cyriopagopus
5178,H3G01_CYRHA,Mu-theraphotoxin-Hhn2o (Mu-TRTX-Hhn2o) (Hainan...,Cyriopagopus hainanus (Chinese bird spider) (H...,"Neurotoxin 10 (Hwtx-1) family, 15 (Hntx-3) sub...",MKASMFLALAGLVLLFVVGYASESEEKEFPIELLSKIFAVDVFKGE...,"SIGNAL 1..21; /evidence=""ECO:0000255""",SESEEKEFPIELLSKIFAVDVFKGEERGCKGFGDSCTPGKNECCPN...,31,22,,Neurotoxin 10 (Hwtx-1) family,HWTX1,D2Y1Z0,Araneae,Theraphosidae,Cyriopagopus
5179,H3H01_CYRHA,Mu-theraphotoxin-Hhn2p (Mu-TRTX-Hhn2p) (Hainan...,Cyriopagopus hainanus (Chinese bird spider) (H...,"Neurotoxin 10 (Hwtx-1) family, 15 (Hntx-3) sub...",MKASMYLALAGLVLLFVVGYASESEEKEFPRELLSKIFAVDDFKGE...,"SIGNAL 1..21; /evidence=""ECO:0000255""",SESEEKEFPRELLSKIFAVDDFKGEERGCKGFGDSCTPGKNECCPN...,31,22,,Neurotoxin 10 (Hwtx-1) family,HWTX1,D2Y1Z1,Araneae,Theraphosidae,Cyriopagopus
