# Python Notebook

In [17]:
## Notebook setup: switch the kernel's working directory to the ConnectomeDB
## checkout and make its src/ directory importable.
import sys, os
import pandas as pd

# Location of the ConnectomeDB checkout. Set the CONNECTOMEDB_ROOT environment
# variable to run this notebook on another machine; when it is unset the
# original author's local path is used, so existing behaviour is unchanged.
project_root = os.environ.get(
    "CONNECTOMEDB_ROOT",
    "/Users/sakuramaezono/Library/CloudStorage/OneDrive-YokohamaCityUniversity/Personal/05_Python_repositories/ConnectomeDB",
)
os.chdir(project_root)

# Add <project_root>/src to the import path (presumably where createDataTable
# lives -- verify). The membership check keeps sys.path free of duplicate
# entries when this cell is re-run.
src_dir = os.path.abspath("src")
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [10]:
# Writes the gene_pair table to "test.csv" in the current working directory
# (looks like a scratch/debug export -- the same file name is reused by a later cell).
# NOTE(review): gene_pair is only imported from createDataTable in a later cell,
# so on a fresh kernel this cell raises NameError unless that import has run first.
gene_pair.to_csv("test.csv")

In [22]:
## Function to prepare datatables (cleaning and hyperlinking, adding tool tips, etc) input for the database qmds
import sys, os
from itables import init_notebook_mode
import pandas as pd
from itables import show
from itables import options
from IPython.display import HTML, display
import numpy as np
import re
from bs4 import BeautifulSoup
from createDataTable import gene_pair, gene_pair000, human_columns, lrPairsCount
import warnings


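# Suppress all UserWarning messages. NOTE: pandas' SettingWithCopyWarning derives from Warning, not UserWarning, so this filter does not silence it.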
warnings.simplefilter("ignore", category=UserWarning)

def cleanup_species_info(species_info, species_name):
    """
    Fill blank homolog gene symbols with the homolog Ensembl gene ID.

    The two columns are located by substring match on the column names:
    the symbol column contains ``species_name``, 'homolog' and
    'associated_gene_name'; the ID column contains ``species_name`` and
    'homolog_ensembl_gene'. If several columns match, the last match in
    column order is used. When either column is missing nothing is changed.

    A symbol counts as blank when it is NaN, an empty string, or a string of
    only whitespace (whitespace-only values are treated as blank by the code
    that later consumes these symbols, so they are filled here as well).

    Parameters:
        species_info (pd.DataFrame): Homolog table; modified in place.
        species_name (str): Species code used in the column names (e.g. "mmusculus").

    Returns:
        pd.DataFrame: The same ``species_info`` object that was passed in.
    """
    associated_gene_col = None
    homolog_ensembl_col = None

    for col in species_info.columns:
        if species_name not in col:
            continue
        if 'homolog' in col and 'associated_gene_name' in col:
            associated_gene_col = col
        elif 'homolog_ensembl_gene' in col:
            homolog_ensembl_col = col

    if associated_gene_col and homolog_ensembl_col:
        symbols = species_info[associated_gene_col]
        # NaN is caught by isna(); the string comparison catches "" and whitespace-only values
        mask = symbols.isna() | symbols.astype(str).str.strip().eq("")
        species_info.loc[mask, associated_gene_col] = species_info.loc[mask, homolog_ensembl_col]
        print(f"Filled {mask.sum()} empty {associated_gene_col} values with {homolog_ensembl_col}")

    return species_info
    
def _prefer_non_blank(preferred, fallback):
    """Pick each `preferred` value unless it is NaN or blank, else the paired `fallback` value."""
    return [
        new if pd.notna(new) and str(new).strip() else orig
        for new, orig in zip(preferred, fallback)
    ]


def _rename_homolog_columns(frame, species, species_name, species_id, role):
    """Rename the BioMart homolog columns present in `frame` to '<species> <role> ...' labels."""
    homolog = f"{species_name}_homolog_"
    label = f"{species} {role}"
    candidates = [
        (f"{homolog}goc_score", f"{label} GOC score"),
        (f"{homolog}wga_coverage", f"{label} WGA coverage"),
        (f"{homolog}perc_id", f"{label} % Identity"),
        (f"{homolog}perc_id_r1", f"{label} Target % Identity"),
        (f"{homolog}orthology_confidence", f"{label} Orthology Confidence"),
        (f"{homolog}associated_gene_name", label),
        ("mgi_description", f"{label} Name"),
        ("description", f"{label} Name"),
        (f"{homolog}ensembl_gene", f"{label} Ensembl ID"),
    ]
    rename_dict = {source: target for source, target in candidates if source in frame.columns}
    # The species database ID gets its own label unless it *is* the Ensembl homolog column
    if species_id in frame.columns and species_id != f"{homolog}ensembl_gene":
        rename_dict[species_id] = f"{label} ID"
    return frame.rename(columns=rename_dict)


def process_species(gene_pair_df, gene_pair000_df, species, id_prefix, ligand_index, receptor_index):
    """
    Build the ligand-receptor display table for one species.

    Rows of ``gene_pair000_df`` are kept when every species-specific column is
    non-blank. The BioMart homolog table ``data/<code>_ID_biomart.csv`` (path
    relative to the current working directory) is merged in twice, once keyed
    on the ligand HGNC ID and once on the receptor HGNC ID. Symbols and IDs
    from that table replace the existing values wherever they are non-blank,
    and a formatted "<species> LR Pair" column is added.

    Parameters:
        gene_pair_df (pd.DataFrame): Only its column names are used, to decide
            which columns are species-specific.
        gene_pair000_df (pd.DataFrame): Table the rows are taken from. The
            species-specific cells must be strings (``str.strip`` is applied).
        species (str): Display name, one of "Mouse", "Rat", "Zebrafish",
            "Chimpanzee", "Chicken", "Pig", "Cow", "Dog", "Horse", "Sheep",
            "Marmoset", "Macaque".
        id_prefix (str): ID label used in column names ("MGI", "RGD", "ZFIN",
            or "Ensembl").
        ligand_index (int): Position, among the columns whose name contains
            "Ligand&nbsp;", of the ligand symbol column. It is evaluated after
            the "<species> " prefix has been stripped and, for Ensembl
            species, after the other Ensembl species' columns were removed.
        receptor_index (int): Same as ``ligand_index`` for "Receptor&nbsp;".

    Returns:
        pd.DataFrame: Columns ordered as first human column, "<species> LR Pair",
        species columns, "LR Pair Card", remaining human columns. The species
        name is stripped from every column name except "<species> LR Pair".
        The "Ligand"/"Receptor" columns (and, for Ensembl species, the
        "... Ensembl ID" columns) are dropped.

    Raises:
        ValueError: If ``species`` has no known BioMart code.
        IndexError: If an expected column (ligand/receptor symbol, location,
            HGNC ID, ...) is not present.
    """
    # Columns that must be non-blank for a pair to count as present in this species
    if species in ["Mouse", "Rat", "Zebrafish"]:
        species_columns = [col for col in gene_pair_df.columns if id_prefix in col or species in col]
    else:
        species_columns = [col for col in gene_pair_df.columns if species in col]

    # Filter rows where all species-specific columns are not empty
    has_all_values = (gene_pair000_df[species_columns].map(str.strip) != "").all(axis=1)
    species_gene_pair = gene_pair000_df[has_all_values]

    # Strip the "<species> " prefix from ligand/receptor column names
    species_gene_pair.columns = [
        col.replace(f"{species} ", "").strip() if "Ligand" in col or "Receptor" in col else col
        for col in species_gene_pair.columns
    ]

    if id_prefix == "Ensembl":
        # Species that share the "Ensembl" id_prefix. The current species' own
        # ligand/receptor columns lost their prefix above, so any column still
        # naming one of these species is dropped here.
        all_species = {"Chimpanzee", "Pig", "Dog", "Cow",
                       "Chicken", "Horse", "Sheep", "Marmoset", "Macaque"}

        def clean_column_name(col):
            """Return the column name with HTML tags removed."""
            return BeautifulSoup(col, "html.parser").get_text().strip()

        columns_to_remove = [
            col for col in species_gene_pair.columns
            if any(name in clean_column_name(col) for name in all_species)
        ]
        species_gene_pair = species_gene_pair.drop(columns=columns_to_remove)

    # Work on an independent copy from here on
    species_gene_pair1 = species_gene_pair.copy()

    # Identify ligand and receptor columns dynamically
    ligand_col = [col for col in species_gene_pair1.columns if "Ligand&nbsp;" in col][ligand_index]
    receptor_col = [col for col in species_gene_pair1.columns if "Receptor&nbsp;" in col][receptor_index]
    ligand_Location = [col for col in species_gene_pair1.columns if "Ligand Location" in col][0]
    receptor_Location = [col for col in species_gene_pair1.columns if "Receptor Location" in col][0]
    # Identify relevant columns for the species
    species_columns = [col for col in species_gene_pair1.columns if id_prefix in col]
    new_order = [human_columns[0]]+ [ligand_col, receptor_col] + species_columns + human_columns[1:]
    species_gene_pair1 = species_gene_pair1[new_order].reset_index(drop=True)

    def extract_hgnc_id(col):
        """Return the first 'HGNC:<digits>' identifier found in the string, or None."""
        match = re.search(r'HGNC:(\d+)', col)
        if match:
            return "HGNC:"+match.group(1)
        return None

    # Display name -> species code used in the BioMart file name and column names
    biomart_codes = {
        "Mouse": "mmusculus",
        "Rat": "rnorvegicus",
        "Zebrafish": "drerio",
        "Chimpanzee": "ptroglodytes",
        "Chicken": "ggallus",
        "Pig": "sscrofa",
        "Cow": "btaurus",
        "Dog": "clfamiliaris",
        "Horse": "ecaballus",
        "Sheep": "oarambouillet",
        "Marmoset": "cjacchus",
        "Macaque": "mmulatta",
    }
    if species not in biomart_codes:
        # Fail with a clear message instead of trying to open
        # "data/Unknown species_ID_biomart.csv"
        raise ValueError(
            f"Unsupported species {species!r}; expected one of: {', '.join(biomart_codes)}"
        )
    species_name = biomart_codes[species]

    # Load species-specific data
    species_info = pd.read_csv(f"data/{species_name}_ID_biomart.csv")
    # replace empty ligand/receptor symbols with ens id for now
    species_info = cleanup_species_info(species_info, species_name)

    # Species with a dedicated database ID column, keyed by species code;
    # every other species falls back to its homolog Ensembl gene column.
    species_mapping = {
        "mmusculus": "mgi_id",
        "rnorvegicus": "rgd_id",
        "drerio": "zfin_id_id"
    }
    species_id = species_mapping.get(species_name, f"{species_name}_homolog_ensembl_gene")

    # Only rows with both a human HGNC ID and a species ID can be matched
    species_info = species_info.dropna(subset=['hgnc_id'])
    species_info = species_info.dropna(subset=[species_id])

    # Merge with ligand data
    ligand_hgnc = [col for col in species_gene_pair1.columns if "Ligand HGNC ID" in col][0]
    species_gene_pair1["Lig HGNC ID"] = species_gene_pair1[ligand_hgnc].apply(extract_hgnc_id)
    species_gene_pair1 = species_gene_pair1.merge(species_info, how='left',
                               left_on='Lig HGNC ID', right_on='hgnc_id',
                               suffixes=('', '_lig'))
    # Rename columns for ligand info (must happen before the receptor merge so
    # the second merge brings in a fresh, unsuffixed set of homolog columns)
    species_gene_pair1 = _rename_homolog_columns(
        species_gene_pair1, species, species_name, species_id, "Ligand"
    )

    # Merge with receptor data
    receptor_hgnc = [col for col in species_gene_pair1.columns if "Receptor HGNC ID" in col][0]
    species_gene_pair1["Rec HGNC ID"] = species_gene_pair1[receptor_hgnc].apply(extract_hgnc_id)
    species_gene_pair1 = species_gene_pair1.merge(species_info, how='left',
                               left_on='Rec HGNC ID', right_on='hgnc_id',
                               suffixes=('', '_rec'))
    # Rename columns for receptor info
    species_gene_pair1 = _rename_homolog_columns(
        species_gene_pair1, species, species_name, species_id, "Receptor"
    )

    # Drop the merge helper columns. (The result used to be stored in an unused
    # variable, so nothing was actually dropped.)
    helper_cols = ['hgnc_id', 'hgnc_id_rec', 'hgnc_symbol_rec',
                   'hgnc_id_lig', 'hgnc_symbol_lig', 'Rec HGNC ID', 'ensembl_gene_id']
    cols_to_drop = [col for col in species_gene_pair1.columns if col in helper_cols]
    if cols_to_drop:
        species_gene_pair1 = species_gene_pair1.drop(columns=cols_to_drop)

    # Drop columns where all values are NaN
    species_gene_pair1 = species_gene_pair1.dropna(axis=1, how='all')
    col_to_rename = [col for col in species_gene_pair1.columns if "Human LR Pair" in col][0]
    species_gene_pair1 = species_gene_pair1.rename(columns={col_to_rename: "LR Pair Card"})

    # Update ligand/receptor symbols only where a species-specific replacement
    # is available (the columns may have been removed by the all-NaN drop above)
    new_ligand_col = f"{species} Ligand"
    new_receptor_col = f"{species} Receptor"

    if new_ligand_col in species_gene_pair1.columns:
        species_gene_pair1[ligand_col] = _prefer_non_blank(
            species_gene_pair1[new_ligand_col], species_gene_pair1[ligand_col]
        )

    if new_receptor_col in species_gene_pair1.columns:
        species_gene_pair1[receptor_col] = _prefer_non_blank(
            species_gene_pair1[new_receptor_col], species_gene_pair1[receptor_col]
        )

    if species in ["Mouse", "Rat", "Zebrafish"]:
        ligand_mgi_id_col = [col for col in species_gene_pair1.columns if f"Ligand {id_prefix} ID" in col][0]
        receptor_mgi_id_col = [col for col in species_gene_pair1.columns if f"Receptor {id_prefix} ID" in col][0]

        ligand_source = species_gene_pair1[f"{species} Ligand ID"]
        receptor_source = species_gene_pair1[f"{species} Receptor ID"]

        # URL builders by species
        def build_link(val, species):
            """Return an HTML link to the species database for `val`, or "" when `val` is blank."""
            if pd.isna(val) or not str(val).strip() or str(val).strip().lower() in {"none", "nan"}:
                return ""

            val = str(val).strip()

            try:
                val = str(int(float(val)))  # Remove trailing ".0"
            except ValueError:
                pass

            if species == "Mouse":
                return f'<a href="https://www.informatics.jax.org/marker/{val}" target="_blank">{val}</a>'
            elif species == "Rat":
                return f'<a href="https://rgd.mcw.edu/rgdweb/report/gene/main.html?id=RGD:{val}" target="_blank">RGD:{val}</a>'
            elif species == "Zebrafish":
                return f'<a href="https://zfin.org/{val}" target="_blank">{val}</a>'
            else:
                return val

        ligand_links = [build_link(val, species) for val in ligand_source]
        receptor_links = [build_link(val, species) for val in receptor_source]

        # Replace only if new link is non-empty
        species_gene_pair1[ligand_mgi_id_col] = [
            link or orig
            for link, orig in zip(ligand_links, species_gene_pair1[ligand_mgi_id_col])
        ]

        species_gene_pair1[receptor_mgi_id_col] = [
            link or orig
            for link, orig in zip(receptor_links, species_gene_pair1[receptor_mgi_id_col])
        ]

        # Drop original columns
        species_gene_pair1 = species_gene_pair1.drop(columns=[f"{species} Ligand ID", f"{species} Receptor ID"])
    else:
        # Update Ensembl IDs only where a species-specific replacement is available
        new_ligand_ens = f"{species} Ligand Ensembl ID"
        new_receptor_ens = f"{species} Receptor Ensembl ID"
        ligand_ens = [col for col in species_gene_pair1.columns if "Ligand Ensembl ID" in col][0]
        receptor_ens= [col for col in species_gene_pair1.columns if "Receptor Ensembl ID" in col][0]
        if new_ligand_ens in species_gene_pair1.columns:
            species_gene_pair1[ligand_ens] = _prefer_non_blank(
                species_gene_pair1[new_ligand_ens], species_gene_pair1[ligand_ens]
            )

        if new_receptor_ens in species_gene_pair1.columns:
            species_gene_pair1[receptor_ens] = _prefer_non_blank(
                species_gene_pair1[new_receptor_ens], species_gene_pair1[receptor_ens]
            )

    def format_lr_pair(row):
        """Render 'ligand <symbol> receptor'; the symbol depends on the location columns."""
        if row[ligand_Location] in ['secreted', '']:
            return f"{row[ligand_col]} <span style='font-size: 15px;'>○</span> <span style='font-size: 24px;'>⤚</span> {row[receptor_col]}"
        elif row[receptor_Location] == 'plasma membrane':
            return f"{row[ligand_col]} <span style='font-size: 24px;'>⤙</span> <span style='font-size: 24px;'>⤚</span> {row[receptor_col]}"
        else:
            return f"{row[ligand_col]} \u2192 {row[receptor_col]}"

    species_lr_pair_col = f"{species} LR Pair"
    species_gene_pair1.loc[:, species_lr_pair_col] = species_gene_pair1.apply(format_lr_pair, axis=1)

    # Identify relevant columns for the species (everything species- or
    # ID-labelled except the three columns placed explicitly below)
    species_columns = [
        col for col in species_gene_pair1.columns
        if (id_prefix in col or species in col)
        and col not in [ligand_col, receptor_col, species_lr_pair_col]
    ]

    new_order = [human_columns[0]]+ [f"{species} LR Pair", ligand_col, receptor_col] + species_columns + ["LR Pair Card"] + human_columns[2:]
    species_gene_pair1 = species_gene_pair1[new_order].reset_index(drop=True)
    species_gene_pair1 = species_gene_pair1.loc[:, ~species_gene_pair1.columns.duplicated()]
    # Strip the species name from every header except the LR Pair column
    species_gene_pair1.columns = [
        col if col == species_lr_pair_col else col.replace(species, "").strip()
        for col in species_gene_pair1.columns
    ]
    if id_prefix == "Ensembl":
        species_gene_pair1 = species_gene_pair1.drop(columns=['Ligand Ensembl ID',
                                                              'Receptor Ensembl ID'])
    species_gene_pair1 = species_gene_pair1.drop(columns=['Ligand',
                                                              'Receptor'])

    return species_gene_pair1


# Process each species
# Mouse
mouse_gene_pair1 = process_species(gene_pair, gene_pair000, "Mouse", "MGI", 1, 1)
# Number of distinct values in the first column of the mouse table
MouselrPairsCount = len(pd.unique(mouse_gene_pair1.iloc[:, 0]))
# Share of the human LR pairs (lrPairsCount) that are present for mouse, in percent
HumanMouseLRPairsPer = (MouselrPairsCount/lrPairsCount)*100
HumanMouseLRPairsPer = round(HumanMouseLRPairsPer, 2)
# Round up to the nearest 0.5%. ceil keeps exact multiples of 0.5 unchanged;
# the previous ((x * 2 + 1) // 1) / 2 bumped them up by a further 0.5.
HumanMouseLRPairsPer = float(np.ceil(HumanMouseLRPairsPer * 2) / 2)
### adding the mouse-specific annotations ####
mouse_gene_pair1 = mouse_gene_pair1.reset_index(drop=True)
# NOTE(review): despite the variable name, this file is only used for mouse lookups here
mouse_rat_info = pd.read_csv("data/mouse_name_mapping.csv")
# First column whose header contains the given label
ligand_mgi_id_col = [col for col in mouse_gene_pair1.columns if "Ligand MGI ID" in col][0]
receptor_mgi_id_col = [col for col in mouse_gene_pair1.columns if "Receptor MGI ID" in col][0]
# MGI ID -> description / Ensembl gene ID lookups
mapping_mouse_name = dict(zip(mouse_rat_info['MGI ID'], mouse_rat_info['MGI description']))
mapping_mouse_ens = dict(zip(mouse_rat_info['MGI ID'], mouse_rat_info['Gene stable ID']))
# extract mgi
def extract_mgi_id(col):
    """Return the first 'MGI:<digits>' identifier found in `col`, or None.

    Parameters:
        col: Cell value to search, e.g. an HTML link whose text or URL
            contains the MGI ID.

    Returns:
        str or None: The identifier including its 'MGI:' prefix; None when
        `col` contains no such identifier or is not a string (NaN/None cells).
    """
    # Missing cells arrive as NaN/None; re.search would raise TypeError on them
    if not isinstance(col, str):
        return None
    match = re.search(r'MGI:(\d+)', col)
    if match:
        return 'MGI:' + match.group(1)
    return None
    

def _keep_or_lookup(row, target_col, key_col, lookup):
    """Return row[target_col], or lookup[row[key_col]] when the current value is NaN/blank."""
    current = row[target_col]
    if pd.isna(current) or str(current).strip() == '':
        # Fall back to the current value when the key is not in the lookup
        return lookup.get(row[key_col], current)
    return current


def _backfill_mouse_annotations(frame):
    """Fill blank mouse names and Ensembl IDs from the MGI lookups; return the table without the helper key columns."""
    # Temporary plain 'MGI:<digits>' keys extracted from the MGI ID columns
    frame['Ligand MGI ID'] = frame[ligand_mgi_id_col].apply(extract_mgi_id)
    frame['Receptor MGI ID'] = frame[receptor_mgi_id_col].apply(extract_mgi_id)

    # Names first, then Ensembl IDs; ligand before receptor within each
    for field, lookup in (("Name", mapping_mouse_name), ("Ensembl ID", mapping_mouse_ens)):
        for role in ("Ligand", "Receptor"):
            target_col = f"{role} {field}"
            frame[target_col] = frame.apply(
                _keep_or_lookup, axis=1, args=(target_col, f"{role} MGI ID", lookup)
            )

    return frame.drop(columns=['Ligand MGI ID', 'Receptor MGI ID'])


mouse_gene_pair1 = _backfill_mouse_annotations(mouse_gene_pair1)
################################################################

def _process_ensembl_species(species):
    """Run process_species for a species labelled with Ensembl IDs (ligand/receptor index 4)."""
    return process_species(gene_pair, gene_pair000, species, "Ensembl", 4, 4)


# Species with their own ID label; the indices select the ligand/receptor symbol columns
rat_gene_pair1 = process_species(
    gene_pair, gene_pair000, species="Rat", id_prefix="RGD", ligand_index=2, receptor_index=2
)
zebrafish_gene_pair1 = process_species(
    gene_pair, gene_pair000, species="Zebrafish", id_prefix="ZFIN", ligand_index=3, receptor_index=3
)

# Species identified by Ensembl IDs (same call order as before)
chimpanzee_gene_pair1 = _process_ensembl_species("Chimpanzee")
chicken_gene_pair1 = _process_ensembl_species("Chicken")
pig_gene_pair1 = _process_ensembl_species("Pig")
cow_gene_pair1 = _process_ensembl_species("Cow")
dog_gene_pair1 = _process_ensembl_species("Dog")
horse_gene_pair1 = _process_ensembl_species("Horse")
sheep_gene_pair1 = _process_ensembl_species("Sheep")
marmoset_gene_pair1 = _process_ensembl_species("Marmoset")
# Rhesus macaque
macaque_gene_pair1 = _process_ensembl_species("Macaque")



Filled 28063 empty mmusculus_homolog_associated_gene_name values with mmusculus_homolog_ensembl_gene
Filled 31610 empty rnorvegicus_homolog_associated_gene_name values with rnorvegicus_homolog_ensembl_gene
Filled 33059 empty drerio_homolog_associated_gene_name values with drerio_homolog_ensembl_gene
Filled 25983 empty ptroglodytes_homolog_associated_gene_name values with ptroglodytes_homolog_ensembl_gene
Filled 35409 empty ggallus_homolog_associated_gene_name values with ggallus_homolog_ensembl_gene
Filled 32745 empty sscrofa_homolog_associated_gene_name values with sscrofa_homolog_ensembl_gene
Filled 31140 empty btaurus_homolog_associated_gene_name values with btaurus_homolog_ensembl_gene
Filled 31614 empty clfamiliaris_homolog_associated_gene_name values with clfamiliaris_homolog_ensembl_gene
Filled 31473 empty ecaballus_homolog_associated_gene_name values with ecaballus_homolog_ensembl_gene
Filled 31941 empty oarambouillet_homolog_associated_gene_name values with oarambouillet_homol

In [25]:
# Writes the processed mouse table to "test.csv" in the current working directory
# (looks like a scratch/debug export; the DataFrame index is written as the first column
# because index=False is not passed).
# NOTE(review): an earlier cell writes gene_pair to the same file name, so whichever
# cell ran last determines the file's contents.
mouse_gene_pair1.to_csv("test.csv")

In [3]:
    # Scratch cell: an indented excerpt of the first part of process_species' body.
    # It defines none of its inputs -- species, id_prefix, gene_pair_df, gene_pair000_df,
    # ligand_index, receptor_index, human_columns and BeautifulSoup must already exist
    # in the kernel namespace for this cell to run.
    if species in ["Mouse", "Rat", "Zebrafish"]:
        species_columns = [col for col in gene_pair_df.columns if id_prefix in col or species in col]
    else:
        species_columns = [col for col in gene_pair_df.columns if species in col]

    # Filter rows where all species-specific columns are not empty
    species_gene_pair = gene_pair000_df[(gene_pair000_df[species_columns].map(str.strip) != "").all(axis=1)]
    
    # Strip the "<species> " prefix from ligand/receptor column names
    species_gene_pair.columns = [
        col.replace(f"{species} ", "").strip() if "Ligand" in col or "Receptor" in col else col
        for col in species_gene_pair.columns
    ]

    if id_prefix == "Ensembl":
        # Species that share the "Ensembl" id_prefix (hard-coded set)
        all_species = {"Chimpanzee", "Pig", "Dog", "Cow", 
                       "Chicken", "Horse", "Sheep", "Marmoset", "Macaque"}
        # Function to clean column names by removing HTML tags
        def clean_column_name(col):
            # Remove HTML tags using BeautifulSoup
            cleaned_name = BeautifulSoup(col, "html.parser").get_text()
            return cleaned_name.strip()
        
        # Step 1: Strip HTML from every column name
        cleaned_columns = [clean_column_name(col) for col in species_gene_pair.columns]
        
        # Step 2: Collect the columns whose cleaned name still mentions one of all_species.
        # (The generator's `species` is local to the expression; it does not rebind the outer `species`.)
        columns_to_remove = [
            species_gene_pair.columns[i] for i, cleaned_name in enumerate(cleaned_columns)
            if any(species in cleaned_name for species in all_species)
        ]
        species_gene_pair = species_gene_pair.drop(columns=columns_to_remove)
        
    
    # Work on an independent copy from here on
    species_gene_pair1 = species_gene_pair.copy()

    # Identify ligand and receptor columns dynamically
    ligand_col = [col for col in species_gene_pair1.columns if "Ligand&nbsp;" in col][ligand_index]
    receptor_col = [col for col in species_gene_pair1.columns if "Receptor&nbsp;" in col][receptor_index]
    ligand_Location = [col for col in species_gene_pair1.columns if "Ligand Location" in col][0]
    receptor_Location = [col for col in species_gene_pair1.columns if "Receptor Location" in col][0]
    # Identify relevant columns for the species
    species_columns = [col for col in species_gene_pair1.columns if id_prefix in col]
    # First human column, then ligand/receptor, then the ID columns, then the remaining human columns
    new_order = [human_columns[0]]+ [ligand_col, receptor_col] + species_columns + human_columns[1:]
    species_gene_pair1 = species_gene_pair1[new_order].reset_index(drop=True)

In [5]:
    # Scratch cell: an indented excerpt of the middle part of process_species' body.
    # It relies on species, species_gene_pair1, ligand_col and receptor_col already
    # existing in the kernel namespace, and it does not call cleanup_species_info.
    def extract_hgnc_id(col):
        """Return the first 'HGNC:<digits>' identifier found in the string, or None."""
        match = re.search(r'HGNC:(\d+)', col)
        if match:
            return "HGNC:"+match.group(1)
        return None
    #clean HGNC
    

    # Display name -> species code used in the BioMart file name and column names
    species_name = {
        "Mouse": "mmusculus",
        "Rat": "rnorvegicus",
        "Zebrafish":"drerio" ,
        "Chimpanzee":"ptroglodytes",
        "Chicken":"ggallus",
        "Pig":"sscrofa",
        "Cow":"btaurus",
        "Dog":"clfamiliaris",
        "Horse":"ecaballus",
        "Sheep":"oarambouillet",
        "Marmoset": "cjacchus" ,
        "Macaque": "mmulatta"   
    }.get(species, "Unknown species")
    
    # Load species-specific data
    species_info = pd.read_csv(f"data/{species_name}_ID_biomart.csv")

    # Dedicated database ID column per species code
    species_mapping = {
        "mmusculus": "mgi_id",
        "rnorvegicus": "rgd_id", 
        "drerio": "zfin_id_id"
    }
    # The lookup key is the species code (species_name); species without an entry
    # fall back to their homolog Ensembl gene column.
    species_id = species_mapping.get(species_name, f"{species_name}_homolog_ensembl_gene")
    
    # Only rows with both an HGNC ID and a species ID are kept for merging
    species_info = species_info.dropna(subset=['hgnc_id'])
    species_info = species_info.dropna(subset=[species_id])
    # Merge with ligand data
    ligand_hgnc = [col for col in species_gene_pair1.columns if "Ligand HGNC ID" in col][0]
    species_gene_pair1["Lig HGNC ID"] = species_gene_pair1[ligand_hgnc].apply(extract_hgnc_id)
    species_gene_pair1 = species_gene_pair1.merge(species_info, how='left', 
                               left_on='Lig HGNC ID', right_on='hgnc_id',
                               suffixes=('', '_lig'))

    # Turn Interaction ID into clickable links
    # species_gene_pair1[species_gene_pair1.columns[0]] = species_gene_pair1[species_gene_pair1.columns[0]].apply(
    #     lambda x: f"<a href='https://comp.med.yokohama-cu.ac.jp/collab/connectomeDB/database/filter/{x}.html'>{x}</a>"
    # )
    # Rename columns for ligand info
    rename_dict = {}
    if f"{species_name}_homolog_goc_score" in species_gene_pair1.columns:
        rename_dict[f"{species_name}_homolog_goc_score"] = f"{species} Ligand GOC score"
    if f"{species_name}_homolog_wga_coverage" in species_gene_pair1.columns:
        rename_dict[f"{species_name}_homolog_wga_coverage"] = f"{species} Ligand WGA coverage"   
    if f"{species_name}_homolog_perc_id" in species_gene_pair1.columns:
        rename_dict[f"{species_name}_homolog_perc_id"] = f"{species} Ligand % Identity"
    if f"{species_name}_homolog_perc_id_r1" in species_gene_pair1.columns:
        rename_dict[f"{species_name}_homolog_perc_id_r1"] = f"{species} Ligand Target % Identity"           
    if f"{species_name}_homolog_orthology_confidence" in species_gene_pair1.columns:
        rename_dict[f"{species_name}_homolog_orthology_confidence"] = f"{species} Ligand Orthology Confidence"           
    if f"{species_name}_homolog_associated_gene_name" in species_gene_pair1.columns:
        rename_dict[f"{species_name}_homolog_associated_gene_name"] = f"{species} Ligand"
    if "mgi_description" in species_gene_pair1.columns:
        rename_dict["mgi_description"] = f"{species} Ligand Name"
    if "description" in species_gene_pair1.columns:
        rename_dict["description"] = f"{species} Ligand Name"
    if f"{species_name}_homolog_ensembl_gene" in species_gene_pair1.columns:
        rename_dict[f"{species_name}_homolog_ensembl_gene"] = f"{species} Ligand Ensembl ID"
    if species_id in species_gene_pair1.columns and species_id != f"{species_name}_homolog_ensembl_gene":
        rename_dict[species_id] = f"{species} Ligand ID"
    species_gene_pair1 = species_gene_pair1.rename(columns=rename_dict)
    # Merge with receptor data
    receptor_hgnc = [col for col in species_gene_pair1.columns if "Receptor HGNC ID" in col][0]
    species_gene_pair1["Rec HGNC ID"] = species_gene_pair1[receptor_hgnc].apply(extract_hgnc_id)
    species_gene_pair1 = species_gene_pair1.merge(species_info, how='left', 
                               left_on='Rec HGNC ID', right_on='hgnc_id',
                               suffixes=('', '_rec'))
    # Rename columns for receptor info
    rename_dict = {}
    if f"{species_name}_homolog_goc_score" in species_gene_pair1.columns:
        rename_dict[f"{species_name}_homolog_goc_score"] = f"{species} Receptor GOC score"
    if f"{species_name}_homolog_wga_coverage" in species_gene_pair1.columns:
        rename_dict[f"{species_name}_homolog_wga_coverage"] = f"{species} Receptor WGA coverage"   
    if f"{species_name}_homolog_perc_id" in species_gene_pair1.columns:
        rename_dict[f"{species_name}_homolog_perc_id"] = f"{species} Receptor % Identity"
    if f"{species_name}_homolog_perc_id_r1" in species_gene_pair1.columns:
        rename_dict[f"{species_name}_homolog_perc_id_r1"] = f"{species} Receptor Target % Identity"           
    if f"{species_name}_homolog_orthology_confidence" in species_gene_pair1.columns:
        rename_dict[f"{species_name}_homolog_orthology_confidence"] = f"{species} Receptor Orthology Confidence"           
    if f"{species_name}_homolog_associated_gene_name" in species_gene_pair1.columns:
        rename_dict[f"{species_name}_homolog_associated_gene_name"] = f"{species} Receptor"
    if "mgi_description" in species_gene_pair1.columns:
        rename_dict["mgi_description"] = f"{species} Receptor Name"
    if "description" in species_gene_pair1.columns:
        rename_dict["description"] = f"{species} Receptor Name"
    if f"{species_name}_homolog_ensembl_gene" in species_gene_pair1.columns:
        rename_dict[f"{species_name}_homolog_ensembl_gene"] = f"{species} Receptor Ensembl ID"
    if species_id in species_gene_pair1.columns and species_id != f"{species_name}_homolog_ensembl_gene":
        rename_dict[species_id] = f"{species} Receptor ID"
        
    species_gene_pair1 = species_gene_pair1.rename(columns=rename_dict)
    # Merge helper columns that are candidates for removal.
    # NOTE(review): the dropped frame is stored in origDF, which is not used anywhere
    # else in this cell, so species_gene_pair1 itself keeps these columns.
    cols_to_drop = [col for col in species_gene_pair1.columns if col in ['hgnc_id', 'hgnc_id_rec', 'hgnc_symbol_rec',
                                                                         'hgnc_id_lig', 'hgnc_symbol_lig',  'Rec HGNC ID','ensembl_gene_id']]
    if cols_to_drop:
        origDF = species_gene_pair1.drop(columns=cols_to_drop)

    # Drop columns where all values are NaN
    species_gene_pair1 = species_gene_pair1.dropna(axis=1, how='all')
    col_to_rename = [col for col in species_gene_pair1.columns if "Human LR Pair" in col][0]
    species_gene_pair1.rename(columns={col_to_rename: "LR Pair Card"}, inplace=True)
    # Update ligand_col values only if species-specific replacement is available
    # Only override if the cleaned species-specific columns exist now
    new_ligand_col = f"{species} Ligand"
    new_receptor_col = f"{species} Receptor"
    
    # Prefer the merged species symbol; keep the existing value when it is NaN or blank
    if new_ligand_col in species_gene_pair1.columns:
        species_gene_pair1[ligand_col] = [
            new if pd.notna(new) and str(new).strip() else orig
            for new, orig in zip(species_gene_pair1[new_ligand_col], species_gene_pair1[ligand_col])
        ]

    if new_receptor_col in species_gene_pair1.columns:
        species_gene_pair1[receptor_col] = [
            new if pd.notna(new) and str(new).strip() else orig
            for new, orig in zip(species_gene_pair1[new_receptor_col], species_gene_pair1[receptor_col])
        ]

In [10]:
## Function to prepare datatables (cleaning and hyperlinking, adding tool tips, etc) input for the database qmds
import sys, os
from itables import init_notebook_mode
import pandas as pd
from itables import show
from itables import options
from IPython.display import HTML, display
import numpy as np
import re
from bs4 import BeautifulSoup
from createDataTable import gene_pair, gene_pair000, human_columns, lrPairsCount
import warnings


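# Silence UserWarning-category warnings. NOTE: pandas' SettingWithCopyWarning derives from Warning, not UserWarning, so this filter does not suppress it.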
warnings.simplefilter("ignore", category=UserWarning)


def _rename_homolog_columns(df, species, species_name, species_id, role):
    """Rename raw BioMart homolog columns present in ``df`` to '<species> <role> ...' labels.

    Parameters:
        df (pd.DataFrame): Frame that was just merged with the species BioMart table.
        species (str): Display name of the species (e.g. "Mouse").
        species_name (str): BioMart code of the species (e.g. "mmusculus").
        species_id (str): Name of the species-specific ID column in the BioMart table.
        role (str): "Ligand" or "Receptor".

    Returns:
        pd.DataFrame: ``df`` with every matching column renamed; absent columns are ignored.
    """
    prefix = f"{species} {role}"
    ensembl_key = f"{species_name}_homolog_ensembl_gene"
    candidates = {
        f"{species_name}_homolog_goc_score": f"{prefix} GOC score",
        f"{species_name}_homolog_wga_coverage": f"{prefix} WGA coverage",
        f"{species_name}_homolog_perc_id": f"{prefix} % Identity",
        f"{species_name}_homolog_perc_id_r1": f"{prefix} Target % Identity",
        f"{species_name}_homolog_orthology_confidence": f"{prefix} Orthology Confidence",
        f"{species_name}_homolog_associated_gene_name": prefix,
        "mgi_description": f"{prefix} Name",
        "description": f"{prefix} Name",
        ensembl_key: f"{prefix} Ensembl ID",
    }
    # The dedicated ID column only gets its own label when it is not the Ensembl homolog column.
    if species_id != ensembl_key:
        candidates[species_id] = f"{prefix} ID"
    present = {old: new for old, new in candidates.items() if old in df.columns}
    return df.rename(columns=present)


def _prefer_species_value(new_values, orig_values):
    """Pair the two sequences element-wise, keeping ``new`` unless it is NaN or blank."""
    return [
        new if pd.notna(new) and str(new).strip() else orig
        for new, orig in zip(new_values, orig_values)
    ]


def process_species(gene_pair_df, gene_pair000_df, species, id_prefix, ligand_index, receptor_index):
    """
    Processes ligand-receptor interactions for a given species.

    Rows of ``gene_pair000_df`` whose species-specific columns are all non-empty are kept,
    merged twice with ``data/<biomart code>_ID_biomart.csv`` (once on the ligand HGNC ID, once
    on the receptor HGNC ID), and reshaped into a table that starts with the interaction ID
    and a formatted "<species> LR Pair" column.

    Parameters:
        gene_pair_df (pd.DataFrame): The main gene pair DataFrame; only its column names are
            used, to decide which columns are species-specific.
        gene_pair000_df (pd.DataFrame): The gene pair DataFrame that is filtered and processed.
        species (str): The species name (e.g., "Mouse", "Rat", "Zebrafish").
        id_prefix (str): The identifier prefix for the species (e.g., "MGI" for Mouse,
            "RGD" for Rat, "Ensembl" for the Ensembl-only species).
        ligand_index (int): Position of the ligand column among the columns whose name
            contains "Ligand&nbsp;".
        receptor_index (int): Position of the receptor column among the columns whose name
            contains "Receptor&nbsp;".

    Returns:
        pd.DataFrame: Processed DataFrame for the species.

    Raises:
        ValueError: If ``species`` has no known BioMart code.
        IndexError: If an expected column (ligand/receptor, location, HGNC ID, ...) is missing.
    """
    biomart_codes = {
        "Mouse": "mmusculus",
        "Rat": "rnorvegicus",
        "Zebrafish": "drerio",
        "Chimpanzee": "ptroglodytes",
        "Chicken": "ggallus",
        "Pig": "sscrofa",
        "Cow": "btaurus",
        "Dog": "clfamiliaris",
        "Horse": "ecaballus",
        "Sheep": "oarambouillet",
        "Marmoset": "cjacchus",
        "Macaque": "mmulatta",
    }
    # Fail fast with a clear message instead of trying to open "data/Unknown species_ID_biomart.csv".
    if species not in biomart_codes:
        raise ValueError(
            f"Unsupported species {species!r}; expected one of {sorted(biomart_codes)}"
        )
    species_name = biomart_codes[species]

    if species in ["Mouse", "Rat", "Zebrafish"]:
        species_columns = [col for col in gene_pair_df.columns if id_prefix in col or species in col]
    else:
        species_columns = [col for col in gene_pair_df.columns if species in col]

    # Filter rows where all species-specific columns are not empty
    non_empty = (gene_pair000_df[species_columns].map(str.strip) != "").all(axis=1)
    # .copy() so the column relabelling below never touches gene_pair000_df
    species_gene_pair = gene_pair000_df[non_empty].copy()

    # Rename columns to remove species name
    species_gene_pair.columns = [
        col.replace(f"{species} ", "").strip() if "Ligand" in col or "Receptor" in col else col
        for col in species_gene_pair.columns
    ]

    if id_prefix == "Ensembl":
        all_species = {"Chimpanzee", "Pig", "Dog", "Cow",
                       "Chicken", "Horse", "Sheep", "Marmoset", "Macaque"}

        def clean_column_name(col):
            """Strip HTML tags from a column label and trim surrounding whitespace."""
            return BeautifulSoup(col, "html.parser").get_text().strip()

        # Drop every column whose tag-free label still mentions one of the Ensembl-only species
        cleaned_columns = [clean_column_name(col) for col in species_gene_pair.columns]
        columns_to_remove = [
            species_gene_pair.columns[i] for i, cleaned_name in enumerate(cleaned_columns)
            if any(other in cleaned_name for other in all_species)
        ]
        species_gene_pair = species_gene_pair.drop(columns=columns_to_remove)

    species_gene_pair1 = species_gene_pair.copy()

    # Identify ligand and receptor columns dynamically
    ligand_col = [col for col in species_gene_pair1.columns if "Ligand&nbsp;" in col][ligand_index]
    receptor_col = [col for col in species_gene_pair1.columns if "Receptor&nbsp;" in col][receptor_index]
    ligand_Location = [col for col in species_gene_pair1.columns if "Ligand Location" in col][0]
    receptor_Location = [col for col in species_gene_pair1.columns if "Receptor Location" in col][0]
    # Identify relevant columns for the species
    species_columns = [col for col in species_gene_pair1.columns if id_prefix in col]
    new_order = [human_columns[0]] + [ligand_col, receptor_col] + species_columns + human_columns[1:]
    species_gene_pair1 = species_gene_pair1[new_order].reset_index(drop=True)

    def extract_hgnc_id(col):
        """Return the first 'HGNC:<digits>' token found in ``col``, or None."""
        if not isinstance(col, str):
            # NaN / None cells carry no ID; re.search would raise TypeError on them
            return None
        match = re.search(r'HGNC:(\d+)', col)
        if match:
            return "HGNC:" + match.group(1)
        return None

    # Load species-specific data
    species_info = pd.read_csv(f"data/{species_name}_ID_biomart.csv")

    # Dedicated ID column per BioMart code; every other species falls back to the
    # Ensembl homolog gene column.
    # NOTE(review): "zfin_id_id" looks like a doubled suffix - confirm against the drerio CSV header.
    species_mapping = {
        "mmusculus": "mgi_id",
        "rnorvegicus": "rgd_id",
        "drerio": "zfin_id_id"
    }
    species_id = species_mapping.get(species_name, f"{species_name}_homolog_ensembl_gene")

    species_info = species_info.dropna(subset=['hgnc_id'])
    species_info = species_info.dropna(subset=[species_id])

    # Merge with ligand data
    ligand_hgnc = [col for col in species_gene_pair1.columns if "Ligand HGNC ID" in col][0]
    species_gene_pair1["Lig HGNC ID"] = species_gene_pair1[ligand_hgnc].apply(extract_hgnc_id)
    species_gene_pair1 = species_gene_pair1.merge(species_info, how='left',
                               left_on='Lig HGNC ID', right_on='hgnc_id',
                               suffixes=('', '_lig'))
    # Label the ligand-side columns before the second merge brings in the same raw names again
    species_gene_pair1 = _rename_homolog_columns(
        species_gene_pair1, species, species_name, species_id, "Ligand"
    )

    # Merge with receptor data
    receptor_hgnc = [col for col in species_gene_pair1.columns if "Receptor HGNC ID" in col][0]
    species_gene_pair1["Rec HGNC ID"] = species_gene_pair1[receptor_hgnc].apply(extract_hgnc_id)
    species_gene_pair1 = species_gene_pair1.merge(species_info, how='left',
                               left_on='Rec HGNC ID', right_on='hgnc_id',
                               suffixes=('', '_rec'))
    species_gene_pair1 = _rename_homolog_columns(
        species_gene_pair1, species, species_name, species_id, "Receptor"
    )

    # The merge helper columns (hgnc_id, hgnc_id_rec, 'Rec HGNC ID', ...) are not dropped here:
    # the explicit column selection near the end of the function decides what is returned.

    # Drop columns where all values are NaN
    species_gene_pair1 = species_gene_pair1.dropna(axis=1, how='all')
    col_to_rename = [col for col in species_gene_pair1.columns if "Human LR Pair" in col][0]
    species_gene_pair1 = species_gene_pair1.rename(columns={col_to_rename: "LR Pair Card"})

    # Update ligand/receptor values only where a species-specific replacement is available
    new_ligand_col = f"{species} Ligand"
    new_receptor_col = f"{species} Receptor"

    if new_ligand_col in species_gene_pair1.columns:
        species_gene_pair1[ligand_col] = _prefer_species_value(
            species_gene_pair1[new_ligand_col], species_gene_pair1[ligand_col]
        )

    if new_receptor_col in species_gene_pair1.columns:
        species_gene_pair1[receptor_col] = _prefer_species_value(
            species_gene_pair1[new_receptor_col], species_gene_pair1[receptor_col]
        )

    if species in ["Mouse", "Rat", "Zebrafish"]:
        ligand_mgi_id_col = [col for col in species_gene_pair1.columns if f"Ligand {id_prefix} ID" in col][0]
        receptor_mgi_id_col = [col for col in species_gene_pair1.columns if f"Receptor {id_prefix} ID" in col][0]

        ligand_source = species_gene_pair1[f"{species} Ligand ID"]
        receptor_source = species_gene_pair1[f"{species} Receptor ID"]

        def build_link(val, species):
            """Return an HTML link for one database ID, or "" when the value is missing."""
            if pd.isna(val) or not str(val).strip() or str(val).strip().lower() in {"none", "nan"}:
                return ""

            val = str(val).strip()

            try:
                val = str(int(float(val)))  # Remove trailing ".0"
            except (ValueError, OverflowError):
                pass  # not a plain number (e.g. "MGI:123"): keep the text as is

            if species == "Mouse":
                return f'<a href="https://www.informatics.jax.org/marker/{val}" target="_blank">{val}</a>'
            elif species == "Rat":
                return f'<a href="https://rgd.mcw.edu/rgdweb/report/gene/main.html?id=RGD:{val}" target="_blank">RGD:{val}</a>'
            elif species == "Zebrafish":
                return f'<a href="https://zfin.org/{val}" target="_blank">{val}</a>'
            else:
                return val

        ligand_links = [build_link(val, species) for val in ligand_source]
        receptor_links = [build_link(val, species) for val in receptor_source]

        # Replace only if new link is non-empty
        species_gene_pair1[ligand_mgi_id_col] = [
            link if link else orig
            for link, orig in zip(ligand_links, species_gene_pair1[ligand_mgi_id_col])
        ]

        species_gene_pair1[receptor_mgi_id_col] = [
            link if link else orig
            for link, orig in zip(receptor_links, species_gene_pair1[receptor_mgi_id_col])
        ]

        # Drop the raw ID columns now that they have been folded into the link columns
        species_gene_pair1 = species_gene_pair1.drop(columns=[f"{species} Ligand ID", f"{species} Receptor ID"])
    else:
        # Ensembl-only species: overwrite the existing Ensembl ID columns where the
        # merged species-specific Ensembl ID is available
        new_ligand_ens = f"{species} Ligand Ensembl ID"
        new_receptor_ens = f"{species} Receptor Ensembl ID"
        ligand_ens = [col for col in species_gene_pair1.columns if "Ligand Ensembl ID" in col][0]
        receptor_ens = [col for col in species_gene_pair1.columns if "Receptor Ensembl ID" in col][0]
        if new_ligand_ens in species_gene_pair1.columns:
            species_gene_pair1[ligand_ens] = _prefer_species_value(
                species_gene_pair1[new_ligand_ens], species_gene_pair1[ligand_ens]
            )

        if new_receptor_ens in species_gene_pair1.columns:
            species_gene_pair1[receptor_ens] = _prefer_species_value(
                species_gene_pair1[new_receptor_ens], species_gene_pair1[receptor_ens]
            )

    def format_lr_pair(row):
        """Join ligand and receptor with a symbol chosen from the ligand/receptor location."""
        if row[ligand_Location] in ['secreted', '']:
            return f"{row[ligand_col]} <span style='font-size: 15px;'>○</span> <span style='font-size: 24px;'>⤚</span> {row[receptor_col]}"
        elif row[receptor_Location] == 'plasma membrane':
            return f"{row[ligand_col]} <span style='font-size: 24px;'>⤙</span> <span style='font-size: 24px;'>⤚</span> {row[receptor_col]}"
        else:
            return f"{row[ligand_col]} \u2192 {row[receptor_col]}"

    species_lr_pair_col = f"{species} LR Pair"
    species_gene_pair1.loc[:, species_lr_pair_col] = species_gene_pair1.apply(format_lr_pair, axis=1)

    # Identify relevant columns for the species (everything not already placed explicitly)
    species_columns = [
        col for col in species_gene_pair1.columns
        if (id_prefix in col or species in col)
        and col not in [ligand_col, receptor_col, species_lr_pair_col]
    ]

    new_order = [human_columns[0]] + [species_lr_pair_col, ligand_col, receptor_col] + species_columns + ["LR Pair Card"] + human_columns[2:]
    species_gene_pair1 = species_gene_pair1[new_order].reset_index(drop=True)
    species_gene_pair1 = species_gene_pair1.loc[:, ~species_gene_pair1.columns.duplicated()]
    # Strip the species name from every label except the LR-pair column
    species_gene_pair1.columns = [
        col if col == species_lr_pair_col else col.replace(species, "").strip()
        for col in species_gene_pair1.columns
    ]
    if id_prefix == "Ensembl":
        species_gene_pair1 = species_gene_pair1.drop(columns=['Ligand Ensembl ID',
                                                              'Receptor Ensembl ID'])
    species_gene_pair1 = species_gene_pair1.drop(columns=['Ligand',
                                                              'Receptor'])

    return species_gene_pair1


# ---- Mouse ----
# Build the mouse table, then express the number of distinct values in its first
# column as a percentage of lrPairsCount.
mouse_gene_pair1 = process_species(gene_pair, gene_pair000, "Mouse", "MGI", 1, 1)
MouselrPairsCount = len(mouse_gene_pair1.iloc[:, 0].unique())
HumanMouseLRPairsPer = round((MouselrPairsCount / lrPairsCount) * 100, 2)
# Step up to the next 0.5 mark: floor(2x + 1) / 2
# (a value that already sits on a 0.5 mark is bumped by a further 0.5).
HumanMouseLRPairsPer = ((HumanMouseLRPairsPer * 2 + 1) // 1) / 2

# ---- Mouse-specific annotations ----
mouse_gene_pair1 = mouse_gene_pair1.reset_index(drop=True)
mouse_rat_info = pd.read_csv("data/mouse_name_mapping.csv")
# First column whose label mentions the ligand / receptor MGI ID
ligand_mgi_id_col = [name for name in mouse_gene_pair1.columns if "Ligand MGI ID" in name][0]
receptor_mgi_id_col = [name for name in mouse_gene_pair1.columns if "Receptor MGI ID" in name][0]
# MGI ID -> description and MGI ID -> Ensembl gene ID lookups
mapping_mouse_name = {
    mgi_id: description
    for mgi_id, description in zip(mouse_rat_info['MGI ID'], mouse_rat_info['MGI description'])
}
mapping_mouse_ens = {
    mgi_id: ensembl_id
    for mgi_id, ensembl_id in zip(mouse_rat_info['MGI ID'], mouse_rat_info['Gene stable ID'])
}
# extract mgi
def extract_mgi_id(col):
    """Return the first 'MGI:<digits>' token found in ``col``, or None.

    ``col`` is a single cell value (for example an HTML link that embeds the ID).
    Non-string cells such as NaN or None yield None instead of raising TypeError.
    """
    if not isinstance(col, str):
        return None
    match = re.search(r'MGI:(\d+)', col)
    if match:
        return 'MGI:' + str(match.group(1))
    return None
    

def _fill_blank_from_mapping(table, target_col, key_col, mapping):
    """Return ``table[target_col]`` with NaN/blank cells looked up in ``mapping`` via ``key_col``.

    Cells that already hold a non-blank value are returned unchanged; a blank cell whose
    key is not in ``mapping`` keeps its current value.
    """
    def pick(row):
        current = row[target_col]
        if pd.isna(current) or str(current).strip() == '':
            return mapping.get(row[key_col], current)
        return current

    return table.apply(pick, axis=1)


# Temporary plain-text MGI IDs used as lookup keys
mouse_gene_pair1['Ligand MGI ID'] = mouse_gene_pair1[ligand_mgi_id_col].apply(extract_mgi_id)
mouse_gene_pair1['Receptor MGI ID'] = mouse_gene_pair1[receptor_mgi_id_col].apply(extract_mgi_id)

# Back-fill names and Ensembl IDs, ligand side before receptor side
for target_col, key_col, lookup in (
    ('Ligand Name', 'Ligand MGI ID', mapping_mouse_name),
    ('Receptor Name', 'Receptor MGI ID', mapping_mouse_name),
    ('Ligand Ensembl ID', 'Ligand MGI ID', mapping_mouse_ens),
    ('Receptor Ensembl ID', 'Receptor MGI ID', mapping_mouse_ens),
):
    mouse_gene_pair1[target_col] = _fill_blank_from_mapping(
        mouse_gene_pair1, target_col, key_col, lookup
    )

# The lookup keys are no longer needed
mouse_gene_pair1 = mouse_gene_pair1.drop(columns=['Ligand MGI ID', 'Receptor MGI ID'])
################################################################

# Remaining species handled in this cell; keyword arguments spell out what each value means.
# Rat
rat_gene_pair1 = process_species(
    gene_pair_df=gene_pair, gene_pair000_df=gene_pair000,
    species="Rat", id_prefix="RGD", ligand_index=2, receptor_index=2,
)
# Zebrafish
zebrafish_gene_pair1 = process_species(
    gene_pair_df=gene_pair, gene_pair000_df=gene_pair000,
    species="Zebrafish", id_prefix="ZFIN", ligand_index=3, receptor_index=3,
)

# Chimpanzee
chimpanzee_gene_pair1 = process_species(
    gene_pair_df=gene_pair, gene_pair000_df=gene_pair000,
    species="Chimpanzee", id_prefix="Ensembl", ligand_index=4, receptor_index=4,
)

# Chicken
chicken_gene_pair1 = process_species(
    gene_pair_df=gene_pair, gene_pair000_df=gene_pair000,
    species="Chicken", id_prefix="Ensembl", ligand_index=4, receptor_index=4,
)

In [15]:
# Ensembl-only species: all use the "Ensembl" prefix and column index 4 on both sides.
# Pig
pig_gene_pair1 = process_species(
    gene_pair_df=gene_pair, gene_pair000_df=gene_pair000,
    species="Pig", id_prefix="Ensembl", ligand_index=4, receptor_index=4,
)

# Cow
cow_gene_pair1 = process_species(
    gene_pair_df=gene_pair, gene_pair000_df=gene_pair000,
    species="Cow", id_prefix="Ensembl", ligand_index=4, receptor_index=4,
)

# Dog
dog_gene_pair1 = process_species(
    gene_pair_df=gene_pair, gene_pair000_df=gene_pair000,
    species="Dog", id_prefix="Ensembl", ligand_index=4, receptor_index=4,
)

# Horse
horse_gene_pair1 = process_species(
    gene_pair_df=gene_pair, gene_pair000_df=gene_pair000,
    species="Horse", id_prefix="Ensembl", ligand_index=4, receptor_index=4,
)

# Sheep
sheep_gene_pair1 = process_species(
    gene_pair_df=gene_pair, gene_pair000_df=gene_pair000,
    species="Sheep", id_prefix="Ensembl", ligand_index=4, receptor_index=4,
)

# Marmoset
marmoset_gene_pair1 = process_species(
    gene_pair_df=gene_pair, gene_pair000_df=gene_pair000,
    species="Marmoset", id_prefix="Ensembl", ligand_index=4, receptor_index=4,
)

# Rhesus macaque
macaque_gene_pair1 = process_species(
    gene_pair_df=gene_pair, gene_pair000_df=gene_pair000,
    species="Macaque", id_prefix="Ensembl", ligand_index=4, receptor_index=4,
)



In [16]:
# Write the processed Macaque table to test.csv (path is relative to the current working
# directory); presumably a scratch export for manual inspection.
macaque_gene_pair1.to_csv("test.csv")

In [92]:
def map_if_empty(row, source_col, target_col, mapping_dict):
    """Return a value for ``row[target_col]``, looked up from ``row[source_col]`` when empty.

    Parameters:
        row: One DataFrame row (as passed by ``DataFrame.apply(..., axis=1)``).
        source_col (str): Column holding the lookup key.
        target_col (str): Column being filled.
        mapping_dict (dict): Lookup table keyed by the normalised source value.

    Returns:
        The existing target value if it is non-empty; otherwise
        ``mapping_dict[key]``, or "" when the source is empty or the key is unknown.
    """
    current_val = row[target_col]
    source_val = row[source_col]

    # Don't overwrite if target already has a value
    if pd.notna(current_val) and str(current_val).strip():
        return current_val

    # Only try mapping if source is non-empty
    if pd.isna(source_val) or not str(source_val).strip():
        return ""

    # Clean source value
    key = str(source_val).strip()

    # If multiple IDs (comma-separated), map only the first one
    key = key.split(",")[0].strip()

    # A purely numeric key (e.g. "1234" or "1234.0") is rewritten as "RGD:1234";
    # anything else that lacks the "RGD:" prefix is used unchanged.
    if not key.startswith("RGD:"):
        try:
            key = f"RGD:{int(float(key))}"
        except ValueError:
            pass

    return mapping_dict.get(key, "")


# The helper is defined before its first use so this cell runs on a fresh kernel.
# mapping_mouse_to_rat and mapping_mr2 are expected to exist already (defined outside this cell).

# Fill empty 'Receptor RGD ID' values from 'Receptor MGI ID'
gene_pair['Receptor RGD ID'] = gene_pair.apply(
    lambda row: map_if_empty(row, 'Receptor MGI ID', 'Receptor RGD ID', mapping_mouse_to_rat),
    axis=1
)

# Fill empty rat symbols from the (now updated) RGD ID columns
gene_pair['Rat Receptor'] = gene_pair.apply(
    lambda row: map_if_empty(row, 'Receptor RGD ID', 'Rat Receptor', mapping_mr2),
    axis=1
)

gene_pair['Rat Ligand'] = gene_pair.apply(
    lambda row: map_if_empty(row, 'Ligand RGD ID', 'Rat Ligand', mapping_mr2),
    axis=1
)


In [99]:
# Spot-check: show the last 10 distinct (Receptor, Receptor MGI ID, Receptor RGD ID) combinations.
gene_pair[["Receptor", "Receptor MGI ID", "Receptor RGD ID"]].drop_duplicates().tail(10)


Unnamed: 0,Receptor,Receptor MGI ID,Receptor RGD ID
5138,Timd2,"<a href=""https://www.informatics.jax.org/marke...","<a href=""https://rgd.mcw.edu/rgdweb/report/gen..."
5139,Pcdhb17,"<a href=""https://www.informatics.jax.org/marke...",
5140,Pcdhb18,"<a href=""https://www.informatics.jax.org/marke...","<a href=""https://rgd.mcw.edu/rgdweb/report/gen..."
5141,Pcdhb19,"<a href=""https://www.informatics.jax.org/marke...",
5142,Pcdhb20,"<a href=""https://www.informatics.jax.org/marke...",
5143,Pcdhb21,"<a href=""https://www.informatics.jax.org/marke...",
5144,Pcdhb22,"<a href=""https://www.informatics.jax.org/marke...","<a href=""https://rgd.mcw.edu/rgdweb/report/gen..."
5145,Pcdhgb8,"<a href=""https://www.informatics.jax.org/marke...",
5146,Klrk1,"<a href=""https://www.informatics.jax.org/marke...","<a href=""https://rgd.mcw.edu/rgdweb/report/gen..."
5149,Tlr4,"<a href=""https://www.informatics.jax.org/marke...","<a href=""https://rgd.mcw.edu/rgdweb/report/gen..."


In [98]:
def clean_rgd_id(rgd):
    """Normalise one RGD identifier for use in a link.

    The value is converted to text and stripped. A value that already starts with
    "RGD:" is returned as is; a purely numeric value (e.g. 1234 or "1234.0") becomes
    "RGD:1234"; any other text is returned stripped but otherwise unchanged.
    """
    rgd = str(rgd).strip()
    if rgd.startswith("RGD:"):
        return rgd
    try:
        return f"RGD:{int(float(rgd))}"
    except (ValueError, OverflowError):
        # Not a plain number ("", "nan", "inf", free text): leave it alone
        return rgd

_MGI_MARKER_URL = "https://www.informatics.jax.org/marker/"
_RGD_GENE_URL = "https://rgd.mcw.edu/rgdweb/report/gene/main.html?id="


def _linkify_ids(cell, base_url, normalise):
    """Turn a ", "-separated list of IDs into ", "-joined HTML links.

    Missing cells (NaN/None) yield "". Pieces that are blank after stripping are skipped;
    every other piece is passed through ``normalise`` and used both in the URL and as the
    link text.
    """
    if pd.isna(cell):
        return ""
    idents = (normalise(piece) for piece in str(cell).split(", ") if piece.strip())
    return ", ".join(
        f'<a href="{base_url}{ident}" target="_blank">{ident}</a>'
        for ident in idents
    )


# Column -> (URL prefix, per-ID normaliser). Only the ligand RGD column goes through
# clean_rgd_id; the other three columns use the stripped ID verbatim.
for id_column, base_url, normalise in (
    ("Ligand MGI ID", _MGI_MARKER_URL, str.strip),
    ("Receptor MGI ID", _MGI_MARKER_URL, str.strip),
    ("Ligand RGD ID", _RGD_GENE_URL, clean_rgd_id),
    ("Receptor RGD ID", _RGD_GENE_URL, str.strip),
):
    gene_pair[id_column] = gene_pair[id_column].apply(_linkify_ids, args=(base_url, normalise))

In [97]:
# Display mouse_specific_mgi_ids (not defined in this part of the notebook; presumably
# created in an earlier cell - verify before relying on a fresh-kernel run).
mouse_specific_mgi_ids

array(['MGI:107560', 'MGI:2136754', 'MGI:2136756', 'MGI:2136757',
       'MGI:2136758', 'MGI:2136759', 'MGI:2136760', 'MGI:1935200',
       'MGI:1306817', 'MGI:3649078', 'MGI:3774845', 'MGI:98223',
       'MGI:2159681', 'MGI:1196250', 'MGI:96824'], dtype=object)

In [40]:
import pandas as pd
import requests
from io import StringIO
from itertools import product

# Step 1: Fetch the data directly from URL
url = "https://inparanoidb.sbc.su.se/download/sqltable/9606&10090&prot"
# A timeout keeps the cell from hanging forever if the server stops responding
response = requests.get(url, timeout=60)
response.raise_for_status()  # Raise error if request failed

# Step 2: Load into DataFrame (tab-separated, no header row in the download)
data = response.text.strip()
df = pd.read_csv(StringIO(data), sep="\t", header=None)
df.columns = ["cluster_id", "bitscore", "source_file", "score", "protein_id", "seed_score"]

# Step 3: Infer species from source_file
def infer_species(src):
    """Classify a source-file label by the taxonomy ID it contains.

    Returns "human" if "9606" occurs in ``src``, otherwise "mouse" if "10090" occurs,
    otherwise "other". The human check takes precedence when both occur.
    """
    for taxon_id, label in (("9606", "human"), ("10090", "mouse")):
        if taxon_id in src:
            return label
    return "other"

df["species"] = df["source_file"].map(infer_species)

# Sanity check
print("Species breakdown:")
print(df["species"].value_counts())

# Step 4: Expand all human–mouse combinations within clusters.
# Every human member of a cluster is paired with every mouse member; a cluster
# lacking either species contributes nothing.
orthologs = []
for cluster_id, members in df.groupby("cluster_id"):
    human_rows = list(members[members["species"] == "human"].itertuples(index=False))
    mouse_rows = list(members[members["species"] == "mouse"].itertuples(index=False))
    orthologs.extend(
        {
            "cluster_id": cluster_id,
            "human_protein": human_row.protein_id,
            "human_score": human_row.score,
            "mouse_protein": mouse_row.protein_id,
            "mouse_score": mouse_row.score,
        }
        for human_row in human_rows
        for mouse_row in mouse_rows
    )

# Step 5: Create final DataFrame
df_orthologs = pd.DataFrame(orthologs)
print(df_orthologs.head())
print(f"Total orthologous pairs: {len(df_orthologs)}")

# Optional: save
df_orthologs.to_csv("data/human_mouse_inParanoid.tsv", sep="\t", index=False)


Species breakdown:
species
mouse    18113
human    17845
Name: count, dtype: int64
   cluster_id human_protein  human_score mouse_protein  mouse_score
0           1        Q8WZ42          1.0        A2ASS6          1.0
1           2        Q8NF91          1.0        Q6ZWR6          1.0
2           3        Q5VST9          1.0        A2AAJ9          1.0
3           4        Q9UPN3          1.0        Q9QXZ0          1.0
4           5        Q03001          1.0        Q91ZU6          1.0
Total orthologous pairs: 20805


In [43]:
def fetch_uniprot_gene_map(uniprot_ids, species="human"):
    """
    Fetch gene symbols and HGNC/MGI IDs from UniProt REST API.

    Accessions are queried in chunks of 100 against the UniProtKB search endpoint,
    with a one-second pause between requests. A chunk that fails is reported with
    ``print`` and skipped, so the result may be partial.

    Parameters:
        uniprot_ids: Iterable of UniProt accessions.
        species (str): "human" restricts the query to organism 9606; any other value
            uses 10090 (mouse).

    Returns:
        dict: accession -> {"gene_symbol": str,
                            "hgnc_id": HGNC cross-reference ID (human only, else None),
                            "mgi_symbol": MGI cross-reference ID (species == "mouse" only, else None)}.
        Cross-reference values are "" when the entry has no such cross-reference.
    """
    uniprot_ids = list(uniprot_ids)
    results = {}
    if not uniprot_ids:
        # Nothing to look up: no request, no pause
        return results

    # Imported here (as in the original) so the function is self-contained within its cell
    import time
    import requests

    taxid = "9606" if species == "human" else "10090"
    headers = {"Accept": "application/json"}
    chunk_size = 100

    for chunk_start in range(0, len(uniprot_ids), chunk_size):
        if chunk_start:
            time.sleep(1)  # pause between consecutive requests

        chunk = uniprot_ids[chunk_start:chunk_start + chunk_size]
        query = " OR ".join(f"accession:{acc}" for acc in chunk)
        params = {
            "query": f"({query}) AND organism_id:{taxid}",
            "size": 500,
            "format": "json"
        }

        try:
            r = requests.get(
                "https://rest.uniprot.org/uniprotkb/search",
                params=params,
                headers=headers,
                timeout=30,  # do not hang forever on a stalled connection
            )
            r.raise_for_status()
            data = r.json()

            for entry in data.get("results", []):
                acc = entry.get("primaryAccession")
                if not acc:
                    continue

                # "genes" may be missing or an empty list; neither should abort the chunk
                genes = entry.get("genes") or [{}]
                gene = (genes[0].get("geneName") or {}).get("value", "")

                # Cross-references (the last HGNC / MGI entry wins)
                hgnc = ""
                mgi = ""
                for xref in entry.get("uniProtKBCrossReferences") or []:
                    if xref.get("database") == "HGNC":
                        hgnc = xref.get("id")
                    if xref.get("database") == "MGI":
                        mgi = xref.get("id")

                results[acc] = {
                    "gene_symbol": gene,
                    "hgnc_id": hgnc if species == "human" else None,
                    "mgi_symbol": mgi if species == "mouse" else None
                }

        except Exception as e:
            # Best-effort fetch: report the failed chunk and continue with the next one
            print("Error fetching:", e)

    return results


In [45]:
# Distinct UniProt accessions per species, in order of first appearance
human_ids = df_orthologs["human_protein"].drop_duplicates().tolist()
mouse_ids = df_orthologs["mouse_protein"].drop_duplicates().tolist()

# Look up gene symbols and database IDs for each accession list
human_map = fetch_uniprot_gene_map(uniprot_ids=human_ids, species="human")
mouse_map = fetch_uniprot_gene_map(uniprot_ids=mouse_ids, species="mouse")


KeyboardInterrupt: 

In [None]:
def _annotation_lookup(mapping, field):
    """Build a function that maps an accession to ``mapping[accession][field]`` (None if absent)."""
    def lookup(accession):
        return mapping.get(accession, {}).get(field)
    return lookup


# Human annotations
df_orthologs["human_gene"] = df_orthologs["human_protein"].map(_annotation_lookup(human_map, "gene_symbol"))
df_orthologs["hgnc_id"] = df_orthologs["human_protein"].map(_annotation_lookup(human_map, "hgnc_id"))

# Mouse annotations
df_orthologs["mouse_gene"] = df_orthologs["mouse_protein"].map(_annotation_lookup(mouse_map, "gene_symbol"))
df_orthologs["mgi_symbol"] = df_orthologs["mouse_protein"].map(_annotation_lookup(mouse_map, "mgi_symbol"))


In [None]:
# Save the annotated ortholog table as a tab-separated file (without the index column)
# in the current working directory.
df_orthologs.to_csv("human_mouse_orthologs_annotated.tsv", sep="\t", index=False)


In [18]:
## Function to prepare datatables (cleaning and hyperlinking, adding tool tips, etc) input for the database qmds
import sys, os
from itables import init_notebook_mode
import pandas as pd
from itables import show
from itables import options
from IPython.display import HTML, display
import numpy as np
import re
from bs4 import BeautifulSoup
from createDataTable import gene_pair, gene_pair000, human_columns, lrPairsCount
import warnings


# Suppress SettingWithCopyWarning
warnings.simplefilter("ignore", category=UserWarning)


def _homolog_rename_map(columns, species, species_code, species_id, role):
    """Map raw homolog columns of the species table to display names for one role.

    Parameters:
        columns: Column labels currently present in the merged frame.
        species (str): Display name of the species (e.g., "Mouse").
        species_code (str): Species code used as column prefix (e.g., "mmusculus").
        species_id (str): Name of the species' own ID column in the species table.
        role (str): "Ligand" or "Receptor".

    Returns:
        dict: ``{raw column: display name}`` restricted to columns that exist.
    """
    ensembl_gene = f"{species_code}_homolog_ensembl_gene"
    candidates = {
        f"{species_code}_homolog_goc_score": f"{species} {role} GOC score",
        f"{species_code}_homolog_wga_coverage": f"{species} {role} WGA coverage",
        f"{species_code}_homolog_perc_id": f"{species} {role} % Identity",
        f"{species_code}_homolog_perc_id_r1": f"{species} {role} Target % Identity",
        f"{species_code}_homolog_orthology_confidence": f"{species} {role} Orthology Confidence",
        f"{species_code}_homolog_associated_gene_name": f"{species} {role}",
        "mgi_description": f"{species} {role} Name",
        "description": f"{species} {role} Name",
        ensembl_gene: f"{species} {role} Ensembl ID",
    }
    # The dedicated ID column only gets its own name when it is not the Ensembl gene column
    if species_id != ensembl_gene:
        candidates[species_id] = f"{species} {role} ID"
    return {raw: shown for raw, shown in candidates.items() if raw in columns}


def _prefer_nonblank(preferred, fallback):
    """Pair two sequences element-wise, keeping ``preferred`` unless it is NaN or blank."""
    return [
        new if pd.notna(new) and str(new).strip() else orig
        for new, orig in zip(preferred, fallback)
    ]


def _build_id_link(val, species):
    """Render a species database ID as an HTML link; return "" when the ID is missing or blank."""
    if pd.isna(val) or not str(val).strip():
        return ""

    val = str(val).strip()
    try:
        val = str(int(float(val)))  # Removes ".0" if present
    except (ValueError, OverflowError):
        # Non-numeric IDs (and "inf") are used as-is
        pass

    if species == "Mouse":
        return f'<a href="https://www.informatics.jax.org/marker/{val}" target="_blank">{val}</a>'
    elif species == "Rat":
        return f'<a href="https://rgd.mcw.edu/rgdweb/report/gene/main.html?id=RGD:{val}" target="_blank">RGD:{val}</a>'
    elif species == "Zebrafish":
        return f'<a href="https://zfin.org/{val}" target="_blank">{val}</a>'
    else:
        return val


def process_species(gene_pair_df, gene_pair000_df, species, id_prefix, ligand_index, receptor_index):
    """
    Processes ligand-receptor interactions for a given species.

    Rows of ``gene_pair000_df`` are kept only when every species-specific column is
    non-empty. The kept rows are merged twice with ``data/<species code>_ID_biomart.csv``
    (once via the ligand HGNC ID, once via the receptor HGNC ID), species values
    override the original ligand/receptor/ID columns where available, a
    "<species> LR Pair" display column is added, and the columns are reordered.

    Parameters:
        gene_pair_df (pd.DataFrame): The main gene pair DataFrame; only its column
            names are used, to find the species-specific columns.
        gene_pair000_df (pd.DataFrame): The gene pair DataFrame whose rows are processed.
        species (str): The species name (e.g., "Mouse", "Rat", "Zebrafish").
        id_prefix (str): The identifier prefix for the species (e.g., "MGI" for Mouse,
            "RGD" for Rat, "Ensembl" for species without a dedicated ID column).
        ligand_index (int): Position, among the columns containing "Ligand&nbsp;",
            of the ligand column to use.
        receptor_index (int): Position, among the columns containing "Receptor&nbsp;",
            of the receptor column to use.

    Returns:
        pd.DataFrame: Processed DataFrame for the species.

    Raises:
        ValueError: If ``species`` has no known species code.
    """
    if species in ["Mouse", "Rat", "Zebrafish"]:
        species_columns = [col for col in gene_pair_df.columns if id_prefix in col or species in col]
    else:
        species_columns = [col for col in gene_pair_df.columns if species in col]

    # Filter rows where all species-specific columns are not empty
    species_gene_pair = gene_pair000_df[(gene_pair000_df[species_columns].map(str.strip) != "").all(axis=1)]

    # Rename columns to remove species name
    species_gene_pair.columns = [
        col.replace(f"{species} ", "").strip() if "Ligand" in col or "Receptor" in col else col
        for col in species_gene_pair.columns
    ]

    if id_prefix == "Ensembl":
        all_species = {"Chimpanzee", "Pig", "Dog", "Cow",
                       "Chicken", "Horse", "Sheep", "Marmoset", "Macaque"}

        def clean_column_name(col):
            """Strip HTML tags from a column name."""
            return BeautifulSoup(col, "html.parser").get_text().strip()

        # Drop every column whose tag-free name mentions one of the Ensembl-only species
        columns_to_remove = [
            col for col in species_gene_pair.columns
            if any(other in clean_column_name(col) for other in all_species)
        ]
        species_gene_pair = species_gene_pair.drop(columns=columns_to_remove)

    species_gene_pair1 = species_gene_pair.copy()

    # Identify ligand and receptor columns dynamically
    ligand_col = [col for col in species_gene_pair1.columns if "Ligand&nbsp;" in col][ligand_index]
    receptor_col = [col for col in species_gene_pair1.columns if "Receptor&nbsp;" in col][receptor_index]
    ligand_Location = [col for col in species_gene_pair1.columns if "Ligand Location" in col][0]
    receptor_Location = [col for col in species_gene_pair1.columns if "Receptor Location" in col][0]

    def extract_hgnc_id(col):
        """Return the first 'HGNC:<digits>' found in the cell, or None (also for non-string cells)."""
        if not isinstance(col, str):
            return None
        match = re.search(r'HGNC:(\d+)', col)
        if match:
            return "HGNC:" + match.group(1)
        return None

    # Species code used in the data file name and as column prefix
    species_codes = {
        "Mouse": "mmusculus",
        "Rat": "rnorvegicus",
        "Zebrafish": "drerio",
        "Chimpanzee": "ptroglodytes",
        "Chicken": "ggallus",
        "Pig": "sscrofa",
        "Cow": "btaurus",
        "Dog": "clfamiliaris",
        "Horse": "ecaballus",
        "Sheep": "oarambouillet",
        "Marmoset": "cjacchus",
        "Macaque": "mmulatta",
    }
    if species not in species_codes:
        raise ValueError(f"Unknown species: {species!r}")
    species_code = species_codes[species]

    # Load species-specific data
    species_info = pd.read_csv(f"data/{species_code}_ID_biomart.csv")

    # Species with a dedicated ID column; all others fall back to the homolog Ensembl gene column
    species_id_columns = {
        "mmusculus": "mgi_id",
        "rnorvegicus": "rgd_id",
        "drerio": "zfin_id_id",
    }
    species_id = species_id_columns.get(species_code, f"{species_code}_homolog_ensembl_gene")

    # Only rows with both a human HGNC ID and a species ID can be matched
    species_info = species_info.dropna(subset=['hgnc_id'])
    species_info = species_info.dropna(subset=[species_id])

    # Merge with ligand data
    ligand_hgnc = [col for col in species_gene_pair1.columns if "Ligand HGNC ID" in col][0]
    species_gene_pair1["Lig HGNC ID"] = species_gene_pair1[ligand_hgnc].apply(extract_hgnc_id)
    species_gene_pair1 = species_gene_pair1.merge(species_info, how='left',
                               left_on='Lig HGNC ID', right_on='hgnc_id',
                               suffixes=('', '_lig'))
    # Rename columns for ligand info (frees the raw names for the receptor merge below)
    species_gene_pair1 = species_gene_pair1.rename(
        columns=_homolog_rename_map(species_gene_pair1.columns, species, species_code, species_id, "Ligand")
    )

    # Merge with receptor data
    receptor_hgnc = [col for col in species_gene_pair1.columns if "Receptor HGNC ID" in col][0]
    species_gene_pair1["Rec HGNC ID"] = species_gene_pair1[receptor_hgnc].apply(extract_hgnc_id)
    species_gene_pair1 = species_gene_pair1.merge(species_info, how='left',
                               left_on='Rec HGNC ID', right_on='hgnc_id',
                               suffixes=('', '_rec'))
    # Rename columns for receptor info
    species_gene_pair1 = species_gene_pair1.rename(
        columns=_homolog_rename_map(species_gene_pair1.columns, species, species_code, species_id, "Receptor")
    )

    # Drop merge-helper columns
    cols_to_drop = [col for col in species_gene_pair1.columns if col in ['hgnc_id', 'hgnc_id_rec', 'hgnc_symbol_rec',
                                                                         'hgnc_id_lig', 'hgnc_symbol_lig',  'Rec HGNC ID','ensembl_gene_id']]
    species_gene_pair1 = species_gene_pair1.drop(columns=cols_to_drop)

    # Drop columns where all values are NaN
    species_gene_pair1 = species_gene_pair1.dropna(axis=1, how='all')
    col_to_rename = [col for col in species_gene_pair1.columns if "Human LR Pair" in col][0]
    species_gene_pair1 = species_gene_pair1.rename(columns={col_to_rename: "LR Pair Card"})

    # Override ligand/receptor values only where a species-specific replacement is available
    new_ligand_col = f"{species} Ligand"
    new_receptor_col = f"{species} Receptor"

    if new_ligand_col in species_gene_pair1.columns:
        species_gene_pair1[ligand_col] = _prefer_nonblank(
            species_gene_pair1[new_ligand_col], species_gene_pair1[ligand_col]
        )

    if new_receptor_col in species_gene_pair1.columns:
        species_gene_pair1[receptor_col] = _prefer_nonblank(
            species_gene_pair1[new_receptor_col], species_gene_pair1[receptor_col]
        )

    if species in ["Mouse", "Rat", "Zebrafish"]:
        ligand_mgi_id_col = [col for col in species_gene_pair1.columns if f"Ligand {id_prefix} ID" in col][0]
        receptor_mgi_id_col = [col for col in species_gene_pair1.columns if f"Receptor {id_prefix} ID" in col][0]

        ligand_links = [_build_id_link(val, species) for val in species_gene_pair1[f"{species} Ligand ID"]]
        receptor_links = [_build_id_link(val, species) for val in species_gene_pair1[f"{species} Receptor ID"]]

        # Replace only if new link is non-empty
        species_gene_pair1[ligand_mgi_id_col] = [
            link if link else orig
            for link, orig in zip(ligand_links, species_gene_pair1[ligand_mgi_id_col])
        ]
        species_gene_pair1[receptor_mgi_id_col] = [
            link if link else orig
            for link, orig in zip(receptor_links, species_gene_pair1[receptor_mgi_id_col])
        ]

        # Drop the raw ID columns now that their values live in the linked columns
        species_gene_pair1 = species_gene_pair1.drop(columns=[f"{species} Ligand ID", f"{species} Receptor ID"])
    else:
        # Override Ensembl IDs only where a species-specific replacement is available
        new_ligand_ens = f"{species} Ligand Ensembl ID"
        new_receptor_ens = f"{species} Receptor Ensembl ID"
        ligand_ens = [col for col in species_gene_pair1.columns if "Ligand Ensembl ID" in col][0]
        receptor_ens = [col for col in species_gene_pair1.columns if "Receptor Ensembl ID" in col][0]

        if new_ligand_ens in species_gene_pair1.columns:
            species_gene_pair1[ligand_ens] = _prefer_nonblank(
                species_gene_pair1[new_ligand_ens], species_gene_pair1[ligand_ens]
            )

        if new_receptor_ens in species_gene_pair1.columns:
            species_gene_pair1[receptor_ens] = _prefer_nonblank(
                species_gene_pair1[new_receptor_ens], species_gene_pair1[receptor_ens]
            )

    def format_lr_pair(row):
        """Build the HTML label for one ligand-receptor pair; the symbol depends on the location columns."""
        if row[ligand_Location] in ['secreted', '']:
            return f"{row[ligand_col]} <span style='font-size: 15px;'>○</span> <span style='font-size: 24px;'>⤚</span> {row[receptor_col]}"
        elif row[receptor_Location] == 'plasma membrane':
            return f"{row[ligand_col]} <span style='font-size: 24px;'>⤙</span> <span style='font-size: 24px;'>⤚</span> {row[receptor_col]}"
        else:
            return f"{row[ligand_col]} \u2192 {row[receptor_col]}"

    species_lr_pair_col = f"{species} LR Pair"
    species_gene_pair1.loc[:, species_lr_pair_col] = species_gene_pair1.apply(format_lr_pair, axis=1)

    # Identify relevant columns for the species (excluding those placed explicitly below)
    species_columns = [
        col for col in species_gene_pair1.columns
        if (id_prefix in col or species in col)
        and col not in [ligand_col, receptor_col, species_lr_pair_col]
    ]

    new_order = [human_columns[0]] + [species_lr_pair_col, ligand_col, receptor_col] + species_columns + ["LR Pair Card"] + human_columns[2:]
    species_gene_pair1 = species_gene_pair1[new_order].reset_index(drop=True)
    species_gene_pair1 = species_gene_pair1.loc[:, ~species_gene_pair1.columns.duplicated()]

    # Strip the species name from every header except the LR pair column
    species_gene_pair1.columns = [
        col if col == species_lr_pair_col else col.replace(species, "").strip()
        for col in species_gene_pair1.columns
    ]
    if id_prefix == "Ensembl":
        species_gene_pair1 = species_gene_pair1.drop(columns=['Ligand Ensembl ID',
                                                              'Receptor Ensembl ID'])
    species_gene_pair1 = species_gene_pair1.drop(columns=['Ligand',
                                                              'Receptor'])

    return species_gene_pair1


# Process each species
# Mouse
mouse_gene_pair1 = process_species(gene_pair, gene_pair000, "Mouse", "MGI", 1, 1)

# Percentage of human LR pairs that appear in the mouse table,
# counted as distinct values in the table's first column
MouselrPairsCount = len(pd.unique(mouse_gene_pair1.iloc[:, 0]))
HumanMouseLRPairsPer = round((MouselrPairsCount / lrPairsCount) * 100, 2)
# Round up to the next 0.5% step.
# NOTE(review): floor(2x + 1) / 2 also moves exact multiples of 0.5 up by a further 0.5
# (e.g. 91.5 -> 92.0) — confirm that this is intended.
HumanMouseLRPairsPer = ((HumanMouseLRPairsPer * 2 + 1) // 1) / 2

### adding the mouse-specific annotations ####
mouse_gene_pair1 = mouse_gene_pair1.reset_index(drop=True)
mouse_rat_info = pd.read_csv("data/mouse_name_mapping.csv")

# First headers containing the ligand / receptor MGI ID labels
ligand_mgi_id_col = [name for name in mouse_gene_pair1.columns if "Ligand MGI ID" in name][0]
receptor_mgi_id_col = [name for name in mouse_gene_pair1.columns if "Receptor MGI ID" in name][0]

# Lookups keyed by MGI ID: gene description and Ensembl gene ID
_mgi_ids = mouse_rat_info['MGI ID']
mapping_mouse_name = dict(zip(_mgi_ids, mouse_rat_info['MGI description']))
mapping_mouse_ens = dict(zip(_mgi_ids, mouse_rat_info['Gene stable ID']))
# extract mgi
def extract_mgi_id(col):
    """Return the first 'MGI:<digits>' identifier found in ``col``, or None.

    Parameters:
        col: A cell value, typically a string (plain or HTML) containing an MGI ID.

    Returns:
        str | None: The identifier including its 'MGI:' prefix; None when no
        identifier is present or when ``col`` is not a string (e.g. NaN/None).
    """
    # Missing cells arrive as NaN/None; re.search would raise TypeError on them
    if not isinstance(col, str):
        return None
    match = re.search(r'MGI:(\d+)', col)
    if match:
        return 'MGI:' + str(match.group(1))
    return None
    

def _fill_blank_from_mgi(frame, target_col, mgi_col, lookup):
    """Return ``frame[target_col]`` with NaN/blank cells replaced by ``lookup[frame[mgi_col]]`` when present."""
    def _resolve(row):
        current = row[target_col]
        if pd.isna(current) or str(current).strip() == '':
            return lookup.get(row[mgi_col], current)
        return current

    return frame.apply(_resolve, axis=1)


# Temporary plain-text MGI ID columns used as lookup keys
mouse_gene_pair1['Ligand MGI ID'] = mouse_gene_pair1[ligand_mgi_id_col].apply(extract_mgi_id)
mouse_gene_pair1['Receptor MGI ID'] = mouse_gene_pair1[receptor_mgi_id_col].apply(extract_mgi_id)

# Back-fill missing names and Ensembl IDs from the mouse mapping tables
for _target_col, _mgi_col, _lookup in (
    ('Ligand Name', 'Ligand MGI ID', mapping_mouse_name),
    ('Receptor Name', 'Receptor MGI ID', mapping_mouse_name),
    ('Ligand Ensembl ID', 'Ligand MGI ID', mapping_mouse_ens),
    ('Receptor Ensembl ID', 'Receptor MGI ID', mapping_mouse_ens),
):
    mouse_gene_pair1[_target_col] = _fill_blank_from_mgi(mouse_gene_pair1, _target_col, _mgi_col, _lookup)

# The lookup keys are no longer needed
mouse_gene_pair1 = mouse_gene_pair1.drop(columns=['Ligand MGI ID', 'Receptor MGI ID'])
################################################################

# Rat and zebrafish have their own ID prefix and ligand/receptor column index
rat_gene_pair1 = process_species(gene_pair, gene_pair000, "Rat", "RGD", 2, 2)
zebrafish_gene_pair1 = process_species(gene_pair, gene_pair000, "Zebrafish", "ZFIN", 3, 3)

# All remaining species share the "Ensembl" prefix and column index 4
_ENSEMBL_ARGS = ("Ensembl", 4, 4)
chimpanzee_gene_pair1 = process_species(gene_pair, gene_pair000, "Chimpanzee", *_ENSEMBL_ARGS)
chicken_gene_pair1 = process_species(gene_pair, gene_pair000, "Chicken", *_ENSEMBL_ARGS)
pig_gene_pair1 = process_species(gene_pair, gene_pair000, "Pig", *_ENSEMBL_ARGS)
cow_gene_pair1 = process_species(gene_pair, gene_pair000, "Cow", *_ENSEMBL_ARGS)
dog_gene_pair1 = process_species(gene_pair, gene_pair000, "Dog", *_ENSEMBL_ARGS)
horse_gene_pair1 = process_species(gene_pair, gene_pair000, "Horse", *_ENSEMBL_ARGS)
sheep_gene_pair1 = process_species(gene_pair, gene_pair000, "Sheep", *_ENSEMBL_ARGS)
marmoset_gene_pair1 = process_species(gene_pair, gene_pair000, "Marmoset", *_ENSEMBL_ARGS)
# Rhesus macaque
macaque_gene_pair1 = process_species(gene_pair, gene_pair000, "Macaque", *_ENSEMBL_ARGS)


In [35]:
# First header of the mouse table that contains "Interaction ID"
interaction_id_col = list(filter(lambda name: "Interaction ID" in name, mouse_gene_pair1.columns))[0]

In [36]:
# First header containing "Human Ligand", and first header containing "Ligand" at all.
# The second lookup depends on the column order of the mouse table.
human_ligand_col = list(filter(lambda name: "Human Ligand" in name, mouse_gene_pair1.columns))[0]
ligand_col = list(filter(lambda name: "Ligand" in name, mouse_gene_pair1.columns))[0]

In [37]:
# First header containing "Human Receptor", and first header containing "Receptor" at all.
# The second lookup depends on the column order of the mouse table.
human_receptor_col = list(filter(lambda name: "Human Receptor" in name, mouse_gene_pair1.columns))[0]
receptor_col = list(filter(lambda name: "Receptor" in name, mouse_gene_pair1.columns))[0]

In [52]:
# First header mentioning "GOC"; show that column
GOC_col = list(filter(lambda name: "GOC" in name, mouse_gene_pair1.columns))[0]
mouse_gene_pair1.loc[:, GOC_col]

0        75.0
1        75.0
2       100.0
3       100.0
4       100.0
        ...  
3956      NaN
3957      NaN
3958      NaN
3959      NaN
3960      NaN
Name: Ligand GOC score, Length: 3961, dtype: float64

In [None]:
# List the column labels of the mouse table (the DataFrame attribute is `columns`)
mouse_gene_pair1.columns

In [58]:
def summarize_orthologs(human_col, species_col, label,
                        confidence_orth_col=None, confidence_orth_threshold=None,
                        GOC_col=None, GOC_threshold=None):
    """Count distinct mouse genes per human gene in ``mouse_gene_pair1`` and describe the distribution.

    NOTE: a later cell redefines this function with a ``>=`` GOC filter; this
    version keeps rows whose GOC score is exactly equal to ``GOC_threshold``.

    Parameters:
        human_col (str): Column holding the human gene.
        species_col (str): Column holding the mouse gene.
        label (str): Role name used (lower-cased) in the file name and the text.
        confidence_orth_col (str | None): Orthology-confidence column to filter on.
        confidence_orth_threshold: Value the confidence column must equal; None disables the filter.
        GOC_col (str | None): GOC score column to filter on.
        GOC_threshold: Value the GOC column must equal; None disables the filter.

    Returns:
        str: Multi-line summary. As a side effect the per-gene counts are written
        to ``data/human_mouse_orth_count_<tag>.csv``.
    """
    role = label.lower()
    frame = mouse_gene_pair1

    # Optional equality filters
    if confidence_orth_col and confidence_orth_threshold is not None:
        frame = frame[frame[confidence_orth_col] == confidence_orth_threshold]
    if GOC_col and GOC_threshold is not None:
        frame = frame[frame[GOC_col] == GOC_threshold]

    # Distinct (human, mouse) pairs, then mouse genes per human gene, largest first
    pairs = frame[[human_col, species_col]].drop_duplicates()
    per_gene = pairs.groupby(human_col)[species_col].count()
    counts = per_gene.sort_values(ascending=False).reset_index(name='count')

    # File-name tag records the thresholds that were passed in
    tag_parts = [role]
    if confidence_orth_threshold is not None:
        tag_parts.append(f"_conf{confidence_orth_threshold}")
    if GOC_threshold is not None:
        tag_parts.append(f"_GOC{GOC_threshold}")
    filter_tag = "".join(tag_parts)

    counts.to_csv(f"data/human_mouse_orth_count_{filter_tag}.csv", index=False)

    # How many human genes have 1, 2, 3, ... mouse genes
    distribution = counts['count'].value_counts().sort_index()
    header = (
        f"Out of {counts.shape[0]} unique human {role} genes "
        f"(Orthology Confidence = {confidence_orth_threshold}, GOC = {GOC_threshold}):"
    )
    details = [
        f" - {gene_count} human {role} genes had {orth_count} mouse ortholog(s)"
        for orth_count, gene_count in distribution.items()
    ]
    return "\n".join([header, *details])


In [68]:
def summarize_orthologs(human_col, species_col, label,
                        confidence_orth_col=None, confidence_orth_threshold=None,
                        GOC_col=None, GOC_threshold=None,
                        *, df=None, species_label="mouse", output_dir="data"):
    """Count distinct species genes per human gene and describe the distribution.

    Parameters:
        human_col (str): Column holding the human gene.
        species_col (str): Column holding the other species' gene.
        label (str): Role name used (lower-cased) in the file name and the text.
        confidence_orth_col (str | None): Orthology-confidence column to filter on.
        confidence_orth_threshold: Value the confidence column must equal; None disables the filter.
        GOC_col (str | None): GOC score column to filter on.
        GOC_threshold: Minimum GOC score (inclusive); None disables the filter.
            Rows with a missing GOC score never satisfy the comparison.
        df (pd.DataFrame | None): Table to summarize; defaults to the notebook-level
            ``mouse_gene_pair1``.
        species_label (str): Species name used in the file name and the text.
        output_dir (str): Directory the counts CSV is written to.

    Returns:
        str: Multi-line summary. As a side effect the per-gene counts are written
        to ``<output_dir>/human_<species_label>_orth_count_<tag>.csv``.
    """
    # Filtering below creates new frames, so the source table is never modified
    frame = df if df is not None else mouse_gene_pair1
    role = label.lower()

    if confidence_orth_col and confidence_orth_threshold is not None:
        frame = frame[frame[confidence_orth_col] == confidence_orth_threshold]

    if GOC_col and GOC_threshold is not None:
        frame = frame[frame[GOC_col] >= GOC_threshold]

    # Distinct (human, species) pairs, then species genes per human gene, largest first
    unique_pairs = frame[[human_col, species_col]].drop_duplicates()
    counts = (
        unique_pairs
        .groupby(human_col)[species_col]
        .count()
        .sort_values(ascending=False)
        .reset_index(name='count')
    )

    # File-name tag records the thresholds that were passed in
    filter_tag = role
    if confidence_orth_threshold is not None:
        filter_tag += f"_conf{confidence_orth_threshold}"
    if GOC_threshold is not None:
        filter_tag += f"_GOCge{GOC_threshold}"

    counts.to_csv(f"{output_dir}/human_{species_label}_orth_count_{filter_tag}.csv", index=False)

    # How many human genes have 1, 2, 3, ... species genes
    summary_counts = counts['count'].value_counts().sort_index()
    total_human_genes = counts.shape[0]

    summary_lines = [
        f"Out of {total_human_genes} unique human {role} genes "
        f"(Orthology Confidence = {confidence_orth_threshold}, GOC ≥ {GOC_threshold}):"
    ]
    for orth_count, gene_count in summary_counts.items():
        summary_lines.append(
            f" - {gene_count} human {role} genes had {orth_count} {species_label} ortholog(s)"
        )

    return "\n".join(summary_lines)


# Locate the confidence and GOC columns for each role (first header containing the label)
_mouse_headers = list(mouse_gene_pair1.columns)
confidence_orth_ligand = [name for name in _mouse_headers if "Ligand Orthology Confidence" in name][0]
GOC_col_ligand = [name for name in _mouse_headers if "Ligand GOC" in name][0]

confidence_orth_receptor = [name for name in _mouse_headers if "Receptor Orthology Confidence" in name][0]
GOC_col_receptor = [name for name in _mouse_headers if "Receptor GOC" in name][0]

# Summaries without a confidence filter, keeping rows whose GOC score is at least 0
ligand_summary = summarize_orthologs(
    human_ligand_col,
    ligand_col,
    "Ligand",
    confidence_orth_col=confidence_orth_ligand,
    confidence_orth_threshold=None,
    GOC_col=GOC_col_ligand,
    GOC_threshold=0,
)
receptor_summary = summarize_orthologs(
    human_receptor_col,
    receptor_col,
    "Receptor",
    confidence_orth_col=confidence_orth_receptor,
    confidence_orth_threshold=None,
    GOC_col=GOC_col_receptor,
    GOC_threshold=0,
)

# Show both summaries separated by a blank line
print(ligand_summary, receptor_summary, sep="\n\n")

In [69]:
# Show the ligand GOC score column
mouse_gene_pair1.loc[:, GOC_col_ligand]

0        75.0
1        75.0
2       100.0
3       100.0
4       100.0
        ...  
3956      NaN
3957      NaN
3958      NaN
3959      NaN
3960      NaN
Name: Ligand GOC score, Length: 3961, dtype: float64

In [74]:

# Locate the confidence and GOC columns for each role (first header containing the label)
_mouse_headers = list(mouse_gene_pair1.columns)
confidence_orth_ligand = [name for name in _mouse_headers if "Ligand Orthology Confidence" in name][0]
GOC_col_ligand = [name for name in _mouse_headers if "Ligand GOC" in name][0]

confidence_orth_receptor = [name for name in _mouse_headers if "Receptor Orthology Confidence" in name][0]
GOC_col_receptor = [name for name in _mouse_headers if "Receptor GOC" in name][0]

# Summaries without a confidence filter, keeping rows whose GOC score is at least 0
ligand_summary = summarize_orthologs(
    human_ligand_col,
    ligand_col,
    "Ligand",
    confidence_orth_col=confidence_orth_ligand,
    confidence_orth_threshold=None,
    GOC_col=GOC_col_ligand,
    GOC_threshold=0,
)
receptor_summary = summarize_orthologs(
    human_receptor_col,
    receptor_col,
    "Receptor",
    confidence_orth_col=confidence_orth_receptor,
    confidence_orth_threshold=None,
    GOC_col=GOC_col_receptor,
    GOC_threshold=0,
)

# Show both summaries separated by a blank line
print(ligand_summary, receptor_summary, sep="\n\n")

Out of 985 unique human ligand genes (Orthology Confidence = None, GOC ≥ 0):
 - 925 human ligand genes had 1 mouse ortholog(s)
 - 31 human ligand genes had 2 mouse ortholog(s)
 - 5 human ligand genes had 3 mouse ortholog(s)
 - 2 human ligand genes had 4 mouse ortholog(s)
 - 5 human ligand genes had 5 mouse ortholog(s)
 - 3 human ligand genes had 6 mouse ortholog(s)
 - 2 human ligand genes had 7 mouse ortholog(s)
 - 12 human ligand genes had 14 mouse ortholog(s)

Out of 780 unique human receptor genes (Orthology Confidence = None, GOC ≥ 0):
 - 742 human receptor genes had 1 mouse ortholog(s)
 - 15 human receptor genes had 2 mouse ortholog(s)
 - 7 human receptor genes had 3 mouse ortholog(s)
 - 3 human receptor genes had 4 mouse ortholog(s)
 - 3 human receptor genes had 5 mouse ortholog(s)
 - 4 human receptor genes had 6 mouse ortholog(s)
 - 6 human receptor genes had 7 mouse ortholog(s)


In [42]:
# Number of distinct values in the human receptor column (missing values count as one value)
len(mouse_gene_pair1[human_receptor_col].unique())

781

In [32]:
# Show the human ligand column
mouse_gene_pair1.loc[:, human_ligand_col]

0          <span title="alpha-2-macroglobulin">A2M</span>
1          <span title="alpha-2-macroglobulin">A2M</span>
2       <span title="angiotensin I converting enzyme">...
3            <span title="adenosine deaminase">ADA</span>
4       <span title="ADAM metallopeptidase domain 10">...
                              ...                        
3956             <span title=" ">no human ortholog</span>
3957             <span title=" ">no human ortholog</span>
3958             <span title=" ">no human ortholog</span>
3959             <span title=" ">no human ortholog</span>
3960             <span title=" ">no human ortholog</span>
Name: <span title="Official Gene Symbol; Hover on symbols below to show gene names"">Human Ligand&nbsp;&nbsp;&nbsp;</span</span>, Length: 3961, dtype: object