# Python Notebook

In [132]:
## Function to prepare datatables (cleaning and hyperlinking, adding tool tips, etc) input for the database qmds
import sys, os
import pandas as pd
# Change working directory to ConnectomeDB
project_root = "/Users/sakuramaezono/Library/CloudStorage/OneDrive-YokohamaCityUniversity/Personal/05_Python_repositories/ConnectomeDB"
os.chdir(project_root)
sys.path.append(os.path.abspath("src"))

import fetchGSheet 

In [133]:
## Function to prepare datatables (cleaning and hyperlinking, adding tool tips, etc) input for the database qmds
import sys, os
from itables import init_notebook_mode
import pandas as pd
from itables import show
from itables import options
from IPython.display import HTML, display
import numpy as np
import fetchGSheet 
import warnings
import urllib.parse

# Suppress SettingWithCopyWarning
warnings.simplefilter("ignore", category=UserWarning)

#https://comp.med.yokohama-cu.ac.jp/reviewer/connectomedb/
site_url = "https://connectomedb.org/"
# Other vertebrates
species_list = [
    "mmusculus", "rnorvegicus", "drerio", "ptroglodytes", "ggallus", "sscrofa", "btaurus", 
    "clfamiliaris", "ecaballus", "oarambouillet",
    "cjacchus", "mmulatta", "xtropicalis"
]

# Select only the relevant columns from pop_up_info
cols_to_keep = cols_to_keep = list(range(0, 30)) 
# Step 3: Load file using only the desired columns
df = pd.read_table("data/HGNC_gene_info_full.tsv", usecols=cols_to_keep)
pop_up_info = pd.read_table("data/HGNC_gene_info_full.tsv")
pop_up_info = pop_up_info.rename(columns={"hgnc_id": "HGNC ID", 
                                          "name": "Approved name",
                                          "symbol": "Approved symbol",
                                          #"rgd_id": "RGD ID",
                                          #"mgd_id": "MGI ID", 
                                          "alias_symbol": "Alias symbol", # add to table
                                          "prev_symbol": "Previous symbol", # add to table
                                          "date_symbol_changed": "Date symbol changed"
                                         })

# Keep only first MGI/RGD ID
#pop_up_info["MGI ID"] = pop_up_info["MGI ID"].str.split("|").str[0]
#pop_up_info["RGD ID"] = pop_up_info["RGD ID"].str.split("|").str[0]

pop_up_info["Alias symbol"] = pop_up_info["Alias symbol"].apply(
    lambda x: "N/A" if pd.isna(x) or str(x).strip().lower() in ["nan", "none", ""] else x
)

pop_up_info["Previous symbol"] = pop_up_info["Previous symbol"].apply(
    lambda x: "N/A" if pd.isna(x) or str(x).strip().lower() in ["nan", "none", ""] else x
)

# Replace "|" with ", "
pop_up_info["Alias symbol"] = [value.replace("|", ", ") for value in pop_up_info["Alias symbol"]]
pop_up_info["Previous symbol"] = [value.replace("|", ", ") for value in pop_up_info["Previous symbol"]]

pop_up_info["Date symbol changed"] = pop_up_info["Date symbol changed"].apply(
    lambda x: "N/A" if pd.isna(x) or str(x).strip().lower() in ["nan", "none", ""] else x
)


pop_up_info_lim = pop_up_info[["HGNC ID", "Approved name", "Alias symbol", # "MGI ID", "RGD ID"
                               "Approved symbol", "Previous symbol"]] # rm "Approved symbol" for now
pop_up_info_lim = pop_up_info_lim.drop_duplicates(subset="HGNC ID", keep="first")

# Drop columns where all values are NA in gene_pair
gene_pair = fetchGSheet.gene_pair_human.dropna(axis=1, how='all')
gene_pair = gene_pair[gene_pair['LR_pair_orig'] != '']
# for now set source count as triplicates
sourceCount = len(gene_pair[['LR_pair_orig']])

### KEEP ALL AS OF LATEST input datatable
# for now, keep only the following columns
# gene_pair = gene_pair[['LR pair', 'Ligand', 'Ligand.HGNC', 'Receptor', 'Receptor.HGNC',
#                        'perplexity link', 'PMID', 'binding location', 
#                        'bind in trans?', 'bidirectional signalling?',
#                        'interaction type', 'original source']]

gene_pair = gene_pair.dropna(subset=['LR_pair_orig'])

# some PMIDs kick in with "," so replace
gene_pair["PMID"] = [value.replace(",", "") for value in gene_pair["PMID"]]
gene_pair = gene_pair.dropna(subset=['PMID'])

### NO NEED FOR MAPPING AS OF LATEST input datatable
# Mapping for replacements
# mapping = dict(zip(fetchGSheet.src_info['original source'], fetchGSheet.src_info['shortname']))
# # Replace values in the column based on the mapping
# gene_pair['original source'] = gene_pair['original source'].replace(mapping)

gene_pair.columns = gene_pair.columns.str.strip()
gene_pair[['Ligand', 'Receptor']] = gene_pair['LR Pair Card'].str.split(' ', n=1, expand=True)

## add Ligand/Receptor Location
def dedup_locations(loc_str):
    # Split, strip, deduplicate, and sort
    parts = [loc.strip() for loc in loc_str.split(',') if loc.strip()]
    unique_sorted = sorted(set(parts), key=str.lower)  # case-insensitive sort
    return unique_sorted

def generate_LocToolTip(row, geneloc, loc_col):
    ligand = row[loc_col]
    original_locations = [loc.strip() for loc in row["location"].split(',')]
    original_sources = [src.strip() for src in row["source"].split(',')]

    # Get deduplicated locations
    unique_locations = dedup_locations(row["location"])

    if len(unique_locations) == 1:
        # Single tooltip case
        location = unique_locations[0]
        matching_rows = geneloc[(geneloc[loc_col] == ligand) & (geneloc["location"].str.contains(location))]
        all_sources = matching_rows["source"].unique()
        sources_str = ", ".join(sorted(set(all_sources)))
        return f'<span title="based on {sources_str}">{location}</span>'
    else:
        # Multiple tooltips — find each (ligand, location) match in original df
        spans = []
        for loc in unique_locations:
            matching_rows = geneloc[
                (geneloc[loc_col] == ligand) &
                (geneloc["location"].str.contains(loc))
            ]
            all_sources = matching_rows["source"].unique()
            sources_str = ", ".join(sorted(set(all_sources)))
            spans.append(f'<span title="based on {sources_str}">{loc}</span>')
        return ", ".join(spans)


# Group the original loc_info by Ligand
ligand_loc = fetchGSheet.ligand_loc.dropna(axis=1, how='all')
grouped = ligand_loc.groupby("Ligand").agg({
    "location": lambda x: ', '.join(x),
    "source": lambda x: ', '.join(x)
}).reset_index()

# Generate tooltips
grouped["Ligand location"] = grouped.apply(lambda row: generate_LocToolTip(row, ligand_loc,loc_col="Ligand"), axis=1)
# create dict
mapping_loc = dict(zip(grouped['Ligand'], grouped['Ligand location'])) 
gene_pair['Ligand location'] = gene_pair['Ligand'].replace(mapping_loc)


# Group the original loc_info by Receptor
receptor_loc = fetchGSheet.receptor_loc.dropna(axis=1, how='all')
grouped = receptor_loc.groupby("Receptor").agg({
    "location": lambda x: ', '.join(x),
    "source": lambda x: ', '.join(x)
}).reset_index()

# Generate tooltips
grouped["Receptor location"] = grouped.apply(lambda row: generate_LocToolTip(row, receptor_loc,loc_col="Receptor"), axis=1)
# create dict
mapping_loc = dict(zip(grouped['Receptor'], grouped['Receptor location'])) 
gene_pair['Receptor location'] = gene_pair['Receptor'].replace(mapping_loc)


# Set missing mappings to 'unknown'
gene_pair.loc[gene_pair['Ligand location'] == gene_pair['Ligand'], 'Ligand location'] = 'unknown'
gene_pair.loc[gene_pair['Receptor location'] == gene_pair['Receptor'], 'Receptor location'] = 'unknown'
# Set "n/a" to unknown
gene_pair['Ligand location'] = [value.replace("n/a", "unknown") for value in gene_pair['Ligand location']]
gene_pair['Receptor location'] = [value.replace("n/a", "unknown") for value in gene_pair['Receptor location']]

# Fetch HGNC IDs from the dataset
hgnc_id = [col for col in gene_pair.columns if "HGNC" in col]
hgnc_id = pd.concat([gene_pair[col] for col in hgnc_id]).unique()

gene_pair['Human LR Pair'] = np.where(
    gene_pair['Human evidence'] == "not conserved", 
    "no human ortholog",                                  
    gene_pair['Homo sapiens_ligand'] + " " + gene_pair['Homo sapiens_receptor'] 
)


# Rename columns for better clarity
gene_pair = gene_pair.rename(columns={
    "LR_pair_orig": "LR Pair",
    "HGNC ligand": "Ligand HGNC ID",
    "HGNC receptor": "Receptor HGNC ID",
    "ENSEMBL ligand": "Ligand ENSEMBL ID",
    "ENSEMBL receptor": "Receptor ENSEMBL ID",
    # "perplexity link": "Perplexity", # will be replaced with actual link later
    # "original source": "Database Source",
    "Ligand location": "Ligand Location",
    "Receptor location": "Receptor Location",
    # "binding location": "Binding Location",
    # "bind in trans?" : "Trans-binding", 
    # "bidirectional signalling?": "Bidirectional Signalling",
    # "interaction type" : "Interaction Type"
    #"PMID": "PMID support" # was PMID support
})
gene_pair = gene_pair.drop(columns=["Ligand", "Receptor"])
gene_pair = gene_pair.rename(columns={"Homo sapiens_ligand": "Ligand", "Homo sapiens_receptor": "Receptor"})
# Merge gene_pair with pop_up_info_lim for Ligand(L)
gene_pair = gene_pair.merge(pop_up_info_lim, how='left', left_on='Ligand HGNC ID', right_on='HGNC ID')

gene_pair = gene_pair.rename(columns={"Approved name": "Ligand Name", 
                                     #"MGI ID": "Ligand MGI ID", # NOT APPLIED YET BUT should be taken from Ensembl BioMart
                                     #"RGD ID": "Ligand RGD ID", # NOT APPLIED YET BUT should be taken from Ensembl BioMart
                                      "Alias symbol": "Ligand Aliases",
                                      "Previous symbol": "Ligand Old symbol",
                                     },
                            )
gene_pair = gene_pair.drop(columns=["HGNC ID", "Approved symbol"])


gene_pair = gene_pair.merge(pop_up_info_lim, how='left', left_on='Receptor HGNC ID', right_on='HGNC ID')

gene_pair = gene_pair.rename(columns={"Approved name": "Receptor Name",
             #"MGI ID": "Receptor MGI ID",
             #"RGD ID": "Receptor RGD ID",
                                      "Alias symbol": "Receptor Aliases",
                                      "Previous symbol": "Receptor Old symbol",}
                            )

### AS OF LATEST DB skip pathways for now (code in addPathwayDiseaseAnnot_temp.py)

# Add new columns where all Ligand Symbol & Aliases and Receptor Symbol & Aliases merged in one column
def format_symbol_aliases(symbol, old_symbol, aliases):
    """
    Formats symbol, old symbols, and aliases.
    If the final formatted string would be empty after considering N/A values
    and empty inputs, it returns "mouse-specific".
    Otherwise, it formats based on the presence of old_symbol and aliases,
    removing unnecessary parentheses or commas, following the structure:
    "Symbol (Old Symbol, Aliases)" if both exist.
    """
    # Normalize inputs to empty strings if they are None/NaN or just whitespace
    symbol_str = str(symbol).strip()
    old_symbol_str = str(old_symbol).strip()
    aliases_str = str(aliases).strip()

    # Filter out values that are empty strings or "N/A" for old_symbol and aliases
    parts_for_join = []
    if old_symbol_str and old_symbol_str != "N/A":
        parts_for_join.append(old_symbol_str)
    if aliases_str and aliases_str != "N/A":
        parts_for_join.append(aliases_str)

    # Construct the preliminary result based on your original logic:
    # "symbol (old_symbol, aliases)" if parts_for_join is not empty, else "symbol"
    if parts_for_join:
        prelim_result = f"{symbol_str} ({', '.join(parts_for_join)})"
    else:
        prelim_result = symbol_str # Just the symbol if no old_symbol or aliases

    return prelim_result

# This is crucial for consistent handling by the function before processing "N/A".
gene_pair['Ligand'] = gene_pair['Ligand'].fillna('')
gene_pair['Ligand Old symbol'] = gene_pair['Ligand Old symbol'].fillna('')
gene_pair['Ligand Aliases'] = gene_pair['Ligand Aliases'].fillna('')


# to later check which ligand-receptor pairs are non-human
def is_mouse_specific(name):
    if not isinstance(name, str):
        return False
    name = name.strip()  # remove leading/trailing spaces
    return any(c.islower() for c in name[1:])

gene_pair = gene_pair.drop(columns=["HGNC ID"])

gene_pair['Ligand Symbols'] = gene_pair.apply(
    lambda row: "no human ortholog" if is_mouse_specific(row['Ligand']) 
               else format_symbol_aliases(row['Ligand'], row['Ligand Old symbol'], row['Ligand Aliases']),
    axis=1
)


# This is crucial for consistent handling by the function before processing "N/A".
gene_pair['Receptor'] = gene_pair['Receptor'].fillna('')
gene_pair['Receptor Old symbol'] = gene_pair['Receptor Old symbol'].fillna('')
gene_pair['Receptor Aliases'] = gene_pair['Receptor Aliases'].fillna('')

gene_pair['Receptor Symbols'] = gene_pair.apply(
    lambda row: "no human ortholog" if is_mouse_specific(row['Receptor']) 
                else format_symbol_aliases(row['Receptor'], row['Receptor Old symbol'], row['Receptor Aliases']),
    axis=1
)

### tooltips 
gene_pair["Ligand Symbols"] = [
    f'<span title="{aliases}">{aliases}</span>'
    for aliases in gene_pair["Ligand Symbols"]
]
gene_pair["Receptor Symbols"] = [
    f'<span title="{aliases}">{aliases}</span>'
    for aliases in gene_pair["Receptor Symbols"]
]

# might be used later just save info for now (for mouse cards)
grab_mouse_info = gene_pair["LR Pair Card"][gene_pair["Human evidence"].isin(["absent in human", "not conserved"])]
grab_mouse_info = grab_mouse_info.unique()
grab_mouse_info

# Add an empty Perplexity column filled with None (just to save it's order
gene_pair['Perplexity'] = None
#gene_pair = gene_pair.drop(columns=["Approved symbol_x", "Approved symbol_y"])

### For latest DB, skip (code saved as addOrth_temp.py)

# Add
first_columns=['LR Pair Card', 'Human LR Pair', 'Ligand', 'Receptor', 'Ligand Symbols', 'Receptor Symbols', 'Ligand Location', 'Receptor Location',	'Ligand HGNC ID', 'Receptor HGNC ID', 'Perplexity', 'Human evidence'] # 'Database Source'

end_columns=['PMID', 'Pair_species', 'lig_species', 'rec_species', 'ligand_orig', 'receptor_orig']
gene_pair = gene_pair[first_columns + [col for col in gene_pair.columns if col not in first_columns + end_columns] + end_columns]
# gene_pair = gene_pair[first_columns + [col for col in gene_pair.columns if col not in first_columns]]

# number of unique vars (Human and Mouse both counted)

lrPairsCount = len(gene_pair["LR Pair Card"].unique())

ligandCount = len(gene_pair["Ligand"].unique())

receptorCount = len(gene_pair["Receptor"].unique())


### Remove from here for latest DB
# # Mouse Orthologue
# MouseLigandCount = len(gene_pair["Ligand MGI ID"].unique())

# MouseReceptorCount = len(gene_pair["Receptor MGI ID"].unique())

# # Rat Orthologue
# RatLigandCount = len(gene_pair["Ligand RGD ID"].unique())

# RatReceptorCount = len(gene_pair["Receptor RGD ID"].unique())

# gene_pair["PMID"] = [value.replace(" ", "") for value in gene_pair["PMID"]] # was'PMID support'


source = np.array(gene_pair["PMID"].unique())
source = source.astype(str)
source = ",".join(sorted(set(filter(lambda x: x.lower() != 'nan', source))))
# Split the string into individual elements, filter out empty strings, and get unique values
source = sorted(
    set(filter(lambda x: x.strip() and x.strip().lower() != 'nan', source.split(',')))
)
source = [value.replace(" ", "") for value in source]

# Function to join unique sorted values
agg_func = lambda x: ', '.join(sorted(set(map(str, x))))

# Group and aggregate all columns except 'LR Pair Card'
gene_pair = gene_pair.groupby('LR Pair Card').agg(agg_func).reset_index()
gene_pair = gene_pair[gene_pair['LR Pair Card'] != '']
# Identify rows where BOTH 'Ligand HGNC ID' and 'Receptor HGNC ID' are empty
# We check if the stripped string is empty, as fillna('') converts None/NaN to empty strings
has_hgnc_id = (gene_pair['Ligand HGNC ID'].astype(str).str.strip() != '') | \
              (gene_pair['Receptor HGNC ID'].astype(str).str.strip() != '')

# Separate the DataFrame into two parts (human-based cards and mouse based)
human_rows = gene_pair[~(gene_pair["Human evidence"].isin(["absent in human", "not conserved"]))]
mouse_rows = gene_pair[gene_pair["Human evidence"].isin(["absent in human", "not conserved"])]

# Concatenate the DataFrames: rows with IDs first, then rows without IDs
gene_pair = pd.concat([human_rows, mouse_rows]).reset_index(drop=True)
DBlength = len(gene_pair)
gene_pair["Interaction ID"] = [f"CDB{str(i).zfill(5)}" for i in range(1, DBlength + 1)]

# for creating PMIDs
gene_pair00 = gene_pair[['LR Pair Card', 'PMID']] # was "PMID support"

# create Perplexity link
def create_url_basic(perplexity_col):
    query = f"What is the primary evidence that {perplexity_col} bind-each-other-as-a-ligand-and-receptor-pair. Exclude reviews, uniprot, wiki, genecards, PIPS, iuphar as sources."
    encoded_query = query.replace(" ", "%20")
    return f"https://www.perplexity.ai/search?q={encoded_query}"
    
# Option 2 -- new query all together

# def generate_perplexity_link_pmid(row): 
#     query = f"What-is-the-biological-relevance-of-the-ligand-and-receptor-pair-{row['Human LR Pair']}-based-on-Pubmed-ID-{row['PMID']}"
#     return (
#          f'<a href="https://www.perplexity.ai/search?q={query}" target="_blank">'
#         f'<img src="https://img.icons8.com/?size=30&id=0NbBuNOxUwps&format=png&color=000000" alt="Perplexity AI" /></a>'
#     )

# cannot use perplexity logo
def generate_perplexity_link_pmid(row): 
    query = f"What-is-the-biological-relevance-of-the-ligand-and-receptor-pair-{row['Human LR Pair']}-based-on-Pubmed-ID-{row['PMID']}"
    return (
         f'<a href="https://www.perplexity.ai/search?q={query}" target="_blank" style="text-decoration: none;">&#128269;</a>'
    )

# Apply function to the DataFrame
gene_pair["Perplexity"] = gene_pair.apply(generate_perplexity_link_pmid, axis=1)

# create URLs for the HGNC IDs

# ligand
gene_pair["Ligand HGNC ID"] = [
    '<a href="https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/{}" target="_blank">{}</a>'.format(ligand, ligand)
    for ligand in gene_pair["Ligand HGNC ID"]
]

# receptor
gene_pair["Receptor HGNC ID"] = [
    '<a href="https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/{}" target="_blank">{}</a>'.format(receptor, receptor)
    for receptor in gene_pair["Receptor HGNC ID"]
]


# Function to generate hyperlinks for the "PMID support" column
def generate_links_with_doi(df, gene_column, pmid_column, id_column):
    def create_link(gene, id_col, sources):
        # Replace spaces with "——" in the gene name for the link
        gene_name = gene.replace(" ", "—")
        
        if len(sources) == 1:
            source = sources[0]
            if source.startswith("https://www.biorxiv.org/content/"):
                # If the value starts with "https://doi.org/", use it as the hyperlink
                return f'<a href="{source}" target="_blank">BioRxiv</a>'
            else:
                # If it's a single PMID, hyperlink the PMID text
                return f'<a href="{site_url}cards/{gene}.html">{source}</a>'
        else:
            # If multiple PMIDs, show the count and hyperlink to the page
            return f'<a href="https://comp.med.yokohama-cu.ac.jp/reviewer/connectomedb/cards/{gene}.html" target="_blank">{len(sources)} PMIDs</a>'

    # Process each row to generate the "PMID" column # was "PMID support"
    df["PMID"] = [
        create_link(
            gene=row[gene_column], 
            id_col = row[id_column],
            sources=[s.strip() for s in row[pmid_column].split(',') if s.strip()]
        )
        for _, row in df.iterrows()
    ]
    return df


# Generate the links for the "PMID" column # was "PMID support"
gene_pair = generate_links_with_doi(gene_pair, gene_column="Human LR Pair", 
                                    pmid_column="PMID", id_column= "Interaction ID")

# for disease type, cancer-related and top pathways, when missing say "ask Perplexity"


def generate_perplexity_kegglinks(
    df,
    pathway_col="KEGG Pathway",
    default_query_template="What-biological or other functional-pathways-is-the-ligand-receptor-{pair}-associated-with"
):
    def create_link(row):
        value = row.get(pathway_col, "")
        
        if pd.isna(value) or str(value).strip().lower() in ["nan", "none", "", "unknown"]:
            pair = row["Human LR Pair"]
            label = "ask Perplexity"
            query = default_query_template.format(pair=pair)
            encoded_query = urllib.parse.quote(query)
            return f'<a href="https://www.perplexity.ai/search?q={encoded_query}" target="_blank">{label}</a>'
        else:
            return value

    df[pathway_col] = df.apply(create_link, axis=1)
    return df

gene_pair = generate_perplexity_kegglinks(gene_pair, pathway_col="KEGG Pathway")
    
def generate_perplexity_links(df, pathway_col, default_query_template):
    def create_link(row):
        pathway_value = str(row[pathway_col]).strip().lower()
        pair = row["Human LR Pair"]
        
        if pd.isna(row[pathway_col]) or pathway_value in ["nan", "none", "", "unknown"]:
            label = "ask Perplexity"
            query = default_query_template.format(pair=pair)
            output =  f'<a href="https://www.perplexity.ai/search?q={query}" target="_blank">{label}</a>'
        else:
            label = row[pathway_col]
            query = f"What-is-the-role-of-the-ligand-and-receptor-pair-{pair}-in-{label}"
            output = f'{label} (see <a href="https://www.perplexity.ai/search?q={query}" target="_blank">evidence in Perplexity</a>)'
        
        return output
    
    df[pathway_col] = df.apply(create_link, axis=1)
    return df

### SKIP for latest DB

# gene_pair = generate_perplexity_links(
#     gene_pair,
#     pathway_col="PROGENy Pathway",
#     default_query_template="What-major signalling pathways-is-the-ligand-receptor-pair-{pair}-associated-with"
# )

# gene_pair = generate_perplexity_links(
#     gene_pair,
#     pathway_col="Disease Type",
#     default_query_template="What-disease types-is-the-ligand-receptor-pair-{pair}-associated-with"
# )
# # if it is a yes or no question
# def generate_perplexity_links_yesno(
#     df,
#     pathway_col="Cancer-related",
#     default_query_template="Is-the-{pair}-associated-with-cancer-and-if-so-which-ones"
# ):
#     def create_link(row):
#         pathway_value = str(row[pathway_col]).strip().lower()
#         pair = row["Human LR Pair"]
        
#         if pd.isna(row[pathway_col]) or pathway_value in ["nan", "none", "", "unknown"]:
#             label = "ask Perplexity"
#             query = default_query_template.format(pair=pair)
#             output = f'<a href="https://www.perplexity.ai/search?q={query}" target="_blank">{label}</a>'
#         else:
#             label = row[pathway_col]
#             query = f"Provide evidence to support the statement-{pair}-is-related-to-cancer-answer-is-{label}"
#             output = f'{label} (see <a href="https://www.perplexity.ai/search?q={query}" target="_blank">evidence in Perplexity</a>)'
#         return output
    
#     df[pathway_col] = df.apply(create_link, axis=1)
#     return df

# gene_pair = generate_perplexity_links_yesno(
#     gene_pair,
#     pathway_col="Cancer-related"
# )

# Add tooltip to name

def add_geneToolTip(species):
    def tooltip_html(symbol, name):
        return (
            f'<span class="tooltip">{symbol}'
            f'<span class="tooltiptext">{name}</span></span>'
        )

    gene_pair[species + " Ligand"] = [
        tooltip_html(ligand_symbol, ligand_name)
        for ligand_name, ligand_symbol in zip(gene_pair[species + " Ligand Name"], gene_pair[species + " Ligand"])
    ]
    gene_pair[species + " Receptor"] = [
        tooltip_html(receptor_symbol, receptor_name)
        for receptor_name, receptor_symbol in zip(gene_pair[species + " Receptor Name"], gene_pair[species + " Receptor"])
    ]

## Make the Human evidence consistent
gene_pair["Human evidence"] = gene_pair["Human evidence"].replace("CONSERVATION, DIRECT", "DIRECT, CONSERVATION")

### Remove tooltip for name for each species for now as only zebrafish has the proper names ###     
# speciesPrime_list = ["Zebrafish"]
# # Loop through each species and update gene_pair
# for species in speciesPrime_list:
#    gene_pair = add_geneToolTip(species)

mouse_columns = ['Mouse Ligand', 'Mouse Receptor','Ligand MGI ID','Receptor MGI ID'] 
rat_columns = ['Rat Ligand','Rat Receptor','Ligand RGD ID','Receptor RGD ID']
zebrafish_columns = ['Zebrafish Ligand','Zebrafish Receptor','Ligand ZFIN ID','Receptor ZFIN ID']

# List of prefixes
prefixes = ("Chimpanzee", "Chicken", "Pig", "Cow", "Dog", "Horse", "Sheep", "Marmoset", "Macaque", "Frog")

# Get column names that start with any of the given prefixes
selected_columns = [col for col in gene_pair.columns if col.startswith(prefixes)]
# was "PMID support"
gene_pair0 = gene_pair[["Interaction ID"]+ first_columns+["PMID"]+["Ligand Name", "Receptor Name"]]
# gene_pair0 = gene_pair[["Interaction ID", "Human LR Pair", "Ligand", "Receptor",
#                        "Ligand Symbols", "Receptor Symbols", 
#                        "Ligand Location", "Receptor Location",
#                        "Ligand HGNC ID", "Receptor HGNC ID",
#                        "Perplexity", "PMID", 'Ligand Name','Receptor Name', 'KEGG Pathway', 'Cancer-related', 'Disease Type', 'Binding Location', 'Trans-binding', 'Bidirectional Signalling', 'Interaction Type', "PROGENy Pathway"] + mouse_columns + rat_columns]

gene_pair = gene_pair[["Interaction ID"]+ first_columns+["Ligand Name", "Receptor Name"]]

# gene_pair = gene_pair[["Interaction ID", "Human LR Pair", "Ligand", "Receptor",
#                        "Ligand Symbols", "Receptor Symbols", 
#                        "Ligand Location", "Receptor Location",
#                        "Ligand HGNC ID", "Receptor HGNC ID",
#                        "Perplexity", "PMID", 
#                        "Database Source", "Binding Location",
#                        "Trans-binding", "Bidirectional Signalling",
#                        "Interaction Type",'Ligand Name','Receptor Name'] + mouse_columns + rat_columns + zebrafish_columns + selected_columns]
# rm  "KEGG Pathway", "PROGENy Pathway", "Cancer-related", "Disease Type" for now

# Quick check if there is mouse-specific
gene_pair['Ligand'] = gene_pair.apply(
    lambda row: "no human ortholog" if is_mouse_specific(row['Ligand']) else row['Ligand'],
    axis=1
)
gene_pair['Receptor'] = gene_pair.apply(
    lambda row: "no human ortholog" if is_mouse_specific(row['Receptor']) else row['Receptor'],
    axis=1
)

# gene symbol
gene_pair["Ligand"] = [
    f'<span title="{ligand_name}">{ligand_symbol}</span>'
    for ligand_name, ligand_symbol in zip(gene_pair["Ligand Name"], 
                                              gene_pair["Ligand"])
]
# gene symbol
gene_pair["Receptor"] = [
    f'<span title="{receptor_name}">{receptor_symbol}</span>'
    for receptor_name, receptor_symbol in zip(gene_pair["Receptor Name"], 
                                              gene_pair["Receptor"])
]

### SKIP for latest DB
# tweak Database Source so when multiple should show multiple with tool tip
# gene_pair["Database Source"] = [
#     f'<span title="{orig_dbSource}">{("multiple (CDB2025 included)" if "connectomeDB2025" in orig_dbSource and "," in orig_dbSource else "multiple" if "," in orig_dbSource else orig_dbSource)}</span>'
#     for orig_dbSource in gene_pair["Database Source"]
# ]


# FOR NOW just make it as simple as em-dash/arrow
# def replace_spaces(row):
#     if 'secreted' in row['Ligand Location'].lower():
#         return row['Human LR Pair'].replace(" ", " <span style='font-size: 14px;'>○</span> <span style='font-size: 24px;'>⤚</span> ")
#     elif row['Ligand Location'] == 'unknown':
#         return row['Human LR Pair'].replace(" ", " <span style='font-size: 14px;'>○</span> <span style='font-size: 24px;'>⤚</span> ")
#     elif 'membrane' in row['Ligand Location'].lower():
#         return row['Human LR Pair'].replace(" ", " <span style='font-size: 24px;'>⤙</span> <span style='font-size: 24px;'>⤚</span> ")
#     else:
#         return row['Human LR Pair'].replace(" ", " \u2192 ")

# # Apply the function to the 'LR Pair' column
# gene_pair['Human LR Pair'] = gene_pair.apply(replace_spaces, axis=1)
# gene_pair["Human LR Pair"] = gene_pair["Human LR Pair"].str.replace(" ", "-")
gene_pair = gene_pair.drop(columns=["Ligand Name", "Receptor Name"])


# Create the links to the HTML cards
gene_pair["LR Pair Card"] = [
    f'<a href="https://comp.med.yokohama-cu.ac.jp/reviewer/connectomedb/cards/{ "mouse" if evidence == "not conserved" else "human" }/{lrPairOrig.replace(" ","-")}.html">{lrPair}</a>'
    for lrPairOrig, lrPair, evidence in zip(gene_pair0["LR Pair Card"], gene_pair["LR Pair Card"], gene_pair["Human evidence"])
]



# Add tooltips to the column headers
gene_pair.columns = [
    f'<span title="Ligand-Receptor Interacting Pair, as described in Liu et al. (PMID: XXXXXX)">{col}</span>' if col == "Human LR Pair" else
    f'<span title="Click the logo below to run Perplexity on the Human LR pair">{col}&nbsp;</span>' if col == "Perplexity" else
    f'<span title="Official Gene Symbol; Hover on symbols below to show gene names">{col}&nbsp;&nbsp;&nbsp;</span>' if col in ["Ligand", "Receptor"] else
    f'<span title="HUGO Gene Nomenclature Committee (HGNC) ID. Click on the link for more details">{col}&nbsp;&nbsp;</span>' if col in ["Ligand HGNC ID", "Receptor HGNC ID"] else
    f'<span title=" PubMed IDs (PMID) with Literature Evidence for LR Interaction. Click on the link for more details">{col}</span>' if col == "PMID" else
    f'<span title="Rat Genome Database (RGD) ID. Click on the link for more details">{col}</span>' if col in ["Ligand RGD ID", "Receptor RGD ID"] else
    f'<span title="Mouse Genome Informatics (MGI) ID. Click on the link for more details">{col}</span>' if col in ["Ligand MGI ID", "Receptor MGI ID"]else
    f'<span title="Zebrafish Information Network (ZFIN) ID. Click on the link for more details">{col}</span>' if col in ["Ligand ZFIN ID", "Receptor ZFIN ID"] else
    f'<span title="Location based on the predicted subcellular localization of the human proteome">{col}</span>' if col in ["Ligand Location", "Receptor Location"] else
    f'<span title="Double-click header of {col} to reverse sort">{col}&nbsp;</span>'
    for col in gene_pair.columns
]

gene_pair = gene_pair.reset_index(drop=True)  # Remove the index

#######################################################################
# Identify the column(s) that contain '(PMID)' and temporarily remove for presubmission
pmid_cols = [col for col in gene_pair.columns if '(PMID)' in col]
gene_pair = gene_pair.drop(columns=pmid_cols)
#######################################################################

gene_pair000 = gene_pair.copy()

keywords_to_modify = ["Ligand", "Receptor"]
exclude_keywords = ["HGNC ID", "Location", "Human"]  # Columns containing this will not be modified

# Copy the original columns so we can modify only the first 10
new_columns = gene_pair000.columns.tolist()

# Modify only the first 10 columns
new_columns[:10] = [
    f'{col.split(">")[0]}">Human {col.split(">")[1]}</span>'
    if any(keyword in col for keyword in keywords_to_modify) and not any(exclude in col for exclude in exclude_keywords)
    else col
    for col in new_columns[:10]
]

# Assign the modified column names back to the DataFrame
gene_pair000.columns = new_columns
#######################################################################
### For latest DB, no need to limit columns
human_columns = [col for col in gene_pair000.columns][:16]
#######################################################################
#human_gene_pair = gene_pair.iloc[:, :-36]
# remove mouse specific ones from the datatable
evidence_cols = [col for col in gene_pair.columns if 'Human evidence' in col]
human_gene_pair = gene_pair[~(gene_pair[evidence_cols[0]] == "not conserved")]
# add number of mouse pair cards
numOfMouseOrth = len(gene_pair[evidence_cols][(gene_pair[evidence_cols[0]] == "not conserved")])

  pop_up_info = pd.read_table("data/HGNC_gene_info_full.tsv")


In [134]:
human_gene_pair

Unnamed: 0,"<span title=""Double-click header of Interaction ID to reverse sort"">Interaction ID&nbsp;</span>","<span title=""Double-click header of LR Pair Card to reverse sort"">LR Pair Card&nbsp;</span>","<span title=""Ligand-Receptor Interacting Pair, as described in Liu et al. (PMID: XXXXXX)"">Human LR Pair</span>","<span title=""Official Gene Symbol; Hover on symbols below to show gene names"">Ligand&nbsp;&nbsp;&nbsp;</span>","<span title=""Official Gene Symbol; Hover on symbols below to show gene names"">Receptor&nbsp;&nbsp;&nbsp;</span>","<span title=""Double-click header of Ligand Symbols to reverse sort"">Ligand Symbols&nbsp;</span>","<span title=""Double-click header of Receptor Symbols to reverse sort"">Receptor Symbols&nbsp;</span>","<span title=""Location based on the predicted subcellular localization of the human proteome"">Ligand Location</span>","<span title=""Location based on the predicted subcellular localization of the human proteome"">Receptor Location</span>","<span title=""HUGO Gene Nomenclature Committee (HGNC) ID. Click on the link for more details"">Ligand HGNC ID&nbsp;&nbsp;</span>","<span title=""HUGO Gene Nomenclature Committee (HGNC) ID. Click on the link for more details"">Receptor HGNC ID&nbsp;&nbsp;</span>","<span title=""Click the logo below to run Perplexity on the Human LR pair"">Perplexity&nbsp;</span>","<span title=""Double-click header of Human evidence to reverse sort"">Human evidence&nbsp;</span>"
0,CDB00001,"<a href=""https://comp.med.yokohama-cu.ac.jp/re...",A2M HSPA5,"<span title=""alpha-2-macroglobulin"">A2M</span>","<span title=""heat shock protein family A (Hsp7...","<span title=""A2M (FWP007, S863-7, CPAMD5)"">A2M...","<span title=""HSPA5 (GRP78, BiP)"">HSPA5 (GRP78,...","<span title=""based on perplexity, uniprot"">sec...","<span title=""based on PMID: 12194978, PMID: 32...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.perplexity.ai/search?q=Wh...",DIRECT
1,CDB00002,"<a href=""https://comp.med.yokohama-cu.ac.jp/re...",A2M LRP1,"<span title=""alpha-2-macroglobulin"">A2M</span>","<span title=""LDL receptor related protein 1"">L...","<span title=""A2M (FWP007, S863-7, CPAMD5)"">A2M...","<span title=""LRP1 (APR, A2MR, LRP, CD91, LRP1A...","<span title=""based on perplexity, uniprot"">sec...","<span title=""based on hpa, perplexity, uniprot...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.perplexity.ai/search?q=Wh...",DIRECT
2,CDB00003,"<a href=""https://comp.med.yokohama-cu.ac.jp/re...",ACE BDKRB2,"<span title=""angiotensin I converting enzyme"">...","<span title=""bradykinin receptor B2"">BDKRB2</s...","<span title=""ACE (DCP1, ACE1, CD143)"">ACE (DCP...","<span title=""BDKRB2 (BK-2)"">BDKRB2 (BK-2)</span>","<span title=""based on perplexity, uniprot"">cel...","<span title=""based on hpa, perplexity, uniprot...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.perplexity.ai/search?q=Wh...",DIRECT
3,CDB00004,"<a href=""https://comp.med.yokohama-cu.ac.jp/re...",ADA DPP4,"<span title=""adenosine deaminase"">ADA</span>","<span title=""dipeptidyl peptidase 4"">DPP4</span>","<span title=""ADA (ADA1)"">ADA (ADA1)</span>","<span title=""DPP4 (CD26, ADCP2, DPPIV)"">DPP4 (...","<span title=""based on hpa, uniprot"">cell membr...","<span title=""based on perplexity, uniprot"">cel...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.perplexity.ai/search?q=Wh...",CONSERVATION
4,CDB00005,"<a href=""https://comp.med.yokohama-cu.ac.jp/re...",ADAM10 EFNA5,"<span title=""ADAM metallopeptidase domain 10"">...","<span title=""ephrin A5"">EFNA5</span>","<span title=""ADAM10 (kuz, MADM, HsT18717, CD15...","<span title=""EFNA5 (EPLG7, AF1, LERK7)"">EFNA5 ...","<span title=""based on hpa, perplexity, uniprot...","<span title=""based on perplexity, uniprot"">cel...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.perplexity.ai/search?q=Wh...",CONSERVATION
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3545,CDB03546,"<a href=""https://comp.med.yokohama-cu.ac.jp/re...",ZG16B TLR2,"<span title=""zymogen granule protein 16B"">ZG16...","<span title=""toll like receptor 2"">TLR2</span>","<span title=""ZG16B (HRPE773, PRO1567, JCLN2)"">...","<span title=""TLR2 (TIL4, CD282)"">TLR2 (TIL4, C...","<span title=""based on perplexity, uniprot"">cel...","<span title=""based on hpa, perplexity"">cell me...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.perplexity.ai/search?q=Wh...","DIRECT, CONSERVATION"
3546,CDB03547,"<a href=""https://comp.med.yokohama-cu.ac.jp/re...",ZG16B TLR4,"<span title=""zymogen granule protein 16B"">ZG16...","<span title=""toll like receptor 4"">TLR4</span>","<span title=""ZG16B (HRPE773, PRO1567, JCLN2)"">...","<span title=""TLR4 (hToll, CD284, TLR-4, ARMD10...","<span title=""based on perplexity, uniprot"">cel...","<span title=""based on hpa, perplexity, uniprot...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.perplexity.ai/search?q=Wh...","DIRECT, CONSERVATION"
3547,CDB03548,"<a href=""https://comp.med.yokohama-cu.ac.jp/re...",ZG16B TLR5,"<span title=""zymogen granule protein 16B"">ZG16...","<span title=""toll like receptor 5"">TLR5</span>","<span title=""ZG16B (HRPE773, PRO1567, JCLN2)"">...","<span title=""TLR5 (SLEB1, TIL3, FLJ10052, MGC1...","<span title=""based on perplexity, uniprot"">cel...","<span title=""based on perplexity, uniprot"">cel...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.perplexity.ai/search?q=Wh...",DIRECT
3548,CDB03549,"<a href=""https://comp.med.yokohama-cu.ac.jp/re...",ZG16B TLR6,"<span title=""zymogen granule protein 16B"">ZG16...","<span title=""toll like receptor 6"">TLR6</span>","<span title=""ZG16B (HRPE773, PRO1567, JCLN2)"">...","<span title=""TLR6 (CD286)"">TLR6 (CD286)</span>","<span title=""based on perplexity, uniprot"">cel...","<span title=""based on perplexity, uniprot"">cel...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.perplexity.ai/search?q=Wh...","DIRECT, CONSERVATION"


In [124]:
mapping_orth_symbol = dict(zip(ref['ensembl_gene_id'], ref['external_gene_name']))

In [125]:
gene_pair_orth = fetchGSheet.safe_fetch(fetchGSheet.sheet_ID, f"FROZEN_{species_lower}", fetchGSheet.credentials_file)

In [126]:
gene_pair_orth.columns

Index(['Sorting order', 'LR_pair_orig', 'PMID', 'lig_species', 'rec_species',
       'ligand_orig', 'receptor_orig', 'Pair_species', 'LR Pair Card',
       'Sheep evidence', 'Sheep_ligand', 'Sheep_receptor', 'ENSEMBL ligand',
       'ENSEMBL receptor'],
      dtype='object', name=0)

In [127]:
gene_pair_orth['Ligand Official Symbol'] = 'NA'
gene_pair_orth['Ligand Official Symbol'] =  gene_pair_orth.apply(
    lambda row: mapping_orth_symbol.get(row[f'{id} ligand'], row['Ligand Official Symbol'])
    if pd.notna(row[f'{id} ligand']) else row['Ligand Official Symbol'],
    axis=1
)
gene_pair_orth['Receptor Official Symbol'] = 'NA'
gene_pair_orth['Receptor Official Symbol'] =  gene_pair_orth.apply(
    lambda row: mapping_orth_symbol.get(row[f'{id} receptor'], row['Receptor Official Symbol'])
    if pd.notna(row[f'{id} receptor']) else row['Receptor Official Symbol'],
    axis=1
)
gene_pair_orth["same_as_off_lig"] = (
    gene_pair_orth[f"{species}_ligand"] == gene_pair_orth["Ligand Official Symbol"]
)

gene_pair_orth["same_as_off_rec"] = (
    gene_pair_orth[f"{species}_receptor"] == gene_pair_orth["Receptor Official Symbol"]
)

In [128]:
gene_pair_orth.to_csv(f"data/{species}_{id}_ID_check.csv")

In [121]:
species_info

NameError: name 'species_info' is not defined

In [69]:
## Function to prepare datatables (cleaning and hyperlinking, adding tool tips, etc) input for the database qmds
import sys, os
from itables import init_notebook_mode
import pandas as pd
from itables import show
from itables import options
from IPython.display import HTML, display
import numpy as np
import re
from bs4 import BeautifulSoup
from createDataTable import gene_pair, gene_pair000, human_columns, lrPairsCount
import warnings
import fetchGSheet 

# Suppress SettingWithCopyWarning
warnings.simplefilter("ignore", category=UserWarning)

def process_species_gene_pair(species, fetchGSheet, gene_pair):
    species_name = {
        "Mouse": "mmusculus",
        "Rat": "rnorvegicus",
        "Zebrafish":"drerio" ,
        "Chimpanzee":"ptroglodytes",
        "Chicken":"ggallus",
        "Pig":"sscrofa",
        "Cow":"btaurus",
        "Dog":"clfamiliaris",
        "Horse":"ecaballus",
        "Sheep":"oarambouillet",
        "Marmoset": "cjacchus" ,
        "Macaque": "mmulatta",
        "Frog": "xtropicalis"
    }.get(species, "Unknown species")

    if species == "Mouse":
        species_id = "MGI"
        species_info = pd.read_csv(f"data/MRK_Merged_{species_id}_DB.tsv", sep="\t", dtype=str)
    elif species == "Rat":
        species_id = "RGD"
        species_info = pd.read_csv(f"data/GENES_RAT_{species_id}_DB.tsv", sep="\t", dtype=str)
        # combine all known aliases also old ones
        species_info["ALIASES"] = species_info[["MARKER_SYMBOL", "OLD_SYMBOL"]].apply(
            lambda row: ";".join(pd.unique(row.dropna().astype(str))), axis=1
        )
    elif species == "Frog":
        species_id = "XEN"
        species_info = pd.read_csv(f"data/GenePageGeneralInfo_{species_id.capitalize()}base_DB.tsv", sep="\t", dtype=str)
    elif species == "Zebrafish":
        species_id = "ZFIN"
        species_info = pd.read_csv(f"data/Zebrafish_merged_{species_id}_DB.tsv", sep="\t", dtype=str)
        # combine all known aliases also old ones
        species_info["Aliases"] = species_info[["Current Name", "Previous Name"]].apply(
            lambda row: ", ".join(pd.unique(row.dropna().astype(str))), axis=1
        )
    else: 
        species_id = "ENSEMBL"
        species_info = pd.read_csv(f"data/hsapiens_ID_biomart_{species_name}_centric.csv", dtype=str)
    
    
    species_lower = species.lower()
    
    if species == "Mouse":
        gene_pair_species = getattr(fetchGSheet, f"gene_pair_{species_lower}")  
    else:
        gene_pair_species = fetchGSheet.safe_fetch(fetchGSheet.sheet_ID, f"FROZEN_{species_lower}", fetchGSheet.credentials_file)
    
    def extract_visible_text(col):
        """Extract visible text between '>' and '</a>'."""
        match = re.search(r'>([^<]+)</a>', col)
        if match:
            return match.group(1).strip()
        return None
    
    # Grab the Interaction ID and 
    gene_pair = gene_pair.iloc[:, :9]
    # Next, drop columns at index positions 3 and 4 ("Ligand and Receptor" since we already have ligand symbols and receptor symbols)
    gene_pair = gene_pair.drop(gene_pair.columns[[3, 4]], axis=1)
    exclude_keywords = ["HGNC ID", "Location", "Human"]  # Columns containing this will not be modified
    keywords_to_modify = ["Ligand Symbols", "Receptor Symbols"]
    # Copy the original columns so we can modify only the first 10
    new_columns = gene_pair.columns.tolist()
    
    # Modify only the first 10 columns
    new_columns = [
        f'{col.split(">")[0]}">Human {col.split(">")[1]}</span>'
        if any(keyword in col for keyword in keywords_to_modify) and not any(exclude in col for exclude in exclude_keywords)
        else col
        for col in new_columns
    ]
    # Assign the modified column names back to the DataFrame
    gene_pair.columns = new_columns
    
    gene_pair["LR Pair Card"] = gene_pair.iloc[:, 1].apply(extract_visible_text)
    ligand_symbols_col = [col for col in gene_pair.columns if "Ligand Symbols" in col][0]
    receptor_symbols_col = [col for col in gene_pair.columns if "Receptor Symbols" in col][0]
    gene_pair = gene_pair.rename(columns={ligand_symbols_col: "Human Ligand Symbols",
                                          receptor_symbols_col: "Human Receptor Symbols"})
    
    
    gene_pair_species = gene_pair_species[[
        "LR Pair Card",
        f"{species}_ligand",
        f"{species}_receptor",
        f"{species} evidence",
        f"{species_id} ligand",
        f"{species_id} receptor",
        "PMID"
    ]]
    
    
    gene_pair =gene_pair_species.merge(gene_pair,how="left", on="LR Pair Card")
    
    if species == "Mouse":
        spec_id = f"{species_id} Marker Accession ID"
        spec_name = "Marker Name"
        spec_alias = "Aliases"
    elif species == "Rat":
        spec_id = f"GENE_{species_id}_ID"
        spec_name = "NAME"
        spec_alias = "ALIASES"
    elif species == "Zebrafish":
        spec_id = f"{species_id}_ID"
        spec_name = "Current Name"
        spec_alias = "Aliases"
    elif species == "Frog":
        spec_id = "tropicalis gene ID"
        spec_name = "gene name"
        spec_alias = "gene synonyms"
    else:
        spec_id = "ensembl_gene_id"
        spec_name = "external_gene_name"
        spec_alias = "external_synonym"
    
    
    species_info = species_info[[spec_id, spec_name,spec_alias]]
    
    if species in ["Mouse", "Frog"]:
        species_info[spec_alias] = species_info[spec_alias].str.replace("|", ", ", regex=False)
    elif species == "Rat":
        species_info[spec_alias] = species_info[spec_alias].str.replace(";", ", ", regex=False)

    
    gene_pair = gene_pair.merge(species_info,how="left", left_on = f"{species_id} ligand",right_on=spec_id)
    gene_pair = gene_pair.drop(columns=[spec_id])
    gene_pair = gene_pair.rename(columns={
                                          spec_name: "Ligand Name",
                                          spec_alias: "Ligand Symbols"
                                         }
                                )
    gene_pair = gene_pair.merge(
        species_info,
        how="left",
        left_on=f"{species_id} receptor",
        right_on=spec_id
    )
    
    gene_pair = gene_pair.drop(columns=[spec_id])
    gene_pair = gene_pair.rename(columns={
                                          spec_name: "Receptor Name",
                                          spec_alias: "Receptor Symbols"
                                         }
                                )
    
    
    
    gene_pair[f"{species} LR Pair"] = np.where(
        gene_pair[f"{species} evidence"] == "not conserved", 
        f"no {species_lower} ortholog",                                  
        gene_pair[f"{species}_ligand"] + " " + gene_pair[f"{species}_receptor"] 
    )
    
    gene_pair = gene_pair[~(gene_pair[f"{species} evidence"] == "not conserved")]
    def format_symbol_aliases(symbol, aliases):
        """
        Formats symbol, old symbols, and aliases.
        If the final formatted string would be empty after considering N/A values
        and empty inputs, it returns "species-specific".
        Otherwise, it formats based on the presence of old_symbol and aliases,
        removing unnecessary parentheses or commas, following the structure:
        "Symbol (Old Symbol, Aliases)" if both exist.
        """
        # Normalize inputs to empty strings if they are None/NaN or just whitespace
        symbol_str = str(symbol).strip()
        # old_symbol_str = str(old_symbol).strip()
        aliases_str = str(aliases).strip()
    
        # Filter out values that are empty strings or "N/A" for old_symbol and aliases
        parts_for_join = []
        # if old_symbol_str and old_symbol_str != "N/A":
        #     parts_for_join.append(old_symbol_str)
        if aliases_str and aliases_str != "N/A":
            parts_for_join.append(aliases_str)
    
        # Construct the preliminary result based on your original logic:
        # "symbol (old_symbol, aliases)" if parts_for_join is not empty, else "symbol"
        if parts_for_join:
            prelim_result = f"{symbol_str} ({', '.join(parts_for_join)})"
        else:
            prelim_result = symbol_str # Just the symbol if no old_symbol or aliases
    
        return prelim_result
    
    # This is crucial for consistent handling by the function before processing "N/A".
    gene_pair[f"{species}_ligand"] = gene_pair[f"{species}_ligand"].fillna('')
    gene_pair['Ligand Symbols'] = gene_pair['Ligand Symbols'].fillna('')
    
    gene_pair['Ligand Symbols'] = gene_pair.apply(
        lambda row: format_symbol_aliases(row[f"{species}_ligand"], row['Ligand Symbols']),
        axis=1
    )
    
    gene_pair[f"{species}_receptor"] = gene_pair[f"{species}_receptor"].fillna('')
    gene_pair['Receptor Symbols'] = gene_pair['Receptor Symbols'].fillna('')
    
    gene_pair['Receptor Symbols'] = gene_pair.apply(
        lambda row: format_symbol_aliases(row[f"{species}_receptor"], row['Receptor Symbols']),
        axis=1
    )
    # The list of columns to group by
    interaction_id_col = [col for col in gene_pair.columns if "Interaction ID" in col][0]
    
    grouping_cols = [
        interaction_id_col, f"{species} LR Pair" #, "Ligand Symbols", "Receptor Symbols" 
    ]
    
    aggregation_cols = [
        col for col in gene_pair.columns if col not in grouping_cols
    ]
    
    # 3. Create a dictionary mapping each aggregation column to the joining function
    agg_dict = {
        col: lambda x: ', '.join(x.astype(str).unique()) for col in aggregation_cols
    }
    
    # 4. Perform the groupby and aggregation
    gene_pair = gene_pair.groupby(grouping_cols).agg(agg_dict).reset_index()
    
    # make direct, conservation and conservation, direct the same
    gene_pair[f"{species} evidence"] = gene_pair[f"{species} evidence"].replace("CONSERVATION, DIRECT", "DIRECT, CONSERVATION")
    
    
    def generate_perplexity_link_pmid(row, species, species_lower): 
        query = (
            f"What-is-the-biological-relevance-of-the-ligand-and-receptor-pair-"
            f"{row[f'{species} LR Pair']}-based-on-Pubmed-ID-"
            f"{row['PMID']}-in-{species_lower}"
        )
        return (
             f'<a href="https://www.perplexity.ai/search?q={query}" target="_blank" style="text-decoration: none;">&#128269;</a>'
        )
    
    
    # Apply function to the DataFrame
    gene_pair["Perplexity"] = gene_pair.apply(
        generate_perplexity_link_pmid, axis=1, args=(species, species_lower)
    )
    
    
    gene_pair = gene_pair.rename(columns={
                                          f"{species_id} ligand": f"Ligand {species_id} ID",
                                          f"{species_id} receptor": f"Receptor {species_id} ID"
                                         }
                                )
    ligand_loc_col = [col for col in gene_pair.columns if "Ligand Location" in col][0]
    receptor_loc_col = [col for col in gene_pair.columns if "Receptor Location" in col][0]
    lr_pair_card = [col for col in gene_pair.columns if ">LR Pair Card" in col][0]
    # gene_pair.columns
    gene_pair["LR Pair Card"] = gene_pair[lr_pair_card]
    gene_pair = gene_pair[[interaction_id_col, "LR Pair Card",f"{species} LR Pair", 'Ligand Symbols', 'Receptor Symbols', ligand_loc_col, receptor_loc_col, f"Ligand {species_id} ID", f"Receptor {species_id} ID", "Perplexity", f"{species} evidence", "Human Ligand Symbols", "Human Receptor Symbols"]]
    if species == "Mouse":
        # Linkify multiple species IDs in Ligand column
        gene_pair[f"Ligand {species_id} ID"] = gene_pair[f"Ligand {species_id} ID"].apply(
            lambda cell: ", ".join(
                f'<a href="https://www.informatics.jax.org/marker/{mgi.strip()}" target="_blank">{mgi.strip()}</a>'
                for mgi in str(cell).split(", ")
                if mgi.strip()
            ) if pd.notna(cell) else ""
        )
        
        # Linkify multiple MGI IDs in Receptor column
        gene_pair[f"Receptor {species_id} ID"] = gene_pair[f"Receptor {species_id} ID"].apply(
            lambda cell: ", ".join(
                f'<a href="https://www.informatics.jax.org/marker/{mgi.strip()}" target="_blank">{mgi.strip()}</a>'
                for mgi in str(cell).split(", ")
                if mgi.strip()
            ) if pd.notna(cell) else ""
        )
        
    elif species == "Rat":
        # Linkify multiple RGD IDs in Receptor column
        gene_pair[f"Ligand {species_id} ID"] = gene_pair["Ligand RGD ID"].apply(
            lambda cell: ", ".join(
                f'<a href="https://rgd.mcw.edu/rgdweb/report/gene/main.html?id={rgd.strip()}" target="_blank">{"RGD:"+rgd.strip()}</a>'
                for rgd in str(cell).split(", ")
                if rgd.strip()
            ) if pd.notna(cell) else ""
        )
        
        
        # Linkify multiple RGD IDs in Receptor column
        gene_pair[f"Receptor {species_id} ID"] = gene_pair["Receptor RGD ID"].apply(
            lambda cell: ", ".join(
                f'<a href="https://rgd.mcw.edu/rgdweb/report/gene/main.html?id={rgd.strip()}" target="_blank">{"RGD:"+rgd.strip()}</a>'
                for rgd in str(cell).split(", ")
                if rgd.strip()
            ) if pd.notna(cell) else ""
        )
        
    elif species == "Zebrafish":
        # Linkify multiple Zebrafish Receptor IDs
        gene_pair[f"Ligand {species_id} ID"] = gene_pair['Ligand ZFIN ID'].apply(
            lambda cell: ", ".join(
                f'<a href="https://zfin.org/{zfin.strip()}" target="_blank">{zfin.strip()}</a>'
                for zfin in str(cell).split(", ")
                if zfin.strip()
            ) if pd.notna(cell) else ""
        )
        
        # Linkify multiple Zebrafish Ligand IDs
        gene_pair[f"Receptor {species_id} ID"] = gene_pair['Receptor ZFIN ID'].apply(
            lambda cell: ", ".join(
                f'<a href="https://zfin.org/{zfin.strip()}" target="_blank">{zfin.strip()}</a>'
                for zfin in str(cell).split(", ")
                if zfin.strip()
            ) if pd.notna(cell) else ""
        )
    elif species == "Frog":
        def make_xenbase_link(cell):
            links = []
            for xid in str(cell).split(","):
                xid = xid.strip()
                if xid.startswith("XB-GENE-"):
                    url = f"https://www.xenbase.org/xenbase/gene/showgene.do?method=display&geneId={xid}"
                    links.append(f'<a href="{url}" target="_blank">{xid}</a>')
            return ", ".join(links)

        gene_pair[f"Ligand {species_id} ID"] = gene_pair['Ligand XEN ID'].apply(make_xenbase_link)
        gene_pair[f"Receptor {species_id} ID"] = gene_pair['Receptor XEN ID'].apply(make_xenbase_link)
        
    else:
        def make_ens_link(cell):
            links = []
            for xid in str(cell).split(","):
                xid = eid.strip()
                if eid.startswith("ENS"):
                    url = f" http://www.ensembl.org/id/{xid}"
                    links.append(f'<a href="{url}" target="_blank">{eid}</a>')
            return ", ".join(links)

        gene_pair[f"Ligand {species_id} ID"] = gene_pair['Ligand ENSEMBL ID'].apply(make_ens_link)
        gene_pair[f"Receptor {species_id} ID"] = gene_pair['Receptor ENSEMBL ID'].apply(make_ens_link)

    
    ### tooltips 
    gene_pair["Ligand Symbols"] = [
        f'<span title="{aliases}">{aliases}</span>'
        for aliases in gene_pair["Ligand Symbols"]
    ]
    gene_pair["Receptor Symbols"] = [
        f'<span title="{aliases}">{aliases}</span>'
        for aliases in gene_pair["Receptor Symbols"]
    ]
    return gene_pair


mouse_gene_pair1 = process_species_gene_pair("Mouse", fetchGSheet, gene_pair)
rat_gene_pair1 = process_species_gene_pair("Rat", fetchGSheet, gene_pair)
zebrafish_gene_pair1 = process_species_gene_pair("Zebrafish", fetchGSheet, gene_pair)
frog_gene_pair1 = process_species_gene_pair("Frog", fetchGSheet, gene_pair)
chicken_gene_pair1 = process_species_gene_pair("Chicken", fetchGSheet, gene_pair)

NameError: name 'eid' is not defined

In [13]:
gene_pair0_copy = gene_pair0_copy.merge(conservation, how= 'left', on = "LR Pair Card")

In [14]:
gene_pair0_copy

Unnamed: 0,Interaction ID,LR Pair Card,Human LR Pair,Ligand,Receptor,Ligand Symbols,Receptor Symbols,Ligand Location,Receptor Location,Ligand HGNC ID,...,Human evidence,PMID,Ligand Name,Receptor Name,Mouse_ligand,Mouse_receptor,Ligand MGI ID,Receptor MGI ID,Direct,Conserved
0,CDB00001,A2M HSPA5,A2M HSPA5,A2M,HSPA5,"<span title=""A2M (FWP007, S863-7, CPAMD5)"">A2M...","<span title=""HSPA5 (GRP78, BiP)"">HSPA5 (GRP78,...","<span title=""based on perplexity, uniprot"">sec...","<span title=""based on PMID: 12194978, PMID: 32...","<a href=""https://www.genenames.org/data/gene-s...",...,DIRECT,"<a href=""https://comp.med.yokohama-cu.ac.jp/re...",alpha-2-macroglobulin,heat shock protein family A (Hsp70) member 5,,,,,Human,"Mouse, Rat, Frog, Zebrafish, Chicken, Macaque,..."
1,CDB00002,A2M LRP1,A2M LRP1,A2M,LRP1,"<span title=""A2M (FWP007, S863-7, CPAMD5)"">A2M...","<span title=""LRP1 (APR, A2MR, LRP, CD91, LRP1A...","<span title=""based on perplexity, uniprot"">sec...","<span title=""based on hpa, perplexity, uniprot...","<a href=""https://www.genenames.org/data/gene-s...",...,DIRECT,"<a href=""https://comp.med.yokohama-cu.ac.jp/re...",alpha-2-macroglobulin,LDL receptor related protein 1,,,,,Human,"Mouse, Rat, Frog, Zebrafish, Chicken, Macaque,..."
2,CDB00003,ACE BDKRB2,ACE BDKRB2,ACE,BDKRB2,"<span title=""ACE (DCP1, ACE1, CD143)"">ACE (DCP...","<span title=""BDKRB2 (BK-2)"">BDKRB2 (BK-2)</span>","<span title=""based on perplexity, uniprot"">cel...","<span title=""based on hpa, perplexity, uniprot...","<a href=""https://www.genenames.org/data/gene-s...",...,DIRECT,"<a href=""https://comp.med.yokohama-cu.ac.jp/re...",angiotensin I converting enzyme,bradykinin receptor B2,,,,,Human,"Mouse, Rat, Frog, Zebrafish, Chicken, Macaque,..."
3,CDB00004,ADA DPP4,ADA DPP4,ADA,DPP4,"<span title=""ADA (ADA1)"">ADA (ADA1)</span>","<span title=""DPP4 (CD26, ADCP2, DPPIV)"">DPP4 (...","<span title=""based on hpa, uniprot"">cell membr...","<span title=""based on perplexity, uniprot"">cel...","<a href=""https://www.genenames.org/data/gene-s...",...,CONSERVATION,"<a href=""https://comp.med.yokohama-cu.ac.jp/re...",adenosine deaminase,dipeptidyl peptidase 4,,,,,,"Human, Mouse, Rat, Frog, Zebrafish, Chicken, M..."
4,CDB00005,ADAM10 EFNA5,ADAM10 EFNA5,ADAM10,EFNA5,"<span title=""ADAM10 (kuz, MADM, HsT18717, CD15...","<span title=""EFNA5 (EPLG7, AF1, LERK7)"">EFNA5 ...","<span title=""based on hpa, perplexity, uniprot...","<span title=""based on perplexity, uniprot"">cel...","<a href=""https://www.genenames.org/data/gene-s...",...,CONSERVATION,"<a href=""https://comp.med.yokohama-cu.ac.jp/re...",ADAM metallopeptidase domain 10,ephrin A5,,,,,,"Human, Mouse, Rat, Frog, Zebrafish, Chicken, M..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3584,CDB03577,Pcdhb9 Pcdhb9,no human ortholog,not conserved,not conserved,"<span title=""no human ortholog"">no human ortho...","<span title=""no human ortholog"">no human ortho...","<span title=""based on uniprot"">cell membrane</...","<span title=""based on uniprot"">cell membrane</...","<a href=""https://www.genenames.org/data/gene-s...",...,not conserved,"<a href=""https://comp.med.yokohama-cu.ac.jp/re...",protocadherin beta 9,protocadherin beta 9,Pcdhb9,Pcdhb9,MGI:2136744,MGI:2136744,Mouse,Rat
3585,CDB03578,Pcdhgb8 Pcdhgb8,no human ortholog,not conserved,not conserved,"<span title=""no human ortholog"">no human ortho...","<span title=""no human ortholog"">no human ortho...","<span title=""based on perplexity, uniprot"">cel...","<span title=""based on perplexity, uniprot"">cel...","<a href=""https://www.genenames.org/data/gene-s...",...,not conserved,"<a href=""https://comp.med.yokohama-cu.ac.jp/re...","protocadherin gamma subfamily B, 8","protocadherin gamma subfamily B, 8",Pcdhgb8,Pcdhgb8,MGI:1935200,MGI:1935200,Mouse,
3586,CDB03579,Saa3 Tlr4,no human ortholog,not conserved,TLR4,"<span title=""no human ortholog"">no human ortho...","<span title=""TLR4 (hToll, CD284, TLR-4, ARMD10...","<span title=""based on perplexity, uniprot"">sec...","<span title=""based on perplexity, uniprot"">cel...","<a href=""https://www.genenames.org/data/gene-s...",...,not conserved,"<a href=""https://comp.med.yokohama-cu.ac.jp/re...",serum amyloid A 3,toll-like receptor 4,Saa3,Tlr4,MGI:98223,MGI:96824,Mouse,Zebrafish
3587,CDB03580,Sema4a Timd2,no human ortholog,SEMA4A,not conserved,"<span title=""SEMA4A (SEMAB, SemB, FLJ12287, CO...","<span title=""no human ortholog"">no human ortho...","<span title=""based on perplexity, uniprot"">cel...","<span title=""based on perplexity, uniprot"">cel...","<a href=""https://www.genenames.org/data/gene-s...",...,not conserved,"<a href=""https://comp.med.yokohama-cu.ac.jp/re...","sema domain, immunoglobulin domain (Ig), trans...",T cell immunoglobulin and mucin domain contain...,Sema4a,Timd2,MGI:107560,MGI:2159681,Mouse,"Rat, Frog, Horse"


In [11]:
conservation

Unnamed: 0,LR Pair Card,Direct,Conserved
0,A2M HSPA5,Human,"Mouse, Rat, Frog, Zebrafish, Chicken, Macaque,..."
1,A2M LRP1,Human,"Mouse, Rat, Frog, Zebrafish, Chicken, Macaque,..."
2,ACE BDKRB2,Human,"Mouse, Rat, Frog, Zebrafish, Chicken, Macaque,..."
3,ADA DPP4,,"Human, Mouse, Rat, Frog, Zebrafish, Chicken, M..."
4,ADAM10 EFNA5,,"Human, Mouse, Rat, Frog, Zebrafish, Chicken, M..."
...,...,...,...
3576,ZG16B TLR4,Human,"Human, Mouse, Rat, Frog, Macaque, Pig, Dog, Co..."
3577,ZG16B TLR5,Human,"Mouse, Rat, Frog, Macaque, Pig, Dog, Cow, Chim..."
3578,ZG16B TLR6,Human,"Human, Mouse, Rat, Frog, Macaque, Pig, Dog, Co..."
3579,ZP3 CHRNA7,Mouse,"Human, Rat, Frog, Zebrafish, Chicken, Macaque,..."


In [12]:
conservation.to_csv("data/test.csv")

In [23]:
## Function to prepare datatables (cleaning and hyperlinking, adding tool tips, etc) input for the database qmds
import sys, os
from itables import init_notebook_mode
import pandas as pd
from itables import show
from itables import options
from IPython.display import HTML, display
import numpy as np
import re
from bs4 import BeautifulSoup
from createDataTable import gene_pair, gene_pair000, human_columns, lrPairsCount
import warnings
import fetchGSheet 

# Suppress SettingWithCopyWarning
warnings.simplefilter("ignore", category=UserWarning)


species = "Mouse"
species_id = "MGI"
species_lower = species.lower()

def extract_visible_text(col):
    """Extract visible text between '>' and '</a>'."""
    match = re.search(r'>([^<]+)</a>', col)
    if match:
        return match.group(1).strip()
    return None

# Grab the Interaction ID and 
gene_pair = gene_pair.iloc[:, :9]
# Next, drop columns at index positions 3 and 4 ("Ligand and Receptor" since we already have ligand symbols and receptor symbols)
gene_pair = gene_pair.drop(gene_pair.columns[[3, 4]], axis=1)
exclude_keywords = ["HGNC ID", "Location", "Human"]  # Columns containing this will not be modified
keywords_to_modify = ["Ligand Symbols", "Receptor Symbols"]
# Copy the original columns so we can modify only the first 10
new_columns = gene_pair.columns.tolist()

# Modify only the first 10 columns
new_columns = [
    f'{col.split(">")[0]}">Human {col.split(">")[1]}</span>'
    if any(keyword in col for keyword in keywords_to_modify) and not any(exclude in col for exclude in exclude_keywords)
    else col
    for col in new_columns
]
# Assign the modified column names back to the DataFrame
gene_pair.columns = new_columns

gene_pair["LR Pair Card"] = gene_pair.iloc[:, 1].apply(extract_visible_text)
ligand_symbols_col = [col for col in gene_pair.columns if "Ligand Symbols" in col][0]
receptor_symbols_col = [col for col in gene_pair.columns if "Receptor Symbols" in col][0]
gene_pair = gene_pair.rename(columns={ligand_symbols_col: "Human Ligand Symbols",
                                      receptor_symbols_col: "Human Receptor Symbols"})

gene_pair_species = getattr(fetchGSheet, f"gene_pair_{species_lower}")  
gene_pair_species = gene_pair_species[[
    "LR Pair Card",
    f"{species}_ligand",
    f"{species}_receptor",
    f"{species} evidence",
    f"{species_id} ligand",
    f"{species_id} receptor",
    "PMID"
]]


gene_pair =gene_pair_species.merge(gene_pair,how="left", on="LR Pair Card")
species_info = pd.read_csv(f"data/MRK_Merged_{species_id}_DB.tsv", sep="\t", dtype=str)
species_info = species_info[[f"{species_id} Marker Accession ID", 'Marker Name','Aliases']]
species_info["Aliases"] = species_info["Aliases"].str.replace("|", ", ", regex=False)
gene_pair = gene_pair.merge(species_info,how="left", left_on = f"{species_id} ligand",right_on=f"{species_id} Marker Accession ID")
gene_pair = gene_pair.drop(columns=[f"{species_id} Marker Accession ID"])
gene_pair = gene_pair.rename(columns={
                                      "Marker Name": "Ligand Name",
                                      "Aliases": "Ligand Symbols"
                                     }
                            )
gene_pair = gene_pair.merge(
    species_info,
    how="left",
    left_on=f"{species_id} receptor",
    right_on=f"{species_id} Marker Accession ID"
)

gene_pair = gene_pair.drop(columns=[f"{species_id} Marker Accession ID"])
gene_pair = gene_pair.rename(columns={
                                      "Marker Name": "Receptor Name",
                                      "Aliases": "Receptor Symbols"
                                     }
                            )

gene_pair[f"{species} LR Pair"] = np.where(
    gene_pair[f"{species} evidence"] == "not conserved", 
    f"no {species_lower} ortholog",                                  
    gene_pair[f"{species}_ligand"] + " " + gene_pair[f"{species}_receptor"] 
)

gene_pair = gene_pair[~(gene_pair[f"{species} evidence"] == "not conserved")]
def format_symbol_aliases(symbol, aliases):
    """
    Formats symbol, old symbols, and aliases.
    If the final formatted string would be empty after considering N/A values
    and empty inputs, it returns "species-specific".
    Otherwise, it formats based on the presence of old_symbol and aliases,
    removing unnecessary parentheses or commas, following the structure:
    "Symbol (Old Symbol, Aliases)" if both exist.
    """
    # Normalize inputs to empty strings if they are None/NaN or just whitespace
    symbol_str = str(symbol).strip()
    # old_symbol_str = str(old_symbol).strip()
    aliases_str = str(aliases).strip()

    # Filter out values that are empty strings or "N/A" for old_symbol and aliases
    parts_for_join = []
    # if old_symbol_str and old_symbol_str != "N/A":
    #     parts_for_join.append(old_symbol_str)
    if aliases_str and aliases_str != "N/A":
        parts_for_join.append(aliases_str)

    # Construct the preliminary result based on your original logic:
    # "symbol (old_symbol, aliases)" if parts_for_join is not empty, else "symbol"
    if parts_for_join:
        prelim_result = f"{symbol_str} ({', '.join(parts_for_join)})"
    else:
        prelim_result = symbol_str # Just the symbol if no old_symbol or aliases

    return prelim_result

# This is crucial for consistent handling by the function before processing "N/A".
gene_pair[f"{species}_ligand"] = gene_pair[f"{species}_ligand"].fillna('')
gene_pair['Ligand Symbols'] = gene_pair['Ligand Symbols'].fillna('')

gene_pair['Ligand Symbols'] = gene_pair.apply(
    lambda row: format_symbol_aliases(row[f"{species}_ligand"], row['Ligand Symbols']),
    axis=1
)

gene_pair[f"{species}_receptor"] = gene_pair[f"{species}_receptor"].fillna('')
gene_pair['Receptor Symbols'] = gene_pair['Receptor Symbols'].fillna('')

gene_pair['Receptor Symbols'] = gene_pair.apply(
    lambda row: format_symbol_aliases(row[f"{species}_receptor"], row['Receptor Symbols']),
    axis=1
)
# The list of columns to group by
interaction_id_col = [col for col in gene_pair.columns if "Interaction ID" in col][0]

grouping_cols = [
    interaction_id_col, f"{species} LR Pair", "Ligand Symbols", "Receptor Symbols" 
]

aggregation_cols = [
    col for col in gene_pair.columns if col not in grouping_cols
]

# 3. Create a dictionary mapping each aggregation column to the joining function
agg_dict = {
    col: lambda x: ', '.join(x.astype(str).unique()) for col in aggregation_cols
}

# 4. Perform the groupby and aggregation
gene_pair = gene_pair.groupby(grouping_cols).agg(agg_dict).reset_index()

def generate_perplexity_link_pmid(row, species, species_lower): 
    query = (
        f"What-is-the-biological-relevance-of-the-ligand-and-receptor-pair-"
        f"{row[f'{species} LR Pair']}-based-on-Pubmed-ID-"
        f"{row['PMID']}-in-{species_lower}"
    )
    return (
         f'<a href="https://www.perplexity.ai/search?q={query}" target="_blank" style="text-decoration: none;">&#128269;</a>'
    )


# Apply function to the DataFrame
gene_pair["Perplexity"] = gene_pair.apply(
    generate_perplexity_link_pmid, axis=1, args=(species, species_lower)
)


gene_pair = gene_pair.rename(columns={
                                      f"{species_id} ligand": f"Ligand {species_id} ID",
                                      f"{species_id} receptor": f"Receptor {species_id} ID"
                                     }
                            )
ligand_loc_col = [col for col in gene_pair.columns if "Ligand Location" in col][0]
receptor_loc_col = [col for col in gene_pair.columns if "Receptor Location" in col][0]
lr_pair_card = [col for col in gene_pair.columns if ">LR Pair Card" in col][0]
# gene_pair.columns
gene_pair["LR Pair Card"] = gene_pair[lr_pair_card]

In [24]:
gene_pair

Unnamed: 0,"<span title=""Double-click header of Interaction ID to ensure all values are shown"">Interaction ID&nbsp;</span>",Mouse LR Pair,Ligand Symbols,Receptor Symbols,LR Pair Card,Mouse_ligand,Mouse_receptor,Mouse evidence,Ligand MGI ID,Receptor MGI ID,PMID,"<span title=""Double-click header of LR Pair Card to ensure all values are shown"">LR Pair Card&nbsp;</span>","<span title=""Ligand-Receptor Interacting Pair, as described in Liu et al. (PMID: XXXXXX)"">Human LR Pair</span>",Human Ligand Symbols,Human Receptor Symbols,"<span title=""Location based on the predicted subcellular localization of the human proteome"">Ligand Location</span>","<span title=""Location based on the predicted subcellular localization of the human proteome"">Receptor Location</span>",Ligand Name,Receptor Name,Perplexity
0,CDB00001,A2m Hspa5,A2m (A2mp),"Hspa5 (Hsce70, Bip, Grp78, Sez7, D2Wsu141e, D2...","<a href=""https://comp.med.yokohama-cu.ac.jp/co...",A2m,Hspa5,CONSERVATION,MGI:2449119,MGI:95835,"12194978, 32541810","<a href=""https://comp.med.yokohama-cu.ac.jp/co...",A2M HSPA5,"<span title=""A2M (FWP007, S863-7, CPAMD5)"">A2M...","<span title=""HSPA5 (GRP78, BiP)"">HSPA5 (GRP78,...","<span title=""based on perplexity, uniprot"">sec...","<span title=""based on PMID: 12194978, PMID: 32...",alpha-2-macroglobulin,heat shock protein 5,"<a href=""https://www.perplexity.ai/search?q=Wh..."
1,CDB00002,A2m Lrp1,A2m (A2mp),"Lrp1 (CD91, A2mr, b2b1554Clo)","<a href=""https://comp.med.yokohama-cu.ac.jp/co...",A2m,Lrp1,CONSERVATION,MGI:2449119,MGI:96828,"10652313, 12194978, 1702392","<a href=""https://comp.med.yokohama-cu.ac.jp/co...",A2M LRP1,"<span title=""A2M (FWP007, S863-7, CPAMD5)"">A2M...","<span title=""LRP1 (APR, A2MR, LRP, CD91, LRP1A...","<span title=""based on perplexity, uniprot"">sec...","<span title=""based on hpa, perplexity, uniprot...",alpha-2-macroglobulin,low density lipoprotein receptor-related prote...,"<a href=""https://www.perplexity.ai/search?q=Wh..."
2,CDB00003,Ace Bdkrb2,Ace (CD143),"Bdkrb2 (B2, B(2), kinin B2, BK2R, B2R)","<a href=""https://comp.med.yokohama-cu.ac.jp/co...",Ace,Bdkrb2,CONSERVATION,MGI:87874,MGI:102845,10748135,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...",ACE BDKRB2,"<span title=""ACE (DCP1, ACE1, CD143)"">ACE (DCP...","<span title=""BDKRB2 (BK-2)"">BDKRB2 (BK-2)</span>","<span title=""based on perplexity, uniprot"">cel...","<span title=""based on hpa, perplexity, uniprot...",angiotensin I converting enzyme,"bradykinin receptor, beta 2","<a href=""https://www.perplexity.ai/search?q=Wh..."
3,CDB00004,Ada Dpp4,Ada,"Dpp4 (THAM, Dpp-4, Cd26)","<a href=""https://comp.med.yokohama-cu.ac.jp/co...",Ada,Dpp4,CONSERVATION,MGI:87916,MGI:94919,15213224,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...",ADA DPP4,"<span title=""ADA (ADA1)"">ADA (ADA1)</span>","<span title=""DPP4 (CD26, ADCP2, DPPIV)"">DPP4 (...","<span title=""based on hpa, uniprot"">cell membr...","<span title=""based on perplexity, uniprot"">cel...",adenosine deaminase,dipeptidylpeptidase 4,"<a href=""https://www.perplexity.ai/search?q=Wh..."
4,CDB00005,Adam10 Efna5,"Adam10 (kuz, kuzbanian, 1700031C13Rik)","Efna5 (AL-1, RAGS, Ephrin-A5, Epl7, EFL-5, LER...","<a href=""https://comp.med.yokohama-cu.ac.jp/co...",Adam10,Efna5,CONSERVATION,MGI:109548,MGI:107444,16239146,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...",ADAM10 EFNA5,"<span title=""ADAM10 (kuz, MADM, HsT18717, CD15...","<span title=""EFNA5 (EPLG7, AF1, LERK7)"">EFNA5 ...","<span title=""based on hpa, perplexity, uniprot...","<span title=""based on perplexity, uniprot"">cel...",a disintegrin and metallopeptidase domain 10,ephrin A5,"<a href=""https://www.perplexity.ai/search?q=Wh..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3251,CDB03577,Pcdhb9 Pcdhb9,"Pcdhb9 (Pcdhb4C, PcdhbI)","Pcdhb9 (Pcdhb4C, PcdhbI)","<a href=""https://comp.med.yokohama-cu.ac.jp/co...",Pcdhb9,Pcdhb9,DIRECT,MGI:2136744,MGI:2136744,25171406,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...",no human ortholog,"<span title=""no human ortholog"">no human ortho...","<span title=""no human ortholog"">no human ortho...","<span title=""based on uniprot"">cell membrane</...","<span title=""based on uniprot"">cell membrane</...",protocadherin beta 9,protocadherin beta 9,"<a href=""https://www.perplexity.ai/search?q=Wh..."
3252,CDB03578,Pcdhgb8 Pcdhgb8,Pcdhgb8,Pcdhgb8,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...",Pcdhgb8,Pcdhgb8,DIRECT,MGI:1935200,MGI:1935200,25171406,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...",no human ortholog,"<span title=""no human ortholog"">no human ortho...","<span title=""no human ortholog"">no human ortho...","<span title=""based on perplexity, uniprot"">cel...","<span title=""based on perplexity, uniprot"">cel...","protocadherin gamma subfamily B, 8","protocadherin gamma subfamily B, 8","<a href=""https://www.perplexity.ai/search?q=Wh..."
3253,CDB03579,Saa3 Tlr4,"Saa3 (Saa-3, l7R3)","Tlr4 (Lps, Rasl2-8, lipopolysaccharide response)","<a href=""https://comp.med.yokohama-cu.ac.jp/co...",Saa3,Tlr4,DIRECT,MGI:98223,MGI:96824,23858030,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...",no human ortholog,"<span title=""no human ortholog"">no human ortho...","<span title=""TLR4 (hToll, CD284, TLR-4, ARMD10...","<span title=""based on perplexity, uniprot"">sec...","<span title=""based on perplexity, uniprot"">cel...",serum amyloid A 3,toll-like receptor 4,"<a href=""https://www.perplexity.ai/search?q=Wh..."
3254,CDB03580,Sema4a Timd2,"Sema4a (SemB, SemB, Semab)","Timd2 (Tim2, TIM-2)","<a href=""https://comp.med.yokohama-cu.ac.jp/co...",Sema4a,Timd2,DIRECT,MGI:107560,MGI:2159681,12374982,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...",no human ortholog,"<span title=""SEMA4A (SEMAB, SemB, FLJ12287, CO...","<span title=""no human ortholog"">no human ortho...","<span title=""based on perplexity, uniprot"">cel...","<span title=""based on perplexity, uniprot"">cel...","sema domain, immunoglobulin domain (Ig), trans...",T cell immunoglobulin and mucin domain contain...,"<a href=""https://www.perplexity.ai/search?q=Wh..."


In [2]:
## Function to prepare datatables (cleaning and hyperlinking, adding tool tips, etc) input for the database qmds
import sys, os
from itables import init_notebook_mode
import pandas as pd
from itables import show
from itables import options
from IPython.display import HTML, display
import numpy as np
import fetchGSheet 
import warnings
import urllib.parse

# Suppress SettingWithCopyWarning
warnings.simplefilter("ignore", category=UserWarning)


# Other vertebrates
species_list = [
    "mmusculus", "rnorvegicus", "drerio", "ptroglodytes", "ggallus", "sscrofa", "btaurus", 
    "clfamiliaris", "ecaballus", "oarambouillet",
    "cjacchus", "mmulatta", "xtropicalis"
]

# Select only the relevant columns from pop_up_info
cols_to_keep = cols_to_keep = list(range(0, 30)) 
# Step 3: Load file using only the desired columns
df = pd.read_table("data/HGNC_gene_info_full.tsv", usecols=cols_to_keep)
pop_up_info = pd.read_table("data/HGNC_gene_info_full.tsv")
pop_up_info = pop_up_info.rename(columns={"hgnc_id": "HGNC ID", 
                                          "name": "Approved name",
                                          "symbol": "Approved symbol",
                                          #"rgd_id": "RGD ID",
                                          #"mgd_id": "MGI ID", 
                                          "alias_symbol": "Alias symbol", # add to table
                                          "prev_symbol": "Previous symbol", # add to table
                                          "date_symbol_changed": "Date symbol changed"
                                         })

# Keep only first MGI/RGD ID
#pop_up_info["MGI ID"] = pop_up_info["MGI ID"].str.split("|").str[0]
#pop_up_info["RGD ID"] = pop_up_info["RGD ID"].str.split("|").str[0]

pop_up_info["Alias symbol"] = pop_up_info["Alias symbol"].apply(
    lambda x: "N/A" if pd.isna(x) or str(x).strip().lower() in ["nan", "none", ""] else x
)

pop_up_info["Previous symbol"] = pop_up_info["Previous symbol"].apply(
    lambda x: "N/A" if pd.isna(x) or str(x).strip().lower() in ["nan", "none", ""] else x
)

# Replace "|" with ", "
pop_up_info["Alias symbol"] = [value.replace("|", ", ") for value in pop_up_info["Alias symbol"]]
pop_up_info["Previous symbol"] = [value.replace("|", ", ") for value in pop_up_info["Previous symbol"]]

pop_up_info["Date symbol changed"] = pop_up_info["Date symbol changed"].apply(
    lambda x: "N/A" if pd.isna(x) or str(x).strip().lower() in ["nan", "none", ""] else x
)


pop_up_info_lim = pop_up_info[["HGNC ID", "Approved name", "Alias symbol", # "MGI ID", "RGD ID"
                               "Approved symbol", "Previous symbol"]] # rm "Approved symbol" for now
pop_up_info_lim = pop_up_info_lim.drop_duplicates(subset="HGNC ID", keep="first")

# Drop columns where all values are NA in gene_pair
gene_pair = fetchGSheet.gene_pair_human.dropna(axis=1, how='all')
gene_pair.columns

  pop_up_info = pd.read_table("data/HGNC_gene_info_full.tsv")


Index(['Sorting order', 'LR_pair_orig', 'PMID', 'lig_species', 'rec_species',
       'ligand_orig', 'receptor_orig', 'Pair_species', 'LR Pair Card',
       'Human evidence', 'Homo sapiens_ligand', 'Homo sapiens_receptor',
       'ENSEMBL ligand', 'ENSEMBL receptor', 'HGNC ligand', 'HGNC receptor'],
      dtype='object', name=0)

In [3]:
gene_pair = gene_pair[gene_pair['LR_pair_orig'] != '']

In [59]:
from pathlib import Path
import sys
import pandas as pd
import re
import html
import json
from jinja2 import Environment, FileSystemLoader

# === Import from createDataTable.py
sys.path.append("src")
import createDataTable_perSpecies

# === Paths (relative to project/)
species = "mouse"
# additional placeholder for search
species_addl_search = "Epha3, 5430401F13Rik, no human ortholog"
output_json = Path(f"JSON/{species}_gene_pair.json")  
qmd_template = Path(f"database/qmd_template/{species}Orth_template.qmd") 
qmd_output = Path(f"database/{species}Orth.qmd")  
template_dir = "HTML"
template_name = "datatableOrth_template.html"

# === Create output directories if needed
output_json.parent.mkdir(parents=True, exist_ok=True)

# === Clean column names and Save DataFrame to JSON
def clean_column_names_and_generate_metadata(df):
    def visible_text(html_string):
        return re.sub(r'<[^>]*>', '', html.unescape(html_string)).strip()
    
    raw_columns = df.columns.tolist()
    visible_columns = [visible_text(col) for col in raw_columns]
    
    df_cleaned = df.copy()
    df_cleaned.columns = visible_columns
    
    column_metadata = [
        {
            "data": visible,
            "title": html_col
        }
        for visible, html_col in zip(visible_columns, raw_columns)
    ]
    return df_cleaned, column_metadata

# === Generate DataTables column definitions
# Use getattr to dynamically access the attribute
gene_pair = getattr(createDataTable_perSpecies, f"{species}_gene_pair1")  

In [61]:
df_cleaned, columns_metadata = clean_column_names_and_generate_metadata(gene_pair)

In [65]:
df_cleaned.columns

Index(['Interaction ID', 'LR Pair Card', 'Mouse LR Pair', 'Ligand Symbols',
       'Receptor Symbols', 'Ligand Location', 'Receptor Location',
       'Ligand MGI ID', 'Receptor MGI ID', 'Perplexity', 'Mouse evidence',
       'Human Ligand Symbols', 'Human Receptor Symbols'],
      dtype='object')

In [56]:
# Linkify multiple MGI IDs in Ligand column
gene_pair["Ligand MGI ID"] = gene_pair["Ligand MGI ID"].apply(
    lambda cell: ", ".join(
        f'<a href="https://www.informatics.jax.org/marker/{mgi.strip()}" target="_blank">{mgi.strip()}</a>'
        for mgi in str(cell).split(", ")
        if mgi.strip()
    ) if pd.notna(cell) else ""
)

# Linkify multiple MGI IDs in Receptor column
gene_pair["Receptor MGI ID"] = gene_pair["Receptor MGI ID"].apply(
    lambda cell: ", ".join(
        f'<a href="https://www.informatics.jax.org/marker/{mgi.strip()}" target="_blank">{mgi.strip()}</a>'
        for mgi in str(cell).split(", ")
        if mgi.strip()
    ) if pd.notna(cell) else ""
)
gene_pair

Unnamed: 0,"<span title=""Double-click header of Interaction ID to ensure all values are shown"">Interaction ID&nbsp;</span>",LR Pair Card,Mouse LR Pair,Ligand Symbols,Receptor Symbols,"<span title=""Location based on the predicted subcellular localization of the human proteome"">Ligand Location</span>","<span title=""Location based on the predicted subcellular localization of the human proteome"">Receptor Location</span>",Ligand MGI ID,Receptor MGI ID,Perplexity,Mouse evidence,Human Ligand Symbols,Human Receptor Symbols
0,CDB00001,A2M HSPA5,A2m Hspa5,A2m (A2mp),"Hspa5 (Hsce70, Bip, Grp78, Sez7, D2Wsu141e, D2...","<span title=""based on perplexity, uniprot"">sec...","<span title=""based on PMID: 12194978, PMID: 32...","<a href=""https://www.informatics.jax.org/marke...","<a href=""https://www.informatics.jax.org/marke...","<a href=""https://www.perplexity.ai/search?q=Wh...",CONSERVATION,"<span title=""A2M (FWP007, S863-7, CPAMD5)"">A2M...","<span title=""HSPA5 (GRP78, BiP)"">HSPA5 (GRP78,..."
1,CDB00002,A2M LRP1,A2m Lrp1,A2m (A2mp),"Lrp1 (CD91, A2mr, b2b1554Clo)","<span title=""based on perplexity, uniprot"">sec...","<span title=""based on hpa, perplexity, uniprot...","<a href=""https://www.informatics.jax.org/marke...","<a href=""https://www.informatics.jax.org/marke...","<a href=""https://www.perplexity.ai/search?q=Wh...",CONSERVATION,"<span title=""A2M (FWP007, S863-7, CPAMD5)"">A2M...","<span title=""LRP1 (APR, A2MR, LRP, CD91, LRP1A..."
2,CDB00003,ACE BDKRB2,Ace Bdkrb2,Ace (CD143),"Bdkrb2 (B2, B(2), kinin B2, BK2R, B2R)","<span title=""based on perplexity, uniprot"">cel...","<span title=""based on hpa, perplexity, uniprot...","<a href=""https://www.informatics.jax.org/marke...","<a href=""https://www.informatics.jax.org/marke...","<a href=""https://www.perplexity.ai/search?q=Wh...",CONSERVATION,"<span title=""ACE (DCP1, ACE1, CD143)"">ACE (DCP...","<span title=""BDKRB2 (BK-2)"">BDKRB2 (BK-2)</span>"
3,CDB00004,ADA DPP4,Ada Dpp4,Ada,"Dpp4 (THAM, Dpp-4, Cd26)","<span title=""based on hpa, uniprot"">cell membr...","<span title=""based on perplexity, uniprot"">cel...","<a href=""https://www.informatics.jax.org/marke...","<a href=""https://www.informatics.jax.org/marke...","<a href=""https://www.perplexity.ai/search?q=Wh...",CONSERVATION,"<span title=""ADA (ADA1)"">ADA (ADA1)</span>","<span title=""DPP4 (CD26, ADCP2, DPPIV)"">DPP4 (..."
4,CDB00005,ADAM10 EFNA5,Adam10 Efna5,"Adam10 (kuz, kuzbanian, 1700031C13Rik)","Efna5 (AL-1, RAGS, Ephrin-A5, Epl7, EFL-5, LER...","<span title=""based on hpa, perplexity, uniprot...","<span title=""based on perplexity, uniprot"">cel...","<a href=""https://www.informatics.jax.org/marke...","<a href=""https://www.informatics.jax.org/marke...","<a href=""https://www.perplexity.ai/search?q=Wh...",CONSERVATION,"<span title=""ADAM10 (kuz, MADM, HsT18717, CD15...","<span title=""EFNA5 (EPLG7, AF1, LERK7)"">EFNA5 ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3251,CDB03577,Pcdhb9 Pcdhb9,Pcdhb9 Pcdhb9,"Pcdhb9 (Pcdhb4C, PcdhbI)","Pcdhb9 (Pcdhb4C, PcdhbI)","<span title=""based on uniprot"">cell membrane</...","<span title=""based on uniprot"">cell membrane</...","<a href=""https://www.informatics.jax.org/marke...","<a href=""https://www.informatics.jax.org/marke...","<a href=""https://www.perplexity.ai/search?q=Wh...",DIRECT,"<span title=""no human ortholog"">no human ortho...","<span title=""no human ortholog"">no human ortho..."
3252,CDB03578,Pcdhgb8 Pcdhgb8,Pcdhgb8 Pcdhgb8,Pcdhgb8,Pcdhgb8,"<span title=""based on perplexity, uniprot"">cel...","<span title=""based on perplexity, uniprot"">cel...","<a href=""https://www.informatics.jax.org/marke...","<a href=""https://www.informatics.jax.org/marke...","<a href=""https://www.perplexity.ai/search?q=Wh...",DIRECT,"<span title=""no human ortholog"">no human ortho...","<span title=""no human ortholog"">no human ortho..."
3253,CDB03579,Saa3 Tlr4,Saa3 Tlr4,"Saa3 (Saa-3, l7R3)","Tlr4 (Lps, Rasl2-8, lipopolysaccharide response)","<span title=""based on perplexity, uniprot"">sec...","<span title=""based on perplexity, uniprot"">cel...","<a href=""https://www.informatics.jax.org/marke...","<a href=""https://www.informatics.jax.org/marke...","<a href=""https://www.perplexity.ai/search?q=Wh...",DIRECT,"<span title=""no human ortholog"">no human ortho...","<span title=""TLR4 (hToll, CD284, TLR-4, ARMD10..."
3254,CDB03580,Sema4a Timd2,Sema4a Timd2,"Sema4a (SemB, SemB, Semab)","Timd2 (Tim2, TIM-2)","<span title=""based on perplexity, uniprot"">cel...","<span title=""based on perplexity, uniprot"">cel...","<a href=""https://www.informatics.jax.org/marke...","<a href=""https://www.informatics.jax.org/marke...","<a href=""https://www.perplexity.ai/search?q=Wh...",DIRECT,"<span title=""SEMA4A (SEMAB, SemB, FLJ12287, CO...","<span title=""no human ortholog"">no human ortho..."


In [47]:
gene_pair.columns

Index(['LR Pair Card', 'Mouse_ligand', 'Mouse_receptor', 'Mouse evidence',
       'MGI ligand', 'MGI receptor', 'PMID',
       '<span title="Double-click header of Interaction ID to ensure all values are shown">Interaction ID&nbsp;</span>',
       '<span title="Double-click header of LR Pair Card to ensure all values are shown">LR Pair Card&nbsp;</span>',
       '<span title="Ligand-Receptor Interacting Pair, as described in Liu et al. (PMID: XXXXXX)">Human LR Pair</span>',
       'Human Ligand Symbols', 'Human Receptor Symbols',
       '<span title="Location based on the predicted subcellular localization of the human proteome">Ligand Location</span>',
       '<span title="Location based on the predicted subcellular localization of the human proteome">Receptor Location</span>',
       'Ligand Name', 'Ligand Symbols', 'Receptor Name', 'Receptor Symbols',
       'Mouse LR Pair'],
      dtype='object')

Index(['<span title="Double-click header of Interaction ID to ensure all values are shown">Interaction ID&nbsp;</span>',
       'Mouse LR Pair', 'Ligand Symbols', 'Receptor Symbols', 'LR Pair Card',
       'Mouse_ligand', 'Mouse_receptor', 'Mouse evidence', 'Ligand MGI ID',
       'Receptor MGI ID', 'PMID',
       '<span title="Double-click header of LR Pair Card to ensure all values are shown">LR Pair Card&nbsp;</span>',
       '<span title="Ligand-Receptor Interacting Pair, as described in Liu et al. (PMID: XXXXXX)">Human LR Pair</span>',
       'Human Ligand Symbols', 'Human Receptor Symbols',
       '<span title="Location based on the predicted subcellular localization of the human proteome">Ligand Location</span>',
       '<span title="Location based on the predicted subcellular localization of the human proteome">Receptor Location</span>',
       'Ligand Name', 'Receptor Name', 'Perplexity'],
      dtype='object')

In [52]:
gene_pair = gene_pair[[interaction_id_col, "LR Pair Card",'Mouse LR Pair', 'Ligand Symbols', 'Receptor Symbols', ligand_loc_col, receptor_loc_col, "Ligand MGI ID", "Receptor MGI ID", "Perplexity", "Mouse evidence", "Human Ligand Symbols", "Human Receptor Symbols"]]

In [57]:
gene_pair.to_csv("data/test.csv")

In [201]:
gene_pair = gene_pair[~(gene_pair["Mouse evidence"] == "not conserved")]

In [202]:
gene_pair

Unnamed: 0,LR Pair Card,Mouse_ligand,Mouse_receptor,Mouse evidence,MGI ligand,MGI receptor,"<span title=""Double-click header of Interaction ID to ensure all values are shown"">Interaction ID&nbsp;</span>","<span title=""Double-click header of LR Pair Card to ensure all values are shown"">LR Pair Card&nbsp;</span>","<span title=""Ligand-Receptor Interacting Pair, as described in Liu et al. (PMID: XXXXXX)"">Human LR Pair</span>",Human Ligand Symbols,Human Receptor Symbols,"<span title=""Location based on the predicted subcellular localization of the human proteome"">Ligand Location</span>","<span title=""Location based on the predicted subcellular localization of the human proteome"">Receptor Location</span>",MGI Marker Accession ID,Marker Name,Aliases,Mouse LR Pair
0,ASIP ATRN,a,Atrn,DIRECT,MGI:87853,MGI:1341628,CDB00229,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...",ASIP ATRN,"<span title=""ASIP (AGTIL, ASP)"">ASIP (AGTIL, A...","<span title=""ATRN (DPPT-L, MGCA)"">ATRN (DPPT-L...","<span title=""based on hpa, perplexity, uniprot...","<span title=""based on perplexity, uniprot"">cel...",MGI:87853,nonagouti,"agouti, As, agouti signal protein, ASP, Asip",a Atrn
1,A2M HSPA5,A2m,Hspa5,CONSERVATION,MGI:2449119,MGI:95835,CDB00001,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...",A2M HSPA5,"<span title=""A2M (FWP007, S863-7, CPAMD5)"">A2M...","<span title=""HSPA5 (GRP78, BiP)"">HSPA5 (GRP78,...","<span title=""based on perplexity, uniprot"">sec...","<span title=""based on PMID: 12194978, PMID: 32...",MGI:2449119,alpha-2-macroglobulin,A2mp,A2m Hspa5
2,A2M HSPA5,A2m,Hspa5,CONSERVATION,MGI:2449119,MGI:95835,CDB00001,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...",A2M HSPA5,"<span title=""A2M (FWP007, S863-7, CPAMD5)"">A2M...","<span title=""HSPA5 (GRP78, BiP)"">HSPA5 (GRP78,...","<span title=""based on perplexity, uniprot"">sec...","<span title=""based on PMID: 12194978, PMID: 32...",MGI:2449119,alpha-2-macroglobulin,A2mp,A2m Hspa5
3,A2M LRP1,A2m,Lrp1,CONSERVATION,MGI:2449119,MGI:96828,CDB00002,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...",A2M LRP1,"<span title=""A2M (FWP007, S863-7, CPAMD5)"">A2M...","<span title=""LRP1 (APR, A2MR, LRP, CD91, LRP1A...","<span title=""based on perplexity, uniprot"">sec...","<span title=""based on hpa, perplexity, uniprot...",MGI:2449119,alpha-2-macroglobulin,A2mp,A2m Lrp1
4,A2M LRP1,A2m,Lrp1,CONSERVATION,MGI:2449119,MGI:96828,CDB00002,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...",A2M LRP1,"<span title=""A2M (FWP007, S863-7, CPAMD5)"">A2M...","<span title=""LRP1 (APR, A2MR, LRP, CD91, LRP1A...","<span title=""based on perplexity, uniprot"">sec...","<span title=""based on hpa, perplexity, uniprot...",MGI:2449119,alpha-2-macroglobulin,A2mp,A2m Lrp1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6089,ZG16B TLR5,Sbpl,Tlr5,CONSERVATION,MGI:3694550,MGI:1858171,CDB03548,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...",ZG16B TLR5,"<span title=""ZG16B (HRPE773, PRO1567, JCLN2)"">...","<span title=""TLR5 (SLEB1, TIL3, FLJ10052, MGC1...","<span title=""based on perplexity, uniprot"">cel...","<span title=""based on perplexity, uniprot"">cel...",MGI:3694550,spermine binding protein-like,2310068J22Rik,Sbpl Tlr5
6090,ZG16B TLR6,Sbpl,Tlr6,CONSERVATION,MGI:3694550,MGI:1341296,CDB03549,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...",ZG16B TLR6,"<span title=""ZG16B (HRPE773, PRO1567, JCLN2)"">...","<span title=""TLR6 (CD286)"">TLR6 (CD286)</span>","<span title=""based on perplexity, uniprot"">cel...","<span title=""based on perplexity, uniprot"">cel...",MGI:3694550,spermine binding protein-like,2310068J22Rik,Sbpl Tlr6
6091,ZG16B TLR6,Sbpl,Tlr6,CONSERVATION,MGI:3694550,MGI:1341296,CDB03549,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...",ZG16B TLR6,"<span title=""ZG16B (HRPE773, PRO1567, JCLN2)"">...","<span title=""TLR6 (CD286)"">TLR6 (CD286)</span>","<span title=""based on perplexity, uniprot"">cel...","<span title=""based on perplexity, uniprot"">cel...",MGI:3694550,spermine binding protein-like,2310068J22Rik,Sbpl Tlr6
6092,ZP3 CHRNA7,Zp3,Chrna7,DIRECT,MGI:99215,MGI:99779,CDB03550,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...",ZP3 CHRNA7,"<span title=""ZP3 (ZP3A, ZP3B, ZP3-424, ZP3-372...","<span title=""CHRNA7"">CHRNA7</span>","<span title=""based on uniprot"">cell membrane</...","<span title=""based on perplexity, uniprot"">cel...",MGI:99215,zona pellucida glycoprotein 3,Zp-3,Zp3 Chrna7


In [149]:
gene_pair_species = fetchGSheet.gene_pair_mouse

In [150]:
gene_pair_species= gene_pair_species[["LR Pair Card", "Mouse_ligand",
"Mouse_receptor", "MGI ligand", "MGI receptor"]]
gene_pair =gene_pair_species.merge(gene_pair,how="left", on="LR Pair Card")
mouse_info = pd.read_csv("data/MRK_Merged_MGI_DB.tsv", sep="\t", dtype=str)
mouse_info = mouse_info[['MGI Marker Accession ID', 'Marker Name','Aliases']]
gene_pair = gene_pair.merge(mouse_info,how="left", left_on = "MGI ligand",right_on="MGI Marker Accession ID")

In [155]:
mouse_info

Unnamed: 0,MGI Marker Accession ID,Marker Name,Aliases
0,MGI:1341858,"DNA segment, 03B03F (Research Genetics)",
1,MGI:1341869,"DNA segment, 03B03R (Research Genetics)",
2,MGI:1918911,RIKEN cDNA 0610005C13 gene,
3,MGI:1923503,RIKEN cDNA 0610006L08 gene,
4,MGI:1925547,RIKEN cDNA 0610008J02 gene,
...,...,...,...
127817,MGI:2446208,"zyg-11 family member A, cell cycle regulator",
127818,MGI:2685277,"zyg-ll family member B, cell cycle regulator",LOC242610|1110046I03Rik|D4Mgi23|2810482G21Rik
127819,MGI:103072,zyxin,R75157
127820,MGI:2444286,"zinc finger, ZZ-type with EF hand domain 1",8430405D05Rik|C130099L13Rik


In [122]:
gene_pair.to_csv("data/test.csv")

In [99]:
orth_info = pd.read_csv(f"data/GenePageGeneralInfo_{id}base_DB.tsv", sep="\t", dtype=str)

In [100]:
mapping_orth_symbol = dict(zip(orth_info['tropicalis gene ID'], orth_info['tropicalis gene symbol']))

In [101]:
species = "Frog"

In [102]:
mapping_orth_symbol = dict(zip(orth_info['tropicalis gene ID'], orth_info['tropicalis gene symbol']))
gene_pair_orth['Ligand Official Symbol'] = 'NA'
gene_pair_orth['Ligand Official Symbol'] =  gene_pair_orth.apply(
    lambda row: mapping_orth_symbol.get(row[f'{id} ligand'], row['Ligand Official Symbol'])
    if pd.notna(row[f'{id} ligand']) else row['Ligand Official Symbol'],
    axis=1
)
gene_pair_orth['Receptor Official Symbol'] = 'NA'
gene_pair_orth['Receptor Official Symbol'] =  gene_pair_orth.apply(
    lambda row: mapping_orth_symbol.get(row[f'{id} receptor'], row['Receptor Official Symbol'])
    if pd.notna(row[f'{id} receptor']) else row['Receptor Official Symbol'],
    axis=1
)
gene_pair_orth["same_as_off_lig"] = (
    gene_pair_orth[f"{species.lower()}_ligand"] == gene_pair_orth["Ligand Official Symbol"]
)

gene_pair_orth["same_as_off_rec"] = (
    gene_pair_orth[f"{species.lower()}_receptor"] == gene_pair_orth["Receptor Official Symbol"]
)

In [103]:
gene_pair_orth.to_csv(f"data/{id}_ID_check.csv")

In [37]:
gene_pair_orth

Unnamed: 0,Sorting order,LR_pair_orig,PMID,lig_species,rec_species,ligand_orig,receptor_orig,Pair_species,LR Pair Card,rat evidence,rat_ligand,rat_receptor,ENSEMBL ligand,ENSEMBL receptor,RGD ligand,RGD receptor,Ligand Official Symbol,Receptor Official Symbol,same_as_off_lig,same_as_off_rec
0,1,A Atrn,11137996,Mus Musculus,Mus Musculus,a,Atrn,Mus Musculus,ASIP ATRN,CONSERVATION,Asip,Atrn,ENSRNOG00000017701,ENSRNOG00000021240,2003,69063,Asip,Atrn,True,True
1,2,A2M HSPA5,12194978,Homo sapiens,Homo sapiens,A2M,HSPA5,Homo sapiens,A2M HSPA5,CONSERVATION,A2m,Hspa5,ENSRNOG00000028896,ENSRNOG00000018294,2004,2843,A2m,Hspa5,True,True
2,3,A2M HSPA5,32541810,Homo sapiens,Homo sapiens,A2M,HSPA5,Homo sapiens,A2M HSPA5,CONSERVATION,A2m,Hspa5,ENSRNOG00000028896,ENSRNOG00000018294,2004,2843,A2m,Hspa5,True,True
3,4,A2M LRP1,10652313,Homo sapiens,Homo sapiens,A2M,LRP1,Homo sapiens,A2M LRP1,CONSERVATION,A2m,Lrp1,ENSRNOG00000028896,ENSRNOG00000025053,2004,1307535,A2m,Lrp1,True,True
4,5,A2M LRP1,12194978,Homo sapiens,Homo sapiens,A2M,LRP1,Homo sapiens,A2M LRP1,CONSERVATION,A2m,Lrp1,ENSRNOG00000028896,ENSRNOG00000025053,2004,1307535,A2m,Lrp1,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6089,6090,ZG16B TLR5,20802527,Homo sapiens,Homo sapiens,ZG16B,TLR5,Homo sapiens,ZG16B TLR5,CONSERVATION,Zg16b,Tlr5,ENSRNOG00000057127,ENSRNOG00000022067,1562673,631351,Zg16b,Tlr5,True,True
6090,6091,ZG16B TLR6,20802527,Homo sapiens,Homo sapiens,ZG16B,TLR6,Homo sapiens,ZG16B TLR6,CONSERVATION,Zg16b,Tlr6,ENSRNOG00000057127,ENSRNOG00000002161,1562673,1303030,Zg16b,Tlr6,True,True
6091,6092,ZG16B Tlr6,20802527,Homo sapiens,Mus Musculus,ZG16B,Tlr6,mixed,ZG16B TLR6,CONSERVATION,Zg16b,Tlr6,ENSRNOG00000057127,ENSRNOG00000002161,1562673,1303030,Zg16b,Tlr6,True,True
6092,6093,Zp3 Chrna7,22577141,Mus Musculus,Mus Musculus,Zp3,Chrna7,Mus Musculus,ZP3 CHRNA7,CONSERVATION,Zp3,Chrna7,ENSRNOG00000001434,ENSRNOG00000010853,620606,2348,Zp3,Chrna7,True,True


In [40]:
gene_pair_orth[gene_pair_orth["same_as_off_lig"] == False].to_csv("data/RGD_ID_check_lig_false.csv")

In [6]:

# Filter to only needed columns (if they exist)
columns_to_keep = [
    "Xenbase gene page ID",
    "Gene Symbol",
    "Gene Name",
    "Gene Function",
    "Gene Synonyms",
    "JGI ID"
]

# Rename columns to match your naming style
renamed_columns = {
    "Xenbase gene page ID": "Xenbase genepage ID",
    "Gene Symbol": "gene symbol",
    "Gene Name": "gene name",
    "Gene Function": "gene function",
    "Gene Synonyms": "gene synonyms",
    "JGI ID": "JGI ID"
}

# Sanity-check before selection
existing_cols = [col for col in columns_to_keep if col in df_xenbase.columns]
df_xenbase = df_xenbase[existing_cols].rename(columns=renamed_columns)

# Save to file
df_xenbase.to_csv(xenbase_file, sep="\t", index=False, encoding="utf-8")
df_xenbase.to_csv(xenbase_file_today, sep="\t", index=False, encoding="utf-8")

print(f"✅ Xenbase gene info saved: {xenbase_file_today} — shape: {df_xenbase.shape}")


✅ Xenbase gene info saved: data/GenePageGeneralInfo_20250731_Xenbase_DB.tsv — shape: (22133, 0)


In [2]:
## Function to prepare datatables (cleaning and hyperlinking, adding tool tips, etc) input for the database qmds
import sys, os
from itables import init_notebook_mode
import pandas as pd
from itables import show
from itables import options
from IPython.display import HTML, display
import numpy as np
import fetchGSheet 
import warnings
import urllib.parse

# Suppress SettingWithCopyWarning
warnings.simplefilter("ignore", category=UserWarning)


# Other vertebrates
species_list = [
    "mmusculus", "rnorvegicus", "drerio", "ptroglodytes", "ggallus", "sscrofa", "btaurus", 
    "clfamiliaris", "ecaballus", "oarambouillet",
    "cjacchus", "mmulatta", "xtropicalis"
]

# Select only the relevant columns from pop_up_info
cols_to_keep = cols_to_keep = list(range(0, 30)) 
# Step 3: Load file using only the desired columns
df = pd.read_table("data/HGNC_gene_info_full.tsv", usecols=cols_to_keep)
pop_up_info = pd.read_table("data/HGNC_gene_info_full.tsv")
pop_up_info = pop_up_info.rename(columns={"hgnc_id": "HGNC ID", 
                                          "name": "Approved name",
                                          "symbol": "Approved symbol",
                                          #"rgd_id": "RGD ID",
                                          #"mgd_id": "MGI ID", 
                                          "alias_symbol": "Alias symbol", # add to table
                                          "prev_symbol": "Previous symbol", # add to table
                                          "date_symbol_changed": "Date symbol changed"
                                         })

# Keep only first MGI/RGD ID
#pop_up_info["MGI ID"] = pop_up_info["MGI ID"].str.split("|").str[0]
#pop_up_info["RGD ID"] = pop_up_info["RGD ID"].str.split("|").str[0]

pop_up_info["Alias symbol"] = pop_up_info["Alias symbol"].apply(
    lambda x: "N/A" if pd.isna(x) or str(x).strip().lower() in ["nan", "none", ""] else x
)

pop_up_info["Previous symbol"] = pop_up_info["Previous symbol"].apply(
    lambda x: "N/A" if pd.isna(x) or str(x).strip().lower() in ["nan", "none", ""] else x
)

# Replace "|" with ", "
pop_up_info["Alias symbol"] = [value.replace("|", ", ") for value in pop_up_info["Alias symbol"]]
pop_up_info["Previous symbol"] = [value.replace("|", ", ") for value in pop_up_info["Previous symbol"]]

pop_up_info["Date symbol changed"] = pop_up_info["Date symbol changed"].apply(
    lambda x: "N/A" if pd.isna(x) or str(x).strip().lower() in ["nan", "none", ""] else x
)


pop_up_info_lim = pop_up_info[["HGNC ID", "Approved name", "Alias symbol", # "MGI ID", "RGD ID"
                               "Approved symbol", "Previous symbol"]] # rm "Approved symbol" for now
pop_up_info_lim = pop_up_info_lim.drop_duplicates(subset="HGNC ID", keep="first")

# Drop columns where all values are NA in gene_pair
gene_pair = fetchGSheet.gene_pair_human.dropna(axis=1, how='all')
gene_pair = gene_pair[gene_pair['LR_pair_orig'] != '']
# for now set source count as triplicates
sourceCount = len(gene_pair[['LR_pair_orig']])

### KEEP ALL AS OF LATEST input datatable
# for now, keep only the following columns
# gene_pair = gene_pair[['LR pair', 'Ligand', 'Ligand.HGNC', 'Receptor', 'Receptor.HGNC',
#                        'perplexity link', 'PMID', 'binding location', 
#                        'bind in trans?', 'bidirectional signalling?',
#                        'interaction type', 'original source']]

gene_pair = gene_pair.dropna(subset=['LR_pair_orig'])

# some PMIDs kick in with "," so replace
gene_pair["PMID"] = [value.replace(",", "") for value in gene_pair["PMID"]]
gene_pair = gene_pair.dropna(subset=['PMID'])

### NO NEED FOR MAPPING AS OF LATEST input datatable
# Mapping for replacements
# mapping = dict(zip(fetchGSheet.src_info['original source'], fetchGSheet.src_info['shortname']))
# # Replace values in the column based on the mapping
# gene_pair['original source'] = gene_pair['original source'].replace(mapping)

gene_pair.columns = gene_pair.columns.str.strip()
gene_pair[['Ligand', 'Receptor']] = gene_pair['LR Pair Card'].str.split(' ', n=1, expand=True)

## add Ligand/Receptor Location
def dedup_locations(loc_str):
    # Split, strip, deduplicate, and sort
    parts = [loc.strip() for loc in loc_str.split(',') if loc.strip()]
    unique_sorted = sorted(set(parts), key=str.lower)  # case-insensitive sort
    return unique_sorted

def generate_LocToolTip(row, geneloc, loc_col):
    ligand = row[loc_col]
    original_locations = [loc.strip() for loc in row["location"].split(',')]
    original_sources = [src.strip() for src in row["source"].split(',')]

    # Get deduplicated locations
    unique_locations = dedup_locations(row["location"])

    if len(unique_locations) == 1:
        # Single tooltip case
        location = unique_locations[0]
        matching_rows = geneloc[(geneloc[loc_col] == ligand) & (geneloc["location"].str.contains(location))]
        all_sources = matching_rows["source"].unique()
        sources_str = ", ".join(sorted(set(all_sources)))
        return f'<span title="based on {sources_str}">{location}</span>'
    else:
        # Multiple tooltips — find each (ligand, location) match in original df
        spans = []
        for loc in unique_locations:
            matching_rows = geneloc[
                (geneloc[loc_col] == ligand) &
                (geneloc["location"].str.contains(loc))
            ]
            all_sources = matching_rows["source"].unique()
            sources_str = ", ".join(sorted(set(all_sources)))
            spans.append(f'<span title="based on {sources_str}">{loc}</span>')
        return ", ".join(spans)


# Group the original loc_info by Ligand
ligand_loc = fetchGSheet.ligand_loc.dropna(axis=1, how='all')
grouped = ligand_loc.groupby("Ligand").agg({
    "location": lambda x: ', '.join(x),
    "source": lambda x: ', '.join(x)
}).reset_index()

# Generate tooltips
grouped["Ligand location"] = grouped.apply(lambda row: generate_LocToolTip(row, ligand_loc,loc_col="Ligand"), axis=1)
# create dict
mapping_loc = dict(zip(grouped['Ligand'], grouped['Ligand location'])) 
gene_pair['Ligand location'] = gene_pair['Ligand'].replace(mapping_loc)


# Group the original loc_info by Receptor
receptor_loc = fetchGSheet.receptor_loc.dropna(axis=1, how='all')
grouped = receptor_loc.groupby("Receptor").agg({
    "location": lambda x: ', '.join(x),
    "source": lambda x: ', '.join(x)
}).reset_index()

# Generate tooltips
grouped["Receptor location"] = grouped.apply(lambda row: generate_LocToolTip(row, receptor_loc,loc_col="Receptor"), axis=1)
# create dict
mapping_loc = dict(zip(grouped['Receptor'], grouped['Receptor location'])) 
gene_pair['Receptor location'] = gene_pair['Receptor'].replace(mapping_loc)


# Set missing mappings to 'unknown'
gene_pair.loc[gene_pair['Ligand location'] == gene_pair['Ligand'], 'Ligand location'] = 'unknown'
gene_pair.loc[gene_pair['Receptor location'] == gene_pair['Receptor'], 'Receptor location'] = 'unknown'
# Set "n/a" to unknown
gene_pair['Ligand location'] = [value.replace("n/a", "unknown") for value in gene_pair['Ligand location']]
gene_pair['Receptor location'] = [value.replace("n/a", "unknown") for value in gene_pair['Receptor location']]

# Fetch HGNC IDs from the dataset
hgnc_id = [col for col in gene_pair.columns if "HGNC" in col]
hgnc_id = pd.concat([gene_pair[col] for col in hgnc_id]).unique()

gene_pair['Human LR Pair'] = np.where(
    gene_pair['Human evidence'] == "absent in human", 
    "no human ortholog",                                  
    gene_pair['Homo sapiens_ligand'] + " " + gene_pair['Homo sapiens_receptor'] 
)


# Rename columns for better clarity
gene_pair = gene_pair.rename(columns={
    "LR_pair_orig": "LR Pair",
    "HGNC ligand": "Ligand HGNC ID",
    "HGNC receptor": "Receptor HGNC ID",
    "ENSEMBL ligand": "Ligand ENSEMBL ID",
    "ENSEMBL receptor": "Receptor ENSEMBL ID",
    # "perplexity link": "Perplexity", # will be replaced with actual link later
    # "original source": "Database Source",
    "Ligand location": "Ligand Location",
    "Receptor location": "Receptor Location",
    # "binding location": "Binding Location",
    # "bind in trans?" : "Trans-binding", 
    # "bidirectional signalling?": "Bidirectional Signalling",
    # "interaction type" : "Interaction Type"
    #"PMID": "PMID support" # was PMID support
})
gene_pair = gene_pair.drop(columns=["Ligand", "Receptor"])
gene_pair = gene_pair.rename(columns={"Homo sapiens_ligand": "Ligand", "Homo sapiens_receptor": "Receptor"})
# Merge gene_pair with pop_up_info_lim for Ligand(L)
gene_pair = gene_pair.merge(pop_up_info_lim, how='left', left_on='Ligand HGNC ID', right_on='HGNC ID')

gene_pair = gene_pair.rename(columns={"Approved name": "Ligand Name", 
                                     #"MGI ID": "Ligand MGI ID", # NOT APPLIED YET BUT should be taken from Ensembl BioMart
                                     #"RGD ID": "Ligand RGD ID", # NOT APPLIED YET BUT should be taken from Ensembl BioMart
                                      "Alias symbol": "Ligand Aliases",
                                      "Previous symbol": "Ligand Old symbol",
                                     },
                            )
gene_pair = gene_pair.drop(columns=["HGNC ID", "Approved symbol"])


gene_pair = gene_pair.merge(pop_up_info_lim, how='left', left_on='Receptor HGNC ID', right_on='HGNC ID')

gene_pair = gene_pair.rename(columns={"Approved name": "Receptor Name",
             #"MGI ID": "Receptor MGI ID",
             #"RGD ID": "Receptor RGD ID",
                                      "Alias symbol": "Receptor Aliases",
                                      "Previous symbol": "Receptor Old symbol",}
                            )

### AS OF LATEST DB skip pathways for now (code in addPathwayDiseaseAnnot_temp.py)

# Add new columns where all Ligand Symbol & Aliases and Receptor Symbol & Aliases merged in one column
def format_symbol_aliases(symbol, old_symbol, aliases):
    """
    Formats symbol, old symbols, and aliases.
    If the final formatted string would be empty after considering N/A values
    and empty inputs, it returns "mouse-specific".
    Otherwise, it formats based on the presence of old_symbol and aliases,
    removing unnecessary parentheses or commas, following the structure:
    "Symbol (Old Symbol, Aliases)" if both exist.
    """
    # Normalize inputs to empty strings if they are None/NaN or just whitespace
    symbol_str = str(symbol).strip()
    old_symbol_str = str(old_symbol).strip()
    aliases_str = str(aliases).strip()

    # Filter out values that are empty strings or "N/A" for old_symbol and aliases
    parts_for_join = []
    if old_symbol_str and old_symbol_str != "N/A":
        parts_for_join.append(old_symbol_str)
    if aliases_str and aliases_str != "N/A":
        parts_for_join.append(aliases_str)

    # Construct the preliminary result based on your original logic:
    # "symbol (old_symbol, aliases)" if parts_for_join is not empty, else "symbol"
    if parts_for_join:
        prelim_result = f"{symbol_str} ({', '.join(parts_for_join)})"
    else:
        prelim_result = symbol_str # Just the symbol if no old_symbol or aliases

    return prelim_result

# This is crucial for consistent handling by the function before processing "N/A".
gene_pair['Ligand'] = gene_pair['Ligand'].fillna('')
gene_pair['Ligand Old symbol'] = gene_pair['Ligand Old symbol'].fillna('')
gene_pair['Ligand Aliases'] = gene_pair['Ligand Aliases'].fillna('')


# to later check which ligand-receptor pairs are non-human
def is_mouse_specific(name):
    if not isinstance(name, str):
        return False
    name = name.strip()  # remove leading/trailing spaces
    return any(c.islower() for c in name[1:])

gene_pair = gene_pair.drop(columns=["HGNC ID"])

gene_pair['Ligand Symbols'] = gene_pair.apply(
    lambda row: "no human ortholog" if is_mouse_specific(row['Ligand']) 
               else format_symbol_aliases(row['Ligand'], row['Ligand Old symbol'], row['Ligand Aliases']),
    axis=1
)


# This is crucial for consistent handling by the function before processing "N/A".
gene_pair['Receptor'] = gene_pair['Receptor'].fillna('')
gene_pair['Receptor Old symbol'] = gene_pair['Receptor Old symbol'].fillna('')
gene_pair['Receptor Aliases'] = gene_pair['Receptor Aliases'].fillna('')

gene_pair['Receptor Symbols'] = gene_pair.apply(
    lambda row: "no human ortholog" if is_mouse_specific(row['Receptor']) 
                else format_symbol_aliases(row['Receptor'], row['Receptor Old symbol'], row['Receptor Aliases']),
    axis=1
)

### tooltips 
gene_pair["Ligand Symbols"] = [
    f'<span title="{aliases}">{aliases}</span>'
    for aliases in gene_pair["Ligand Symbols"]
]
gene_pair["Receptor Symbols"] = [
    f'<span title="{aliases}">{aliases}</span>'
    for aliases in gene_pair["Receptor Symbols"]
]

# might be used later just save info for now (for mouse cards)
grab_mouse_info = gene_pair["LR Pair Card"][gene_pair["Human evidence"].isin(["absent in human", "not conserved"])]
grab_mouse_info = grab_mouse_info.unique()
grab_mouse_info

# Add an empty Perplexity column filled with None (just to save it's order
gene_pair['Perplexity'] = None
#gene_pair = gene_pair.drop(columns=["Approved symbol_x", "Approved symbol_y"])

### For latest DB, skip (code saved as addOrth_temp.py)

# Add
first_columns=['LR Pair Card', 'Human LR Pair', 'Ligand', 'Receptor', 'Ligand Symbols', 'Receptor Symbols', 'Ligand Location', 'Receptor Location',	'Ligand HGNC ID', 'Receptor HGNC ID', 'Perplexity', 'Human evidence'] # 'Database Source'

end_columns=['PMID', 'Pair_species', 'lig_species', 'rec_species', 'ligand_orig', 'receptor_orig']
gene_pair = gene_pair[first_columns + [col for col in gene_pair.columns if col not in first_columns + end_columns] + end_columns]
# gene_pair = gene_pair[first_columns + [col for col in gene_pair.columns if col not in first_columns]]

# number of unique vars (Human and Mouse both counted)

lrPairsCount = len(gene_pair["LR Pair Card"].unique())
print(lrPairsCount)

ligandCount = len(gene_pair["Ligand"].unique())
print(ligandCount)

receptorCount = len(gene_pair["Receptor"].unique())
print(receptorCount)

  pop_up_info = pd.read_table("data/HGNC_gene_info_full.tsv")


3581
1097
857


In [59]:
gene_pair_mouse_card= fetchGSheet.gene_pair_mouse
gene_pair_mouse_card['Ligand Official Symbol'] = 'NA'
gene_pair_mouse_card['Ligand Official Symbol'] =  gene_pair_mouse_card.apply(
    lambda row: mapping_mouse_symbol.get(row['MGI ligand'], row['Ligand Official Symbol'])
    if pd.notna(row['MGI ligand']) else row['Ligand Official Symbol'],
    axis=1
)
gene_pair_mouse_card['Receptor Official Symbol'] = 'NA'
gene_pair_mouse_card['Receptor Official Symbol'] =  gene_pair_mouse_card.apply(
    lambda row: mapping_mouse_symbol.get(row['MGI receptor'], row['Receptor Official Symbol'])
    if pd.notna(row['MGI receptor']) else row['Receptor Official Symbol'],
    axis=1
)
gene_pair_mouse_card["same_as_off_lig"] = (
    gene_pair_mouse_card["Mouse_ligand"] == gene_pair_mouse_card["Ligand Official Symbol"]
)

gene_pair_mouse_card["same_as_off_rec"] = (
    gene_pair_mouse_card["Mouse_receptor"] == gene_pair_mouse_card["Receptor Official Symbol"]
)

In [61]:
gene_pair_mouse_card.to_csv("data/MGI_ID_check.csv")

In [43]:
mouse_cards = gene_pair[gene_pair["LR Pair Card"].isin(grab_mouse_info)]

In [45]:
import os
import jinja2
import sys
import pandas as pd
import numpy as np
import time
import base64
import re
# Add the src directory to the path for importing modules
sys.path.append(os.path.abspath("src"))

# Import necessary modules from your existing src files
# Ensure createDataTable and createFunctionalAnnotTable are in your 'src' directory
from fetchGSheet import gene_pair_mouse
from createDataTable import pop_up_info, gene_pair0, generate_perplexity_links, gene_pair00, is_mouse_specific, grab_mouse_info
from createFunctionalAnnotTable import gene_pair_annot_ligand, gene_pair_annot_receptor

# Test or all
test = True
test_genes = ["H60A Klrk1", "H60B Klrk1", "VEGFA NRP1", "THPO MPL", "FGF1 FGFR3", "Lair1 Lilrb4A"] # Example genes
# --- Paths ---
MERGED_TEMPLATE_PATH = 'HTML/mergedCard_tabs.html'
OUTPUT_DIR = 'data/cards/' # New output directory for combined files

# function for replacing visible text:
def update_link_text_with_symbol(html_str, new_symbol):
    """
    Replace the visible text in an anchor tag with the provided symbol.
    The HGNC ID is extracted from the href and left unchanged.
    """
    # Only proceed if input is valid
    if not isinstance(html_str, str) or not html_str.strip():
        return html_str
    
    return re.sub(r">(.*?)</a>", f">{new_symbol}</a>", html_str) #<i class='fa-solid fa-arrow-up-right-from-square' style='margin-left:4px;'></i></a>

# Create Perplexity link

# Recreate Perplexity link
def create_url_basic(symbol):
          label = "Perplexity (LLM)"
          query = f"What diseases is {symbol} implicated in?"
          encoded_query = query.replace(" ", "%20")
          output = f'<a href="https://www.perplexity.ai/search?q={encoded_query}" target="_blank">{label}</a>' #<i class="fa-solid fa-arrow-up-right-from-square" style="margin-left:4px;"></i>
          return output

# --- Load and Preprocess Data (Combined from both scripts) ---

# Load PubMed data (from createPMIDpages.py)
pubmed_data = pd.read_csv("data/pubmed_results.csv")
pubmed_data["Year"] = pubmed_data["Year"].astype(str).str.replace(".0", "", regex=False).astype(int)
pubmed_data["PMID"] = pubmed_data["PMID"].astype(str)
pubmed_data = pubmed_data.reset_index(drop=True)

### For latest DB, skip
# # Load LLM results (from createPMIDpages.py)
# bio_keywords = pd.read_csv("data/llm_results.csv")

# --- Prepare gene_pair00 for PMID section (from createPMIDpages.py) ---
# gene_pair00 is used for PMID and Keywords, so it needs the '—' placeholder
# Ensure gene_pair00 is a copy to avoid SettingWithCopyWarning later
gene_pair00_copy = gene_pair00.copy()
#gene_pair00_copy["Human LR Pair"] = gene_pair00_copy["Human LR Pair"].str.replace(" ", "—")

# Merge with LLM results
# gene_pair000 = gene_pair00_copy.merge(bio_keywords, how='left', left_on="Human LR Pair", right_on='Human LR Pair')
# gene_pair000["Relevance Keywords"] = gene_pair000["Relevance Keywords"].astype(str)
 # Ensure string type
# gene_pair000["Human LR Pair"] = gene_pair000["Human LR Pair"].astype(str)

### For latest DB,
gene_pair000 = gene_pair00.copy()
gene_pair000["LR Pair Card"] = gene_pair000["LR Pair Card"].astype(str)
# --- Prepare gene_pair0 for Card section (from createCards.py) ---
# gene_pair0 is used for card details, it should retain spaces for splitting gene names
# Ensure gene_pair0 is a copy to avoid SettingWithCopyWarning later
gene_pair0_copy = gene_pair0.copy()

# grab the pairs by interaction id that has human evidence that is absent in human
# mouse_interaction_ids = gene_pair0_copy[gene_pair0_copy['Ligand'].apply(is_mouse_specific)]['Interaction ID'].tolist()
mouse_interaction_ids = gene_pair0_copy["Interaction ID"][gene_pair0_copy["Human evidence"].isin(["absent in human", "not conserved"])]

### For for latest DB, remove for now
# # Add Disease (specific) to cards
# df_disease = pd.read_csv("data/disease_annotations_per_pair.csv")
# df_disease = df_disease.groupby('interaction')['disease'].apply(', '.join).reset_index()
# mapping_disease = dict(zip(df_disease['interaction'], df_disease['disease']))
# gene_pair0_copy["Disease"] = gene_pair0_copy['Human LR Pair'].map(mapping_disease).fillna("unknown")

# gene_pair0_copy = generate_perplexity_links(
#     gene_pair0_copy,
#     pathway_col="Disease",
#     default_query_template="What-diseases-is-the-ligand-receptor-pair-{pair}-associated-with"
# )

### SHOULD BE ACTIVATED ONCE WE DECIDE TO OPEN DB 
# # Hide for now (linking to actual PMID database
# gene_pair0_copy["Interaction ID"] = gene_pair0_copy["Interaction ID"].apply(
#     lambda x: f"<a href='https://comp.med.yokohama-cu.ac.jp/collab/connectomeDB/database/filter/{x}.html'>{x}</a>"
# )

# Add external link icon
# icon_html = '<i class="fa-solid fa-arrow-up-right-from-square" style="margin-left:4px;"></i></a>'
# columns_to_update = [
#     "KEGG Pathway", "PROGENy Pathway", "Cancer-related",
#     "Disease Type", "Disease"
# ]
# for col in columns_to_update:
#     gene_pair0_copy[col] = gene_pair0_copy[col].str.replace(
#         "</a>", icon_html, regex=False
#     )

# Add missing ligand name for mouse-specific
mouse_info = pd.read_csv("data/MRK_Merged_MGI_DB.tsv", sep="\t", dtype=str)
mapping_mouse_name = dict(zip(mouse_info['MGI Marker Accession ID'], mouse_info['Marker Name']))
mapping_mouse_aliases = dict(zip(mouse_info['MGI Marker Accession ID'], mouse_info['Aliases']))
mapping_mouse_uniprot = dict(zip(mouse_info['MGI Marker Accession ID'], mouse_info['UniProt IDs']))
mapping_mouse_ncbi = dict(zip(mouse_info['MGI Marker Accession ID'], mouse_info['EntrezGene ID']))

# Combine ligand and receptor columns from mouse DB input
gene_pair_mouse_card = gene_pair_mouse[gene_pair_mouse["LR Pair Card"].isin(grab_mouse_info)]
mouse_id_map = pd.concat([
    gene_pair_mouse_card[['MGI ligand', 'ENSEMBL ligand']].rename(columns={
        'MGI ligand': 'MGI ID', 'ENSEMBL ligand': 'ENSEMBL ID'
    }),
    gene_pair_mouse_card[['MGI receptor', 'ENSEMBL receptor']].rename(columns={
        'MGI receptor': 'MGI ID', 'ENSEMBL receptor': 'ENSEMBL ID'
    })
])


In [54]:
gene_pair_mouse_card['Ligand Official Symbol'] = 'NA'
gene_pair_mouse_card['Ligand Official Symbol'] =  gene_pair_mouse_card.apply(
    lambda row: mapping_mouse_symbol.get(row['MGI ligand'], row['Ligand Official Symbol'])
    if pd.notna(row['MGI ligand']) else row['Ligand Official Symbol'],
    axis=1
)
gene_pair_mouse_card['Receptor Official Symbol'] = 'NA'
gene_pair_mouse_card['Receptor Official Symbol'] =  gene_pair_mouse_card.apply(
    lambda row: mapping_mouse_symbol.get(row['MGI receptor'], row['Receptor Official Symbol'])
    if pd.notna(row['MGI receptor']) else row['Receptor Official Symbol'],
    axis=1
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gene_pair_mouse_card['Ligand Official Symbol'] = 'NA'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gene_pair_mouse_card['Ligand Official Symbol'] =  gene_pair_mouse_card.apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gene_pair_mouse_card['Receptor Official Symbol'] = 'NA'
A value is trying

In [56]:
gene_pair_mouse_card["same_as_off_lig"] = (
    gene_pair_mouse_card["Mouse_ligand"] == gene_pair_mouse_card["Ligand Official Symbol"]
)

gene_pair_mouse_card["same_as_off_rec"] = (
    gene_pair_mouse_card["Mouse_receptor"] == gene_pair_mouse_card["Receptor Official Symbol"]
)

gene_pair_mouse_card

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gene_pair_mouse_card["same_as_off_lig"] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gene_pair_mouse_card["same_as_off_rec"] = (


Unnamed: 0,LR_pair_orig,PMID,lig_species,rec_species,ligand_orig,receptor_orig,Pair_species,Mouse evidence,LR Pair Card,Mouse_ligand,Mouse_receptor,ENSEMBL ligand,ENSEMBL receptor,MGI ligand,MGI receptor,Ligand Official Symbol,Receptor Official Symbol,same_as_off_lig,same_as_off_rec
770,Ccl12 Ccr2,16543609,Mus Musculus,Mus Musculus,Ccl12,Ccr2,Mus Musculus,DIRECT,Ccl12 Ccr2,Ccl12,Ccr2,ENSMUSG00000035352,ENSMUSG00000049103,MGI:108224,MGI:106185,Ccl12,Ccr2,True,True
967,Ccl6 Ccr1,31551151,Rattus norvegicus,Rattus norvegicus,Ccl6,Ccr1,Rattus norvegicus,CONSERVATION,Ccl6 Ccr1,Ccl6,Ccr1,ENSMUSG00000018927,ENSMUSG00000025804,MGI:98263,MGI:104618,Ccl6,Ccr1,True,True
968,Ccl6 Ccr1,33640900,Mus Musculus,Mus Musculus,Ccl6,Ccr1,Mus Musculus,DIRECT,Ccl6 Ccr1,Ccl6,Ccr1,ENSMUSG00000018927,ENSMUSG00000025804,MGI:98263,MGI:104618,Ccl6,Ccr1,True,True
993,Ccl9 Ccr1,12397598,Mus Musculus,Mus Musculus,Ccl9,Ccr1,Mus Musculus,DIRECT,Ccl9 Ccr1,Ccl9,Ccr1,ENSMUSG00000019122,ENSMUSG00000025804,MGI:104533,MGI:104618,Ccl9,Ccr1,True,True
1781,Defb2 Ccr6,11714836,Mus Musculus,Mus Musculus,Defb2,Ccr6,Mus Musculus,DIRECT,Defb2 Ccr6,Defb2,Ccr6,ENSMUSG00000006570,ENSMUSG00000040899,MGI:1338754,MGI:1333797,Defb2,Ccr6,True,True
1782,Defb2 Tlr4,12411706,Mus Musculus,Mus Musculus,Defb2,Tlr4,Mus Musculus,DIRECT,Defb2 Tlr4,Defb2,Tlr4,ENSMUSG00000006570,ENSMUSG00000039005,MGI:1338754,MGI:96824,Defb2,Tlr4,True,True
2272,Fcna Tlr4,28844702,Mus Musculus,Mus Musculus,Fcna,Tlr4,Mus Musculus,DIRECT,Fcna Tlr4,Fcna,Tlr4,ENSMUSG00000026938,ENSMUSG00000039005,MGI:1340905,MGI:96824,Fcna,Tlr4,True,True
4318,Pcdhb10 Pcdhb10,25171406,Mus Musculus,Mus Musculus,Pcdhb10,Pcdhb10,Mus Musculus,DIRECT,Pcdhb10 Pcdhb10,Pcdhb10,Pcdhb10,ENSMUSG00000045657,ENSMUSG00000045657,MGI:2136745,MGI:2136745,Pcdhb10,Pcdhb10,True,True
4319,Pcdhb11 Pcdhb11,25171406,Mus Musculus,Mus Musculus,Pcdhb11,Pcdhb11,Mus Musculus,DIRECT,Pcdhb11 Pcdhb11,Pcdhb11,Pcdhb11,ENSMUSG00000051486,ENSMUSG00000051486,MGI:2136746,MGI:2136746,Pcdhb11,Pcdhb11,True,True
4321,Pcdhb12 Pcdhb12,25171406,Mus Musculus,Mus Musculus,Pcdhb12,Pcdhb12,Mus Musculus,DIRECT,Pcdhb12 Pcdhb12,Pcdhb12,Pcdhb12,ENSMUSG00000043458,ENSMUSG00000043458,MGI:2136747,MGI:2136747,Pcdhb12,Pcdhb12,True,True


In [37]:
# Condition 1: Set 'Ligand Official Symbol' to 'NA' if 'lig_species' is not 'Homo sapiens'
# We use .loc for setting values based on a condition to avoid SettingWithCopyWarning
gene_pair.loc[gene_pair['lig_species'] != 'Homo sapiens', 'Ligand Official Symbol'] = 'NA'

# Condition 2: Set 'Receptor Official Symbol' to 'NA' if 'rec_species' is not 'Homo sapiens'
gene_pair.loc[gene_pair['rec_species'] != 'Homo sapiens', 'Receptor Official Symbol'] = 'NA'

not_human = gene_pair[(gene_pair['Ligand Official Symbol'] == 'NA') | (gene_pair['Receptor Official Symbol'] == 'NA')]

In [33]:
not_human["Pair_species"].unique()

array(['Mus Musculus', 'mixed', 'Bos taurus', 'Rattus norvegicus',
       'Gallus gallus', 'Danio rerio', 'Xenopus tropicalis',
       'Tetraodon nigroviridis', 'Oryctolagus cuniculus'], dtype=object)

In [40]:
gene_pair

Unnamed: 0,LR Pair,PMID,lig_species,rec_species,ligand_orig,receptor_orig,Pair_species,Human evidence,LR Pair Card,Ligand,...,Human LR Pair,Ligand Name,Ligand Aliases,Ligand Old symbol,Receptor Name,Receptor Aliases,Approved symbol,Receptor Old symbol,Ligand Symbols,Receptor Symbols
0,A Atrn,11137996,Mus Musculus,Mus Musculus,A,Atrn,Mus Musculus,CONSERVATION,A Atrn,ASIP,...,ASIP ATRN,agouti signaling protein,ASP,AGTIL,attractin,"DPPT-L, MGCA",ATRN,,"ASIP (AGTIL, ASP)","ATRN (DPPT-L, MGCA)"
1,A2M HSPA5,12194978,Homo sapiens,Homo sapiens,A2M,HSPA5,Homo sapiens,DIRECT,A2M HSPA5,A2M,...,A2M HSPA5,alpha-2-macroglobulin,"FWP007, S863-7, CPAMD5",,heat shock protein family A (Hsp70) member 5,BiP,HSPA5,GRP78,"A2M (FWP007, S863-7, CPAMD5)","HSPA5 (GRP78, BiP)"
2,A2M HSPA5,32541810,Homo sapiens,Homo sapiens,A2M,HSPA5,Homo sapiens,DIRECT,A2M HSPA5,A2M,...,A2M HSPA5,alpha-2-macroglobulin,"FWP007, S863-7, CPAMD5",,heat shock protein family A (Hsp70) member 5,BiP,HSPA5,GRP78,"A2M (FWP007, S863-7, CPAMD5)","HSPA5 (GRP78, BiP)"
3,A2M LRP1,1702392,Homo sapiens,Homo sapiens,A2M,LRP1,Homo sapiens,DIRECT,A2M LRP1,A2M,...,A2M LRP1,alpha-2-macroglobulin,"FWP007, S863-7, CPAMD5",,LDL receptor related protein 1,"LRP, CD91, LRP1A, APOER, IGFBP3R1, IGFBP-3R",LRP1,"APR, A2MR","A2M (FWP007, S863-7, CPAMD5)","LRP1 (APR, A2MR, LRP, CD91, LRP1A, APOER, IGFB..."
4,A2M LRP1,10652313,Homo sapiens,Homo sapiens,A2M,LRP1,Homo sapiens,DIRECT,A2M LRP1,A2M,...,A2M LRP1,alpha-2-macroglobulin,"FWP007, S863-7, CPAMD5",,LDL receptor related protein 1,"LRP, CD91, LRP1A, APOER, IGFBP3R1, IGFBP-3R",LRP1,"APR, A2MR","A2M (FWP007, S863-7, CPAMD5)","LRP1 (APR, A2MR, LRP, CD91, LRP1A, APOER, IGFB..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6089,ZG16B TLR5,20802527,Homo sapiens,Homo sapiens,ZG16B,TLR5,Homo sapiens,DIRECT,ZG16B TLR5,ZG16B,...,ZG16B TLR5,zymogen granule protein 16B,"HRPE773, PRO1567, JCLN2",,toll like receptor 5,"TIL3, FLJ10052, MGC126430, MGC126431",TLR5,SLEB1,"ZG16B (HRPE773, PRO1567, JCLN2)","TLR5 (SLEB1, TIL3, FLJ10052, MGC126430, MGC126..."
6090,ZG16B TLR6,20802527,Homo sapiens,Homo sapiens,ZG16B,TLR6,Homo sapiens,DIRECT,ZG16B TLR6,ZG16B,...,ZG16B TLR6,zymogen granule protein 16B,"HRPE773, PRO1567, JCLN2",,toll like receptor 6,CD286,TLR6,,"ZG16B (HRPE773, PRO1567, JCLN2)",TLR6 (CD286)
6091,ZG16B Tlr6,20802527,Homo sapiens,Mus Musculus,ZG16B,Tlr6,mixed,CONSERVATION,ZG16B Tlr6,ZG16B,...,ZG16B TLR6,zymogen granule protein 16B,"HRPE773, PRO1567, JCLN2",,toll like receptor 6,CD286,TLR6,,"ZG16B (HRPE773, PRO1567, JCLN2)",TLR6 (CD286)
6092,Zp3 Chrna7,22577141,Mus Musculus,Mus Musculus,Zp3,Chrna7,Mus Musculus,CONSERVATION,Zp3 Chrna7,ZP3,...,ZP3 CHRNA7,zona pellucida glycoprotein 3,"ZP3-424, ZP3-372, ZPC","ZP3A, ZP3B",cholinergic receptor nicotinic alpha 7 subunit,,CHRNA7,,"ZP3 (ZP3A, ZP3B, ZP3-424, ZP3-372, ZPC)",CHRNA7


In [12]:
gene_pair0_copy[gene_pair0_copy["LR Pair Card"].isin(grab_mouse_info)]

Unnamed: 0,Interaction ID,LR Pair Card,Human LR Pair,Ligand,Receptor,Ligand Symbols,Receptor Symbols,Ligand Location,Receptor Location,Ligand HGNC ID,Receptor HGNC ID,Perplexity,Human evidence,PMID,Ligand Name,Receptor Name,Mouse_ligand,Mouse_receptor,Ligand MGI ID,Receptor MGI ID
4565,CDB04566,CD99 Pilrb1,no human ortholog,CD99,Pilrb1,"<span title=""CD99 (MIC2, MIC2X, MIC2Y)"">CD99 (...","<span title=""no human ortholog"">no human ortho...","<span title=""based on perplexity"">cell membran...","<span title=""based on perplexity"">cell membran...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.perplexity.ai/search?q=Wh...",absent in human,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...",CD99 molecule (Xg blood group),,,,,
4566,CDB04567,Ccl12 Ccr2,not conserved CCR2,Ccl12,Ccr2,"<span title=""no human ortholog"">no human ortho...","<span title=""no human ortholog"">no human ortho...","<span title=""based on uniprot"">secreted</span>","<span title=""based on uniprot"">cell membrane</...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.perplexity.ai/search?q=Wh...",not conserved,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...",C-C motif chemokine ligand 12,C-C motif chemokine receptor 2,Ccl12,Ccr2,MGI:108224,MGI:106185
4567,CDB04568,Ccl21B CXCR3,not conserved CXCR3,Ccl21B,CXCR3,"<span title=""no human ortholog"">no human ortho...","<span title=""CXCR3 (GPR9, CKR-L2, CMKAR3, IP10...",unknown,"<span title=""based on perplexity, uniprot"">cel...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.perplexity.ai/search?q=Wh...",not conserved,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...",,C-X-C motif chemokine receptor 3,,,,
4568,CDB04569,Ccl21B Ccr7,not conserved CCR7,Ccl21B,Ccr7,"<span title=""no human ortholog"">no human ortho...","<span title=""no human ortholog"">no human ortho...",unknown,"<span title=""based on uniprot"">cell membrane</...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.perplexity.ai/search?q=Wh...",not conserved,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...",,C-C motif chemokine receptor 7,,,,
4569,CDB04570,Ccl21B Cxcr3,not conserved CXCR3,Ccl21B,Cxcr3,"<span title=""no human ortholog"">no human ortho...","<span title=""no human ortholog"">no human ortho...",unknown,"<span title=""based on uniprot"">cell membrane</...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.perplexity.ai/search?q=Wh...",not conserved,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...",,C-X-C motif chemokine receptor 3,,,,
4570,CDB04571,Ccl21C CXCR3,no human ortholog,Ccl21C,CXCR3,"<span title=""no human ortholog"">no human ortho...","<span title=""CXCR3 (GPR9, CKR-L2, CMKAR3, IP10...",unknown,"<span title=""based on perplexity, uniprot"">cel...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.perplexity.ai/search?q=Wh...",absent in human,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...",,C-X-C motif chemokine receptor 3,,,,
4571,CDB04572,Ccl21C Cxcr3,no human ortholog,Ccl21C,Cxcr3,"<span title=""no human ortholog"">no human ortho...","<span title=""no human ortholog"">no human ortho...",unknown,"<span title=""based on uniprot"">cell membrane</...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.perplexity.ai/search?q=Wh...",absent in human,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...",,C-X-C motif chemokine receptor 3,,,,
4572,CDB04573,Ccl6 Ccr1,not conserved CCR1,Ccl6,Ccr1,"<span title=""no human ortholog"">no human ortho...","<span title=""no human ortholog"">no human ortho...","<span title=""based on uniprot"">secreted</span>","<span title=""based on uniprot"">cell membrane</...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.perplexity.ai/search?q=Wh...",not conserved,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...",C-C motif chemokine ligand 6,C-C motif chemokine receptor 1,Ccl6,Ccr1,MGI:98263,MGI:104618
4573,CDB04573,Ccl6 Ccr1,not conserved CCR1,Ccl6,Ccr1,"<span title=""no human ortholog"">no human ortho...","<span title=""no human ortholog"">no human ortho...","<span title=""based on uniprot"">secreted</span>","<span title=""based on uniprot"">cell membrane</...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.perplexity.ai/search?q=Wh...",not conserved,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...",C-C motif chemokine ligand 6,C-C motif chemokine receptor 1,Ccl6,Ccr1,MGI:98263,MGI:104618
4574,CDB04574,Ccl9 Ccr1,not conserved CCR1,Ccl9,Ccr1,"<span title=""no human ortholog"">no human ortho...","<span title=""no human ortholog"">no human ortho...","<span title=""based on uniprot"">secreted</span>","<span title=""based on uniprot"">cell membrane</...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.perplexity.ai/search?q=Wh...",not conserved,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...",C-C motif chemokine ligand 9,C-C motif chemokine receptor 1,Ccl9,Ccr1,MGI:104533,MGI:104618


In [4]:
mouse_id_map = pd.concat([
    gene_pair_mouse_card[['MGI ligand', 'ENSEMBL ligand']].rename(columns={
        'MGI ligand': 'MGI ID', 'ENSEMBL ligand': 'ENSEMBL ID'
    }),
    gene_pair_mouse_card[['MGI receptor', 'ENSEMBL receptor']].rename(columns={
        'MGI receptor': 'MGI ID', 'ENSEMBL receptor': 'ENSEMBL ID'
    })
])

# Drop duplicate rows
mapping_mouse_ens = mouse_id_map.drop_duplicates().reset_index(drop=True)

In [6]:
def extract_mgi_id(col):
    """Use regular expression to extract the HGNC ID after 'HGNC:'."""
    match = re.search(r'MGI:(\d+)', col)
    if match:
        return 'MGI:' +str(match.group(1))
    return None

### For latest DB, no need
# gene_pair0_copy['Ligand MGI ID'] = gene_pair0_copy['Ligand MGI ID'].apply(extract_mgi_id)
# gene_pair0_copy['Receptor MGI ID'] = gene_pair0_copy['Receptor MGI ID'].apply(extract_mgi_id)

gene_pair_mouse_card_lim = gene_pair_mouse_card[["LR Pair Card", "Mouse_ligand", "Mouse_receptor", "MGI ligand", "MGI receptor"]]
gene_pair0_copy =gene_pair0_copy.merge(gene_pair_mouse_card_lim, how="left", on="LR Pair Card")
gene_pair0_copy = gene_pair0_copy.rename(columns={
        'MGI ligand': 'Ligand MGI ID',
        'MGI receptor': 'Receptor MGI ID'
    })

In [8]:
gene_pair0_copy['Ligand Name'] = gene_pair0_copy.apply(
    lambda row: mapping_mouse_name.get(row['Ligand MGI ID'], row['Ligand Name'])
    if pd.notna(row['Ligand MGI ID']) else row['Ligand Name'],
    axis=1
)
gene_pair0_copy['Receptor Name'] = gene_pair0_copy.apply(
    lambda row: mapping_mouse_name.get(row['Receptor MGI ID'], row['Receptor Name'])
    if pd.notna(row['Receptor MGI ID']) else row['Receptor Name'],
    axis=1
)

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [28]:
gene_pair_keywords_filtered['is_mouse_specific']

KeyError: 'is_mouse_specific'

In [56]:
import pandas as pd
import requests
from datetime import datetime
import os

# Create data folder if it doesn't exist
os.makedirs("data", exist_ok=True)

# Today's date for versioning
today = datetime.now().strftime("%Y%m%d")

# URLs and filenames
homology_url = "https://www.informatics.jax.org/downloads/reports/HGNC_AllianceHomology.rpt"
mrk_url = "https://www.informatics.jax.org/downloads/reports/MRK_Sequence.rpt"
mrk_list_url = "https://www.informatics.jax.org/downloads/reports/MRK_List2.rpt"

homology_file = f"data/HGNC_AllianceHomology_{today}.tsv"
mrk_file = f"data/MRK_Sequence_{today}.tsv"
mrk_list_file = f"data/MRK_List2_{today}.tsv"
merged_file = f"data/MRK_Merged_{today}_MGI_DB.tsv"

# Download HGNC_AllianceHomology.rpt
response = requests.get(homology_url)
response.raise_for_status()
with open(homology_file, "wb") as f:
    f.write(response.content)

# Download MRK_Sequence.rpt
response = requests.get(mrk_url)
response.raise_for_status()
with open(mrk_file, "wb") as f:
    f.write(response.content)

# Download MRK_List2.rpt
response = requests.get(mrk_list_url)
response.raise_for_status()
with open(mrk_list_file, "wb") as f:
    f.write(response.content)

# Load files
df_mrk = pd.read_csv(mrk_file, sep="\t", dtype=str)
df_homology = pd.read_csv(homology_file, sep="\t", dtype=str, index_col = False)


In [22]:
mapping_mouse_name

{'MGI:1341858': 'DNA segment, 03B03F (Research Genetics)',
 'MGI:1341869': 'DNA segment, 03B03R (Research Genetics)',
 'MGI:1918911': 'RIKEN cDNA 0610005C13 gene',
 'MGI:1923503': 'RIKEN cDNA 0610006L08 gene',
 'MGI:1925547': 'RIKEN cDNA 0610008J02 gene',
 'MGI:3698435': 'RIKEN cDNA 0610009E02 gene',
 'MGI:1918921': 'RIKEN cDNA 0610009F21 gene',
 'MGI:1918931': 'RIKEN cDNA 0610009K14 gene',
 'MGI:1914088': 'RIKEN cDNA 0610009L18 gene',
 'MGI:1918926': 'RIKEN cDNA 0610012D04 gene',
 'MGI:1925548': 'RIKEN cDNA 0610012E21 gene',
 'MGI:1915618': 'RIKEN cDNA 0610025J13 gene',
 'MGI:1915614': 'RIKEN cDNA 0610030E20 gene',
 'MGI:1925549': 'RIKEN cDNA 0610031C06 gene',
 'MGI:1921093': 'RIKEN cDNA 0610031I08 gene',
 'MGI:1915619': 'RIKEN cDNA 0610031O16 gene',
 'MGI:1925551': 'RIKEN cDNA 0610033I19 gene',
 'MGI:1921343': 'RIKEN cDNA 0610033M10 gene',
 'MGI:1925613': 'RIKEN cDNA 0610033P13 gene',
 'MGI:1925614': 'RIKEN cDNA 0610037L18 gene',
 'MGI:1917595': 'RIKEN cDNA 0610038B21 gene',
 'MGI:19

In [65]:
import pandas as pd
import requests
from datetime import datetime
import os

# Create data folder if it doesn't exist
os.makedirs("data", exist_ok=True)

# Today's date for versioning
today = datetime.now().strftime("%Y%m%d")

# URLs and filenames
homology_url = "https://www.informatics.jax.org/downloads/reports/HGNC_AllianceHomology.rpt"
mrk_url = "https://www.informatics.jax.org/downloads/reports/MRK_Sequence.rpt"
mrk_list_url = "https://www.informatics.jax.org/downloads/reports/MRK_List2.rpt"

homology_file = f"data/HGNC_AllianceHomology_{today}.tsv"
mrk_file = f"data/MRK_Sequence_{today}.tsv"
mrk_list_file = f"data/MRK_List2_{today}.tsv"
merged_file = f"data/MRK_Merged_{today}_MGI_DB.tsv"

# Download HGNC_AllianceHomology.rpt
response = requests.get(homology_url)
response.raise_for_status()
with open(homology_file, "wb") as f:
    f.write(response.content)

# Download MRK_Sequence.rpt
response = requests.get(mrk_url)
response.raise_for_status()
with open(mrk_file, "wb") as f:
    f.write(response.content)

# Download MRK_List2.rpt
response = requests.get(mrk_list_url)
response.raise_for_status()
with open(mrk_list_file, "wb") as f:
    f.write(response.content)

# Load files
df_mrk = pd.read_csv(mrk_file, sep="\t", dtype=str)
df_homology = pd.read_csv(homology_file, sep="\t", dtype=str, index_col = False)

df_mrk_list = pd.read_csv(mrk_list_file, sep="\t", dtype=str)
df_mrk_list= df_mrk_list[["MGI Accession ID","Marker Synonyms (pipe-separated)"]].rename(columns={"MGI Accession ID": "MGI Marker Accession ID",
                                                                                                  "Marker Synonyms (pipe-separated)": "Aliases"})
df_homology = df_homology.rename(columns={"MGI Accession ID": "MGI Marker Accession ID"})
# Keep only relevant columns from HGNC_AllianceHomology
columns_to_keep = [
    "MGI Marker Accession ID",
    "EntrezGene ID",
    "CCDS IDs",
    "HGNC ID"
]
df_homology_subset = df_homology[columns_to_keep]
# Merge: MRK + homology
df_merged = df_mrk.merge(df_homology_subset, how="left", on="MGI Marker Accession ID")

# Merge with synonyms
df_merged = df_merged.merge(df_mrk_list, how="left", on="MGI Marker Accession ID")

# Save result
df_merged.to_csv(merged_file, sep="\t", index=False)
print(f"Merged file saved to: {merged_file}")


Unnamed: 0,MGI Marker Accession ID,Aliases
0,MGI:1341858,
1,MGI:1341869,
2,MGI:1337005,
3,MGI:1918911,
4,MGI:1923503,
...,...,...
655272,MGI:2446208,
655273,MGI:2685277,LOC242610|1110046I03Rik|D4Mgi23|2810482G21Rik
655274,MGI:103072,R75157
655275,MGI:2444286,8430405D05Rik|C130099L13Rik


Unnamed: 0,MGI Marker Accession ID,EntrezGene ID,CCDS IDs,HGNC ID
0,MGI:87853,50518,CCDS16941.1,HGNC:745
1,MGI:87854,11287,CCDS39650.1,
2,MGI:87855,,,
3,MGI:87859,11350,CCDS15901.1|CCDS50563.1,HGNC:76
4,MGI:87860,11352,CCDS15393.1|CCDS48404.1,HGNC:77
...,...,...,...,...
90793,MGI:7834358,,,
90794,MGI:7834360,,,
90795,MGI:7834362,,,
90796,MGI:7855873,,,


Merged file saved to: data/MRK_Merged_20250730_MGI_DB.tsv


In [77]:
df_merged['Ensembl transcript IDs'][df_merged['Marker Symbol'] == "Tlr4"]

124340    ENSMUST00000048096|ENSMUST00000107365|ENSMUST0...
Name: Ensembl transcript IDs, dtype: object

In [76]:
df_merged.columns

Index(['MGI Marker Accession ID', 'Marker Symbol', 'Status', 'Marker Type',
       'Marker Name', 'cM position', 'Chromosome', 'Genome Coordinate Start',
       'Genome Coordinate End', 'Strand', 'GenBank IDs',
       'RefSeq transcript IDs', 'Ensembl transcript IDs', 'UniProt IDs',
       'TrEMBL IDs', 'Ensembl protein IDs', 'RefSeq protein IDs',
       'UniGene IDs', 'Feature Type', 'EntrezGene ID', 'CCDS IDs', 'HGNC ID',
       'Aliases'],
      dtype='object')

In [34]:
df_homology

Unnamed: 0,MGI Accession ID,Marker Symbol,Marker Name,Feature Type,EntrezGene ID,NCBI Gene chromosome,NCBI Gene start,NCBI Gene end,NCBI Gene strand,Ensembl Gene ID,Ensembl Gene chromosome,Ensembl Gene start,Ensembl Gene end,Ensembl Gene strand,CCDS IDs,HGNC ID
MGI:87853,a,nonagouti,protein coding gene,50518,2,154792519,154892932,+,ENSMUSG00000027596,2,154633322,154892932,+,CCDS16941.1,HGNC:745,
MGI:87854,Pzp,"PZP, alpha-2-macroglobulin like",protein coding gene,11287,6,128460530,128503683,-,ENSMUSG00000030359,6,128460530,128503683,-,CCDS39650.1,,
MGI:87855,Aal,active avoidance learning,heritable phenotypic marker,,,,,,,,,,,,,
MGI:87859,Abl1,"c-abl oncogene 1, non-receptor tyrosine kinase",protein coding gene,11350,2,31578256,31697105,+,ENSMUSG00000026842,2,31578388,31694239,+,CCDS15901.1|CCDS50563.1,HGNC:76,
MGI:87860,Abl2,"ABL proto-oncogene 2, non-receptor tyrosine ki...",protein coding gene,11352,1,156386160,156477189,+,ENSMUSG00000026596,1,156386356,156477138,+,CCDS15393.1|CCDS48404.1,HGNC:77,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MGI:7834358,Gm75820,"predicted gene, 75820",lncRNA gene,,,,,,ENSMUSG00000137503,2,164339005,164345988,-,,,
MGI:7834360,Gm75821,"predicted gene, 75821",lncRNA gene,,,,,,ENSMUSG00000138928,2,109876392,109963249,+,,,
MGI:7834362,Gm75822,"predicted gene, 75822",lncRNA gene,,,,,,ENSMUSG00000140534,2,52821585,52837904,+,,,
MGI:7855873,BPL001,bone phosphate low mutant 001,heritable phenotypic marker,,,,,,,,,,,,,


In [30]:
df_homology_subset = df_homology[columns_to_keep]

NameError: name 'df_homology_subset' is not defined

KeyError: "None of [Index(['MGI Marker Accession ID', 'NCBI Gene ID',\n       'CCDS IDs (comma delimited)'],\n      dtype='object')] are in the [columns]"

In [2]:
import os
import jinja2
import sys
import pandas as pd
import numpy as np
import time
import base64
import re

# Add the src directory to the path for importing modules
sys.path.append(os.path.abspath("src"))

# Import necessary modules from your existing src files
# Ensure createDataTable and createFunctionalAnnotTable are in your 'src' directory
from createDataTable import pop_up_info, gene_pair0, generate_perplexity_links, gene_pair00, is_mouse_specific
from createFunctionalAnnotTable import gene_pair_annot_ligand, gene_pair_annot_receptor

# Test or all
test = False
test_genes = ["H60a Klrk1", "H60b Klrk1", "VEGFA NRP1", "THPO MPL", "FGF1 FGFR3"] # Example genes
# --- Paths ---
MERGED_TEMPLATE_PATH = 'HTML/mergedCard_tabs.html'
OUTPUT_DIR = 'data/cards/' # New output directory for combined files

# function for replacing visible text:
def update_link_text_with_symbol(html_str, new_symbol):
    """
    Replace the visible text in an anchor tag with the provided symbol.
    The HGNC ID is extracted from the href and left unchanged.
    """
    # Only proceed if input is valid
    if not isinstance(html_str, str) or not html_str.strip():
        return html_str
    
    return re.sub(r">(.*?)</a>", f">{new_symbol}</a>", html_str) #<i class='fa-solid fa-arrow-up-right-from-square' style='margin-left:4px;'></i></a>

# Create Perplexity link

# Recreate Perplexity link
def create_url_basic(symbol):
          label = "Perplexity (LLM)"
          query = f"What diseases is {symbol} implicated in?"
          encoded_query = query.replace(" ", "%20")
          output = f'<a href="https://www.perplexity.ai/search?q={encoded_query}" target="_blank">{label}</a>' #<i class="fa-solid fa-arrow-up-right-from-square" style="margin-left:4px;"></i>
          return output

# --- Load and Preprocess Data (Combined from both scripts) ---

# Load PubMed data (from createPMIDpages.py)
pubmed_data = pd.read_csv("data/pubmed_results.csv")
pubmed_data["Year"] = pubmed_data["Year"].astype(str).str.replace(".0", "", regex=False).astype(int)
pubmed_data["PMID"] = pubmed_data["PMID"].astype(str)
pubmed_data = pubmed_data.reset_index(drop=True)

# remove for now
# # Load LLM results (from createPMIDpages.py)
# bio_keywords = pd.read_csv("data/llm_results.csv")

# --- Prepare gene_pair00 for PMID section (from createPMIDpages.py) ---
# gene_pair00 is used for PMID and Keywords, so it needs the '—' placeholder
# Ensure gene_pair00 is a copy to avoid SettingWithCopyWarning later
gene_pair00_copy = gene_pair00.copy()

  pop_up_info = pd.read_table("data/HGNC_gene_info_full.tsv")


In [6]:
gene_pair000 = gene_pair00.copy()
gene_pair000["LR Pair Card"] = gene_pair000["LR Pair Card"].astype(str)
# --- Prepare gene_pair0 for Card section (from createCards.py) ---
# gene_pair0 is used for card details, it should retain spaces for splitting gene names
# Ensure gene_pair0 is a copy to avoid SettingWithCopyWarning later
gene_pair0_copy = gene_pair0.copy()

In [7]:
# grab the pairs by interaction id that has ligand that is mouse_specific
mouse_interaction_ids = gene_pair0_copy[gene_pair0_copy['Ligand'].apply(is_mouse_specific)]['Interaction ID'].tolist()

In [12]:
from createDataTable import pop_up_info, gene_pair0, generate_perplexity_links, gene_pair00, is_mouse_specific, grab_mouse_info

In [22]:
mouse_interaction_ids = gene_pair0_copy["Interaction ID"][gene_pair0_copy["Human evidence"] == "absent in human"]

In [24]:
gene_pair0_copy[gene_pair0_copy["Interaction ID"].isin(mouse_interaction_ids)]

Unnamed: 0,Interaction ID,LR Pair Card,Human LR Pair,Ligand,Receptor,Ligand Symbols,Receptor Symbols,Ligand Location,Receptor Location,Ligand HGNC ID,Receptor HGNC ID,Perplexity,Human evidence,PMID
3561,CDB03562,CD99 Pilrb1,no human ortholog,CD99,Pilrb1,"<span title=""CD99 (MIC2, MIC2X, MIC2Y)"">CD99 (...","<span title=""no human ortholog"">no human ortho...","<span title=""based on perplexity"">cell membran...",unknown,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.perplexity.ai/search?q=Wh...",absent in human,"<a href=""https://comp.med.yokohama-cu.ac.jp/co..."
3562,CDB03563,Ccl21C CXCR3,no human ortholog,Ccl21C,CXCR3,"<span title=""no human ortholog"">no human ortho...","<span title=""CXCR3 (GPR9, CKR-L2, CMKAR3, IP10...",unknown,"<span title=""based on perplexity, uniprot"">cel...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.perplexity.ai/search?q=Wh...",absent in human,"<a href=""https://comp.med.yokohama-cu.ac.jp/co..."
3563,CDB03564,Ccl21C Cxcr3,no human ortholog,Ccl21C,Cxcr3,"<span title=""no human ortholog"">no human ortho...","<span title=""no human ortholog"">no human ortho...",unknown,"<span title=""based on uniprot"">cell membrane</...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.perplexity.ai/search?q=Wh...",absent in human,"<a href=""https://comp.med.yokohama-cu.ac.jp/co..."
3564,CDB03565,Defb2 Ccr6,no human ortholog,Defb2,Ccr6,"<span title=""no human ortholog"">no human ortho...","<span title=""no human ortholog"">no human ortho...","<span title=""based on uniprot"">secreted</span>","<span title=""based on uniprot"">cell membrane</...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.perplexity.ai/search?q=Wh...",absent in human,"<a href=""https://comp.med.yokohama-cu.ac.jp/co..."
3565,CDB03566,Defb2 Tlr4,no human ortholog,Defb2,Tlr4,"<span title=""no human ortholog"">no human ortho...","<span title=""no human ortholog"">no human ortho...","<span title=""based on uniprot"">secreted</span>","<span title=""based on perplexity, uniprot"">cel...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.perplexity.ai/search?q=Wh...",absent in human,"<a href=""https://comp.med.yokohama-cu.ac.jp/co..."
3566,CDB03567,Fcna Tlr4,no human ortholog,Fcna,Tlr4,"<span title=""no human ortholog"">no human ortho...","<span title=""no human ortholog"">no human ortho...",unknown,"<span title=""based on perplexity, uniprot"">cel...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.perplexity.ai/search?q=Wh...",absent in human,"<a href=""https://comp.med.yokohama-cu.ac.jp/co..."
3567,CDB03568,H60A Klrk1,no human ortholog,H60A,Klrk1,"<span title=""H60A"">H60A</span>","<span title=""no human ortholog"">no human ortho...",unknown,"<span title=""based on perplexity, uniprot"">cel...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.perplexity.ai/search?q=Wh...",absent in human,"<a href=""https://comp.med.yokohama-cu.ac.jp/co..."
3568,CDB03569,H60B Klrk1,no human ortholog,H60B,Klrk1,"<span title=""H60B"">H60B</span>","<span title=""no human ortholog"">no human ortho...",unknown,"<span title=""based on perplexity, uniprot"">cel...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.perplexity.ai/search?q=Wh...",absent in human,"<a href=""https://comp.med.yokohama-cu.ac.jp/co..."
3569,CDB03570,H60C Klrk1,no human ortholog,H60C,Klrk1,"<span title=""H60C"">H60C</span>","<span title=""no human ortholog"">no human ortho...",unknown,"<span title=""based on perplexity, uniprot"">cel...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.perplexity.ai/search?q=Wh...",absent in human,"<a href=""https://comp.med.yokohama-cu.ac.jp/co..."
3570,CDB03571,Lair1 Lilrb4A,no human ortholog,Lair1,Lilrb4A,"<span title=""no human ortholog"">no human ortho...","<span title=""no human ortholog"">no human ortho...","<span title=""based on uniprot"">cell membrane</...",unknown,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.perplexity.ai/search?q=Wh...",absent in human,"<a href=""https://comp.med.yokohama-cu.ac.jp/co..."


In [10]:
import os
import jinja2
import sys
import pandas as pd
import numpy as np
import time
import base64
import re

# Add the src directory to the path for importing modules
sys.path.append(os.path.abspath("src"))

# Import necessary modules from your existing src files
# Ensure createDataTable and createFunctionalAnnotTable are in your 'src' directory
from createDataTable import pop_up_info, gene_pair0, generate_perplexity_links, gene_pair00, is_mouse_specific
from createFunctionalAnnotTable import gene_pair_annot_ligand, gene_pair_annot_receptor

# Test or all
test = False
test_genes = ["H60a Klrk1", "H60b Klrk1", "VEGFA NRP1", "THPO MPL", "FGF1 FGFR3"] # Example genes
# --- Paths ---
MERGED_TEMPLATE_PATH = 'HTML/mergedCard_tabs.html'
OUTPUT_DIR = 'data/cards/' # New output directory for combined files


In [9]:
gene_pair_annot_receptor

Unnamed: 0,Receptor HGNC ID,Receptor Symbols,Receptor Location,Receptor group,root_group_id
0,"<a href=""https://www.genenames.org/data/gene-s...","<span title=""<span title=""HSPA5 (GRP78, BiP)"">...","<span title=""based on PMID: 12194978, PMID: 32...","<a href=""https://www.genenames.org/data/genegr...",582
1,"<a href=""https://www.genenames.org/data/gene-s...","<span title=""<span title=""LRP1 (APR, A2MR, LRP...","<span title=""based on hpa, perplexity, uniprot...","<a href=""https://www.genenames.org/data/genegr...",634
2,"<a href=""https://www.genenames.org/data/gene-s...","<span title=""<span title=""LRP1 (APR, A2MR, LRP...","<span title=""based on hpa, perplexity, uniprot...","<a href=""https://www.genenames.org/data/genegr...",471
3,"<a href=""https://www.genenames.org/data/gene-s...","<span title=""<span title=""LRP1 (APR, A2MR, LRP...","<span title=""based on hpa, perplexity, uniprot...","<a href=""https://www.genenames.org/data/genegr...",1690
4,"<a href=""https://www.genenames.org/data/gene-s...","<span title=""<span title=""BDKRB2 (BK-2)"">BDKRB...","<span title=""based on hpa, perplexity, uniprot...","<a href=""https://www.genenames.org/data/genegr...",2054
...,...,...,...,...,...
1513,"<a href=""https://www.genenames.org/data/gene-s...","<span title=""<span title=""no human ortholog"">n...","<span title=""based on perplexity, uniprot"">cel...","<a href=""https://www.genenames.org/data/genegr...",621
1514,"<a href=""https://www.genenames.org/data/gene-s...","<span title=""<span title=""no human ortholog"">n...","<span title=""based on perplexity, uniprot"">cel...","<a href=""https://www.genenames.org/data/genegr...",1298
1515,"<a href=""https://www.genenames.org/data/gene-s...","<span title=""<span title=""no human ortholog"">n...","<span title=""based on perplexity, uniprot"">cel...","<a href=""https://www.genenames.org/data/genegr...",471
1516,"<a href=""https://www.genenames.org/data/gene-s...","<span title=""<span title=""no human ortholog"">n...","<span title=""based on uniprot"">cell membrane</...",unknown,


In [151]:
# Clean the column in the existing gene_group
hgnc_id_column = fetchGSheet.gene_group["hgnc_id"].astype(str).str.strip()

# Clean the input IDs
cleaned_input_ids = pd.Series(hgnc_id).astype(str).str.strip()

# Find missing IDs
missing_ids = cleaned_input_ids[~cleaned_input_ids.isin(hgnc_id_column)]

# Optionally print the whole list without truncation
print("Missing HGNC IDs:")
print(missing_ids.to_string(index=False))

# Save to CSV
missing_ids_df = pd.DataFrame({"missing_hgnc_id": missing_ids})
missing_ids_df.to_csv("missing_hgnc_ids.csv", index=False)

Missing HGNC IDs:
  HGNC:607
HGNC:13916
          
      #N/A
HGNC:16721
HGNC:16722
HGNC:16451
HGNC:21969
 HGNC:2475
HGNC:20594
 HGNC:3600
 HGNC:5474
 HGNC:6552
 HGNC:9030
HGNC:10468
HGNC:10469
HGNC:10515
HGNC:10740
HGNC:10776
HGNC:20323
HGNC:30668
  HGNC:281
HGNC:12338
 HGNC:8535
HGNC:12449
 HGNC:3608
 HGNC:4471
HGNC:14687
 HGNC:4469
HGNC:14901
HGNC:10875
HGNC:14558
HGNC:17867
 HGNC:6693
 HGNC:1665


In [149]:
cleaned_hgnc_ids_column = cleaned_id.astype(str).str.strip()

# Print all the cleaned IDs without truncation
print(cleaned_hgnc_ids_column.to_string())

AttributeError: 'str' object has no attribute 'astype'

In [117]:
# Quick check if there is mouse-specific
gene_pair['Ligand'] = gene_pair.apply(
    lambda row: "no human ortholog" if is_mouse_specific(row['Ligand']) else row['Ligand'],
    axis=1
)
gene_pair['Receptor'] = gene_pair.apply(
    lambda row: "no human ortholog" if is_mouse_specific(row['Receptor']) else row['Receptor'],
    axis=1
)

# gene symbol
gene_pair["Ligand"] = [
    f'<span title="{ligand_name}">{ligand_symbol}</span>'
    for ligand_name, ligand_symbol in zip(gene_pair["Ligand Name"], 
                                              gene_pair["Ligand"])
]
# gene symbol
gene_pair["Receptor"] = [
    f'<span title="{receptor_name}">{receptor_symbol}</span>'
    for receptor_name, receptor_symbol in zip(gene_pair["Receptor Name"], 
                                              gene_pair["Receptor"])
]


In [120]:
gene_pair = gene_pair.drop(columns=["Ligand Name", "Receptor Name"])


# Create the links to the HTML cards
gene_pair["Human LR Pair"] = [
    f'<a href="https://comp.med.yokohama-cu.ac.jp/collab/connectomeDB/cards/{lrPairOrig.replace(" ","-")}.html">{lrPair}</a>'
    for lrPairOrig, lrPair in zip(gene_pair0["Human LR Pair"], gene_pair["Human LR Pair"])
]




# Add tooltips to the column headers
gene_pair.columns = [
    f'<span title="Ligand-Receptor Interacting Pair, as described in Liu et al. (PMID: XXXXXX)">{col}</span>' if col == "Human LR Pair" else
    f'<span title="Click the logo below to run Perplexity on the Human LR pair">{col}&nbsp;</span>' if col == "Perplexity" else
    f'<span title="Official Gene Symbol; Hover on symbols below to show gene names">{col}&nbsp;&nbsp;&nbsp;</span>' if col in ["Ligand", "Receptor"] else
    f'<span title="HUGO Gene Nomenclature Committee (HGNC) ID. Click on the link for more details">{col}&nbsp;&nbsp;</span>' if col in ["Ligand HGNC ID", "Receptor HGNC ID"] else
    f'<span title=" PubMed IDs (PMID) with Literature Evidence for LR Interaction. Click on the link for more details">{col}</span>' if col == "PMID" else
    f'<span title="Rat Genome Database (RGD) ID. Click on the link for more details">{col}</span>' if col in ["Ligand RGD ID", "Receptor RGD ID"] else
    f'<span title="Mouse Genome Informatics (MGI) ID. Click on the link for more details">{col}</span>' if col in ["Ligand MGI ID", "Receptor MGI ID"]else
    f'<span title="Zebrafish Information Network (ZFIN) ID. Click on the link for more details">{col}</span>' if col in ["Ligand ZFIN ID", "Receptor ZFIN ID"] else
    f'<span title="Location based on the predicted subcellular localization of the human proteome">{col}</span>' if col in ["Ligand Location", "Receptor Location"] else
    f'<span title="Double-click header of {col} to ensure all values are shown">{col}&nbsp;</span>'
    for col in gene_pair.columns
]
gene_pair = gene_pair.reset_index(drop=True)  # Remove the index

#######################################################################
# Identify the column(s) that contain '(PMID)' and temporarily remove for presubmission
pmid_cols = [col for col in gene_pair.columns if '(PMID)' in col]
gene_pair = gene_pair.drop(columns=pmid_cols)
#######################################################################

gene_pair000 = gene_pair.copy()

keywords_to_modify = ["Ligand", "Receptor"]
exclude_keywords = ["HGNC ID", "Location", "Human"]  # Columns containing this will not be modified

# Copy the original columns so we can modify only the first 10
new_columns = gene_pair000.columns.tolist()

KeyError: "['Ligand Name', 'Receptor Name'] not found in axis"

In [119]:
new_columns

['<span title="Double-click header of LR Pair Card to ensure all values are shown">LR Pair Card&nbsp;</span>',
 '<span title="Ligand-Receptor Interacting Pair, as described in Liu et al. (PMID: XXXXXX)">Human LR Pair</span>',
 '<span title="Official Gene Symbol; Hover on symbols below to show gene names">Ligand&nbsp;&nbsp;&nbsp;</span>',
 '<span title="Official Gene Symbol; Hover on symbols below to show gene names">Receptor&nbsp;&nbsp;&nbsp;</span>',
 '<span title="Double-click header of Ligand Symbols to ensure all values are shown">Ligand Symbols&nbsp;</span>',
 '<span title="Double-click header of Receptor Symbols to ensure all values are shown">Receptor Symbols&nbsp;</span>',
 '<span title="Location based on the predicted subcellular localization of the human proteome, as described in Ramilowski et al. (PMID: 26198319)">Ligand Location</span>',
 '<span title="Location based on the predicted subcellular localization of the human proteome, as described in Ramilowski et al. (PMID: 26

In [92]:
human_rows = gene_pair[~(gene_pair["Human evidence"] == "absent in human")]
mouse_rows = gene_pair[gene_pair["Human evidence"] == "absent in human"] 

In [95]:
# Concatenate the DataFrames: rows with IDs first, then rows without IDs
gene_pair = pd.concat([human_rows, mouse_rows]).reset_index(drop=True)
DBlength = len(gene_pair)
gene_pair["Interaction ID"] = [f"CDB{str(i).zfill(5)}" for i in range(1, DBlength + 1)]

In [122]:
human_columns = [col for col in gene_pair000.columns][:16]

In [125]:
evidence_cols = [col for col in gene_pair.columns if 'Human evidence' in col]
evidence_cols

['<span title="Double-click header of Human evidence to ensure all values are shown">Human evidence&nbsp;</span>']

In [129]:
human_gene_pair = gene_pair[~(gene_pair[evidence_cols[0]] == "absent in human")]
human_gene_pair

Unnamed: 0,"<span title=""Double-click header of LR Pair Card to ensure all values are shown"">LR Pair Card&nbsp;</span>","<span title=""Ligand-Receptor Interacting Pair, as described in Liu et al. (PMID: XXXXXX)"">Human LR Pair</span>","<span title=""Official Gene Symbol; Hover on symbols below to show gene names"">Ligand&nbsp;&nbsp;&nbsp;</span>","<span title=""Official Gene Symbol; Hover on symbols below to show gene names"">Receptor&nbsp;&nbsp;&nbsp;</span>","<span title=""Double-click header of Ligand Symbols to ensure all values are shown"">Ligand Symbols&nbsp;</span>","<span title=""Double-click header of Receptor Symbols to ensure all values are shown"">Receptor Symbols&nbsp;</span>","<span title=""Location based on the predicted subcellular localization of the human proteome, as described in Ramilowski et al. (PMID: 26198319)"">Ligand Location</span>","<span title=""Location based on the predicted subcellular localization of the human proteome, as described in Ramilowski et al. (PMID: 26198319)"">Receptor Location</span>","<span title=""HUGO Gene Nomenclature Committee (HGNC) ID. Click on the link for more details"">Ligand HGNC ID&nbsp;&nbsp;</span>","<span title=""HUGO Gene Nomenclature Committee (HGNC) ID. Click on the link for more details"">Receptor HGNC ID&nbsp;&nbsp;</span>","<span title=""Click the logo below to run Perplexity on the Human LR pair"">Perplexity&nbsp;</span>","<span title=""Double-click header of Human evidence to ensure all values are shown"">Human evidence&nbsp;</span>"
0,A2M HSPA5,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...","<span title=""alpha-2-macroglobulin"">A2M</span>","<span title=""heat shock protein family A (Hsp7...","<span title=""A2M (FWP007, S863-7, CPAMD5)"">A2M...","<span title=""HSPA5 (GRP78, BiP)"">HSPA5 (GRP78,...","<span title=""based on perplexity, uniprot"">sec...","<span title=""based on PMID: 12194978, PMID: 32...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.perplexity.ai/search?q=Wh...",DIRECT
1,A2M LRP1,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...","<span title=""alpha-2-macroglobulin"">A2M</span>","<span title=""LDL receptor related protein 1"">L...","<span title=""A2M (FWP007, S863-7, CPAMD5)"">A2M...","<span title=""LRP1 (APR, A2MR, LRP, CD91, LRP1A...","<span title=""based on perplexity, uniprot"">sec...","<span title=""based on hpa, perplexity, uniprot...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.perplexity.ai/search?q=Wh...",DIRECT
2,ACE BDKRB2,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...","<span title=""angiotensin I converting enzyme"">...","<span title=""bradykinin receptor B2"">BDKRB2</s...","<span title=""ACE (DCP1, ACE1, CD143)"">ACE (DCP...","<span title=""BDKRB2 (BK-2)"">BDKRB2 (BK-2)</span>","<span title=""based on perplexity, uniprot"">cel...","<span title=""based on hpa, perplexity, uniprot...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.perplexity.ai/search?q=Wh...",DIRECT
3,ADA DPP4,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...","<span title=""adenosine deaminase"">ADA</span>","<span title=""dipeptidyl peptidase 4"">DPP4</span>","<span title=""ADA (ADA1)"">ADA (ADA1)</span>","<span title=""DPP4 (CD26, ADCP2, DPPIV)"">DPP4 (...","<span title=""based on hpa, uniprot"">cell membr...","<span title=""based on perplexity, uniprot"">cel...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.perplexity.ai/search?q=Wh...",CONSERVATION
4,ADAM10 EFNA5,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...","<span title=""ADAM metallopeptidase domain 10"">...","<span title=""ephrin A5"">EFNA5</span>","<span title=""ADAM10 (kuz, MADM, HsT18717, CD15...","<span title=""EFNA5 (EPLG7, AF1, LERK7)"">EFNA5 ...","<span title=""based on hpa, perplexity, uniprot...","<span title=""based on perplexity, uniprot"">cel...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.perplexity.ai/search?q=Wh...",CONSERVATION
...,...,...,...,...,...,...,...,...,...,...,...,...
3556,not conserved CCR1,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...","<span title=""nan"">no human ortholog</span>","<span title=""C-C motif chemokine receptor 1"">n...","<span title=""no human ortholog"">no human ortho...","<span title=""no human ortholog"">no human ortho...",unknown,unknown,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.perplexity.ai/search?q=Wh...",not conserved
3557,not conserved CCR2,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...","<span title=""nan"">no human ortholog</span>","<span title=""C-C motif chemokine receptor 2"">n...","<span title=""no human ortholog"">no human ortho...","<span title=""no human ortholog"">no human ortho...",unknown,unknown,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.perplexity.ai/search?q=Wh...",not conserved
3558,not conserved CCR7,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...","<span title=""nan"">no human ortholog</span>","<span title=""C-C motif chemokine receptor 7"">n...","<span title=""no human ortholog"">no human ortho...","<span title=""no human ortholog"">no human ortho...",unknown,unknown,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.perplexity.ai/search?q=Wh...",not conserved
3559,not conserved CXCR3,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...","<span title=""nan"">no human ortholog</span>","<span title=""C-X-C motif chemokine receptor 3""...","<span title=""no human ortholog"">no human ortho...","<span title=""no human ortholog"">no human ortho...",unknown,unknown,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.perplexity.ai/search?q=Wh...",not conserved


In [None]:
# create Perplexity link
def create_url_basic(perplexity_col):
    query = f"What is the primary evidence that {perplexity_col} bind-each-other-as-a-ligand-and-receptor-pair. Exclude reviews, uniprot, wiki, genecards, PIPS, iuphar as sources."
    encoded_query = query.replace(" ", "%20")
    return f"https://www.perplexity.ai/search?q={encoded_query}"
    
# Option 2 -- new query all together

# def generate_perplexity_link_pmid(row): 
#     query = f"What-is-the-biological-relevance-of-the-ligand-and-receptor-pair-{row['Human LR Pair']}-based-on-Pubmed-ID-{row['PMID']}"
#     return (
#          f'<a href="https://www.perplexity.ai/search?q={query}" target="_blank">'
#         f'<img src="https://img.icons8.com/?size=30&id=0NbBuNOxUwps&format=png&color=000000" alt="Perplexity AI" /></a>'
#     )

# cannot use perplexity logo
def generate_perplexity_link_pmid(row): 
    query = f"What-is-the-biological-relevance-of-the-ligand-and-receptor-pair-{row['Human LR Pair']}-based-on-Pubmed-ID-{row['PMID']}"
    return (
         f'<a href="https://www.perplexity.ai/search?q={query}" target="_blank" style="text-decoration: none;">&#128269;</a>'
    )

# Apply function to the DataFrame
gene_pair["Perplexity"] = gene_pair.apply(generate_perplexity_link_pmid, axis=1)

In [99]:
# ligand
gene_pair["Ligand HGNC ID"] = [
    '<a href="https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/{}" target="_blank">{}</a>'.format(ligand, ligand)
    for ligand in gene_pair["Ligand HGNC ID"]
]

# receptor
gene_pair["Receptor HGNC ID"] = [
    '<a href="https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/{}" target="_blank">{}</a>'.format(receptor, receptor)
    for receptor in gene_pair["Receptor HGNC ID"]
]


# Function to generate hyperlinks for the "PMID support" column
def generate_links_with_doi(df, gene_column, pmid_column, id_column):
    def create_link(gene, id_col, sources):
        # Replace spaces with "——" in the gene name for the link
        gene_name = gene.replace(" ", "—")
        
        if len(sources) == 1:
            source = sources[0]
            if source.startswith("https://www.biorxiv.org/content/"):
                # If the value starts with "https://doi.org/", use it as the hyperlink
                return f'<a href="{source}" target="_blank">BioRxiv</a>'
            else:
                # If it's a single PMID, hyperlink the PMID text
                return f'<a href="https://comp.med.yokohama-cu.ac.jp/collab/connectomeDB/cards/{gene}.html">{source}</a>'
        else:
            # If multiple PMIDs, show the count and hyperlink to the page
            return f'<a href="https://comp.med.yokohama-cu.ac.jp/collab/connectomeDB/cards/{gene}.html" target="_blank">{len(sources)} PMIDs</a>'

    # Process each row to generate the "PMID" column # was "PMID support"
    df["PMID"] = [
        create_link(
            gene=row[gene_column], 
            id_col = row[id_column],
            sources=[s.strip() for s in row[pmid_column].split(',') if s.strip()]
        )
        for _, row in df.iterrows()
    ]
    return df


# Generate the links for the "PMID" column # was "PMID support"
gene_pair = generate_links_with_doi(gene_pair, gene_column="Human LR Pair", 
                                    pmid_column="PMID", id_column= "Interaction ID")

# for disease type, cancer-related and top pathways, when missing say "ask Perplexity"


def generate_perplexity_kegglinks(
    df,
    pathway_col="KEGG Pathway",
    default_query_template="What-biological or other functional-pathways-is-the-ligand-receptor-{pair}-associated-with"
):
    def create_link(row):
        value = row.get(pathway_col, "")
        
        if pd.isna(value) or str(value).strip().lower() in ["nan", "none", "", "unknown"]:
            pair = row["Human LR Pair"]
            label = "ask Perplexity"
            query = default_query_template.format(pair=pair)
            encoded_query = urllib.parse.quote(query)
            return f'<a href="https://www.perplexity.ai/search?q={encoded_query}" target="_blank">{label}</a>'
        else:
            return value

    df[pathway_col] = df.apply(create_link, axis=1)
    return df

gene_pair = generate_perplexity_kegglinks(gene_pair, pathway_col="KEGG Pathway")
    
def generate_perplexity_links(df, pathway_col, default_query_template):
    def create_link(row):
        pathway_value = str(row[pathway_col]).strip().lower()
        pair = row["Human LR Pair"]
        
        if pd.isna(row[pathway_col]) or pathway_value in ["nan", "none", "", "unknown"]:
            label = "ask Perplexity"
            query = default_query_template.format(pair=pair)
            output =  f'<a href="https://www.perplexity.ai/search?q={query}" target="_blank">{label}</a>'
        else:
            label = row[pathway_col]
            query = f"What-is-the-role-of-the-ligand-and-receptor-pair-{pair}-in-{label}"
            output = f'{label} (see <a href="https://www.perplexity.ai/search?q={query}" target="_blank">evidence in Perplexity</a>)'
        
        return output
    
    df[pathway_col] = df.apply(create_link, axis=1)
    return df


In [100]:
def add_geneToolTip(species):
    def tooltip_html(symbol, name):
        return (
            f'<span class="tooltip">{symbol}'
            f'<span class="tooltiptext">{name}</span></span>'
        )

    gene_pair[species + " Ligand"] = [
        tooltip_html(ligand_symbol, ligand_name)
        for ligand_name, ligand_symbol in zip(gene_pair[species + " Ligand Name"], gene_pair[species + " Ligand"])
    ]
    gene_pair[species + " Receptor"] = [
        tooltip_html(receptor_symbol, receptor_name)
        for receptor_name, receptor_symbol in zip(gene_pair[species + " Receptor Name"], gene_pair[species + " Receptor"])
    ]

### Remove tooltip for name for each species for now as only zebrafish has the proper names ###     
# speciesPrime_list = ["Zebrafish"]
# # Loop through each species and update gene_pair
# for species in speciesPrime_list:
#    gene_pair = add_geneToolTip(species)

mouse_columns = ['Mouse Ligand', 'Mouse Receptor','Ligand MGI ID','Receptor MGI ID'] 
rat_columns = ['Rat Ligand','Rat Receptor','Ligand RGD ID','Receptor RGD ID']
zebrafish_columns = ['Zebrafish Ligand','Zebrafish Receptor','Ligand ZFIN ID','Receptor ZFIN ID']

# List of prefixes
prefixes = ("Chimpanzee", "Chicken", "Pig", "Cow", "Dog", "Horse", "Sheep", "Marmoset", "Macaque", "Frog")

In [109]:
gene_pair0 = gene_pair[first_columns+["PMID"]]
gene_pair = gene_pair[first_columns]

KeyError: "None of [Index(['LR Pair Card', 'Human LR Pair', 'Ligand', 'Receptor', 'Ligand Symbols',\n       'Receptor Symbols', 'Ligand Location', 'Receptor Location',\n       'Ligand HGNC ID', 'Receptor HGNC ID', 'Perplexity', 'Human evidence',\n       'PMID'],\n      dtype='object')] are in the [columns]"

In [None]:
# Quick check if there is mouse-specific
gene_pair['Ligand'] = gene_pair.apply(
    lambda row: "no human ortholog" if is_mouse_specific(row['Ligand']) else row['Ligand'],
    axis=1
)
gene_pair['Receptor'] = gene_pair.apply(
    lambda row: "no human ortholog" if is_mouse_specific(row['Receptor']) else row['Receptor'],
    axis=1
)

In [104]:
# gene symbol
gene_pair["Ligand"] = [
    f'<span title="{ligand_name}">{ligand_symbol}</span>'
    for ligand_name, ligand_symbol in zip(gene_pair["Ligand Name"], 
                                              gene_pair["Ligand"])
]
# gene symbol
gene_pair["Receptor"] = [
    f'<span title="{receptor_name}">{receptor_symbol}</span>'
    for receptor_name, receptor_symbol in zip(gene_pair["Receptor Name"], 
                                              gene_pair["Receptor"])
]


In [105]:
gene_pair = gene_pair.drop(columns=["Ligand Name", "Receptor Name"])


# Create the links to the HTML cards
gene_pair["Human LR Pair"] = [
    f'<a href="https://comp.med.yokohama-cu.ac.jp/collab/connectomeDB/cards/{lrPairOrig.replace(" ","-")}.html">{lrPair}</a>'
    for lrPairOrig, lrPair in zip(gene_pair0["Human LR Pair"], gene_pair["Human LR Pair"])
]




# Add tooltips to the column headers
gene_pair.columns = [
    f'<span title="Ligand-Receptor Interacting Pair, as described in Liu et al. (PMID: XXXXXX)">{col}</span>' if col == "Human LR Pair" else
    f'<span title="Click the logo below to run Perplexity on the Human LR pair">{col}&nbsp;</span>' if col == "Perplexity" else
    f'<span title="Official Gene Symbol; Hover on symbols below to show gene names">{col}&nbsp;&nbsp;&nbsp;</span>' if col in ["Ligand", "Receptor"] else
    f'<span title="HUGO Gene Nomenclature Committee (HGNC) ID. Click on the link for more details">{col}&nbsp;&nbsp;</span>' if col in ["Ligand HGNC ID", "Receptor HGNC ID"] else
    f'<span title=" PubMed IDs (PMID) with Literature Evidence for LR Interaction. Click on the link for more details">{col}</span>' if col == "PMID" else
    f'<span title="Rat Genome Database (RGD) ID. Click on the link for more details">{col}</span>' if col in ["Ligand RGD ID", "Receptor RGD ID"] else
    f'<span title="Mouse Genome Informatics (MGI) ID. Click on the link for more details">{col}</span>' if col in ["Ligand MGI ID", "Receptor MGI ID"]else
    f'<span title="Zebrafish Information Network (ZFIN) ID. Click on the link for more details">{col}</span>' if col in ["Ligand ZFIN ID", "Receptor ZFIN ID"] else
    f'<span title="Location based on the predicted subcellular localization of the human proteome, as described in Ramilowski et al. (PMID: 26198319)">{col}</span>' if col in ["Ligand Location", "Receptor Location"] else
    f'<span title="Double-click header of {col} to ensure all values are shown">{col}&nbsp;</span>'
    for col in gene_pair.columns
]

gene_pair = gene_pair.reset_index(drop=True)  # Remove the index

In [107]:
gene_pair000 = gene_pair.copy()

In [108]:
keywords_to_modify = ["Ligand", "Receptor"]
exclude_keywords = ["HGNC ID", "Location", "Human"]  # Columns containing this will not be modified

# Copy the original columns so we can modify only the first 10
new_columns = gene_pair000.columns.tolist()

# Modify only the first 10 columns
new_columns[:10] = [
    f'{col.split(">")[0]}">Human {col.split(">")[1]}</span>'
    if any(keyword in col for keyword in keywords_to_modify) and not any(exclude in col for exclude in exclude_keywords)
    else col
    for col in new_columns[:10]
]
new_columns

['<span title="Double-click header of LR Pair Card to ensure all values are shown">LR Pair Card&nbsp;</span>',
 '<span title="Ligand-Receptor Interacting Pair, as described in Liu et al. (PMID: XXXXXX)">Human LR Pair</span>',
 '<span title="Official Gene Symbol; Hover on symbols below to show gene names"">Human Ligand&nbsp;&nbsp;&nbsp;</span</span>',
 '<span title="Official Gene Symbol; Hover on symbols below to show gene names"">Human Receptor&nbsp;&nbsp;&nbsp;</span</span>',
 '<span title="Double-click header of Ligand Symbols to ensure all values are shown"">Human Ligand Symbols&nbsp;</span</span>',
 '<span title="Double-click header of Receptor Symbols to ensure all values are shown"">Human Receptor Symbols&nbsp;</span</span>',
 '<span title="Location based on the predicted subcellular localization of the human proteome, as described in Ramilowski et al. (PMID: 26198319)">Ligand Location</span>',
 '<span title="Location based on the predicted subcellular localization of the human p

In [50]:
# This is crucial for consistent handling by the function before processing "N/A".
gene_pair['Receptor'] = gene_pair['Receptor'].fillna('')
gene_pair['Receptor Old symbol'] = gene_pair['Receptor Old symbol'].fillna('')
gene_pair['Receptor Aliases'] = gene_pair['Receptor Aliases'].fillna('')

gene_pair['Receptor Symbols'] = gene_pair.apply(
    lambda row: "no human ortholog" if is_mouse_specific(row['Receptor']) 
                else format_symbol_aliases(row['Receptor'], row['Receptor Old symbol'], row['Receptor Aliases']),
    axis=1
)

### tooltips 
gene_pair["Ligand Symbols"] = [
    f'<span title="{aliases}">{aliases}</span>'
    for aliases in gene_pair["Ligand Symbols"]
]
gene_pair["Receptor Symbols"] = [
    f'<span title="{aliases}">{aliases}</span>'
    for aliases in gene_pair["Receptor Symbols"]
]


In [52]:
# Drop columns where all values are NA in gene_pair
gene_pair = gene_pair.dropna(axis=1, how='all')

gene_pair = gene_pair.fillna(" ")
gene_pair = gene_pair[gene_pair['Human LR Pair'] != ' ']

In [55]:
mask = gene_pair['Ligand HGNC ID'].astype(str).str.strip() == ''
gene_pair.loc[mask, 'Mouse Ligand'] = gene_pair.loc[mask, 'Ligand']

In [62]:
grab_mouse_info = gene_pair["LR Pair Card"][gene_pair["Human evidence"] == "absent in human"]
grab_mouse_info = grab_mouse_info.unique()
grab_mouse_info

array(['Ccl21C Cxcr3', 'Ccl21C CXCR3', 'CD99 Pilrb1', 'Defb2 Ccr6',
       'Defb2 Tlr4', 'Fcna Tlr4', 'H60A Klrk1', 'H60B Klrk1',
       'H60C Klrk1', 'Lair1 Lilrb4A', 'Pcdhb11 Pcdhb11',
       'Pcdhb14 Pcdhb14', 'Pcdhb8 Pcdhb8', 'Pcdhgb8 Pcdhgb8',
       'Sema4A Timd2', 'Zp3 Zp3R'], dtype=object)

In [68]:
gene_pair

Unnamed: 0,LR Pair,PMID,lig_species,rec_species,ligand_orig,receptor_orig,Pair_species,Human evidence,LR Pair Card,Ligand ENSEMBL ID,...,Ligand Name,Ligand Aliases,Ligand Old symbol,Receptor Name,Receptor Aliases,Approved symbol,Receptor Old symbol,Ligand Symbols,Receptor Symbols,Mouse Ligand
0,A Atrn,11137996,Mus Musculus,Mus Musculus,A,Atrn,Mus Musculus,CONSERVATION,ASIP ATRN,ENSG00000101440,...,agouti signaling protein,ASP,AGTIL,attractin,"DPPT-L, MGCA",ATRN,,"<span title=""ASIP (AGTIL, ASP)"">ASIP (AGTIL, A...","<span title=""ATRN (DPPT-L, MGCA)"">ATRN (DPPT-L...",
1,A2M HSPA5,12194978,Homo sapiens,Homo sapiens,A2M,HSPA5,Homo sapiens,DIRECT,A2M HSPA5,ENSG00000175899,...,alpha-2-macroglobulin,"FWP007, S863-7, CPAMD5",,heat shock protein family A (Hsp70) member 5,BiP,HSPA5,GRP78,"<span title=""A2M (FWP007, S863-7, CPAMD5)"">A2M...","<span title=""HSPA5 (GRP78, BiP)"">HSPA5 (GRP78,...",
2,A2M HSPA5,32541810,Homo sapiens,Homo sapiens,A2M,HSPA5,Homo sapiens,DIRECT,A2M HSPA5,ENSG00000175899,...,alpha-2-macroglobulin,"FWP007, S863-7, CPAMD5",,heat shock protein family A (Hsp70) member 5,BiP,HSPA5,GRP78,"<span title=""A2M (FWP007, S863-7, CPAMD5)"">A2M...","<span title=""HSPA5 (GRP78, BiP)"">HSPA5 (GRP78,...",
3,A2M LRP1,1702392,Homo sapiens,Homo sapiens,A2M,LRP1,Homo sapiens,DIRECT,A2M LRP1,ENSG00000175899,...,alpha-2-macroglobulin,"FWP007, S863-7, CPAMD5",,LDL receptor related protein 1,"LRP, CD91, LRP1A, APOER, IGFBP3R1, IGFBP-3R",LRP1,"APR, A2MR","<span title=""A2M (FWP007, S863-7, CPAMD5)"">A2M...","<span title=""LRP1 (APR, A2MR, LRP, CD91, LRP1A...",
4,A2M LRP1,10652313,Homo sapiens,Homo sapiens,A2M,LRP1,Homo sapiens,DIRECT,A2M LRP1,ENSG00000175899,...,alpha-2-macroglobulin,"FWP007, S863-7, CPAMD5",,LDL receptor related protein 1,"LRP, CD91, LRP1A, APOER, IGFBP3R1, IGFBP-3R",LRP1,"APR, A2MR","<span title=""A2M (FWP007, S863-7, CPAMD5)"">A2M...","<span title=""LRP1 (APR, A2MR, LRP, CD91, LRP1A...",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6121,ZG16B TLR5,20802527,Homo sapiens,Homo sapiens,ZG16B,TLR5,Homo sapiens,DIRECT,ZG16B TLR5,ENSG00000283056,...,zymogen granule protein 16B,"HRPE773, PRO1567, JCLN2",,toll like receptor 5,"TIL3, FLJ10052, MGC126430, MGC126431",TLR5,SLEB1,"<span title=""ZG16B (HRPE773, PRO1567, JCLN2)"">...","<span title=""TLR5 (SLEB1, TIL3, FLJ10052, MGC1...",
6122,ZG16B TLR6,20802527,Homo sapiens,Homo sapiens,ZG16B,TLR6,Homo sapiens,DIRECT,ZG16B TLR6,ENSG00000283056,...,zymogen granule protein 16B,"HRPE773, PRO1567, JCLN2",,toll like receptor 6,CD286,TLR6,,"<span title=""ZG16B (HRPE773, PRO1567, JCLN2)"">...","<span title=""TLR6 (CD286)"">TLR6 (CD286)</span>",
6123,ZG16B Tlr6,20802527,Homo sapiens,Mus Musculus,ZG16B,Tlr6,mixed,CONSERVATION,ZG16B TLR6,ENSG00000283056,...,zymogen granule protein 16B,"HRPE773, PRO1567, JCLN2",,toll like receptor 6,CD286,TLR6,,"<span title=""ZG16B (HRPE773, PRO1567, JCLN2)"">...","<span title=""TLR6 (CD286)"">TLR6 (CD286)</span>",
6124,Zp3 Chrna7,22577141,Mus Musculus,Mus Musculus,Zp3,Chrna7,Mus Musculus,CONSERVATION,ZP3 CHRNA7,ENSG00000188372,...,zona pellucida glycoprotein 3,"ZP3-424, ZP3-372, ZPC","ZP3A, ZP3B",cholinergic receptor nicotinic alpha 7 subunit,,CHRNA7,,"<span title=""ZP3 (ZP3A, ZP3B, ZP3-424, ZP3-372...","<span title=""CHRNA7"">CHRNA7</span>",


In [9]:
def fetch_pubmed_data(pmid_list):
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    results = []

    # Load existing data if output file exists
    if os.path.exists(output_file):
        existing_data = pd.read_csv(output_file)
    else:
        existing_data = pd.DataFrame(columns=["PMID", "Title", "Abstract", "Journal", "Year"])

    # Split PMIDs into batches
    batch_size = 50
    pmid_batches = [pmid_list[i:i + batch_size] for i in range(0, len(pmid_list), batch_size)]

    # Iterate over the batches
    for batch in pmid_batches:
        params = {
            "db": "pubmed",
            "id": ",".join(batch),  # Join PMIDs as comma-separated
            "retmode": "xml",
            "api_key": ncbi_api_key
        }

        try:
            response = requests.get(base_url, params=params)
            response.raise_for_status()

            # Parse the XML response
            root = ET.fromstring(response.text)
            for article in root.findall(".//PubmedArticle"):
                # Extract Title and Abstract
                title = article.findtext(".//ArticleTitle", default="N/A")
                abstract = article.findtext(".//AbstractText", default="No abstract available")

                # Extract Journal Title
                journal_tag = article.find(".//Journal/Title")
                journal = journal_tag.text.strip() if journal_tag is not None and journal_tag.text else "N/A"

                # Extract Publication Year
                pub_date = article.find(".//PubDate")
                if pub_date is not None:
                    year_tag = pub_date.find("Year")
                    year = year_tag.text if year_tag is not None else "N/A"

                    # Fallback to MedlineDate if Year is missing
                    if year == "N/A":
                        medline_date_tag = pub_date.find("MedlineDate")
                        year = medline_date_tag.text.split()[0] if medline_date_tag is not None else "N/A"
                else:
                    year = "N/A"  # PubDate is completely missing
# 
                # Initialize species as N/A
                # species = "N/A"

                # # Check if the word "patient" is detected in title or abstract (assume human)
                # if "patient" in title.lower() or "patient" in abstract.lower():
                #     species = "Homo sapiens"
                # elif "human" in title.lower() or "human" in abstract.lower():
                #     species = "Homo sapiens"
                # else:
                #     # Look for HGNC gene symbols in title or abstract (assume human if found)
                #     for gene in hgnc_symbols:
                #         if gene in title or gene in abstract:
                #             species = "Homo sapiens"
                #             break
                #     else:
                #         # Look for MeSH terms related to species
                #         for mesh_heading in article.findall(".//MeshHeadingList/MeshHeading"):
                #             descriptor_name = mesh_heading.findtext("DescriptorName")
                #             if descriptor_name:
                #                 # Match official species names using the species_dict
                #                 for species_term, scientific_name in species_dict.items():
                #                     if species_term in descriptor_name.lower():
                #                         species = scientific_name
                #                         break  # Stop after finding the first match

                # Append the result
                results.append({
                    "PMID": article.findtext(".//MedlineCitation/PMID"),
                    "Title": title,
                    "Abstract": abstract,
                    "Journal": journal,
                    "Year": year,
                    #"Species": species,
                })

        except Exception as e:
            print(f"Error fetching batch {batch}: {e}")
            # Optionally save the response for debugging
            with open(f"error_batch_{batch[0]}_{batch[-1]}.xml", "w") as f:
                f.write(response.text)

        # Rate limiting to avoid API overload
        time.sleep(1)  # Increase delay for better API compliance

    # Save results
    new_data = pd.DataFrame(results)
    if not new_data.empty:
        # Merge existing and new data, updating missing values
        updated_data = pd.concat([existing_data, new_data])

        # Ensure all PMIDs are strings
        updated_data["PMID"] = updated_data["PMID"].astype(str)

        # Drop rows with missing PMIDs
        updated_data = updated_data.dropna(subset=["PMID"])

        # Ensure rows are ordered and remove duplicates
        updated_data = (
            updated_data.sort_values(by="PMID")  # Ensure rows are ordered
            .drop_duplicates(subset="PMID", keep="last")  # Keep the latest data
        )
        updated_data["Journal"] = updated_data["Journal"].str.split(" (", n=1, expand=False, regex=False).str[0]
        updated_data.to_csv(output_file, index=False)
    else:
        print("No new data fetched.")

    return results

# Fetch PubMed data with your list of PMIDs, output file path, and NCBI API key
fetch_pubmed_data(pmid_list)

# Filter and print PMIDs where the Title does not end with a period this are the ones that need to be manually edited as title is not complete.
df = pd.read_csv(output_file)
pmids_without_period = df[df['Title'].isna() | ~df['Title'].str.endswith(('.', '?')).fillna(False)]['PMID']
pmid_check= pmids_without_period.tolist()
print("These " + str(len(pmid_check)) + " titles that have to be manually checked -- possible incomplete titles")
print(pmid_check)

These 29 titles that have to be manually checked -- possible incomplete titles
[28280243.0, 28360196.0, 28646018.0, 28844702.0, 29549127.0, 29741477.0, 30262652.0, 30659054.0, 31300520.0, 31664130.0, 31987794.0, 32541810.0, 32681389.0, 32876567.0, 33497493.0, 33526813.0, 33790888.0, 33962943.0, 33975953.0, 35132089.0, 35605991.0, 35617401.0, 35711472.0, 36790376.0, 37540598.0, 38628398.0, 38758807.0, 38902261.0, nan]


  pmids_without_period = df[df['Title'].isna() | ~df['Title'].str.endswith(('.', '?')).fillna(False)]['PMID']


In [11]:
import numpy as np
source = np.array(gene_pair_human["PMID"].unique())

# Read the API key from a file
with open("data/ncbi_api_key.txt", "r") as file:
    ncbi_api_key = file.read().strip()

# File to save the results
output_file = "data/pubmed_results.csv"

# Load your list of PMIDs
pmid_list = source

In [None]:
def fetch_pubmed_data(pmid_list):
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    results = []

    # Load existing data if output file exists
    if os.path.exists(output_file):
        existing_data = pd.read_csv(output_file)
    else:
        existing_data = pd.DataFrame(columns=["PMID", "Title", "Abstract", "Journal", "Year"])

    # Split PMIDs into batches
    batch_size = 50
    pmid_batches = [pmid_list[i:i + batch_size] for i in range(0, len(pmid_list), batch_size)]

    # Iterate over the batches
    for batch in pmid_batches:
        params = {
            "db": "pubmed",
            "id": ",".join(batch),  # Join PMIDs as comma-separated
            "retmode": "xml",
            "api_key": ncbi_api_key
        }

        try:
            response = requests.get(base_url, params=params)
            response.raise_for_status()

            # Parse the XML response
            root = ET.fromstring(response.text)
            for article in root.findall(".//PubmedArticle"):
                # Extract Title and Abstract
                title = article.findtext(".//ArticleTitle", default="N/A")
                abstract = article.findtext(".//AbstractText", default="No abstract available")

                # Extract Journal Title
                journal_tag = article.find(".//Journal/Title")
                journal = journal_tag.text.strip() if journal_tag is not None and journal_tag.text else "N/A"

                # Extract Publication Year
                pub_date = article.find(".//PubDate")
                if pub_date is not None:
                    year_tag = pub_date.find("Year")
                    year = year_tag.text if year_tag is not None else "N/A"

                    # Fallback to MedlineDate if Year is missing
                    if year == "N/A":
                        medline_date_tag = pub_date.find("MedlineDate")
                        year = medline_date_tag.text.split()[0] if medline_date_tag is not None else "N/A"
                else:
                    year = "N/A"  # PubDate is completely missing
# 
                # Initialize species as N/A
                # species = "N/A"

                # # Check if the word "patient" is detected in title or abstract (assume human)
                # if "patient" in title.lower() or "patient" in abstract.lower():
                #     species = "Homo sapiens"
                # elif "human" in title.lower() or "human" in abstract.lower():
                #     species = "Homo sapiens"
                # else:
                #     # Look for HGNC gene symbols in title or abstract (assume human if found)
                #     for gene in hgnc_symbols:
                #         if gene in title or gene in abstract:
                #             species = "Homo sapiens"
                #             break
                #     else:
                #         # Look for MeSH terms related to species
                #         for mesh_heading in article.findall(".//MeshHeadingList/MeshHeading"):
                #             descriptor_name = mesh_heading.findtext("DescriptorName")
                #             if descriptor_name:
                #                 # Match official species names using the species_dict
                #                 for species_term, scientific_name in species_dict.items():
                #                     if species_term in descriptor_name.lower():
                #                         species = scientific_name
                #                         break  # Stop after finding the first match

                # Append the result
                results.append({
                    "PMID": article.findtext(".//MedlineCitation/PMID"),
                    "Title": title,
                    "Abstract": abstract,
                    "Journal": journal,
                    "Year": year,
                    #"Species": species,
                })

        except Exception as e:
            print(f"Error fetching batch {batch}: {e}")
            # Optionally save the response for debugging
            with open(f"error_batch_{batch[0]}_{batch[-1]}.xml", "w") as f:
                f.write(response.text)

        # Rate limiting to avoid API overload
        time.sleep(1)  # Increase delay for better API compliance

    # Save results
    new_data = pd.DataFrame(results)
    if not new_data.empty:
        # Merge existing and new data, updating missing values
        updated_data = pd.concat([existing_data, new_data])

        # Ensure all PMIDs are strings
        updated_data["PMID"] = updated_data["PMID"].astype(str)

        # Drop rows with missing PMIDs
        updated_data = updated_data.dropna(subset=["PMID"])

        # Ensure rows are ordered and remove duplicates
        updated_data = (
            updated_data.sort_values(by="PMID")  # Ensure rows are ordered
            .drop_duplicates(subset="PMID", keep="last")  # Keep the latest data
        )
        updated_data["Journal"] = updated_data["Journal"].str.split(" (", n=1, expand=False, regex=False).str[0]
        updated_data.to_csv(output_file, index=False)
    else:
        print("No new data fetched.")

    return results

# Fetch PubMed data with your list of PMIDs, output file path, and NCBI API key
fetch_pubmed_data(pmid_list)

In [14]:
df

Unnamed: 0,PMID,Title,Abstract,Journal,Year
0,1,Formate assay in body fluids: application in m...,No abstract available,Biochemical medicine,1975
1,10,Digitoxin metabolism by rat liver microsomes.,No abstract available,Biochemical pharmacology,1975
2,10024503,Reversal of hyperlipidaemia in apolipoprotein ...,We have shown previously that human apolipopro...,The Biochemical journal,1999
3,10025398,The integrin alpha v beta 6 binds and activate...,Transforming growth factor beta (TGF beta) fam...,Cell,1999
4,10037686,Identification of a novel activation-inducible...,Among members of the tumor necrosis factor rec...,The Journal of biological chemistry,1999
...,...,...,...,...,...
3393,9972281,Insulin-like growth factor-I receptor signal t...,The insulin-like growth factor-I receptor (IGF...,Comparative biochemistry and physiology. Part ...,1998
3394,9973222,Activation of neurotrophin-3 receptor TrkC ind...,Elevated expression of the neurotrophin-3 (NT-...,Cancer research,1999
3395,9988678,Decorin is a biological ligand for the epiderm...,Ectopic expression of decorin induces profound...,The Journal of biological chemistry,1999
3396,9988761,Identification of residues within the 727-767 ...,Mapping approaches employing blocking antibodi...,The Journal of biological chemistry,1999


In [18]:
# Filter and print PMIDs where the Title does not end with a period this are the ones that need to be manually edited as title is not complete.
df = pd.read_csv(output_file)
pmids_without_period = df[df['Title'].isna() | ~df['Title'].str.endswith(('.', '?')).fillna(False)]['PMID']
pmid_check= pmids_without_period.tolist()
print("These " + str(len(pmid_check)) + " titles that have to be manually checked -- possible incomplete titles")
print(pmid_check)

# Fill NaN values in 'Abstract' column with an empty string
df['Abstract'] = df['Abstract'].fillna('')

pmids_without_period = df[~df['Abstract'].str.endswith(('.', '?', 'available', ')', 'Review', '...'))]['PMID'] # Corrected the condition to find abstracts that *do not* end with a period or question mark
pmid_check = pmids_without_period.tolist()

print("These " + str(len(pmid_check)) + " PMIDs have abstracts that might be incomplete (do not end with a period or question mark or other indicators):")
print(pmid_check)

These 23 titles that have to be manually checked -- possible incomplete titles
[28280243, 28360196, 28646018, 29549127, 29741477, 30262652, 30659054, 31300520, 31664130, 31987794, 32541810, 32876567, 33497493, 33526813, 33790888, 33962943, 35132089, 35605991, 35617401, 36790376, 37540598, 38628398, 38902261]
These 125 PMIDs have abstracts that might be incomplete (do not end with a period or question mark or other indicators):
[24701371, 27378688, 27875312, 27982078, 28280243, 28302677, 28360196, 28364041, 28394331, 28408722, 28465413, 28546512, 28646018, 28696225, 28698550, 28733458, 28783682, 28785723, 28794434, 28846098, 28851741, 28893801, 28939773, 28943410, 28973891, 29059156, 29100055, 29178324, 29180449, 29296932, 29348142, 29467366, 29545933, 29549127, 29717114, 29721382, 29741477, 29742426, 29769720, 29777742, 29853539, 29898920, 29904386, 29925589, 29930766, 30041429, 30139742, 30139743, 30262652, 30632962, 30659054, 30712922, 30713770, 30733680, 30819903, 30854241, 30956130

  pmids_without_period = df[df['Title'].isna() | ~df['Title'].str.endswith(('.', '?')).fillna(False)]['PMID']


In [27]:
prev_file = "data/pubmed_results_20250614.csv"
df = pd.read_csv(prev_file)

In [23]:
df =df[df["PMID"].isin([28280243, 28360196, 28646018, 29549127, 29741477, 30262652, 30659054, 31300520, 31664130, 31987794, 32541810, 32876567, 33497493, 33526813, 33790888, 33962943, 35132089, 35605991, 35617401, 36790376, 37540598, 38628398, 38902261])]

In [28]:
pmid_check = [24701371, 27378688, 27875312, 27982078, 28280243, 28302677, 28360196, 28364041, 28394331, 28408722, 28465413, 28546512, 28646018, 28696225, 28698550, 28733458, 28783682, 28785723, 28794434, 28846098, 28851741, 28893801, 28939773, 28943410, 28973891, 29059156, 29100055, 29178324, 29180449, 29296932, 29348142, 29467366, 29545933, 29549127, 29717114, 29721382, 29741477, 29742426, 29769720, 29777742, 29853539, 29898920, 29904386, 29925589, 29930766, 30041429, 30139742, 30139743, 30262652, 30632962, 30659054, 30712922, 30713770, 30733680, 30819903, 30854241, 30956130, 31098409, 31175175, 31216173, 31300520, 31367043, 31645726, 31664130, 31857654, 31987794, 32144270, 32149455, 32506691, 32541810, 32694578, 32702237, 32820046, 32876567, 32907880, 32948210, 33306155, 33497493, 33503438, 33526813, 33790888, 33852831, 33875597, 33962943, 34108253, 34163464, 34407556, 34508778, 34518695, 34678058, 34910520, 35132089, 35141051, 35165283, 35253643, 35295855, 35414038, 35596683, 35617401, 35739238, 35802072, 36044575, 36205393, 36790376, 37036990, 37097004, 37168680, 37333140, 37356715, 37435859, 37523551, 37540598, 37554323, 37633268, 38147550, 38203798, 38426726, 38628398, 38781210, 38897996, 38902261, 39128984, 39229119, 39261724, 6172602]
df =df[df["PMID"].isin(pmid_check)]
df.to_csv("data/missing_PMID_abstract_info.csv")

In [15]:
gene_pair_human["LR_pair_orig"].unique()

array(['A Atrn', 'A2M HSPA5', 'A2M LRP1', ..., 'ZG16B Tlr6', 'Zp3 Chrna7',
       'Zp3 Zp3R'], dtype=object)

In [55]:
# 1. Define URL and dated filename
import csv
url = "https://download.rgd.mcw.edu/data_release/GENES_RAT.txt"
out_file = f"data/GENES_RAT_{today}_RGD_DB.tsv"

# 2. Fetch raw content as text
response = requests.get(url)
response.raise_for_status()
content = response.text

# 3. Split into lines, remove first 102 header lines
lines = content.splitlines()[102:]

# 4. Parse into rows
rows = [line.split("\t") for line in lines]
header = rows[1]
data_rows = rows[2:]


# 5. Write as proper TSV
with open(out_file, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f, delimiter="\t")
    writer.writerow(header)
    writer.writerows(data_rows)

print(f"✅ Written {len(rows)} rows to {out_file}")

✅ Written 61464 rows to data/GENES_RAT_20250723_RGD_DB.tsv


In [62]:
import requests
import pandas as pd
from io import StringIO
from datetime import datetime

# Define file URLs and your exact header specifications
files = {
    "aliases": {
        "url": "https://zfin.org/downloads/aliases.txt",
        "headers": ["Current ZFIN ID", "Current Name", "Current Symbol",
                    "Previous Name", "SO ID"],
        "key": "Current ZFIN ID"
    },
    "gene": {
        "url": "https://zfin.org/downloads/gene.txt",
        "headers": ["ZFIN ID", "SO ID", "Symbol", "NCBI Gene ID"],
        "key": "ZFIN ID"
    },
    "orthos": {
        "url": "https://zfin.org/downloads/human_orthos.txt",
        "headers": ["ZFIN ID", "ZFIN Symbol", "ZFIN Name", "Human Symbol",
                    "Human Name", "OMIM ID", "Gene ID", "HGNC ID",
                    "Evidence", "Pub ID", "ZFIN Abbreviation Name",
                    "ECO ID", "ECO Term Name"],
        "key": "ZFIN ID"
    }
}

# 1. Download and load each file with your defined headers
dfs = {}
for name, params in files.items():
    r = requests.get(params["url"])
    r.raise_for_status()
    df = pd.read_csv(
        StringIO(r.text),
        sep="\t",
        comment="#",
        header=None,
        names=params["headers"],
        dtype=str
    )
    # Standardize the key column name
    df = df.rename(columns={params["key"]: "ZFIN_ID"})
    dfs[name] = df

# 2. Merge on ZFIN_ID using outer join
merged = dfs["aliases"]
for name in ["gene", "orthos"]:
    merged = merged.merge(dfs[name], on="ZFIN_ID", how="outer", suffixes=("", f"_{name}"))

# 3. Drop duplicated columns (identical names and contents)
merged = merged.loc[:, ~merged.columns.duplicated()]
# Also drop perfectly identical-content duplicates
columns = merged.columns
to_drop = [
    col2 for i, col1 in enumerate(columns)
    for col2 in columns[i+1:]
    if merged[col1].equals(merged[col2])
]
merged = merged.drop(columns=to_drop)
merged = merged.drop(columns=['SO ID_gene', 'Symbol', 'Evidence','Pub ID', 'ZFIN Symbol', 'ZFIN Name'])
merged = merged.drop_duplicates()

# 4. Save with today's date
today = datetime.now().strftime("%Y%m%d")
output = f"data/Zebrafish_merged_{today}_ZFIN_DB.tsv"
merged.to_csv(output, sep="\t", index=False, encoding="utf-8")

print(f"✅ Merged file saved: {output} — shape: {merged.shape}")


✅ Merged file saved: data/Zebrafish_merged_20250723_ZFIN_DB.tsv — shape: (240290, 14)


In [24]:
import fetchGSheet

In [73]:
import pandas as pd
import requests
from io import StringIO
from itertools import product

### IMPORTANT ####
# Warning: First run "src/convertOrthUniprotToEnsembl.r" in R
##################

# === USER INPUT ===
data_dir = "data"
# original species
orig_species_input = "frog"
# Ortholog species
species_input = "human"  # Options: "mouse", "zebrafish", # No sheep in inParanoid

# species_to_process = ["mouse", "rat", "human", "zebrafish", "chimpanzee", "chicken", "pig", "cow", "dog", "horse", "marmoset",   "macaque"]

# === INTERNAL MAPPINGS ===

orig_species_info = {
    "mouse":         {"taxid": "10090", "code": "mmusculus"},
    "rat":           {"taxid": "10116", "code": "rnorvegicus"},
    "zebrafish":     {"taxid": "7955",  "code": "drerio"},
    "chimpanzee":    {"taxid": "9598",  "code": "ptroglodytes"},
    "chicken":       {"taxid": "9031",  "code": "ggallus"},
    "pig":           {"taxid": "9823",  "code": "sscrofa"},
    "cow":           {"taxid": "9913",  "code": "btaurus"},
    "dog":           {"taxid": "9615",  "code": "clfamiliaris"},
    "horse":         {"taxid": "9796",  "code": "ecaballus"},
    #"sheep":         {"taxid": "9940",  "code": "oarambouillet"},
    "marmoset":      {"taxid": "9483",  "code": "cjacchus"},
    "macaque":       {"taxid": "9544",  "code": "mmulatta"},
    "frog":          {"taxid": "8364",  "code": "xtropicalis"},
    "rabbit":        {"taxid": "9986",  "code": "ocuniculus"},
    "guineapig":     {"taxid": "10141",  "code": "cporcellus"},
    "pufferfish":    {"taxid": "99883",  "code": "tnigroviridis"},
    "human":         {"taxid": "9606",  "code": "hsapiens"},
}


species_info = {
    "mouse":         {"taxid": "10090", "code": "mmusculus"},
    "rat":           {"taxid": "10116", "code": "rnorvegicus"},
    "zebrafish":     {"taxid": "7955",  "code": "drerio"},
    "chimpanzee":    {"taxid": "9598",  "code": "ptroglodytes"},
    "chicken":       {"taxid": "9031",  "code": "ggallus"},
    "pig":           {"taxid": "9823",  "code": "sscrofa"},
    "cow":           {"taxid": "9913",  "code": "btaurus"},
    "dog":           {"taxid": "9615",  "code": "clfamiliaris"},
    "horse":         {"taxid": "9796",  "code": "ecaballus"},
    #"sheep":         {"taxid": "9940",  "code": "oarambouillet"},
    "marmoset":      {"taxid": "9483",  "code": "cjacchus"},
    "macaque":       {"taxid": "9544",  "code": "mmulatta"},
    "frog":          {"taxid": "8364",  "code": "xtropicalis"},
    "rabbit":        {"taxid": "9986",  "code": "ocuniculus"},
    "guineapig":     {"taxid": "10141",  "code": "cporcellus"},
    "pufferfish":    {"taxid": "99883",  "code": "tnigroviridis"},
    "human":         {"taxid": "9606",  "code": "hsapiens"},
}

if orig_species_input not in orig_species_info:
    raise ValueError(f"Species '{orig_species_input}' not supported. Choose from: {list(orig_species_info)}")

if species_input not in species_info:
    raise ValueError(f"Species '{species_input}' not supported. Choose from: {list(species_info)}")

species = species_info[species_input]
taxid = species["taxid"]
code = species["code"]

orig_species = orig_species_info[orig_species_input]
orig_taxid = orig_species["taxid"]
orig_code = orig_species["code"]

# === Step 1: Download inParanoid file for human vs species ===
url = f"https://inparanoidb.sbc.su.se/download/sqltable/{orig_taxid}&{taxid}&prot"
r = requests.get(url)
r.raise_for_status()

df = pd.read_csv(StringIO(r.text.strip()), sep="\t", header=None)
df.columns = ["cluster_id", "bitscore", "source_file", "inparalog_score", "protein_id", "seed_score"]

# === Step 2: Add species labels ===
def infer_species(src):
    if orig_taxid in src:
        return orig_species_input
    elif taxid in src:
        return species_input
    return "unknown"

df["species"] = df["source_file"].apply(infer_species)

# === Step 3: Build ortholog pairs ===
records = []
for cid, grp in df.groupby("cluster_id"):
    orig_spec = grp[grp["species"] == orig_species_input]
    others = grp[grp["species"] == species_input]
    
    for h, o in product(orig_spec.itertuples(index=False), others.itertuples(index=False)):
        records.append({
            "cluster_id": cid,
            f"{orig_species_input}_protein": h.protein_id,
            f"{orig_species_input}_inparalog_score": h.inparalog_score,
            f"{orig_species_input}_seed_score": h.seed_score,
            f"{species_input}_protein": o.protein_id,
            f"{species_input}_inparalog_score": o.inparalog_score,
            f"{species_input}_seed_score": o.seed_score,
            "bitscore": (h.bitscore + o.bitscore) / 2
        })


df_orthologs = pd.DataFrame(records)
df_orthologs.to_csv(f"data/{orig_species_input}_centric_inParanoid_{species_input}.csv", index=False)

# === Step 4: UniProt → Gene Name ===
print("Starting annotation process...")

# --- Load Human UniProt Mapping once ---
orig_uniprot_file = os.path.join(data_dir, f"uniprotMapping_{orig_species_input}.csv")
if not os.path.exists(orig_uniprot_file):
    print(f"Error: {orig_species_input} UniProt mapping file not found at {orig_uniprot_file}. Please run the UniProt fetching script for human first.")
    exit() # Exit if the essential human file is missing

print(f"Loading {orig_species_input} UniProt mapping from {orig_uniprot_file}...")
# We only need 'Entry' (Accession) and 'Gene Names'
orig_uniprot_df = pd.read_csv(orig_uniprot_file, usecols=["Entry", "Gene Names"])
# Rename 'Gene Names' to distinguish it as human gene name
orig_uniprot_df = orig_uniprot_df.rename(columns={"Gene Names": f"{orig_species_input}_Gene_Name"})
print(f"Loaded {len(orig_uniprot_df)} {orig_species_input} UniProt entries.")

# --- Process each species ---
print(f"\n--- Processing {species_input} ---")

inparanoid_file = os.path.join(data_dir, f"{orig_species_input}_centric_inParanoid_{species_input}.csv")

species_uniprot_file = os.path.join(data_dir, f"uniprotMapping_{species_input}.csv")
output_file = os.path.join(data_dir, f"{orig_species_input}_centric_inParanoid_{species_input}_AnnWithUniProt.csv")

# Check if input files exist
if not os.path.exists(inparanoid_file):
    print(f"Skipping {species_input}: InParanoid file not found at {inparanoid_file}")
    if not os.path.exists(species_uniprot_file):
        print(f"Skipping {species_input}: UniProt mapping file not found at {species_uniprot_file}")

print(f"Loading InParanoid data from {inparanoid_file}...")
df_inparanoid = pd.read_csv(inparanoid_file)
print(f"Loaded {len(df_inparanoid)} InParanoid entries for {species_input}.")

print(f"Loading {species_input} UniProt mapping from {species_uniprot_file}...")
species_uniprot_df = pd.read_csv(species_uniprot_file, usecols=["Entry", "Gene Names"])
# Rename 'Gene Names' to distinguish it as the species' gene name
species_uniprot_df = species_uniprot_df.rename(columns={"Gene Names": f"{species_input}_Gene_Name"})
print(f"Loaded {len(species_uniprot_df)} {species_input} UniProt entries.")

# 1. Annotate based on "orig_species_protein" with orig UniProt data
print(f"Merging {orig_species_input} gene names...")
# Use left merge to keep all rows from df_inparanoid
df_merged = pd.merge(
    df_inparanoid,
    orig_uniprot_df,
    left_on=f"{orig_species_input}_protein",
    right_on="Entry",
    how="left"
)
    # Drop the redundant 'Entry' column from the merge
df_merged = df_merged.drop(columns=["Entry"])

# 2. Annotate based on "{species_input}_protein" with species UniProt data
print(f"Merging {species_input} gene names...")
df_merged = pd.merge(
    df_merged, # Merge into the already merged dataframe
    species_uniprot_df,
    left_on=f"{species_input}_protein",
    right_on="Entry",
    how="left"
)
# Drop the redundant 'Entry' column from the second merge
df_merged = df_merged.drop(columns=["Entry"])

# 3. Save merged data
print(f"Saving merged data to {output_file}...")
df_merged.to_csv(output_file, index=False)
print(f"Successfully saved {len(df_merged)} annotated entries for {species_input}.")

print("\nAnnotation process completed for all specified species.")

Starting annotation process...
Loading frog UniProt mapping from data/uniprotMapping_frog.csv...
Loaded 2901 frog UniProt entries.

--- Processing human ---
Loading InParanoid data from data/frog_centric_inParanoid_human.csv...
Loaded 19461 InParanoid entries for human.
Loading human UniProt mapping from data/uniprotMapping_human.csv...
Loaded 45583 human UniProt entries.
Merging frog gene names...
Merging human gene names...
Saving merged data to data/frog_centric_inParanoid_human_AnnWithUniProt.csv...
Successfully saved 45745 annotated entries for human.

Annotation process completed for all specified species.


In [61]:
df_merged

Unnamed: 0,cluster_id,mouse_protein,mouse_inparalog_score,mouse_seed_score,cow_protein,cow_inparalog_score,cow_seed_score,bitscore,mouse_Gene_Name,cow_Gene_Name
0,1,Q9QXZ0,1.000,1.0,F1N6H4,1.0,1.0,10545.0,Macf1,
1,1,Q9QXZ0,1.000,1.0,F1N6H4,1.0,1.0,10545.0,Acf7,
2,1,Q9QXZ0,1.000,1.0,F1N6H4,1.0,1.0,10545.0,Aclp7,
3,1,Q9QXZ0,1.000,1.0,F1N6H4,1.0,1.0,10545.0,Kiaa0754,
4,1,Q9QXZ0,1.000,1.0,F1N6H4,1.0,1.0,10545.0,Macf,
...,...,...,...,...,...,...,...,...,...,...
44484,16645,Q925H3,1.000,1.0,A0A3Q1NHC3,1.0,1.0,42.0,Krtap16-8,
44485,16645,Q925H3,1.000,1.0,A0A3Q1NHC3,1.0,1.0,42.0,Krtap16.8,
44486,16645,O09048,0.667,,A0A3Q1NHC3,1.0,1.0,42.0,,
44487,16645,O08631,0.241,,A0A3Q1NHC3,1.0,1.0,42.0,,


In [12]:
hgnc_df = pd.read_csv("data/HGNC_gene_info_full.tsv", sep="\t", dtype=str)
hgnc_df = hgnc_df.dropna(subset=["uniprot_ids", "ensembl_gene_id"])
hgnc_exploded = hgnc_df.assign(uniprot_id=hgnc_df["uniprot_ids"].str.split(",")).explode("uniprot_id")
hgnc_exploded["uniprot_id"] = hgnc_exploded["uniprot_id"].str.strip()

df_merged = df_orthologs.merge(
    hgnc_exploded[["uniprot_id", "ensembl_gene_id", "symbol"]],
    left_on="human_protein",
    right_on="uniprot_id",
    how="left"
)

df_merged = df_merged.rename(columns={
    "symbol": "human_gene",
    "ensembl_gene_id": "human_ensembl_gene_id"
}).drop(columns=["uniprot_id"])

df_merged = df_merged.dropna(subset=["human_ensembl_gene_id"])
df_merged.to_csv(f"data/{species_input}_inParanoid_withHGNC.tsv", sep="\t", index=False)

# === Step 5: Optional - Species UniProt → Ensembl mapping ===
map_path = f"data/{species_input}_uniprot_to_ensembl.tsv"
try:
    species_map = pd.read_csv(map_path, sep="\t", dtype=str)
    df_merged = df_merged.merge(
        species_map,
        left_on=f"{species_input}_protein",
        right_on="uniprotswissprot",
        how="left"
    ).rename(columns={"ensembl_gene_id": f"{species_input}_ensembl_gene_id"}) \
     .drop(columns=["uniprotswissprot"])
except FileNotFoundError:
    print(f"⚠️  Mapping file not found: {map_path}")

df_merged.to_csv(f"data/df_merged_with_{species_input}_ensembl.tsv", sep="\t", index=False)


In [23]:
import pandas as pd
import requests
from io import StringIO
from itertools import product

### IMPORTANT ####
# Warning: First run "src/convertOrthUniprotToEnsembl.r" in R
##################

# === USER INPUT ===
# original species
orig_species_input = "mouse"
# Ortholog species
species_input = "human"  # Options: "mouse", "zebrafish", # No sheep in inParanoid

# === INTERNAL MAPPINGS ===

orig_species_info = {
    "mouse":         {"taxid": "10090", "code": "mmusculus"},
    "rat":           {"taxid": "10116", "code": "rnorvegicus"},
    "zebrafish":     {"taxid": "7955",  "code": "drerio"},
    "chimpanzee":    {"taxid": "9598",  "code": "ptroglodytes"},
    "chicken":       {"taxid": "9031",  "code": "ggallus"},
    "pig":           {"taxid": "9823",  "code": "sscrofa"},
    "cow":           {"taxid": "9913",  "code": "btaurus"},
    "dog":           {"taxid": "9615",  "code": "clfamiliaris"},
    "horse":         {"taxid": "9796",  "code": "ecaballus"},
    "sheep":         {"taxid": "9940",  "code": "oarambouillet"},
    "marmoset":      {"taxid": "9483",  "code": "cjacchus"},
    "macaque":       {"taxid": "9544",  "code": "mmulatta"},
    "human":         {"taxid": "9606",  "code": "hsapiens"},
}


species_info = {
    "mouse":         {"taxid": "10090", "code": "mmusculus"},
    "rat":           {"taxid": "10116", "code": "rnorvegicus"},
    "zebrafish":     {"taxid": "7955",  "code": "drerio"},
    "chimpanzee":    {"taxid": "9598",  "code": "ptroglodytes"},
    "chicken":       {"taxid": "9031",  "code": "ggallus"},
    "pig":           {"taxid": "9823",  "code": "sscrofa"},
    "cow":           {"taxid": "9913",  "code": "btaurus"},
    "dog":           {"taxid": "9615",  "code": "clfamiliaris"},
    "horse":         {"taxid": "9796",  "code": "ecaballus"},
    "sheep":         {"taxid": "9940",  "code": "oarambouillet"},
    "marmoset":      {"taxid": "9483",  "code": "cjacchus"},
    "macaque":       {"taxid": "9544",  "code": "mmulatta"},
    "human":         {"taxid": "9606",  "code": "hsapiens"},
}

if orig_species_input not in orig_species_info:
    raise ValueError(f"Species '{orig_species_input}' not supported. Choose from: {list(orig_species_info)}")

if species_input not in species_info:
    raise ValueError(f"Species '{species_input}' not supported. Choose from: {list(species_info)}")

species = species_info[species_input]
taxid = species["taxid"]
code = species["code"]

orig_species = orig_species_info[orig_species_input]
orig_taxid = orig_species["taxid"]
orig_code = orig_species["code"]

# === Step 1: Download inParanoid file for human vs species ===
url = f"https://inparanoidb.sbc.su.se/download/sqltable/{orig_taxid}&{taxid}&prot"
r = requests.get(url)
r.raise_for_status()

df = pd.read_csv(StringIO(r.text.strip()), sep="\t", header=None)
df.columns = ["cluster_id", "bitscore", "source_file", "inparalog_score", "protein_id", "seed_score"]

# === Step 2: Add species labels ===
def infer_species(src):
    if orig_taxid in src:
        return orig_species_input
    elif taxid in src:
        return species_input
    return "unknown"


df["species"] = df["source_file"].apply(infer_species)

In [24]:
df

Unnamed: 0,cluster_id,bitscore,source_file,inparalog_score,protein_id,seed_score,species
0,1,60090,10090.fa,1.0,A2ASS6,1.0,mouse
1,1,60090,9606.fa,1.0,Q8WZ42,1.0,human
2,2,14503,10090.fa,1.0,Q6ZWR6,1.0,mouse
3,2,14503,9606.fa,1.0,Q8NF91,1.0,human
4,3,12156,10090.fa,1.0,A2AAJ9,1.0,mouse
...,...,...,...,...,...,...,...
35953,17096,41,9606.fa,1.0,P0DP42,1.0,human
35954,17097,41,10090.fa,1.0,A0A0G2JDY8,1.0,mouse
35955,17097,41,9606.fa,1.0,A0A075B6W3,1.0,human
35956,17098,40,10090.fa,1.0,Q64389,1.0,mouse


In [26]:
# === Step 3: Build ortholog pairs ===
# Assume 'species' column exists, and 'orig_species_input' is defined
records = []
for cid, grp in df.groupby("cluster_id"):
    orig_spec = grp[grp["species"] == orig_species_input]
    others = grp[grp["species"] == species_input]
    
    for h, o in product(orig_spec.itertuples(index=False), others.itertuples(index=False)):
        records.append({
            "cluster_id": cid,
            f"{orig_species_input}_protein": h.protein_id,
            f"{orig_species_input}_inparalog_score": h.inparalog_score,
            f"{orig_species_input}_seed_score": h.seed_score,
            f"{species_input}_protein": o.protein_id,
            f"{species_input}_inparalog_score": o.inparalog_score,
            f"{species_input}_seed_score": o.seed_score,
            "bitscore": (h.bitscore + o.bitscore) / 2
        })
df_orthologs = pd.DataFrame(records)
df_orthologs.to_csv(f"data/{orig_species_input}_centric_inParanoid_{species_input}.csv", index=False)

In [30]:
# === Step 4: UniProt → Gene Name ===
print("Starting annotation process...")

# --- Load Human UniProt Mapping once ---
orig_uniprot_file = os.path.join(data_dir, f"uniprotMapping_{orig_species_input}.csv")
if not os.path.exists(orig_uniprot_file):
    print(f"Error: {orig_species_input} UniProt mapping file not found at {orig_uniprot_file}. Please run the UniProt fetching script for human first.")
    exit() # Exit if the essential human file is missing

print(f"Loading {orig_species_input} UniProt mapping from {orig_uniprot_file}...")
# We only need 'Entry' (Accession) and 'Gene Names'
orig_uniprot_df = pd.read_csv(orig_uniprot_file, usecols=["Entry", "Gene Names"])
# Rename 'Gene Names' to distinguish it as human gene name
orig_uniprot_df = orig_uniprot_df.rename(columns={"Gene Names": f"{orig_species_input}_Gene_Name"})
print(f"Loaded {len(orig_uniprot_df)} {orig_species_input} UniProt entries.")

# --- Process each species ---
for species_input in species_to_process:
    print(f"\n--- Processing {species_input} ---")

    inparanoid_file = os.path.join(data_dir, f"inParanoid_{species_input}.csv")
    species_uniprot_file = os.path.join(data_dir, f"uniprotMapping_{species_input}.csv")
    output_file = os.path.join(data_dir, f"{orig_species_input}_centric_inParanoid_{species_input}_AnnWithUniProt.csv")

    # Check if input files exist
    if not os.path.exists(inparanoid_file):
        print(f"Skipping {species_input}: InParanoid file not found at {inparanoid_file}")
        continue
    if not os.path.exists(species_uniprot_file):
        print(f"Skipping {species_input}: UniProt mapping file not found at {species_uniprot_file}")
        continue

    print(f"Loading InParanoid data from {inparanoid_file}...")
    df_inparanoid = pd.read_csv(inparanoid_file)
    print(f"Loaded {len(df_inparanoid)} InParanoid entries for {species_input}.")

    print(f"Loading {species_input} UniProt mapping from {species_uniprot_file}...")
    species_uniprot_df = pd.read_csv(species_uniprot_file, usecols=["Entry", "Gene Names"])
    # Rename 'Gene Names' to distinguish it as the species' gene name
    species_uniprot_df = species_uniprot_df.rename(columns={"Gene Names": f"{species_input}_Gene_Name"})
    print(f"Loaded {len(species_uniprot_df)} {species_input} UniProt entries.")

    # 1. Annotate based on "orig_species_protein" with orig UniProt data
    print(f"Merging {orig_species_input} gene names...")
    # Use left merge to keep all rows from df_inparanoid
    df_merged = pd.merge(
        df_inparanoid,
        orig_uniprot_df,
        left_on=f"{orig_species_input}_protein",
        right_on="Entry",
        how="left"
    )
    # Drop the redundant 'Entry' column from the merge
    df_merged = df_merged.drop(columns=["Entry"])

    # 2. Annotate based on "{species_input}_protein" with species UniProt data
    print(f"Merging {species_input} gene names...")
    df_merged = pd.merge(
        df_merged, # Merge into the already merged dataframe
        species_uniprot_df,
        left_on=f"{orig_species_input}_protein",
        right_on="Entry",
        how="left"
    )
    # Drop the redundant 'Entry' column from the second merge
    df_merged = df_merged.drop(columns=["Entry"])

    # 3. Save merged data
    print(f"Saving merged data to {output_file}...")
    df_merged.to_csv(output_file, index=False)
    print(f"Successfully saved {len(df_merged)} annotated entries for {species_input}.")

print("\nAnnotation process completed for all specified species.")


Starting annotation process...
Loading mouse UniProt mapping from data/uniprotMapping_mouse.csv...
Loaded 32289 mouse UniProt entries.

--- Processing mouse ---
Loading InParanoid data from data/inParanoid_mouse.csv...
Loaded 20805 InParanoid entries for mouse.
Loading mouse UniProt mapping from data/uniprotMapping_mouse.csv...
Loaded 32289 mouse UniProt entries.
Merging mouse gene names...
Merging mouse gene names...
Saving merged data to data/mouse_centric_inParanoid_mouse_AnnWithUniProt.csv...
Successfully saved 81780 annotated entries for mouse.

Annotation process completed for all specified species.


In [37]:
# === Step 4: UniProt → Gene Name ===
print("Starting annotation process...")

# --- Load Human UniProt Mapping once ---
orig_uniprot_file = os.path.join(data_dir, f"uniprotMapping_{orig_species_input}.csv")
if not os.path.exists(orig_uniprot_file):
    print(f"Error: {orig_species_input} UniProt mapping file not found at {orig_uniprot_file}. Please run the UniProt fetching script for human first.")
    exit() # Exit if the essential human file is missing

print(f"Loading {orig_species_input} UniProt mapping from {orig_uniprot_file}...")
# We only need 'Entry' (Accession) and 'Gene Names'
orig_uniprot_df = pd.read_csv(orig_uniprot_file, usecols=["Entry", "Gene Names"])
# Rename 'Gene Names' to distinguish it as human gene name
orig_uniprot_df = orig_uniprot_df.rename(columns={"Gene Names": f"{orig_species_input}_Gene_Name"})
print(f"Loaded {len(orig_uniprot_df)} {orig_species_input} UniProt entries.")

Starting annotation process...
Loading mouse UniProt mapping from data/uniprotMapping_mouse.csv...
Loaded 32289 mouse UniProt entries.


In [41]:
species_to_process = ["mouse", "rat", "human", "zebrafish", "chimpanzee", "chicken", "pig", "cow", "dog", "horse", "marmoset",   "macaque"]

In [42]:
species_input

'human'

In [45]:
import pandas as pd
import requests
from io import StringIO
from itertools import product

### IMPORTANT ####
# Warning: First run "src/convertOrthUniprotToEnsembl.r" in R
##################

# === USER INPUT ===
# original species
orig_species_input = "mouse"
# Ortholog species
species_input = "human"  # Options: "mouse", "zebrafish", # No sheep in inParanoid

species_to_process = ["mouse", "rat", "human", "zebrafish", "chimpanzee", "chicken", "pig", "cow", "dog", "horse", "marmoset",   "macaque"]

# === INTERNAL MAPPINGS ===

orig_species_info = {
    "mouse":         {"taxid": "10090", "code": "mmusculus"},
    "rat":           {"taxid": "10116", "code": "rnorvegicus"},
    "zebrafish":     {"taxid": "7955",  "code": "drerio"},
    "chimpanzee":    {"taxid": "9598",  "code": "ptroglodytes"},
    "chicken":       {"taxid": "9031",  "code": "ggallus"},
    "pig":           {"taxid": "9823",  "code": "sscrofa"},
    "cow":           {"taxid": "9913",  "code": "btaurus"},
    "dog":           {"taxid": "9615",  "code": "clfamiliaris"},
    "horse":         {"taxid": "9796",  "code": "ecaballus"},
    "sheep":         {"taxid": "9940",  "code": "oarambouillet"},
    "marmoset":      {"taxid": "9483",  "code": "cjacchus"},
    "macaque":       {"taxid": "9544",  "code": "mmulatta"},
    "human":         {"taxid": "9606",  "code": "hsapiens"},
}


species_info = {
    "mouse":         {"taxid": "10090", "code": "mmusculus"},
    "rat":           {"taxid": "10116", "code": "rnorvegicus"},
    "zebrafish":     {"taxid": "7955",  "code": "drerio"},
    "chimpanzee":    {"taxid": "9598",  "code": "ptroglodytes"},
    "chicken":       {"taxid": "9031",  "code": "ggallus"},
    "pig":           {"taxid": "9823",  "code": "sscrofa"},
    "cow":           {"taxid": "9913",  "code": "btaurus"},
    "dog":           {"taxid": "9615",  "code": "clfamiliaris"},
    "horse":         {"taxid": "9796",  "code": "ecaballus"},
    "sheep":         {"taxid": "9940",  "code": "oarambouillet"},
    "marmoset":      {"taxid": "9483",  "code": "cjacchus"},
    "macaque":       {"taxid": "9544",  "code": "mmulatta"},
    "human":         {"taxid": "9606",  "code": "hsapiens"},
}

if orig_species_input not in orig_species_info:
    raise ValueError(f"Species '{orig_species_input}' not supported. Choose from: {list(orig_species_info)}")

if species_input not in species_info:
    raise ValueError(f"Species '{species_input}' not supported. Choose from: {list(species_info)}")

species = species_info[species_input]
taxid = species["taxid"]
code = species["code"]

orig_species = orig_species_info[orig_species_input]
orig_taxid = orig_species["taxid"]
orig_code = orig_species["code"]

# === Step 1: Download inParanoid file for human vs species ===
url = f"https://inparanoidb.sbc.su.se/download/sqltable/{orig_taxid}&{taxid}&prot"
r = requests.get(url)
r.raise_for_status()

df = pd.read_csv(StringIO(r.text.strip()), sep="\t", header=None)
df.columns = ["cluster_id", "bitscore", "source_file", "inparalog_score", "protein_id", "seed_score"]

# === Step 2: Add species labels ===
def infer_species(src):
    if orig_taxid in src:
        return orig_species_input
    elif taxid in src:
        return species_input
    return "unknown"

df["species"] = df["source_file"].apply(infer_species)

# === Step 3: Build ortholog pairs ===
records = []
for cid, grp in df.groupby("cluster_id"):
    orig_spec = grp[grp["species"] == orig_species_input]
    others = grp[grp["species"] == species_input]
    
    for h, o in product(orig_spec.itertuples(index=False), others.itertuples(index=False)):
        records.append({
            "cluster_id": cid,
            f"{orig_species_input}_protein": h.protein_id,
            f"{orig_species_input}_inparalog_score": h.inparalog_score,
            f"{orig_species_input}_seed_score": h.seed_score,
            f"{species_input}_protein": o.protein_id,
            f"{species_input}_inparalog_score": o.inparalog_score,
            f"{species_input}_seed_score": o.seed_score,
            "bitscore": (h.bitscore + o.bitscore) / 2
        })


df_orthologs = pd.DataFrame(records)
df_orthologs.to_csv(f"data/{orig_species_input}_centric_inParanoid_{species_input}.csv", index=False)

In [46]:
df_orthologs

Unnamed: 0,cluster_id,mouse_protein,mouse_inparalog_score,mouse_seed_score,human_protein,human_inparalog_score,human_seed_score,bitscore
0,1,A2ASS6,1.0,1.0,Q8WZ42,1.0,1.0,60090.0
1,2,Q6ZWR6,1.0,1.0,Q8NF91,1.0,1.0,14503.0
2,3,A2AAJ9,1.0,1.0,Q5VST9,1.0,1.0,12156.0
3,4,Q9QXZ0,1.0,1.0,Q9UPN3,1.0,1.0,10869.0
4,5,Q91ZU6,1.0,1.0,Q03001,1.0,1.0,10549.0
...,...,...,...,...,...,...,...,...
20800,17094,A0A0G2JGM9,1.0,1.0,A0A075B6W9,1.0,1.0,45.0
20801,17095,A0A0G2JFW2,1.0,1.0,A0A075B6W1,1.0,1.0,44.0
20802,17096,A0A494B9K2,1.0,1.0,P0DP42,1.0,1.0,41.0
20803,17097,A0A0G2JDY8,1.0,1.0,A0A075B6W3,1.0,1.0,41.0


In [47]:
# === Step 4: UniProt → Gene Name ===
print("Starting annotation process...")

# --- Load Human UniProt Mapping once ---
orig_uniprot_file = os.path.join(data_dir, f"uniprotMapping_{orig_species_input}.csv")
if not os.path.exists(orig_uniprot_file):
    print(f"Error: {orig_species_input} UniProt mapping file not found at {orig_uniprot_file}. Please run the UniProt fetching script for human first.")
    exit() # Exit if the essential human file is missing

print(f"Loading {orig_species_input} UniProt mapping from {orig_uniprot_file}...")
# We only need 'Entry' (Accession) and 'Gene Names'
orig_uniprot_df = pd.read_csv(orig_uniprot_file, usecols=["Entry", "Gene Names"])
# Rename 'Gene Names' to distinguish it as human gene name
orig_uniprot_df = orig_uniprot_df.rename(columns={"Gene Names": f"{orig_species_input}_Gene_Name"})
print(f"Loaded {len(orig_uniprot_df)} {orig_species_input} UniProt entries.")
orig_uniprot_df

Starting annotation process...
Loading mouse UniProt mapping from data/uniprotMapping_mouse.csv...
Loaded 32289 mouse UniProt entries.


Unnamed: 0,Entry,mouse_Gene_Name
0,A0A087WPF7,Auts2
1,A0A087WPF7,Kiaa0442
2,A0A088MLT8,Iqcj-Schip1
3,A0A088MLT8,Iqschfp
4,A0A088MLT8,Schip1
...,...,...
32284,Q9WUQ7,Dexi
32285,Q9WUQ7,Myle
32286,Q9WVB6,Lenep
32287,Q9WVB6,Lep503


In [53]:
# --- Process each species ---
print(f"\n--- Processing {species_input} ---")

inparanoid_file = os.path.join(data_dir, f"{orig_species_input}_centric_inParanoid_{species_input}.csv")
species_uniprot_file = os.path.join(data_dir, f"uniprotMapping_{species_input}.csv")
output_file = os.path.join(data_dir, f"{orig_species_input}_centric_inParanoid_{species_input}_AnnWithUniProt.csv")

# Check if input files exist
if not os.path.exists(inparanoid_file):
    print(f"Skipping {species_input}: InParanoid file not found at {inparanoid_file}")

if not os.path.exists(species_uniprot_file):
    print(f"Skipping {species_input}: UniProt mapping file not found at {species_uniprot_file}")

print(f"Loading InParanoid data from {inparanoid_file}...")
df_inparanoid = pd.read_csv(inparanoid_file)
print(f"Loaded {len(df_inparanoid)} InParanoid entries for {species_input}.")

print(f"Loading {species_input} UniProt mapping from {species_uniprot_file}...")
species_uniprot_df = pd.read_csv(species_uniprot_file, usecols=["Entry", "Gene Names"])
# Rename 'Gene Names' to distinguish it as the species' gene name
species_uniprot_df = species_uniprot_df.rename(columns={"Gene Names": f"{species_input}_Gene_Name"})
print(f"Loaded {len(species_uniprot_df)} {species_input} UniProt entries.")



--- Processing human ---
Loading InParanoid data from data/mouse_centric_inParanoid_human.csv...
Loaded 20805 InParanoid entries for human.
Loading human UniProt mapping from data/uniprotMapping_human.csv...
Loaded 45583 human UniProt entries.


In [54]:
# 1. Annotate based on "orig_species_protein" with orig UniProt data
print(f"Merging {orig_species_input} gene names...")
# Use left merge to keep all rows from df_inparanoid
df_merged = pd.merge(
    df_inparanoid,
    orig_uniprot_df,
    left_on=f"{orig_species_input}_protein",
    right_on="Entry",
    how="left"
)
    # Drop the redundant 'Entry' column from the merge
df_merged = df_merged.drop(columns=["Entry"])

# 2. Annotate based on "{species_input}_protein" with species UniProt data
print(f"Merging {species_input} gene names...")
df_merged = pd.merge(
    df_merged, # Merge into the already merged dataframe
    species_uniprot_df,
    left_on=f"{species_input}_protein",
    right_on="Entry",
    how="left"
)
# Drop the redundant 'Entry' column from the second merge
df_merged = df_merged.drop(columns=["Entry"])

# 3. Save merged data
print(f"Saving merged data to {output_file}...")
df_merged.to_csv(output_file, index=False)
print(f"Successfully saved {len(df_merged)} annotated entries for {species_input}.")

print("\nAnnotation process completed for all specified species.")


Merging mouse gene names...
Merging human gene names...
Saving merged data to data/mouse_centric_inParanoid_human_AnnWithUniProt.csv...
Successfully saved 92169 annotated entries for human.

Annotation process completed for all specified species.


In [7]:
triplicate= gene_pair[['LR_pair', 'triplet', 'ligand_species_ann', 'receptor_species_ann', 'used Human L', 'used Human R', 'notes', 'Remove decision']]

In [8]:
triplicate

Unnamed: 0,LR_pair,triplet,ligand_species_ann,receptor_species_ann,used Human L,used Human R,notes,Remove decision
0,LPL LRP1,1281473 LPL LRP1,Bos taurus,Homo sapiens,,,1992,
1,FGF1 ITGAV,18441324 FGF1 ITGAV,Homo sapiens,Homo sapiens,,,,
2,CALCA RAMP1,18599553 CALCA RAMP1,Homo sapiens,Homo sapiens,,,,
3,PODXL2 SELE,18606703 PODXL2 SELE,Homo sapiens,Homo sapiens,,,,
4,TMPRSS6 HJV,18976966 TMPRSS6 HJV,Homo sapiens,Homo sapiens,,,,
...,...,...,...,...,...,...,...,...
5411,KISS1 KISS1R,12944565 KISS1 KISS1R,,,,,REMOVE-Not support,REMOVE
5412,CCL1 CCR8,12967681 CCL1 CCR8,,,,,REMOVE-Not support,REMOVE
5413,TNF TNFRSF21,9714541 TNF TNFRSF21,,,,,REMOVE-NOT SUPPORT - Al - Agrees,REMOVE
5414,NTNG1 LRRC4C,14595443 NTNG1 LRRC4C,,,,,REMOVE-Reversed Interaction,REMOVE


In [165]:
# Filter out rows where 'notes' contains 'remove'
triplicate['notes'] = triplicate['Remove decision'].str.lower()

# Remove rows where 'notes' contains 'remove'
triplicate_with_remove = triplicate[triplicate['notes'].str.contains('remove', na=False)]

# Remove rows where 'notes' contains 'remove'
triplicate = triplicate[~triplicate['notes'].str.contains('remove', na=False)]

triplicate
triplicate_with_remove

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  triplicate['notes'] = triplicate['Remove decision'].str.lower()


Unnamed: 0,LR_pair,triplet,ligand_species_ann,receptor_species_ann,used Human L,used Human R,notes,Remove decision
0,ROBO1 ROBO2,26186094 ROBO1 ROBO2,,,,,remove,REMOVE
6,IL18 IL18RAP,15760905 IL18 IL18RAP,,,,,remove,REMOVE
152,BDNF NTRK1,2157470 BDNF NTRK1,,,,,remove,REMOVE
178,IL1A IL1RAP,2950091 IL1A IL1RAP,,,,,remove,REMOVE
221,HSPG2 LRP1,7526899 HSPG2 LRP1,,,,,remove,REMOVE
...,...,...,...,...,...,...,...,...
5376,COL4A3 CD93,,Homo sapiens,Homo sapiens,✅,✅,remove,REMOVE
5377,COL4A4 CD93,,Homo sapiens,Homo sapiens,✅,✅,remove,REMOVE
5378,COL4A5 CD93,,Homo sapiens,Homo sapiens,✅,✅,remove,REMOVE
5379,COL4A6 CD93,,Homo sapiens,Homo sapiens,✅,✅,remove,REMOVE


In [9]:
# Convert both columns to sets for easy set operations
set_filtered = set(triplicate['LR_pair'])
set_unfiltered = set(triplicate_with_remove['LR_pair'])

# Overlap (present in both)
overlap = list(set_filtered & set_unfiltered)

# Not covered (in filtered but NOT in unfiltered)
not_covered = list(set_unfiltered - set_filtered)

print("LR_pairs covered (overlap):", overlap, len(overlap))
print("LR_pairs not covered:", not_covered, len(not_covered))

NameError: name 'triplicate_with_remove' is not defined

In [167]:
triplicate_with_remove.to_csv("data/rows_with_remove.csv")

In [10]:
# Group by the two species columns and count the number of rows for each combination
summary_counts = triplicate.groupby(['ligand_species_ann', 'receptor_species_ann']).size().reset_index(name='count')

# Optional: sort by count descending
summary_counts = summary_counts.sort_values(by='count', ascending=False)
summary_counts

Unnamed: 0,ligand_species_ann,receptor_species_ann,count
36,Homo sapiens,Homo sapiens,3082
104,Mus Musculus,Mus Musculus,702
50,Homo sapiens,Mus Musculus,181
0,,,179
101,Mus Musculus,Homo sapiens,176
...,...,...,...
74,"Homo sapiens, Mus Musculus",Rattus norvegicus,1
71,"Homo sapiens, Mus Musculus","Homo sapiens, Mus Musculus, Rattus norvegicus,...",1
67,"Homo sapiens, Gallus gallus",Homo sapiens,1
62,Homo sapiens,Xenopus laevis,1


In [15]:
summary_counts.to_csv("data/pairs_temp_perPair.csv")

In [169]:
homo_sapiens_in_both = triplicate[
    triplicate['ligand_species_ann'].str.contains('Homo sapiens', case=False, na=False) &
    triplicate['receptor_species_ann'].str.contains('Homo sapiens', case=False, na=False)
]

print("Rows where both species columns contain 'Homo sapiens':", homo_sapiens_in_both.shape[0])


Rows where both species columns contain 'Homo sapiens': 3536


In [170]:
homo_sapiens_in_both.shape[0]/len(triplicate)

0.6742944317315027

In [171]:
len(triplicate)

5244

In [172]:
triplicate

Unnamed: 0,LR_pair,triplet,ligand_species_ann,receptor_species_ann,used Human L,used Human R,notes,Remove decision
1,ANGPT2 ITGA5,16424009 ANGPT2 ITGA5,Homo sapiens,Homo sapiens,✅,✅,,
2,ANGPT2 ITGAV,16424009 ANGPT2 ITGAV,Homo sapiens,Homo sapiens,✅,✅,,
3,ANGPT2 ITGB1,16424009 ANGPT2 ITGB1,Homo sapiens,Homo sapiens,✅,✅,,
4,IL18 CD48,15760905 IL18 CD48,Homo sapiens,Homo sapiens,✅,✅,,
5,IL18 IL18R1,15760905 IL18 IL18R1,Homo sapiens,Homo sapiens,✅,✅,,
...,...,...,...,...,...,...,...,...
5406,LAMA5 ITGA7,,Homo sapiens,Homo sapiens,✅,✅,,
5407,LAMB2 ITGB4,,Homo sapiens,Homo sapiens,✅,✅,,
5408,LAMB2 ITGA6,,Homo sapiens,Homo sapiens,✅,✅,,
5409,LAMB1 ITGB4,,"Mus Musculus, Homo sapiens",Homo sapiens,✅,✅,,


In [177]:
homo_sapiens_in_both

Unnamed: 0,LR_pair,triplet,ligand_species_ann,receptor_species_ann,used Human L,used Human R,notes,Remove decision
1,ANGPT2 ITGA5,16424009 ANGPT2 ITGA5,Homo sapiens,Homo sapiens,✅,✅,,
2,ANGPT2 ITGAV,16424009 ANGPT2 ITGAV,Homo sapiens,Homo sapiens,✅,✅,,
3,ANGPT2 ITGB1,16424009 ANGPT2 ITGB1,Homo sapiens,Homo sapiens,✅,✅,,
4,IL18 CD48,15760905 IL18 CD48,Homo sapiens,Homo sapiens,✅,✅,,
5,IL18 IL18R1,15760905 IL18 IL18R1,Homo sapiens,Homo sapiens,✅,✅,,
...,...,...,...,...,...,...,...,...
5406,LAMA5 ITGA7,,Homo sapiens,Homo sapiens,✅,✅,,
5407,LAMB2 ITGB4,,Homo sapiens,Homo sapiens,✅,✅,,
5408,LAMB2 ITGA6,,Homo sapiens,Homo sapiens,✅,✅,,
5409,LAMB1 ITGB4,,"Mus Musculus, Homo sapiens",Homo sapiens,✅,✅,,


In [12]:
LR_pair_flat = (
    triplicate
    .groupby('LR_pair', as_index=False)
    .agg(lambda x: ', '.join(sorted(set(x.dropna().astype(str)))))
)
LR_pair_flat

Unnamed: 0,LR_pair,triplet,ligand_species_ann,receptor_species_ann,used Human L,used Human R,notes,Remove decision
0,A2M HSPA5,"12194978 A2M HSPA5, 32541810 A2M HSPA5",Homo sapiens,Homo sapiens,,,", human α2M, human Grp 78 (HSPA5?)",
1,A2M LRP1,"10652313 A2M LRP1, 12194978 A2M LRP1, 1702392 ...",Homo sapiens,Homo sapiens,,,", LRP primary accession number Q0754 was perfo...",
2,ACE BDKRB2,10748135 ACE BDKRB2,Homo sapiens,Homo sapiens,,,,
3,ADA DPP4,15213224 ADA DPP4,Homo sapiens,Bos taurus,,,,
4,ADAM10 EFNA5,16239146 ADAM10 EFNA5,Bos taurus,Mus Musculus,,,NEW PAIR,
...,...,...,...,...,...,...,...,...
3540,ZG16B TLR2,20802527 ZG16B TLR2,Homo sapiens,"Homo sapiens, Mus Musculus",,,"The expression vectors pHA-mTLR2, -dominant mu...",
3541,ZG16B TLR4,20802527 ZG16B TLR4,Homo sapiens,"Homo sapiens, Mus Musculus",,,"Human TLR1, 2, 3, 4, 5, 8, 9, and 10, CXCR4 an...",
3542,ZG16B TLR5,20802527 ZG16B TLR5,Homo sapiens,Homo sapiens,,,PAUF is a mammalian ligand identified for the ...,
3543,ZG16B TLR6,20802527 ZG16B TLR6,Homo sapiens,"Homo sapiens, Mus Musculus",,,,


In [14]:
# Group by the two species columns and count the number of rows for each combination
summary_counts = LR_pair_flat.groupby(['ligand_species_ann', 'receptor_species_ann']).size().reset_index(name='count')

# Optional: sort by count descending
summary_counts = summary_counts.sort_values(by='count', ascending=False)
summary_counts

Unnamed: 0,ligand_species_ann,receptor_species_ann,count
83,Homo sapiens,Homo sapiens,1644
247,Mus Musculus,Mus Musculus,407
168,"Homo sapiens, Mus Musculus","Homo sapiens, Mus Musculus",175
111,Homo sapiens,Mus Musculus,93
0,,,87
...,...,...,...
133,"Homo sapiens, Homo sapiens, Mus Musculus","Homo sapiens, Mus Musculus, Homo sapiens",1
134,"Homo sapiens, Homo sapiens, Mus Musculus","Homo sapiens, Mus Musculus, Mus Musculus",1
135,"Homo sapiens, Homo sapiens, Mus Musculus","Mus Musculus, Mus Musculus, Homo sapiens",1
136,"Homo sapiens, Homo sapiens, Mus Musculus","Mus Musculus, Rattus norvegicus",1


In [178]:
homo_sapiens_in_both = LR_pair_flat[
    LR_pair_flat['ligand_species_ann'].str.contains('Homo sapiens', case=False, na=False) &
    LR_pair_flat['receptor_species_ann'].str.contains('Homo sapiens', case=False, na=False)
]

print("Rows where both species columns contain 'Homo sapiens':", homo_sapiens_in_both.shape[0])
homo_sapiens_in_both.shape[0]/len(LR_pair_flat)

Rows where both species columns contain 'Homo sapiens': 2444


0.7061542906674372

In [179]:
mus_musculus_in_both = (
    LR_pair_flat[
        LR_pair_flat['ligand_species_ann'].str.contains('Mus musculus', case=False, na=False) &
        LR_pair_flat['receptor_species_ann'].str.contains('Mus musculus', case=False, na=False)
    ]
)

result = mus_musculus_in_both[
    ~(
        LR_pair_flat['ligand_species_ann'].str.contains('Homo sapiens', case=False, na=False) &
        LR_pair_flat['receptor_species_ann'].str.contains('Homo sapiens', case=False, na=False)
    )
]

print(len(result))

529


  result = mus_musculus_in_both[


In [157]:
result.to_csv("data/mouse_as_main_pairs.csv")

In [12]:
import sys
import os
import pandas as pd
import warnings
import re
from itables import init_notebook_mode, show
from IPython.display import display, Javascript
import itables.options as opt
import createDataTable_perSpecies

warnings.filterwarnings("ignore", category=pd.errors.DtypeWarning)

# === Species Parameter === #
species = "horse"  # Change to "zebrafish", "sheep", etc.
species_file_prefix = {
    "mouse": "mmusculus",
    "rat": "rnorvegicus",
    "zebrafish": "drerio",
    "chimpanzee": "ptroglodytes",
    "chicken": "ggallus",
    "pig": "sscrofa",
    "cow": "btaurus",
    "dog": "clfamiliaris",
    "horse": "ecaballus",
    "marmoset": "cjacchus",
    "macaque": "mmulatta",
    "sheep": "oarambouillet"
}[species]

# === Load gene pair === #
gene_pair_var = f"{species}_gene_pair1"
gene_pair_df = getattr(createDataTable_perSpecies, gene_pair_var)
ligand_ens_id = [col for col in gene_pair_df.columns if "Ligand Ensembl ID" in col][0]
receptor_ens_id = [col for col in gene_pair_df.columns if "Receptor Ensembl ID" in col][0]


gene_pair_df = gene_pair_df[[
    '<span title="Double-click header of Interaction ID to ensure all values are shown">Interaction ID&nbsp;</span>',
    '<span title="Double-click header of Ligand to ensure all values are shown">Ligand&nbsp;</span>',
    '<span title="Double-click header of Receptor to ensure all values are shown">Receptor&nbsp;</span>',
    'LR Pair Card', f'{species.capitalize()} LR Pair',
    '<span title="HUGO Gene Nomenclature Committee (HGNC) ID. Click on the link for more details">Ligand HGNC ID&nbsp;&nbsp;</span>',
    '<span title="HUGO Gene Nomenclature Committee (HGNC) ID. Click on the link for more details">Receptor HGNC ID&nbsp;&nbsp;</span>',
    'Ligand GOC score', 'Ligand WGA coverage',
    'Ligand % Identity', 'Ligand Target % Identity',
    'Ligand Orthology Confidence', ligand_ens_id,
    'Receptor GOC score', 'Receptor WGA coverage', 'Receptor % Identity',
    'Receptor Target % Identity', 'Receptor Orthology Confidence', receptor_ens_id]]

# Rename columns
rename_dict = dict(zip(gene_pair_df.columns[:7], [
    "Interaction ID", "Ligand", "Receptor", "LR Pair Card",
    f"{species.capitalize()} LR Pair", "Ligand HGNC ID", "Receptor HGNC ID"
]))
gene_pair_df.rename(columns=rename_dict, inplace=True)

gene_pair_df = gene_pair_df.rename(columns={
    ligand_ens_id: "Ligand Ensembl ID",
    receptor_ens_id: "Receptor Ensembl ID"
})
    

# Load ortholog mapping
biomart_df = pd.read_csv(f"data/{species_file_prefix}_ID_biomart.csv", dtype=str)
biomart_df = biomart_df.dropna(subset=[f"{species_file_prefix}_homolog_ensembl_gene", "ensembl_gene_id"])

# Extract ID from anchor tags
def extract_link_text(html_string):
    match = re.search(r'<a[^>]*>(.*?)</a>', html_string)
    return match.group(1).strip() if match else None

def extract_hgnc_id(col):
    match = re.search(r'HGNC:(\d+)', col)
    return 'HGNC:' + str(match.group(1)) if match else None
def extract_paircard_id(col):
    """Use regular expression to extract the HGNC ID after 'cards/'."""
    match = re.search(r'cards/([^/]+)\.html', col)
    if match:
        return str(match.group(1))
    return None
    
# Process columns
gene_pair_df['LR Pair Card'] = gene_pair_df['LR Pair Card'].apply(extract_paircard_id)
gene_pair_df['Ligand HGNC ID'] = gene_pair_df['Ligand HGNC ID'].apply(extract_hgnc_id)
gene_pair_df['Receptor HGNC ID'] = gene_pair_df['Receptor HGNC ID'].apply(extract_hgnc_id)

# Mapping
species_to_human_map = dict(zip(
    biomart_df[f"{species_file_prefix}_homolog_ensembl_gene"],
    biomart_df["ensembl_gene_id"]
))

gene_pair_df["Human Ligand Ensembl ID"] = gene_pair_df["Ligand Ensembl ID"].map(species_to_human_map)
gene_pair_df["Human Receptor Ensembl ID"] = gene_pair_df["Receptor Ensembl ID"].map(species_to_human_map)

# Load df_merged
merged_df = pd.read_csv(f"data/df_merged_with_{species}_ensembl.tsv", sep="\t")

# Index for merge
gene_pair_indexed = gene_pair_df.reset_index(drop=False).rename(columns={"index": "orig_row"})

# LIGAND MERGE
df_ligand = merged_df.add_prefix("Ligand_")
ligand_merge = gene_pair_indexed.merge(
    df_ligand,
    left_on="Human Ligand Ensembl ID",
    right_on="Ligand_human_ensembl_gene_id",
    how="left"
)
if isinstance(ligand_merge.columns, pd.MultiIndex):
    ligand_merge.columns = ligand_merge.columns.get_level_values(0)
if ligand_merge.columns.duplicated().any():
    ligand_merge = ligand_merge.loc[:, ~ligand_merge.columns.duplicated()]

def resolve_ligand_row(group):
    match = group[group[f"Ligand_{species}_ensembl_gene_id"] == group["Ligand Ensembl ID"]]
    return match.iloc[[0]] if len(match) else group.iloc[[0]]

ligand_final = ligand_merge.groupby("orig_row", group_keys=False).apply(resolve_ligand_row).reset_index(drop=True)

# RECEPTOR MERGE
df_receptor = merged_df.add_prefix("Receptor_")
receptor_merge = ligand_final.merge(
    df_receptor,
    left_on="Human Receptor Ensembl ID",
    right_on="Receptor_human_ensembl_gene_id",
    how="left"
)
if isinstance(receptor_merge.columns, pd.MultiIndex):
    receptor_merge.columns = receptor_merge.columns.get_level_values(0)
if receptor_merge.columns.duplicated().any():
    receptor_merge = receptor_merge.loc[:, ~receptor_merge.columns.duplicated()]

def resolve_receptor_row(group):
    match = group[group[f"Receptor_{species}_ensembl_gene_id"] == group["Receptor Ensembl ID"]]
    return match.iloc[[0]] if len(match) else group.iloc[[0]]

final_result = receptor_merge.groupby("orig_row", group_keys=False).apply(resolve_receptor_row).reset_index(drop=True).drop(columns=["orig_row"])

assert len(final_result) == len(gene_pair_df), f"Row mismatch: {len(final_result)} != {len(gene_pair_df)}"

final_result.to_csv(f"data/human_{species}_merged_ensemblBiomaRt_inParanoid.csv")



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gene_pair_df.rename(columns=rename_dict, inplace=True)
  ligand_final = ligand_merge.groupby("orig_row", group_keys=False).apply(resolve_ligand_row).reset_index(drop=True)
  final_result = receptor_merge.groupby("orig_row", group_keys=False).apply(resolve_receptor_row).reset_index(drop=True).drop(columns=["orig_row"])


In [13]:
# summarize counts depending on filter
species = "horse"  # Change to desired species, e.g., "zebrafish", "sheep"
capital_species = species.capitalize()

# Load final result file
final_result = pd.read_csv(f"data/human_{species}_merged_ensemblBiomaRt_inParanoid.csv")

# Define score columns dynamically
score_cols = [
    f"Ligand_human_inparalog_score",
    f"Receptor_human_inparalog_score",
    f"Ligand_{species}_inparalog_score",
    f"Receptor_{species}_inparalog_score",
    f"Ligand_human_seed_score",
    f"Receptor_human_seed_score",
    f"Ligand_{species}_seed_score",
    f"Receptor_{species}_seed_score",
    f"Ligand_bitscore",
    f"Receptor_bitscore"
]

for col in score_cols:
    if col in final_result.columns:
        final_result[col] = pd.to_numeric(final_result[col], errors='coerce')

# Detect columns
confidence_orth_ligand = [col for col in final_result.columns if "Ligand Orthology Confidence" in col][0]
GOC_col_ligand = [col for col in final_result.columns if "Ligand GOC" in col][0]
percIdent_col_ligand = [col for col in final_result.columns if "Ligand % Identity" in col][0]
human_ligand_col = [col for col in final_result.columns if "Ligand HGNC ID" in col][0]
ligand_col = [col for col in final_result.columns if "Ligand" in col][0]
ligand_human_inparalog_score = [col for col in final_result.columns if "Ligand_human_inparalog_score" in col][0]
ligand_species_inparalog_score = [col for col in final_result.columns if f"Ligand_{species}_inparalog_score" in col][0]
ligand_human_seed_score = [col for col in final_result.columns if "Ligand_human_seed_score" in col][0]
ligand_species_seed_score = [col for col in final_result.columns if f"Ligand_{species}_seed_score" in col][0]
ligand_bit_score = [col for col in final_result.columns if "Ligand_bitscore" in col][0]

human_receptor_col = [col for col in final_result.columns if "Receptor HGNC ID" in col][0]
confidence_orth_receptor = [col for col in final_result.columns if "Receptor Orthology Confidence" in col][0]
GOC_col_receptor = [col for col in final_result.columns if "Receptor GOC" in col][0]
percIdent_col_receptor = [col for col in final_result.columns if "Receptor % Identity" in col][0]
receptor_col = [col for col in final_result.columns if "Receptor" in col][0]
receptor_human_inparalog_score = [col for col in final_result.columns if "Receptor_human_inparalog_score" in col][0]
receptor_species_inparalog_score = [col for col in final_result.columns if f"Receptor_{species}_inparalog_score" in col][0]
receptor_human_seed_score = [col for col in final_result.columns if "Receptor_human_seed_score" in col][0]
receptor_species_seed_score = [col for col in final_result.columns if f"Receptor_{species}_seed_score" in col][0]
receptor_bit_score = [col for col in final_result.columns if "Receptor_bitscore" in col][0]

# Define function

def summarize_orthologs(human_col, species_col, label,
                        confidence_orth_col=None, confidence_orth_threshold=None,
                        GOC_col=None, GOC_threshold=None,
                        perc_identity_col=None, perc_identity_thres=None,
                        ligand_human_inparalog_score_col=None, ligand_human_inparalog_score_threshold=None,
                        receptor_human_inparalog_score_col=None, receptor_human_inparalog_score_threshold=None,
                        ligand_species_inparalog_score_col=None, ligand_species_inparalog_score_threshold=None,
                        receptor_species_inparalog_score_col=None, receptor_species_inparalog_score_threshold=None,
                        ligand_human_seed_score_col=None, ligand_human_seed_score_threshold=None,
                        receptor_human_seed_score_col=None, receptor_human_seed_score_threshold=None,
                        ligand_species_seed_score_col=None, ligand_species_seed_score_threshold=None,
                        receptor_species_seed_score_col=None, receptor_species_seed_score_threshold=None,
                        ligand_bit_score_col=None, ligand_bit_score_threshold=None,
                        receptor_bit_score_col=None, receptor_bit_score_threshold=None):

    df = final_result.copy()

    filters = [
        (confidence_orth_col, lambda x: x == confidence_orth_threshold),
        (GOC_col, lambda x: x >= GOC_threshold),
        (perc_identity_col, lambda x: x >= perc_identity_thres),
        (ligand_human_inparalog_score_col, lambda x: x >= ligand_human_inparalog_score_threshold),
        (receptor_human_inparalog_score_col, lambda x: x >= receptor_human_inparalog_score_threshold),
        (ligand_species_inparalog_score_col, lambda x: x >= ligand_species_inparalog_score_threshold),
        (receptor_species_inparalog_score_col, lambda x: x >= receptor_species_inparalog_score_threshold),
        (ligand_human_seed_score_col, lambda x: x >= ligand_human_seed_score_threshold),
        (receptor_human_seed_score_col, lambda x: x >= receptor_human_seed_score_threshold),
        (ligand_species_seed_score_col, lambda x: x >= ligand_species_seed_score_threshold),
        (receptor_species_seed_score_col, lambda x: x >= receptor_species_seed_score_threshold),
        (ligand_bit_score_col, lambda x: x >= ligand_bit_score_threshold),
        (receptor_bit_score_col, lambda x: x >= receptor_bit_score_threshold),
    ]

    original_rows = df.shape[0]
    for col, condition in filters:
        if col and condition is not None:
            before = df.shape[0]
            df = df[df[col].apply(condition)]
            after = df.shape[0]
            print(f"Filtered {col}: {before - after} rows removed (remaining: {after})")

    unique_pairs = df[[human_col, species_col]].drop_duplicates()

    counts = (
        unique_pairs
        .groupby(human_col)[species_col]
        .count()
        .sort_values(ascending=False)
        .reset_index(name='count')
    )

    filter_tag = f"{label.lower()}"
    if confidence_orth_threshold is not None:
        filter_tag += f"_conf{confidence_orth_threshold}"
    if GOC_threshold is not None:
        filter_tag += f"_GOCge{GOC_threshold}"
    if ligand_human_inparalog_score_threshold is not None:
        filter_tag += f"_LHISge{ligand_human_inparalog_score_threshold}"
    if receptor_bit_score_threshold is not None:
        filter_tag += f"_RBSge{receptor_bit_score_threshold}"

    #counts.to_csv(f"data/human_{species}_orth_count_{filter_tag}.csv", index=False)

    summary_counts = counts['count'].value_counts().sort_index()
    total_human_genes = counts.shape[0]

    summary_lines = [
        f"Out of {total_human_genes} unique human {label.lower()} genes:",
        f" - Filters applied: " + "; ".join([
            f"{col} ≥ {threshold}" for col, threshold in [
                (confidence_orth_col, confidence_orth_threshold),
                (GOC_col, GOC_threshold),
                (perc_identity_col, perc_identity_thres),
                (ligand_human_inparalog_score_col, ligand_human_inparalog_score_threshold),
                (receptor_human_inparalog_score_col, receptor_human_inparalog_score_threshold),
                (ligand_species_inparalog_score_col, ligand_species_inparalog_score_threshold),
                (receptor_species_inparalog_score_col, receptor_species_inparalog_score_threshold),
                (ligand_human_seed_score_col, ligand_human_seed_score_threshold),
                (receptor_human_seed_score_col, receptor_human_seed_score_threshold),
                (ligand_species_seed_score_col, ligand_species_seed_score_threshold),
                (receptor_species_seed_score_col, receptor_species_seed_score_threshold),
                (ligand_bit_score_col, ligand_bit_score_threshold),
                (receptor_bit_score_col, receptor_bit_score_threshold),
            ] if threshold is not None and col is not None
        ])
    ]

    for orth_count, gene_count in summary_counts.items():
        summary_lines.append(
            f" - {gene_count} human {label.lower()} genes had {orth_count} {species} ortholog(s)"
        )

    return "\n".join(summary_lines)

In [18]:
# Ligand
ligand_summary = summarize_orthologs(
    human_col=human_ligand_col,
    species_col=ligand_col,
    label="Ligand",
    # GOC_col=GOC_col_ligand,
    # GOC_threshold=25,
    confidence_orth_col=confidence_orth_ligand,
    confidence_orth_threshold=1,
    # perc_identity_col = percIdent_col_ligand,
    # perc_identity_thres = 60
    # ligand_human_inparalog_score_col=ligand_human_inparalog_score,
    # ligand_human_inparalog_score_threshold=1,
    ligand_bit_score_col=ligand_bit_score,
    ligand_bit_score_threshold=40,
    # ligand_species_inparalog_score_col=ligand_species_inparalog_score,
    # ligand_species_inparalog_score_threshold=1,
)
print(ligand_summary)

# Receptor
receptor_summary = summarize_orthologs(
    human_col=human_receptor_col,
    species_col=receptor_col,
    label="Receptor",
    # GOC_col=GOC_col_receptor,
    # GOC_threshold=25,
    confidence_orth_col=confidence_orth_receptor,
    confidence_orth_threshold=1,
    # perc_identity_col = percIdent_col_receptor,
    # perc_identity_thres = 60
    # receptor_human_inparalog_score_col=receptor_human_inparalog_score,
    # receptor_human_inparalog_score_threshold=1,
    receptor_bit_score_col=receptor_bit_score,
    receptor_bit_score_threshold=40,
    # receptor_species_inparalog_score_col=receptor_species_inparalog_score,
    # receptor_species_inparalog_score_threshold=1,
)
print(receptor_summary)

Filtered Ligand Orthology Confidence: 341 rows removed (remaining: 2650)
Filtered Ligand_bitscore: 121 rows removed (remaining: 2529)
Out of 731 unique human ligand genes:
 - Filters applied: Ligand Orthology Confidence ≥ 1; Ligand_bitscore ≥ 40
 - 727 human ligand genes had 1 horse ortholog(s)
 - 3 human ligand genes had 2 horse ortholog(s)
 - 1 human ligand genes had 3 horse ortholog(s)
Filtered Receptor Orthology Confidence: 163 rows removed (remaining: 2828)
Filtered Receptor_bitscore: 92 rows removed (remaining: 2736)
Out of 644 unique human receptor genes:
 - Filters applied: Receptor Orthology Confidence ≥ 1; Receptor_bitscore ≥ 40
 - 641 human receptor genes had 1 horse ortholog(s)
 - 1 human receptor genes had 2 horse ortholog(s)
 - 2 human receptor genes had 3 horse ortholog(s)


In [2]:

df_orthologs.to_csv(f"data/inParanoid_{species_input}.csv", index=False)

# === Step 4: Human UniProt → HGNC/Ensembl mapping ===
hgnc_df = pd.read_csv("data/HGNC_gene_info_full.tsv", sep="\t", dtype=str)
hgnc_df = hgnc_df.dropna(subset=["uniprot_ids", "ensembl_gene_id"])
hgnc_exploded = hgnc_df.assign(uniprot_id=hgnc_df["uniprot_ids"].str.split(",")).explode("uniprot_id")
hgnc_exploded["uniprot_id"] = hgnc_exploded["uniprot_id"].str.strip()

df_merged = df_orthologs.merge(
    hgnc_exploded[["uniprot_id", "ensembl_gene_id", "symbol"]],
    left_on="human_protein",
    right_on="uniprot_id",
    how="left"
)

df_merged = df_merged.rename(columns={
    "symbol": "human_gene",
    "ensembl_gene_id": "human_ensembl_gene_id"
}).drop(columns=["uniprot_id"])

df_merged = df_merged.dropna(subset=["human_ensembl_gene_id"])
df_merged.to_csv(f"data/{species_input}_inParanoid_withHGNC.tsv", sep="\t", index=False)

# === Step 5: Optional - Species UniProt → Ensembl mapping ===
map_path = f"data/{species_input}_uniprot_to_ensembl.tsv"
try:
    species_map = pd.read_csv(map_path, sep="\t", dtype=str)
    df_merged = df_merged.merge(
        species_map,
        left_on=f"{species_input}_protein",
        right_on="uniprotswissprot",
        how="left"
    ).rename(columns={"ensembl_gene_id": f"{species_input}_ensembl_gene_id"}) \
     .drop(columns=["uniprotswissprot"])
except FileNotFoundError:
    print(f"⚠️  Mapping file not found: {map_path}")

df_merged.to_csv(f"data/df_merged_with_{species_input}_ensembl.tsv", sep="\t", index=False)


EmptyDataError: No columns to parse from file

In [339]:
# Ligand
ligand_summary = summarize_orthologs(
    human_col=human_ligand_col,
    species_col=ligand_col,
    label="Ligand",
    # GOC_col=GOC_col_ligand,
    # GOC_threshold=25,
    # confidence_orth_col=confidence_orth_ligand,
    # confidence_orth_threshold=1,
    # perc_identity_col = percIdent_col_ligand,
    # perc_identity_thres = 60
    # ligand_human_inparalog_score_col=ligand_human_inparalog_score,
    # ligand_human_inparalog_score_threshold=1,
    ligand_bit_score_col=ligand_bit_score,
    ligand_bit_score_threshold=40,
    # ligand_species_inparalog_score_col=ligand_species_inparalog_score,
    # ligand_species_inparalog_score_threshold=1,
)
print(ligand_summary)

# Receptor
receptor_summary = summarize_orthologs(
    human_col=human_receptor_col,
    species_col=receptor_col,
    label="Receptor",
    # GOC_col=GOC_col_receptor,
    # GOC_threshold=25,
    # confidence_orth_col=confidence_orth_receptor,
    # confidence_orth_threshold=1,
    # perc_identity_col = percIdent_col_receptor,
    # perc_identity_thres = 60
    # receptor_human_inparalog_score_col=receptor_human_inparalog_score,
    # receptor_human_inparalog_score_threshold=1,
    receptor_bit_score_col=receptor_bit_score,
    receptor_bit_score_threshold=40,
    # receptor_species_inparalog_score_col=receptor_species_inparalog_score,
    # receptor_species_inparalog_score_threshold=1,
)
print(receptor_summary)



Filtered Ligand_bitscore: 1323 rows removed (remaining: 4193)
Out of 509 unique human ligand genes:
 - Filters applied: Ligand_bitscore ≥ 40
 - 311 human ligand genes had 1 zebrafish ortholog(s)
 - 178 human ligand genes had 2 zebrafish ortholog(s)
 - 9 human ligand genes had 3 zebrafish ortholog(s)
 - 4 human ligand genes had 4 zebrafish ortholog(s)
 - 4 human ligand genes had 5 zebrafish ortholog(s)
 - 2 human ligand genes had 7 zebrafish ortholog(s)
 - 1 human ligand genes had 8 zebrafish ortholog(s)
Filtered Receptor_bitscore: 1293 rows removed (remaining: 4223)
Out of 438 unique human receptor genes:
 - Filters applied: Receptor_bitscore ≥ 40
 - 273 human receptor genes had 1 zebrafish ortholog(s)
 - 139 human receptor genes had 2 zebrafish ortholog(s)
 - 12 human receptor genes had 3 zebrafish ortholog(s)
 - 7 human receptor genes had 4 zebrafish ortholog(s)
 - 3 human receptor genes had 5 zebrafish ortholog(s)
 - 3 human receptor genes had 7 zebrafish ortholog(s)
 - 1 human rece

In [322]:
extract_link_text

def extract_paircard_id(col):
    """Use regular expression to extract the HGNC ID after 'cards/'."""
    match = re.search(r'cards/([^/]+)\.html', col)
    if match:
        return str(match.group(1))
    return None



In [299]:

#  'Sheep LR Pair' data
update_connectomedb_qmd(
    qmd_file_path="database/other/sheepOrth.qmd", 
    lr_pair_data=createDataTable_perSpecies.sheep_gene_pair1["Sheep LR Pair"],
    species_name="Ovis aries rambouillet",
    species = "Sheep",
    ortholog = True
)


--- Updating database/other/sheepOrth.qmd for Ovis aries rambouillet ---
Successfully updated 'database/other/sheepOrth.qmd' for Ovis aries rambouillet.


In [203]:
import sys
import os
import pandas as pd
import warnings
import re
warnings.filterwarnings("ignore", category=pd.errors.DtypeWarning)
from itables import init_notebook_mode, show
from IPython.display import display, Javascript
import itables.options as opt
from createDataTable_perSpecies import mouse_gene_pair1

mouse_gene_pair1= mouse_gene_pair1[['<span title="Double-click header of Interaction ID to ensure all values are shown">Interaction ID&nbsp;</span>','<span title="Double-click header of Ligand to ensure all values are shown">Ligand&nbsp;</span>',
       '<span title="Double-click header of Receptor to ensure all values are shown">Receptor&nbsp;</span>',
                  'LR Pair Card', 'Mouse LR Pair','<span title="HUGO Gene Nomenclature Committee (HGNC) ID. Click on the link for more details">Ligand HGNC ID&nbsp;&nbsp;</span>',
       '<span title="HUGO Gene Nomenclature Committee (HGNC) ID. Click on the link for more details">Receptor HGNC ID&nbsp;&nbsp;</span>','Ligand GOC score', 'Ligand WGA coverage',
       'Ligand % Identity', 'Ligand Target % Identity',
       'Ligand Orthology Confidence', 'Ligand Ensembl ID',
       'Receptor GOC score', 'Receptor WGA coverage', 'Receptor % Identity',
       'Receptor Target % Identity', 'Receptor Orthology Confidence','Receptor Ensembl ID']]

mouse_gene_pair1.columns = [
    "Interaction ID",
    "Ligand",
    "Receptor",
    "LR Pair Card", 
    "Mouse LR Pair",
    "Ligand HGNC ID",
    "Receptor HGNC ID",
    *mouse_gene_pair1.columns[7:]
]
mousebioM_df = pd.read_csv("data/mmusculus_ID_biomart.csv", dtype=str)
mousebioM_df = mousebioM_df.dropna(subset=["mmusculus_homolog_ensembl_gene", "ensembl_gene_id"])


def extract_link_text(html_string):
    """Extract visible text from an anchor tag <a>...</a>."""
    match = re.search(r'<a[^>]*>(.*?)</a>', html_string)
    if match:
        return match.group(1).strip()
    return None

mouse_gene_pair1['LR Pair Card'] = mouse_gene_pair1['LR Pair Card'].apply(extract_paircard_id)
# Create the mapping dictionary from mouse to human Ensembl gene ID
mouse_to_human_map = dict(zip(
    mousebioM_df["mmusculus_homolog_ensembl_gene"],
    mousebioM_df["ensembl_gene_id"]
))

# Map Ligand
mouse_gene_pair1["Human Ligand Ensembl ID"] = mouse_gene_pair1["Ligand Ensembl ID"].map(mouse_to_human_map)

# Map Receptor
mouse_gene_pair1["Human Receptor Ensembl ID"] = mouse_gene_pair1["Receptor Ensembl ID"].map(mouse_to_human_map)
def extract_hgnc_id(col):
    """Use regular expression to extract the HGNC ID after 'HGNC:'."""
    match = re.search(r'HGNC:(\d+)', col)
    if match:
        return 'HGNC:' +str(match.group(1))
    return None
    
mouse_gene_pair1['Ligand HGNC ID'] = mouse_gene_pair1['Ligand HGNC ID'].apply(extract_hgnc_id)
mouse_gene_pair1['Receptor HGNC ID'] = mouse_gene_pair1['Receptor HGNC ID'].apply(extract_hgnc_id)
df_merged =pd.read_csv("data/df_merged_with_mouse_ensembl.tsv",sep="\t")

# Step 0: Add 'orig_row' once, keep it clean
mouse_gene_pair1_indexed = mouse_gene_pair1.reset_index(drop=False).rename(columns={"index": "orig_row"})

### === LIGAND MERGE === ###
df_ligand = df_merged.add_prefix("Ligand_")
ligand_merge = mouse_gene_pair1_indexed.merge(
    df_ligand,
    left_on="Human Ligand Ensembl ID",
    right_on="Ligand_human_ensembl_gene_id",
    how="left"
)

# Ensure orig_row is single column (sometimes merge creates duplicates with suffix)
if isinstance(ligand_merge.columns, pd.MultiIndex):
    ligand_merge.columns = ligand_merge.columns.get_level_values(0)

if ligand_merge.columns.duplicated().any():
    ligand_merge = ligand_merge.loc[:, ~ligand_merge.columns.duplicated()]

def resolve_ligand_row(group):
    match = group[group["Ligand_mouse_ensembl_gene_id"] == group["Ligand Ensembl ID"]]
    if len(match) == 1:
        return match
    elif len(match) > 1:
        return match.iloc[[0]]
    else:
        return group.iloc[[0]]

ligand_final = (
    ligand_merge.groupby("orig_row", group_keys=False)
    .apply(resolve_ligand_row)
    .reset_index(drop=True)
)

### === RECEPTOR MERGE === ###
df_receptor = df_merged.add_prefix("Receptor_")
receptor_merge = ligand_final.merge(
    df_receptor,
    left_on="Human Receptor Ensembl ID",
    right_on="Receptor_human_ensembl_gene_id",
    how="left"
)

# Same cleanup for receptor_merge
if isinstance(receptor_merge.columns, pd.MultiIndex):
    receptor_merge.columns = receptor_merge.columns.get_level_values(0)

if receptor_merge.columns.duplicated().any():
    receptor_merge = receptor_merge.loc[:, ~receptor_merge.columns.duplicated()]

def resolve_receptor_row(group):
    match = group[group["Receptor_mouse_ensembl_gene_id"] == group["Receptor Ensembl ID"]]
    if len(match) == 1:
        return match
    elif len(match) > 1:
        return match.iloc[[0]]
    else:
        return group.iloc[[0]]

final_result = (
    receptor_merge.groupby("orig_row", group_keys=False)
    .apply(resolve_receptor_row)
    .reset_index(drop=True)
    .drop(columns=["orig_row"])
)

assert len(final_result) == len(mouse_gene_pair1), f"Row mismatch: {len(final_result)} != {len(mouse_gene_pair1)}"
final_result


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mouse_gene_pair1['LR Pair Card'] = mouse_gene_pair1['LR Pair Card'].apply(extract_paircard_id)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mouse_gene_pair1["Human Ligand Ensembl ID"] = mouse_gene_pair1["Ligand Ensembl ID"].map(mouse_to_human_map)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mo

Unnamed: 0,Interaction ID,Ligand,Receptor,LR Pair Card,Mouse LR Pair,Ligand HGNC ID,Receptor HGNC ID,Ligand GOC score,Ligand WGA coverage,Ligand % Identity,...,Receptor_human_protein,Receptor_human_inparalog_score,Receptor_human_seed_score,Receptor_mouse_protein,Receptor_mouse_inparalog_score,Receptor_mouse_seed_score,Receptor_bitscore,Receptor_human_ensembl_gene_id,Receptor_human_gene,Receptor_mouse_ensembl_gene_id
0,CDB00001,A2m,Hspa5,A2M-HSPA5,A2m → Hspa5,HGNC:7,HGNC:5238,75.0,100.0,72.4559,...,P11021,1.0,1.0,P20029,1.0,1.000,1246.0,ENSG00000044574,HSPA5,ENSMUSG00000026864
1,CDB00002,A2m,Lrp1,A2M-LRP1,A2m → Lrp1,HGNC:7,HGNC:6692,75.0,100.0,72.4559,...,Q07954,1.0,1.0,Q91ZX7,1.0,1.000,6853.0,ENSG00000123384,LRP1,
2,CDB00003,Ace,Bdkrb2,ACE-BDKRB2,Ace → Bdkrb2,HGNC:2707,HGNC:1030,100.0,100.0,83.3078,...,P30411,1.0,1.0,P32299,1.0,1.000,636.0,ENSG00000168398,BDKRB2,ENSMUSG00000021070
3,CDB00004,Ada,Dpp4,ADA-DPP4,Ada → Dpp4,HGNC:186,HGNC:3009,100.0,100.0,80.7163,...,P27487,1.0,1.0,P28843,1.0,1.000,1365.0,ENSG00000197635,DPP4,ENSMUSG00000035000
4,CDB00005,Adam10,Epha3,ADAM10-EPHA3,Adam10 → Epha3,HGNC:188,HGNC:3387,100.0,100.0,96.1230,...,P29320,1.0,1.0,P29319,1.0,1.000,1923.0,ENSG00000044524,EPHA3,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3956,CDB03444,Pcdhb21,Pcdhb21,Pcdhb21-Pcdhb21,Pcdhb21 → Pcdhb21,,,,,,...,,,,,,,,,,
3957,CDB03445,Pcdhb22,Pcdhb22,Pcdhb22-Pcdhb22,Pcdhb22 → Pcdhb22,,,,,,...,Q9Y5E8,1.0,1.0,Q91XZ8,1.0,0.989,1191.0,ENSG00000113248,PCDHB15,
3958,CDB03446,Pcdhgb8,Pcdhgb8,Pcdhgb8-Pcdhgb8,Pcdhgb8 → Pcdhgb8,,,,,,...,,,,,,,,,,
3959,CDB03447,Saa3,Tlr4,Saa3-Tlr4,Saa3 → Tlr4,,,,,,...,O00206,1.0,1.0,Q9QUK6,1.0,1.000,909.0,ENSG00000136869,TLR4,ENSMUSG00000039005


In [204]:
final_result.to_csv("data/human_mouse_merged_ensemblBiomaRt_inParanoid.csv")


In [136]:
mouse_gene_pair1.columns

Index(['<span title="Double-click header of Interaction ID to ensure all values are shown">Interaction ID&nbsp;</span>',
       'LR Pair Card', 'Mouse LR Pair',
       '<span title="Double-click header of Ligand to ensure all values are shown">Ligand&nbsp;</span>',
       '<span title="Double-click header of Receptor to ensure all values are shown">Receptor&nbsp;</span>',
       '<span title="Genome Informatics (MGI) ID. Click on the link for more details">Ligand MGI ID</span>',
       '<span title="Genome Informatics (MGI) ID. Click on the link for more details">Receptor MGI ID</span>',
       'Ligand Ensembl ID', 'Ligand GOC score', 'Ligand WGA coverage',
       'Ligand % Identity', 'Ligand Target % Identity',
       'Ligand Orthology Confidence', 'Ligand Name', 'Receptor Ensembl ID',
       'Receptor GOC score', 'Receptor WGA coverage', 'Receptor % Identity',
       'Receptor Target % Identity', 'Receptor Orthology Confidence',
       'Receptor Name',
       '<span title="Official G

In [103]:
mouse_gene_pair1= mouse_gene_pair1[['<span title="Double-click header of Interaction ID to ensure all values are shown">Interaction ID&nbsp;</span>',
                  'LR Pair Card', 'Mouse LR Pair','<span title="HUGO Gene Nomenclature Committee (HGNC) ID. Click on the link for more details">Ligand HGNC ID&nbsp;&nbsp;</span>',
       '<span title="HUGO Gene Nomenclature Committee (HGNC) ID. Click on the link for more details">Receptor HGNC ID&nbsp;&nbsp;</span>','Ligand GOC score', 'Ligand WGA coverage',
       'Ligand % Identity', 'Ligand Target % Identity',
       'Ligand Orthology Confidence', 'Ligand Ensembl ID',
       'Receptor GOC score', 'Receptor WGA coverage', 'Receptor % Identity',
       'Receptor Target % Identity', 'Receptor Orthology Confidence','Receptor Ensembl ID']]

In [104]:
mouse_gene_pair1.columns = [
    "Interaction ID",
    "LR Pair Card",
    "Mouse LR Pair",
    "Ligand HGNC ID",
    "Receptor HGNC ID",
    *mouse_gene_pair1.columns[5:]
]
mouse_gene_pair1

Unnamed: 0,Interaction ID,LR Pair Card,Mouse LR Pair,Ligand HGNC ID,Receptor HGNC ID,Ligand GOC score,Ligand WGA coverage,Ligand % Identity,Ligand Target % Identity,Ligand Orthology Confidence,Ligand Ensembl ID,Receptor GOC score,Receptor WGA coverage,Receptor % Identity,Receptor Target % Identity,Receptor Orthology Confidence,Receptor Ensembl ID
0,CDB00001,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...",A2m → Hspa5,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",75.0,100.0,72.4559,72.4559,1.0,ENSMUSG00000030111,75.0,100.0,98.6239,98.4733,1.0,ENSMUSG00000026864
1,CDB00002,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...",A2m → Lrp1,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",75.0,100.0,72.4559,72.4559,1.0,ENSMUSG00000030111,100.0,100.0,97.9974,97.9758,1.0,ENSMUSG00000040249
2,CDB00003,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...",Ace → Bdkrb2,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",100.0,100.0,83.3078,82.9268,1.0,ENSMUSG00000020681,100.0,100.0,79.7954,79.5918,1.0,ENSMUSG00000021070
3,CDB00004,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...",Ada → Dpp4,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",100.0,100.0,80.7163,83.2386,1.0,ENSMUSG00000017697,100.0,100.0,84.4648,85.1316,1.0,ENSMUSG00000035000
4,CDB00005,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...",Adam10 → Epha3,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",100.0,100.0,96.1230,95.9947,1.0,ENSMUSG00000054693,100.0,100.0,96.5412,96.4431,1.0,ENSMUSG00000052504
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3956,CDB03444,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...",Pcdhb21 → Pcdhb21,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",,,,,,ENSMUSG00000044022,,,,,,ENSMUSG00000044022
3957,CDB03445,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...",Pcdhb22 → Pcdhb22,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",,,,,,ENSMUSG00000073591,,,,,,ENSMUSG00000073591
3958,CDB03446,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...",Pcdhgb8 → Pcdhgb8,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",,,,,,ENSMUSG00000103081,,,,,,ENSMUSG00000103081
3959,CDB03447,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...",Saa3 → Tlr4,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",,,,,,ENSMUSG00000040026,,,,,,ENSMUSG00000039005


In [105]:
mousebioM_df = pd.read_csv("data/mmusculus_ID_biomart.csv", dtype=str)
mousebioM_df = mousebioM_df.dropna(subset=["mmusculus_homolog_ensembl_gene", "ensembl_gene_id"])

In [106]:
mousebioM_df= mousebioM_df[["mmusculus_homolog_ensembl_gene", "ensembl_gene_id"]]

In [107]:
import re

def extract_link_text(html_string):
    """Extract visible text from an anchor tag <a>...</a>."""
    match = re.search(r'<a[^>]*>(.*?)</a>', html_string)
    if match:
        return match.group(1).strip()
    return None

mouse_gene_pair1['LR Pair Card'] = mouse_gene_pair1['LR Pair Card'].apply(extract_paircard_id)
mouse_gene_pair1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mouse_gene_pair1['LR Pair Card'] = mouse_gene_pair1['LR Pair Card'].apply(extract_paircard_id)


Unnamed: 0,Interaction ID,LR Pair Card,Mouse LR Pair,Ligand HGNC ID,Receptor HGNC ID,Ligand GOC score,Ligand WGA coverage,Ligand % Identity,Ligand Target % Identity,Ligand Orthology Confidence,Ligand Ensembl ID,Receptor GOC score,Receptor WGA coverage,Receptor % Identity,Receptor Target % Identity,Receptor Orthology Confidence,Receptor Ensembl ID
0,CDB00001,A2M-HSPA5,A2m → Hspa5,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",75.0,100.0,72.4559,72.4559,1.0,ENSMUSG00000030111,75.0,100.0,98.6239,98.4733,1.0,ENSMUSG00000026864
1,CDB00002,A2M-LRP1,A2m → Lrp1,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",75.0,100.0,72.4559,72.4559,1.0,ENSMUSG00000030111,100.0,100.0,97.9974,97.9758,1.0,ENSMUSG00000040249
2,CDB00003,ACE-BDKRB2,Ace → Bdkrb2,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",100.0,100.0,83.3078,82.9268,1.0,ENSMUSG00000020681,100.0,100.0,79.7954,79.5918,1.0,ENSMUSG00000021070
3,CDB00004,ADA-DPP4,Ada → Dpp4,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",100.0,100.0,80.7163,83.2386,1.0,ENSMUSG00000017697,100.0,100.0,84.4648,85.1316,1.0,ENSMUSG00000035000
4,CDB00005,ADAM10-EPHA3,Adam10 → Epha3,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",100.0,100.0,96.1230,95.9947,1.0,ENSMUSG00000054693,100.0,100.0,96.5412,96.4431,1.0,ENSMUSG00000052504
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3956,CDB03444,Pcdhb21-Pcdhb21,Pcdhb21 → Pcdhb21,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",,,,,,ENSMUSG00000044022,,,,,,ENSMUSG00000044022
3957,CDB03445,Pcdhb22-Pcdhb22,Pcdhb22 → Pcdhb22,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",,,,,,ENSMUSG00000073591,,,,,,ENSMUSG00000073591
3958,CDB03446,Pcdhgb8-Pcdhgb8,Pcdhgb8 → Pcdhgb8,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",,,,,,ENSMUSG00000103081,,,,,,ENSMUSG00000103081
3959,CDB03447,Saa3-Tlr4,Saa3 → Tlr4,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",,,,,,ENSMUSG00000040026,,,,,,ENSMUSG00000039005


In [108]:
# Create the mapping dictionary from mouse to human Ensembl gene ID
mouse_to_human_map = dict(zip(
    mousebioM_df["mmusculus_homolog_ensembl_gene"],
    mousebioM_df["ensembl_gene_id"]
))

# Map Ligand
mouse_gene_pair1["Human Ligand Ensembl ID"] = mouse_gene_pair1["Ligand Ensembl ID"].map(mouse_to_human_map)

# Map Receptor
mouse_gene_pair1["Human Receptor Ensembl ID"] = mouse_gene_pair1["Receptor Ensembl ID"].map(mouse_to_human_map)
mouse_gene_pair1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mouse_gene_pair1["Human Ligand Ensembl ID"] = mouse_gene_pair1["Ligand Ensembl ID"].map(mouse_to_human_map)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mouse_gene_pair1["Human Receptor Ensembl ID"] = mouse_gene_pair1["Receptor Ensembl ID"].map(mouse_to_human_map)


Unnamed: 0,Interaction ID,LR Pair Card,Mouse LR Pair,Ligand HGNC ID,Receptor HGNC ID,Ligand GOC score,Ligand WGA coverage,Ligand % Identity,Ligand Target % Identity,Ligand Orthology Confidence,Ligand Ensembl ID,Receptor GOC score,Receptor WGA coverage,Receptor % Identity,Receptor Target % Identity,Receptor Orthology Confidence,Receptor Ensembl ID,Human Ligand Ensembl ID,Human Receptor Ensembl ID
0,CDB00001,A2M-HSPA5,A2m → Hspa5,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",75.0,100.0,72.4559,72.4559,1.0,ENSMUSG00000030111,75.0,100.0,98.6239,98.4733,1.0,ENSMUSG00000026864,ENSG00000175899,ENSG00000044574
1,CDB00002,A2M-LRP1,A2m → Lrp1,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",75.0,100.0,72.4559,72.4559,1.0,ENSMUSG00000030111,100.0,100.0,97.9974,97.9758,1.0,ENSMUSG00000040249,ENSG00000175899,ENSG00000123384
2,CDB00003,ACE-BDKRB2,Ace → Bdkrb2,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",100.0,100.0,83.3078,82.9268,1.0,ENSMUSG00000020681,100.0,100.0,79.7954,79.5918,1.0,ENSMUSG00000021070,ENSG00000159640,ENSG00000168398
3,CDB00004,ADA-DPP4,Ada → Dpp4,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",100.0,100.0,80.7163,83.2386,1.0,ENSMUSG00000017697,100.0,100.0,84.4648,85.1316,1.0,ENSMUSG00000035000,ENSG00000196839,ENSG00000197635
4,CDB00005,ADAM10-EPHA3,Adam10 → Epha3,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",100.0,100.0,96.1230,95.9947,1.0,ENSMUSG00000054693,100.0,100.0,96.5412,96.4431,1.0,ENSMUSG00000052504,ENSG00000137845,ENSG00000044524
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3956,CDB03444,Pcdhb21-Pcdhb21,Pcdhb21 → Pcdhb21,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",,,,,,ENSMUSG00000044022,,,,,,ENSMUSG00000044022,,
3957,CDB03445,Pcdhb22-Pcdhb22,Pcdhb22 → Pcdhb22,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",,,,,,ENSMUSG00000073591,,,,,,ENSMUSG00000073591,ENSG00000113248,ENSG00000113248
3958,CDB03446,Pcdhgb8-Pcdhgb8,Pcdhgb8 → Pcdhgb8,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",,,,,,ENSMUSG00000103081,,,,,,ENSMUSG00000103081,,
3959,CDB03447,Saa3-Tlr4,Saa3 → Tlr4,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",,,,,,ENSMUSG00000040026,,,,,,ENSMUSG00000039005,,ENSG00000136869


In [109]:
def extract_hgnc_id(col):
    """Use regular expression to extract the HGNC ID after 'HGNC:'."""
    match = re.search(r'HGNC:(\d+)', col)
    if match:
        return 'HGNC:' +str(match.group(1))
    return None
    
mouse_gene_pair1['Ligand HGNC ID'] = mouse_gene_pair1['Ligand HGNC ID'].apply(extract_hgnc_id)
mouse_gene_pair1['Receptor HGNC ID'] = mouse_gene_pair1['Receptor HGNC ID'].apply(extract_hgnc_id)
mouse_gene_pair1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mouse_gene_pair1['Ligand HGNC ID'] = mouse_gene_pair1['Ligand HGNC ID'].apply(extract_hgnc_id)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mouse_gene_pair1['Receptor HGNC ID'] = mouse_gene_pair1['Receptor HGNC ID'].apply(extract_hgnc_id)


Unnamed: 0,Interaction ID,LR Pair Card,Mouse LR Pair,Ligand HGNC ID,Receptor HGNC ID,Ligand GOC score,Ligand WGA coverage,Ligand % Identity,Ligand Target % Identity,Ligand Orthology Confidence,Ligand Ensembl ID,Receptor GOC score,Receptor WGA coverage,Receptor % Identity,Receptor Target % Identity,Receptor Orthology Confidence,Receptor Ensembl ID,Human Ligand Ensembl ID,Human Receptor Ensembl ID
0,CDB00001,A2M-HSPA5,A2m → Hspa5,HGNC:7,HGNC:5238,75.0,100.0,72.4559,72.4559,1.0,ENSMUSG00000030111,75.0,100.0,98.6239,98.4733,1.0,ENSMUSG00000026864,ENSG00000175899,ENSG00000044574
1,CDB00002,A2M-LRP1,A2m → Lrp1,HGNC:7,HGNC:6692,75.0,100.0,72.4559,72.4559,1.0,ENSMUSG00000030111,100.0,100.0,97.9974,97.9758,1.0,ENSMUSG00000040249,ENSG00000175899,ENSG00000123384
2,CDB00003,ACE-BDKRB2,Ace → Bdkrb2,HGNC:2707,HGNC:1030,100.0,100.0,83.3078,82.9268,1.0,ENSMUSG00000020681,100.0,100.0,79.7954,79.5918,1.0,ENSMUSG00000021070,ENSG00000159640,ENSG00000168398
3,CDB00004,ADA-DPP4,Ada → Dpp4,HGNC:186,HGNC:3009,100.0,100.0,80.7163,83.2386,1.0,ENSMUSG00000017697,100.0,100.0,84.4648,85.1316,1.0,ENSMUSG00000035000,ENSG00000196839,ENSG00000197635
4,CDB00005,ADAM10-EPHA3,Adam10 → Epha3,HGNC:188,HGNC:3387,100.0,100.0,96.1230,95.9947,1.0,ENSMUSG00000054693,100.0,100.0,96.5412,96.4431,1.0,ENSMUSG00000052504,ENSG00000137845,ENSG00000044524
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3956,CDB03444,Pcdhb21-Pcdhb21,Pcdhb21 → Pcdhb21,,,,,,,,ENSMUSG00000044022,,,,,,ENSMUSG00000044022,,
3957,CDB03445,Pcdhb22-Pcdhb22,Pcdhb22 → Pcdhb22,,,,,,,,ENSMUSG00000073591,,,,,,ENSMUSG00000073591,ENSG00000113248,ENSG00000113248
3958,CDB03446,Pcdhgb8-Pcdhgb8,Pcdhgb8 → Pcdhgb8,,,,,,,,ENSMUSG00000103081,,,,,,ENSMUSG00000103081,,
3959,CDB03447,Saa3-Tlr4,Saa3 → Tlr4,,,,,,,,ENSMUSG00000040026,,,,,,ENSMUSG00000039005,,ENSG00000136869


In [85]:
hgnc_df = pd.read_csv("data/HGNC_gene_info_full.tsv", sep="\t", dtype=str)
hgnc_df = hgnc_df.dropna(subset=["uniprot_ids", "ensembl_gene_id"])
# Split uniprot_ids on comma and explode
hgnc_exploded = hgnc_df.assign(uniprot_id=hgnc_df["uniprot_ids"].str.split(",")).explode("uniprot_id")
hgnc_exploded["uniprot_id"] = hgnc_exploded["uniprot_id"].str.strip()

uniprot_to_ensembl = hgnc_exploded.set_index("uniprot_id")["ensembl_gene_id"].to_dict()

# Left join on human_protein
df_merged = df_orthologs.merge(
    hgnc_exploded[["uniprot_id", "ensembl_gene_id", "symbol"]],
    left_on="human_protein",
    right_on="uniprot_id",
    how="left"
)

# Optionally rename
df_merged = df_merged.rename(columns={
    "symbol": "human_gene",
    "ensembl_gene_id": "human_ensembl_gene_id"
}).drop(columns=["uniprot_id"])
df_merged = df_merged.dropna(subset=["human_ensembl_gene_id"])
df_merged.to_csv("data/mmusculus_inParanoid_uniProt_withHGNCAnn.tsv", sep="\t", index=False)

df_merged

Unnamed: 0,cluster_id,human_protein,human_inparalog_score,human_seed_score,mouse_protein,mouse_inparalog_score,mouse_seed_score,bitscore,human_ensembl_gene_id,human_gene
0,1,Q8WZ42,1.0,1.0,A2ASS6,1.000,1.0,60090.0,ENSG00000155657,TTN
1,2,Q8NF91,1.0,1.0,Q6ZWR6,1.000,1.0,14503.0,ENSG00000131018,SYNE1
2,3,Q5VST9,1.0,1.0,A2AAJ9,1.000,1.0,12156.0,ENSG00000154358,OBSCN
4,5,Q03001,1.0,1.0,Q91ZU6,1.000,1.0,10549.0,ENSG00000151914,DST
5,6,Q8WXG9,1.0,1.0,Q8VHN7,1.000,1.0,10049.0,ENSG00000164199,ADGRV1
...,...,...,...,...,...,...,...,...,...,...
20946,17091,A0A1B0GTK5,1.0,1.0,A0A1B0GSI2,0.212,,46.0,ENSG00000225396,FAM236D
20947,17091,A0A1B0GTK5,1.0,1.0,A0A1B0GSB3,0.178,,46.0,ENSG00000225396,FAM236D
20948,17092,Q96LM9,1.0,1.0,E9Q1X6,1.000,1.0,46.0,ENSG00000125975,C20orf173
20952,17096,P0DP42,1.0,1.0,A0A494B9K2,1.000,1.0,41.0,ENSG00000244219,TMEM225B


In [100]:
# Step 1: Load the mapping file
mouse_map = pd.read_csv("data/mouse_uniprot_to_ensembl.tsv", sep="\t", dtype=str)

# Step 2: Merge with df_merged on UniProt ID
# Assuming your UniProt column in df_merged is named 'mouse_protein'
df_merged = df_merged.merge(
    mouse_map,
    left_on="mouse_protein",
    right_on="uniprotswissprot",
    how="left"
)

# Step 3: Rename the new column for clarity (optional)
df_merged = df_merged.rename(columns={"ensembl_gene_id": "mouse_ensembl_gene_id"})

# Step 4: Drop the helper merge column if not needed
df_merged = df_merged.drop(columns=["uniprotswissprot"])

# Optional: Save
df_merged.to_csv("data/df_merged_with_mouse_ensembl.tsv", sep="\t", index=False)
df_merged

Unnamed: 0,cluster_id,human_protein,human_inparalog_score,human_seed_score,mouse_protein,mouse_inparalog_score,mouse_seed_score,bitscore,human_ensembl_gene_id,human_gene,mouse_ensembl_gene_id
0,1,Q8WZ42,1.0,1.0,A2ASS6,1.000,1.0,60090.0,ENSG00000155657,TTN,ENSMUSG00000051747
1,2,Q8NF91,1.0,1.0,Q6ZWR6,1.000,1.0,14503.0,ENSG00000131018,SYNE1,ENSMUSG00000096054
2,3,Q5VST9,1.0,1.0,A2AAJ9,1.000,1.0,12156.0,ENSG00000154358,OBSCN,ENSMUSG00000061462
3,5,Q03001,1.0,1.0,Q91ZU6,1.000,1.0,10549.0,ENSG00000151914,DST,ENSMUSG00000026131
4,6,Q8WXG9,1.0,1.0,Q8VHN7,1.000,1.0,10049.0,ENSG00000164199,ADGRV1,ENSMUSG00000069170
...,...,...,...,...,...,...,...,...,...,...,...
20920,17091,A0A1B0GTK5,1.0,1.0,A0A1B0GSI2,0.212,,46.0,ENSG00000225396,FAM236D,
20921,17091,A0A1B0GTK5,1.0,1.0,A0A1B0GSB3,0.178,,46.0,ENSG00000225396,FAM236D,
20922,17092,Q96LM9,1.0,1.0,E9Q1X6,1.000,1.0,46.0,ENSG00000125975,C20orf173,
20923,17096,P0DP42,1.0,1.0,A0A494B9K2,1.000,1.0,41.0,ENSG00000244219,TMEM225B,


In [112]:
# 1. Prepare df_merged with prefix
df_ligand = df_merged.add_prefix("Ligand_")

# 2. Merge on Human Ligand Ensembl ID (left join)
ligand_merged = mouse_gene_pair1.merge(
    df_ligand,
    left_on="Human Ligand Ensembl ID",
    right_on="Ligand_human_ensembl_gene_id",
    how="left",
    suffixes=('', '_merged')
)
ligand_merged

Unnamed: 0,Interaction ID,LR Pair Card,Mouse LR Pair,Ligand HGNC ID,Receptor HGNC ID,Ligand GOC score,Ligand WGA coverage,Ligand % Identity,Ligand Target % Identity,Ligand Orthology Confidence,...,Ligand_human_protein,Ligand_human_inparalog_score,Ligand_human_seed_score,Ligand_mouse_protein,Ligand_mouse_inparalog_score,Ligand_mouse_seed_score,Ligand_bitscore,Ligand_human_ensembl_gene_id,Ligand_human_gene,Ligand_mouse_ensembl_gene_id
0,CDB00001,A2M-HSPA5,A2m → Hspa5,HGNC:7,HGNC:5238,75.0,100.0,72.4559,72.4559,1.0,...,P01023,1.0,1.0,Q6GQT1,1.0,1.000,2065.0,ENSG00000175899,A2M,ENSMUSG00000030111
1,CDB00002,A2M-LRP1,A2m → Lrp1,HGNC:7,HGNC:6692,75.0,100.0,72.4559,72.4559,1.0,...,P01023,1.0,1.0,Q6GQT1,1.0,1.000,2065.0,ENSG00000175899,A2M,ENSMUSG00000030111
2,CDB00003,ACE-BDKRB2,Ace → Bdkrb2,HGNC:2707,HGNC:1030,100.0,100.0,83.3078,82.9268,1.0,...,P12821,1.0,1.0,P09470,1.0,1.000,2342.0,ENSG00000159640,ACE,ENSMUSG00000020681
3,CDB00004,ADA-DPP4,Ada → Dpp4,HGNC:186,HGNC:3009,100.0,100.0,80.7163,83.2386,1.0,...,P00813,1.0,1.0,P03958,1.0,1.000,620.0,ENSG00000196839,ADA,ENSMUSG00000017697
4,CDB00005,ADAM10-EPHA3,Adam10 → Epha3,HGNC:188,HGNC:3387,100.0,100.0,96.1230,95.9947,1.0,...,O14672,1.0,1.0,O35598,1.0,1.000,1461.0,ENSG00000137845,ADAM10,ENSMUSG00000054693
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9562,CDB03444,Pcdhb21-Pcdhb21,Pcdhb21 → Pcdhb21,,,,,,,,...,,,,,,,,,,
9563,CDB03445,Pcdhb22-Pcdhb22,Pcdhb22 → Pcdhb22,,,,,,,,...,Q9Y5E8,1.0,1.0,Q91XZ8,1.0,0.989,1191.0,ENSG00000113248,PCDHB15,
9564,CDB03446,Pcdhgb8-Pcdhgb8,Pcdhgb8 → Pcdhgb8,,,,,,,,...,,,,,,,,,,
9565,CDB03447,Saa3-Tlr4,Saa3 → Tlr4,,,,,,,,...,,,,,,,,,,


In [114]:
def resolve_ligand(group):
    if group.shape[0] == 1:
        return group
    # Try to keep rows where mouse Ensembl ID also matches
    match = group[group["Ligand_mouse_ensembl_gene_id"] == group["Ligand Ensembl ID"]]
    if len(match) == 1:
        return match
    elif len(match) > 1:
        return match.iloc[[0]]  # arbitrarily pick one if multiple match
    else:
        return group.iloc[[0]]  # fallback to first

# Apply resolver only where duplication exists
resolved = (
    ligand_merged.groupby("Interaction ID", group_keys=False)
    .apply(resolve_ligand)
    .reset_index(drop=True)
)
resolved

  .apply(resolve_ligand)


Unnamed: 0,Interaction ID,LR Pair Card,Mouse LR Pair,Ligand HGNC ID,Receptor HGNC ID,Ligand GOC score,Ligand WGA coverage,Ligand % Identity,Ligand Target % Identity,Ligand Orthology Confidence,...,Ligand_human_protein,Ligand_human_inparalog_score,Ligand_human_seed_score,Ligand_mouse_protein,Ligand_mouse_inparalog_score,Ligand_mouse_seed_score,Ligand_bitscore,Ligand_human_ensembl_gene_id,Ligand_human_gene,Ligand_mouse_ensembl_gene_id
0,CDB00001,A2M-HSPA5,A2m → Hspa5,HGNC:7,HGNC:5238,75.0,100.0,72.4559,72.4559,1.0,...,P01023,1.0,1.0,Q6GQT1,1.0,1.000,2065.0,ENSG00000175899,A2M,ENSMUSG00000030111
1,CDB00002,A2M-LRP1,A2m → Lrp1,HGNC:7,HGNC:6692,75.0,100.0,72.4559,72.4559,1.0,...,P01023,1.0,1.0,Q6GQT1,1.0,1.000,2065.0,ENSG00000175899,A2M,ENSMUSG00000030111
2,CDB00003,ACE-BDKRB2,Ace → Bdkrb2,HGNC:2707,HGNC:1030,100.0,100.0,83.3078,82.9268,1.0,...,P12821,1.0,1.0,P09470,1.0,1.000,2342.0,ENSG00000159640,ACE,ENSMUSG00000020681
3,CDB00004,ADA-DPP4,Ada → Dpp4,HGNC:186,HGNC:3009,100.0,100.0,80.7163,83.2386,1.0,...,P00813,1.0,1.0,P03958,1.0,1.000,620.0,ENSG00000196839,ADA,ENSMUSG00000017697
4,CDB00005,ADAM10-EPHA3,Adam10 → Epha3,HGNC:188,HGNC:3387,100.0,100.0,96.1230,95.9947,1.0,...,O14672,1.0,1.0,O35598,1.0,1.000,1461.0,ENSG00000137845,ADAM10,ENSMUSG00000054693
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3156,CDB03444,Pcdhb21-Pcdhb21,Pcdhb21 → Pcdhb21,,,,,,,,...,,,,,,,,,,
3157,CDB03445,Pcdhb22-Pcdhb22,Pcdhb22 → Pcdhb22,,,,,,,,...,Q9Y5E8,1.0,1.0,Q91XZ8,1.0,0.989,1191.0,ENSG00000113248,PCDHB15,
3158,CDB03446,Pcdhgb8-Pcdhgb8,Pcdhgb8 → Pcdhgb8,,,,,,,,...,,,,,,,,,,
3159,CDB03447,Saa3-Tlr4,Saa3 → Tlr4,,,,,,,,...,,,,,,,,,,


In [116]:
print("mouse_gene_pair1:", mouse_gene_pair1.shape)
print("df_merged:", df_merged.shape)

mouse_gene_pair1: (3961, 19)
df_merged: (20925, 11)


In [122]:
import pandas as pd

# Step 0: Add row ID to preserve original structure
mouse_gene_pair1 = mouse_gene_pair1.reset_index(drop=False).rename(columns={"index": "orig_row"})

# Step 1: Add prefix to merged data
df_ligand = df_merged.add_prefix("Ligand_")

# Step 2: Merge on Human Ligand Ensembl ID (left join, allows multiple matches per row)
ligand_merge = mouse_gene_pair1.merge(
    df_ligand,
    left_on="Human Ligand Ensembl ID",
    right_on="Ligand_human_ensembl_gene_id",
    how="left",
    suffixes=('', '_dup')
)


In [123]:
# Step 3: For each original row, select best match
def resolve_per_row(group):
    # Prefer rows where mouse Ensembl also matches
    match = group[group["Ligand_mouse_ensembl_gene_id"] == group["Ligand Ensembl ID"]]
    if len(match) == 1:
        return match
    elif len(match) > 1:
        return match.iloc[[0]]  # pick first among valid
    elif len(group) > 0:
        return group.iloc[[0]]  # fallback: first available (human matched)
    else:
        return pd.DataFrame([group.iloc[0] * pd.NA])  # unlikely fallback

# Step 4: Apply per original row (not Interaction ID)
ligand_final = (
    ligand_merge.groupby("orig_row", group_keys=False)
    .apply(resolve_per_row)
    .reset_index(drop=True)
    .drop(columns=["orig_row"])
)
ligand_final

  .apply(resolve_per_row)


Unnamed: 0,Interaction ID,LR Pair Card,Mouse LR Pair,Ligand HGNC ID,Receptor HGNC ID,Ligand GOC score,Ligand WGA coverage,Ligand % Identity,Ligand Target % Identity,Ligand Orthology Confidence,...,Ligand_human_protein,Ligand_human_inparalog_score,Ligand_human_seed_score,Ligand_mouse_protein,Ligand_mouse_inparalog_score,Ligand_mouse_seed_score,Ligand_bitscore,Ligand_human_ensembl_gene_id,Ligand_human_gene,Ligand_mouse_ensembl_gene_id
0,CDB00001,A2M-HSPA5,A2m → Hspa5,HGNC:7,HGNC:5238,75.0,100.0,72.4559,72.4559,1.0,...,P01023,1.0,1.0,Q6GQT1,1.0,1.000,2065.0,ENSG00000175899,A2M,ENSMUSG00000030111
1,CDB00002,A2M-LRP1,A2m → Lrp1,HGNC:7,HGNC:6692,75.0,100.0,72.4559,72.4559,1.0,...,P01023,1.0,1.0,Q6GQT1,1.0,1.000,2065.0,ENSG00000175899,A2M,ENSMUSG00000030111
2,CDB00003,ACE-BDKRB2,Ace → Bdkrb2,HGNC:2707,HGNC:1030,100.0,100.0,83.3078,82.9268,1.0,...,P12821,1.0,1.0,P09470,1.0,1.000,2342.0,ENSG00000159640,ACE,ENSMUSG00000020681
3,CDB00004,ADA-DPP4,Ada → Dpp4,HGNC:186,HGNC:3009,100.0,100.0,80.7163,83.2386,1.0,...,P00813,1.0,1.0,P03958,1.0,1.000,620.0,ENSG00000196839,ADA,ENSMUSG00000017697
4,CDB00005,ADAM10-EPHA3,Adam10 → Epha3,HGNC:188,HGNC:3387,100.0,100.0,96.1230,95.9947,1.0,...,O14672,1.0,1.0,O35598,1.0,1.000,1461.0,ENSG00000137845,ADAM10,ENSMUSG00000054693
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3956,CDB03444,Pcdhb21-Pcdhb21,Pcdhb21 → Pcdhb21,,,,,,,,...,,,,,,,,,,
3957,CDB03445,Pcdhb22-Pcdhb22,Pcdhb22 → Pcdhb22,,,,,,,,...,Q9Y5E8,1.0,1.0,Q91XZ8,1.0,0.989,1191.0,ENSG00000113248,PCDHB15,
3958,CDB03446,Pcdhgb8-Pcdhgb8,Pcdhgb8 → Pcdhgb8,,,,,,,,...,,,,,,,,,,
3959,CDB03447,Saa3-Tlr4,Saa3 → Tlr4,,,,,,,,...,,,,,,,,,,


In [124]:
# Step 0: Add row ID again if not already there
if "orig_row" not in mouse_gene_pair1.columns:
    mouse_gene_pair1 = mouse_gene_pair1.reset_index(drop=False).rename(columns={"index": "orig_row"})

# Step 1: Add prefix to df_merged for receptor
df_receptor = df_merged.add_prefix("Receptor_")

# Step 2: Merge on Human Receptor Ensembl ID
receptor_merge = mouse_gene_pair1.merge(
    df_receptor,
    left_on="Human Receptor Ensembl ID",
    right_on="Receptor_human_ensembl_gene_id",
    how="left",
    suffixes=('', '_dup')
)

# Step 3: Resolve best match per row using mouse Ensembl ID
def resolve_receptor_per_row(group):
    match = group[group["Receptor_mouse_ensembl_gene_id"] == group["Receptor Ensembl ID"]]
    if len(match) == 1:
        return match
    elif len(match) > 1:
        return match.iloc[[0]]
    elif len(group) > 0:
        return group.iloc[[0]]
    else:
        return pd.DataFrame([group.iloc[0] * pd.NA])  # safe fallback

# Step 4: Apply
receptor_final = (
    receptor_merge.groupby("orig_row", group_keys=False)
    .apply(resolve_receptor_per_row)
    .reset_index(drop=True)
    .drop(columns=["orig_row"])
)

# Step 5: Check consistency
assert len(receptor_final) == len(mouse_gene_pair1), f"Row mismatch: {len(receptor_final)} != {len(mouse_gene_pair1)}"


  .apply(resolve_receptor_per_row)


In [128]:
mouse_gene_pair1

Unnamed: 0,orig_row,Interaction ID,LR Pair Card,Mouse LR Pair,Ligand HGNC ID,Receptor HGNC ID,Ligand GOC score,Ligand WGA coverage,Ligand % Identity,Ligand Target % Identity,Ligand Orthology Confidence,Ligand Ensembl ID,Receptor GOC score,Receptor WGA coverage,Receptor % Identity,Receptor Target % Identity,Receptor Orthology Confidence,Receptor Ensembl ID,Human Ligand Ensembl ID,Human Receptor Ensembl ID
0,0,CDB00001,A2M-HSPA5,A2m → Hspa5,HGNC:7,HGNC:5238,75.0,100.0,72.4559,72.4559,1.0,ENSMUSG00000030111,75.0,100.0,98.6239,98.4733,1.0,ENSMUSG00000026864,ENSG00000175899,ENSG00000044574
1,1,CDB00002,A2M-LRP1,A2m → Lrp1,HGNC:7,HGNC:6692,75.0,100.0,72.4559,72.4559,1.0,ENSMUSG00000030111,100.0,100.0,97.9974,97.9758,1.0,ENSMUSG00000040249,ENSG00000175899,ENSG00000123384
2,2,CDB00003,ACE-BDKRB2,Ace → Bdkrb2,HGNC:2707,HGNC:1030,100.0,100.0,83.3078,82.9268,1.0,ENSMUSG00000020681,100.0,100.0,79.7954,79.5918,1.0,ENSMUSG00000021070,ENSG00000159640,ENSG00000168398
3,3,CDB00004,ADA-DPP4,Ada → Dpp4,HGNC:186,HGNC:3009,100.0,100.0,80.7163,83.2386,1.0,ENSMUSG00000017697,100.0,100.0,84.4648,85.1316,1.0,ENSMUSG00000035000,ENSG00000196839,ENSG00000197635
4,4,CDB00005,ADAM10-EPHA3,Adam10 → Epha3,HGNC:188,HGNC:3387,100.0,100.0,96.1230,95.9947,1.0,ENSMUSG00000054693,100.0,100.0,96.5412,96.4431,1.0,ENSMUSG00000052504,ENSG00000137845,ENSG00000044524
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3956,3956,CDB03444,Pcdhb21-Pcdhb21,Pcdhb21 → Pcdhb21,,,,,,,,ENSMUSG00000044022,,,,,,ENSMUSG00000044022,,
3957,3957,CDB03445,Pcdhb22-Pcdhb22,Pcdhb22 → Pcdhb22,,,,,,,,ENSMUSG00000073591,,,,,,ENSMUSG00000073591,ENSG00000113248,ENSG00000113248
3958,3958,CDB03446,Pcdhgb8-Pcdhgb8,Pcdhgb8 → Pcdhgb8,,,,,,,,ENSMUSG00000103081,,,,,,ENSMUSG00000103081,,
3959,3959,CDB03447,Saa3-Tlr4,Saa3 → Tlr4,,,,,,,,ENSMUSG00000040026,,,,,,ENSMUSG00000039005,,ENSG00000136869


In [199]:
# Step 0: Add 'orig_row' once, keep it clean
mouse_gene_pair1_indexed = mouse_gene_pair1.reset_index(drop=False).rename(columns={"index": "orig_row"})

### === LIGAND MERGE === ###
df_ligand = df_merged.add_prefix("Ligand_")
ligand_merge = mouse_gene_pair1_indexed.merge(
    df_ligand,
    left_on="Human Ligand Ensembl ID",
    right_on="Ligand_human_ensembl_gene_id",
    how="left"
)

# Ensure orig_row is single column (sometimes merge creates duplicates with suffix)
if isinstance(ligand_merge.columns, pd.MultiIndex):
    ligand_merge.columns = ligand_merge.columns.get_level_values(0)

if ligand_merge.columns.duplicated().any():
    ligand_merge = ligand_merge.loc[:, ~ligand_merge.columns.duplicated()]

def resolve_ligand_row(group):
    match = group[group["Ligand_mouse_ensembl_gene_id"] == group["Ligand Ensembl ID"]]
    if len(match) == 1:
        return match
    elif len(match) > 1:
        return match.iloc[[0]]
    else:
        return group.iloc[[0]]

ligand_final = (
    ligand_merge.groupby("orig_row", group_keys=False)
    .apply(resolve_ligand_row)
    .reset_index(drop=True)
)

### === RECEPTOR MERGE === ###
df_receptor = df_merged.add_prefix("Receptor_")
receptor_merge = ligand_final.merge(
    df_receptor,
    left_on="Human Receptor Ensembl ID",
    right_on="Receptor_human_ensembl_gene_id",
    how="left"
)

# Same cleanup for receptor_merge
if isinstance(receptor_merge.columns, pd.MultiIndex):
    receptor_merge.columns = receptor_merge.columns.get_level_values(0)

if receptor_merge.columns.duplicated().any():
    receptor_merge = receptor_merge.loc[:, ~receptor_merge.columns.duplicated()]

def resolve_receptor_row(group):
    match = group[group["Receptor_mouse_ensembl_gene_id"] == group["Receptor Ensembl ID"]]
    if len(match) == 1:
        return match
    elif len(match) > 1:
        return match.iloc[[0]]
    else:
        return group.iloc[[0]]

final_result = (
    receptor_merge.groupby("orig_row", group_keys=False)
    .apply(resolve_receptor_row)
    .reset_index(drop=True)
    .drop(columns=["orig_row"])
)

assert len(final_result) == len(mouse_gene_pair1), f"Row mismatch: {len(final_result)} != {len(mouse_gene_pair1)}"


  .apply(resolve_ligand_row)
  .apply(resolve_receptor_row)


In [200]:
final_result

Unnamed: 0,Interaction ID,LR Pair Card,Ligand,Receptor,Mouse LR Pair,Ligand HGNC ID,Receptor HGNC ID,Ligand GOC score,Ligand WGA coverage,Ligand % Identity,...,Receptor_human_protein,Receptor_human_inparalog_score,Receptor_human_seed_score,Receptor_mouse_protein,Receptor_mouse_inparalog_score,Receptor_mouse_seed_score,Receptor_bitscore,Receptor_human_ensembl_gene_id,Receptor_human_gene,Receptor_mouse_ensembl_gene_id
0,CDB00001,,Hspa5,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...",A2m → Hspa5,HGNC:7,HGNC:5238,75.0,100.0,72.4559,...,P11021,1.0,1.0,P20029,1.0,1.000,1246.0,ENSG00000044574,HSPA5,ENSMUSG00000026864
1,CDB00002,,Lrp1,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...",A2m → Lrp1,HGNC:7,HGNC:6692,75.0,100.0,72.4559,...,Q07954,1.0,1.0,Q91ZX7,1.0,1.000,6853.0,ENSG00000123384,LRP1,
2,CDB00003,,Bdkrb2,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...",Ace → Bdkrb2,HGNC:2707,HGNC:1030,100.0,100.0,83.3078,...,P30411,1.0,1.0,P32299,1.0,1.000,636.0,ENSG00000168398,BDKRB2,ENSMUSG00000021070
3,CDB00004,,Dpp4,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...",Ada → Dpp4,HGNC:186,HGNC:3009,100.0,100.0,80.7163,...,P27487,1.0,1.0,P28843,1.0,1.000,1365.0,ENSG00000197635,DPP4,ENSMUSG00000035000
4,CDB00005,,Epha3,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...",Adam10 → Epha3,HGNC:188,HGNC:3387,100.0,100.0,96.1230,...,P29320,1.0,1.0,P29319,1.0,1.000,1923.0,ENSG00000044524,EPHA3,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3956,CDB03444,,Pcdhb21,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...",Pcdhb21 → Pcdhb21,,,,,,...,,,,,,,,,,
3957,CDB03445,,Pcdhb22,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...",Pcdhb22 → Pcdhb22,,,,,,...,Q9Y5E8,1.0,1.0,Q91XZ8,1.0,0.989,1191.0,ENSG00000113248,PCDHB15,
3958,CDB03446,,Pcdhgb8,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...",Pcdhgb8 → Pcdhgb8,,,,,,...,,,,,,,,,,
3959,CDB03447,,Tlr4,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...",Saa3 → Tlr4,,,,,,...,O00206,1.0,1.0,Q9QUK6,1.0,1.000,909.0,ENSG00000136869,TLR4,ENSMUSG00000039005


In [191]:
final_result.to_csv("data/human_mouse_merged_ensemblBiomaRt_inParanoid.csv")

In [205]:
final_result= pd.read_csv("data/human_mouse_merged_ensemblBiomaRt_inParanoid.csv")
final_result

Unnamed: 0.1,Unnamed: 0,Interaction ID,Ligand,Receptor,LR Pair Card,Mouse LR Pair,Ligand HGNC ID,Receptor HGNC ID,Ligand GOC score,Ligand WGA coverage,...,Receptor_human_protein,Receptor_human_inparalog_score,Receptor_human_seed_score,Receptor_mouse_protein,Receptor_mouse_inparalog_score,Receptor_mouse_seed_score,Receptor_bitscore,Receptor_human_ensembl_gene_id,Receptor_human_gene,Receptor_mouse_ensembl_gene_id
0,0,CDB00001,A2m,Hspa5,A2M-HSPA5,A2m → Hspa5,HGNC:7,HGNC:5238,75.0,100.0,...,P11021,1.0,1.0,P20029,1.0,1.000,1246.0,ENSG00000044574,HSPA5,ENSMUSG00000026864
1,1,CDB00002,A2m,Lrp1,A2M-LRP1,A2m → Lrp1,HGNC:7,HGNC:6692,75.0,100.0,...,Q07954,1.0,1.0,Q91ZX7,1.0,1.000,6853.0,ENSG00000123384,LRP1,
2,2,CDB00003,Ace,Bdkrb2,ACE-BDKRB2,Ace → Bdkrb2,HGNC:2707,HGNC:1030,100.0,100.0,...,P30411,1.0,1.0,P32299,1.0,1.000,636.0,ENSG00000168398,BDKRB2,ENSMUSG00000021070
3,3,CDB00004,Ada,Dpp4,ADA-DPP4,Ada → Dpp4,HGNC:186,HGNC:3009,100.0,100.0,...,P27487,1.0,1.0,P28843,1.0,1.000,1365.0,ENSG00000197635,DPP4,ENSMUSG00000035000
4,4,CDB00005,Adam10,Epha3,ADAM10-EPHA3,Adam10 → Epha3,HGNC:188,HGNC:3387,100.0,100.0,...,P29320,1.0,1.0,P29319,1.0,1.000,1923.0,ENSG00000044524,EPHA3,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3956,3956,CDB03444,Pcdhb21,Pcdhb21,Pcdhb21-Pcdhb21,Pcdhb21 → Pcdhb21,,,,,...,,,,,,,,,,
3957,3957,CDB03445,Pcdhb22,Pcdhb22,Pcdhb22-Pcdhb22,Pcdhb22 → Pcdhb22,,,,,...,Q9Y5E8,1.0,1.0,Q91XZ8,1.0,0.989,1191.0,ENSG00000113248,PCDHB15,
3958,3958,CDB03446,Pcdhgb8,Pcdhgb8,Pcdhgb8-Pcdhgb8,Pcdhgb8 → Pcdhgb8,,,,,...,,,,,,,,,,
3959,3959,CDB03447,Saa3,Tlr4,Saa3-Tlr4,Saa3 → Tlr4,,,,,...,O00206,1.0,1.0,Q9QUK6,1.0,1.000,909.0,ENSG00000136869,TLR4,ENSMUSG00000039005


In [288]:
final_result.columns

Index(['Unnamed: 0', 'Interaction ID', 'Ligand', 'Receptor', 'LR Pair Card',
       'Mouse LR Pair', 'Ligand HGNC ID', 'Receptor HGNC ID',
       'Ligand GOC score', 'Ligand WGA coverage', 'Ligand % Identity',
       'Ligand Target % Identity', 'Ligand Orthology Confidence',
       'Ligand Ensembl ID', 'Receptor GOC score', 'Receptor WGA coverage',
       'Receptor % Identity', 'Receptor Target % Identity',
       'Receptor Orthology Confidence', 'Receptor Ensembl ID',
       'Human Ligand Ensembl ID', 'Human Receptor Ensembl ID',
       'Ligand_cluster_id', 'Ligand_human_protein',
       'Ligand_human_inparalog_score', 'Ligand_human_seed_score',
       'Ligand_mouse_protein', 'Ligand_mouse_inparalog_score',
       'Ligand_mouse_seed_score', 'Ligand_bitscore',
       'Ligand_human_ensembl_gene_id', 'Ligand_human_gene',
       'Ligand_mouse_ensembl_gene_id', 'Receptor_cluster_id',
       'Receptor_human_protein', 'Receptor_human_inparalog_score',
       'Receptor_human_seed_score', 'Rec

In [206]:
score_cols = [
    ligand_human_inparalog_score,
    receptor_human_inparalog_score,
    ligand_mouse_inparalog_score,
    receptor_mouse_inparalog_score,
    ligand_human_seed_score,
    receptor_human_seed_score,
    ligand_mouse_seed_score,
    receptor_mouse_seed_score,
    ligand_bit_score,
    receptor_bit_score
]

for col in score_cols:
    if col in final_result.columns:
        final_result[col] = pd.to_numeric(final_result[col], errors='coerce')


In [293]:
def summarize_orthologs(human_col, species_col, label,
                        confidence_orth_col=None, confidence_orth_threshold=None,
                        GOC_col=None, GOC_threshold=None,
                        perc_identity_col= None, perc_identity_thres= None,
                        ligand_human_inparalog_score_col=None, ligand_human_inparalog_score_threshold=None,
                        receptor_human_inparalog_score_col=None, receptor_human_inparalog_score_threshold=None,
                        ligand_mouse_inparalog_score_col=None, ligand_mouse_inparalog_score_threshold=None,
                        receptor_mouse_inparalog_score_col=None, receptor_mouse_inparalog_score_threshold=None,
                        ligand_human_seed_score_col=None, ligand_human_seed_score_threshold=None,
                        receptor_human_seed_score_col=None, receptor_human_seed_score_threshold=None,
                        ligand_mouse_seed_score_col=None, ligand_mouse_seed_score_threshold=None,
                        receptor_mouse_seed_score_col=None, receptor_mouse_seed_score_threshold=None,
                        ligand_bit_score_col=None, ligand_bit_score_threshold=None,
                        receptor_bit_score_col=None, receptor_bit_score_threshold=None):
    
    df = final_result.copy()

    # Apply filters one by one if thresholds are given
    filters = [
        (confidence_orth_col, lambda x: x == confidence_orth_threshold),
        (GOC_col, lambda x: x >= GOC_threshold),
        (perc_identity_col, lambda x: x >= perc_identity_thres),
        (ligand_human_inparalog_score_col, lambda x: x >= ligand_human_inparalog_score_threshold),
        (receptor_human_inparalog_score_col, lambda x: x >= receptor_human_inparalog_score_threshold),
        (ligand_mouse_inparalog_score_col, lambda x: x >= ligand_mouse_inparalog_score_threshold),
        (receptor_mouse_inparalog_score_col, lambda x: x >= receptor_mouse_inparalog_score_threshold),
        (ligand_human_seed_score_col, lambda x: x >= ligand_human_seed_score_threshold),
        (receptor_human_seed_score_col, lambda x: x >= receptor_human_seed_score_threshold),
        (ligand_mouse_seed_score_col, lambda x: x >= ligand_mouse_seed_score_threshold),
        (receptor_mouse_seed_score_col, lambda x: x >= receptor_mouse_seed_score_threshold),
        (ligand_bit_score_col, lambda x: x >= ligand_bit_score_threshold),
        (receptor_bit_score_col, lambda x: x >= receptor_bit_score_threshold),
    ]


    # Apply filters and track how many rows were removed
    original_rows = df.shape[0]
    for col, condition in filters:
        if col and condition is not None:
            before = df.shape[0]
            df = df[df[col].apply(condition)]
            after = df.shape[0]
            print(f"Filtered {col}: {before - after} rows removed (remaining: {after})")

    # Compute unique ortholog pairs
    unique_pairs = df[[human_col, species_col]].drop_duplicates()

    # Count mouse orthologs per human gene
    counts = (
        unique_pairs
        .groupby(human_col)[species_col]
        .count()
        .sort_values(ascending=False)
        .reset_index(name='count')
    )

    # Build tag from filters
    filter_tag = label.lower()
    if confidence_orth_threshold is not None:
        filter_tag += f"_conf{confidence_orth_threshold}"
    if GOC_threshold is not None:
        filter_tag += f"_GOCge{GOC_threshold}"
    if ligand_human_inparalog_score_threshold is not None:
        filter_tag += f"_LHISge{ligand_human_inparalog_score_threshold}"
    if receptor_bit_score_threshold is not None:
        filter_tag += f"_RBSge{receptor_bit_score_threshold}"

    counts.to_csv(f"data/human_mouse_orth_count_{filter_tag}.csv", index=False)

    summary_counts = counts['count'].value_counts().sort_index()
    total_human_genes = counts.shape[0]

    # Collect active filters for reporting
    active_filters = []
    if confidence_orth_col and confidence_orth_threshold is not None:
        if isinstance(confidence_orth_threshold, (list, set, tuple)):
            active_filters.append(f"{confidence_orth_col} in {sorted(confidence_orth_threshold)}")
        else:
            active_filters.append(f"{confidence_orth_col} equals '{confidence_orth_threshold}'")
    if GOC_col and GOC_threshold is not None:
        active_filters.append(f"{GOC_col} ≥ {GOC_threshold}")
    if perc_identity_col and perc_identity_thres is not None:
        active_filters.append(f"{perc_identity_col} ≥ {perc_identity_thres}")
    if ligand_human_inparalog_score_col and ligand_human_inparalog_score_threshold is not None:
        active_filters.append(f"{ligand_human_inparalog_score_col} ≥ {ligand_human_inparalog_score_threshold}")
    if receptor_human_inparalog_score_col and receptor_human_inparalog_score_threshold is not None:
        active_filters.append(f"{receptor_human_inparalog_score_col} ≥ {receptor_human_inparalog_score_threshold}")
    if ligand_mouse_inparalog_score_col and ligand_mouse_inparalog_score_threshold is not None:
        active_filters.append(f"{ligand_mouse_inparalog_score_col} ≥ {ligand_mouse_inparalog_score_threshold}")
    if receptor_mouse_inparalog_score_col and receptor_mouse_inparalog_score_threshold is not None:
        active_filters.append(f"{receptor_mouse_inparalog_score_col} ≥ {receptor_mouse_inparalog_score_threshold}")
    if ligand_human_seed_score_col and ligand_human_seed_score_threshold is not None:
        active_filters.append(f"{ligand_human_seed_score_col} ≥ {ligand_human_seed_score_threshold}")
    if receptor_human_seed_score_col and receptor_human_seed_score_threshold is not None:
        active_filters.append(f"{receptor_human_seed_score_col} ≥ {receptor_human_seed_score_threshold}")
    if ligand_mouse_seed_score_col and ligand_mouse_seed_score_threshold is not None:
        active_filters.append(f"{ligand_mouse_seed_score_col} ≥ {ligand_mouse_seed_score_threshold}")
    if receptor_mouse_seed_score_col and receptor_mouse_seed_score_threshold is not None:
        active_filters.append(f"{receptor_mouse_seed_score_col} ≥ {receptor_mouse_seed_score_threshold}")
    if ligand_bit_score_col and ligand_bit_score_threshold is not None:
        active_filters.append(f"{ligand_bit_score_col} ≥ {ligand_bit_score_threshold}")
    if receptor_bit_score_col and receptor_bit_score_threshold is not None:
        active_filters.append(f"{receptor_bit_score_col} ≥ {receptor_bit_score_threshold}")
    
    filter_text = "; ".join(active_filters) if active_filters else "No filters applied"
    
    summary_lines = [
        f"Out of {total_human_genes} unique human {label.lower()} genes:",
        f" - Filters applied: {filter_text}"
    ]

    for orth_count, gene_count in summary_counts.items():
        summary_lines.append(
            f" - {gene_count} human {label.lower()} genes had {orth_count} mouse ortholog(s)"
        )

    return "\n".join(summary_lines)


In [294]:
# Detect columns
# Ligand
confidence_orth_ligand = [col for col in final_result.columns if "Ligand Orthology Confidence" in col][0]
GOC_col_ligand = [col for col in final_result.columns if "Ligand GOC" in col][0]
percIdent_col_ligand = [col for col in final_result.columns if "Ligand % Identity" in col][0]
human_ligand_col = [col for col in final_result.columns if "Ligand HGNC ID" in col][0]
ligand_col = [col for col in final_result.columns if "Ligand" in col][0]
ligand_col = [col for col in final_result.columns if "Ligand" in col][0]
ligand_human_inparalog_score = [col for col in final_result.columns if "Ligand_human_inparalog_score" in col][0]
ligand_mouse_inparalog_score = [col for col in final_result.columns if "Ligand_mouse_inparalog_score" in col][0]
ligand_human_seed_score = [col for col in final_result.columns if "Ligand_human_seed_score" in col][0]
ligand_mouse_seed_score = [col for col in final_result.columns if "Ligand_mouse_seed_score" in col][0]
ligand_bit_score = [col for col in final_result.columns if "Ligand_bitscore" in col][0]

#Receptor
human_receptor_col = [col for col in final_result.columns if "Receptor HGNC ID" in col][0]
confidence_orth_receptor = [col for col in final_result.columns if "Receptor Orthology Confidence" in col][0]
GOC_col_receptor = [col for col in final_result.columns if "Receptor GOC" in col][0]
percIdent_col_receptor = [col for col in final_result.columns if "Receptor % Identity" in col][0]
receptor_col = [col for col in final_result.columns if "Receptor" in col][0]
receptor_human_inparalog_score = [col for col in final_result.columns if "Receptor_human_inparalog_score" in col][0]
receptor_mouse_inparalog_score = [col for col in final_result.columns if "Receptor_mouse_inparalog_score" in col][0]
receptor_human_seed_score = [col for col in final_result.columns if "Receptor_human_seed_score" in col][0]
receptor_mouse_seed_score = [col for col in final_result.columns if "Receptor_mouse_seed_score" in col][0]
receptor_bit_score = [col for col in final_result.columns if "Receptor_bitscore" in col][0]

In [295]:
ligand_summary = summarize_orthologs(
    human_col=human_ligand_col,
    species_col=ligand_col,
    label="Ligand",
    confidence_orth_col=confidence_orth_ligand,
    confidence_orth_threshold=1,
    perc_identity_col = percIdent_col_ligand,
    perc_identity_thres = 60
    # GOC_col=GOC_col_ligand,
    # GOC_threshold=100,
    # ligand_human_inparalog_score_col=ligand_human_inparalog_score,
    # ligand_human_inparalog_score_threshold=1,
    # ligand_human_seed_score_col=ligand_human_seed_score,
    # ligand_human_seed_score_threshold=1,
    # ligand_mouse_seed_score_col=ligand_mouse_seed_score,
    # ligand_mouse_seed_score_threshold=1,
    # ligand_bit_score_col=ligand_bit_score,
    # ligand_bit_score_threshold=40
)

print(ligand_summary)


Filtered Ligand Orthology Confidence: 657 rows removed (remaining: 3304)
Filtered Ligand % Identity: 246 rows removed (remaining: 3058)
Out of 835 unique human ligand genes:
 - Filters applied: Ligand Orthology Confidence equals '1'; Ligand % Identity ≥ 60
 - 814 human ligand genes had 1 mouse ortholog(s)
 - 13 human ligand genes had 2 mouse ortholog(s)
 - 4 human ligand genes had 3 mouse ortholog(s)
 - 4 human ligand genes had 5 mouse ortholog(s)


In [297]:
receptor_summary = summarize_orthologs(
    human_col=human_receptor_col,
    species_col=receptor_col,
    label="Receptor",
    confidence_orth_col=confidence_orth_receptor,
    confidence_orth_threshold=1,
    perc_identity_thres = 60,
    perc_identity_col = percIdent_col_receptor,
    # GOC_col=GOC_col_receptor,
    # GOC_threshold=100,
    # receptor_human_inparalog_score_col=receptor_human_inparalog_score,
    # receptor_human_inparalog_score_threshold=1,
    # receptor_human_seed_score_col=receptor_human_seed_score,
    # receptor_human_seed_score_threshold=1,
    # receptor_mouse_seed_score_col=receptor_mouse_seed_score,
    # receptor_mouse_seed_score_threshold=1,
    # receptor_bit_score_col=receptor_bit_score,
    # receptor_bit_score_threshold=40
)

print(receptor_summary)


Filtered Receptor Orthology Confidence: 779 rows removed (remaining: 3182)
Filtered Receptor % Identity: 107 rows removed (remaining: 3075)
Out of 684 unique human receptor genes:
 - Filters applied: Receptor Orthology Confidence equals '1'; Receptor % Identity ≥ 60
 - 679 human receptor genes had 1 mouse ortholog(s)
 - 4 human receptor genes had 2 mouse ortholog(s)
 - 1 human receptor genes had 3 mouse ortholog(s)


In [21]:
import pandas as pd
import requests
from io import StringIO
from itertools import product

# Download InParanoid prot table
url = "https://inparanoidb.sbc.su.se/download/sqltable/9606&10090&prot"
r = requests.get(url)
r.raise_for_status()

df = pd.read_csv(StringIO(r.text.strip()), sep="\t", header=None)
df.columns = ["cluster_id", "bitscore", "source_file", "inparalog_score", "protein_id", "seed_score"]

# Tag each row by species
def infer_species(src):
    if "9606" in src:
        return "human"
    elif "10090" in src:
        return "mouse"
    return "unknown"

df["species"] = df["source_file"].apply(infer_species)


In [22]:
# Expand ortholog pairs within each cluster
records = []
for cid, grp in df.groupby("cluster_id"):
    humans = grp[grp["species"] == "human"]
    mice = grp[grp["species"] == "mouse"]
    for h, m in product(humans.itertuples(index=False), mice.itertuples(index=False)):
        records.append({
            "cluster_id": cid,
            "human_protein": h.protein_id,
            "human_inparalog_score": h.inparalog_score,
            "human_seed_score": h.seed_score,
            "mouse_protein": m.protein_id,
            "mouse_inparalog_score": m.inparalog_score,
            "mouse_seed_score": m.seed_score,
            "bitscore": (h.bitscore + m.bitscore) / 2  # average for now
        })

df_orthologs = pd.DataFrame(records)


In [24]:
df_orthologs.to_csv("data/inParanoid_mmusculus.csv")

In [30]:
hgnc_df = pd.read_csv("data/HGNC_gene_info_full.tsv", sep="\t", dtype=str)
hgnc_df = hgnc_df.dropna(subset=["uniprot_ids"])
# Split uniprot_ids on comma and explode
hgnc_exploded = hgnc_df.assign(uniprot_id=hgnc_df["uniprot_ids"].str.split(",")).explode("uniprot_id")
hgnc_exploded["uniprot_id"] = hgnc_exploded["uniprot_id"].str.strip()
hgnc_exploded

Unnamed: 0,hgnc_id,symbol,name,locus_group,locus_type,status,location,location_sortable,alias_symbol,alias_name,...,lncrnadb,enzyme_id,intermediate_filament_db,rna_central_id,lncipedia,gtrnadb,agr,mane_select,gencc,uniprot_id
0,HGNC:5,A1BG,alpha-1-B glycoprotein,protein-coding gene,gene with protein product,Approved,19q13.43,19q13.43,,,...,,,,,,,HGNC:5,ENST00000263100.8|NM_130786.4,,P04217
2,HGNC:24086,A1CF,APOBEC1 complementation factor,protein-coding gene,gene with protein product,Approved,10q11.23,10q11.23,ACF|ASP|ACF64|ACF65|APOBEC1CF,,...,,,,,,,HGNC:24086,ENST00000373997.8|NM_014576.4,,Q9NQ94
3,HGNC:7,A2M,alpha-2-macroglobulin,protein-coding gene,gene with protein product,Approved,12p13.31,12p13.31,FWP007|S863-7|CPAMD5,,...,,,,,,,HGNC:7,ENST00000318602.12|NM_000014.6,HGNC:7,P01023
5,HGNC:23336,A2ML1,alpha-2-macroglobulin like 1,protein-coding gene,gene with protein product,Approved,12p13.31,12p13.31,FLJ25179|p170,,...,,,,,,,HGNC:23336,ENST00000299698.12|NM_144670.6,HGNC:23336,A8K2U0
9,HGNC:30005,A3GALT2,"alpha 1,3-galactosyltransferase 2",protein-coding gene,gene with protein product,Approved,1p35.1,01p35.1,IGBS3S|IGB3S,iGb3 synthase|isoglobotriaosylceramide synthase,...,,,,,,,HGNC:30005,ENST00000442999.3|NM_001080438.1,,U3KPV4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44099,HGNC:3556,FABP2,fatty acid binding protein 2,protein-coding gene,gene with protein product,Approved,4q26,04q26,I-FABP,,...,,,,,,,HGNC:3556,ENST00000274024.4|NM_000134.4,,P12104
44100,HGNC:3557,FABP3,fatty acid binding protein 3,protein-coding gene,gene with protein product,Approved,1p35.2,01p35.2,H-FABP|O-FABP,mammary-derived growth inhibitor,...,,,,,,,HGNC:3557,ENST00000373713.7|NM_004102.5,,P05413
44102,HGNC:3559,FABP4,fatty acid binding protein 4,protein-coding gene,gene with protein product,Approved,8q21.13,08q21.13,A-FABP|aP2,adipocyte fatty acid binding protein,...,,,,,,,HGNC:3559,ENST00000256104.5|NM_001442.3,HGNC:3559,P15090
44103,HGNC:3560,FABP5,fatty acid binding protein 5,protein-coding gene,gene with protein product,Approved,8q21.13,08q21.13,E-FABP|PA-FABP|KFABP,,...,,,,,,,HGNC:3560,ENST00000297258.11|NM_001444.3,,Q01469


In [31]:
# Left join on human_protein
df_merged = df_orthologs.merge(
    hgnc_exploded[["uniprot_id", "hgnc_id", "symbol"]],
    left_on="human_protein",
    right_on="uniprot_id",
    how="left"
)

# Optionally rename
df_merged = df_merged.rename(columns={
    "symbol": "human_gene",
    "hgnc_id": "human_hgnc_id"
}).drop(columns=["uniprot_id"])
df_merged = df_merged.dropna(subset=["human_hgnc_id"])
df_merged

Unnamed: 0,cluster_id,human_protein,human_inparalog_score,human_seed_score,mouse_protein,mouse_inparalog_score,mouse_seed_score,bitscore,human_hgnc_id,human_gene
0,1,Q8WZ42,1.0,1.0,A2ASS6,1.000,1.0,60090.0,HGNC:12403,TTN
1,2,Q8NF91,1.0,1.0,Q6ZWR6,1.000,1.0,14503.0,HGNC:17089,SYNE1
2,3,Q5VST9,1.0,1.0,A2AAJ9,1.000,1.0,12156.0,HGNC:15719,OBSCN
4,5,Q03001,1.0,1.0,Q91ZU6,1.000,1.0,10549.0,HGNC:1090,DST
5,6,Q8WXG9,1.0,1.0,Q8VHN7,1.000,1.0,10049.0,HGNC:17416,ADGRV1
...,...,...,...,...,...,...,...,...,...,...
20948,17091,A0A1B0GTK5,1.0,1.0,A0A1B0GSI2,0.212,,46.0,HGNC:52642,FAM236D
20949,17091,A0A1B0GTK5,1.0,1.0,A0A1B0GSB3,0.178,,46.0,HGNC:52642,FAM236D
20950,17092,Q96LM9,1.0,1.0,E9Q1X6,1.000,1.0,46.0,HGNC:16166,C20orf173
20954,17096,P0DP42,1.0,1.0,A0A494B9K2,1.000,1.0,41.0,HGNC:53075,TMEM225B


In [35]:
interaction_id_col = [col for col in mouse_gene_pair1.columns if "Interaction ID" in col][0]

In [36]:
human_ligand_col = [col for col in mouse_gene_pair1.columns if "Human Ligand" in col][0]
ligand_col = [col for col in mouse_gene_pair1.columns if "Ligand" in col][0]

In [37]:
human_receptor_col = [col for col in mouse_gene_pair1.columns if "Human Receptor" in col][0]
receptor_col = [col for col in mouse_gene_pair1.columns if "Receptor" in col][0]

In [52]:
GOC_col = [col for col in mouse_gene_pair1.columns if "GOC" in col][0]
mouse_gene_pair1[GOC_col]

0        75.0
1        75.0
2       100.0
3       100.0
4       100.0
        ...  
3956      NaN
3957      NaN
3958      NaN
3959      NaN
3960      NaN
Name: Ligand GOC score, Length: 3961, dtype: float64

In [58]:
hgnc_df = pd.read_csv("data/HGNC_gene_info_full.tsv", sep="\t", dtype=str)
hgnc_df = hgnc_df.dropna(subset=["uniprot_ids", "ensembl_gene_id"])
# Split uniprot_ids on comma and explode
hgnc_exploded = hgnc_df.assign(uniprot_id=hgnc_df["uniprot_ids"].str.split(",")).explode("uniprot_id")
hgnc_exploded["uniprot_id"] = hgnc_exploded["uniprot_id"].str.strip()

uniprot_to_ensembl = hgnc_exploded.set_index("uniprot_id")["ensembl_gene_id"].to_dict()

In [61]:
hgnc_df = pd.read_csv("data/HGNC_gene_info_full.tsv", sep="\t", dtype=str)
hgnc_df = hgnc_df.dropna(subset=["uniprot_ids", "ensembl_gene_id"])
# Split uniprot_ids on comma and explode
hgnc_exploded = hgnc_df.assign(uniprot_id=hgnc_df["uniprot_ids"].str.split(",")).explode("uniprot_id")
hgnc_exploded["uniprot_id"] = hgnc_exploded["uniprot_id"].str.strip()

uniprot_to_ensembl = hgnc_exploded.set_index("uniprot_id")["ensembl_gene_id"].to_dict()

# Left join on human_protein
df_merged = df_orthologs.merge(
    hgnc_exploded[["uniprot_id", "ensembl_gene_id", "symbol"]],
    left_on="human_protein",
    right_on="uniprot_id",
    how="left"
)

# Optionally rename
df_merged = df_merged.rename(columns={
    "symbol": "human_gene",
    "ensembl_gene_id": "human_ensembl_gene_id"
}).drop(columns=["uniprot_id"])
df_merged = df_merged.dropna(subset=["human_ensembl_gene_id"])
df_merged.to_csv("data/mmusculus_inParanoid_uniProt_withHGNCAnn.tsv", sep="\t", index=False)

In [62]:
df_merged

Unnamed: 0,cluster_id,human_protein,human_inparalog_score,human_seed_score,mouse_protein,mouse_inparalog_score,mouse_seed_score,bitscore,human_ensembl_gene_id,human_gene
0,1,Q8WZ42,1.0,1.0,A2ASS6,1.000,1.0,60090.0,ENSG00000155657,TTN
1,2,Q8NF91,1.0,1.0,Q6ZWR6,1.000,1.0,14503.0,ENSG00000131018,SYNE1
2,3,Q5VST9,1.0,1.0,A2AAJ9,1.000,1.0,12156.0,ENSG00000154358,OBSCN
4,5,Q03001,1.0,1.0,Q91ZU6,1.000,1.0,10549.0,ENSG00000151914,DST
5,6,Q8WXG9,1.0,1.0,Q8VHN7,1.000,1.0,10049.0,ENSG00000164199,ADGRV1
...,...,...,...,...,...,...,...,...,...,...
20946,17091,A0A1B0GTK5,1.0,1.0,A0A1B0GSI2,0.212,,46.0,ENSG00000225396,FAM236D
20947,17091,A0A1B0GTK5,1.0,1.0,A0A1B0GSB3,0.178,,46.0,ENSG00000225396,FAM236D
20948,17092,Q96LM9,1.0,1.0,E9Q1X6,1.000,1.0,46.0,ENSG00000125975,C20orf173
20952,17096,P0DP42,1.0,1.0,A0A494B9K2,1.000,1.0,41.0,ENSG00000244219,TMEM225B


In [68]:
def summarize_orthologs(human_col, species_col, label,
                        confidence_orth_col=None, confidence_orth_threshold=None,
                        GOC_col=None, GOC_threshold=None):
    df = mouse_gene_pair1.copy()

    if confidence_orth_col and confidence_orth_threshold is not None:
        df = df[df[confidence_orth_col] == confidence_orth_threshold]

    if GOC_col and GOC_threshold is not None:
        df = df[df[GOC_col] >= GOC_threshold]  # Use >= instead of ==

    unique_pairs = df[[human_col, species_col]].drop_duplicates()

    counts = (
        unique_pairs
        .groupby(human_col)[species_col]
        .count()
        .sort_values(ascending=False)
        .reset_index(name='count')
    )

    filter_tag = label.lower()
    if confidence_orth_threshold is not None:
        filter_tag += f"_conf{confidence_orth_threshold}"
    if GOC_threshold is not None:
        filter_tag += f"_GOCge{GOC_threshold}"

    counts.to_csv(f"data/human_mouse_orth_count_{filter_tag}.csv", index=False)

    summary_counts = counts['count'].value_counts().sort_index()
    total_human_genes = counts.shape[0]

    summary_lines = [
        f"Out of {total_human_genes} unique human {label.lower()} genes "
        f"(Orthology Confidence = {confidence_orth_threshold}, GOC ≥ {GOC_threshold}):"
    ]
    for orth_count, gene_count in summary_counts.items():
        summary_lines.append(
            f" - {gene_count} human {label.lower()} genes had {orth_count} mouse ortholog(s)"
        )

    return "\n".join(summary_lines)


# Detect columns
confidence_orth_ligand = [col for col in mouse_gene_pair1.columns if "Ligand Orthology Confidence" in col][0]
GOC_col_ligand = [col for col in mouse_gene_pair1.columns if "Ligand GOC" in col][0]

confidence_orth_receptor = [col for col in mouse_gene_pair1.columns if "Receptor Orthology Confidence" in col][0]
GOC_col_receptor = [col for col in mouse_gene_pair1.columns if "Receptor GOC" in col][0]

# Generate summaries
ligand_summary = summarize_orthologs(
    human_ligand_col, ligand_col, "Ligand",
    confidence_orth_col=confidence_orth_ligand, confidence_orth_threshold=None,
    GOC_col=GOC_col_ligand, GOC_threshold=0
)

receptor_summary = summarize_orthologs(
    human_receptor_col, receptor_col, "Receptor",
    confidence_orth_col=confidence_orth_receptor, confidence_orth_threshold=None,
    GOC_col=GOC_col_receptor, GOC_threshold=0
)

# Print
print(ligand_summary)
print()
print(receptor_summary)

In [69]:
mouse_gene_pair1[GOC_col_ligand]

0        75.0
1        75.0
2       100.0
3       100.0
4       100.0
        ...  
3956      NaN
3957      NaN
3958      NaN
3959      NaN
3960      NaN
Name: Ligand GOC score, Length: 3961, dtype: float64

In [74]:

# Detect columns
confidence_orth_ligand = [col for col in mouse_gene_pair1.columns if "Ligand Orthology Confidence" in col][0]
GOC_col_ligand = [col for col in mouse_gene_pair1.columns if "Ligand GOC" in col][0]

confidence_orth_receptor = [col for col in mouse_gene_pair1.columns if "Receptor Orthology Confidence" in col][0]
GOC_col_receptor = [col for col in mouse_gene_pair1.columns if "Receptor GOC" in col][0]

# Generate summaries
ligand_summary = summarize_orthologs(
    human_ligand_col, ligand_col, "Ligand",
    confidence_orth_col=confidence_orth_ligand, confidence_orth_threshold=None,
    GOC_col=GOC_col_ligand, GOC_threshold=0
)

receptor_summary = summarize_orthologs(
    human_receptor_col, receptor_col, "Receptor",
    confidence_orth_col=confidence_orth_receptor, confidence_orth_threshold=None,
    GOC_col=GOC_col_receptor, GOC_threshold=0
)

# Print
print(ligand_summary)
print()
print(receptor_summary)

Out of 985 unique human ligand genes (Orthology Confidence = None, GOC ≥ 0):
 - 925 human ligand genes had 1 mouse ortholog(s)
 - 31 human ligand genes had 2 mouse ortholog(s)
 - 5 human ligand genes had 3 mouse ortholog(s)
 - 2 human ligand genes had 4 mouse ortholog(s)
 - 5 human ligand genes had 5 mouse ortholog(s)
 - 3 human ligand genes had 6 mouse ortholog(s)
 - 2 human ligand genes had 7 mouse ortholog(s)
 - 12 human ligand genes had 14 mouse ortholog(s)

Out of 780 unique human receptor genes (Orthology Confidence = None, GOC ≥ 0):
 - 742 human receptor genes had 1 mouse ortholog(s)
 - 15 human receptor genes had 2 mouse ortholog(s)
 - 7 human receptor genes had 3 mouse ortholog(s)
 - 3 human receptor genes had 4 mouse ortholog(s)
 - 3 human receptor genes had 5 mouse ortholog(s)
 - 4 human receptor genes had 6 mouse ortholog(s)
 - 6 human receptor genes had 7 mouse ortholog(s)


In [42]:
len(pd.unique(mouse_gene_pair1[human_receptor_col]))

781