# Python Notebook

In [1]:
## Function to prepare datatables (cleaning and hyperlinking, adding tool tips, etc) input for the database qmds
import sys, os
import pandas as pd
# Change working directory to ConnectomeDB
project_root = "/Users/sakuramaezono/Library/CloudStorage/OneDrive-YokohamaCityUniversity/Personal/05_Python_repositories/ConnectomeDB"
os.chdir(project_root)
sys.path.append(os.path.abspath("src"))

In [5]:
import fetchGSheet

In [22]:
import pandas as pd
import requests
from io import StringIO
from itertools import product

### IMPORTANT ####
# Warning: First run "src/convertOrthUniprotToEnsembl.r" in R
##################

# === USER INPUT ===
data_dir = "data"
# original species
orig_species_input = "human"
# Ortholog species
species_input = "horse"  # Options: "mouse", "zebrafish", # No sheep in inParanoid

# === INTERNAL MAPPINGS ===

orig_species_info = {
    "mouse":         {"taxid": "10090", "code": "mmusculus"},
    "rat":           {"taxid": "10116", "code": "rnorvegicus"},
    "zebrafish":     {"taxid": "7955",  "code": "drerio"},
    "chimpanzee":    {"taxid": "9598",  "code": "ptroglodytes"},
    "chicken":       {"taxid": "9031",  "code": "ggallus"},
    "pig":           {"taxid": "9823",  "code": "sscrofa"},
    "cow":           {"taxid": "9913",  "code": "btaurus"},
    "dog":           {"taxid": "9615",  "code": "clfamiliaris"},
    "horse":         {"taxid": "9796",  "code": "ecaballus"},
    #"sheep":         {"taxid": "9940",  "code": "oarambouillet"},
    "marmoset":      {"taxid": "9483",  "code": "cjacchus"},
    "macaque":       {"taxid": "9544",  "code": "mmulatta"},
    "human":         {"taxid": "9606",  "code": "hsapiens"},
}


species_info = {
    "mouse":         {"taxid": "10090", "code": "mmusculus"},
    "rat":           {"taxid": "10116", "code": "rnorvegicus"},
    "zebrafish":     {"taxid": "7955",  "code": "drerio"},
    "chimpanzee":    {"taxid": "9598",  "code": "ptroglodytes"},
    "chicken":       {"taxid": "9031",  "code": "ggallus"},
    "pig":           {"taxid": "9823",  "code": "sscrofa"},
    "cow":           {"taxid": "9913",  "code": "btaurus"},
    "dog":           {"taxid": "9615",  "code": "clfamiliaris"},
    "horse":         {"taxid": "9796",  "code": "ecaballus"},
    #"sheep":         {"taxid": "9940",  "code": "oarambouillet"},
    "marmoset":      {"taxid": "9483",  "code": "cjacchus"},
    "macaque":       {"taxid": "9544",  "code": "mmulatta"},
    "human":         {"taxid": "9606",  "code": "hsapiens"},
}

if orig_species_input not in orig_species_info:
    raise ValueError(f"Species '{orig_species_input}' not supported. Choose from: {list(orig_species_info)}")

if species_input not in species_info:
    raise ValueError(f"Species '{species_input}' not supported. Choose from: {list(species_info)}")

species = species_info[species_input]
taxid = species["taxid"]
code = species["code"]

orig_species = orig_species_info[orig_species_input]
orig_taxid = orig_species["taxid"]
orig_code = orig_species["code"]

# === Step 1: Download inParanoid file for human vs species ===
url = f"https://inparanoidb.sbc.su.se/download/sqltable/{orig_taxid}&{taxid}&prot"
r = requests.get(url)
r.raise_for_status()

df = pd.read_csv(StringIO(r.text.strip()), sep="\t", header=None)
df.columns = ["cluster_id", "bitscore", "source_file", "inparalog_score", "protein_id", "seed_score"]

# === Step 2: Add species labels ===
def infer_species(src):
    if orig_taxid in src:
        return orig_species_input
    elif taxid in src:
        return species_input
    return "unknown"

df["species"] = df["source_file"].apply(infer_species)

# === Step 3: Build ortholog pairs ===
records = []
for cid, grp in df.groupby("cluster_id"):
    orig_spec = grp[grp["species"] == orig_species_input]
    others = grp[grp["species"] == species_input]
    
    for h, o in product(orig_spec.itertuples(index=False), others.itertuples(index=False)):
        records.append({
            "cluster_id": cid,
            f"{orig_species_input}_protein": h.protein_id,
            f"{orig_species_input}_inparalog_score": h.inparalog_score,
            f"{orig_species_input}_seed_score": h.seed_score,
            f"{species_input}_protein": o.protein_id,
            f"{species_input}_inparalog_score": o.inparalog_score,
            f"{species_input}_seed_score": o.seed_score,
            "bitscore": (h.bitscore + o.bitscore) / 2
        })


df_orthologs = pd.DataFrame(records)
df_orthologs.to_csv(f"data/{orig_species_input}_centric_inParanoid_{species_input}.csv", index=False)

# === Step 4: UniProt → Gene Name ===
print("Starting annotation process...")

# --- Load Human UniProt Mapping once ---
orig_uniprot_file = os.path.join(data_dir, f"uniprotMapping_{orig_species_input}.csv")
if not os.path.exists(orig_uniprot_file):
    print(f"Error: {orig_species_input} UniProt mapping file not found at {orig_uniprot_file}. Please run the UniProt fetching script for human first.")
    exit() # Exit if the essential human file is missing

print(f"Loading {orig_species_input} UniProt mapping from {orig_uniprot_file}...")
# We only need 'Entry' (Accession) and 'Gene Names'
orig_uniprot_df = pd.read_csv(orig_uniprot_file, usecols=["Entry", "Gene Names"])
# Rename 'Gene Names' to distinguish it as human gene name
orig_uniprot_df = orig_uniprot_df.rename(columns={"Gene Names": f"{orig_species_input}_Gene_Name"})
print(f"Loaded {len(orig_uniprot_df)} {orig_species_input} UniProt entries.")

# --- Process each species ---
print(f"\n--- Processing {species_input} ---")

inparanoid_file = os.path.join(data_dir, f"{orig_species_input}_centric_inParanoid_{species_input}.csv")

species_uniprot_file = os.path.join(data_dir, f"uniprotMapping_{species_input}.csv")
output_file = os.path.join(data_dir, f"{orig_species_input}_centric_inParanoid_{species_input}_AnnWithUniProt.csv")

# Check if input files exist
if not os.path.exists(inparanoid_file):
    print(f"Skipping {species_input}: InParanoid file not found at {inparanoid_file}")
    if not os.path.exists(species_uniprot_file):
        print(f"Skipping {species_input}: UniProt mapping file not found at {species_uniprot_file}")

print(f"Loading InParanoid data from {inparanoid_file}...")
df_inparanoid = pd.read_csv(inparanoid_file)
print(f"Loaded {len(df_inparanoid)} InParanoid entries for {species_input}.")

print(f"Loading {species_input} UniProt mapping from {species_uniprot_file}...")
species_uniprot_df = pd.read_csv(species_uniprot_file, usecols=["Entry", "Gene Names"])
# Rename 'Gene Names' to distinguish it as the species' gene name
species_uniprot_df = species_uniprot_df.rename(columns={"Gene Names": f"{species_input}_Gene_Name"})
print(f"Loaded {len(species_uniprot_df)} {species_input} UniProt entries.")

# 1. Annotate based on "orig_species_protein" with orig UniProt data
print(f"Merging {orig_species_input} gene names...")
# Use left merge to keep all rows from df_inparanoid
df_merged = pd.merge(
    df_inparanoid,
    orig_uniprot_df,
    left_on=f"{orig_species_input}_protein",
    right_on="Entry",
    how="left"
)
    # Drop the redundant 'Entry' column from the merge
df_merged = df_merged.drop(columns=["Entry"])

# 2. Annotate based on "{species_input}_protein" with species UniProt data
print(f"Merging {species_input} gene names...")
df_merged = pd.merge(
    df_merged, # Merge into the already merged dataframe
    species_uniprot_df,
    left_on=f"{species_input}_protein",
    right_on="Entry",
    how="left"
)
# Drop the redundant 'Entry' column from the second merge
df_merged = df_merged.drop(columns=["Entry"])

# 3. Save merged data
print(f"Saving merged data to {output_file}...")
df_merged.to_csv(output_file, index=False)
print(f"Successfully saved {len(df_merged)} annotated entries for {species_input}.")

print("\nAnnotation process completed for all specified species.")

Starting annotation process...
Loading human UniProt mapping from data/uniprotMapping_human.csv...
Loaded 45583 human UniProt entries.

--- Processing horse ---
Loading InParanoid data from data/human_centric_inParanoid_horse.csv...
Loaded 18452 InParanoid entries for horse.
Loading horse UniProt mapping from data/uniprotMapping_horse.csv...
Loaded 413 horse UniProt entries.
Merging human gene names...
Merging horse gene names...
Saving merged data to data/human_centric_inParanoid_horse_AnnWithUniProt.csv...
Successfully saved 41894 annotated entries for horse.

Annotation process completed for all specified species.


In [61]:
df_merged

Unnamed: 0,cluster_id,mouse_protein,mouse_inparalog_score,mouse_seed_score,cow_protein,cow_inparalog_score,cow_seed_score,bitscore,mouse_Gene_Name,cow_Gene_Name
0,1,Q9QXZ0,1.000,1.0,F1N6H4,1.0,1.0,10545.0,Macf1,
1,1,Q9QXZ0,1.000,1.0,F1N6H4,1.0,1.0,10545.0,Acf7,
2,1,Q9QXZ0,1.000,1.0,F1N6H4,1.0,1.0,10545.0,Aclp7,
3,1,Q9QXZ0,1.000,1.0,F1N6H4,1.0,1.0,10545.0,Kiaa0754,
4,1,Q9QXZ0,1.000,1.0,F1N6H4,1.0,1.0,10545.0,Macf,
...,...,...,...,...,...,...,...,...,...,...
44484,16645,Q925H3,1.000,1.0,A0A3Q1NHC3,1.0,1.0,42.0,Krtap16-8,
44485,16645,Q925H3,1.000,1.0,A0A3Q1NHC3,1.0,1.0,42.0,Krtap16.8,
44486,16645,O09048,0.667,,A0A3Q1NHC3,1.0,1.0,42.0,,
44487,16645,O08631,0.241,,A0A3Q1NHC3,1.0,1.0,42.0,,


In [12]:
hgnc_df = pd.read_csv("data/HGNC_gene_info_full.tsv", sep="\t", dtype=str)
hgnc_df = hgnc_df.dropna(subset=["uniprot_ids", "ensembl_gene_id"])
hgnc_exploded = hgnc_df.assign(uniprot_id=hgnc_df["uniprot_ids"].str.split(",")).explode("uniprot_id")
hgnc_exploded["uniprot_id"] = hgnc_exploded["uniprot_id"].str.strip()

df_merged = df_orthologs.merge(
    hgnc_exploded[["uniprot_id", "ensembl_gene_id", "symbol"]],
    left_on="human_protein",
    right_on="uniprot_id",
    how="left"
)

df_merged = df_merged.rename(columns={
    "symbol": "human_gene",
    "ensembl_gene_id": "human_ensembl_gene_id"
}).drop(columns=["uniprot_id"])

df_merged = df_merged.dropna(subset=["human_ensembl_gene_id"])
df_merged.to_csv(f"data/{species_input}_inParanoid_withHGNC.tsv", sep="\t", index=False)

# === Step 5: Optional - Species UniProt → Ensembl mapping ===
map_path = f"data/{species_input}_uniprot_to_ensembl.tsv"
try:
    species_map = pd.read_csv(map_path, sep="\t", dtype=str)
    df_merged = df_merged.merge(
        species_map,
        left_on=f"{species_input}_protein",
        right_on="uniprotswissprot",
        how="left"
    ).rename(columns={"ensembl_gene_id": f"{species_input}_ensembl_gene_id"}) \
     .drop(columns=["uniprotswissprot"])
except FileNotFoundError:
    print(f"⚠️  Mapping file not found: {map_path}")

df_merged.to_csv(f"data/df_merged_with_{species_input}_ensembl.tsv", sep="\t", index=False)


In [23]:
import pandas as pd
import requests
from io import StringIO
from itertools import product

### IMPORTANT ####
# Warning: First run "src/convertOrthUniprotToEnsembl.r" in R
##################

# === USER INPUT ===
# original species
orig_species_input = "mouse"
# Ortholog species
species_input = "human"  # Options: "mouse", "zebrafish", # No sheep in inParanoid

# === INTERNAL MAPPINGS ===

orig_species_info = {
    "mouse":         {"taxid": "10090", "code": "mmusculus"},
    "rat":           {"taxid": "10116", "code": "rnorvegicus"},
    "zebrafish":     {"taxid": "7955",  "code": "drerio"},
    "chimpanzee":    {"taxid": "9598",  "code": "ptroglodytes"},
    "chicken":       {"taxid": "9031",  "code": "ggallus"},
    "pig":           {"taxid": "9823",  "code": "sscrofa"},
    "cow":           {"taxid": "9913",  "code": "btaurus"},
    "dog":           {"taxid": "9615",  "code": "clfamiliaris"},
    "horse":         {"taxid": "9796",  "code": "ecaballus"},
    "sheep":         {"taxid": "9940",  "code": "oarambouillet"},
    "marmoset":      {"taxid": "9483",  "code": "cjacchus"},
    "macaque":       {"taxid": "9544",  "code": "mmulatta"},
    "human":         {"taxid": "9606",  "code": "hsapiens"},
}


species_info = {
    "mouse":         {"taxid": "10090", "code": "mmusculus"},
    "rat":           {"taxid": "10116", "code": "rnorvegicus"},
    "zebrafish":     {"taxid": "7955",  "code": "drerio"},
    "chimpanzee":    {"taxid": "9598",  "code": "ptroglodytes"},
    "chicken":       {"taxid": "9031",  "code": "ggallus"},
    "pig":           {"taxid": "9823",  "code": "sscrofa"},
    "cow":           {"taxid": "9913",  "code": "btaurus"},
    "dog":           {"taxid": "9615",  "code": "clfamiliaris"},
    "horse":         {"taxid": "9796",  "code": "ecaballus"},
    "sheep":         {"taxid": "9940",  "code": "oarambouillet"},
    "marmoset":      {"taxid": "9483",  "code": "cjacchus"},
    "macaque":       {"taxid": "9544",  "code": "mmulatta"},
    "human":         {"taxid": "9606",  "code": "hsapiens"},
}

if orig_species_input not in orig_species_info:
    raise ValueError(f"Species '{orig_species_input}' not supported. Choose from: {list(orig_species_info)}")

if species_input not in species_info:
    raise ValueError(f"Species '{species_input}' not supported. Choose from: {list(species_info)}")

species = species_info[species_input]
taxid = species["taxid"]
code = species["code"]

orig_species = orig_species_info[orig_species_input]
orig_taxid = orig_species["taxid"]
orig_code = orig_species["code"]

# === Step 1: Download inParanoid file for human vs species ===
url = f"https://inparanoidb.sbc.su.se/download/sqltable/{orig_taxid}&{taxid}&prot"
r = requests.get(url)
r.raise_for_status()

df = pd.read_csv(StringIO(r.text.strip()), sep="\t", header=None)
df.columns = ["cluster_id", "bitscore", "source_file", "inparalog_score", "protein_id", "seed_score"]

# === Step 2: Add species labels ===
def infer_species(src):
    if orig_taxid in src:
        return orig_species_input
    elif taxid in src:
        return species_input
    return "unknown"


df["species"] = df["source_file"].apply(infer_species)

In [24]:
df

Unnamed: 0,cluster_id,bitscore,source_file,inparalog_score,protein_id,seed_score,species
0,1,60090,10090.fa,1.0,A2ASS6,1.0,mouse
1,1,60090,9606.fa,1.0,Q8WZ42,1.0,human
2,2,14503,10090.fa,1.0,Q6ZWR6,1.0,mouse
3,2,14503,9606.fa,1.0,Q8NF91,1.0,human
4,3,12156,10090.fa,1.0,A2AAJ9,1.0,mouse
...,...,...,...,...,...,...,...
35953,17096,41,9606.fa,1.0,P0DP42,1.0,human
35954,17097,41,10090.fa,1.0,A0A0G2JDY8,1.0,mouse
35955,17097,41,9606.fa,1.0,A0A075B6W3,1.0,human
35956,17098,40,10090.fa,1.0,Q64389,1.0,mouse


In [26]:
# === Step 3: Build ortholog pairs ===
# Assume 'species' column exists, and 'orig_species_input' is defined
records = []
for cid, grp in df.groupby("cluster_id"):
    orig_spec = grp[grp["species"] == orig_species_input]
    others = grp[grp["species"] == species_input]
    
    for h, o in product(orig_spec.itertuples(index=False), others.itertuples(index=False)):
        records.append({
            "cluster_id": cid,
            f"{orig_species_input}_protein": h.protein_id,
            f"{orig_species_input}_inparalog_score": h.inparalog_score,
            f"{orig_species_input}_seed_score": h.seed_score,
            f"{species_input}_protein": o.protein_id,
            f"{species_input}_inparalog_score": o.inparalog_score,
            f"{species_input}_seed_score": o.seed_score,
            "bitscore": (h.bitscore + o.bitscore) / 2
        })
df_orthologs = pd.DataFrame(records)
df_orthologs.to_csv(f"data/{orig_species_input}_centric_inParanoid_{species_input}.csv", index=False)

In [30]:
# === Step 4: UniProt → Gene Name ===
print("Starting annotation process...")

# --- Load Human UniProt Mapping once ---
orig_uniprot_file = os.path.join(data_dir, f"uniprotMapping_{orig_species_input}.csv")
if not os.path.exists(orig_uniprot_file):
    print(f"Error: {orig_species_input} UniProt mapping file not found at {orig_uniprot_file}. Please run the UniProt fetching script for human first.")
    exit() # Exit if the essential human file is missing

print(f"Loading {orig_species_input} UniProt mapping from {orig_uniprot_file}...")
# We only need 'Entry' (Accession) and 'Gene Names'
orig_uniprot_df = pd.read_csv(orig_uniprot_file, usecols=["Entry", "Gene Names"])
# Rename 'Gene Names' to distinguish it as human gene name
orig_uniprot_df = orig_uniprot_df.rename(columns={"Gene Names": f"{orig_species_input}_Gene_Name"})
print(f"Loaded {len(orig_uniprot_df)} {orig_species_input} UniProt entries.")

# --- Process each species ---
for species_input in species_to_process:
    print(f"\n--- Processing {species_input} ---")

    inparanoid_file = os.path.join(data_dir, f"inParanoid_{species_input}.csv")
    species_uniprot_file = os.path.join(data_dir, f"uniprotMapping_{species_input}.csv")
    output_file = os.path.join(data_dir, f"{orig_species_input}_centric_inParanoid_{species_input}_AnnWithUniProt.csv")

    # Check if input files exist
    if not os.path.exists(inparanoid_file):
        print(f"Skipping {species_input}: InParanoid file not found at {inparanoid_file}")
        continue
    if not os.path.exists(species_uniprot_file):
        print(f"Skipping {species_input}: UniProt mapping file not found at {species_uniprot_file}")
        continue

    print(f"Loading InParanoid data from {inparanoid_file}...")
    df_inparanoid = pd.read_csv(inparanoid_file)
    print(f"Loaded {len(df_inparanoid)} InParanoid entries for {species_input}.")

    print(f"Loading {species_input} UniProt mapping from {species_uniprot_file}...")
    species_uniprot_df = pd.read_csv(species_uniprot_file, usecols=["Entry", "Gene Names"])
    # Rename 'Gene Names' to distinguish it as the species' gene name
    species_uniprot_df = species_uniprot_df.rename(columns={"Gene Names": f"{species_input}_Gene_Name"})
    print(f"Loaded {len(species_uniprot_df)} {species_input} UniProt entries.")

    # 1. Annotate based on "orig_species_protein" with orig UniProt data
    print(f"Merging {orig_species_input} gene names...")
    # Use left merge to keep all rows from df_inparanoid
    df_merged = pd.merge(
        df_inparanoid,
        orig_uniprot_df,
        left_on=f"{orig_species_input}_protein",
        right_on="Entry",
        how="left"
    )
    # Drop the redundant 'Entry' column from the merge
    df_merged = df_merged.drop(columns=["Entry"])

    # 2. Annotate based on "{species_input}_protein" with species UniProt data
    print(f"Merging {species_input} gene names...")
    df_merged = pd.merge(
        df_merged, # Merge into the already merged dataframe
        species_uniprot_df,
        left_on=f"{orig_species_input}_protein",
        right_on="Entry",
        how="left"
    )
    # Drop the redundant 'Entry' column from the second merge
    df_merged = df_merged.drop(columns=["Entry"])

    # 3. Save merged data
    print(f"Saving merged data to {output_file}...")
    df_merged.to_csv(output_file, index=False)
    print(f"Successfully saved {len(df_merged)} annotated entries for {species_input}.")

print("\nAnnotation process completed for all specified species.")


Starting annotation process...
Loading mouse UniProt mapping from data/uniprotMapping_mouse.csv...
Loaded 32289 mouse UniProt entries.

--- Processing mouse ---
Loading InParanoid data from data/inParanoid_mouse.csv...
Loaded 20805 InParanoid entries for mouse.
Loading mouse UniProt mapping from data/uniprotMapping_mouse.csv...
Loaded 32289 mouse UniProt entries.
Merging mouse gene names...
Merging mouse gene names...
Saving merged data to data/mouse_centric_inParanoid_mouse_AnnWithUniProt.csv...
Successfully saved 81780 annotated entries for mouse.

Annotation process completed for all specified species.


In [37]:
# === Step 4: UniProt → Gene Name ===
print("Starting annotation process...")

# --- Load Human UniProt Mapping once ---
orig_uniprot_file = os.path.join(data_dir, f"uniprotMapping_{orig_species_input}.csv")
if not os.path.exists(orig_uniprot_file):
    print(f"Error: {orig_species_input} UniProt mapping file not found at {orig_uniprot_file}. Please run the UniProt fetching script for human first.")
    exit() # Exit if the essential human file is missing

print(f"Loading {orig_species_input} UniProt mapping from {orig_uniprot_file}...")
# We only need 'Entry' (Accession) and 'Gene Names'
orig_uniprot_df = pd.read_csv(orig_uniprot_file, usecols=["Entry", "Gene Names"])
# Rename 'Gene Names' to distinguish it as human gene name
orig_uniprot_df = orig_uniprot_df.rename(columns={"Gene Names": f"{orig_species_input}_Gene_Name"})
print(f"Loaded {len(orig_uniprot_df)} {orig_species_input} UniProt entries.")

Starting annotation process...
Loading mouse UniProt mapping from data/uniprotMapping_mouse.csv...
Loaded 32289 mouse UniProt entries.


In [41]:
species_to_process = ["mouse", "rat", "human", "zebrafish", "chimpanzee", "chicken", "pig", "cow", "dog", "horse", "marmoset",   "macaque"]

In [42]:
species_input

'human'

In [45]:
import pandas as pd
import requests
from io import StringIO
from itertools import product

### IMPORTANT ####
# Warning: First run "src/convertOrthUniprotToEnsembl.r" in R
##################

# === USER INPUT ===
# original species
orig_species_input = "mouse"
# Ortholog species
species_input = "human"  # Options: "mouse", "zebrafish", # No sheep in inParanoid

species_to_process = ["mouse", "rat", "human", "zebrafish", "chimpanzee", "chicken", "pig", "cow", "dog", "horse", "marmoset",   "macaque"]

# === INTERNAL MAPPINGS ===

orig_species_info = {
    "mouse":         {"taxid": "10090", "code": "mmusculus"},
    "rat":           {"taxid": "10116", "code": "rnorvegicus"},
    "zebrafish":     {"taxid": "7955",  "code": "drerio"},
    "chimpanzee":    {"taxid": "9598",  "code": "ptroglodytes"},
    "chicken":       {"taxid": "9031",  "code": "ggallus"},
    "pig":           {"taxid": "9823",  "code": "sscrofa"},
    "cow":           {"taxid": "9913",  "code": "btaurus"},
    "dog":           {"taxid": "9615",  "code": "clfamiliaris"},
    "horse":         {"taxid": "9796",  "code": "ecaballus"},
    "sheep":         {"taxid": "9940",  "code": "oarambouillet"},
    "marmoset":      {"taxid": "9483",  "code": "cjacchus"},
    "macaque":       {"taxid": "9544",  "code": "mmulatta"},
    "human":         {"taxid": "9606",  "code": "hsapiens"},
}


species_info = {
    "mouse":         {"taxid": "10090", "code": "mmusculus"},
    "rat":           {"taxid": "10116", "code": "rnorvegicus"},
    "zebrafish":     {"taxid": "7955",  "code": "drerio"},
    "chimpanzee":    {"taxid": "9598",  "code": "ptroglodytes"},
    "chicken":       {"taxid": "9031",  "code": "ggallus"},
    "pig":           {"taxid": "9823",  "code": "sscrofa"},
    "cow":           {"taxid": "9913",  "code": "btaurus"},
    "dog":           {"taxid": "9615",  "code": "clfamiliaris"},
    "horse":         {"taxid": "9796",  "code": "ecaballus"},
    "sheep":         {"taxid": "9940",  "code": "oarambouillet"},
    "marmoset":      {"taxid": "9483",  "code": "cjacchus"},
    "macaque":       {"taxid": "9544",  "code": "mmulatta"},
    "human":         {"taxid": "9606",  "code": "hsapiens"},
}

if orig_species_input not in orig_species_info:
    raise ValueError(f"Species '{orig_species_input}' not supported. Choose from: {list(orig_species_info)}")

if species_input not in species_info:
    raise ValueError(f"Species '{species_input}' not supported. Choose from: {list(species_info)}")

species = species_info[species_input]
taxid = species["taxid"]
code = species["code"]

orig_species = orig_species_info[orig_species_input]
orig_taxid = orig_species["taxid"]
orig_code = orig_species["code"]

# === Step 1: Download inParanoid file for human vs species ===
url = f"https://inparanoidb.sbc.su.se/download/sqltable/{orig_taxid}&{taxid}&prot"
r = requests.get(url)
r.raise_for_status()

df = pd.read_csv(StringIO(r.text.strip()), sep="\t", header=None)
df.columns = ["cluster_id", "bitscore", "source_file", "inparalog_score", "protein_id", "seed_score"]

# === Step 2: Add species labels ===
def infer_species(src):
    if orig_taxid in src:
        return orig_species_input
    elif taxid in src:
        return species_input
    return "unknown"

df["species"] = df["source_file"].apply(infer_species)

# === Step 3: Build ortholog pairs ===
records = []
for cid, grp in df.groupby("cluster_id"):
    orig_spec = grp[grp["species"] == orig_species_input]
    others = grp[grp["species"] == species_input]
    
    for h, o in product(orig_spec.itertuples(index=False), others.itertuples(index=False)):
        records.append({
            "cluster_id": cid,
            f"{orig_species_input}_protein": h.protein_id,
            f"{orig_species_input}_inparalog_score": h.inparalog_score,
            f"{orig_species_input}_seed_score": h.seed_score,
            f"{species_input}_protein": o.protein_id,
            f"{species_input}_inparalog_score": o.inparalog_score,
            f"{species_input}_seed_score": o.seed_score,
            "bitscore": (h.bitscore + o.bitscore) / 2
        })


df_orthologs = pd.DataFrame(records)
df_orthologs.to_csv(f"data/{orig_species_input}_centric_inParanoid_{species_input}.csv", index=False)

In [46]:
df_orthologs

Unnamed: 0,cluster_id,mouse_protein,mouse_inparalog_score,mouse_seed_score,human_protein,human_inparalog_score,human_seed_score,bitscore
0,1,A2ASS6,1.0,1.0,Q8WZ42,1.0,1.0,60090.0
1,2,Q6ZWR6,1.0,1.0,Q8NF91,1.0,1.0,14503.0
2,3,A2AAJ9,1.0,1.0,Q5VST9,1.0,1.0,12156.0
3,4,Q9QXZ0,1.0,1.0,Q9UPN3,1.0,1.0,10869.0
4,5,Q91ZU6,1.0,1.0,Q03001,1.0,1.0,10549.0
...,...,...,...,...,...,...,...,...
20800,17094,A0A0G2JGM9,1.0,1.0,A0A075B6W9,1.0,1.0,45.0
20801,17095,A0A0G2JFW2,1.0,1.0,A0A075B6W1,1.0,1.0,44.0
20802,17096,A0A494B9K2,1.0,1.0,P0DP42,1.0,1.0,41.0
20803,17097,A0A0G2JDY8,1.0,1.0,A0A075B6W3,1.0,1.0,41.0


In [47]:
# === Step 4: UniProt → Gene Name ===
print("Starting annotation process...")

# --- Load Human UniProt Mapping once ---
orig_uniprot_file = os.path.join(data_dir, f"uniprotMapping_{orig_species_input}.csv")
if not os.path.exists(orig_uniprot_file):
    print(f"Error: {orig_species_input} UniProt mapping file not found at {orig_uniprot_file}. Please run the UniProt fetching script for human first.")
    exit() # Exit if the essential human file is missing

print(f"Loading {orig_species_input} UniProt mapping from {orig_uniprot_file}...")
# We only need 'Entry' (Accession) and 'Gene Names'
orig_uniprot_df = pd.read_csv(orig_uniprot_file, usecols=["Entry", "Gene Names"])
# Rename 'Gene Names' to distinguish it as human gene name
orig_uniprot_df = orig_uniprot_df.rename(columns={"Gene Names": f"{orig_species_input}_Gene_Name"})
print(f"Loaded {len(orig_uniprot_df)} {orig_species_input} UniProt entries.")
orig_uniprot_df

Starting annotation process...
Loading mouse UniProt mapping from data/uniprotMapping_mouse.csv...
Loaded 32289 mouse UniProt entries.


Unnamed: 0,Entry,mouse_Gene_Name
0,A0A087WPF7,Auts2
1,A0A087WPF7,Kiaa0442
2,A0A088MLT8,Iqcj-Schip1
3,A0A088MLT8,Iqschfp
4,A0A088MLT8,Schip1
...,...,...
32284,Q9WUQ7,Dexi
32285,Q9WUQ7,Myle
32286,Q9WVB6,Lenep
32287,Q9WVB6,Lep503


In [53]:
# --- Process each species ---
print(f"\n--- Processing {species_input} ---")

inparanoid_file = os.path.join(data_dir, f"{orig_species_input}_centric_inParanoid_{species_input}.csv")
species_uniprot_file = os.path.join(data_dir, f"uniprotMapping_{species_input}.csv")
output_file = os.path.join(data_dir, f"{orig_species_input}_centric_inParanoid_{species_input}_AnnWithUniProt.csv")

# Check if input files exist
if not os.path.exists(inparanoid_file):
    print(f"Skipping {species_input}: InParanoid file not found at {inparanoid_file}")

if not os.path.exists(species_uniprot_file):
    print(f"Skipping {species_input}: UniProt mapping file not found at {species_uniprot_file}")

print(f"Loading InParanoid data from {inparanoid_file}...")
df_inparanoid = pd.read_csv(inparanoid_file)
print(f"Loaded {len(df_inparanoid)} InParanoid entries for {species_input}.")

print(f"Loading {species_input} UniProt mapping from {species_uniprot_file}...")
species_uniprot_df = pd.read_csv(species_uniprot_file, usecols=["Entry", "Gene Names"])
# Rename 'Gene Names' to distinguish it as the species' gene name
species_uniprot_df = species_uniprot_df.rename(columns={"Gene Names": f"{species_input}_Gene_Name"})
print(f"Loaded {len(species_uniprot_df)} {species_input} UniProt entries.")



--- Processing human ---
Loading InParanoid data from data/mouse_centric_inParanoid_human.csv...
Loaded 20805 InParanoid entries for human.
Loading human UniProt mapping from data/uniprotMapping_human.csv...
Loaded 45583 human UniProt entries.


In [54]:
# 1. Annotate based on "orig_species_protein" with orig UniProt data
print(f"Merging {orig_species_input} gene names...")
# Use left merge to keep all rows from df_inparanoid
df_merged = pd.merge(
    df_inparanoid,
    orig_uniprot_df,
    left_on=f"{orig_species_input}_protein",
    right_on="Entry",
    how="left"
)
    # Drop the redundant 'Entry' column from the merge
df_merged = df_merged.drop(columns=["Entry"])

# 2. Annotate based on "{species_input}_protein" with species UniProt data
print(f"Merging {species_input} gene names...")
df_merged = pd.merge(
    df_merged, # Merge into the already merged dataframe
    species_uniprot_df,
    left_on=f"{species_input}_protein",
    right_on="Entry",
    how="left"
)
# Drop the redundant 'Entry' column from the second merge
df_merged = df_merged.drop(columns=["Entry"])

# 3. Save merged data
print(f"Saving merged data to {output_file}...")
df_merged.to_csv(output_file, index=False)
print(f"Successfully saved {len(df_merged)} annotated entries for {species_input}.")

print("\nAnnotation process completed for all specified species.")


Merging mouse gene names...
Merging human gene names...
Saving merged data to data/mouse_centric_inParanoid_human_AnnWithUniProt.csv...
Successfully saved 92169 annotated entries for human.

Annotation process completed for all specified species.


In [6]:
sheet_ID = "1JMo_tRbkuBwNtQlSSvhSS3FffjCJKCCsRybNbKuFyaI"
gene_pair = fetchGSheet.safe_fetch(sheet_ID, "curation", fetchGSheet.credentials_file)

In [7]:
triplicate= gene_pair[['LR_pair', 'triplet', 'ligand_species_ann', 'receptor_species_ann', 'used Human L', 'used Human R', 'notes', 'Remove decision']]

In [8]:
triplicate

Unnamed: 0,LR_pair,triplet,ligand_species_ann,receptor_species_ann,used Human L,used Human R,notes,Remove decision
0,LPL LRP1,1281473 LPL LRP1,Bos taurus,Homo sapiens,,,1992,
1,FGF1 ITGAV,18441324 FGF1 ITGAV,Homo sapiens,Homo sapiens,,,,
2,CALCA RAMP1,18599553 CALCA RAMP1,Homo sapiens,Homo sapiens,,,,
3,PODXL2 SELE,18606703 PODXL2 SELE,Homo sapiens,Homo sapiens,,,,
4,TMPRSS6 HJV,18976966 TMPRSS6 HJV,Homo sapiens,Homo sapiens,,,,
...,...,...,...,...,...,...,...,...
5411,KISS1 KISS1R,12944565 KISS1 KISS1R,,,,,REMOVE-Not support,REMOVE
5412,CCL1 CCR8,12967681 CCL1 CCR8,,,,,REMOVE-Not support,REMOVE
5413,TNF TNFRSF21,9714541 TNF TNFRSF21,,,,,REMOVE-NOT SUPPORT - Al - Agrees,REMOVE
5414,NTNG1 LRRC4C,14595443 NTNG1 LRRC4C,,,,,REMOVE-Reversed Interaction,REMOVE


In [165]:
# Filter out rows where 'notes' contains 'remove'
triplicate['notes'] = triplicate['Remove decision'].str.lower()

# Remove rows where 'notes' contains 'remove'
triplicate_with_remove = triplicate[triplicate['notes'].str.contains('remove', na=False)]

# Remove rows where 'notes' contains 'remove'
triplicate = triplicate[~triplicate['notes'].str.contains('remove', na=False)]

triplicate
triplicate_with_remove

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  triplicate['notes'] = triplicate['Remove decision'].str.lower()


Unnamed: 0,LR_pair,triplet,ligand_species_ann,receptor_species_ann,used Human L,used Human R,notes,Remove decision
0,ROBO1 ROBO2,26186094 ROBO1 ROBO2,,,,,remove,REMOVE
6,IL18 IL18RAP,15760905 IL18 IL18RAP,,,,,remove,REMOVE
152,BDNF NTRK1,2157470 BDNF NTRK1,,,,,remove,REMOVE
178,IL1A IL1RAP,2950091 IL1A IL1RAP,,,,,remove,REMOVE
221,HSPG2 LRP1,7526899 HSPG2 LRP1,,,,,remove,REMOVE
...,...,...,...,...,...,...,...,...
5376,COL4A3 CD93,,Homo sapiens,Homo sapiens,✅,✅,remove,REMOVE
5377,COL4A4 CD93,,Homo sapiens,Homo sapiens,✅,✅,remove,REMOVE
5378,COL4A5 CD93,,Homo sapiens,Homo sapiens,✅,✅,remove,REMOVE
5379,COL4A6 CD93,,Homo sapiens,Homo sapiens,✅,✅,remove,REMOVE


In [9]:
# Convert both columns to sets for easy set operations
set_filtered = set(triplicate['LR_pair'])
set_unfiltered = set(triplicate_with_remove['LR_pair'])

# Overlap (present in both)
overlap = list(set_filtered & set_unfiltered)

# Not covered (in filtered but NOT in unfiltered)
not_covered = list(set_unfiltered - set_filtered)

print("LR_pairs covered (overlap):", overlap, len(overlap))
print("LR_pairs not covered:", not_covered, len(not_covered))

NameError: name 'triplicate_with_remove' is not defined

In [167]:
triplicate_with_remove.to_csv("data/rows_with_remove.csv")

In [10]:
# Group by the two species columns and count the number of rows for each combination
summary_counts = triplicate.groupby(['ligand_species_ann', 'receptor_species_ann']).size().reset_index(name='count')

# Optional: sort by count descending
summary_counts = summary_counts.sort_values(by='count', ascending=False)
summary_counts

Unnamed: 0,ligand_species_ann,receptor_species_ann,count
36,Homo sapiens,Homo sapiens,3082
104,Mus Musculus,Mus Musculus,702
50,Homo sapiens,Mus Musculus,181
0,,,179
101,Mus Musculus,Homo sapiens,176
...,...,...,...
74,"Homo sapiens, Mus Musculus",Rattus norvegicus,1
71,"Homo sapiens, Mus Musculus","Homo sapiens, Mus Musculus, Rattus norvegicus,...",1
67,"Homo sapiens, Gallus gallus",Homo sapiens,1
62,Homo sapiens,Xenopus laevis,1


In [15]:
summary_counts.to_csv("data/pairs_temp_perPair.csv")

In [169]:
homo_sapiens_in_both = triplicate[
    triplicate['ligand_species_ann'].str.contains('Homo sapiens', case=False, na=False) &
    triplicate['receptor_species_ann'].str.contains('Homo sapiens', case=False, na=False)
]

print("Rows where both species columns contain 'Homo sapiens':", homo_sapiens_in_both.shape[0])


Rows where both species columns contain 'Homo sapiens': 3536


In [170]:
homo_sapiens_in_both.shape[0]/len(triplicate)

0.6742944317315027

In [171]:
len(triplicate)

5244

In [172]:
triplicate

Unnamed: 0,LR_pair,triplet,ligand_species_ann,receptor_species_ann,used Human L,used Human R,notes,Remove decision
1,ANGPT2 ITGA5,16424009 ANGPT2 ITGA5,Homo sapiens,Homo sapiens,✅,✅,,
2,ANGPT2 ITGAV,16424009 ANGPT2 ITGAV,Homo sapiens,Homo sapiens,✅,✅,,
3,ANGPT2 ITGB1,16424009 ANGPT2 ITGB1,Homo sapiens,Homo sapiens,✅,✅,,
4,IL18 CD48,15760905 IL18 CD48,Homo sapiens,Homo sapiens,✅,✅,,
5,IL18 IL18R1,15760905 IL18 IL18R1,Homo sapiens,Homo sapiens,✅,✅,,
...,...,...,...,...,...,...,...,...
5406,LAMA5 ITGA7,,Homo sapiens,Homo sapiens,✅,✅,,
5407,LAMB2 ITGB4,,Homo sapiens,Homo sapiens,✅,✅,,
5408,LAMB2 ITGA6,,Homo sapiens,Homo sapiens,✅,✅,,
5409,LAMB1 ITGB4,,"Mus Musculus, Homo sapiens",Homo sapiens,✅,✅,,


In [177]:
homo_sapiens_in_both

Unnamed: 0,LR_pair,triplet,ligand_species_ann,receptor_species_ann,used Human L,used Human R,notes,Remove decision
1,ANGPT2 ITGA5,16424009 ANGPT2 ITGA5,Homo sapiens,Homo sapiens,✅,✅,,
2,ANGPT2 ITGAV,16424009 ANGPT2 ITGAV,Homo sapiens,Homo sapiens,✅,✅,,
3,ANGPT2 ITGB1,16424009 ANGPT2 ITGB1,Homo sapiens,Homo sapiens,✅,✅,,
4,IL18 CD48,15760905 IL18 CD48,Homo sapiens,Homo sapiens,✅,✅,,
5,IL18 IL18R1,15760905 IL18 IL18R1,Homo sapiens,Homo sapiens,✅,✅,,
...,...,...,...,...,...,...,...,...
5406,LAMA5 ITGA7,,Homo sapiens,Homo sapiens,✅,✅,,
5407,LAMB2 ITGB4,,Homo sapiens,Homo sapiens,✅,✅,,
5408,LAMB2 ITGA6,,Homo sapiens,Homo sapiens,✅,✅,,
5409,LAMB1 ITGB4,,"Mus Musculus, Homo sapiens",Homo sapiens,✅,✅,,


In [12]:
LR_pair_flat = (
    triplicate
    .groupby('LR_pair', as_index=False)
    .agg(lambda x: ', '.join(sorted(set(x.dropna().astype(str)))))
)
LR_pair_flat

Unnamed: 0,LR_pair,triplet,ligand_species_ann,receptor_species_ann,used Human L,used Human R,notes,Remove decision
0,A2M HSPA5,"12194978 A2M HSPA5, 32541810 A2M HSPA5",Homo sapiens,Homo sapiens,,,", human α2M, human Grp 78 (HSPA5?)",
1,A2M LRP1,"10652313 A2M LRP1, 12194978 A2M LRP1, 1702392 ...",Homo sapiens,Homo sapiens,,,", LRP primary accession number Q0754 was perfo...",
2,ACE BDKRB2,10748135 ACE BDKRB2,Homo sapiens,Homo sapiens,,,,
3,ADA DPP4,15213224 ADA DPP4,Homo sapiens,Bos taurus,,,,
4,ADAM10 EFNA5,16239146 ADAM10 EFNA5,Bos taurus,Mus Musculus,,,NEW PAIR,
...,...,...,...,...,...,...,...,...
3540,ZG16B TLR2,20802527 ZG16B TLR2,Homo sapiens,"Homo sapiens, Mus Musculus",,,"The expression vectors pHA-mTLR2, -dominant mu...",
3541,ZG16B TLR4,20802527 ZG16B TLR4,Homo sapiens,"Homo sapiens, Mus Musculus",,,"Human TLR1, 2, 3, 4, 5, 8, 9, and 10, CXCR4 an...",
3542,ZG16B TLR5,20802527 ZG16B TLR5,Homo sapiens,Homo sapiens,,,PAUF is a mammalian ligand identified for the ...,
3543,ZG16B TLR6,20802527 ZG16B TLR6,Homo sapiens,"Homo sapiens, Mus Musculus",,,,


In [14]:
# Group by the two species columns and count the number of rows for each combination
summary_counts = LR_pair_flat.groupby(['ligand_species_ann', 'receptor_species_ann']).size().reset_index(name='count')

# Optional: sort by count descending
summary_counts = summary_counts.sort_values(by='count', ascending=False)
summary_counts

Unnamed: 0,ligand_species_ann,receptor_species_ann,count
83,Homo sapiens,Homo sapiens,1644
247,Mus Musculus,Mus Musculus,407
168,"Homo sapiens, Mus Musculus","Homo sapiens, Mus Musculus",175
111,Homo sapiens,Mus Musculus,93
0,,,87
...,...,...,...
133,"Homo sapiens, Homo sapiens, Mus Musculus","Homo sapiens, Mus Musculus, Homo sapiens",1
134,"Homo sapiens, Homo sapiens, Mus Musculus","Homo sapiens, Mus Musculus, Mus Musculus",1
135,"Homo sapiens, Homo sapiens, Mus Musculus","Mus Musculus, Mus Musculus, Homo sapiens",1
136,"Homo sapiens, Homo sapiens, Mus Musculus","Mus Musculus, Rattus norvegicus",1


In [178]:
homo_sapiens_in_both = LR_pair_flat[
    LR_pair_flat['ligand_species_ann'].str.contains('Homo sapiens', case=False, na=False) &
    LR_pair_flat['receptor_species_ann'].str.contains('Homo sapiens', case=False, na=False)
]

print("Rows where both species columns contain 'Homo sapiens':", homo_sapiens_in_both.shape[0])
homo_sapiens_in_both.shape[0]/len(LR_pair_flat)

Rows where both species columns contain 'Homo sapiens': 2444


0.7061542906674372

In [179]:
mus_musculus_in_both = (
    LR_pair_flat[
        LR_pair_flat['ligand_species_ann'].str.contains('Mus musculus', case=False, na=False) &
        LR_pair_flat['receptor_species_ann'].str.contains('Mus musculus', case=False, na=False)
    ]
)

result = mus_musculus_in_both[
    ~(
        LR_pair_flat['ligand_species_ann'].str.contains('Homo sapiens', case=False, na=False) &
        LR_pair_flat['receptor_species_ann'].str.contains('Homo sapiens', case=False, na=False)
    )
]

print(len(result))

529


  result = mus_musculus_in_both[


In [157]:
result.to_csv("data/mouse_as_main_pairs.csv")

In [12]:
import sys
import os
import pandas as pd
import warnings
import re
from itables import init_notebook_mode, show
from IPython.display import display, Javascript
import itables.options as opt
import createDataTable_perSpecies

warnings.filterwarnings("ignore", category=pd.errors.DtypeWarning)

# === Species Parameter === #
species = "horse"  # Change to "zebrafish", "sheep", etc.
species_file_prefix = {
    "mouse": "mmusculus",
    "rat": "rnorvegicus",
    "zebrafish": "drerio",
    "chimpanzee": "ptroglodytes",
    "chicken": "ggallus",
    "pig": "sscrofa",
    "cow": "btaurus",
    "dog": "clfamiliaris",
    "horse": "ecaballus",
    "marmoset": "cjacchus",
    "macaque": "mmulatta",
    "sheep": "oarambouillet"
}[species]

# === Load gene pair === #
gene_pair_var = f"{species}_gene_pair1"
gene_pair_df = getattr(createDataTable_perSpecies, gene_pair_var)
ligand_ens_id = [col for col in gene_pair_df.columns if "Ligand Ensembl ID" in col][0]
receptor_ens_id = [col for col in gene_pair_df.columns if "Receptor Ensembl ID" in col][0]


gene_pair_df = gene_pair_df[[
    '<span title="Double-click header of Interaction ID to ensure all values are shown">Interaction ID&nbsp;</span>',
    '<span title="Double-click header of Ligand to ensure all values are shown">Ligand&nbsp;</span>',
    '<span title="Double-click header of Receptor to ensure all values are shown">Receptor&nbsp;</span>',
    'LR Pair Card', f'{species.capitalize()} LR Pair',
    '<span title="HUGO Gene Nomenclature Committee (HGNC) ID. Click on the link for more details">Ligand HGNC ID&nbsp;&nbsp;</span>',
    '<span title="HUGO Gene Nomenclature Committee (HGNC) ID. Click on the link for more details">Receptor HGNC ID&nbsp;&nbsp;</span>',
    'Ligand GOC score', 'Ligand WGA coverage',
    'Ligand % Identity', 'Ligand Target % Identity',
    'Ligand Orthology Confidence', ligand_ens_id,
    'Receptor GOC score', 'Receptor WGA coverage', 'Receptor % Identity',
    'Receptor Target % Identity', 'Receptor Orthology Confidence', receptor_ens_id]]

# Rename columns
rename_dict = dict(zip(gene_pair_df.columns[:7], [
    "Interaction ID", "Ligand", "Receptor", "LR Pair Card",
    f"{species.capitalize()} LR Pair", "Ligand HGNC ID", "Receptor HGNC ID"
]))
gene_pair_df.rename(columns=rename_dict, inplace=True)

gene_pair_df = gene_pair_df.rename(columns={
    ligand_ens_id: "Ligand Ensembl ID",
    receptor_ens_id: "Receptor Ensembl ID"
})
    

# Load ortholog mapping
biomart_df = pd.read_csv(f"data/{species_file_prefix}_ID_biomart.csv", dtype=str)
biomart_df = biomart_df.dropna(subset=[f"{species_file_prefix}_homolog_ensembl_gene", "ensembl_gene_id"])

# Extract ID from anchor tags
def extract_link_text(html_string):
    match = re.search(r'<a[^>]*>(.*?)</a>', html_string)
    return match.group(1).strip() if match else None

def extract_hgnc_id(col):
    match = re.search(r'HGNC:(\d+)', col)
    return 'HGNC:' + str(match.group(1)) if match else None
def extract_paircard_id(col):
    """Use regular expression to extract the HGNC ID after 'cards/'."""
    match = re.search(r'cards/([^/]+)\.html', col)
    if match:
        return str(match.group(1))
    return None
    
# Process columns
gene_pair_df['LR Pair Card'] = gene_pair_df['LR Pair Card'].apply(extract_paircard_id)
gene_pair_df['Ligand HGNC ID'] = gene_pair_df['Ligand HGNC ID'].apply(extract_hgnc_id)
gene_pair_df['Receptor HGNC ID'] = gene_pair_df['Receptor HGNC ID'].apply(extract_hgnc_id)

# Mapping
species_to_human_map = dict(zip(
    biomart_df[f"{species_file_prefix}_homolog_ensembl_gene"],
    biomart_df["ensembl_gene_id"]
))

gene_pair_df["Human Ligand Ensembl ID"] = gene_pair_df["Ligand Ensembl ID"].map(species_to_human_map)
gene_pair_df["Human Receptor Ensembl ID"] = gene_pair_df["Receptor Ensembl ID"].map(species_to_human_map)

# Load df_merged
merged_df = pd.read_csv(f"data/df_merged_with_{species}_ensembl.tsv", sep="\t")

# Index for merge
gene_pair_indexed = gene_pair_df.reset_index(drop=False).rename(columns={"index": "orig_row"})

# LIGAND MERGE
df_ligand = merged_df.add_prefix("Ligand_")
ligand_merge = gene_pair_indexed.merge(
    df_ligand,
    left_on="Human Ligand Ensembl ID",
    right_on="Ligand_human_ensembl_gene_id",
    how="left"
)
if isinstance(ligand_merge.columns, pd.MultiIndex):
    ligand_merge.columns = ligand_merge.columns.get_level_values(0)
if ligand_merge.columns.duplicated().any():
    ligand_merge = ligand_merge.loc[:, ~ligand_merge.columns.duplicated()]

def resolve_ligand_row(group):
    match = group[group[f"Ligand_{species}_ensembl_gene_id"] == group["Ligand Ensembl ID"]]
    return match.iloc[[0]] if len(match) else group.iloc[[0]]

ligand_final = ligand_merge.groupby("orig_row", group_keys=False).apply(resolve_ligand_row).reset_index(drop=True)

# RECEPTOR MERGE
df_receptor = merged_df.add_prefix("Receptor_")
receptor_merge = ligand_final.merge(
    df_receptor,
    left_on="Human Receptor Ensembl ID",
    right_on="Receptor_human_ensembl_gene_id",
    how="left"
)
if isinstance(receptor_merge.columns, pd.MultiIndex):
    receptor_merge.columns = receptor_merge.columns.get_level_values(0)
if receptor_merge.columns.duplicated().any():
    receptor_merge = receptor_merge.loc[:, ~receptor_merge.columns.duplicated()]

def resolve_receptor_row(group):
    match = group[group[f"Receptor_{species}_ensembl_gene_id"] == group["Receptor Ensembl ID"]]
    return match.iloc[[0]] if len(match) else group.iloc[[0]]

final_result = receptor_merge.groupby("orig_row", group_keys=False).apply(resolve_receptor_row).reset_index(drop=True).drop(columns=["orig_row"])

assert len(final_result) == len(gene_pair_df), f"Row mismatch: {len(final_result)} != {len(gene_pair_df)}"

final_result.to_csv(f"data/human_{species}_merged_ensemblBiomaRt_inParanoid.csv")



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gene_pair_df.rename(columns=rename_dict, inplace=True)
  ligand_final = ligand_merge.groupby("orig_row", group_keys=False).apply(resolve_ligand_row).reset_index(drop=True)
  final_result = receptor_merge.groupby("orig_row", group_keys=False).apply(resolve_receptor_row).reset_index(drop=True).drop(columns=["orig_row"])


In [13]:
# summarize counts depending on filter
species = "horse"  # Change to desired species, e.g., "zebrafish", "sheep"
capital_species = species.capitalize()

# Load final result file
final_result = pd.read_csv(f"data/human_{species}_merged_ensemblBiomaRt_inParanoid.csv")

# Define score columns dynamically
score_cols = [
    f"Ligand_human_inparalog_score",
    f"Receptor_human_inparalog_score",
    f"Ligand_{species}_inparalog_score",
    f"Receptor_{species}_inparalog_score",
    f"Ligand_human_seed_score",
    f"Receptor_human_seed_score",
    f"Ligand_{species}_seed_score",
    f"Receptor_{species}_seed_score",
    f"Ligand_bitscore",
    f"Receptor_bitscore"
]

for col in score_cols:
    if col in final_result.columns:
        final_result[col] = pd.to_numeric(final_result[col], errors='coerce')

# Detect columns
confidence_orth_ligand = [col for col in final_result.columns if "Ligand Orthology Confidence" in col][0]
GOC_col_ligand = [col for col in final_result.columns if "Ligand GOC" in col][0]
percIdent_col_ligand = [col for col in final_result.columns if "Ligand % Identity" in col][0]
human_ligand_col = [col for col in final_result.columns if "Ligand HGNC ID" in col][0]
ligand_col = [col for col in final_result.columns if "Ligand" in col][0]
ligand_human_inparalog_score = [col for col in final_result.columns if "Ligand_human_inparalog_score" in col][0]
ligand_species_inparalog_score = [col for col in final_result.columns if f"Ligand_{species}_inparalog_score" in col][0]
ligand_human_seed_score = [col for col in final_result.columns if "Ligand_human_seed_score" in col][0]
ligand_species_seed_score = [col for col in final_result.columns if f"Ligand_{species}_seed_score" in col][0]
ligand_bit_score = [col for col in final_result.columns if "Ligand_bitscore" in col][0]

human_receptor_col = [col for col in final_result.columns if "Receptor HGNC ID" in col][0]
confidence_orth_receptor = [col for col in final_result.columns if "Receptor Orthology Confidence" in col][0]
GOC_col_receptor = [col for col in final_result.columns if "Receptor GOC" in col][0]
percIdent_col_receptor = [col for col in final_result.columns if "Receptor % Identity" in col][0]
receptor_col = [col for col in final_result.columns if "Receptor" in col][0]
receptor_human_inparalog_score = [col for col in final_result.columns if "Receptor_human_inparalog_score" in col][0]
receptor_species_inparalog_score = [col for col in final_result.columns if f"Receptor_{species}_inparalog_score" in col][0]
receptor_human_seed_score = [col for col in final_result.columns if "Receptor_human_seed_score" in col][0]
receptor_species_seed_score = [col for col in final_result.columns if f"Receptor_{species}_seed_score" in col][0]
receptor_bit_score = [col for col in final_result.columns if "Receptor_bitscore" in col][0]

# Define function

def summarize_orthologs(human_col, species_col, label,
                        confidence_orth_col=None, confidence_orth_threshold=None,
                        GOC_col=None, GOC_threshold=None,
                        perc_identity_col=None, perc_identity_thres=None,
                        ligand_human_inparalog_score_col=None, ligand_human_inparalog_score_threshold=None,
                        receptor_human_inparalog_score_col=None, receptor_human_inparalog_score_threshold=None,
                        ligand_species_inparalog_score_col=None, ligand_species_inparalog_score_threshold=None,
                        receptor_species_inparalog_score_col=None, receptor_species_inparalog_score_threshold=None,
                        ligand_human_seed_score_col=None, ligand_human_seed_score_threshold=None,
                        receptor_human_seed_score_col=None, receptor_human_seed_score_threshold=None,
                        ligand_species_seed_score_col=None, ligand_species_seed_score_threshold=None,
                        receptor_species_seed_score_col=None, receptor_species_seed_score_threshold=None,
                        ligand_bit_score_col=None, ligand_bit_score_threshold=None,
                        receptor_bit_score_col=None, receptor_bit_score_threshold=None):

    df = final_result.copy()

    filters = [
        (confidence_orth_col, lambda x: x == confidence_orth_threshold),
        (GOC_col, lambda x: x >= GOC_threshold),
        (perc_identity_col, lambda x: x >= perc_identity_thres),
        (ligand_human_inparalog_score_col, lambda x: x >= ligand_human_inparalog_score_threshold),
        (receptor_human_inparalog_score_col, lambda x: x >= receptor_human_inparalog_score_threshold),
        (ligand_species_inparalog_score_col, lambda x: x >= ligand_species_inparalog_score_threshold),
        (receptor_species_inparalog_score_col, lambda x: x >= receptor_species_inparalog_score_threshold),
        (ligand_human_seed_score_col, lambda x: x >= ligand_human_seed_score_threshold),
        (receptor_human_seed_score_col, lambda x: x >= receptor_human_seed_score_threshold),
        (ligand_species_seed_score_col, lambda x: x >= ligand_species_seed_score_threshold),
        (receptor_species_seed_score_col, lambda x: x >= receptor_species_seed_score_threshold),
        (ligand_bit_score_col, lambda x: x >= ligand_bit_score_threshold),
        (receptor_bit_score_col, lambda x: x >= receptor_bit_score_threshold),
    ]

    original_rows = df.shape[0]
    for col, condition in filters:
        if col and condition is not None:
            before = df.shape[0]
            df = df[df[col].apply(condition)]
            after = df.shape[0]
            print(f"Filtered {col}: {before - after} rows removed (remaining: {after})")

    unique_pairs = df[[human_col, species_col]].drop_duplicates()

    counts = (
        unique_pairs
        .groupby(human_col)[species_col]
        .count()
        .sort_values(ascending=False)
        .reset_index(name='count')
    )

    filter_tag = f"{label.lower()}"
    if confidence_orth_threshold is not None:
        filter_tag += f"_conf{confidence_orth_threshold}"
    if GOC_threshold is not None:
        filter_tag += f"_GOCge{GOC_threshold}"
    if ligand_human_inparalog_score_threshold is not None:
        filter_tag += f"_LHISge{ligand_human_inparalog_score_threshold}"
    if receptor_bit_score_threshold is not None:
        filter_tag += f"_RBSge{receptor_bit_score_threshold}"

    #counts.to_csv(f"data/human_{species}_orth_count_{filter_tag}.csv", index=False)

    summary_counts = counts['count'].value_counts().sort_index()
    total_human_genes = counts.shape[0]

    summary_lines = [
        f"Out of {total_human_genes} unique human {label.lower()} genes:",
        f" - Filters applied: " + "; ".join([
            f"{col} ≥ {threshold}" for col, threshold in [
                (confidence_orth_col, confidence_orth_threshold),
                (GOC_col, GOC_threshold),
                (perc_identity_col, perc_identity_thres),
                (ligand_human_inparalog_score_col, ligand_human_inparalog_score_threshold),
                (receptor_human_inparalog_score_col, receptor_human_inparalog_score_threshold),
                (ligand_species_inparalog_score_col, ligand_species_inparalog_score_threshold),
                (receptor_species_inparalog_score_col, receptor_species_inparalog_score_threshold),
                (ligand_human_seed_score_col, ligand_human_seed_score_threshold),
                (receptor_human_seed_score_col, receptor_human_seed_score_threshold),
                (ligand_species_seed_score_col, ligand_species_seed_score_threshold),
                (receptor_species_seed_score_col, receptor_species_seed_score_threshold),
                (ligand_bit_score_col, ligand_bit_score_threshold),
                (receptor_bit_score_col, receptor_bit_score_threshold),
            ] if threshold is not None and col is not None
        ])
    ]

    for orth_count, gene_count in summary_counts.items():
        summary_lines.append(
            f" - {gene_count} human {label.lower()} genes had {orth_count} {species} ortholog(s)"
        )

    return "\n".join(summary_lines)

In [18]:
# Ligand
ligand_summary = summarize_orthologs(
    human_col=human_ligand_col,
    species_col=ligand_col,
    label="Ligand",
    # GOC_col=GOC_col_ligand,
    # GOC_threshold=25,
    confidence_orth_col=confidence_orth_ligand,
    confidence_orth_threshold=1,
    # perc_identity_col = percIdent_col_ligand,
    # perc_identity_thres = 60
    # ligand_human_inparalog_score_col=ligand_human_inparalog_score,
    # ligand_human_inparalog_score_threshold=1,
    ligand_bit_score_col=ligand_bit_score,
    ligand_bit_score_threshold=40,
    # ligand_species_inparalog_score_col=ligand_species_inparalog_score,
    # ligand_species_inparalog_score_threshold=1,
)
print(ligand_summary)

# Receptor
receptor_summary = summarize_orthologs(
    human_col=human_receptor_col,
    species_col=receptor_col,
    label="Receptor",
    # GOC_col=GOC_col_receptor,
    # GOC_threshold=25,
    confidence_orth_col=confidence_orth_receptor,
    confidence_orth_threshold=1,
    # perc_identity_col = percIdent_col_receptor,
    # perc_identity_thres = 60
    # receptor_human_inparalog_score_col=receptor_human_inparalog_score,
    # receptor_human_inparalog_score_threshold=1,
    receptor_bit_score_col=receptor_bit_score,
    receptor_bit_score_threshold=40,
    # receptor_species_inparalog_score_col=receptor_species_inparalog_score,
    # receptor_species_inparalog_score_threshold=1,
)
print(receptor_summary)

Filtered Ligand Orthology Confidence: 341 rows removed (remaining: 2650)
Filtered Ligand_bitscore: 121 rows removed (remaining: 2529)
Out of 731 unique human ligand genes:
 - Filters applied: Ligand Orthology Confidence ≥ 1; Ligand_bitscore ≥ 40
 - 727 human ligand genes had 1 horse ortholog(s)
 - 3 human ligand genes had 2 horse ortholog(s)
 - 1 human ligand genes had 3 horse ortholog(s)
Filtered Receptor Orthology Confidence: 163 rows removed (remaining: 2828)
Filtered Receptor_bitscore: 92 rows removed (remaining: 2736)
Out of 644 unique human receptor genes:
 - Filters applied: Receptor Orthology Confidence ≥ 1; Receptor_bitscore ≥ 40
 - 641 human receptor genes had 1 horse ortholog(s)
 - 1 human receptor genes had 2 horse ortholog(s)
 - 2 human receptor genes had 3 horse ortholog(s)


In [2]:

df_orthologs.to_csv(f"data/inParanoid_{species_input}.csv", index=False)

# === Step 4: Human UniProt → HGNC/Ensembl mapping ===
hgnc_df = pd.read_csv("data/HGNC_gene_info_full.tsv", sep="\t", dtype=str)
hgnc_df = hgnc_df.dropna(subset=["uniprot_ids", "ensembl_gene_id"])
hgnc_exploded = hgnc_df.assign(uniprot_id=hgnc_df["uniprot_ids"].str.split(",")).explode("uniprot_id")
hgnc_exploded["uniprot_id"] = hgnc_exploded["uniprot_id"].str.strip()

df_merged = df_orthologs.merge(
    hgnc_exploded[["uniprot_id", "ensembl_gene_id", "symbol"]],
    left_on="human_protein",
    right_on="uniprot_id",
    how="left"
)

df_merged = df_merged.rename(columns={
    "symbol": "human_gene",
    "ensembl_gene_id": "human_ensembl_gene_id"
}).drop(columns=["uniprot_id"])

df_merged = df_merged.dropna(subset=["human_ensembl_gene_id"])
df_merged.to_csv(f"data/{species_input}_inParanoid_withHGNC.tsv", sep="\t", index=False)

# === Step 5: Optional - Species UniProt → Ensembl mapping ===
map_path = f"data/{species_input}_uniprot_to_ensembl.tsv"
try:
    species_map = pd.read_csv(map_path, sep="\t", dtype=str)
    df_merged = df_merged.merge(
        species_map,
        left_on=f"{species_input}_protein",
        right_on="uniprotswissprot",
        how="left"
    ).rename(columns={"ensembl_gene_id": f"{species_input}_ensembl_gene_id"}) \
     .drop(columns=["uniprotswissprot"])
except FileNotFoundError:
    print(f"⚠️  Mapping file not found: {map_path}")

df_merged.to_csv(f"data/df_merged_with_{species_input}_ensembl.tsv", sep="\t", index=False)


EmptyDataError: No columns to parse from file

In [339]:
# Ligand
ligand_summary = summarize_orthologs(
    human_col=human_ligand_col,
    species_col=ligand_col,
    label="Ligand",
    # GOC_col=GOC_col_ligand,
    # GOC_threshold=25,
    # confidence_orth_col=confidence_orth_ligand,
    # confidence_orth_threshold=1,
    # perc_identity_col = percIdent_col_ligand,
    # perc_identity_thres = 60
    # ligand_human_inparalog_score_col=ligand_human_inparalog_score,
    # ligand_human_inparalog_score_threshold=1,
    ligand_bit_score_col=ligand_bit_score,
    ligand_bit_score_threshold=40,
    # ligand_species_inparalog_score_col=ligand_species_inparalog_score,
    # ligand_species_inparalog_score_threshold=1,
)
print(ligand_summary)

# Receptor
receptor_summary = summarize_orthologs(
    human_col=human_receptor_col,
    species_col=receptor_col,
    label="Receptor",
    # GOC_col=GOC_col_receptor,
    # GOC_threshold=25,
    # confidence_orth_col=confidence_orth_receptor,
    # confidence_orth_threshold=1,
    # perc_identity_col = percIdent_col_receptor,
    # perc_identity_thres = 60
    # receptor_human_inparalog_score_col=receptor_human_inparalog_score,
    # receptor_human_inparalog_score_threshold=1,
    receptor_bit_score_col=receptor_bit_score,
    receptor_bit_score_threshold=40,
    # receptor_species_inparalog_score_col=receptor_species_inparalog_score,
    # receptor_species_inparalog_score_threshold=1,
)
print(receptor_summary)



Filtered Ligand_bitscore: 1323 rows removed (remaining: 4193)
Out of 509 unique human ligand genes:
 - Filters applied: Ligand_bitscore ≥ 40
 - 311 human ligand genes had 1 zebrafish ortholog(s)
 - 178 human ligand genes had 2 zebrafish ortholog(s)
 - 9 human ligand genes had 3 zebrafish ortholog(s)
 - 4 human ligand genes had 4 zebrafish ortholog(s)
 - 4 human ligand genes had 5 zebrafish ortholog(s)
 - 2 human ligand genes had 7 zebrafish ortholog(s)
 - 1 human ligand genes had 8 zebrafish ortholog(s)
Filtered Receptor_bitscore: 1293 rows removed (remaining: 4223)
Out of 438 unique human receptor genes:
 - Filters applied: Receptor_bitscore ≥ 40
 - 273 human receptor genes had 1 zebrafish ortholog(s)
 - 139 human receptor genes had 2 zebrafish ortholog(s)
 - 12 human receptor genes had 3 zebrafish ortholog(s)
 - 7 human receptor genes had 4 zebrafish ortholog(s)
 - 3 human receptor genes had 5 zebrafish ortholog(s)
 - 3 human receptor genes had 7 zebrafish ortholog(s)
 - 1 human rece

In [322]:
extract_link_text

def extract_paircard_id(col):
    """Use regular expression to extract the HGNC ID after 'cards/'."""
    match = re.search(r'cards/([^/]+)\.html', col)
    if match:
        return str(match.group(1))
    return None



In [299]:

#  'Sheep LR Pair' data
update_connectomedb_qmd(
    qmd_file_path="database/other/sheepOrth.qmd", 
    lr_pair_data=createDataTable_perSpecies.sheep_gene_pair1["Sheep LR Pair"],
    species_name="Ovis aries rambouillet",
    species = "Sheep",
    ortholog = True
)


--- Updating database/other/sheepOrth.qmd for Ovis aries rambouillet ---
Successfully updated 'database/other/sheepOrth.qmd' for Ovis aries rambouillet.


In [203]:
import sys
import os
import pandas as pd
import warnings
import re
warnings.filterwarnings("ignore", category=pd.errors.DtypeWarning)
from itables import init_notebook_mode, show
from IPython.display import display, Javascript
import itables.options as opt
from createDataTable_perSpecies import mouse_gene_pair1

mouse_gene_pair1= mouse_gene_pair1[['<span title="Double-click header of Interaction ID to ensure all values are shown">Interaction ID&nbsp;</span>','<span title="Double-click header of Ligand to ensure all values are shown">Ligand&nbsp;</span>',
       '<span title="Double-click header of Receptor to ensure all values are shown">Receptor&nbsp;</span>',
                  'LR Pair Card', 'Mouse LR Pair','<span title="HUGO Gene Nomenclature Committee (HGNC) ID. Click on the link for more details">Ligand HGNC ID&nbsp;&nbsp;</span>',
       '<span title="HUGO Gene Nomenclature Committee (HGNC) ID. Click on the link for more details">Receptor HGNC ID&nbsp;&nbsp;</span>','Ligand GOC score', 'Ligand WGA coverage',
       'Ligand % Identity', 'Ligand Target % Identity',
       'Ligand Orthology Confidence', 'Ligand Ensembl ID',
       'Receptor GOC score', 'Receptor WGA coverage', 'Receptor % Identity',
       'Receptor Target % Identity', 'Receptor Orthology Confidence','Receptor Ensembl ID']]

mouse_gene_pair1.columns = [
    "Interaction ID",
    "Ligand",
    "Receptor",
    "LR Pair Card", 
    "Mouse LR Pair",
    "Ligand HGNC ID",
    "Receptor HGNC ID",
    *mouse_gene_pair1.columns[7:]
]
mousebioM_df = pd.read_csv("data/mmusculus_ID_biomart.csv", dtype=str)
mousebioM_df = mousebioM_df.dropna(subset=["mmusculus_homolog_ensembl_gene", "ensembl_gene_id"])


def extract_link_text(html_string):
    """Extract visible text from an anchor tag <a>...</a>."""
    match = re.search(r'<a[^>]*>(.*?)</a>', html_string)
    if match:
        return match.group(1).strip()
    return None

mouse_gene_pair1['LR Pair Card'] = mouse_gene_pair1['LR Pair Card'].apply(extract_paircard_id)
# Create the mapping dictionary from mouse to human Ensembl gene ID
mouse_to_human_map = dict(zip(
    mousebioM_df["mmusculus_homolog_ensembl_gene"],
    mousebioM_df["ensembl_gene_id"]
))

# Map Ligand
mouse_gene_pair1["Human Ligand Ensembl ID"] = mouse_gene_pair1["Ligand Ensembl ID"].map(mouse_to_human_map)

# Map Receptor
mouse_gene_pair1["Human Receptor Ensembl ID"] = mouse_gene_pair1["Receptor Ensembl ID"].map(mouse_to_human_map)
def extract_hgnc_id(col):
    """Use regular expression to extract the HGNC ID after 'HGNC:'."""
    match = re.search(r'HGNC:(\d+)', col)
    if match:
        return 'HGNC:' +str(match.group(1))
    return None
    
mouse_gene_pair1['Ligand HGNC ID'] = mouse_gene_pair1['Ligand HGNC ID'].apply(extract_hgnc_id)
mouse_gene_pair1['Receptor HGNC ID'] = mouse_gene_pair1['Receptor HGNC ID'].apply(extract_hgnc_id)
df_merged =pd.read_csv("data/df_merged_with_mouse_ensembl.tsv",sep="\t")

# Step 0: Add 'orig_row' once, keep it clean
mouse_gene_pair1_indexed = mouse_gene_pair1.reset_index(drop=False).rename(columns={"index": "orig_row"})

### === LIGAND MERGE === ###
df_ligand = df_merged.add_prefix("Ligand_")
ligand_merge = mouse_gene_pair1_indexed.merge(
    df_ligand,
    left_on="Human Ligand Ensembl ID",
    right_on="Ligand_human_ensembl_gene_id",
    how="left"
)

# Ensure orig_row is single column (sometimes merge creates duplicates with suffix)
if isinstance(ligand_merge.columns, pd.MultiIndex):
    ligand_merge.columns = ligand_merge.columns.get_level_values(0)

if ligand_merge.columns.duplicated().any():
    ligand_merge = ligand_merge.loc[:, ~ligand_merge.columns.duplicated()]

def resolve_ligand_row(group):
    match = group[group["Ligand_mouse_ensembl_gene_id"] == group["Ligand Ensembl ID"]]
    if len(match) == 1:
        return match
    elif len(match) > 1:
        return match.iloc[[0]]
    else:
        return group.iloc[[0]]

ligand_final = (
    ligand_merge.groupby("orig_row", group_keys=False)
    .apply(resolve_ligand_row)
    .reset_index(drop=True)
)

### === RECEPTOR MERGE === ###
df_receptor = df_merged.add_prefix("Receptor_")
receptor_merge = ligand_final.merge(
    df_receptor,
    left_on="Human Receptor Ensembl ID",
    right_on="Receptor_human_ensembl_gene_id",
    how="left"
)

# Same cleanup for receptor_merge
if isinstance(receptor_merge.columns, pd.MultiIndex):
    receptor_merge.columns = receptor_merge.columns.get_level_values(0)

if receptor_merge.columns.duplicated().any():
    receptor_merge = receptor_merge.loc[:, ~receptor_merge.columns.duplicated()]

def resolve_receptor_row(group):
    match = group[group["Receptor_mouse_ensembl_gene_id"] == group["Receptor Ensembl ID"]]
    if len(match) == 1:
        return match
    elif len(match) > 1:
        return match.iloc[[0]]
    else:
        return group.iloc[[0]]

final_result = (
    receptor_merge.groupby("orig_row", group_keys=False)
    .apply(resolve_receptor_row)
    .reset_index(drop=True)
    .drop(columns=["orig_row"])
)

assert len(final_result) == len(mouse_gene_pair1), f"Row mismatch: {len(final_result)} != {len(mouse_gene_pair1)}"
final_result


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mouse_gene_pair1['LR Pair Card'] = mouse_gene_pair1['LR Pair Card'].apply(extract_paircard_id)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mouse_gene_pair1["Human Ligand Ensembl ID"] = mouse_gene_pair1["Ligand Ensembl ID"].map(mouse_to_human_map)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mo

Unnamed: 0,Interaction ID,Ligand,Receptor,LR Pair Card,Mouse LR Pair,Ligand HGNC ID,Receptor HGNC ID,Ligand GOC score,Ligand WGA coverage,Ligand % Identity,...,Receptor_human_protein,Receptor_human_inparalog_score,Receptor_human_seed_score,Receptor_mouse_protein,Receptor_mouse_inparalog_score,Receptor_mouse_seed_score,Receptor_bitscore,Receptor_human_ensembl_gene_id,Receptor_human_gene,Receptor_mouse_ensembl_gene_id
0,CDB00001,A2m,Hspa5,A2M-HSPA5,A2m → Hspa5,HGNC:7,HGNC:5238,75.0,100.0,72.4559,...,P11021,1.0,1.0,P20029,1.0,1.000,1246.0,ENSG00000044574,HSPA5,ENSMUSG00000026864
1,CDB00002,A2m,Lrp1,A2M-LRP1,A2m → Lrp1,HGNC:7,HGNC:6692,75.0,100.0,72.4559,...,Q07954,1.0,1.0,Q91ZX7,1.0,1.000,6853.0,ENSG00000123384,LRP1,
2,CDB00003,Ace,Bdkrb2,ACE-BDKRB2,Ace → Bdkrb2,HGNC:2707,HGNC:1030,100.0,100.0,83.3078,...,P30411,1.0,1.0,P32299,1.0,1.000,636.0,ENSG00000168398,BDKRB2,ENSMUSG00000021070
3,CDB00004,Ada,Dpp4,ADA-DPP4,Ada → Dpp4,HGNC:186,HGNC:3009,100.0,100.0,80.7163,...,P27487,1.0,1.0,P28843,1.0,1.000,1365.0,ENSG00000197635,DPP4,ENSMUSG00000035000
4,CDB00005,Adam10,Epha3,ADAM10-EPHA3,Adam10 → Epha3,HGNC:188,HGNC:3387,100.0,100.0,96.1230,...,P29320,1.0,1.0,P29319,1.0,1.000,1923.0,ENSG00000044524,EPHA3,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3956,CDB03444,Pcdhb21,Pcdhb21,Pcdhb21-Pcdhb21,Pcdhb21 → Pcdhb21,,,,,,...,,,,,,,,,,
3957,CDB03445,Pcdhb22,Pcdhb22,Pcdhb22-Pcdhb22,Pcdhb22 → Pcdhb22,,,,,,...,Q9Y5E8,1.0,1.0,Q91XZ8,1.0,0.989,1191.0,ENSG00000113248,PCDHB15,
3958,CDB03446,Pcdhgb8,Pcdhgb8,Pcdhgb8-Pcdhgb8,Pcdhgb8 → Pcdhgb8,,,,,,...,,,,,,,,,,
3959,CDB03447,Saa3,Tlr4,Saa3-Tlr4,Saa3 → Tlr4,,,,,,...,O00206,1.0,1.0,Q9QUK6,1.0,1.000,909.0,ENSG00000136869,TLR4,ENSMUSG00000039005


In [204]:
final_result.to_csv("data/human_mouse_merged_ensemblBiomaRt_inParanoid.csv")


In [136]:
mouse_gene_pair1.columns

Index(['<span title="Double-click header of Interaction ID to ensure all values are shown">Interaction ID&nbsp;</span>',
       'LR Pair Card', 'Mouse LR Pair',
       '<span title="Double-click header of Ligand to ensure all values are shown">Ligand&nbsp;</span>',
       '<span title="Double-click header of Receptor to ensure all values are shown">Receptor&nbsp;</span>',
       '<span title="Genome Informatics (MGI) ID. Click on the link for more details">Ligand MGI ID</span>',
       '<span title="Genome Informatics (MGI) ID. Click on the link for more details">Receptor MGI ID</span>',
       'Ligand Ensembl ID', 'Ligand GOC score', 'Ligand WGA coverage',
       'Ligand % Identity', 'Ligand Target % Identity',
       'Ligand Orthology Confidence', 'Ligand Name', 'Receptor Ensembl ID',
       'Receptor GOC score', 'Receptor WGA coverage', 'Receptor % Identity',
       'Receptor Target % Identity', 'Receptor Orthology Confidence',
       'Receptor Name',
       '<span title="Official G

In [103]:
mouse_gene_pair1= mouse_gene_pair1[['<span title="Double-click header of Interaction ID to ensure all values are shown">Interaction ID&nbsp;</span>',
                  'LR Pair Card', 'Mouse LR Pair','<span title="HUGO Gene Nomenclature Committee (HGNC) ID. Click on the link for more details">Ligand HGNC ID&nbsp;&nbsp;</span>',
       '<span title="HUGO Gene Nomenclature Committee (HGNC) ID. Click on the link for more details">Receptor HGNC ID&nbsp;&nbsp;</span>','Ligand GOC score', 'Ligand WGA coverage',
       'Ligand % Identity', 'Ligand Target % Identity',
       'Ligand Orthology Confidence', 'Ligand Ensembl ID',
       'Receptor GOC score', 'Receptor WGA coverage', 'Receptor % Identity',
       'Receptor Target % Identity', 'Receptor Orthology Confidence','Receptor Ensembl ID']]

In [104]:
mouse_gene_pair1.columns = [
    "Interaction ID",
    "LR Pair Card",
    "Mouse LR Pair",
    "Ligand HGNC ID",
    "Receptor HGNC ID",
    *mouse_gene_pair1.columns[5:]
]
mouse_gene_pair1

Unnamed: 0,Interaction ID,LR Pair Card,Mouse LR Pair,Ligand HGNC ID,Receptor HGNC ID,Ligand GOC score,Ligand WGA coverage,Ligand % Identity,Ligand Target % Identity,Ligand Orthology Confidence,Ligand Ensembl ID,Receptor GOC score,Receptor WGA coverage,Receptor % Identity,Receptor Target % Identity,Receptor Orthology Confidence,Receptor Ensembl ID
0,CDB00001,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...",A2m → Hspa5,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",75.0,100.0,72.4559,72.4559,1.0,ENSMUSG00000030111,75.0,100.0,98.6239,98.4733,1.0,ENSMUSG00000026864
1,CDB00002,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...",A2m → Lrp1,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",75.0,100.0,72.4559,72.4559,1.0,ENSMUSG00000030111,100.0,100.0,97.9974,97.9758,1.0,ENSMUSG00000040249
2,CDB00003,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...",Ace → Bdkrb2,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",100.0,100.0,83.3078,82.9268,1.0,ENSMUSG00000020681,100.0,100.0,79.7954,79.5918,1.0,ENSMUSG00000021070
3,CDB00004,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...",Ada → Dpp4,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",100.0,100.0,80.7163,83.2386,1.0,ENSMUSG00000017697,100.0,100.0,84.4648,85.1316,1.0,ENSMUSG00000035000
4,CDB00005,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...",Adam10 → Epha3,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",100.0,100.0,96.1230,95.9947,1.0,ENSMUSG00000054693,100.0,100.0,96.5412,96.4431,1.0,ENSMUSG00000052504
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3956,CDB03444,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...",Pcdhb21 → Pcdhb21,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",,,,,,ENSMUSG00000044022,,,,,,ENSMUSG00000044022
3957,CDB03445,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...",Pcdhb22 → Pcdhb22,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",,,,,,ENSMUSG00000073591,,,,,,ENSMUSG00000073591
3958,CDB03446,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...",Pcdhgb8 → Pcdhgb8,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",,,,,,ENSMUSG00000103081,,,,,,ENSMUSG00000103081
3959,CDB03447,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...",Saa3 → Tlr4,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",,,,,,ENSMUSG00000040026,,,,,,ENSMUSG00000039005


In [105]:
mousebioM_df = pd.read_csv("data/mmusculus_ID_biomart.csv", dtype=str)
mousebioM_df = mousebioM_df.dropna(subset=["mmusculus_homolog_ensembl_gene", "ensembl_gene_id"])

In [106]:
mousebioM_df= mousebioM_df[["mmusculus_homolog_ensembl_gene", "ensembl_gene_id"]]

In [107]:
import re

def extract_link_text(html_string):
    """Extract visible text from an anchor tag <a>...</a>."""
    match = re.search(r'<a[^>]*>(.*?)</a>', html_string)
    if match:
        return match.group(1).strip()
    return None

mouse_gene_pair1['LR Pair Card'] = mouse_gene_pair1['LR Pair Card'].apply(extract_paircard_id)
mouse_gene_pair1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mouse_gene_pair1['LR Pair Card'] = mouse_gene_pair1['LR Pair Card'].apply(extract_paircard_id)


Unnamed: 0,Interaction ID,LR Pair Card,Mouse LR Pair,Ligand HGNC ID,Receptor HGNC ID,Ligand GOC score,Ligand WGA coverage,Ligand % Identity,Ligand Target % Identity,Ligand Orthology Confidence,Ligand Ensembl ID,Receptor GOC score,Receptor WGA coverage,Receptor % Identity,Receptor Target % Identity,Receptor Orthology Confidence,Receptor Ensembl ID
0,CDB00001,A2M-HSPA5,A2m → Hspa5,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",75.0,100.0,72.4559,72.4559,1.0,ENSMUSG00000030111,75.0,100.0,98.6239,98.4733,1.0,ENSMUSG00000026864
1,CDB00002,A2M-LRP1,A2m → Lrp1,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",75.0,100.0,72.4559,72.4559,1.0,ENSMUSG00000030111,100.0,100.0,97.9974,97.9758,1.0,ENSMUSG00000040249
2,CDB00003,ACE-BDKRB2,Ace → Bdkrb2,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",100.0,100.0,83.3078,82.9268,1.0,ENSMUSG00000020681,100.0,100.0,79.7954,79.5918,1.0,ENSMUSG00000021070
3,CDB00004,ADA-DPP4,Ada → Dpp4,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",100.0,100.0,80.7163,83.2386,1.0,ENSMUSG00000017697,100.0,100.0,84.4648,85.1316,1.0,ENSMUSG00000035000
4,CDB00005,ADAM10-EPHA3,Adam10 → Epha3,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",100.0,100.0,96.1230,95.9947,1.0,ENSMUSG00000054693,100.0,100.0,96.5412,96.4431,1.0,ENSMUSG00000052504
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3956,CDB03444,Pcdhb21-Pcdhb21,Pcdhb21 → Pcdhb21,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",,,,,,ENSMUSG00000044022,,,,,,ENSMUSG00000044022
3957,CDB03445,Pcdhb22-Pcdhb22,Pcdhb22 → Pcdhb22,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",,,,,,ENSMUSG00000073591,,,,,,ENSMUSG00000073591
3958,CDB03446,Pcdhgb8-Pcdhgb8,Pcdhgb8 → Pcdhgb8,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",,,,,,ENSMUSG00000103081,,,,,,ENSMUSG00000103081
3959,CDB03447,Saa3-Tlr4,Saa3 → Tlr4,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",,,,,,ENSMUSG00000040026,,,,,,ENSMUSG00000039005


In [108]:
# Create the mapping dictionary from mouse to human Ensembl gene ID
mouse_to_human_map = dict(zip(
    mousebioM_df["mmusculus_homolog_ensembl_gene"],
    mousebioM_df["ensembl_gene_id"]
))

# Map Ligand
mouse_gene_pair1["Human Ligand Ensembl ID"] = mouse_gene_pair1["Ligand Ensembl ID"].map(mouse_to_human_map)

# Map Receptor
mouse_gene_pair1["Human Receptor Ensembl ID"] = mouse_gene_pair1["Receptor Ensembl ID"].map(mouse_to_human_map)
mouse_gene_pair1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mouse_gene_pair1["Human Ligand Ensembl ID"] = mouse_gene_pair1["Ligand Ensembl ID"].map(mouse_to_human_map)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mouse_gene_pair1["Human Receptor Ensembl ID"] = mouse_gene_pair1["Receptor Ensembl ID"].map(mouse_to_human_map)


Unnamed: 0,Interaction ID,LR Pair Card,Mouse LR Pair,Ligand HGNC ID,Receptor HGNC ID,Ligand GOC score,Ligand WGA coverage,Ligand % Identity,Ligand Target % Identity,Ligand Orthology Confidence,Ligand Ensembl ID,Receptor GOC score,Receptor WGA coverage,Receptor % Identity,Receptor Target % Identity,Receptor Orthology Confidence,Receptor Ensembl ID,Human Ligand Ensembl ID,Human Receptor Ensembl ID
0,CDB00001,A2M-HSPA5,A2m → Hspa5,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",75.0,100.0,72.4559,72.4559,1.0,ENSMUSG00000030111,75.0,100.0,98.6239,98.4733,1.0,ENSMUSG00000026864,ENSG00000175899,ENSG00000044574
1,CDB00002,A2M-LRP1,A2m → Lrp1,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",75.0,100.0,72.4559,72.4559,1.0,ENSMUSG00000030111,100.0,100.0,97.9974,97.9758,1.0,ENSMUSG00000040249,ENSG00000175899,ENSG00000123384
2,CDB00003,ACE-BDKRB2,Ace → Bdkrb2,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",100.0,100.0,83.3078,82.9268,1.0,ENSMUSG00000020681,100.0,100.0,79.7954,79.5918,1.0,ENSMUSG00000021070,ENSG00000159640,ENSG00000168398
3,CDB00004,ADA-DPP4,Ada → Dpp4,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",100.0,100.0,80.7163,83.2386,1.0,ENSMUSG00000017697,100.0,100.0,84.4648,85.1316,1.0,ENSMUSG00000035000,ENSG00000196839,ENSG00000197635
4,CDB00005,ADAM10-EPHA3,Adam10 → Epha3,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",100.0,100.0,96.1230,95.9947,1.0,ENSMUSG00000054693,100.0,100.0,96.5412,96.4431,1.0,ENSMUSG00000052504,ENSG00000137845,ENSG00000044524
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3956,CDB03444,Pcdhb21-Pcdhb21,Pcdhb21 → Pcdhb21,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",,,,,,ENSMUSG00000044022,,,,,,ENSMUSG00000044022,,
3957,CDB03445,Pcdhb22-Pcdhb22,Pcdhb22 → Pcdhb22,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",,,,,,ENSMUSG00000073591,,,,,,ENSMUSG00000073591,ENSG00000113248,ENSG00000113248
3958,CDB03446,Pcdhgb8-Pcdhgb8,Pcdhgb8 → Pcdhgb8,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",,,,,,ENSMUSG00000103081,,,,,,ENSMUSG00000103081,,
3959,CDB03447,Saa3-Tlr4,Saa3 → Tlr4,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",,,,,,ENSMUSG00000040026,,,,,,ENSMUSG00000039005,,ENSG00000136869


In [109]:
def extract_hgnc_id(col):
    """Use regular expression to extract the HGNC ID after 'HGNC:'."""
    match = re.search(r'HGNC:(\d+)', col)
    if match:
        return 'HGNC:' +str(match.group(1))
    return None
    
mouse_gene_pair1['Ligand HGNC ID'] = mouse_gene_pair1['Ligand HGNC ID'].apply(extract_hgnc_id)
mouse_gene_pair1['Receptor HGNC ID'] = mouse_gene_pair1['Receptor HGNC ID'].apply(extract_hgnc_id)
mouse_gene_pair1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mouse_gene_pair1['Ligand HGNC ID'] = mouse_gene_pair1['Ligand HGNC ID'].apply(extract_hgnc_id)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mouse_gene_pair1['Receptor HGNC ID'] = mouse_gene_pair1['Receptor HGNC ID'].apply(extract_hgnc_id)


Unnamed: 0,Interaction ID,LR Pair Card,Mouse LR Pair,Ligand HGNC ID,Receptor HGNC ID,Ligand GOC score,Ligand WGA coverage,Ligand % Identity,Ligand Target % Identity,Ligand Orthology Confidence,Ligand Ensembl ID,Receptor GOC score,Receptor WGA coverage,Receptor % Identity,Receptor Target % Identity,Receptor Orthology Confidence,Receptor Ensembl ID,Human Ligand Ensembl ID,Human Receptor Ensembl ID
0,CDB00001,A2M-HSPA5,A2m → Hspa5,HGNC:7,HGNC:5238,75.0,100.0,72.4559,72.4559,1.0,ENSMUSG00000030111,75.0,100.0,98.6239,98.4733,1.0,ENSMUSG00000026864,ENSG00000175899,ENSG00000044574
1,CDB00002,A2M-LRP1,A2m → Lrp1,HGNC:7,HGNC:6692,75.0,100.0,72.4559,72.4559,1.0,ENSMUSG00000030111,100.0,100.0,97.9974,97.9758,1.0,ENSMUSG00000040249,ENSG00000175899,ENSG00000123384
2,CDB00003,ACE-BDKRB2,Ace → Bdkrb2,HGNC:2707,HGNC:1030,100.0,100.0,83.3078,82.9268,1.0,ENSMUSG00000020681,100.0,100.0,79.7954,79.5918,1.0,ENSMUSG00000021070,ENSG00000159640,ENSG00000168398
3,CDB00004,ADA-DPP4,Ada → Dpp4,HGNC:186,HGNC:3009,100.0,100.0,80.7163,83.2386,1.0,ENSMUSG00000017697,100.0,100.0,84.4648,85.1316,1.0,ENSMUSG00000035000,ENSG00000196839,ENSG00000197635
4,CDB00005,ADAM10-EPHA3,Adam10 → Epha3,HGNC:188,HGNC:3387,100.0,100.0,96.1230,95.9947,1.0,ENSMUSG00000054693,100.0,100.0,96.5412,96.4431,1.0,ENSMUSG00000052504,ENSG00000137845,ENSG00000044524
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3956,CDB03444,Pcdhb21-Pcdhb21,Pcdhb21 → Pcdhb21,,,,,,,,ENSMUSG00000044022,,,,,,ENSMUSG00000044022,,
3957,CDB03445,Pcdhb22-Pcdhb22,Pcdhb22 → Pcdhb22,,,,,,,,ENSMUSG00000073591,,,,,,ENSMUSG00000073591,ENSG00000113248,ENSG00000113248
3958,CDB03446,Pcdhgb8-Pcdhgb8,Pcdhgb8 → Pcdhgb8,,,,,,,,ENSMUSG00000103081,,,,,,ENSMUSG00000103081,,
3959,CDB03447,Saa3-Tlr4,Saa3 → Tlr4,,,,,,,,ENSMUSG00000040026,,,,,,ENSMUSG00000039005,,ENSG00000136869


In [85]:
hgnc_df = pd.read_csv("data/HGNC_gene_info_full.tsv", sep="\t", dtype=str)
hgnc_df = hgnc_df.dropna(subset=["uniprot_ids", "ensembl_gene_id"])
# Split uniprot_ids on comma and explode
hgnc_exploded = hgnc_df.assign(uniprot_id=hgnc_df["uniprot_ids"].str.split(",")).explode("uniprot_id")
hgnc_exploded["uniprot_id"] = hgnc_exploded["uniprot_id"].str.strip()

uniprot_to_ensembl = hgnc_exploded.set_index("uniprot_id")["ensembl_gene_id"].to_dict()

# Left join on human_protein
df_merged = df_orthologs.merge(
    hgnc_exploded[["uniprot_id", "ensembl_gene_id", "symbol"]],
    left_on="human_protein",
    right_on="uniprot_id",
    how="left"
)

# Optionally rename
df_merged = df_merged.rename(columns={
    "symbol": "human_gene",
    "ensembl_gene_id": "human_ensembl_gene_id"
}).drop(columns=["uniprot_id"])
df_merged = df_merged.dropna(subset=["human_ensembl_gene_id"])
df_merged.to_csv("data/mmusculus_inParanoid_uniProt_withHGNCAnn.tsv", sep="\t", index=False)

df_merged

Unnamed: 0,cluster_id,human_protein,human_inparalog_score,human_seed_score,mouse_protein,mouse_inparalog_score,mouse_seed_score,bitscore,human_ensembl_gene_id,human_gene
0,1,Q8WZ42,1.0,1.0,A2ASS6,1.000,1.0,60090.0,ENSG00000155657,TTN
1,2,Q8NF91,1.0,1.0,Q6ZWR6,1.000,1.0,14503.0,ENSG00000131018,SYNE1
2,3,Q5VST9,1.0,1.0,A2AAJ9,1.000,1.0,12156.0,ENSG00000154358,OBSCN
4,5,Q03001,1.0,1.0,Q91ZU6,1.000,1.0,10549.0,ENSG00000151914,DST
5,6,Q8WXG9,1.0,1.0,Q8VHN7,1.000,1.0,10049.0,ENSG00000164199,ADGRV1
...,...,...,...,...,...,...,...,...,...,...
20946,17091,A0A1B0GTK5,1.0,1.0,A0A1B0GSI2,0.212,,46.0,ENSG00000225396,FAM236D
20947,17091,A0A1B0GTK5,1.0,1.0,A0A1B0GSB3,0.178,,46.0,ENSG00000225396,FAM236D
20948,17092,Q96LM9,1.0,1.0,E9Q1X6,1.000,1.0,46.0,ENSG00000125975,C20orf173
20952,17096,P0DP42,1.0,1.0,A0A494B9K2,1.000,1.0,41.0,ENSG00000244219,TMEM225B


In [100]:
# Step 1: Load the mapping file
mouse_map = pd.read_csv("data/mouse_uniprot_to_ensembl.tsv", sep="\t", dtype=str)

# Step 2: Merge with df_merged on UniProt ID
# Assuming your UniProt column in df_merged is named 'mouse_protein'
df_merged = df_merged.merge(
    mouse_map,
    left_on="mouse_protein",
    right_on="uniprotswissprot",
    how="left"
)

# Step 3: Rename the new column for clarity (optional)
df_merged = df_merged.rename(columns={"ensembl_gene_id": "mouse_ensembl_gene_id"})

# Step 4: Drop the helper merge column if not needed
df_merged = df_merged.drop(columns=["uniprotswissprot"])

# Optional: Save
df_merged.to_csv("data/df_merged_with_mouse_ensembl.tsv", sep="\t", index=False)
df_merged

Unnamed: 0,cluster_id,human_protein,human_inparalog_score,human_seed_score,mouse_protein,mouse_inparalog_score,mouse_seed_score,bitscore,human_ensembl_gene_id,human_gene,mouse_ensembl_gene_id
0,1,Q8WZ42,1.0,1.0,A2ASS6,1.000,1.0,60090.0,ENSG00000155657,TTN,ENSMUSG00000051747
1,2,Q8NF91,1.0,1.0,Q6ZWR6,1.000,1.0,14503.0,ENSG00000131018,SYNE1,ENSMUSG00000096054
2,3,Q5VST9,1.0,1.0,A2AAJ9,1.000,1.0,12156.0,ENSG00000154358,OBSCN,ENSMUSG00000061462
3,5,Q03001,1.0,1.0,Q91ZU6,1.000,1.0,10549.0,ENSG00000151914,DST,ENSMUSG00000026131
4,6,Q8WXG9,1.0,1.0,Q8VHN7,1.000,1.0,10049.0,ENSG00000164199,ADGRV1,ENSMUSG00000069170
...,...,...,...,...,...,...,...,...,...,...,...
20920,17091,A0A1B0GTK5,1.0,1.0,A0A1B0GSI2,0.212,,46.0,ENSG00000225396,FAM236D,
20921,17091,A0A1B0GTK5,1.0,1.0,A0A1B0GSB3,0.178,,46.0,ENSG00000225396,FAM236D,
20922,17092,Q96LM9,1.0,1.0,E9Q1X6,1.000,1.0,46.0,ENSG00000125975,C20orf173,
20923,17096,P0DP42,1.0,1.0,A0A494B9K2,1.000,1.0,41.0,ENSG00000244219,TMEM225B,


In [112]:
# 1. Prepare df_merged with prefix
df_ligand = df_merged.add_prefix("Ligand_")

# 2. Merge on Human Ligand Ensembl ID (left join)
ligand_merged = mouse_gene_pair1.merge(
    df_ligand,
    left_on="Human Ligand Ensembl ID",
    right_on="Ligand_human_ensembl_gene_id",
    how="left",
    suffixes=('', '_merged')
)
ligand_merged

Unnamed: 0,Interaction ID,LR Pair Card,Mouse LR Pair,Ligand HGNC ID,Receptor HGNC ID,Ligand GOC score,Ligand WGA coverage,Ligand % Identity,Ligand Target % Identity,Ligand Orthology Confidence,...,Ligand_human_protein,Ligand_human_inparalog_score,Ligand_human_seed_score,Ligand_mouse_protein,Ligand_mouse_inparalog_score,Ligand_mouse_seed_score,Ligand_bitscore,Ligand_human_ensembl_gene_id,Ligand_human_gene,Ligand_mouse_ensembl_gene_id
0,CDB00001,A2M-HSPA5,A2m → Hspa5,HGNC:7,HGNC:5238,75.0,100.0,72.4559,72.4559,1.0,...,P01023,1.0,1.0,Q6GQT1,1.0,1.000,2065.0,ENSG00000175899,A2M,ENSMUSG00000030111
1,CDB00002,A2M-LRP1,A2m → Lrp1,HGNC:7,HGNC:6692,75.0,100.0,72.4559,72.4559,1.0,...,P01023,1.0,1.0,Q6GQT1,1.0,1.000,2065.0,ENSG00000175899,A2M,ENSMUSG00000030111
2,CDB00003,ACE-BDKRB2,Ace → Bdkrb2,HGNC:2707,HGNC:1030,100.0,100.0,83.3078,82.9268,1.0,...,P12821,1.0,1.0,P09470,1.0,1.000,2342.0,ENSG00000159640,ACE,ENSMUSG00000020681
3,CDB00004,ADA-DPP4,Ada → Dpp4,HGNC:186,HGNC:3009,100.0,100.0,80.7163,83.2386,1.0,...,P00813,1.0,1.0,P03958,1.0,1.000,620.0,ENSG00000196839,ADA,ENSMUSG00000017697
4,CDB00005,ADAM10-EPHA3,Adam10 → Epha3,HGNC:188,HGNC:3387,100.0,100.0,96.1230,95.9947,1.0,...,O14672,1.0,1.0,O35598,1.0,1.000,1461.0,ENSG00000137845,ADAM10,ENSMUSG00000054693
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9562,CDB03444,Pcdhb21-Pcdhb21,Pcdhb21 → Pcdhb21,,,,,,,,...,,,,,,,,,,
9563,CDB03445,Pcdhb22-Pcdhb22,Pcdhb22 → Pcdhb22,,,,,,,,...,Q9Y5E8,1.0,1.0,Q91XZ8,1.0,0.989,1191.0,ENSG00000113248,PCDHB15,
9564,CDB03446,Pcdhgb8-Pcdhgb8,Pcdhgb8 → Pcdhgb8,,,,,,,,...,,,,,,,,,,
9565,CDB03447,Saa3-Tlr4,Saa3 → Tlr4,,,,,,,,...,,,,,,,,,,


In [114]:
def resolve_ligand(group):
    if group.shape[0] == 1:
        return group
    # Try to keep rows where mouse Ensembl ID also matches
    match = group[group["Ligand_mouse_ensembl_gene_id"] == group["Ligand Ensembl ID"]]
    if len(match) == 1:
        return match
    elif len(match) > 1:
        return match.iloc[[0]]  # arbitrarily pick one if multiple match
    else:
        return group.iloc[[0]]  # fallback to first

# Apply resolver only where duplication exists
resolved = (
    ligand_merged.groupby("Interaction ID", group_keys=False)
    .apply(resolve_ligand)
    .reset_index(drop=True)
)
resolved

  .apply(resolve_ligand)


Unnamed: 0,Interaction ID,LR Pair Card,Mouse LR Pair,Ligand HGNC ID,Receptor HGNC ID,Ligand GOC score,Ligand WGA coverage,Ligand % Identity,Ligand Target % Identity,Ligand Orthology Confidence,...,Ligand_human_protein,Ligand_human_inparalog_score,Ligand_human_seed_score,Ligand_mouse_protein,Ligand_mouse_inparalog_score,Ligand_mouse_seed_score,Ligand_bitscore,Ligand_human_ensembl_gene_id,Ligand_human_gene,Ligand_mouse_ensembl_gene_id
0,CDB00001,A2M-HSPA5,A2m → Hspa5,HGNC:7,HGNC:5238,75.0,100.0,72.4559,72.4559,1.0,...,P01023,1.0,1.0,Q6GQT1,1.0,1.000,2065.0,ENSG00000175899,A2M,ENSMUSG00000030111
1,CDB00002,A2M-LRP1,A2m → Lrp1,HGNC:7,HGNC:6692,75.0,100.0,72.4559,72.4559,1.0,...,P01023,1.0,1.0,Q6GQT1,1.0,1.000,2065.0,ENSG00000175899,A2M,ENSMUSG00000030111
2,CDB00003,ACE-BDKRB2,Ace → Bdkrb2,HGNC:2707,HGNC:1030,100.0,100.0,83.3078,82.9268,1.0,...,P12821,1.0,1.0,P09470,1.0,1.000,2342.0,ENSG00000159640,ACE,ENSMUSG00000020681
3,CDB00004,ADA-DPP4,Ada → Dpp4,HGNC:186,HGNC:3009,100.0,100.0,80.7163,83.2386,1.0,...,P00813,1.0,1.0,P03958,1.0,1.000,620.0,ENSG00000196839,ADA,ENSMUSG00000017697
4,CDB00005,ADAM10-EPHA3,Adam10 → Epha3,HGNC:188,HGNC:3387,100.0,100.0,96.1230,95.9947,1.0,...,O14672,1.0,1.0,O35598,1.0,1.000,1461.0,ENSG00000137845,ADAM10,ENSMUSG00000054693
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3156,CDB03444,Pcdhb21-Pcdhb21,Pcdhb21 → Pcdhb21,,,,,,,,...,,,,,,,,,,
3157,CDB03445,Pcdhb22-Pcdhb22,Pcdhb22 → Pcdhb22,,,,,,,,...,Q9Y5E8,1.0,1.0,Q91XZ8,1.0,0.989,1191.0,ENSG00000113248,PCDHB15,
3158,CDB03446,Pcdhgb8-Pcdhgb8,Pcdhgb8 → Pcdhgb8,,,,,,,,...,,,,,,,,,,
3159,CDB03447,Saa3-Tlr4,Saa3 → Tlr4,,,,,,,,...,,,,,,,,,,


In [116]:
print("mouse_gene_pair1:", mouse_gene_pair1.shape)
print("df_merged:", df_merged.shape)

mouse_gene_pair1: (3961, 19)
df_merged: (20925, 11)


In [122]:
import pandas as pd

# Step 0: Add row ID to preserve original structure
mouse_gene_pair1 = mouse_gene_pair1.reset_index(drop=False).rename(columns={"index": "orig_row"})

# Step 1: Add prefix to merged data
df_ligand = df_merged.add_prefix("Ligand_")

# Step 2: Merge on Human Ligand Ensembl ID (left join, allows multiple matches per row)
ligand_merge = mouse_gene_pair1.merge(
    df_ligand,
    left_on="Human Ligand Ensembl ID",
    right_on="Ligand_human_ensembl_gene_id",
    how="left",
    suffixes=('', '_dup')
)


In [123]:
# Step 3: For each original row, select best match
def resolve_per_row(group):
    # Prefer rows where mouse Ensembl also matches
    match = group[group["Ligand_mouse_ensembl_gene_id"] == group["Ligand Ensembl ID"]]
    if len(match) == 1:
        return match
    elif len(match) > 1:
        return match.iloc[[0]]  # pick first among valid
    elif len(group) > 0:
        return group.iloc[[0]]  # fallback: first available (human matched)
    else:
        return pd.DataFrame([group.iloc[0] * pd.NA])  # unlikely fallback

# Step 4: Apply per original row (not Interaction ID)
ligand_final = (
    ligand_merge.groupby("orig_row", group_keys=False)
    .apply(resolve_per_row)
    .reset_index(drop=True)
    .drop(columns=["orig_row"])
)
ligand_final

  .apply(resolve_per_row)


Unnamed: 0,Interaction ID,LR Pair Card,Mouse LR Pair,Ligand HGNC ID,Receptor HGNC ID,Ligand GOC score,Ligand WGA coverage,Ligand % Identity,Ligand Target % Identity,Ligand Orthology Confidence,...,Ligand_human_protein,Ligand_human_inparalog_score,Ligand_human_seed_score,Ligand_mouse_protein,Ligand_mouse_inparalog_score,Ligand_mouse_seed_score,Ligand_bitscore,Ligand_human_ensembl_gene_id,Ligand_human_gene,Ligand_mouse_ensembl_gene_id
0,CDB00001,A2M-HSPA5,A2m → Hspa5,HGNC:7,HGNC:5238,75.0,100.0,72.4559,72.4559,1.0,...,P01023,1.0,1.0,Q6GQT1,1.0,1.000,2065.0,ENSG00000175899,A2M,ENSMUSG00000030111
1,CDB00002,A2M-LRP1,A2m → Lrp1,HGNC:7,HGNC:6692,75.0,100.0,72.4559,72.4559,1.0,...,P01023,1.0,1.0,Q6GQT1,1.0,1.000,2065.0,ENSG00000175899,A2M,ENSMUSG00000030111
2,CDB00003,ACE-BDKRB2,Ace → Bdkrb2,HGNC:2707,HGNC:1030,100.0,100.0,83.3078,82.9268,1.0,...,P12821,1.0,1.0,P09470,1.0,1.000,2342.0,ENSG00000159640,ACE,ENSMUSG00000020681
3,CDB00004,ADA-DPP4,Ada → Dpp4,HGNC:186,HGNC:3009,100.0,100.0,80.7163,83.2386,1.0,...,P00813,1.0,1.0,P03958,1.0,1.000,620.0,ENSG00000196839,ADA,ENSMUSG00000017697
4,CDB00005,ADAM10-EPHA3,Adam10 → Epha3,HGNC:188,HGNC:3387,100.0,100.0,96.1230,95.9947,1.0,...,O14672,1.0,1.0,O35598,1.0,1.000,1461.0,ENSG00000137845,ADAM10,ENSMUSG00000054693
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3956,CDB03444,Pcdhb21-Pcdhb21,Pcdhb21 → Pcdhb21,,,,,,,,...,,,,,,,,,,
3957,CDB03445,Pcdhb22-Pcdhb22,Pcdhb22 → Pcdhb22,,,,,,,,...,Q9Y5E8,1.0,1.0,Q91XZ8,1.0,0.989,1191.0,ENSG00000113248,PCDHB15,
3958,CDB03446,Pcdhgb8-Pcdhgb8,Pcdhgb8 → Pcdhgb8,,,,,,,,...,,,,,,,,,,
3959,CDB03447,Saa3-Tlr4,Saa3 → Tlr4,,,,,,,,...,,,,,,,,,,


In [124]:
# Step 0: Add row ID again if not already there
if "orig_row" not in mouse_gene_pair1.columns:
    mouse_gene_pair1 = mouse_gene_pair1.reset_index(drop=False).rename(columns={"index": "orig_row"})

# Step 1: Add prefix to df_merged for receptor
df_receptor = df_merged.add_prefix("Receptor_")

# Step 2: Merge on Human Receptor Ensembl ID
receptor_merge = mouse_gene_pair1.merge(
    df_receptor,
    left_on="Human Receptor Ensembl ID",
    right_on="Receptor_human_ensembl_gene_id",
    how="left",
    suffixes=('', '_dup')
)

# Step 3: Resolve best match per row using mouse Ensembl ID
def resolve_receptor_per_row(group):
    match = group[group["Receptor_mouse_ensembl_gene_id"] == group["Receptor Ensembl ID"]]
    if len(match) == 1:
        return match
    elif len(match) > 1:
        return match.iloc[[0]]
    elif len(group) > 0:
        return group.iloc[[0]]
    else:
        return pd.DataFrame([group.iloc[0] * pd.NA])  # safe fallback

# Step 4: Apply
receptor_final = (
    receptor_merge.groupby("orig_row", group_keys=False)
    .apply(resolve_receptor_per_row)
    .reset_index(drop=True)
    .drop(columns=["orig_row"])
)

# Step 5: Check consistency
assert len(receptor_final) == len(mouse_gene_pair1), f"Row mismatch: {len(receptor_final)} != {len(mouse_gene_pair1)}"


  .apply(resolve_receptor_per_row)


In [128]:
mouse_gene_pair1

Unnamed: 0,orig_row,Interaction ID,LR Pair Card,Mouse LR Pair,Ligand HGNC ID,Receptor HGNC ID,Ligand GOC score,Ligand WGA coverage,Ligand % Identity,Ligand Target % Identity,Ligand Orthology Confidence,Ligand Ensembl ID,Receptor GOC score,Receptor WGA coverage,Receptor % Identity,Receptor Target % Identity,Receptor Orthology Confidence,Receptor Ensembl ID,Human Ligand Ensembl ID,Human Receptor Ensembl ID
0,0,CDB00001,A2M-HSPA5,A2m → Hspa5,HGNC:7,HGNC:5238,75.0,100.0,72.4559,72.4559,1.0,ENSMUSG00000030111,75.0,100.0,98.6239,98.4733,1.0,ENSMUSG00000026864,ENSG00000175899,ENSG00000044574
1,1,CDB00002,A2M-LRP1,A2m → Lrp1,HGNC:7,HGNC:6692,75.0,100.0,72.4559,72.4559,1.0,ENSMUSG00000030111,100.0,100.0,97.9974,97.9758,1.0,ENSMUSG00000040249,ENSG00000175899,ENSG00000123384
2,2,CDB00003,ACE-BDKRB2,Ace → Bdkrb2,HGNC:2707,HGNC:1030,100.0,100.0,83.3078,82.9268,1.0,ENSMUSG00000020681,100.0,100.0,79.7954,79.5918,1.0,ENSMUSG00000021070,ENSG00000159640,ENSG00000168398
3,3,CDB00004,ADA-DPP4,Ada → Dpp4,HGNC:186,HGNC:3009,100.0,100.0,80.7163,83.2386,1.0,ENSMUSG00000017697,100.0,100.0,84.4648,85.1316,1.0,ENSMUSG00000035000,ENSG00000196839,ENSG00000197635
4,4,CDB00005,ADAM10-EPHA3,Adam10 → Epha3,HGNC:188,HGNC:3387,100.0,100.0,96.1230,95.9947,1.0,ENSMUSG00000054693,100.0,100.0,96.5412,96.4431,1.0,ENSMUSG00000052504,ENSG00000137845,ENSG00000044524
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3956,3956,CDB03444,Pcdhb21-Pcdhb21,Pcdhb21 → Pcdhb21,,,,,,,,ENSMUSG00000044022,,,,,,ENSMUSG00000044022,,
3957,3957,CDB03445,Pcdhb22-Pcdhb22,Pcdhb22 → Pcdhb22,,,,,,,,ENSMUSG00000073591,,,,,,ENSMUSG00000073591,ENSG00000113248,ENSG00000113248
3958,3958,CDB03446,Pcdhgb8-Pcdhgb8,Pcdhgb8 → Pcdhgb8,,,,,,,,ENSMUSG00000103081,,,,,,ENSMUSG00000103081,,
3959,3959,CDB03447,Saa3-Tlr4,Saa3 → Tlr4,,,,,,,,ENSMUSG00000040026,,,,,,ENSMUSG00000039005,,ENSG00000136869


In [199]:
# Step 0: Add 'orig_row' once, keep it clean
mouse_gene_pair1_indexed = mouse_gene_pair1.reset_index(drop=False).rename(columns={"index": "orig_row"})

### === LIGAND MERGE === ###
df_ligand = df_merged.add_prefix("Ligand_")
ligand_merge = mouse_gene_pair1_indexed.merge(
    df_ligand,
    left_on="Human Ligand Ensembl ID",
    right_on="Ligand_human_ensembl_gene_id",
    how="left"
)

# Ensure orig_row is single column (sometimes merge creates duplicates with suffix)
if isinstance(ligand_merge.columns, pd.MultiIndex):
    ligand_merge.columns = ligand_merge.columns.get_level_values(0)

if ligand_merge.columns.duplicated().any():
    ligand_merge = ligand_merge.loc[:, ~ligand_merge.columns.duplicated()]

def resolve_ligand_row(group):
    match = group[group["Ligand_mouse_ensembl_gene_id"] == group["Ligand Ensembl ID"]]
    if len(match) == 1:
        return match
    elif len(match) > 1:
        return match.iloc[[0]]
    else:
        return group.iloc[[0]]

ligand_final = (
    ligand_merge.groupby("orig_row", group_keys=False)
    .apply(resolve_ligand_row)
    .reset_index(drop=True)
)

### === RECEPTOR MERGE === ###
df_receptor = df_merged.add_prefix("Receptor_")
receptor_merge = ligand_final.merge(
    df_receptor,
    left_on="Human Receptor Ensembl ID",
    right_on="Receptor_human_ensembl_gene_id",
    how="left"
)

# Same cleanup for receptor_merge
if isinstance(receptor_merge.columns, pd.MultiIndex):
    receptor_merge.columns = receptor_merge.columns.get_level_values(0)

if receptor_merge.columns.duplicated().any():
    receptor_merge = receptor_merge.loc[:, ~receptor_merge.columns.duplicated()]

def resolve_receptor_row(group):
    match = group[group["Receptor_mouse_ensembl_gene_id"] == group["Receptor Ensembl ID"]]
    if len(match) == 1:
        return match
    elif len(match) > 1:
        return match.iloc[[0]]
    else:
        return group.iloc[[0]]

final_result = (
    receptor_merge.groupby("orig_row", group_keys=False)
    .apply(resolve_receptor_row)
    .reset_index(drop=True)
    .drop(columns=["orig_row"])
)

assert len(final_result) == len(mouse_gene_pair1), f"Row mismatch: {len(final_result)} != {len(mouse_gene_pair1)}"


  .apply(resolve_ligand_row)
  .apply(resolve_receptor_row)


In [200]:
final_result

Unnamed: 0,Interaction ID,LR Pair Card,Ligand,Receptor,Mouse LR Pair,Ligand HGNC ID,Receptor HGNC ID,Ligand GOC score,Ligand WGA coverage,Ligand % Identity,...,Receptor_human_protein,Receptor_human_inparalog_score,Receptor_human_seed_score,Receptor_mouse_protein,Receptor_mouse_inparalog_score,Receptor_mouse_seed_score,Receptor_bitscore,Receptor_human_ensembl_gene_id,Receptor_human_gene,Receptor_mouse_ensembl_gene_id
0,CDB00001,,Hspa5,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...",A2m → Hspa5,HGNC:7,HGNC:5238,75.0,100.0,72.4559,...,P11021,1.0,1.0,P20029,1.0,1.000,1246.0,ENSG00000044574,HSPA5,ENSMUSG00000026864
1,CDB00002,,Lrp1,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...",A2m → Lrp1,HGNC:7,HGNC:6692,75.0,100.0,72.4559,...,Q07954,1.0,1.0,Q91ZX7,1.0,1.000,6853.0,ENSG00000123384,LRP1,
2,CDB00003,,Bdkrb2,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...",Ace → Bdkrb2,HGNC:2707,HGNC:1030,100.0,100.0,83.3078,...,P30411,1.0,1.0,P32299,1.0,1.000,636.0,ENSG00000168398,BDKRB2,ENSMUSG00000021070
3,CDB00004,,Dpp4,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...",Ada → Dpp4,HGNC:186,HGNC:3009,100.0,100.0,80.7163,...,P27487,1.0,1.0,P28843,1.0,1.000,1365.0,ENSG00000197635,DPP4,ENSMUSG00000035000
4,CDB00005,,Epha3,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...",Adam10 → Epha3,HGNC:188,HGNC:3387,100.0,100.0,96.1230,...,P29320,1.0,1.0,P29319,1.0,1.000,1923.0,ENSG00000044524,EPHA3,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3956,CDB03444,,Pcdhb21,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...",Pcdhb21 → Pcdhb21,,,,,,...,,,,,,,,,,
3957,CDB03445,,Pcdhb22,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...",Pcdhb22 → Pcdhb22,,,,,,...,Q9Y5E8,1.0,1.0,Q91XZ8,1.0,0.989,1191.0,ENSG00000113248,PCDHB15,
3958,CDB03446,,Pcdhgb8,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...",Pcdhgb8 → Pcdhgb8,,,,,,...,,,,,,,,,,
3959,CDB03447,,Tlr4,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...",Saa3 → Tlr4,,,,,,...,O00206,1.0,1.0,Q9QUK6,1.0,1.000,909.0,ENSG00000136869,TLR4,ENSMUSG00000039005


In [191]:
final_result.to_csv("data/human_mouse_merged_ensemblBiomaRt_inParanoid.csv")

In [205]:
final_result= pd.read_csv("data/human_mouse_merged_ensemblBiomaRt_inParanoid.csv")
final_result

Unnamed: 0.1,Unnamed: 0,Interaction ID,Ligand,Receptor,LR Pair Card,Mouse LR Pair,Ligand HGNC ID,Receptor HGNC ID,Ligand GOC score,Ligand WGA coverage,...,Receptor_human_protein,Receptor_human_inparalog_score,Receptor_human_seed_score,Receptor_mouse_protein,Receptor_mouse_inparalog_score,Receptor_mouse_seed_score,Receptor_bitscore,Receptor_human_ensembl_gene_id,Receptor_human_gene,Receptor_mouse_ensembl_gene_id
0,0,CDB00001,A2m,Hspa5,A2M-HSPA5,A2m → Hspa5,HGNC:7,HGNC:5238,75.0,100.0,...,P11021,1.0,1.0,P20029,1.0,1.000,1246.0,ENSG00000044574,HSPA5,ENSMUSG00000026864
1,1,CDB00002,A2m,Lrp1,A2M-LRP1,A2m → Lrp1,HGNC:7,HGNC:6692,75.0,100.0,...,Q07954,1.0,1.0,Q91ZX7,1.0,1.000,6853.0,ENSG00000123384,LRP1,
2,2,CDB00003,Ace,Bdkrb2,ACE-BDKRB2,Ace → Bdkrb2,HGNC:2707,HGNC:1030,100.0,100.0,...,P30411,1.0,1.0,P32299,1.0,1.000,636.0,ENSG00000168398,BDKRB2,ENSMUSG00000021070
3,3,CDB00004,Ada,Dpp4,ADA-DPP4,Ada → Dpp4,HGNC:186,HGNC:3009,100.0,100.0,...,P27487,1.0,1.0,P28843,1.0,1.000,1365.0,ENSG00000197635,DPP4,ENSMUSG00000035000
4,4,CDB00005,Adam10,Epha3,ADAM10-EPHA3,Adam10 → Epha3,HGNC:188,HGNC:3387,100.0,100.0,...,P29320,1.0,1.0,P29319,1.0,1.000,1923.0,ENSG00000044524,EPHA3,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3956,3956,CDB03444,Pcdhb21,Pcdhb21,Pcdhb21-Pcdhb21,Pcdhb21 → Pcdhb21,,,,,...,,,,,,,,,,
3957,3957,CDB03445,Pcdhb22,Pcdhb22,Pcdhb22-Pcdhb22,Pcdhb22 → Pcdhb22,,,,,...,Q9Y5E8,1.0,1.0,Q91XZ8,1.0,0.989,1191.0,ENSG00000113248,PCDHB15,
3958,3958,CDB03446,Pcdhgb8,Pcdhgb8,Pcdhgb8-Pcdhgb8,Pcdhgb8 → Pcdhgb8,,,,,...,,,,,,,,,,
3959,3959,CDB03447,Saa3,Tlr4,Saa3-Tlr4,Saa3 → Tlr4,,,,,...,O00206,1.0,1.0,Q9QUK6,1.0,1.000,909.0,ENSG00000136869,TLR4,ENSMUSG00000039005


In [288]:
final_result.columns

Index(['Unnamed: 0', 'Interaction ID', 'Ligand', 'Receptor', 'LR Pair Card',
       'Mouse LR Pair', 'Ligand HGNC ID', 'Receptor HGNC ID',
       'Ligand GOC score', 'Ligand WGA coverage', 'Ligand % Identity',
       'Ligand Target % Identity', 'Ligand Orthology Confidence',
       'Ligand Ensembl ID', 'Receptor GOC score', 'Receptor WGA coverage',
       'Receptor % Identity', 'Receptor Target % Identity',
       'Receptor Orthology Confidence', 'Receptor Ensembl ID',
       'Human Ligand Ensembl ID', 'Human Receptor Ensembl ID',
       'Ligand_cluster_id', 'Ligand_human_protein',
       'Ligand_human_inparalog_score', 'Ligand_human_seed_score',
       'Ligand_mouse_protein', 'Ligand_mouse_inparalog_score',
       'Ligand_mouse_seed_score', 'Ligand_bitscore',
       'Ligand_human_ensembl_gene_id', 'Ligand_human_gene',
       'Ligand_mouse_ensembl_gene_id', 'Receptor_cluster_id',
       'Receptor_human_protein', 'Receptor_human_inparalog_score',
       'Receptor_human_seed_score', 'Rec

In [206]:
score_cols = [
    ligand_human_inparalog_score,
    receptor_human_inparalog_score,
    ligand_mouse_inparalog_score,
    receptor_mouse_inparalog_score,
    ligand_human_seed_score,
    receptor_human_seed_score,
    ligand_mouse_seed_score,
    receptor_mouse_seed_score,
    ligand_bit_score,
    receptor_bit_score
]

for col in score_cols:
    if col in final_result.columns:
        final_result[col] = pd.to_numeric(final_result[col], errors='coerce')


In [293]:
def summarize_orthologs(human_col, species_col, label,
                        confidence_orth_col=None, confidence_orth_threshold=None,
                        GOC_col=None, GOC_threshold=None,
                        perc_identity_col= None, perc_identity_thres= None,
                        ligand_human_inparalog_score_col=None, ligand_human_inparalog_score_threshold=None,
                        receptor_human_inparalog_score_col=None, receptor_human_inparalog_score_threshold=None,
                        ligand_mouse_inparalog_score_col=None, ligand_mouse_inparalog_score_threshold=None,
                        receptor_mouse_inparalog_score_col=None, receptor_mouse_inparalog_score_threshold=None,
                        ligand_human_seed_score_col=None, ligand_human_seed_score_threshold=None,
                        receptor_human_seed_score_col=None, receptor_human_seed_score_threshold=None,
                        ligand_mouse_seed_score_col=None, ligand_mouse_seed_score_threshold=None,
                        receptor_mouse_seed_score_col=None, receptor_mouse_seed_score_threshold=None,
                        ligand_bit_score_col=None, ligand_bit_score_threshold=None,
                        receptor_bit_score_col=None, receptor_bit_score_threshold=None):
    
    df = final_result.copy()

    # Apply filters one by one if thresholds are given
    filters = [
        (confidence_orth_col, lambda x: x == confidence_orth_threshold),
        (GOC_col, lambda x: x >= GOC_threshold),
        (perc_identity_col, lambda x: x >= perc_identity_thres),
        (ligand_human_inparalog_score_col, lambda x: x >= ligand_human_inparalog_score_threshold),
        (receptor_human_inparalog_score_col, lambda x: x >= receptor_human_inparalog_score_threshold),
        (ligand_mouse_inparalog_score_col, lambda x: x >= ligand_mouse_inparalog_score_threshold),
        (receptor_mouse_inparalog_score_col, lambda x: x >= receptor_mouse_inparalog_score_threshold),
        (ligand_human_seed_score_col, lambda x: x >= ligand_human_seed_score_threshold),
        (receptor_human_seed_score_col, lambda x: x >= receptor_human_seed_score_threshold),
        (ligand_mouse_seed_score_col, lambda x: x >= ligand_mouse_seed_score_threshold),
        (receptor_mouse_seed_score_col, lambda x: x >= receptor_mouse_seed_score_threshold),
        (ligand_bit_score_col, lambda x: x >= ligand_bit_score_threshold),
        (receptor_bit_score_col, lambda x: x >= receptor_bit_score_threshold),
    ]


    # Apply filters and track how many rows were removed
    original_rows = df.shape[0]
    for col, condition in filters:
        if col and condition is not None:
            before = df.shape[0]
            df = df[df[col].apply(condition)]
            after = df.shape[0]
            print(f"Filtered {col}: {before - after} rows removed (remaining: {after})")

    # Compute unique ortholog pairs
    unique_pairs = df[[human_col, species_col]].drop_duplicates()

    # Count mouse orthologs per human gene
    counts = (
        unique_pairs
        .groupby(human_col)[species_col]
        .count()
        .sort_values(ascending=False)
        .reset_index(name='count')
    )

    # Build tag from filters
    filter_tag = label.lower()
    if confidence_orth_threshold is not None:
        filter_tag += f"_conf{confidence_orth_threshold}"
    if GOC_threshold is not None:
        filter_tag += f"_GOCge{GOC_threshold}"
    if ligand_human_inparalog_score_threshold is not None:
        filter_tag += f"_LHISge{ligand_human_inparalog_score_threshold}"
    if receptor_bit_score_threshold is not None:
        filter_tag += f"_RBSge{receptor_bit_score_threshold}"

    counts.to_csv(f"data/human_mouse_orth_count_{filter_tag}.csv", index=False)

    summary_counts = counts['count'].value_counts().sort_index()
    total_human_genes = counts.shape[0]

    # Collect active filters for reporting
    active_filters = []
    if confidence_orth_col and confidence_orth_threshold is not None:
        if isinstance(confidence_orth_threshold, (list, set, tuple)):
            active_filters.append(f"{confidence_orth_col} in {sorted(confidence_orth_threshold)}")
        else:
            active_filters.append(f"{confidence_orth_col} equals '{confidence_orth_threshold}'")
    if GOC_col and GOC_threshold is not None:
        active_filters.append(f"{GOC_col} ≥ {GOC_threshold}")
    if perc_identity_col and perc_identity_thres is not None:
        active_filters.append(f"{perc_identity_col} ≥ {perc_identity_thres}")
    if ligand_human_inparalog_score_col and ligand_human_inparalog_score_threshold is not None:
        active_filters.append(f"{ligand_human_inparalog_score_col} ≥ {ligand_human_inparalog_score_threshold}")
    if receptor_human_inparalog_score_col and receptor_human_inparalog_score_threshold is not None:
        active_filters.append(f"{receptor_human_inparalog_score_col} ≥ {receptor_human_inparalog_score_threshold}")
    if ligand_mouse_inparalog_score_col and ligand_mouse_inparalog_score_threshold is not None:
        active_filters.append(f"{ligand_mouse_inparalog_score_col} ≥ {ligand_mouse_inparalog_score_threshold}")
    if receptor_mouse_inparalog_score_col and receptor_mouse_inparalog_score_threshold is not None:
        active_filters.append(f"{receptor_mouse_inparalog_score_col} ≥ {receptor_mouse_inparalog_score_threshold}")
    if ligand_human_seed_score_col and ligand_human_seed_score_threshold is not None:
        active_filters.append(f"{ligand_human_seed_score_col} ≥ {ligand_human_seed_score_threshold}")
    if receptor_human_seed_score_col and receptor_human_seed_score_threshold is not None:
        active_filters.append(f"{receptor_human_seed_score_col} ≥ {receptor_human_seed_score_threshold}")
    if ligand_mouse_seed_score_col and ligand_mouse_seed_score_threshold is not None:
        active_filters.append(f"{ligand_mouse_seed_score_col} ≥ {ligand_mouse_seed_score_threshold}")
    if receptor_mouse_seed_score_col and receptor_mouse_seed_score_threshold is not None:
        active_filters.append(f"{receptor_mouse_seed_score_col} ≥ {receptor_mouse_seed_score_threshold}")
    if ligand_bit_score_col and ligand_bit_score_threshold is not None:
        active_filters.append(f"{ligand_bit_score_col} ≥ {ligand_bit_score_threshold}")
    if receptor_bit_score_col and receptor_bit_score_threshold is not None:
        active_filters.append(f"{receptor_bit_score_col} ≥ {receptor_bit_score_threshold}")
    
    filter_text = "; ".join(active_filters) if active_filters else "No filters applied"
    
    summary_lines = [
        f"Out of {total_human_genes} unique human {label.lower()} genes:",
        f" - Filters applied: {filter_text}"
    ]

    for orth_count, gene_count in summary_counts.items():
        summary_lines.append(
            f" - {gene_count} human {label.lower()} genes had {orth_count} mouse ortholog(s)"
        )

    return "\n".join(summary_lines)


In [294]:
# Detect columns
# Ligand
confidence_orth_ligand = [col for col in final_result.columns if "Ligand Orthology Confidence" in col][0]
GOC_col_ligand = [col for col in final_result.columns if "Ligand GOC" in col][0]
percIdent_col_ligand = [col for col in final_result.columns if "Ligand % Identity" in col][0]
human_ligand_col = [col for col in final_result.columns if "Ligand HGNC ID" in col][0]
ligand_col = [col for col in final_result.columns if "Ligand" in col][0]
ligand_col = [col for col in final_result.columns if "Ligand" in col][0]
ligand_human_inparalog_score = [col for col in final_result.columns if "Ligand_human_inparalog_score" in col][0]
ligand_mouse_inparalog_score = [col for col in final_result.columns if "Ligand_mouse_inparalog_score" in col][0]
ligand_human_seed_score = [col for col in final_result.columns if "Ligand_human_seed_score" in col][0]
ligand_mouse_seed_score = [col for col in final_result.columns if "Ligand_mouse_seed_score" in col][0]
ligand_bit_score = [col for col in final_result.columns if "Ligand_bitscore" in col][0]

#Receptor
human_receptor_col = [col for col in final_result.columns if "Receptor HGNC ID" in col][0]
confidence_orth_receptor = [col for col in final_result.columns if "Receptor Orthology Confidence" in col][0]
GOC_col_receptor = [col for col in final_result.columns if "Receptor GOC" in col][0]
percIdent_col_receptor = [col for col in final_result.columns if "Receptor % Identity" in col][0]
receptor_col = [col for col in final_result.columns if "Receptor" in col][0]
receptor_human_inparalog_score = [col for col in final_result.columns if "Receptor_human_inparalog_score" in col][0]
receptor_mouse_inparalog_score = [col for col in final_result.columns if "Receptor_mouse_inparalog_score" in col][0]
receptor_human_seed_score = [col for col in final_result.columns if "Receptor_human_seed_score" in col][0]
receptor_mouse_seed_score = [col for col in final_result.columns if "Receptor_mouse_seed_score" in col][0]
receptor_bit_score = [col for col in final_result.columns if "Receptor_bitscore" in col][0]

In [295]:
ligand_summary = summarize_orthologs(
    human_col=human_ligand_col,
    species_col=ligand_col,
    label="Ligand",
    confidence_orth_col=confidence_orth_ligand,
    confidence_orth_threshold=1,
    perc_identity_col = percIdent_col_ligand,
    perc_identity_thres = 60
    # GOC_col=GOC_col_ligand,
    # GOC_threshold=100,
    # ligand_human_inparalog_score_col=ligand_human_inparalog_score,
    # ligand_human_inparalog_score_threshold=1,
    # ligand_human_seed_score_col=ligand_human_seed_score,
    # ligand_human_seed_score_threshold=1,
    # ligand_mouse_seed_score_col=ligand_mouse_seed_score,
    # ligand_mouse_seed_score_threshold=1,
    # ligand_bit_score_col=ligand_bit_score,
    # ligand_bit_score_threshold=40
)

print(ligand_summary)


Filtered Ligand Orthology Confidence: 657 rows removed (remaining: 3304)
Filtered Ligand % Identity: 246 rows removed (remaining: 3058)
Out of 835 unique human ligand genes:
 - Filters applied: Ligand Orthology Confidence equals '1'; Ligand % Identity ≥ 60
 - 814 human ligand genes had 1 mouse ortholog(s)
 - 13 human ligand genes had 2 mouse ortholog(s)
 - 4 human ligand genes had 3 mouse ortholog(s)
 - 4 human ligand genes had 5 mouse ortholog(s)


In [297]:
receptor_summary = summarize_orthologs(
    human_col=human_receptor_col,
    species_col=receptor_col,
    label="Receptor",
    confidence_orth_col=confidence_orth_receptor,
    confidence_orth_threshold=1,
    perc_identity_thres = 60,
    perc_identity_col = percIdent_col_receptor,
    # GOC_col=GOC_col_receptor,
    # GOC_threshold=100,
    # receptor_human_inparalog_score_col=receptor_human_inparalog_score,
    # receptor_human_inparalog_score_threshold=1,
    # receptor_human_seed_score_col=receptor_human_seed_score,
    # receptor_human_seed_score_threshold=1,
    # receptor_mouse_seed_score_col=receptor_mouse_seed_score,
    # receptor_mouse_seed_score_threshold=1,
    # receptor_bit_score_col=receptor_bit_score,
    # receptor_bit_score_threshold=40
)

print(receptor_summary)


Filtered Receptor Orthology Confidence: 779 rows removed (remaining: 3182)
Filtered Receptor % Identity: 107 rows removed (remaining: 3075)
Out of 684 unique human receptor genes:
 - Filters applied: Receptor Orthology Confidence equals '1'; Receptor % Identity ≥ 60
 - 679 human receptor genes had 1 mouse ortholog(s)
 - 4 human receptor genes had 2 mouse ortholog(s)
 - 1 human receptor genes had 3 mouse ortholog(s)


In [21]:
import pandas as pd
import requests
from io import StringIO
from itertools import product

# Download InParanoid prot table
url = "https://inparanoidb.sbc.su.se/download/sqltable/9606&10090&prot"
r = requests.get(url)
r.raise_for_status()

df = pd.read_csv(StringIO(r.text.strip()), sep="\t", header=None)
df.columns = ["cluster_id", "bitscore", "source_file", "inparalog_score", "protein_id", "seed_score"]

# Tag each row by species
def infer_species(src):
    if "9606" in src:
        return "human"
    elif "10090" in src:
        return "mouse"
    return "unknown"

df["species"] = df["source_file"].apply(infer_species)


In [22]:
# Expand ortholog pairs within each cluster
records = []
for cid, grp in df.groupby("cluster_id"):
    humans = grp[grp["species"] == "human"]
    mice = grp[grp["species"] == "mouse"]
    for h, m in product(humans.itertuples(index=False), mice.itertuples(index=False)):
        records.append({
            "cluster_id": cid,
            "human_protein": h.protein_id,
            "human_inparalog_score": h.inparalog_score,
            "human_seed_score": h.seed_score,
            "mouse_protein": m.protein_id,
            "mouse_inparalog_score": m.inparalog_score,
            "mouse_seed_score": m.seed_score,
            "bitscore": (h.bitscore + m.bitscore) / 2  # average for now
        })

df_orthologs = pd.DataFrame(records)


In [24]:
df_orthologs.to_csv("data/inParanoid_mmusculus.csv")

In [30]:
hgnc_df = pd.read_csv("data/HGNC_gene_info_full.tsv", sep="\t", dtype=str)
hgnc_df = hgnc_df.dropna(subset=["uniprot_ids"])
# Split uniprot_ids on comma and explode
hgnc_exploded = hgnc_df.assign(uniprot_id=hgnc_df["uniprot_ids"].str.split(",")).explode("uniprot_id")
hgnc_exploded["uniprot_id"] = hgnc_exploded["uniprot_id"].str.strip()
hgnc_exploded

Unnamed: 0,hgnc_id,symbol,name,locus_group,locus_type,status,location,location_sortable,alias_symbol,alias_name,...,lncrnadb,enzyme_id,intermediate_filament_db,rna_central_id,lncipedia,gtrnadb,agr,mane_select,gencc,uniprot_id
0,HGNC:5,A1BG,alpha-1-B glycoprotein,protein-coding gene,gene with protein product,Approved,19q13.43,19q13.43,,,...,,,,,,,HGNC:5,ENST00000263100.8|NM_130786.4,,P04217
2,HGNC:24086,A1CF,APOBEC1 complementation factor,protein-coding gene,gene with protein product,Approved,10q11.23,10q11.23,ACF|ASP|ACF64|ACF65|APOBEC1CF,,...,,,,,,,HGNC:24086,ENST00000373997.8|NM_014576.4,,Q9NQ94
3,HGNC:7,A2M,alpha-2-macroglobulin,protein-coding gene,gene with protein product,Approved,12p13.31,12p13.31,FWP007|S863-7|CPAMD5,,...,,,,,,,HGNC:7,ENST00000318602.12|NM_000014.6,HGNC:7,P01023
5,HGNC:23336,A2ML1,alpha-2-macroglobulin like 1,protein-coding gene,gene with protein product,Approved,12p13.31,12p13.31,FLJ25179|p170,,...,,,,,,,HGNC:23336,ENST00000299698.12|NM_144670.6,HGNC:23336,A8K2U0
9,HGNC:30005,A3GALT2,"alpha 1,3-galactosyltransferase 2",protein-coding gene,gene with protein product,Approved,1p35.1,01p35.1,IGBS3S|IGB3S,iGb3 synthase|isoglobotriaosylceramide synthase,...,,,,,,,HGNC:30005,ENST00000442999.3|NM_001080438.1,,U3KPV4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44099,HGNC:3556,FABP2,fatty acid binding protein 2,protein-coding gene,gene with protein product,Approved,4q26,04q26,I-FABP,,...,,,,,,,HGNC:3556,ENST00000274024.4|NM_000134.4,,P12104
44100,HGNC:3557,FABP3,fatty acid binding protein 3,protein-coding gene,gene with protein product,Approved,1p35.2,01p35.2,H-FABP|O-FABP,mammary-derived growth inhibitor,...,,,,,,,HGNC:3557,ENST00000373713.7|NM_004102.5,,P05413
44102,HGNC:3559,FABP4,fatty acid binding protein 4,protein-coding gene,gene with protein product,Approved,8q21.13,08q21.13,A-FABP|aP2,adipocyte fatty acid binding protein,...,,,,,,,HGNC:3559,ENST00000256104.5|NM_001442.3,HGNC:3559,P15090
44103,HGNC:3560,FABP5,fatty acid binding protein 5,protein-coding gene,gene with protein product,Approved,8q21.13,08q21.13,E-FABP|PA-FABP|KFABP,,...,,,,,,,HGNC:3560,ENST00000297258.11|NM_001444.3,,Q01469


In [31]:
# Left join on human_protein
df_merged = df_orthologs.merge(
    hgnc_exploded[["uniprot_id", "hgnc_id", "symbol"]],
    left_on="human_protein",
    right_on="uniprot_id",
    how="left"
)

# Optionally rename
df_merged = df_merged.rename(columns={
    "symbol": "human_gene",
    "hgnc_id": "human_hgnc_id"
}).drop(columns=["uniprot_id"])
df_merged = df_merged.dropna(subset=["human_hgnc_id"])
df_merged

Unnamed: 0,cluster_id,human_protein,human_inparalog_score,human_seed_score,mouse_protein,mouse_inparalog_score,mouse_seed_score,bitscore,human_hgnc_id,human_gene
0,1,Q8WZ42,1.0,1.0,A2ASS6,1.000,1.0,60090.0,HGNC:12403,TTN
1,2,Q8NF91,1.0,1.0,Q6ZWR6,1.000,1.0,14503.0,HGNC:17089,SYNE1
2,3,Q5VST9,1.0,1.0,A2AAJ9,1.000,1.0,12156.0,HGNC:15719,OBSCN
4,5,Q03001,1.0,1.0,Q91ZU6,1.000,1.0,10549.0,HGNC:1090,DST
5,6,Q8WXG9,1.0,1.0,Q8VHN7,1.000,1.0,10049.0,HGNC:17416,ADGRV1
...,...,...,...,...,...,...,...,...,...,...
20948,17091,A0A1B0GTK5,1.0,1.0,A0A1B0GSI2,0.212,,46.0,HGNC:52642,FAM236D
20949,17091,A0A1B0GTK5,1.0,1.0,A0A1B0GSB3,0.178,,46.0,HGNC:52642,FAM236D
20950,17092,Q96LM9,1.0,1.0,E9Q1X6,1.000,1.0,46.0,HGNC:16166,C20orf173
20954,17096,P0DP42,1.0,1.0,A0A494B9K2,1.000,1.0,41.0,HGNC:53075,TMEM225B


In [35]:
interaction_id_col = [col for col in mouse_gene_pair1.columns if "Interaction ID" in col][0]

In [36]:
human_ligand_col = [col for col in mouse_gene_pair1.columns if "Human Ligand" in col][0]
ligand_col = [col for col in mouse_gene_pair1.columns if "Ligand" in col][0]

In [37]:
human_receptor_col = [col for col in mouse_gene_pair1.columns if "Human Receptor" in col][0]
receptor_col = [col for col in mouse_gene_pair1.columns if "Receptor" in col][0]

In [52]:
GOC_col = [col for col in mouse_gene_pair1.columns if "GOC" in col][0]
mouse_gene_pair1[GOC_col]

0        75.0
1        75.0
2       100.0
3       100.0
4       100.0
        ...  
3956      NaN
3957      NaN
3958      NaN
3959      NaN
3960      NaN
Name: Ligand GOC score, Length: 3961, dtype: float64

In [58]:
hgnc_df = pd.read_csv("data/HGNC_gene_info_full.tsv", sep="\t", dtype=str)
hgnc_df = hgnc_df.dropna(subset=["uniprot_ids", "ensembl_gene_id"])
# Split uniprot_ids on comma and explode
hgnc_exploded = hgnc_df.assign(uniprot_id=hgnc_df["uniprot_ids"].str.split(",")).explode("uniprot_id")
hgnc_exploded["uniprot_id"] = hgnc_exploded["uniprot_id"].str.strip()

uniprot_to_ensembl = hgnc_exploded.set_index("uniprot_id")["ensembl_gene_id"].to_dict()

In [61]:
hgnc_df = pd.read_csv("data/HGNC_gene_info_full.tsv", sep="\t", dtype=str)
hgnc_df = hgnc_df.dropna(subset=["uniprot_ids", "ensembl_gene_id"])
# Split uniprot_ids on comma and explode
hgnc_exploded = hgnc_df.assign(uniprot_id=hgnc_df["uniprot_ids"].str.split(",")).explode("uniprot_id")
hgnc_exploded["uniprot_id"] = hgnc_exploded["uniprot_id"].str.strip()

uniprot_to_ensembl = hgnc_exploded.set_index("uniprot_id")["ensembl_gene_id"].to_dict()

# Left join on human_protein
df_merged = df_orthologs.merge(
    hgnc_exploded[["uniprot_id", "ensembl_gene_id", "symbol"]],
    left_on="human_protein",
    right_on="uniprot_id",
    how="left"
)

# Optionally rename
df_merged = df_merged.rename(columns={
    "symbol": "human_gene",
    "ensembl_gene_id": "human_ensembl_gene_id"
}).drop(columns=["uniprot_id"])
df_merged = df_merged.dropna(subset=["human_ensembl_gene_id"])
df_merged.to_csv("data/mmusculus_inParanoid_uniProt_withHGNCAnn.tsv", sep="\t", index=False)

In [62]:
df_merged

Unnamed: 0,cluster_id,human_protein,human_inparalog_score,human_seed_score,mouse_protein,mouse_inparalog_score,mouse_seed_score,bitscore,human_ensembl_gene_id,human_gene
0,1,Q8WZ42,1.0,1.0,A2ASS6,1.000,1.0,60090.0,ENSG00000155657,TTN
1,2,Q8NF91,1.0,1.0,Q6ZWR6,1.000,1.0,14503.0,ENSG00000131018,SYNE1
2,3,Q5VST9,1.0,1.0,A2AAJ9,1.000,1.0,12156.0,ENSG00000154358,OBSCN
4,5,Q03001,1.0,1.0,Q91ZU6,1.000,1.0,10549.0,ENSG00000151914,DST
5,6,Q8WXG9,1.0,1.0,Q8VHN7,1.000,1.0,10049.0,ENSG00000164199,ADGRV1
...,...,...,...,...,...,...,...,...,...,...
20946,17091,A0A1B0GTK5,1.0,1.0,A0A1B0GSI2,0.212,,46.0,ENSG00000225396,FAM236D
20947,17091,A0A1B0GTK5,1.0,1.0,A0A1B0GSB3,0.178,,46.0,ENSG00000225396,FAM236D
20948,17092,Q96LM9,1.0,1.0,E9Q1X6,1.000,1.0,46.0,ENSG00000125975,C20orf173
20952,17096,P0DP42,1.0,1.0,A0A494B9K2,1.000,1.0,41.0,ENSG00000244219,TMEM225B


In [68]:
def summarize_orthologs(human_col, species_col, label,
                        confidence_orth_col=None, confidence_orth_threshold=None,
                        GOC_col=None, GOC_threshold=None):
    df = mouse_gene_pair1.copy()

    if confidence_orth_col and confidence_orth_threshold is not None:
        df = df[df[confidence_orth_col] == confidence_orth_threshold]

    if GOC_col and GOC_threshold is not None:
        df = df[df[GOC_col] >= GOC_threshold]  # Use >= instead of ==

    unique_pairs = df[[human_col, species_col]].drop_duplicates()

    counts = (
        unique_pairs
        .groupby(human_col)[species_col]
        .count()
        .sort_values(ascending=False)
        .reset_index(name='count')
    )

    filter_tag = label.lower()
    if confidence_orth_threshold is not None:
        filter_tag += f"_conf{confidence_orth_threshold}"
    if GOC_threshold is not None:
        filter_tag += f"_GOCge{GOC_threshold}"

    counts.to_csv(f"data/human_mouse_orth_count_{filter_tag}.csv", index=False)

    summary_counts = counts['count'].value_counts().sort_index()
    total_human_genes = counts.shape[0]

    summary_lines = [
        f"Out of {total_human_genes} unique human {label.lower()} genes "
        f"(Orthology Confidence = {confidence_orth_threshold}, GOC ≥ {GOC_threshold}):"
    ]
    for orth_count, gene_count in summary_counts.items():
        summary_lines.append(
            f" - {gene_count} human {label.lower()} genes had {orth_count} mouse ortholog(s)"
        )

    return "\n".join(summary_lines)


# Detect columns
confidence_orth_ligand = [col for col in mouse_gene_pair1.columns if "Ligand Orthology Confidence" in col][0]
GOC_col_ligand = [col for col in mouse_gene_pair1.columns if "Ligand GOC" in col][0]

confidence_orth_receptor = [col for col in mouse_gene_pair1.columns if "Receptor Orthology Confidence" in col][0]
GOC_col_receptor = [col for col in mouse_gene_pair1.columns if "Receptor GOC" in col][0]

# Generate summaries
ligand_summary = summarize_orthologs(
    human_ligand_col, ligand_col, "Ligand",
    confidence_orth_col=confidence_orth_ligand, confidence_orth_threshold=None,
    GOC_col=GOC_col_ligand, GOC_threshold=0
)

receptor_summary = summarize_orthologs(
    human_receptor_col, receptor_col, "Receptor",
    confidence_orth_col=confidence_orth_receptor, confidence_orth_threshold=None,
    GOC_col=GOC_col_receptor, GOC_threshold=0
)

# Print
print(ligand_summary)
print()
print(receptor_summary)

In [69]:
mouse_gene_pair1[GOC_col_ligand]

0        75.0
1        75.0
2       100.0
3       100.0
4       100.0
        ...  
3956      NaN
3957      NaN
3958      NaN
3959      NaN
3960      NaN
Name: Ligand GOC score, Length: 3961, dtype: float64

In [74]:

# Detect columns
confidence_orth_ligand = [col for col in mouse_gene_pair1.columns if "Ligand Orthology Confidence" in col][0]
GOC_col_ligand = [col for col in mouse_gene_pair1.columns if "Ligand GOC" in col][0]

confidence_orth_receptor = [col for col in mouse_gene_pair1.columns if "Receptor Orthology Confidence" in col][0]
GOC_col_receptor = [col for col in mouse_gene_pair1.columns if "Receptor GOC" in col][0]

# Generate summaries
ligand_summary = summarize_orthologs(
    human_ligand_col, ligand_col, "Ligand",
    confidence_orth_col=confidence_orth_ligand, confidence_orth_threshold=None,
    GOC_col=GOC_col_ligand, GOC_threshold=0
)

receptor_summary = summarize_orthologs(
    human_receptor_col, receptor_col, "Receptor",
    confidence_orth_col=confidence_orth_receptor, confidence_orth_threshold=None,
    GOC_col=GOC_col_receptor, GOC_threshold=0
)

# Print
print(ligand_summary)
print()
print(receptor_summary)

Out of 985 unique human ligand genes (Orthology Confidence = None, GOC ≥ 0):
 - 925 human ligand genes had 1 mouse ortholog(s)
 - 31 human ligand genes had 2 mouse ortholog(s)
 - 5 human ligand genes had 3 mouse ortholog(s)
 - 2 human ligand genes had 4 mouse ortholog(s)
 - 5 human ligand genes had 5 mouse ortholog(s)
 - 3 human ligand genes had 6 mouse ortholog(s)
 - 2 human ligand genes had 7 mouse ortholog(s)
 - 12 human ligand genes had 14 mouse ortholog(s)

Out of 780 unique human receptor genes (Orthology Confidence = None, GOC ≥ 0):
 - 742 human receptor genes had 1 mouse ortholog(s)
 - 15 human receptor genes had 2 mouse ortholog(s)
 - 7 human receptor genes had 3 mouse ortholog(s)
 - 3 human receptor genes had 4 mouse ortholog(s)
 - 3 human receptor genes had 5 mouse ortholog(s)
 - 4 human receptor genes had 6 mouse ortholog(s)
 - 6 human receptor genes had 7 mouse ortholog(s)


In [42]:
len(pd.unique(mouse_gene_pair1[human_receptor_col]))

781