# Python Notebook

In [None]:
def add_row(change):
    global gene_pair
    # Add a new row at the top with None values
    new_row = {col: None for col in gene_pair.columns}
    gene_pair = pd.DataFrame([new_row] + gene_pair.to_dict(orient="records"))
    update_table()

# Function to remove the last row of the dataframe
def remove_row(change):
    global gene_pair
    if len(gene_pair) > 0:
        gene_pair = gene_pair[:-1]  # Remove the last row
        update_table()

In [None]:
## Function to create Ligand-Receptor pair cards

import os
import jinja2
import sys
import pandas as pd
import numpy as np

sys.path.append(os.path.abspath("src"))  
import fetchGSheet
from createDataTable import gene_pair0

# Paths
TEMPLATE_PATH = 'HTML/cardTemplate.html'
OUTPUT_DIR = 'data/cards/'

def load_template(template_path):
    """Load Jinja2 template from a file."""
    with open(template_path, 'r') as file:
        return jinja2.Template(file.read())

# Function to convert the links
def convert_hgnc_url(col):
   # Extract the HGNC ID from the original URL
    hgnc_id = col.split("HGNC:")[1].split('"')[0]  # Extract the ID number (e.g., "31702")
    # Extract the visible text inside the <a> tag
    visible_text = "genecard.org" # simplify col.split(">")[1].split("<")[0]  # Extract "HGNC:31702"
    # Construct the new link, keeping the text intact
    new_link = f'<a href="https://www.genecards.org/cgi-bin/carddisp.pl?id_type=hgnc&id={hgnc_id}" target="_blank">{visible_text}</a>'
    return new_link


def convert_hgnc_url_disease(col):
   # Extract the HGNC ID from the original URL
    hgnc_id = col.split("HGNC:")[1].split('"')[0]  # Extract the ID number (e.g., "31702")
    # Extract the visible text inside the <a> tag
    visible_text = "see here" #col.split(">")[1].split("<")[0]  # Extract "HGNC:31702"
    # Construct the new link, keeping the text intact
    new_link = f'<a href="https://www.genecards.org/cgi-bin/carddisp.pl?id_type=hgnc&id={hgnc_id}#diseases" target="_blank">{visible_text}</a>'
    return new_link
    
def convert_hgnc_url_exp(col):
   # Extract the HGNC ID from the original URL
    hgnc_id = col.split("HGNC:")[1].split('"')[0]  # Extract the ID number (e.g., "31702")
    # Extract the visible text inside the <a> tag
    visible_text = "see here" #col.split(">")[1].split("<")[0]  # Extract "HGNC:31702"
    # Construct the new link, keeping the text intact
    new_link = f'<a href="https://www.genecards.org/cgi-bin/carddisp.pl?id_type=hgnc&id={hgnc_id}#expression" target="_blank">{visible_text}</a>'
    return new_link

def prepare_dataframes(gene_pair0):
    """Prepare interaction, ligand, and receptor dataframes."""
    DBlength = len(gene_pair0)
    gene_pair0["Interaction ID"] = [f"CDB{str(i).zfill(4)}" for i in range(1, DBlength + 1)]
    gene_pair0["Interaction Type"] = [
        f'{ligand} {ligandLocation} ligand binds to {receptor} {receptorLocation} receptor'
        for ligand, ligandLocation, receptor, receptorLocation in zip(
            gene_pair0["Ligand"], gene_pair0["Ligand location"],
            gene_pair0["Receptor"], gene_pair0["Receptor location"]
        )
    ]
    interaction_card = gene_pair0[["Interaction ID", "Human LR Pair", "Interaction Type", "Perplexity", "PMID support"]]
    interaction_card["Perplexity"] = interaction_card["Perplexity"].str.replace('size=30', 'size=80')

    pop_up_info_lim = fetchGSheet.pop_up_info[
        ["Approved symbol", "Alias symbol", "Previous symbol", "Date symbol changed"]
    ].drop_duplicates(subset="Approved symbol", keep="first")
    
    ligand_card = gene_pair0[["Human LR Pair", "Ligand", "Ligand name", "Ligand HGNC ID", "Ligand MGI ID", "Ligand RGD ID", "Ligand location"]].merge(
        pop_up_info_lim, how='left', left_on='Ligand', right_on='Approved symbol'
    ).drop_duplicates(subset='Human LR Pair', keep="first").drop(columns=["Ligand", "Approved symbol"])

    ligand_card_1 = ligand_card[["Human LR Pair", "Alias symbol", "Date symbol changed", "Ligand name"]] 
    ligand_card_2 = ligand_card[["Human LR Pair", "Ligand HGNC ID", "Ligand location"]] 
    # convert links
    ligand_card_2["HGNC gene card"] = ligand_card_2["Ligand HGNC ID"].apply(convert_hgnc_url)
    ligand_card_2["Disease relevance"] = ligand_card_2["Ligand HGNC ID"].apply(convert_hgnc_url_disease)
    ligand_card_2["Expression Profile"] = ligand_card_2["Ligand HGNC ID"].apply(convert_hgnc_url_exp)
    ligand_card_2 = ligand_card_2[["Human LR Pair", "Ligand HGNC ID", "HGNC gene card", "Disease relevance", "Expression Profile", "Ligand location"]]       

          
    receptor_card = gene_pair0[["Human LR Pair", "Receptor", "Receptor name", "Receptor HGNC ID", "Receptor MGI ID", "Receptor RGD ID", "Receptor location"]].merge(
        pop_up_info_lim, how='left', left_on='Receptor', right_on='Approved symbol'
    ).drop_duplicates(subset='Human LR Pair', keep="first").drop(columns=["Receptor", "Approved symbol"])
    
    receptor_card_1 = receptor_card[["Human LR Pair", "Alias symbol", "Date symbol changed", "Receptor name"]] 
    receptor_card_2 = receptor_card[["Human LR Pair", "Receptor HGNC ID", "Receptor location"]] 
    receptor_card_2["HGNC gene card"] = receptor_card_2["Receptor HGNC ID"].apply(convert_hgnc_url)
    receptor_card_2["Disease relevance"] = receptor_card_2["Receptor HGNC ID"].apply(convert_hgnc_url_disease)
    receptor_card_2["Expression Profile"] = receptor_card_2["Receptor HGNC ID"].apply(convert_hgnc_url_exp)
    receptor_card_2 = receptor_card_2[["Human LR Pair", "Receptor HGNC ID", "HGNC gene card", "Disease relevance", "Expression Profile", "Receptor location"]]       

    return interaction_card, ligand_card_1, ligand_card_2,receptor_card_1, receptor_card_2

def generate_html_files(template, interaction_card, ligand_card_1, receptor_card_1, ligand_card_2, receptor_card_2, output_dir):
    """Generate HTML files for each Human LR Pair."""
    column_values = interaction_card["Human LR Pair"].dropna().unique()
    os.makedirs(output_dir, exist_ok=True)
    
    for value in column_values:
        value1, value2 = value.split()
        row0 = interaction_card[interaction_card['Human LR Pair'] == value]
        row1 = ligand_card_1[ligand_card_1['Human LR Pair'] == value]
        row2 = receptor_card_1[receptor_card_1['Human LR Pair'] == value]
        row3 = ligand_card_2[ligand_card_2['Human LR Pair'] == value]
        row4 = receptor_card_2[receptor_card_2['Human LR Pair'] == value]
        

        table0_data = row0.drop('Human LR Pair', axis=1).to_dict(orient='records')[0] if not row0.empty else {}
        table1_data = row1.drop('Human LR Pair', axis=1).to_dict(orient='records')[0] if not row1.empty else {}
        table2_data = row2.drop('Human LR Pair', axis=1).to_dict(orient='records')[0] if not row2.empty else {}
        table3_data = row3.drop('Human LR Pair', axis=1).to_dict(orient='records')[0] if not row3.empty else {}
        table4_data = row4.drop('Human LR Pair', axis=1).to_dict(orient='records')[0] if not row4.empty else {}

        rendered_content = template.render(
            value1=value1,
            value2=value2,
            table0_data=table0_data,
            table1_data=table1_data,
            table2_data=table2_data,
            table3_data=table3_data,
            table4_data=table4_data
        )
        
        output_file = os.path.join(output_dir, f"{value1} {value2}.html")
        with open(output_file, 'w') as file:
            file.write(rendered_content)

# Main execution
if __name__ == "__main__":
    template = load_template(TEMPLATE_PATH)
    interaction_card, ligand_card_1, receptor_card_1, ligand_card_2, receptor_card_2 = prepare_dataframes(gene_pair0)
    generate_html_files(template, interaction_card, ligand_card_1, receptor_card_1, ligand_card_2, receptor_card_2, OUTPUT_DIR)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  interaction_card["Perplexity"] = interaction_card["Perplexity"].str.replace('size=30', 'size=80')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ligand_card_2["HGNC gene card"] = ligand_card_2["Ligand HGNC ID"].apply(convert_hgnc_url)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ligand_card_2["Di

In [11]:
## Function to prepare datatables (cleaning and hyperlinking, adding tool tips, etc) input for the database qmds
import sys, os
sys.path.append(os.path.abspath("src"))  
from itables import init_notebook_mode
import pandas as pd
from itables import show
from itables import options
from IPython.display import HTML, display
import numpy as np
import fetchGSheet 
import warnings

# Suppress SettingWithCopyWarning
warnings.simplefilter("ignore", category=UserWarning)


# Select only the relevant columns from pop_up_info

pop_up_info = fetchGSheet.pop_up_info.rename(columns={"Mouse genome informatics (MGI) ID": "MGI ID", 
                                                      "Zebrafish genome database (ZFIN) ID": "ZFIN ID"})

pop_up_info_lim = pop_up_info[["Approved symbol", "Approved name", "MGI ID", "ZFIN ID"]]
pop_up_info_lim = pop_up_info_lim.drop_duplicates(subset="Approved symbol", keep="first")

# Drop columns where all values are NA in gene_pair
gene_pair = fetchGSheet.gene_pair.dropna(axis=1, how='all')
# Fetch species IDs from the dataset
hgnc_id = [col for col in gene_pair.columns if "HGNC ID" in col]
hgnc_id = pd.concat([gene_pair[col] for col in hgnc_id]).unique()

# Rename columns for better clarity
gene_pair = gene_pair.rename(columns={
    "Ligand receptor pair": "Human LR Pair",
    "Ligand gene symbol": "Ligand",
    "Receptor gene symbol": "Receptor",
    "Perplexity link": "Perplexity",
    "Source": "Interaction Source"
})

# Merge gene_pair with pop_up_info_lim for Ligand(L)
gene_pair = gene_pair.merge(pop_up_info_lim, how='left', left_on='Ligand', right_on='Approved symbol')

gene_pair = gene_pair.rename(columns={"Approved name": "Ligand name", 
                                     "MGI ID": "Ligand MGI ID",
                                     "ZFIN ID": "Ligand ZFIN ID"},
                            )

# Add MGI annotation
MGI_info = pd.read_csv("data/MGI_ID_biomart.csv")
gene_pair = gene_pair.merge(MGI_info, how='left', left_on='Ligand MGI ID', right_on='MGI ID')

# Add ZFIN annotation
ZFIN_info = pd.read_csv("data/ZFIN_ID_biomart.csv")
ZFIN_info['ZFIN ID'] = "ZFIN:" + ZFIN_info['ZFIN ID'].astype(str)
gene_pair = gene_pair.merge(ZFIN_info, how='left', left_on='Ligand ZFIN ID', right_on='ZFIN ID')

# Add ZFIN id and symbol
ZFIN_info = pd.read_csv("data/ZFIN_ID_human_orthos.txt", sep="\t", skiprows=1)
ZFIN_info = ZFIN_info[['ZFIN ID', 'ZFIN Symbol', 'ZFIN Name', 'HGNC ID']]
ZFIN_info = ZFIN_info.dropna(subset=['HGNC ID'])
ZFIN_info = ZFIN_info.drop_duplicates(subset=['HGNC ID'])
ZFIN_info['HGNC ID'] = ZFIN_info['HGNC ID'].apply(lambda x: f'HGNC:{int(x)}')


gene_pair = gene_pair.drop(columns=["ZFIN ID", "MGI ID", "HGNC ID"])

gene_pair = gene_pair.rename(columns={
                                     "MGI name": "Mouse Ligand", 
                                     "ZFIN name": "Zebrafish Ligand",
                                     "ZFIN ID": "Ligand ZFIN ID",
                                     "ZFIN Symbol": "Zebrafish Ligand",
                                     "ZFIN Name": "Zebrafish Ligand name"}
                            )

gene_pair = gene_pair.merge(pop_up_info_lim, how='left', left_on='Receptor', right_on='Approved symbol')
gene_pair = gene_pair.merge(ZFIN_info, how='left', left_on='Ligand HGNC ID', right_on='HGNC ID')
gene_pair = gene_pair.rename(columns={"Approved name": "Receptor name",
                                      "MGI ID": "Receptor MGI ID",
                                      "ZFIN ID": "Receptor ZFIN ID"}
                            )

# Add MGI name
gene_pair = gene_pair.merge(MGI_info, how='left', left_on='Receptor MGI ID', right_on='MGI ID')
gene_pair = gene_pair.merge(ZFIN_info, how='left', left_on='Receptor ZFIN ID', right_on='ZFIN ID')


KeyError: "['ZFIN ID'] not in index"

In [1]:
## Function to prepare datatables (cleaning and hyperlinking, adding tool tips, etc) input for the database qmds
import sys, os
sys.path.append(os.path.abspath("src"))  
from itables import init_notebook_mode
import pandas as pd
from itables import show
from itables import options
from IPython.display import HTML, display
import numpy as np
import fetchGSheet 
import warnings

# Suppress SettingWithCopyWarning
warnings.simplefilter("ignore", category=UserWarning)


# Select only the relevant columns from pop_up_info

pop_up_info = fetchGSheet.pop_up_info.rename(columns={"Mouse genome informatics (MGI) ID": "MGI ID", 
                                                      "Rat genome database (RGD) ID": "RGD ID"})

pop_up_info_lim = pop_up_info[["Approved symbol", "Approved name", "MGI ID", "RGD ID"]]
pop_up_info_lim = pop_up_info_lim.drop_duplicates(subset="Approved symbol", keep="first")

# Drop columns where all values are NA in gene_pair
gene_pair = fetchGSheet.gene_pair.dropna(axis=1, how='all')
# Fetch species IDs from the dataset
hgnc_id = [col for col in gene_pair.columns if "HGNC ID" in col]
hgnc_id = pd.concat([gene_pair[col] for col in hgnc_id]).unique()

# Rename columns for better clarity
gene_pair = gene_pair.rename(columns={
    "Ligand receptor pair": "Human LR Pair",
    "Ligand gene symbol": "Ligand",
    "Receptor gene symbol": "Receptor",
    "Perplexity link": "Perplexity",
    "Source": "Interaction Source"
})

# Merge gene_pair with pop_up_info_lim for Ligand(L)
gene_pair = gene_pair.merge(pop_up_info_lim, how='left', left_on='Ligand', right_on='Approved symbol')

gene_pair = gene_pair.rename(columns={"Approved name": "Ligand name", 
                                     "MGI ID": "Ligand MGI ID",
                                     "RGD ID": "Ligand RGD ID"},
                            )

# Add MGI annotation
MGI_info = pd.read_csv("data/MGI_ID_biomart.csv")
gene_pair = gene_pair.merge(MGI_info, how='left', left_on='Ligand MGI ID', right_on='MGI ID')

# Add RGD annotation
RGD_info = pd.read_csv("data/RGD_ID_biomart.csv")
RGD_info['RGD ID'] = "RGD:" + RGD_info['RGD ID'].astype(str)
gene_pair = gene_pair.merge(RGD_info, how='left', left_on='Ligand RGD ID', right_on='RGD ID')

# Add ZFIN id and symbol
ZFIN_info = pd.read_csv("data/ZFIN_ID_human_orthos.txt", sep="\t", skiprows=1)
ZFIN_info = ZFIN_info[['ZFIN ID', 'ZFIN Symbol', 'ZFIN Name', 'HGNC ID']]

ZFIN_info = ZFIN_info.dropna(subset=['HGNC ID'])
ZFIN_info = ZFIN_info.drop_duplicates(subset=['HGNC ID'])
ZFIN_info['HGNC ID'] = ZFIN_info['HGNC ID'].apply(lambda x: f'HGNC:{int(x)}')
gene_pair = gene_pair.merge(ZFIN_info, how='left', left_on='Ligand HGNC ID', right_on='HGNC ID')

gene_pair = gene_pair.drop(columns=["RGD ID", "MGI ID", "HGNC ID"])

gene_pair = gene_pair.rename(columns={
                                     "MGI name": "Mouse Ligand", 
                                     "RGD name": "Rat Ligand",
                                     "ZFIN ID": "Ligand ZFIN ID",
                                     "ZFIN Symbol": "Zebrafish Ligand",
                                     "ZFIN Name": "Zebrafish Ligand name"}
                            )

gene_pair = gene_pair.merge(pop_up_info_lim, how='left', left_on='Receptor', right_on='Approved symbol')
gene_pair = gene_pair.rename(columns={"Approved name": "Receptor name",
                                      "MGI ID": "Receptor MGI ID",
                                      "RGD ID": "Receptor RGD ID"}
                            )

# Add MGI name
gene_pair = gene_pair.merge(MGI_info, how='left', left_on='Receptor MGI ID', right_on='MGI ID')
gene_pair = gene_pair.merge(RGD_info, how='left', left_on='Receptor RGD ID', right_on='RGD ID')
gene_pair = gene_pair.merge(ZFIN_info, how='left', left_on='Receptor HGNC ID', right_on='HGNC ID')
gene_pair = gene_pair.drop(columns=["RGD ID", "MGI ID", "HGNC ID"])

gene_pair = gene_pair.rename(columns={
                                     "MGI name": "Mouse Receptor", 
                                     "RGD name": "Rat Receptor",
                                     "ZFIN ID": "Receptor ZFIN ID",
                                     "ZFIN Symbol": "Zebrafish Receptor",
                                     "ZFIN Name": "Zebrafish Receptor name"}
                            )

gene_pair = gene_pair.drop(columns=["Approved symbol_x", "Approved symbol_y"])

# Drop columns where all values are NA in gene_pair
gene_pair = gene_pair.dropna(axis=1, how='all')

gene_pair = gene_pair.fillna(" ")
gene_pair = gene_pair[gene_pair['Human LR Pair'] != ' ']

if "PMID link" in gene_pair.columns:
    gene_pair = gene_pair.drop(columns=["PMID link"])

# Add
first_columns=['Human LR Pair', 'Ligand', 'Receptor', 'Interaction Source']

end_columns=['HGNC L R', 'sanity check', 'curator', 'secondary source?']
gene_pair = gene_pair[first_columns + [col for col in gene_pair.columns if col not in first_columns + end_columns] + end_columns]


# number of unique vars

lrPairsCount = len(gene_pair["Human LR Pair"].unique())

ligandCount = len(gene_pair["Ligand"].unique())

receptorCount = len(gene_pair["Receptor"].unique())

# Mouse Orthologue
MouseLigandCount = len(gene_pair["Ligand MGI ID"].unique())

MouseReceptorCount = len(gene_pair["Receptor MGI ID"].unique())

# Rat Orthologue
RatLigandCount = len(gene_pair["Ligand RGD ID"].unique())

RatReceptorCount = len(gene_pair["Receptor RGD ID"].unique())

gene_pair["PMID support"] = [value.replace(" ", "") for value in gene_pair["PMID support"]]

source = np.array(gene_pair["PMID support"].unique())
source = source.astype(str)
source = ",".join(sorted(set(filter(lambda x: x.lower() != 'nan', source))))

# Split the string into individual elements, filter out empty strings, and get unique values
source = sorted(
    set(filter(lambda x: x.strip() and x.strip().lower() != 'nan', source.split(',')))
)
source = [value.replace(" ", "") for value in source]
sourceCount = len(source)

# for creating PMIDs
gene_pair00 = gene_pair[['Human LR Pair', 'PMID support']]

# create URLs for the HGNC IDs

# ligand
gene_pair["Ligand HGNC ID"] = [
    '<a href="https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/{}" target="_blank">{}</a>'.format(ligand, ligand)
    for ligand in gene_pair["Ligand HGNC ID"]
]

# receptor
gene_pair["Receptor HGNC ID"] = [
    '<a href="https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/{}" target="_blank">{}</a>'.format(receptor, receptor)
    for receptor in gene_pair["Receptor HGNC ID"]
]

# Perplexity
gene_pair["Perplexity"] = [
    '<a href="{}" target="_blank"> <img src="https://img.icons8.com/?size=30&id=0NbBuNOxUwps&format=png&color=000000" alt="Perplexity AI" /></a>'.format(url)
    for url in gene_pair["Perplexity"]
]

# Function to generate hyperlinks for the "PMID support" column
# Function to generate hyperlinks for the "PMID support" column
def generate_links_with_doi(df, gene_column, pmid_column):
    def create_link(gene, sources):
        # Replace spaces with "——" in the gene name for the link
        gene_name = gene.replace(" ", "——")
        
        if len(sources) == 1:
            source = sources[0]
            if source.startswith("https://www.biorxiv.org/content/"):
                # If the value starts with "https://doi.org/", use it as the hyperlink
                return f'<a href="{source}" target="_blank">BioRxiv preprint</a>'
            else:
                # If it's a single PMID, hyperlink the PMID text
                return f'<a href="https://comp.med.yokohama-cu.ac.jp/collab/connectomeDB/pubmed/{gene_name}_pmid_details.html">{source}</a>'
        else:
            # If multiple PMIDs, show the count and hyperlink to the page
            return f'<a href="https://comp.med.yokohama-cu.ac.jp/collab/connectomeDB/pubmed/{gene_name}_pmid_details.html" target="_blank">{len(sources)} PMIDs</a>'

    # Process each row to generate the "PMID support" column
    df["PMID support"] = [
        create_link(
            gene=row[gene_column], 
            sources=[s.strip() for s in row[pmid_column].split(',') if s.strip()]
        )
        for _, row in df.iterrows()
    ]
    return df


# Generate the links for the "PMID support" column
gene_pair = generate_links_with_doi(gene_pair, gene_column="Human LR Pair", pmid_column="PMID support")

gene_pair["Ligand MGI ID"] = [
        f'<a href="https://www.informatics.jax.org/marker/{mouseOrth}" target="_blank">{mouseOrth}</a>' 
        if pd.notna(mouseOrth) and mouseOrth.strip() else "" 
        for mouseOrth in gene_pair["Ligand MGI ID"]
    ]

gene_pair["Receptor MGI ID"] = [
        f'<a href="https://www.informatics.jax.org/marker/{mouseOrth}" target="_blank">{mouseOrth}</a>' 
        if pd.notna(mouseOrth) and mouseOrth.strip() else "" 
        for mouseOrth in gene_pair["Receptor MGI ID"]
    ]

gene_pair["Ligand RGD ID"] = [
        f'<a href="https://rgd.mcw.edu/rgdweb/report/gene/main.html?id={ratOrth.replace("RGD:", "")}" target="_blank">{ratOrth}</a>' 
        if pd.notna(ratOrth) and ratOrth.strip() else "" 
        for ratOrth in gene_pair["Ligand RGD ID"]
    ]

gene_pair["Receptor RGD ID"] = [
        f'<a href="https://rgd.mcw.edu/rgdweb/report/gene/main.html?id={ratOrth.replace("RGD:", "")}" target="_blank">{ratOrth}</a>' 
        if pd.notna(ratOrth) and ratOrth.strip() else "" 
        for ratOrth in gene_pair["Receptor RGD ID"]
    ]

gene_pair["Zebrafish Ligand"] = [
    f'<span title="{ligand_name}">{ligand_symbol}</span>'
    for ligand_name, ligand_symbol in zip(gene_pair["Zebrafish Ligand name"], 
                                          gene_pair["Zebrafish Ligand"])
]
gene_pair["Zebrafish Receptor"] = [
    f'<span title="{ligand_name}">{ligand_symbol}</span>'
    for ligand_name, ligand_symbol in zip(gene_pair["Zebrafish Receptor name"], 
                                          gene_pair["Zebrafish Receptor"])
]

mouse_columns = ['Mouse Ligand', 'Mouse Receptor','Ligand MGI ID','Receptor MGI ID'] 
rat_columns = ['Rat Ligand','Rat Receptor','Ligand RGD ID','Receptor RGD ID']
zebrafish = ['Zebrafish Ligand','Zebrafish Receptor','Ligand ZFIN ID','Receptor ZFIN ID']

gene_pair0 = gene_pair[['Human LR Pair', 'Ligand', 'Receptor', 'Perplexity', 'PMID support',
       'Ligand HGNC ID', 'Ligand location', 'Receptor HGNC ID',
       'Receptor location', 'Ligand name', 'Receptor name'] + mouse_columns + rat_columns]

gene_pair = gene_pair[['Human LR Pair', 'Ligand', 'Receptor', 'Interaction Source', 'Perplexity', 'PMID support',
        'Ligand HGNC ID', 'Receptor HGNC ID', 'Ligand location', 'Receptor location',
        'Ligand name', 'Receptor name'] + mouse_columns + rat_columns + zebrafish + end_columns]


# gene symbol
gene_pair["Ligand"] = [
    f'<span title="{ligand_name}">{ligand_symbol}</span>'
    for ligand_name, ligand_symbol in zip(gene_pair["Ligand name"], 
                                          gene_pair["Ligand"])
]

# gene symbol
gene_pair["Receptor"] = [
    f'<span title="{receptor_name}">{receptor_symbol}</span>'
    for receptor_name, receptor_symbol in zip(gene_pair["Receptor name"], 
                                              gene_pair["Receptor"])
]

def replace_spaces(row):
    if row['Ligand location'] == 'secreted':
        return row['Human LR Pair'].replace(" ", " <span style='font-size: 15px;'>○</span> <span style='font-size: 24px;'>⤚</span> ")
    elif row['Ligand location'] == '':
        return row['Human LR Pair'].replace(" ", " <span style='font-size: 15px;'>○</span> <span style='font-size: 24px;'>⤚</span> ")
    elif row['Ligand location'] == 'plasma membrane':
        return row['Human LR Pair'].replace(" ", " <span style='font-size: 24px;'>⤙</span> <span style='font-size: 24px;'>⤚</span> ")
    else:
        return row['Human LR Pair'].replace(" ", " \u2192 ")

# Apply the function to the 'LR Pair' column
gene_pair['Human LR Pair'] = gene_pair.apply(replace_spaces, axis=1)

gene_pair = gene_pair.drop(columns=["Ligand name", "Receptor name"])


# Create the links to the HTML cards
gene_pair["Human LR Pair"] = [
    f'<a href="https://comp.med.yokohama-cu.ac.jp/collab/connectomeDB/cards/{lrPairOrig}.html">{lrPair}</a>'
    for lrPairOrig, lrPair in zip(gene_pair0["Human LR Pair"], gene_pair["Human LR Pair"])
]


# Add tooltips to the column headers
gene_pair.columns = [
    f'<span title=" Ligand-Receptor Interacting Pair, as described in Liu et al. (PMID: XXXXXX)">{col}</span>' if col == "Human LR Pair" else
    f'<span title="Click the logo below to run Perplexity on the Human LR pair">{col}&nbsp;</span>' if col == "Perplexity" else
    f'<span title=" Official Gene Symbol; Hover on symbols below to show gene names">{col}&nbsp;&nbsp;&nbsp;</span>' if col in ["Ligand", "Receptor"] else
    f'<span title="HUGO Gene Nomenclature Committee (HGNC) ID. Click on the link for more details">{col}&nbsp;&nbsp;</span>' if col in ["Ligand HGNC ID", "Receptor HGNC ID"] else
    f'<span title=" PubMed IDs (PMID) with Literature Evidence for LR Interaction. Click on the link for more details">{col}</span>' if col == "PMID support" else
    f'<span title="Rat Genome Database (RGD) ID. Click on the link for more details">{col}</span>' if col in ["Ligand RGD ID", "Receptor RGD ID"] else
    f'<span title="Mouse Genome Informatics (MGI) ID. Click on the link for more details">{col}</span>' if col in ["Ligand MGI ID", "Receptor MGI ID"]else
    f'<span title="Zebrafish Information Network (ZFIN) ID. Click on the link for more details">{col}</span>' if col in ["Ligand ZFIN ID", "Receptor ZFIN ID"] else
    f'<span title="Location based on the predicted subcellular localization of the human proteome, as described in Ramilowski et al. (PMID: 26198319)">{col}</span>' if col in ["Ligand location", "Receptor location"] else
    f'<span title="Double-click header of {col} to ensure all values are shown">{col}&nbsp;</span>'
    for col in gene_pair.columns
]

gene_pair = gene_pair.reset_index(drop=True)  # Remove the index
gene_pair000 = gene_pair.copy()

keywords_to_modify = ["Ligand", "Receptor"]
exclude_keywords = ["HGNC ID", "Location", "Human"]  # Columns containing this will not be modified

# Copy the original columns so we can modify only the first 10
new_columns = gene_pair000.columns.tolist()

# Modify only the first 10 columns
new_columns[:10] = [
    f'{col.split(">")[0]}">Human {col.split(">")[1]}</span>'
    if any(keyword in col for keyword in keywords_to_modify) and not any(exclude in col for exclude in exclude_keywords)
    else col
    for col in new_columns[:10]
]

# Assign the modified column names back to the DataFrame
gene_pair000.columns = new_columns
human_columns = [col for col in gene_pair000.columns][:10]


### MOUSE ###
# Find columns with "Mouse" in the name
mouse_columns = [col for col in gene_pair.columns if "MGI" in col or "Mouse" in col]

# Filter rows where all "Mouse" columns are not " "
mouse_gene_pair = gene_pair000[(gene_pair000[mouse_columns].map(str.strip) != "").all(axis=1)]
# Dynamically identify columns containing "Ligand" and "Receptor" in their names 
# since it is now in span format

new_columns = mouse_gene_pair.columns.tolist()

new_columns = [
    col.replace("Mouse ", "").strip()
    if "Mouse Ligand" in col or "Mouse Receptor" in col
    else col
    for col in new_columns
]
mouse_gene_pair.columns = new_columns
        
ligand_col = [col for col in mouse_gene_pair.columns if "Ligand&nbsp;" in col][1]
receptor_col = [col for col in mouse_gene_pair.columns if "Receptor&nbsp;" in col][1]
ligand_location = [col for col in mouse_gene_pair.columns if "Ligand location" in col][0]
receptor_location = [col for col in mouse_gene_pair.columns if "Receptor location" in col][0]


# Combine columns into "Mouse LR Pair" with appropriate replacements
def format_lr_pair(row):
    if row[ligand_location] == 'secreted':
        return f"{row[ligand_col]} <span style='font-size: 15px;'>○</span> <span style='font-size: 24px;'>⤚</span> {row[receptor_col]}"
    elif row[ligand_location] == '':
        return f"{row[ligand_col]} <span style='font-size: 15px;'>○</span> <span style='font-size: 24px;'>⤚</span> {row[receptor_col]}"
    elif row[receptor_location] == 'plasma membrane':
        return f"{row[ligand_col]} <span style='font-size: 24px;'>⤙</span> <span style='font-size: 24px;'>⤚</span> {row[receptor_col]}"
    else:
        return f"{row[ligand_col]} \u2192 {row[receptor_col]}"

# Apply the function row-wise and assign to the new column using .loc
mouse_gene_pair1 = mouse_gene_pair.copy() 
mouse_gene_pair1.loc[:, "Mouse LR Pair"] = mouse_gene_pair1.apply(format_lr_pair, axis=1)
mouse_columns = [col for col in mouse_gene_pair1.columns if "MGI" in col]
# Reorder the DataFrame
new_order = ["Mouse LR Pair", ligand_col, receptor_col] + mouse_columns + human_columns
mouse_gene_pair1 = mouse_gene_pair1[new_order]
MouselrPairsCount = len(mouse_gene_pair1["Mouse LR Pair"].unique())
HumanMouseLRPairsPer = (MouselrPairsCount/lrPairsCount)*100
HumanMouseLRPairsPer = round(HumanMouseLRPairsPer, 2)

mouse_gene_pair1 = mouse_gene_pair1.reset_index(drop=True)  


### RAT ###

## Limit to those with either Rat Ligand or Receptor
rat_columns = [col for col in gene_pair.columns if "RGD" in col or "Rat" in col]
# Filter rows where all "Rat" columns are not " "
rat_gene_pair = gene_pair000[(gene_pair000[rat_columns].map(str.strip) != "").all(axis=1)]


new_columns = rat_gene_pair.columns.tolist()

new_columns = [
    col.replace("Rat ", "").strip()
    if "Ligand" in col or "Receptor" in col
    else col
    for col in new_columns
]
rat_gene_pair.columns = new_columns

# Dynamically identify columns containing "Ligand" and "Receptor" in their names 
# since it is now in span format
ligand_col = [col for col in rat_gene_pair.columns if "Ligand&nbsp;" in col][2]
receptor_col = [col for col in rat_gene_pair.columns if "Receptor&nbsp;" in col][2]
ligand_location = [col for col in rat_gene_pair.columns if "Ligand location" in col][0]
receptor_location = [col for col in rat_gene_pair.columns if "Receptor location" in col][0]

def format_lr_pair(row):
    if row[ligand_location] == 'secreted':
        return f"{row[ligand_col]} <span style='font-size: 15px;'>○</span> <span style='font-size: 24px;'>⤚</span> {row[receptor_col]}"
    elif row[ligand_location] == '':
        return f"{row[ligand_col]} <span style='font-size: 15px;'>○</span> <span style='font-size: 24px;'>⤚</span> {row[receptor_col]}"
    elif row[receptor_location] == 'plasma membrane':
        return f"{row[ligand_col]} <span style='font-size: 24px;'>⤙</span> <span style='font-size: 24px;'>⤚</span> {row[receptor_col]}"
    else:
        return f"{row[ligand_col]} \u2192 {row[receptor_col]}"

rat_gene_pair1 = rat_gene_pair.copy() 
rat_gene_pair1.loc[:, "Rat LR Pair"] = rat_gene_pair1.apply(format_lr_pair, axis=1)
rat_columns = [col for col in rat_gene_pair1.columns if "RGD" in col]
# Reorder the DataFrame
new_order = ["Rat LR Pair", ligand_col, receptor_col] + rat_columns + human_columns
rat_gene_pair1 = rat_gene_pair1[new_order]
rat_gene_pair1 = rat_gene_pair1.reset_index(drop=True)  

### ZEBRAFISH ###

## Limit to those with either Zebrafish Ligand or Receptor
Zebrafish_columns = [col for col in gene_pair.columns if "ZFIN" in col or "Zebrafish" in col]
# Filter rows where all "Zebrafish" columns are not " "
Zebrafish_gene_pair = gene_pair000[(gene_pair000[Zebrafish_columns].map(str.strip) != "").all(axis=1)]


new_columns = Zebrafish_gene_pair.columns.tolist()

new_columns = [
    col.replace("Zebrafish ", "").strip()
    if "Ligand" in col or "Receptor" in col
    else col
    for col in new_columns
]
Zebrafish_gene_pair.columns = new_columns

# Dynamically identify columns containing "Ligand" and "Receptor" in their names 
# since it is now in span format
ligand_col = [col for col in Zebrafish_gene_pair.columns if "Ligand&nbsp;" in col][3]
receptor_col = [col for col in Zebrafish_gene_pair.columns if "Receptor&nbsp;" in col][3]
ligand_location = [col for col in Zebrafish_gene_pair.columns if "Ligand location" in col][0]
receptor_location = [col for col in Zebrafish_gene_pair.columns if "Receptor location" in col][0]

def format_lr_pair(row):
    if row[ligand_location] == 'secreted':
        return f"{row[ligand_col]} <span style='font-size: 15px;'>○</span> <span style='font-size: 24px;'>⤚</span> {row[receptor_col]}"
    elif row[ligand_location] == '':
        return f"{row[ligand_col]} <span style='font-size: 15px;'>○</span> <span style='font-size: 24px;'>⤚</span> {row[receptor_col]}"
    elif row[receptor_location] == 'plasma membrane':
        return f"{row[ligand_col]} <span style='font-size: 24px;'>⤙</span> <span style='font-size: 24px;'>⤚</span> {row[receptor_col]}"
    else:
        return f"{row[ligand_col]} \u2192 {row[receptor_col]}"

Zebrafish_gene_pair1 = Zebrafish_gene_pair.copy() 
Zebrafish_gene_pair1.loc[:, "Zebrafish LR Pair"] = Zebrafish_gene_pair1.apply(format_lr_pair, axis=1)
Zebrafish_columns = [col for col in Zebrafish_gene_pair1.columns if "ZFIN" in col]
# Reorder the DataFrame
new_order = ["Zebrafish LR Pair", ligand_col, receptor_col] + Zebrafish_columns + human_columns
Zebrafish_gene_pair1 = Zebrafish_gene_pair1[new_order]
Zebrafish_gene_pair1 = Zebrafish_gene_pair1.reset_index(drop=True)  

In [65]:
gene_pair.columns

Index(['Human LR Pair', 'Ligand', 'Receptor', 'Interaction Source',
       'PMID support', 'Ligand HGNC ID', 'Ligand location', 'Receptor HGNC ID',
       'Receptor location', 'Perplexity', 'Ligand name', 'Ligand MGI ID',
       'Ligand RGD ID', 'Mouse Ligand', 'Rat Ligand', 'Ligand ZFIN ID',
       'Ligand ZFIN ID', 'Zebrafish Ligand', 'Zebrafish Ligand name',
       'Receptor name', 'Receptor MGI ID', 'Receptor RGD ID', 'Mouse Receptor',
       'Rat Receptor', 'Ligand ZFIN ID', 'Ligand ZFIN ID',
       'Zebrafish Receptor', 'Zebrafish Receptor name', 'HGNC L R',
       'sanity check', 'curator', 'secondary source?'],
      dtype='object')

In [9]:
duplicates = gene_pair00[gene_pair00["Human LR Pair"].duplicated()]
print(duplicates["Human LR Pair"])


2312       TAFA4 FPR1
2333    SLAMF1 SLAMF1
2342     FLRT3 ADGRL3
2344     TENM2 ADGRL1
Name: Human LR Pair, dtype: object
