# Python Notebook

In [1]:
## Function to prepare datatables (cleaning and hyperlinking, adding tool tips, etc) input for the database qmds
import sys, os
import pandas as pd
# Change working directory to the ConnectomeDB checkout so the relative paths used
# in this notebook ("src", "data/...", "database/...") resolve.
# The location can be overridden with the CONNECTOMEDB_ROOT environment variable;
# the original local path is kept as the fallback so existing runs behave the same.
project_root = os.environ.get(
    "CONNECTOMEDB_ROOT",
    "/Users/sakuramaezono/Library/CloudStorage/OneDrive-YokohamaCityUniversity/Personal/05_Python_repositories/ConnectomeDB",
)
os.chdir(project_root)
# Make the project's src/ modules importable; skip when the entry is already there
# so re-running the cell does not keep growing sys.path.
_src_dir = os.path.abspath("src")
if _src_dir not in sys.path:
    sys.path.append(_src_dir)

In [2]:
project_root

'/Users/sakuramaezono/Library/CloudStorage/OneDrive-YokohamaCityUniversity/Personal/05_Python_repositories/ConnectomeDB'

In [3]:
## Function to rename yaml titles for the different species
import sys
import os
import pandas as pd
import yaml

from createDataTable_perSpecies import mouse_gene_pair1
# Number of mouse LR pairs, shown in the page title
MouselrPairsCount = len(mouse_gene_pair1["Mouse LR Pair"])
species_name = "Mus musculus"

# --- 2. Create the YAML header
# The quadruple braces in the f-string render as the literal text
# "{{< fa database >}}" (presumably a Quarto shortcode — confirm).
yaml_data = {
    "title": f"{{{{< fa database >}}}} ConnectomeDB2025: <span class='highlight'>{MouselrPairsCount} Mouse ({species_name})</span> Ortholog LR Pairs",
    "execute": {"echo": False},
    "format": {
        "html": {"table": False}
    },
    "header-includes": "<script src='../js/keepDropdownMenuGold.js'></script>"
}
# sort_keys=False keeps the keys in the order written above
yaml_block = "---\n" + yaml.dump(yaml_data, sort_keys=False) + "---"

# --- 3. Load template and inject YAML
with open("database/mouseOrth.qmd") as f:
    content = f.read()

final_qmd = content.replace("{{yaml_header}}", yaml_block)

# --- 4. Write the rendered QMD
# NOTE(review): the output path is the same file the template was read from, so the
# "{{yaml_header}}" placeholder is gone after the first run — confirm this is intended.
output_qmd = "database/mouseOrth.qmd"
with open(output_qmd, "w") as f:
    f.write(final_qmd)

# This statement was indented at top level, which is an IndentationError; dedented.
pop_up_info = pd.read_table("data/HGNC_gene_info_full.tsv")


In [17]:
# Treat missing abstracts as empty strings so the string test below applies to every row
df['Abstract'] = df['Abstract'].fillna('')

# True where the abstract ends with one of the accepted endings
ends_cleanly = df['Abstract'].str.endswith(('.', '?', 'available', ')', 'Review', '...'))
# PMIDs of the remaining rows, i.e. abstracts with none of those endings
pmids_without_period = df.loc[~ends_cleanly, 'PMID']
pmid_check = pmids_without_period.tolist()

print(f"These {len(pmid_check)} PMIDs have abstracts that might be incomplete (do not end with a period or question mark):")
print(pmid_check)

These 1 PMIDs have abstracts that might be incomplete (do not end with a period or question mark):
[6172602]


In [24]:
# Treat missing abstracts as empty strings so the string test below never sees NaN
df['Abstract'] = df['Abstract'].fillna('')
# Keep the PMIDs whose abstract does NOT start with 'No abstract available'.
# NOTE(review): the name `pmids_without_period` does not describe this filter;
# the parentheses around the literal do not make a tuple, it is a plain string.
pmids_without_period = df[~df['Abstract'].str.startswith(('No abstract available'))]['PMID']
pmid_check = pmids_without_period.tolist()

In [25]:
pmid_check

[10025398,
 10037686,
 10037743,
 10050855,
 10051567,
 10066262,
 10073957,
 10074428,
 10077672,
 10085134,
 10100920,
 10102268,
 10103110,
 10187774,
 10188995,
 10189055,
 10190900,
 10190906,
 10193788,
 10194432,
 10196157,
 10196161,
 10196194,
 10196234,
 10196235,
 10196546,
 10202040,
 10206645,
 10207021,
 10209034,
 10212223,
 10214951,
 10225955,
 10229797,
 10231374,
 10233762,
 10233851,
 10318773,
 10318826,
 10318834,
 10318947,
 10330424,
 10336501,
 10339405,
 10342833,
 10342881,
 10342886,
 10347172,
 10347248,
 10348342,
 10352278,
 10358030,
 10364174,
 10364178,
 10364234,
 10366627,
 10366629,
 10368033,
 10369126,
 10369464,
 10371504,
 10377245,
 10381577,
 10381815,
 10381885,
 10382758,
 10385705,
 10390077,
 10395410,
 10395669,
 10397731,
 10399917,
 10399920,
 10400673,
 10409677,
 10419462,
 10421367,
 10421368,
 10421785,
 10422787,
 10426993,
 10429675,
 10433822,
 10438490,
 10438732,
 10438935,
 10446903,
 10454496,
 10461027,
 10464311,
 10470109,

In [4]:
gene_pair0["Human LR Pair"]

0          A2M HSPA5
1           A2M LRP1
2         ACE BDKRB2
3           ADA DPP4
4       ADAM10 EPHA3
            ...     
3431      ZG16B TLR2
3432      ZG16B TLR4
3433      ZG16B TLR5
3434      ZG16B TLR6
3435      ZP3 CHRNA7
Name: Human LR Pair, Length: 3436, dtype: object

In [7]:
gene_pair0["Ligand HGNC ID"][2]

'<a href="https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/HGNC:2707" target="_blank">HGNC:2707</a>'

In [9]:
# Identify the column(s) whose header contains '(PMID)' and temporarily remove them.
# pmid_cols keeps the removed header names; if the cell is re-run on the already
# reduced gene_pair, the comprehension finds nothing and pmid_cols becomes [].
pmid_cols = [col for col in gene_pair.columns if '(PMID)' in col]
gene_pair = gene_pair.drop(columns=pmid_cols)
# Show the remaining headers
gene_pair.columns

Index(['<span title="Double-click header of Interaction ID to ensure all values are shown">Interaction ID&nbsp;</span>',
       '<span title="Ligand-Receptor Interacting Pair, as described in Liu et al. (PMID: XXXXXX)">Human LR Pair</span>',
       '<span title="Official Gene Symbol; Hover on symbols below to show gene names">Ligand&nbsp;&nbsp;&nbsp;</span>',
       '<span title="Official Gene Symbol; Hover on symbols below to show gene names">Receptor&nbsp;&nbsp;&nbsp;</span>',
       '<span title="Double-click header of Ligand Symbols to ensure all values are shown">Ligand Symbols&nbsp;</span>',
       '<span title="Double-click header of Receptor Symbols to ensure all values are shown">Receptor Symbols&nbsp;</span>',
       '<span title="Location based on the predicted subcellular localization of the human proteome, as described in Ramilowski et al. (PMID: 26198319)">Ligand Location</span>',
       '<span title="Location based on the predicted subcellular localization of the human p

In [6]:
# Temporarily remove the column(s)
# NOTE(review): pmid_cols is not defined in this cell, so it must already exist in the
# kernel. With the default errors setting, drop() raises KeyError when any of those
# labels is no longer present in gene_pair.
gene_pair = gene_pair.drop(columns=pmid_cols)
gene_pair

[]

In [52]:
pop_up_info_lim.columns

Index(['Approved symbol', 'Alias symbol', 'Previous symbol',
       'Date symbol changed', 'ensembl_gene_id', 'uniprot_ids', 'omim_id',
       'Other Symbols', 'ensembl_peptide_id'],
      dtype='object')

In [54]:
pop_up_info

Unnamed: 0,HGNC ID,Approved symbol,Approved name,locus_group,locus_type,status,location,location_sortable,Alias symbol,alias_name,...,cd,lncrnadb,enzyme_id,intermediate_filament_db,rna_central_id,lncipedia,gtrnadb,agr,mane_select,gencc
0,HGNC:5,A1BG,alpha-1-B glycoprotein,protein-coding gene,gene with protein product,Approved,19q13.43,19q13.43,,,...,,,,,,,,HGNC:5,ENST00000263100.8|NM_130786.4,
1,HGNC:37133,A1BG-AS1,A1BG antisense RNA 1,non-coding RNA,"RNA, long non-coding",Approved,19q13.43,19q13.43,FLJ23569,,...,,,,,URS00007E4F6E,A1BG-AS1,,HGNC:37133,,
2,HGNC:24086,A1CF,APOBEC1 complementation factor,protein-coding gene,gene with protein product,Approved,10q11.23,10q11.23,"ACF, ASP, ACF64, ACF65, APOBEC1CF",,...,,,,,,,,HGNC:24086,ENST00000373997.8|NM_014576.4,
3,HGNC:7,A2M,alpha-2-macroglobulin,protein-coding gene,gene with protein product,Approved,12p13.31,12p13.31,"FWP007, S863-7, CPAMD5",,...,,,,,,,,HGNC:7,ENST00000318602.12|NM_000014.6,HGNC:7
4,HGNC:27057,A2M-AS1,A2M antisense RNA 1,non-coding RNA,"RNA, long non-coding",Approved,12p13.31,12p13.31,,,...,,,,,URS00001F234A,A2M-AS1,,HGNC:27057,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44079,HGNC:3562,FABP7,fatty acid binding protein 7,protein-coding gene,gene with protein product,Approved,6q22.31,06q22.31,"B-FABP, BLBP",brain lipid binding protein,...,,,,,,,,HGNC:3562,ENST00000368444.8|NM_001446.5,
44080,HGNC:41951,FABP7P1,fatty acid binding protein 7 pseudogene 1,pseudogene,pseudogene,Approved,1q44,01q44,,,...,,,,,,,,HGNC:41951,,
44081,HGNC:41952,FABP7P2,fatty acid binding protein 7 pseudogene 2,pseudogene,pseudogene,Approved,2q11.1,02q11.1,,,...,,,,,,,,HGNC:41952,,
44082,HGNC:3563,FABP9,fatty acid binding protein 9,protein-coding gene,gene with protein product,Approved,8q21.13,08q21.13,"PERF, T-FABP, PERF15",testis fatty acid binding protein,...,,,,,,,,HGNC:3563,ENST00000379071.4|NM_001080526.2,


In [59]:
pop_up_info = pop_up_info.groupby('HGNC ID').agg(agg_func).reset_index()
pop_up_info

Unnamed: 0,HGNC ID,Approved symbol,Approved name,locus_group,locus_type,status,location,location_sortable,Alias symbol,alias_name,...,cd,lncrnadb,enzyme_id,intermediate_filament_db,rna_central_id,lncipedia,gtrnadb,agr,mane_select,gencc
0,HGNC:100,ASIC1,acid sensing ion channel subunit 1,protein-coding gene,gene with protein product,Approved,12q13.12,12q13.12,"BNaC2, hBNaC2",,...,,,,,,,,HGNC:100,ENST00000447966.7|NM_001095.4,
1,HGNC:10000,RGS4,regulator of G protein signaling 4,protein-coding gene,gene with protein product,Approved,1q23.3,01q23.3,,,...,,,,,,,,HGNC:10000,ENST00000367909.11|NM_005613.6,
2,HGNC:10001,RGS5,regulator of G protein signaling 5,protein-coding gene,gene with protein product,Approved,1q23.3,01q23.3,,,...,,,,,,,,HGNC:10001,ENST00000313961.10|NM_003617.4,HGNC:10001
3,HGNC:10002,RGS6,regulator of G protein signaling 6,protein-coding gene,gene with protein product,Approved,14q24.2,14q24.2,,,...,,,,,,,,HGNC:10002,ENST00000553525.6|NM_001204424.2,
4,HGNC:10003,RGS7,regulator of G protein signaling 7,protein-coding gene,gene with protein product,Approved,1q43,01q43,,,...,,,,,,,,HGNC:10003,ENST00000440928.6|NM_001364886.1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44079,HGNC:9995,RGS13,regulator of G protein signaling 13,protein-coding gene,gene with protein product,Approved,1q31.2,01q31.2,,,...,,,,,,,,HGNC:9995,ENST00000391995.7|NM_002927.5,
44080,HGNC:9996,RGS14,regulator of G protein signaling 14,protein-coding gene,gene with protein product,Approved,5q35.3,05q35.3,,,...,,,,,,,,HGNC:9996,ENST00000408923.8|NM_006480.5,
44081,HGNC:9997,RGS16,regulator of G protein signaling 16,protein-coding gene,gene with protein product,Approved,1q25.3,01q25.3,"A28-RGS14, RGS-r",,...,,,,,,,,HGNC:9997,ENST00000367558.6|NM_002928.4,
44082,HGNC:9998,RGS2,regulator of G protein signaling 2,protein-coding gene,gene with protein product,Approved,1q31.2,01q31.2,,,...,,,,,,,,HGNC:9998,ENST00000235382.7|NM_002923.4,


In [44]:
gene_pair_annot_ligand["Ligand group"] = gene_pair_annot_ligand["Ligand group"].fillna("unknown")
gene_pair_annot_ligand

Unnamed: 0,Ligand HGNC ID,Ligand Symbols,Ligand Location,Ligand group,root_group_id
0,"<a href=""https://www.genenames.org/data/gene-s...","<span title=""<span title=""A2M (FWP007, S863-7,...","<span title=""based on perplexity, uniprot"">sec...","<a href=""https://www.genenames.org/data/genegr...",1234
1,"<a href=""https://www.genenames.org/data/gene-s...","<span title=""<span title=""ACE (DCP1, ACE1, CD1...","<span title=""based on perplexity, uniprot"">cel...","<a href=""https://www.genenames.org/data/genegr...",471
2,"<a href=""https://www.genenames.org/data/gene-s...","<span title=""<span title=""ADA (ADA1)"">ADA (ADA...","<span title=""based on hpa, uniprot"">cell membr...","<a href=""https://www.genenames.org/data/genegr...",1302
3,"<a href=""https://www.genenames.org/data/gene-s...","<span title=""<span title=""ADAM10 (kuz, MADM, H...","<span title=""based on hpa, perplexity, uniprot...","<a href=""https://www.genenames.org/data/genegr...",2104
4,"<a href=""https://www.genenames.org/data/gene-s...","<span title=""<span title=""ADAM10 (kuz, MADM, H...","<span title=""based on hpa, perplexity, uniprot...","<a href=""https://www.genenames.org/data/genegr...",471
...,...,...,...,...,...
1583,"<a href=""https://www.genenames.org/data/gene-s...","<span title=""<span title=""YBX1 (NSEP1, YB-1, Y...","<span title=""based on perplexity, uniprot"">sec...","<a href=""https://www.genenames.org/data/genegr...",106
1584,"<a href=""https://www.genenames.org/data/gene-s...","<span title=""<span title=""YBX1 (NSEP1, YB-1, Y...","<span title=""based on perplexity, uniprot"">sec...","<a href=""https://www.genenames.org/data/genegr...",1520
1585,"<a href=""https://www.genenames.org/data/gene-s...","<span title=""<span title=""YBX1 (NSEP1, YB-1, Y...","<span title=""based on perplexity, uniprot"">sec...","<a href=""https://www.genenames.org/data/genegr...",2045
1586,"<a href=""https://www.genenames.org/data/gene-s...","<span title=""<span title=""ZG16B (HRPE773, PRO1...","<span title=""based on perplexity, uniprot"">cel...","<a href=""https://www.genenames.org/data/genegr...",


In [None]:
# drop duplicates 
gene_pair_annot_ligand = gene_pair_annot_ligand.drop_duplicates().reset_index(drop=True)


In [None]:
gene_pair_annot_ligand

In [None]:
# Receptor-side annotation table: left-join the gene-group rows onto each receptor
# and expose the group name under the "Receptor group" header.
gene_pair_annot_receptor = (
    gene_pair_annot2[['Receptor HGNC ID', 'Receptor Symbols', 'receptor_hgnc_id', 'Receptor Location']]
    .copy()
    .merge(gene_group_lim, how='left', left_on='receptor_hgnc_id', right_on='hgnc_id')
    .rename(columns={"root_group_name": "Receptor group"})
    .drop(columns=["hgnc_id", "receptor_hgnc_id"])
)

# Receptors with no matching group row are labelled "unknown"
gene_pair_annot_receptor["Receptor group"] = gene_pair_annot_receptor["Receptor group"].fillna("unknown")

# Collapse identical rows and renumber the index
gene_pair_annot_receptor = gene_pair_annot_receptor.drop_duplicates().reset_index(drop=True)
gene_pair_annot_receptor

In [None]:
gene_pair_annot_ligand

In [None]:
import os
import jinja2
import sys
import pandas as pd
import numpy as np
import time
import base64
import re

# Add the src directory to the path for importing modules
sys.path.append(os.path.abspath("src"))

# Import necessary modules from your existing src files
# Ensure createDataTable and createFunctionalAnnotTable are in your 'src' directory
from createDataTable import pop_up_info, gene_pair0, generate_perplexity_links, gene_pair00
from createFunctionalAnnotTable import gene_pair_annot_ligand, gene_pair_annot_receptor
gene_pair_annot_ligand

In [None]:

# Test or all
# NOTE(review): `test` and `test_genes` are only defined in this cell; nothing below reads them.
test = True
test_genes = ["VEGFA ITGB1", "VEGFA KDR", "VEGFA NRP1", "THPO MPL", "FGF1 FGFR3"] # Example genes
# --- Paths ---
MERGED_TEMPLATE_PATH = 'HTML/mergedCardWithPMIDTemplate.html'
OUTPUT_DIR = 'data/cards/' # New output directory for combined files

# --- Load and Preprocess Data (Combined from both scripts) ---

# Load PubMed data (from createPMIDpages.py)
pubmed_data = pd.read_csv("data/pubmed_results.csv")
# Year is converted to text, a literal ".0" is removed, then the text is cast to int
# (presumably the column is read as float, e.g. 2010.0 — confirm).
# NOTE(review): a missing Year becomes the text "nan", which the int cast rejects.
pubmed_data["Year"] = pubmed_data["Year"].astype(str).str.replace(".0", "", regex=False).astype(int)
# PMIDs are handled as strings from here on
pubmed_data["PMID"] = pubmed_data["PMID"].astype(str)
pubmed_data = pubmed_data.reset_index(drop=True)

# Load LLM results (from createPMIDpages.py)
bio_keywords = pd.read_csv("data/llm_results.csv")

# --- Prepare gene_pair00 for PMID section (from createPMIDpages.py) ---
# gene_pair00 is used for PMID and Keywords, so it needs the '——' placeholder
# Work on a copy so the imported gene_pair00 is left unmodified
gene_pair00_copy = gene_pair00.copy()
# Every space in the pair name is replaced by "——"
gene_pair00_copy["Human LR Pair"] = gene_pair00_copy["Human LR Pair"].str.replace(" ", "——")

# Merge with LLM results
# Left join: pairs without an LLM row are kept
gene_pair000 = gene_pair00_copy.merge(bio_keywords, how='left', left_on="Human LR Pair", right_on='Human LR Pair')
# astype(str) also turns missing keywords into the text "nan"
gene_pair000["Relevance Keywords"] = gene_pair000["Relevance Keywords"].astype(str)
gene_pair000["Human LR Pair"] = gene_pair000["Human LR Pair"].astype(str) # Ensure string type

# --- Prepare gene_pair0 for Card section (from createCards.py) ---
# gene_pair0 is used for card details, it should retain spaces for splitting gene names
# Work on a copy so the imported gene_pair0 is left unmodified
gene_pair0_copy = gene_pair0.copy()

# Add Disease (specific) to cards
df_disease = pd.read_csv("data/disease_annotations_per_pair.csv")
# One row per interaction, with all of its disease names joined by ", "
df_disease = df_disease.groupby('interaction')['disease'].apply(', '.join).reset_index()
mapping_disease = dict(zip(df_disease['interaction'], df_disease['disease']))
# Pairs without a disease entry get "unknown"
gene_pair0_copy["Disease"] = gene_pair0_copy['Human LR Pair'].map(mapping_disease).fillna("unknown")

# generate_perplexity_links is imported from createDataTable; its behaviour is not
# visible here — presumably it turns the "Disease" values into links (confirm).
gene_pair0_copy = generate_perplexity_links(
    gene_pair0_copy,
    pathway_col="Disease",
    default_query_template="What-diseases-is-the-ligand-receptor-pair-{pair}-associated-with"
)

# Wrap each Interaction ID in a link to its page under database/filter/
gene_pair0_copy["Interaction ID"] = gene_pair0_copy["Interaction ID"].apply(
    lambda x: f"<a href='https://comp.med.yokohama-cu.ac.jp/collab/connectomeDB/database/filter/{x}.html'>{x}</a>"
)
# Add external link icon
# icon_html ends with "</a>" because it replaces every closing anchor tag below,
# which places the icon just before the tag closes.
icon_html = '<i class="fa-solid fa-arrow-up-right-from-square" style="margin-left:4px;"></i></a>'
# NOTE(review): apart from "Disease", these columns are not created in this cell;
# they are assumed to exist already in gene_pair0 and to hold strings.
columns_to_update = [
    "KEGG Pathway", "PROGENy Pathway", "Cancer-related",
    "Disease Type", "Disease"
]
for col in columns_to_update:
    gene_pair0_copy[col] = gene_pair0_copy[col].str.replace(
        "</a>", icon_html, regex=False
    )


In [None]:
gene_pair_annot_ligand

In [None]:
import sys
import os
import pandas as pd
import warnings
warnings.filterwarnings("ignore", category=pd.errors.DtypeWarning)
from itables import init_notebook_mode, show
from IPython.display import display, Javascript
import itables.options as opt
# Change working directory to ConnectomeDB
# NOTE(review): project_root is the PARENT of the current working directory, so every
# re-run of this cell moves one level further up — run it once per kernel session.
project_root = os.path.dirname(os.getcwd())
os.chdir(project_root)
# Make the modules under src/ (relative to the new working directory) importable
sys.path.append(os.path.abspath("src"))
from createDataTable_perSpecies import mouse_gene_pair1

mouse_gene_pair1.columns

In [None]:
# Lookup from LR pair name to its Interaction ID
mapping_ID = dict(zip(gene_pair0['Human LR Pair'], gene_pair0['Interaction ID']))
# Raw sheet: drop all-empty columns and the rows without an LR pair
gene_pair_PMID = fetchGSheet.gene_pair.dropna(axis=1, how='all')
gene_pair_PMID = gene_pair_PMID[gene_pair_PMID['LR pair'] != '']
# Keep the evidence columns; .copy() so the column assignments below write to an
# independent frame instead of a slice of the filtered sheet.
gene_pair_PMID = gene_pair_PMID[["LR pair", "PMID", "original source"]].copy()

# Mapping for replacements
mapping = dict(zip(fetchGSheet.src_info['original source'], fetchGSheet.src_info['shortname']))
# Replace values in the column based on the mapping (unmapped sources stay as they are)
gene_pair_PMID['Database Source'] = gene_pair_PMID['original source'].replace(mapping)

# Replace values in the column based on the mapping (unmapped pairs keep the pair name)
gene_pair_PMID["Interaction ID"] = gene_pair_PMID['LR pair'].replace(mapping_ID)
# Only columns 0, 1, 3, 4, 5 (by position) of the PubMed results are loaded
df_pub = pd.read_csv("data/pubmed_results.csv", usecols=[0,1,3,4,5])
# Join on PMID as text on both sides
gene_pair_PMID["PMID"] = gene_pair_PMID["PMID"].astype(str)
df_pub["PMID"] = df_pub["PMID"].astype(str)
gene_pair_trip = pd.merge(gene_pair_PMID, df_pub, how='left', on='PMID')
### patch for the BioRxiV ###
# Years that are missing or hold a placeholder are set to "2024".
# The candidate is lower-cased before the comparison, so every token must be lower
# case too; the former "Species Specificity" and "NaN" entries could never match.
_MISSING_YEAR_TOKENS = {"nan", "none", "species specificity", ""}
gene_pair_trip["Year"] = gene_pair_trip["Year"].apply(
    lambda x: "2024" if pd.isna(x) or str(x).strip().lower() in _MISSING_YEAR_TOKENS else x
)

In [None]:
gene_pair.columns[11]

In [None]:
gene_pair_trip

In [None]:
gene_pair_trip

In [None]:
def _group_link(group_id, group_name):
    """Return an anchor to the genenames.org gene-group page, or "" when the id is missing or blank."""
    if pd.notna(group_id) and group_id.strip():
        return f'<a href="https://www.genenames.org/data/genegroup/#!/group/{group_id}" target="_blank">{group_name}</a>'
    return ""


# Replace each plain group name with its link, row by row
gene_group_lim['root_group_name'] = [
    _group_link(gid, gname)
    for gid, gname in zip(gene_group_lim["root_group_id"], gene_group_lim["root_group_name"])
]

In [None]:
# NOTE(review): the filter below is evaluated and then discarded — a cell only displays
# its last expression, and this cell ends with an assignment.
gene_group_lim[gene_group_lim["root_group_id"]=="NA"]
# Group names that are NaN or (ignoring case and surrounding whitespace) one of
# "nan", "none", "na", "" become "unknown"; every other value is kept as is.
gene_group_lim["root_group_name"] = gene_group_lim["root_group_name"].apply(
    lambda x: "unknown" if pd.isna(x) or str(x).strip().lower() in ["nan", "none", "na", ""] else x
)

In [None]:
gene_group_lim

In [None]:
# Wrap each group name in a link to its genenames.org gene-group page; rows whose
# root_group_id is missing or blank get an empty string instead.
# The key is a single column label so the per-row list is stored as that column.
# The previous double-bracket key (a list of one column) makes pandas compare the
# list length with the number of keys and fail with "Columns must be same length as key".
gene_group_lim['root_group_name'] = [
        f'<a href="https://www.genenames.org/data/genegroup/#!/group/{root_group_id}" target="_blank">{root_group_name}</a>'
        if pd.notna(root_group_id) and root_group_id.strip() else ""
        for root_group_id, root_group_name in zip(gene_group_lim["root_group_id"], gene_group_lim["root_group_name"])
    ]
# NOTE(review): this displays `gene_group`, not the `gene_group_lim` frame edited
# above — confirm which one was meant.
gene_group

In [None]:
# NOTE(review): this bare expression is not displayed, because it is not the cell's last line.
gene_pair_annot_ligand["Ligand group"]
# Lookup from the 'Ligand HGNC ID' cell value to its 'Ligand group';
# when an ID occurs in several rows, the last row's group is the one kept.
ligand_mapping = dict(zip(gene_pair_annot_ligand['Ligand HGNC ID'], gene_pair_annot_ligand['Ligand group']))

In [None]:
ligand_mapping

In [None]:
# Add Disease Category per pair
# Disease name -> category lookup
df_cat = pd.read_csv("data/disease_categories.csv")
mapping = dict(zip(df_cat['Disease Name'], df_cat['Category']))

df = pd.read_csv("data/disease_annotations_per_pair.csv")
# Translate each disease into its category, keep one row per (interaction, category),
# then join the categories of each interaction in alphabetical order.
df = (
    df.assign(**{"Disease Type": df['disease'].replace(mapping)})
    [["interaction", "Disease Type"]]
    .drop_duplicates()
    .astype({"Disease Type": str})
    .sort_values(by='Disease Type', ascending=True)
    .groupby('interaction')['Disease Type']
    .apply(', '.join)
    .reset_index()
)
# "Yes" when the joined category text contains the substring "Cancer"
df['Cancer-related'] = df['Disease Type'].map(lambda kinds: 'Yes' if 'Cancer' in kinds else 'No')
# Restrict to the LR pairs present in the table
disease_df = df[df["interaction"].isin(LR_pairs)]

gene_pair = gene_pair.merge(disease_df, how='left', left_on='Human LR Pair', right_on='interaction')

In [None]:
## Function to prepare datatables (cleaning and hyperlinking, adding tool tips, etc) input for the database qmds
import sys, os
from itables import init_notebook_mode
import pandas as pd
from itables import show
from itables import options
from IPython.display import HTML, display
import numpy as np
import fetchGSheet 
import warnings
import urllib.parse

# Silence warnings of category UserWarning (and its subclasses).
# NOTE(review): the earlier comment said this suppresses SettingWithCopyWarning, but
# pandas appears to define that class directly on Warning rather than UserWarning,
# so this filter would not cover it — verify against the installed pandas version.
warnings.simplefilter("ignore", category=UserWarning)


# Other vertebrates
# Short species codes (presumably Ensembl/BioMart-style dataset prefixes — confirm)
species_list = [
    "ptroglodytes", "ggallus", "sscrofa", "btaurus", 
    "clfamiliaris", "ecaballus", "oarambouillet",
    "cjacchus", "mmulatta"
]

# Positions of the first 30 columns of the HGNC table
cols_to_keep = list(range(30))
# Reduced load: only those 30 columns
df = pd.read_table("data/HGNC_gene_info_full.tsv", usecols=cols_to_keep)
# Full load, with the HGNC field names relabelled to the headers used in the tables
pop_up_info = pd.read_table("data/HGNC_gene_info_full.tsv").rename(columns={
    "hgnc_id": "HGNC ID",
    "name": "Approved name",
    "symbol": "Approved symbol",
    "mgd_id": "MGI ID",
    "rgd_id": "RGD ID",
    "alias_symbol": "Alias symbol",  # add to table
    "prev_symbol": "Previous symbol",  # add to table
    "date_symbol_changed": "Date symbol changed",
})

# Keep only first MGI/RGD ID of each "|"-separated list
for _id_col in ("MGI ID", "RGD ID"):
    pop_up_info[_id_col] = pop_up_info[_id_col].str.split("|").str[0]


def _na_label(x):
    """Return "N/A" for NaN or blank-like text ("nan", "none", ""); any other value unchanged."""
    return "N/A" if pd.isna(x) or str(x).strip().lower() in ["nan", "none", ""] else x


# Symbol lists: fill the placeholder first, then turn "|" separators into ", "
for _sym_col in ("Alias symbol", "Previous symbol"):
    pop_up_info[_sym_col] = [
        value.replace("|", ", ") for value in pop_up_info[_sym_col].apply(_na_label)
    ]

pop_up_info["Date symbol changed"] = pop_up_info["Date symbol changed"].apply(_na_label)

# Slim lookup table, one row per HGNC ID (first occurrence wins)
pop_up_info_lim = pop_up_info[
    ["HGNC ID", "Approved name", "MGI ID", "RGD ID", "Alias symbol", "Approved symbol", "Previous symbol"]
].drop_duplicates(subset="HGNC ID", keep="first")

# Drop columns where all values are NA in gene_pair
gene_pair = fetchGSheet.gene_pair.dropna(axis=1, how='all')
# Rows with an empty LR pair are discarded
gene_pair = gene_pair[gene_pair['LR pair'] != '']
# for now set source count as triplicates
sourceCount = len(gene_pair[['LR pair']])

# for now, keep only the following columns
gene_pair = gene_pair[['LR pair', 'Ligand', 'Ligand.HGNC', 'Receptor', 'Receptor.HGNC',
                       'perplexity link', 'PMID', 'binding location', 
                       'bind in trans?', 'bidirectional signalling?',
                       'interaction type', 'original source']]

# Remove rows lacking an LR pair or a PMID BEFORE the string clean-up below.
# The PMID filter used to run after value.replace(), which cannot handle a NaN
# (float) entry, so it could never take effect. .copy() makes the column
# assignments below write to an independent frame rather than a slice.
gene_pair = gene_pair.dropna(subset=['LR pair', 'PMID']).copy()

# some PMIDs kick in with "," so replace
gene_pair["PMID"] = [value.replace(",", "") for value in gene_pair["PMID"]]

# Mapping for replacements
mapping = dict(zip(fetchGSheet.src_info['original source'], fetchGSheet.src_info['shortname']))
# Replace values in the column based on the mapping (unmapped sources stay as they are)
gene_pair['original source'] = gene_pair['original source'].replace(mapping)

## add Ligand/Receptor Location
def dedup_locations(loc_str):
    """Split a comma-separated location string into unique, sorted entries.

    Args:
        loc_str: text such as "secreted, cell membrane, secreted".

    Returns:
        list[str]: the stripped, non-empty entries without duplicates, sorted
        case-insensitively. Entries that differ only by case are ordered by
        their exact spelling so the result does not depend on set iteration order.
    """
    parts = [loc.strip() for loc in loc_str.split(',') if loc.strip()]
    return sorted(set(parts), key=lambda loc: (loc.lower(), loc))


def generate_LocToolTip(row, geneloc, loc_col):
    """Build the HTML for a gene's location(s), each with a "based on <sources>" tooltip.

    Args:
        row: mapping/Series holding the gene under ``loc_col`` and a comma-separated
            "location" string (the grouped row of one gene).
        geneloc: un-grouped table with the columns ``loc_col``, "location" and "source".
        loc_col: name of the gene column ("Ligand" or "Receptor" in this notebook).

    Returns:
        str: one ``<span title="based on ...">location</span>`` per unique location,
        joined by ", " (an empty string when the row has no location).
    """
    gene = row[loc_col]
    # Rows of this gene only; each location below is looked up within them
    gene_rows = geneloc[geneloc[loc_col] == gene]

    spans = []
    for loc in dedup_locations(row["location"]):
        # Literal substring match: location names may contain regex metacharacters
        # such as "(" or "+", so the pattern must not be interpreted as a regex.
        hits = gene_rows[gene_rows["location"].str.contains(loc, regex=False, na=False)]
        sources_str = ", ".join(sorted(set(hits["source"])))
        spans.append(f'<span title="based on {sources_str}">{loc}</span>')
    # A single location yields a single span; several are comma-separated
    return ", ".join(spans)


# Group the original loc_info by Ligand: one row per ligand with all of its
# locations and sources joined by ", "
ligand_loc = fetchGSheet.ligand_loc.dropna(axis=1, how='all')
grouped = ligand_loc.groupby("Ligand").agg({
    "location": lambda x: ', '.join(x),
    "source": lambda x: ', '.join(x)
}).reset_index()

# Generate tooltips
grouped["Ligand location"] = grouped.apply(lambda row: generate_LocToolTip(row, ligand_loc,loc_col="Ligand"), axis=1)
# create dict
mapping_loc = dict(zip(grouped['Ligand'], grouped['Ligand location'])) 
# Dictionary lookup per ligand; ligands without an entry are set to 'unknown'.
# (map + fillna replaces the former Series.replace call followed by an
# "unchanged value means unmapped" comparison.)
gene_pair['Ligand location'] = gene_pair['Ligand'].map(mapping_loc).fillna('unknown')


# Group the original loc_info by Receptor, same shape as for the ligands
receptor_loc = fetchGSheet.receptor_loc.dropna(axis=1, how='all')
grouped = receptor_loc.groupby("Receptor").agg({
    "location": lambda x: ', '.join(x),
    "source": lambda x: ', '.join(x)
}).reset_index()

# Generate tooltips
grouped["Receptor location"] = grouped.apply(lambda row: generate_LocToolTip(row, receptor_loc,loc_col="Receptor"), axis=1)
# create dict
mapping_loc = dict(zip(grouped['Receptor'], grouped['Receptor location'])) 
# Receptors without an entry are set to 'unknown'
gene_pair['Receptor location'] = gene_pair['Receptor'].map(mapping_loc).fillna('unknown')

# Set "n/a" to unknown (literal text replacement inside the tooltip HTML)
gene_pair['Ligand location'] = gene_pair['Ligand location'].str.replace("n/a", "unknown", regex=False)
gene_pair['Receptor location'] = gene_pair['Receptor location'].str.replace("n/a", "unknown", regex=False)

# Collect the distinct values of every column whose header mentions "HGNC"
_hgnc_columns = [name for name in gene_pair.columns if "HGNC" in name]
hgnc_id = pd.concat([gene_pair[name] for name in _hgnc_columns]).unique()

# Sheet headers -> display headers
_display_names = {
    "LR pair": "Human LR Pair",
    "Ligand.HGNC": "Ligand HGNC ID",
    "Receptor.HGNC": "Receptor HGNC ID",
    "perplexity link": "Perplexity",  # will be replaced with actual link later
    "original source": "Database Source",
    "Ligand location": "Ligand Location",
    "Receptor location": "Receptor Location",
    "binding location": "Binding Location",
    "bind in trans?": "Trans-binding",
    "bidirectional signalling?": "Bidirectional Signalling",
    "interaction type": "Interaction Type",
}
gene_pair = gene_pair.rename(columns=_display_names)

# Attach the HGNC details of the ligand, prefix them as ligand fields, and
# discard the join key and the symbol column brought in by the merge.
gene_pair = (
    gene_pair
    .merge(pop_up_info_lim, how='left', left_on='Ligand HGNC ID', right_on='HGNC ID')
    .rename(columns={
        "Approved name": "Ligand name",
        "MGI ID": "Ligand MGI ID",
        "RGD ID": "Ligand RGD ID",
        "Alias symbol": "Ligand Aliases",
        "Previous symbol": "Ligand Old symbol",
    })
    .drop(columns=["HGNC ID", "Approved symbol"])
)
# Attach PROGENy pathway annotations to each pair.
LR_pairs = gene_pair["Human LR Pair"].unique()
df = pd.read_csv("data/pathway_annotations_per_pair.csv")
# Order the annotations from the largest to the smallest absolute weight so
# that pathways are listed strongest-first inside every pair.
df_sorted = df.reindex(df['weight'].abs().sort_values(ascending=False).index)
# All pathways are kept for every interaction (no de-duplication per pair).
df = df_sorted.reset_index(drop=True)
top_pathway_df = (
    df[["interaction", "source"]]
    .groupby('interaction')['source']
    .apply(', '.join)
    .reset_index()
    .rename(columns={"source": "PROGENy Pathway"})
)
# The annotation file separates ligand and receptor with "^"; gene_pair uses a space.
top_pathway_df["interaction"] = top_pathway_df["interaction"].map(
    lambda pair_key: pair_key.replace("^", " ")
)
gene_pair = (
    gene_pair
    .merge(top_pathway_df, how='left', left_on='Human LR Pair', right_on='interaction')
    .drop(columns=["interaction"])
)
# Attach KEGG pathway annotations (as hyperlinks) to each pair.
top_pathway_df = fetchGSheet.kegg_pathway_info[
    ["LR Pair", "kegg_pathway_id", "kegg_relationship", "kegg_pathway_name"]
].copy()

# Hyperlink the pathway name first: it needs the raw pathway id, which is
# itself turned into a hyperlink in the next step.
top_pathway_df["kegg_pathway_name"] = [
    f'<a href="https://www.kegg.jp/pathway/{pathway_id}" target="_blank">{pathway_name}</a>'
    for pathway_id, pathway_name in zip(
        top_pathway_df["kegg_pathway_id"], top_pathway_df["kegg_pathway_name"]
    )
]

# Hyperlink the pathway id itself.
top_pathway_df["kegg_pathway_id"] = top_pathway_df["kegg_pathway_id"].map(
    lambda pathway_id: f'<a href="https://www.kegg.jp/pathway/{pathway_id}" target="_blank">{pathway_id}</a>'
)

top_pathway_df = top_pathway_df.rename(columns={
    "kegg_pathway_name": "KEGG Pathway",
    "kegg_relationship": "KEGG relationship",
    "kegg_pathway_id": "KEGG Pathway ID",
})

# One comma-separated list of distinct pathway links per pair.
top_pathway_df1 = (
    top_pathway_df[["LR Pair", "KEGG Pathway"]]
    .drop_duplicates()
    .groupby('LR Pair')['KEGG Pathway']
    .apply(', '.join)
    .reset_index()
)
gene_pair = (
    gene_pair
    .merge(top_pathway_df1, how='left', left_on='Human LR Pair', right_on='LR Pair')
    .drop(columns=["LR Pair"])
)

# Attach disease categories (and a cancer flag) to each pair.
df = pd.read_csv("data/disease_annotations_per_pair.csv")
df_cat = pd.read_csv("data/disease_categories.csv")
mapping = dict(zip(df_cat['Disease Name'], df_cat['Category']))
# Translate each disease name to its category; names without a category are kept as-is.
df["Disease Type"] = df['disease'].replace(mapping)
df = df[["interaction", "Disease Type"]].drop_duplicates()
df['Disease Type'] = df['Disease Type'].astype(str)
# Sort so the categories appear alphabetically inside every joined list.
df = df.sort_values(by='Disease Type', ascending=True)
# One comma-separated list of categories per interaction.
df = df.groupby('interaction')['Disease Type'].apply(', '.join).reset_index()
# A pair is flagged as cancer-related when any of its categories mentions "Cancer".
df['Cancer-related'] = [
    'Yes' if 'Cancer' in category_list else 'No'
    for category_list in df['Disease Type']
]
disease_df = df[df["interaction"].isin(LR_pairs)]

# The 'interaction' join key stays in gene_pair here; it is dropped further down.
gene_pair = gene_pair.merge(disease_df, how='left', left_on='Human LR Pair', right_on='interaction')

# Add MGI annotation: look up the MGI table by the ligand's MGI ID. This adds
# the MGI_info columns ('MGI ID' and 'MGI name' are the ones used below).
MGI_info = pd.read_csv("data/MGI_ID_biomart.csv")
gene_pair = gene_pair.merge(MGI_info, how='left', left_on='Ligand MGI ID', right_on='MGI ID')

# Rows whose Ligand HGNC ID is blank (empty after stripping whitespace) get the
# ligand symbol itself copied into 'MGI name'.
mask = gene_pair['Ligand HGNC ID'].astype(str).str.strip() == ''
gene_pair.loc[mask, 'MGI name'] = gene_pair.loc[mask, 'Ligand']
# Second lookup, this time by 'MGI name'; the 'MGI ID' found this way lands in
# 'MGI ID_from_info' because of the suffixes.
gene_pair = gene_pair.merge(MGI_info, left_on='MGI name', right_on='MGI name', how='left', suffixes=('', '_from_info'))
# combine_first only fills 'Ligand MGI ID' where it is missing (NaN); existing values win.
gene_pair['Ligand MGI ID'] = gene_pair['Ligand MGI ID'].combine_first(gene_pair['MGI ID_from_info'])
# The helper column is no longer needed once the IDs have been back-filled.
gene_pair = gene_pair.drop(columns=['MGI ID_from_info'])

# Add RGD annotation: prefix the raw RGD ids with "RGD:" so they match the
# 'Ligand RGD ID' values in gene_pair.
RGD_info = pd.read_csv("data/RGD_ID_biomart.csv")
RGD_info['RGD ID'] = RGD_info['RGD ID'].astype(str).map(lambda raw_id: "RGD:" + raw_id)
gene_pair = gene_pair.merge(RGD_info, how='left', left_on='Ligand RGD ID', right_on='RGD ID')

# Add ZFIN id and symbol: keep one zebrafish ortholog row per human HGNC ID.
ZFIN_info = (
    pd.read_csv("data/ZFIN_ID_human_orthos.txt", sep="\t", skiprows=1)
    [['ZFIN ID', 'ZFIN Symbol', 'ZFIN Name', 'HGNC ID']]
    .dropna(subset=['HGNC ID'])
    .drop_duplicates(subset=['HGNC ID'])
)
# Rewrite the HGNC ids in the "HGNC:<integer>" form used by gene_pair.
ZFIN_info['HGNC ID'] = ZFIN_info['HGNC ID'].map(lambda raw_id: f'HGNC:{int(raw_id)}')
gene_pair = gene_pair.merge(ZFIN_info, how='left', left_on='Ligand HGNC ID', right_on='HGNC ID')

# Remove the join keys left behind by the ligand-side and disease merges.
gene_pair = gene_pair.drop(columns=["RGD ID", "MGI ID", "HGNC ID", "interaction"])

# Label the ortholog columns produced by the ligand-side merges.
gene_pair = gene_pair.rename(columns={
    "MGI name": "Mouse Ligand",
    "RGD name": "Rat Ligand",
    "ZFIN ID": "Ligand ZFIN ID",
    "ZFIN Symbol": "Zebrafish Ligand",
    "ZFIN Name": "Zebrafish Ligand name",
})

# Attach the HGNC pop-up annotations to the receptor side, give the annotation
# columns a "Receptor" prefix, and drop the join key.
gene_pair = (
    gene_pair
    .merge(pop_up_info_lim, how='left', left_on='Receptor HGNC ID', right_on='HGNC ID')
    .rename(columns={
        "Approved name": "Receptor name",
        "MGI ID": "Receptor MGI ID",
        "RGD ID": "Receptor RGD ID",
        "Alias symbol": "Receptor Aliases",
        "Previous symbol": "Receptor Old symbol",
    })
    .drop(columns=["HGNC ID"])
)

# Add new columns where all Ligand Symbol & Aliases and Receptor Symbol & Aliases merged in one column
def format_symbol_aliases(symbol, old_symbol, aliases):
    """Combine a gene symbol with its previous symbols and aliases.

    Parameters
    ----------
    symbol : str
        Current gene symbol; returned unchanged when nothing else is known.
    old_symbol, aliases : str or missing
        Previous symbol(s) and alias symbol(s). The placeholder "N/A" and
        missing values (None / NaN, e.g. left behind by an unmatched left
        merge) are ignored.

    Returns
    -------
    str
        "SYMBOL (old, aliases)" when at least one extra name is available,
        otherwise just the symbol.
    """
    # Only real strings can be joined: a NaN from an unmatched merge would
    # otherwise make ', '.join raise a TypeError.
    parts = [p for p in (old_symbol, aliases) if isinstance(p, str) and p != "N/A"]
    # Return just the symbol if no valid aliases or old symbols
    return f"{symbol} ({', '.join(parts)})" if parts else symbol

# Build the combined "symbol (old symbols, aliases)" label for both sides.
gene_pair['Ligand Symbol & Aliases'] = [
    format_symbol_aliases(symbol, old_symbol, aliases)
    for symbol, old_symbol, aliases in zip(
        gene_pair['Ligand'], gene_pair['Ligand Old symbol'], gene_pair['Ligand Aliases']
    )
]

gene_pair['Receptor Symbol & Aliases'] = [
    format_symbol_aliases(symbol, old_symbol, aliases)
    for symbol, old_symbol, aliases in zip(
        gene_pair['Receptor'], gene_pair['Receptor Old symbol'], gene_pair['Receptor Aliases']
    )
]

### tooltips
# Wrap each label in a <span> whose title repeats the label, so the full text
# is available as a hover tooltip.
gene_pair["Ligand Symbol & Aliases"] = gene_pair["Ligand Symbol & Aliases"].map(
    lambda label: f'<span title="{label}">{label}</span>'
)
gene_pair["Receptor Symbol & Aliases"] = gene_pair["Receptor Symbol & Aliases"].map(
    lambda label: f'<span title="{label}">{label}</span>'
)


# Add MGI name: look up the MGI table by the receptor's MGI ID.
gene_pair = gene_pair.merge(MGI_info, how='left', left_on='Receptor MGI ID', right_on='MGI ID')
# Rows whose Receptor HGNC ID is blank get the receptor symbol itself copied
# into 'MGI name'. (The mask previously tested 'Ligand HGNC ID', a leftover
# from the ligand block, so the wrong rows were selected.)
mask = gene_pair['Receptor HGNC ID'].astype(str).str.strip() == ''
gene_pair.loc[mask, 'MGI name'] = gene_pair.loc[mask, 'Receptor']
# Second lookup by 'MGI name'; the 'MGI ID' found this way lands in
# 'MGI ID_from_info' because of the suffixes.
gene_pair = gene_pair.merge(MGI_info, left_on='MGI name', right_on='MGI name', how='left', suffixes=('', '_from_info'))
# combine_first only fills 'Receptor MGI ID' where it is missing (NaN).
gene_pair['Receptor MGI ID'] = gene_pair['Receptor MGI ID'].combine_first(gene_pair['MGI ID_from_info'])
gene_pair = gene_pair.drop(columns=['MGI ID_from_info'])

# Attach rat (RGD) and zebrafish (ZFIN) orthologs for the receptor, then drop
# the join keys and label the new columns as receptor-side.
gene_pair = (
    gene_pair
    .merge(RGD_info, how='left', left_on='Receptor RGD ID', right_on='RGD ID')
    .merge(ZFIN_info, how='left', left_on='Receptor HGNC ID', right_on='HGNC ID')
    .drop(columns=["RGD ID", "MGI ID", "HGNC ID"])
    .rename(columns={
        "MGI name": "Mouse Receptor",
        "RGD name": "Rat Receptor",
        "ZFIN ID": "Receptor ZFIN ID",
        "ZFIN Symbol": "Zebrafish Receptor",
        "ZFIN Name": "Zebrafish Receptor name",
    })
)

#gene_pair = gene_pair.drop(columns=["Approved symbol_x", "Approved symbol_y"])

# Function to add species-specific species Enseml ID and symbol for all other species except for mouse, rat, and zebrafish
def appendOtherSpeciesInfo(species, origDF):
    """Add ortholog symbol and Ensembl ID columns for one species.

    Reads ``data/{species}_ID_biomart.csv``, keeps one ortholog row per
    ``hgnc_id`` and left-joins it twice onto ``origDF``: once on
    'Ligand HGNC ID' and once on 'Receptor HGNC ID'. The new columns are named
    "<Common name> Ligand", "<Common name> Ligand Ensembl ID",
    "<Common name> Receptor" and "<Common name> Receptor Ensembl ID".

    Parameters
    ----------
    species : str
        BioMart species code (e.g. "ggallus"); codes missing from the lookup
        table are labelled "Unknown species".
    origDF : pandas.DataFrame
        Table with 'Ligand HGNC ID' and 'Receptor HGNC ID' columns.

    Returns
    -------
    pandas.DataFrame
        The extended table, with every all-NaN column removed.
    """
    common_names = {
        "ptroglodytes": "Chimpanzee",
        "ggallus": "Chicken",
        "sscrofa": "Pig",
        "btaurus": "Cow",
        "clfamiliaris": "Dog",
        "ecaballus": "Horse",
        "oarambouillet": "Sheep",
        "cjacchus": "Marmoset",
        "mmulatta": "Rhesus Monkey",
    }
    species_name = common_names.get(species, "Unknown species")

    ensembl_col = f"{species}_homolog_ensembl_gene"
    symbol_col = f"{species}_homolog_associated_gene_name"

    # One ortholog per human gene: rows without an HGNC id are unusable and
    # only the first row of each HGNC id is kept.
    species_info = (
        pd.read_csv(f"data/{species}_ID_biomart.csv")[[ensembl_col, symbol_col, 'hgnc_id']]
        .dropna(subset=['hgnc_id'])
        .drop_duplicates(subset=['hgnc_id'])
    )

    # Same join for both sides of the pair; the columns are renamed before the
    # next merge so the second pass does not collide with the first.
    for role in ("Ligand", "Receptor"):
        origDF = (
            origDF
            .merge(species_info, how='left', left_on=f"{role} HGNC ID", right_on='hgnc_id')
            .rename(columns={
                symbol_col: f"{species_name} {role}",
                ensembl_col: f"{species_name} {role} Ensembl ID",
            })
            .drop(columns=['hgnc_id'])
        )

    # Drop columns where all values are NaN
    return origDF.dropna(axis=1, how='all')


# Add ortholog columns for every remaining species, one species at a time.
for species in species_list:
    gene_pair = appendOtherSpeciesInfo(species, gene_pair)

# Remove columns that carry no data at all, then replace the remaining missing
# values with a single space so every cell is a string-friendly value.
gene_pair = gene_pair.dropna(axis=1, how='all').fillna(" ")
# Rows whose pair name was missing (now a single space) are discarded.
gene_pair = gene_pair.loc[gene_pair['Human LR Pair'] != ' ']

# Put the identifying columns first and keep the rest in their existing order.
first_columns = ['Human LR Pair', 'Ligand', 'Receptor', 'Database Source']
remaining_columns = [col for col in gene_pair.columns if col not in first_columns]
gene_pair = gene_pair[first_columns + remaining_columns]

# Number of distinct values per identifying column (missing values, if any,
# count as one distinct value).
lrPairsCount = gene_pair["Human LR Pair"].nunique(dropna=False)
ligandCount = gene_pair["Ligand"].nunique(dropna=False)
receptorCount = gene_pair["Receptor"].nunique(dropna=False)

# Mouse orthologue
MouseLigandCount = gene_pair["Ligand MGI ID"].nunique(dropna=False)
MouseReceptorCount = gene_pair["Receptor MGI ID"].nunique(dropna=False)

# Rat orthologue
RatLigandCount = gene_pair["Ligand RGD ID"].nunique(dropna=False)
RatReceptorCount = gene_pair["Receptor RGD ID"].nunique(dropna=False)

# Strip every space from the PMID cells (was 'PMID support').
gene_pair["PMID"] = gene_pair["PMID"].map(lambda cell: cell.replace(" ", ""))

# Collect the distinct PMID cells as text, ignoring the literal "nan".
source = np.array(gene_pair["PMID"].unique()).astype(str)
source = ",".join(sorted({cell for cell in source if cell.lower() != 'nan'}))
# A cell may hold several comma-separated ids: split them apart, discard empty
# entries and "nan", and keep each id once in sorted order.
source = sorted({
    token for token in source.split(',')
    if token.strip() and token.strip().lower() != 'nan'
})
source = [token.replace(" ", "") for token in source]



# Collapse a group of values into one string: distinct values, as text,
# sorted and joined with ", ".
agg_func = lambda values: ', '.join(sorted({str(value) for value in values}))

# One row per 'Human LR Pair'; every other column is collapsed with agg_func.
gene_pair = gene_pair.groupby('Human LR Pair').agg(agg_func).reset_index()
gene_pair = gene_pair.loc[gene_pair['Human LR Pair'] != '']
DBlength = len(gene_pair)
# Sequential identifiers CDB00001, CDB00002, ... in the current row order.
gene_pair["Interaction ID"] = [f"CDB{number:05d}" for number in range(1, DBlength + 1)]

# for creating PMIDs
gene_pair00 = gene_pair[['Human LR Pair', 'PMID']] # was "PMID support"

# Recreate Perplexity link
def create_url_basic(perplexity_col):
    """Return a Perplexity search URL asking for primary evidence of a pair.

    ``perplexity_col`` is inserted into a fixed question; spaces in the
    resulting query are encoded as ``%20`` (no other character is escaped).
    """
    question = (
        f"What is the primary evidence that {perplexity_col} bind-each-other-as-a-ligand-and-receptor-pair. "
        "Exclude reviews, uniprot, wiki, genecards, PIPS, iuphar as sources."
    )
    return "https://www.perplexity.ai/search?q=" + "%20".join(question.split(" "))
# Option 2 -- new query all together
def generate_perplexity_link_pmid(row):
    """Return an HTML icon link that opens a Perplexity search for one pair.

    ``row`` must provide 'Human LR Pair' and 'PMID'; both are inserted
    verbatim into a dash-separated question about the pair's biological
    relevance.
    """
    pair_label = row['Human LR Pair']
    pmid_text = row['PMID']
    search_url = (
        "https://www.perplexity.ai/search?q="
        f"What-is-the-biological-relevance-of-the-ligand-and-receptor-pair-{pair_label}-based-on-Pubmed-ID-{pmid_text}"
    )
    icon_tag = '<img src="https://img.icons8.com/?size=30&id=0NbBuNOxUwps&format=png&color=000000" alt="Perplexity AI" />'
    return f'<a href="{search_url}" target="_blank">{icon_tag}</a>'
# Apply function to the DataFrame
gene_pair["Perplexity"] = gene_pair.apply(generate_perplexity_link_pmid, axis=1)

# create URLs for the HGNC IDs: each id becomes a link to its genenames.org
# gene symbol report, with the id itself as the link text.

# ligand
gene_pair["Ligand HGNC ID"] = gene_pair["Ligand HGNC ID"].map(
    lambda hgnc: f'<a href="https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/{hgnc}" target="_blank">{hgnc}</a>'
)

# receptor
gene_pair["Receptor HGNC ID"] = gene_pair["Receptor HGNC ID"].map(
    lambda hgnc: f'<a href="https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/{hgnc}" target="_blank">{hgnc}</a>'
)


# Function to generate hyperlinks for the "PMID support" column
def generate_links_with_doi(df, gene_column, pmid_column, id_column):
    """Replace the "PMID" column of ``df`` with HTML links to the pair cards.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to update; it is modified in place and also returned.
    gene_column : str
        Column holding the pair name used in the card file name.
    pmid_column : str
        Column holding a comma-separated string of sources (PMIDs or a
        bioRxiv URL).
    id_column : str
        Column holding the interaction id used in the card file name.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with its "PMID" column overwritten.
    """
    def create_link(gene, id_col, sources):
        card_url = f'https://comp.med.yokohama-cu.ac.jp/collab/connectomeDB/cards/{gene}_{id_col}.html'

        if len(sources) == 1:
            source = sources[0]
            if source.startswith("https://www.biorxiv.org/content/"):
                # A single bioRxiv URL links straight to the preprint
                return f'<a href="{source}" target="_blank">BioRxiv</a>'
            # A single PMID is shown as the link text of the card link
            return f'<a href="{card_url}">{source}</a>'
        # Any other number of sources: show the count and link to the card
        return f'<a href="{card_url}" target="_blank">{len(sources)} PMIDs</a>'

    # Build the whole list before assigning: pmid_column may itself be "PMID".
    df["PMID"] = [
        create_link(
            gene=gene,
            id_col=interaction_id,
            sources=[s.strip() for s in pmid_text.split(',') if s.strip()]
        )
        for gene, interaction_id, pmid_text in zip(df[gene_column], df[id_column], df[pmid_column])
    ]
    return df


# Generate the links for the "PMID" column # was "PMID support"
gene_pair = generate_links_with_doi(gene_pair, gene_column="Human LR Pair", 
                                    pmid_column="PMID", id_column= "Interaction ID")

In [None]:
pd.unique(gene_pair["KEGG Pathway"])

In [None]:
gene_pair

In [None]:
# Group and aggregate all columns except 'LR pair'
agg_func = lambda x: ','.join(sorted(set(map(str, x))))
gene_pair1 = gene_pair.groupby('Human LR Pair').agg(agg_func).reset_index()

In [None]:
gene_pair1

In [None]:
# Bind the names used by the card-rendering loop to the objects prepared
# elsewhere in the notebook.
# NOTE(review): none of the right-hand names are defined in the cells shown
# above this one (gene_pair000 / OUTPUT_DIR are assigned in a later cell), so
# this cell appears to rely on out-of-order execution — confirm on a fresh kernel.
gene_pair_keywords_df=gene_pair000
# Self-assignment: a no-op when `template` exists, a NameError when it does not.
template=template
interaction_card_df=interaction_card
ligand_card_1_df=ligand_card_1
receptor_card_1_df=receptor_card_1
ligand_card_2_df=ligand_card_2
receptor_card_2_df=receptor_card_2
pubmed_data_df=pubmed_data
gene_pair_main_df=gene_pair0
output_dir=OUTPUT_DIR

In [None]:
gene_pair0_copy

In [None]:
def convert_pair_url(df_pairs):
    """Return a copy of ``df_pairs`` whose 'Human LR Pair' values are card links.

    Missing or blank pair names become an empty string. Working on a copy
    keeps the caller's frame (a column slice of the main table) untouched and
    avoids pandas' SettingWithCopyWarning.
    """
    df_pairs = df_pairs.copy()
    df_pairs["Human LR Pair"] = [
        f'<a href="https://comp.med.yokohama-cu.ac.jp/collab/connectomeDB/cards/{lrpair}.html" target="_blank" '
        f'title="Open {lrpair} card" style="color: #0000EE; text-decoration: underline;">'
        f'{lrpair}</a>'
        if pd.notna(lrpair) and lrpair.strip() else ""
        for lrpair in df_pairs["Human LR Pair"]
    ]
    return df_pairs


# Render one card per LR pair. The pair names come from gene_pair_keywords_df,
# where ligand and receptor are separated by the "——" placeholder.
for idx, row in gene_pair_keywords_df.iterrows():
    lr_pair_name = row["Human LR Pair"] # This is the gene_name (e.g., VEGFA——KDR)
    keywords = row["Relevance Keywords"] # Keywords from llm_results
    pmids_str = row["PMID"] # PMIDs associated with this LR Pair

    # Split LR Pair for file naming and data filtering
    value1, value2 = lr_pair_name.replace("——", " ").split()

    # Data rows for each section, filtered by the current LR Pair
    row0 = interaction_card_df[interaction_card_df['Human LR Pair'] == lr_pair_name]
    row1 = ligand_card_1_df[ligand_card_1_df['Human LR Pair'] == lr_pair_name]
    row2 = receptor_card_1_df[receptor_card_1_df['Human LR Pair'] == lr_pair_name]
    row3 = ligand_card_2_df[ligand_card_2_df['Human LR Pair'] == lr_pair_name]
    row4 = receptor_card_2_df[receptor_card_2_df['Human LR Pair'] == lr_pair_name]

    # Related ligand pairs: every other pair that shares this ligand
    ligand_pairs = gene_pair_main_df[gene_pair_main_df['Ligand'] == value1]
    ligand_pairs = ligand_pairs[ligand_pairs["Human LR Pair"].str.replace(" ", "——") != lr_pair_name] # Ensure comparison with placeholder
    ligand_pairs = convert_pair_url(ligand_pairs[["Human LR Pair"]])
    ligand_pairs_str = ' ・ '.join([btn for btn in ligand_pairs["Human LR Pair"] if btn])

    # Related receptor pairs: every other pair that shares this receptor
    receptor_pairs = gene_pair_main_df[gene_pair_main_df['Receptor'] == value2]
    receptor_pairs = receptor_pairs[receptor_pairs["Human LR Pair"].str.replace(" ", "——") != lr_pair_name] # Ensure comparison with placeholder
    receptor_pairs = convert_pair_url(receptor_pairs[["Human LR Pair"]])
    receptor_pairs_str = ' ・ '.join([btn for btn in receptor_pairs["Human LR Pair"] if btn])

    # Prepare table data (convert to dict); an unmatched pair yields an empty dict
    table0_data = row0.drop('Human LR Pair', axis=1).to_dict(orient='records')[0] if not row0.empty else {}
    table1_data = row1.drop('Human LR Pair', axis=1).to_dict(orient='records')[0] if not row1.empty else {}
    table2_data = row2.drop('Human LR Pair', axis=1).to_dict(orient='records')[0] if not row2.empty else {}
    table3_data = row3.drop('Human LR Pair', axis=1).to_dict(orient='records')[0] if not row3.empty else {}
    table4_data = row4.drop('Human LR Pair', axis=1).to_dict(orient='records')[0] if not row4.empty else {}

    # One tab (header button + content panel) per PMID of this pair
    tab_headers = []
    tab_contents = []
    sources = [pmid.strip() for pmid in str(pmids_str).split(',') if pmid.strip()]

    if sources:
        for i, pmid in enumerate(sources):
            pubmed_row = pubmed_data_df[pubmed_data_df["PMID"] == pmid]

            if not pubmed_row.empty:
                title = pubmed_row["Title"].values[0]
                abstract = pubmed_row["Abstract"].values[0]
                journal = pubmed_row["Journal"].values[0]
                year = pubmed_row["Year"].values[0]
            else:
                # PMID not present in the PubMed table: fall back to placeholders
                title = "No Title Found"
                abstract = "No Abstract Found"
                journal = "Journal Unknown"
                year = "Year Unknown"

            # Only the first tab starts out active
            active_class = "active" if i == 0 else ""
            tab_headers.append(f'<button class="tablinks {active_class}" onclick="openTab(event, \'tab{pmid}\')">{pmid}</button>')
            tab_contents.append(f"""
            <div id="tab{pmid}" class="tabcontent {active_class}">
                <h2>{title}</h2>
                <p><strong>{journal}, {year}; <a href="https://pubmed.ncbi.nlm.nih.gov/{pmid}/" target="_blank">For more details, see PubMed</a></strong></p>
                <p>{abstract}</p>
            </div>
            """)

    # Placeholder image paths: encode_image returns None when the file is missing.
    ligand_image = encode_image("path/to/ligand_image.png") # Replace with actual path
    receptor_image = encode_image("path/to/receptor_image.png") # Replace with actual path

    # Render the template with all data
    rendered_content = template.render(
        gene_name=lr_pair_name.replace("——", " "), # Pass the human-readable pair name
        value1=value1,
        value2=value2,
        table0_data=table0_data,
        table1_data=table1_data,
        table2_data=table2_data,
        table3_data=table3_data,
        table4_data=table4_data,
        ligand_image=ligand_image,
        receptor_image=receptor_image,
        ligand_pairs=ligand_pairs_str,
        receptor_pairs=receptor_pairs_str,
        tab_headers="".join(tab_headers),
        tab_contents="".join(tab_contents),
        keywords=keywords
    )


In [None]:
import os
import jinja2
import sys
import pandas as pd
import numpy as np
import time
import base64
import re

# Add the src directory to the path for importing modules
sys.path.append(os.path.abspath("src"))

# Import necessary modules from your existing src files
# Ensure createDataTable and createFunctionalAnnotTable are in your 'src' directory
from createDataTable import pop_up_info, gene_pair0, generate_perplexity_links, gene_pair00
from createFunctionalAnnotTable import gene_pair_annot_ligand, gene_pair_annot_receptor

# --- Paths ---
MERGED_TEMPLATE_PATH = 'HTML/mergedCardWithPMIDTemplate.html'
OUTPUT_DIR = 'data/cards/' # New output directory for combined files

# --- Load and Preprocess Data (Combined from both scripts) ---

# PubMed records: 'Year' is read as text, a ".0" float suffix is removed and the
# value converted to int; 'PMID' is kept as text so it can be compared with the
# PMID strings of each pair.
pubmed_data = (
    pd.read_csv("data/pubmed_results.csv")
    .assign(
        Year=lambda frame: frame["Year"].astype(str).str.replace(".0", "", regex=False).astype(int),
        PMID=lambda frame: frame["PMID"].astype(str),
    )
    .reset_index(drop=True)
)

# LLM-generated relevance keywords per pair
bio_keywords = pd.read_csv("data/llm_results.csv")

# --- Prepare gene_pair00 for PMID section ---
# Work on a copy so the imported gene_pair00 is left untouched; the pair names
# get the "——" placeholder in place of the space between ligand and receptor.
gene_pair00_copy = gene_pair00.copy()
gene_pair00_copy["Human LR Pair"] = gene_pair00_copy["Human LR Pair"].str.replace(" ", "——")

# Attach the keywords and make sure both text columns really are strings.
gene_pair000 = gene_pair00_copy.merge(
    bio_keywords, how='left', left_on="Human LR Pair", right_on='Human LR Pair'
)
for text_column in ("Relevance Keywords", "Human LR Pair"):
    gene_pair000[text_column] = gene_pair000[text_column].astype(str)

# --- Prepare gene_pair0 for Card section ---
# The card table keeps space-separated pair names; a copy is used so the
# imported gene_pair0 is not modified.
gene_pair0_copy = gene_pair0.copy()

# Add Disease (specific) to cards: one comma-separated disease list per pair,
# "unknown" for pairs without any annotation.
df_disease = (
    pd.read_csv("data/disease_annotations_per_pair.csv")
    .groupby('interaction')['disease']
    .apply(', '.join)
    .reset_index()
)
mapping_disease = dict(zip(df_disease['interaction'], df_disease['disease']))
gene_pair0_copy["Disease"] = (
    gene_pair0_copy['Human LR Pair'].map(mapping_disease).fillna("unknown")
)

gene_pair0_copy = generate_perplexity_links(
    gene_pair0_copy,
    pathway_col="Disease",
    default_query_template="What-diseases-is-the-ligand-receptor-pair-{pair}-associated-with"
)

# Turn each interaction id into a link to its filter page.
gene_pair0_copy["Interaction ID"] = [
    f"<a href='https://comp.med.yokohama-cu.ac.jp/collab/connectomeDB/database/filter/{interaction}.html'>{interaction}</a>"
    for interaction in gene_pair0_copy["Interaction ID"]
]

# Add external link icon: it is inserted in front of every closing </a> tag.
icon_html = '<i class="fa-solid fa-arrow-up-right-from-square" style="margin-left:4px;"></i></a>'
columns_to_update = [
    "KEGG Pathway", "PROGENy Pathway", "Cancer-related",
    "Disease Type", "Disease"
]
for col in columns_to_update:
    linked_cells = gene_pair0_copy[col]
    gene_pair0_copy[col] = linked_cells.str.replace("</a>", icon_html, regex=False)

# Add Ligand/Receptor group info: collapse the annotation tables to one row per
# HGNC ID (distinct values, sorted, joined with ", ") and build
# HGNC ID -> group lookups.
agg_func = lambda values: ', '.join(sorted({str(value) for value in values}))

gene_pair_annot_ligand = gene_pair_annot_ligand.groupby('Ligand HGNC ID').agg(agg_func).reset_index()
ligand_mapping = {
    hgnc: group
    for hgnc, group in zip(gene_pair_annot_ligand['Ligand HGNC ID'], gene_pair_annot_ligand['Ligand group'])
}

gene_pair_annot_receptor = gene_pair_annot_receptor.groupby('Receptor HGNC ID').agg(agg_func).reset_index()
receptor_mapping = {
    hgnc: group
    for hgnc, group in zip(gene_pair_annot_receptor['Receptor HGNC ID'], gene_pair_annot_receptor['Receptor group'])
}


# --- Helper Functions (Combined and adjusted) ---

def load_template(template_path):
    """Read a UTF-8 template file and return it as a ``jinja2.Template``."""
    with open(template_path, 'r', encoding='utf-8') as handle:
        template_source = handle.read()
    return jinja2.Template(template_source)

def encode_image(image_path):
    """Return the file at ``image_path`` as a base64 text string.

    Returns ``None`` when the file does not exist.
    """
    try:
        handle = open(image_path, "rb")
    except FileNotFoundError:
        return None
    with handle:
        raw_bytes = handle.read()
    return base64.b64encode(raw_bytes).decode('utf-8')

def extract_hgnc_id(col):
    """Return the digits of the first 'HGNC:<digits>' occurrence in ``col``.

    ``col`` is a string (plain id or HTML containing it); ``None`` is returned
    when no such occurrence exists.
    """
    found = re.findall(r'HGNC:(\d+)', col)
    return found[0] if found else None

def convert_hgnc_url(col):
    """Return a "GeneCards" link (with external-link icon) for the HGNC id in ``col``.

    Returns ``None`` when ``col`` contains no HGNC id.
    """
    hgnc_id = extract_hgnc_id(col)
    if not hgnc_id:
        return None
    link_text = 'GeneCards <i class="fa-solid fa-arrow-up-right-from-square" style="margin-left: 4px;"></i>'
    target_url = f'https://www.genecards.org/cgi-bin/carddisp.pl?id_type=hgnc&id={hgnc_id}'
    return (
        f'<a href="{target_url}" '
        f'target="_blank" style="color: #0000EE; text-decoration: underline;">{link_text}</a>'
    )

def convert_hgnc_url_disease(col):
    """Return a "MalaCards" link to the #diseases section of the gene's GeneCards page.

    Returns ``None`` when ``col`` contains no HGNC id.
    """
    hgnc_id = extract_hgnc_id(col)
    if not hgnc_id:
        return None
    link_text = 'MalaCards <i class="fa-solid fa-arrow-up-right-from-square" style="margin-left: 4px;"></i>'
    target_url = f'https://www.genecards.org/cgi-bin/carddisp.pl?id_type=hgnc&id={hgnc_id}#diseases'
    return f'<a href="{target_url}" target="_blank">{link_text}</a>'

def convert_hgnc_url_exp(col):
    """Return a link to the #expression section of the gene's GeneCards page.

    Returns ``None`` when ``col`` contains no HGNC id.
    """
    hgnc_id = extract_hgnc_id(col)
    if not hgnc_id:
        return None
    link_text = 'mRNA expression in normal human tissues <i class="fa-solid fa-arrow-up-right-from-square" style="margin-left: 4px;"></i>'
    target_url = f'https://www.genecards.org/cgi-bin/carddisp.pl?id_type=hgnc&id={hgnc_id}#expression'
    return f'<a href="{target_url}" target="_blank">{link_text}</a>'

def prepare_card_dataframes(gene_pair_input_df):
    """Prepare interaction, ligand, and receptor dataframes for the card section.

    Parameters
    ----------
    gene_pair_input_df : pandas.DataFrame
        Pair table providing, for both "Ligand" and "Receptor", the symbol,
        name, HGNC/MGI/RGD ID and location columns, plus the interaction-level
        columns selected for ``interaction_card`` below. It is not modified.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(interaction_card, ligand_card_1, ligand_card_2, receptor_card_1,
        receptor_card_2)``; every frame carries a 'Human LR Pair' column.
    """
    # Work on a copy so the caller's frame is never written to.
    gene_pair_input_df = gene_pair_input_df.copy()

    gene_pair_input_df["Interaction Type"] = [
        f'{ligand} {ligandLocation} ligand binds to {receptor} {receptorLocation} receptor'
        for ligand, ligandLocation, receptor, receptorLocation in zip(
            gene_pair_input_df["Ligand"], gene_pair_input_df["Ligand Location"],
            gene_pair_input_df["Receptor"], gene_pair_input_df["Receptor Location"]
        )
    ]
    # .copy(): the column selection is assigned to below, which would otherwise
    # trigger pandas' SettingWithCopyWarning.
    interaction_card = gene_pair_input_df[["Interaction ID", "Human LR Pair", "Interaction Type", "Perplexity", "PMID", "KEGG Pathway",  "PROGENy Pathway", "Cancer-related", "Disease Type", "Disease"]].copy()
    # Cards use a larger Perplexity icon than the main table.
    interaction_card["Perplexity"] = interaction_card["Perplexity"].str.replace('size=30', 'size=80')

    pop_up_info_lim = pop_up_info[
        ["Approved symbol", "Alias symbol", "Previous symbol", "Date symbol changed"]
    ].drop_duplicates(subset="Approved symbol", keep="first").copy()

    def format_symbol_aliases(old_symbol, aliases):
        # Join previous symbols and aliases, skipping the "N/A" placeholder.
        parts = [p for p in (old_symbol, aliases) if p != "N/A"]
        return f"{', '.join(parts)}" if parts else aliases

    pop_up_info_lim['Other Symbols'] = pop_up_info_lim.apply(
        lambda row: format_symbol_aliases(row["Previous symbol"], row["Alias symbol"]),
        axis=1
    )

    icon_html_card = '<i class="fa-solid fa-arrow-up-right-from-square" style="margin-left:4px;"></i></a>'

    def _build_side_cards(side, lineage_mapping):
        # Build the two card tables for one side ("Ligand" or "Receptor").
        hgnc_col = f"{side} HGNC ID"
        location_col = f"{side} Location"

        side_card = gene_pair_input_df[
            ["Human LR Pair", side, f"{side} name", hgnc_col, f"{side} MGI ID", f"{side} RGD ID", location_col]
        ].merge(
            pop_up_info_lim, how='left', left_on=side, right_on='Approved symbol'
        ).drop_duplicates(subset='Human LR Pair', keep="first").drop(columns=[side, "Approved symbol"])

        card_1 = side_card[["Human LR Pair", f"{side} name", "Other Symbols"]]
        # .copy(): new columns are added below.
        card_2 = side_card[["Human LR Pair", hgnc_col, location_col]].copy()
        card_2["HGNC gene card"] = card_2[hgnc_col].apply(convert_hgnc_url)
        card_2["Disease relevance"] = card_2[hgnc_col].apply(convert_hgnc_url_disease)
        card_2["Expression Profile"] = card_2[hgnc_col].apply(convert_hgnc_url_exp)
        # The lineage lookup must see the HGNC ID cell before the icon is added.
        card_2["Lineage group"] = card_2[hgnc_col].map(lineage_mapping).fillna("none")
        # Insert the external-link icon in front of the closing </a> tag.
        card_2[hgnc_col] = card_2[hgnc_col].str.replace("</a>", icon_html_card, regex=False)
        card_2 = card_2[["Human LR Pair", hgnc_col, "HGNC gene card", location_col, "Lineage group", "Disease relevance", "Expression Profile"]]
        return card_1, card_2

    ligand_card_1, ligand_card_2 = _build_side_cards("Ligand", ligand_mapping)
    receptor_card_1, receptor_card_2 = _build_side_cards("Receptor", receptor_mapping)

    return interaction_card, ligand_card_1, ligand_card_2, receptor_card_1, receptor_card_2

# Define test genes - these should be in the 'space' format for gene_pair0
# and will be converted to '——' for gene_pair000 internally.
test_genes = ["VEGFA ITGB1", "VEGFA KDR", "VEGFA NRP1"] # Example genes

# Filter gene_pair0 for the test genes to be used in prepare_card_dataframes
# This gene_pair_input should have space-separated LR pairs
gene_pair_input = gene_pair0_copy[gene_pair0_copy["Human LR Pair"].isin(test_genes)]

In [None]:
gene_pair_main_df=gene_pair0_copy
gene_pair_main_df

In [None]:
page_navigation_map = []


def _interaction_number(cell):
    """Return the number inside 'CDB<digits></a>' in ``cell``, or -1 if absent."""
    if not isinstance(cell, str):
        return -1
    found = re.search(r'CDB(\d+)</a>', cell)
    return int(found.group(1)) if found else -1


# Sort by the numeric part of the (hyperlinked) interaction id so that the
# previous/next order is consistent; cells without an id sort first.
df_for_navigation = gene_pair_main_df.sort_values(
    by="Interaction ID",
    key=lambda series: series.apply(_interaction_number)
).reset_index(drop=True)
df_for_navigation

In [None]:
gene_pair_main_df

In [None]:
df_for_rendering_and_navigation = cleaned_main_df.sort_values(
        by="Clean Interaction ID",
        key=lambda series: series.str[3:].astype(int) # Now it's just "CDBXXXXX" so simple slicing works
    ).reset_index(drop=True)
df_for_rendering_and_navigation

In [None]:
page_navigation_map = []
# Sort by the numeric part of the interaction id so that the previous/next
# order of the pages is consistent.
df_for_navigation = gene_pair_main_df.sort_values(by="Interaction ID", key=lambda x: x.str[3:].astype(int)).reset_index(drop=True)
for idx, row in df_for_navigation.iterrows():
    interaction_id = row["Interaction ID"] # e.g., 'CDB00001'
    human_lr_pair = row["Human LR Pair"] # e.g., 'VEGFA KDR'

    # File name: "<ligand> —— <receptor>_<interaction id>.html"
    value1, value2 = human_lr_pair.split(" ")
    filename = f"{value1.strip()} —— {value2.strip()}_{interaction_id}.html"
    full_url_path = os.path.join("/data/cards/", filename) # Path relative to web root

    # One entry per row. (These statements used to sit outside the loop body,
    # so only the last row was ever recorded.)
    page_navigation_map.append({
        "interaction_id": interaction_id,
        "url": full_url_path,
        "filename": filename # Store filename for saving later
    })
page_navigation_map


In [None]:
# Walk the navigation map in order and print, for every page, the entry of the
# page that precedes it (None for the first page).
for i, page_info in enumerate(page_navigation_map):
    current_interaction_id = page_info["interaction_id"]
    current_filename_to_save = page_info["filename"]
    # Retrieve the original human_lr_pair for data filtering, if needed
    matching_rows = df_for_navigation["Interaction ID"] == current_interaction_id
    current_human_lr_pair = df_for_navigation.loc[matching_rows, "Human LR Pair"].iloc[0]

    prev_page_info = page_navigation_map[i - 1] if i > 0 else None
    print(prev_page_info)
page_navigation_map

In [None]:
total_interaction_ids = gene_pair_main_df["Interaction ID"].str[3:].astype(int).max()
total_interaction_ids

In [None]:
gene_pair_main_df

In [None]:
# Swap the space between ligand and receptor for the "——" placeholder, attach
# the keywords, and make sure both text columns are strings.
gene_pair00["Human LR Pair"] = gene_pair00["Human LR Pair"].str.replace(" ", "——")
gene_pair000 = gene_pair00.merge(
    bio_keywords, how='left', left_on="Human LR Pair", right_on='Human LR Pair'
)
for text_column in ("Relevance Keywords", "Human LR Pair"):
    gene_pair000[text_column] = gene_pair000[text_column].astype(str)
pubmed_data = pubmed_data.reset_index(drop=True)  # Remove the index
gene_pair000

In [None]:
test_genes = ["VEGFA KDR", "ADAM17 IL6R"]
# A list has no .replace(); apply the placeholder-to-space replacement to each name.
[gene.replace("——", " ") for gene in test_genes]

In [None]:
species_list

In [None]:
import os
import jinja2
import sys
import pandas as pd
import numpy as np
import time
import base64
import re


sys.path.append(os.path.abspath("src"))  
import fetchGSheet
from createDataTable import pop_up_info, gene_pair0, generate_perplexity_links
from createFunctionalAnnotTable import gene_pair_annot_ligand, gene_pair_annot_receptor
gene_pair0
value = "CD80 CD274"
value1 = "CD80"


def _card_button(lrpair):
    """Return the HTML button linking to the card of `lrpair`, or "" for a missing/blank name."""
    if not (pd.notna(lrpair) and lrpair.strip()):
        return ""
    return (
        f'<a href="https://comp.med.yokohama-cu.ac.jp/collab/connectomeDB/cards/{lrpair}.html" target="_blank" '
        f'role="button" title="Open {lrpair} card" class="btn btn-outline-primary" '
        f'style="background-color: #3498db; color: white; border-color: #2980b9; font-size: 16px; '
        f'padding: 8px 12px; margin: 4px; text-decoration: none; border-radius: 4px;">'
        f'{lrpair} Card</a>'
    )


# Every other pair that shares the ligand `value1`, i.e. all of them except `value` itself.
shares_ligand = gene_pair0['Ligand'] == value1
is_other_pair = gene_pair0["Human LR Pair"] != value
sibling_pairs = gene_pair0.loc[shares_ligand & is_other_pair, "Human LR Pair"]

# Aggregate the non-empty buttons into one value separated by space
ligand_pairs = ' '.join(button for button in map(_card_button, sibling_pairs) if button)
ligand_pairs

In [None]:
gene_pair_trip.columns

In [None]:
gene_pair_trip_cat = gene_pair_trip.iloc[:, [8,5,12]]
cat_cols_trip = gene_pair_trip_cat.select_dtypes(include='object').columns
cat_cols_trip

In [None]:
## Function to prepare functional annotation datatable
import sys, os
from itables import init_notebook_mode
import pandas as pd
from itables import show
from itables import options
from IPython.display import HTML, display
import numpy as np
from bs4 import BeautifulSoup
from createDataTable import gene_pair, gene_pair0, generate_perplexity_link_pmid
import warnings
import fetchGSheet 
import string

def make_ids_unique(series):
    """Append an occurrence suffix to every ID so that repeated IDs become distinct.

    Each value is suffixed according to its zero-based position among equal
    values, in the order they appear in `series`: the first occurrence gets
    "A", the second "B", ... the 26th "Z". From the 27th occurrence on the
    suffix is "_<position>" (e.g. "_26"). IDs that occur only once still get
    "A", so every returned ID carries a suffix.

    Parameters
    ----------
    series : pandas.Series
        The IDs to disambiguate.

    Returns
    -------
    list of str
        One suffixed ID per element of `series`, in the same order.
    """
    # Zero-based running count of each value within its group of equal values.
    occurrence = series.groupby(series).cumcount()
    suffixes = occurrence.map(
        lambda i: string.ascii_uppercase[i] if i < 26 else f"_{i}"
    )
    # To leave IDs that occur only once unsuffixed, also compute
    # series.groupby(series).transform('count') and append the suffix only
    # where that count is > 1.
    return [f"{id_val}{suffix}" for id_val, suffix in zip(series, suffixes)]

# --- Build the evidence table (one row per LR pair / PMID) used by the datatable ---

# Keep only rows that have a pair name, then build the pair -> Interaction ID lookup.
gene_pair0 = gene_pair0.dropna(subset=['Human LR Pair'])
mapping_ID = dict(zip(gene_pair0['Human LR Pair'], gene_pair0['Interaction ID']))
# From the Google-sheet frame: drop all-empty columns and rows with a blank 'LR pair',
# and keep only the three columns used below.
gene_pair_PMID = fetchGSheet.gene_pair.dropna(axis=1, how='all')
gene_pair_PMID = gene_pair_PMID[gene_pair_PMID['LR pair'] != '']
gene_pair_PMID= gene_pair_PMID[["LR pair", "PMID", "original source"]]
# Mapping for replacements (original source -> shortname)
mapping = dict(zip(fetchGSheet.src_info['original source'], fetchGSheet.src_info['shortname']))
# Replace values in the column based on the mapping; values without an entry are left unchanged
gene_pair_PMID['Database Source'] = gene_pair_PMID['original source'].replace(mapping)

# Replace values in the column based on the mapping.
# Series.replace leaves an 'LR pair' with no entry in mapping_ID as the pair name itself.
gene_pair_PMID["Interaction ID"] = gene_pair_PMID['LR pair'].replace(mapping_ID)
# Only columns 0, 1, 3, 4 and 5 of the PubMed CSV are read.
df_pub = pd.read_csv("data/pubmed_results.csv", usecols=[0,1,3,4,5])
# Cast both join keys to str so the merge compares like with like.
gene_pair_PMID["PMID"] = gene_pair_PMID["PMID"].astype(str)
df_pub["PMID"] = df_pub["PMID"].astype(str)
gene_pair_trip = pd.merge(gene_pair_PMID, df_pub, how='left', on='PMID')
### patch for the BioRxiV ###
# Rows with no usable Year get "2024".
# NOTE(review): the value is lower-cased before the membership test, so the mixed-case
# entries "Species Specificity" and "NaN" in this list can never match.
gene_pair_trip["Year"] = gene_pair_trip["Year"].apply(
    lambda x: "2024" if pd.isna(x) or str(x).strip().lower() in ["nan", "none", "Species Specificity", "NaN", ""] else x
)
###
# Anything that still is not numeric becomes <NA>; Int64 is the nullable integer dtype.
gene_pair_trip["Year"] = pd.to_numeric(gene_pair_trip["Year"], errors="coerce").astype("Int64")
# Attach the main datatable columns, joining on the first column of gene_pair.
gene_pair_trip = gene_pair_trip.merge(gene_pair, how='left', left_on='Interaction ID', right_on=gene_pair.columns[0])
# NOTE(review): columns 2, 5 and 6 of gene_pair are dropped by position — confirm which
# columns these are if the layout of gene_pair changes.
gene_pair_trip = gene_pair_trip.drop(columns=["Interaction ID", gene_pair.columns[2], gene_pair.columns[5],gene_pair.columns[6]])
gene_pair_trip = gene_pair_trip.drop_duplicates()
gene_pair_trip = gene_pair_trip.reset_index(drop=True)  

# Add perplexity query
# The helper is applied row-wise after 'LR pair' is renamed to 'Human LR Pair';
# the pair name and the raw source column are dropped right after.
gene_pair_trip = gene_pair_trip.rename(columns={"LR pair": "Human LR Pair"})
gene_pair_trip["Perplexity"] = gene_pair_trip.apply(generate_perplexity_link_pmid, axis=1)
gene_pair_trip = gene_pair_trip.drop(columns=["Human LR Pair", "original source"])

# Missing/blank Species -> "unknown".
# NOTE(review): same as for Year — "Species Specificity" and "NaN" cannot match a lower-cased value.
gene_pair_trip["Species"] = gene_pair_trip["Species"].apply(
    lambda x: "unknown" if pd.isna(x) or str(x).strip().lower() in ["nan", "none", "Species Specificity", "NaN", ""] else x
)

### patch for the BioRxiV ###
# Every row with a missing Title receives this one hard-coded title.
gene_pair_trip["Title"] = gene_pair_trip["Title"].apply(
    lambda x: "ACKR5/GPR182 is a scavenger receptor for the atypical chemokine CXCL17, GPR15L and various endogenous peptides." if pd.isna(x) or str(x).strip().lower() in ["nan", "none", "NaN", ""] else x
)

### Pop-up for title
# The title is interpolated into the HTML attribute as-is (no escaping).
gene_pair_trip["Title"] = [
    f'<span title="{title}">{title}</span>'
    for title in gene_pair_trip["Title"]
]

# Missing/blank Journal -> "bioRxiv".
gene_pair_trip["Journal"] = gene_pair_trip["Journal"].apply(
    lambda x: "bioRxiv" if pd.isna(x) or str(x).strip().lower() in ["nan", "none", "NaN", ""] else x
)


# All-digit PMIDs link to PubMed; any other value is used verbatim as the href of a "BioRxiv" link.
# NOTE(review): PMID was cast to str above, so a missing PMID presumably arrives here as the
# text "nan" and takes the else branch — confirm.
gene_pair_trip["PMID"] = [
    f'<a href="https://pubmed.ncbi.nlm.nih.gov/{pmid}" target="_blank">{pmid}</a>'
    if pd.notna(pmid) and str(pmid).isdigit()
    else f'<a href="{pmid}" target="_blank">BioRxiv</a>'
    for pmid in gene_pair_trip["PMID"]
]
####

# Show the abbreviated journal name with the full name as a tooltip.
# `mapping` is rebound here; the source-shortname mapping from above is no longer available.
df_annot=pd.read_csv("data/journal_abbv.csv")
mapping = dict(zip(df_annot['Journal Name'], df_annot['Abbreviation']))
# Replace values in the column based on the mapping; journals without an abbreviation keep their full name
gene_pair_trip["JournalAbbv"] = gene_pair_trip['Journal'].replace(mapping)
gene_pair_trip["Journal"] = [
    f'<span title="{Journal}">{JournalAbbv}</span>'
    for Journal, JournalAbbv in zip(gene_pair_trip["Journal"], gene_pair_trip["JournalAbbv"])
    ]
gene_pair_trip = gene_pair_trip.drop(columns=['JournalAbbv'])

# Make ID unique
# Sorting by ascending Year first means the suffixes are handed out in that row order;
# the table is then re-sorted newest-first.
# NOTE(review): the ID column is addressed by position (index 6) — confirm it is still
# the intended column if any merge/drop above changes.
gene_pair_trip = gene_pair_trip.sort_values(by='Year', ascending=True)
gene_pair_trip[gene_pair_trip.columns[6]] = make_ids_unique(gene_pair_trip[gene_pair_trip.columns[6]])
gene_pair_trip = gene_pair_trip.sort_values(by='Year', ascending=False)

In [None]:
## Function to prepare functional annotation datatable
import sys, os
import re
from itables import init_notebook_mode
import pandas as pd
from itables import show
from itables import options
from IPython.display import HTML, display
import numpy as np
from bs4 import BeautifulSoup
from createDataTable import gene_pair0, gene_pair, top_pathway_df
from fetchGSheet import gene_group
import warnings

# Start from an independent copy of the identifying columns of gene_pair0.
gene_pair_annot = gene_pair0[["Interaction ID", "Human LR Pair", "Cancer-related", "Ligand symbol and aliases",  "Receptor symbol and aliases"]].copy()
# Diseases
df= pd.read_csv("data/disease_annotations_per_pair.csv")
df_cat=pd.read_csv("data/disease_categories.csv")
mapping = dict(zip(df_cat['Disease Name'], df_cat['Category']))
# Replace values in the column based on the mapping; a disease with no category keeps its own name
df["Disease Type"] = df['disease'].replace(mapping)
# Left merge: every pair is kept; a pair with several disease rows yields several output rows.
gene_pair_annot = gene_pair_annot.merge(df, how='left', left_on='Human LR Pair', right_on='interaction')
gene_pair_annot = gene_pair_annot.drop(columns=["interaction"])
# PROGENy Pathway retrieved via LIANA+
# `df` is rebound to the pathway table here; the disease table is no longer reachable by that name.
df= pd.read_csv("data/pathway_annotations_per_pair.csv") 
gene_pair_annot = gene_pair_annot.merge(df, how='left', left_on='Human LR Pair', right_on='interaction')
gene_pair_annot = gene_pair_annot.drop(columns=["interaction", "weight"])

# Give the merged-in columns their display names.
gene_pair_annot = gene_pair_annot.rename(columns={
                                     "disease": "Disease", 
                                     "source": "PROGENy Pathway"}
                            )

In [None]:
# Bring in KEGG Pathways from AU side
gene_pair_annot = gene_pair_annot.merge(top_pathway_df, how='left', left_on='Human LR Pair', right_on='LR Pair')

# reorder
gene_pair_annot = gene_pair_annot[["Interaction ID", "Human LR Pair", "Disease", "Disease Type", "Cancer-related", "KEGG Pathway ID", "KEGG Pathway", "KEGG relationship", "PROGENy Pathway", "Ligand symbol and aliases",  "Receptor symbol and aliases"]]

In [None]:
def _blank_to_unknown(x):
    """Return "unknown" for a missing or blank value, otherwise the value unchanged."""
    if pd.isna(x) or str(x).strip().lower() in ["nan", "none", ""]:
        return "unknown"
    return x


# Apply the same placeholder rule to every annotation column.
for _annot_col in [
    "Disease",
    "Disease Type",
    "PROGENy Pathway",
    "KEGG Pathway",
    "KEGG Pathway ID",
    "KEGG relationship",
]:
    gene_pair_annot[_annot_col] = gene_pair_annot[_annot_col].apply(_blank_to_unknown)

In [None]:

gene_pair_annot = gene_pair_annot.reset_index(drop=True).copy()
gene_pair_annot["Interaction ID"] = gene_pair_annot["Interaction ID"].apply(
    lambda x: f"<a href='https://comp.med.yokohama-cu.ac.jp/collab/connectomeDB/database/filter/{x}.html'>{x}</a>"
)


In [None]:
# Separate Disease and Pathway and then rm duplicates
gene_pair_disease = gene_pair_annot[["Interaction ID", "Human LR Pair", "Disease", "Disease Type", "Cancer-related", "Ligand symbol and aliases",  "Receptor symbol and aliases"]]
gene_pair_disease = gene_pair_disease.drop_duplicates()
gene_pair_disease=gene_pair_disease.reset_index(drop=True)  


In [None]:
def generate_perplexity_link(row):
    """Build the HTML for a Perplexity search icon link about one LR pair.

    `row` must provide "Disease" and "Human LR Pair". When the disease is
    missing or the literal "unknown", the query asks which disease the pair is
    associated with; otherwise it asks about the pair's role in that disease.
    Returns the `<a>` element (wrapping an icon `<img>`) as a string.
    """
    disease = row["Disease"]
    disease_known = not (pd.isna(disease) or disease == "unknown")

    if disease_known:
        query = f"What-is-the-role-of-the-ligand-and-receptor-pair-{row['Human LR Pair']}-in-{disease}"
    else:
        query = f"What-disease-is-the-{row['Human LR Pair']}-associated-with"

    anchor_open = f'<a href="https://www.perplexity.ai/search?q={query}" target="_blank">'
    icon = '<img src="https://img.icons8.com/?size=30&id=0NbBuNOxUwps&format=png&color=000000" alt="Perplexity AI" /></a>'
    return anchor_open + icon

gene_pair_disease["Perplexity"] = gene_pair_disease.apply(generate_perplexity_link, axis=1)


In [None]:
# Create the links to the HTML cards (the pair name is both the file name and the link text)
gene_pair_disease["Human LR Pair"] = [
    f'<a href="https://comp.med.yokohama-cu.ac.jp/collab/connectomeDB/cards/{pair}.html">{pair}</a>'
    for pair in gene_pair_disease["Human LR Pair"]
]

### Pop-up for disease, disease type: wrap each value in a span whose tooltip repeats it
for _tooltip_col in ("Disease", "Disease Type"):
    gene_pair_disease[_tooltip_col] = [
        f'<span title="{label}">{label}</span>'
        for label in gene_pair_disease[_tooltip_col]
    ]

gene_pair_pathway = gene_pair_annot[["Interaction ID", "Human LR Pair",  "KEGG Pathway ID", "KEGG Pathway", "KEGG relationship", "PROGENy Pathway", "Ligand symbol and aliases",  "Receptor symbol and aliases"]]
gene_pair_pathway = gene_pair_pathway.drop_duplicates()
gene_pair_pathway=gene_pair_pathway.reset_index(drop=True)  
def generate_perplexity_link_pathway(row):
    """Build the HTML for a Perplexity search icon link about one LR pair's pathway.

    The KEGG pathway is used as the topic when it is present; otherwise the
    PROGENy pathway; a value counts as absent when it is missing or the
    literal "unknown". With neither available the query asks which biological
    pathway the pair is associated with. Returns the `<a>` element as a string.
    """
    def _absent(value):
        return pd.isna(value) or value == "unknown"

    if not _absent(row["KEGG Pathway"]):
        query = f"What-is-the-role-of-the-ligand-and-receptor-pair-{row['Human LR Pair']}-in-{row['KEGG Pathway']}"
    elif not _absent(row["PROGENy Pathway"]):
        query = f"What-is-the-role-of-the-ligand-and-receptor-pair-{row['Human LR Pair']}-in-{row['PROGENy Pathway']}"
    else:
        query = f"What-biological-pathway-is-the-{row['Human LR Pair']}-associated-with"

    anchor_open = f'<a href="https://www.perplexity.ai/search?q={query}" target="_blank">'
    icon = '<img src="https://img.icons8.com/?size=30&id=0NbBuNOxUwps&format=png&color=000000" alt="Perplexity AI" /></a>'
    return anchor_open + icon

gene_pair_pathway["Perplexity"] = gene_pair_pathway.apply(generate_perplexity_link_pathway, axis=1)


In [None]:
# Create the links to the HTML cards (the pair name is both the file name and the link text)
gene_pair_pathway["Human LR Pair"] = [
    f'<a href="https://comp.med.yokohama-cu.ac.jp/collab/connectomeDB/cards/{pair}.html">{pair}</a>'
    for pair in gene_pair_pathway["Human LR Pair"]
]

### Pop-up for KEGG Pathway: the tooltip repeats the displayed pathway name
kegg_names = gene_pair_pathway["KEGG Pathway"]
gene_pair_pathway["KEGG Pathway"] = [f'<span title="{name}">{name}</span>' for name in kegg_names]


In [None]:
gene_pair_annot2 = gene_pair0[[
    'Ligand HGNC ID', 'Receptor HGNC ID', 
    "Ligand symbol and aliases",  
    "Receptor symbol and aliases",
    'Ligand location', 'Receptor location',
]].copy()

# Extract HGNC IDs cleanly using regex only if string is valid
def extract_hgnc_id(text):
    """Return the first "HGNC:<digits>" token found in `text`, or None.

    None is returned both for a missing value (`pd.isna`) and when the text
    contains no such token. Non-string input is converted with `str()` first.
    """
    if pd.isna(text):
        return None
    found = re.search(r'(HGNC:\d+)', str(text))
    if found is None:
        return None
    return found.group(1)

gene_pair_annot2["ligand_hgnc_id"] = gene_pair_annot2["Ligand HGNC ID"].apply(extract_hgnc_id)
gene_pair_annot2["receptor_hgnc_id"] = gene_pair_annot2["Receptor HGNC ID"].apply(extract_hgnc_id)


In [None]:
gene_pair_annot2.columns

In [None]:
# Add Disease (specific) to cards: one comma-separated disease string per interaction
df= pd.read_csv("data/disease_annotations_per_pair.csv")
df = df.groupby('interaction')['disease'].apply(', '.join).reset_index()
mapping = dict(zip(df['interaction'],df['disease']))
gene_pair0["Disease"] = gene_pair0['Human LR Pair'].replace(mapping)


def _blank_to_ask_perplexity(x):
    """Return "ask Perplexity" for a missing, blank or "unknown" value, otherwise the value."""
    if pd.isna(x) or str(x).strip().lower() in ["nan", "none", "", "unknown"]:
        return "ask Perplexity"
    return x


for _card_col in ["Disease", "Disease Type", "PROGENy Pathway", "KEGG Pathway"]:
    gene_pair0[_card_col] = gene_pair0[_card_col].apply(_blank_to_ask_perplexity)

# if only one replace gene_pair0 to e.g. 
gene_pair_input = gene_pair0[gene_pair0["Human LR Pair"] == "A2M HSPA5"]
gene_pair_input
# The single-pair selection above is immediately superseded by this two-pair selection.
gene_pair_input = gene_pair0[gene_pair0["Human LR Pair"].isin(["A2M HSPA5", "APOE LRP1"])]


In [None]:
# Diseases
# Collapse the per-pair disease table to one comma-separated string per interaction
# and write it to gene_pair0["Disease"]; pairs without an entry keep their pair name,
# because Series.replace leaves unmapped values unchanged.
# NOTE(review): ', '.join presumably requires every 'disease' value to be a string — confirm
# the CSV has no blanks in that column.
df= pd.read_csv("data/disease_annotations_per_pair.csv")
df = df.groupby('interaction')['disease'].apply(', '.join).reset_index()
mapping = dict(zip(df['interaction'],df['disease']))
gene_pair0["Disease"]  = gene_pair0['Human LR Pair'].replace(mapping)
gene_pair0

In [None]:

human_gene_pair = human_gene_pair[["Interaction ID", "Human LR Pair", "Ligand", 
                                   "Ligand symbol and aliases", "Ligand HGNC ID",
                                   "Ligand location", "Receptor", 
                                   "Receptor symbol and aliases", "Receptor HGNC ID",
                                   "Receptor location", "PMID", "Perplexity",
                                   "Database Source", "binding location",
                                   "bind in trans?", "bidirectional signalling?",
                                   "interaction type", "KEGG Pathway", "Cancer-related",
                                   "Disease Type"]]

In [None]:
import scanpy as sc
import anndata
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import requests
import scanpy as sc
import re

output_dir = "data/tabula_sapiens/"
os.makedirs(output_dir, exist_ok=True)
os.makedirs(output_dir +"umap" , exist_ok=True)
os.makedirs(output_dir +"heatmap" , exist_ok=True)

output_file = "data/tissue_dataset.h5ad"

if os.path.exists(output_file):
    # Do only if the file exists
    print("File exists. Proceeding with the task.")

    # Rest of the logic that should only run if file exists
else:
    url = "https://datasets.cellxgene.cziscience.com/9daa676b-07ec-4cea-80aa-daa49200aa64.h5ad"
    #Tabula Sapiens is a benchmark, first-draft human cell atlas of over 1.1M cells from 28 organs of 24 normal human subjects. This work is the product of the Tabula Sapiens Consortium. Taking the organs from the same individual controls for genetic background, age, environment, and epigenetic effects, and allows detailed analysis and comparison of cell types that are shared between tissues.
    # Get file size for progress bar. requests.head does not follow redirects
    # unless asked to, so enable them to read the Content-Length of the real file.
    response = requests.head(url, allow_redirects=True, timeout=(10, 60))
    total_size = int(response.headers.get('Content-Length', 0))

    # Stream into a temporary ".part" file and only move it into place once the
    # download has finished. An interrupted download therefore never leaves a
    # truncated file at `output_file`, which the exists-check above would
    # otherwise accept on the next run.
    partial_file = output_file + ".part"

    # Stream download with tqdm; the timeouts (connect, read) keep a stalled
    # connection from hanging the cell forever.
    with requests.get(url, stream=True, timeout=(10, 300)) as r:
        r.raise_for_status()
        with open(partial_file, 'wb') as f, tqdm(
            total=total_size, unit='B', unit_scale=True, desc=output_file
        ) as pbar:
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                if chunk:
                    f.write(chunk)
                    pbar.update(len(chunk))
    os.replace(partial_file, output_file)

# Load with scanpy
adata = sc.read_h5ad(output_file)
print(adata)
print(adata.obs.columns) 
print(adata.var_names)    # Gene names

In [None]:
import pandas as pd
import re
from createDataTable import gene_pair0, pop_up_info

# Build HGNC-to-Ensembl mapping
ensembl_id = dict(zip(pop_up_info['HGNC ID'], pop_up_info['ensembl_gene_id']))

# Define HGNC ID extractor
def extract_hgnc_id(text):
    """Return the first "HGNC:<digits>" token found in `text`, or None.

    None is returned both for a missing value (`pd.isna`) and when the text
    contains no such token. Non-string input is converted with `str()` first.
    """
    if pd.isna(text):
        return None
    found = re.search(r'(HGNC:\d+)', str(text))
    return None if found is None else found.group(1)

def _symbol_and_hgnc(symbol_col, hgnc_col):
    """Copy two columns of gene_pair0 and give them the standard names."""
    frame = gene_pair0[[symbol_col, hgnc_col]].copy()
    frame.columns = ['gene_symbol', 'hgnc_id']  # Standardize column names
    return frame


# --- Prepare ligand and receptor dataframes with identical layouts ---
ligand_df = _symbol_and_hgnc('Ligand', 'Ligand HGNC ID')
receptor_df = _symbol_and_hgnc('Receptor', 'Receptor HGNC ID')

# Stack both, clean the HGNC IDs, map them to Ensembl IDs, then drop
# duplicate rows and rows without an Ensembl ID.
gene_pair_input = (
    pd.concat([ligand_df, receptor_df], ignore_index=True)
    .assign(hgnc_id=lambda genes: genes['hgnc_id'].apply(extract_hgnc_id))
    .assign(ensembl_id=lambda genes: genes['hgnc_id'].map(ensembl_id))
    .drop_duplicates()
    .dropna(subset=['ensembl_id'])
)

# Final output
print(gene_pair_input.head())


In [None]:
# --- Imports ---
import os
import numpy as np
import pandas as pd
from scipy.sparse import issparse, csr_matrix
import plotly.express as px
from joblib import Parallel, delayed
from tqdm import tqdm
# --- Ensure Output Directories Exist ---
# output_dir is concatenated directly, so it must end with a path separator.
os.makedirs(f"{output_dir}heatmap", exist_ok=True)
os.makedirs(f"{output_dir}umap", exist_ok=True)

# --- Annotations ---
# Per-cell labels as plain string arrays, in the row order of adata.obs.
tissues = adata.obs["tissue_in_publication"].astype(str).values
cell_types = adata.obs["cell_type"].astype(str).values
assert len(tissues) == adata.n_obs, "Tissue annotation length mismatch"
assert len(cell_types) == adata.n_obs, "Cell type annotation length mismatch"

# --- Gene Lists and Labels ---
gene_id_list = gene_pair_input["ensembl_id"].tolist()

# --- Limit to First Few for Testing ---
test_mode = False  # ← change to True if testing
max_genes = 3 if test_mode else None
# With test_mode off, max_genes is None and the full list is kept.
if test_mode:
    gene_id_list = gene_id_list[:max_genes]

# Strip versions from gene IDs if needed
def strip_version(gid):
    """Return `gid` up to (not including) its first "."; IDs without a dot are returned as-is."""
    if "." not in gid:
        return gid
    unversioned, _dot, _rest = gid.partition(".")
    return unversioned

gene_id_list_stripped = [strip_version(g) for g in gene_id_list]

# Map each version-stripped Ensembl ID to the gene symbol found on the same
# row of gene_pair_input, limited to the IDs in gene_id_list.
# Pairing row-wise avoids a `.set_index("ensembl_id").loc[gene_id_list]`
# lookup, which returns extra rows whenever an Ensembl ID occurs more than
# once and would then silently shift labels when zipped against
# gene_id_list_stripped. If an ID does repeat, the symbol of its last row wins.
_selected_ids = set(gene_id_list)
gene_label_map = {
    strip_version(gene_id): symbol
    for gene_id, symbol in zip(gene_pair_input["ensembl_id"], gene_pair_input["gene_symbol"])
    if gene_id in _selected_ids
}

In [None]:
# If you want to save the scaled data again
# --- Precompute Scaled Expression ---
def precompute_scaled_expression(adata, gene_id_list):
    """Collect the "scale_data" layer column of each requested gene as a flat array.

    Parameters
    ----------
    adata : object exposing `.layers` and `.var_names`
        Must contain a layer named "scale_data" (indexed cells x genes).
    gene_id_list : iterable
        Gene IDs to look up; they are compared against the version-stripped
        entries of `adata.var_names`.

    Returns
    -------
    (dict, list)
        A dict mapping each gene ID that was found to its flattened column,
        and the list of gene IDs that were not found.

    Raises
    ------
    ValueError
        If the "scale_data" layer is absent.
    """
    if "scale_data" not in adata.layers:
        raise ValueError("Layer 'scale_data' not found in adata.")

    scaled = adata.layers["scale_data"]  # shape: cells x genes

    # Column index of every version-stripped gene ID (a later duplicate overwrites an earlier one).
    column_of = {}
    for position, var_name in enumerate(adata.var_names):
        column_of[strip_version(var_name)] = position

    gene_expr_map, missing_genes = {}, []
    for gene_id in tqdm(gene_id_list, desc="Precomputing scaled expression"):
        position = column_of.get(gene_id)
        if position is None:
            missing_genes.append(gene_id)
            continue
        column = scaled[:, position]  # column slice, not transposed
        # Objects offering .toarray() (e.g. sparse slices) are densified first.
        dense = column.toarray() if hasattr(column, "toarray") else column
        gene_expr_map[gene_id] = dense.flatten()

    print(f"[✓] Precomputed expression for {len(gene_expr_map)} genes.")
    if missing_genes:
        print(f"[!] Skipped {len(missing_genes)} genes not found in adata.var_names.")

    return gene_expr_map, missing_genes

# uncomment for now so it is not accidentally ran
# gene_expr_map, _ = precompute_scaled_expression(adata, gene_id_list_stripped)

# --- Plot Function ---
def plot_gene_umap_expression(adata, gene_id, gene_name, output_dir, gene_expr_map=None):
    """Write an interactive UMAP scatter of one gene's expression to HTML.

    Expression comes from `gene_expr_map[gene_id]` when that entry exists,
    otherwise from `adata[:, gene_id].X`; a gene found in neither is reported
    and skipped. Hover text shows the module-level `tissues` and `cell_types`
    arrays. The figure is saved as `<output_dir>/umap/<gene_name>.html`.
    Any exception is caught and printed rather than raised; nothing is returned.
    """
    try:
        if gene_expr_map and gene_id in gene_expr_map:
            expr = gene_expr_map[gene_id]
        elif gene_id not in adata.var_names:
            print(f"[!] {gene_id} not in adata.var_names")
            return
        else:
            raw_expr = adata[:, gene_id].X
            expr = raw_expr.toarray().flatten() if issparse(raw_expr) else raw_expr.flatten()

        # One row per cell: coordinates, expression and the hover annotations.
        df = pd.DataFrame(adata.obsm["X_umap"], columns=["UMAP1", "UMAP2"]).assign(
            expression=expr,
            tissue=tissues,
            cell_type=cell_types,
        )

        fig = px.scatter(
            df,
            x="UMAP1",
            y="UMAP2",
            color="expression",
            color_continuous_scale="viridis",
            title=f"{gene_name} Expression on UMAP",
            width=600,
            height=600,
            render_mode="webgl",
            custom_data=["tissue", "cell_type"],
        )

        hover_text = (
            "Tissue: %{customdata[0]}<br>"
            "Cell type: %{customdata[1]}<br>"
            "Expression: %{marker.color:.2f}"
        )
        fig.update_traces(marker=dict(size=3), hovertemplate=hover_text)

        # Both axes are stripped of grid, zero line and tick labels.
        bare_axis = dict(showgrid=False, zeroline=False, showticklabels=False)
        fig.update_layout(
            plot_bgcolor='white',
            paper_bgcolor='white',
            legend_title="Expr",
            xaxis=dict(bare_axis),
            yaxis=dict(bare_axis),
        )

        out_path = os.path.join(output_dir, "umap", f"{gene_name}.html")
        fig.write_html(out_path, include_plotlyjs="cdn")
        del fig  # free memory
        print(f"[✓] UMAP written: {out_path}")
    except Exception as e:
        print(f"[✗] Error plotting {gene_name} ({gene_id}): {e}")
        

In [None]:
# --- Wrapper for Parallel Execution ---
def generate_umap_wrapper(gene_id_original):
    """Plot one gene: strip the ID's version, look up its label (falling back to the ID) and write the UMAP."""
    gene_id = strip_version(gene_id_original)
    gene_label = gene_label_map.get(gene_id, gene_id)
    # NOTE(review): gene_expr_map is a notebook global that is not assigned in this cell;
    # in the visible cells it is only set further down (chunk loop / pickle load) — confirm
    # it exists before this cell runs on a fresh kernel.
    plot_gene_umap_expression(adata, gene_id, gene_label, output_dir, gene_expr_map)
# --- Run in Parallel ---
# 16 worker threads; each task handles one gene ID from gene_id_list.
Parallel(n_jobs=16, backend="threading")(
    delayed(generate_umap_wrapper)(gene_id)
    for gene_id in tqdm(gene_id_list, desc="Generating UMAPs")
)


In [None]:
# --- Chunking Logic --- MAKE SURE you remove the gene_expr_map line of code from above
# Genes are processed in batches of chunk_size: expression is precomputed for one batch,
# that batch is plotted, and gene_expr_map is replaced by the next batch's map.
chunk_size = 424
num_genes = len(gene_id_list_stripped)

for i in range(0, num_genes, chunk_size):
    # Ceiling division gives the total number of chunks.
    print(f"\n[INFO] Processing chunk {i // chunk_size + 1} of {(num_genes - 1) // chunk_size + 1}")
    
    # Get the current batch
    batch_gene_ids = gene_id_list_stripped[i:i + chunk_size]
    
    # Precompute expression for this chunk (the list of missing genes is discarded)
    gene_expr_map, _ = precompute_scaled_expression(adata, batch_gene_ids)

    # Wrapper for plotting in parallel; redefined on every iteration and reads the
    # global gene_expr_map at call time. Takes already-stripped IDs.
    def generate_umap_wrapper(gene_id):
        gene_label = gene_label_map.get(gene_id, gene_id)
        plot_gene_umap_expression(adata, gene_id, gene_label, output_dir, gene_expr_map)

    # Plot in parallel
    Parallel(n_jobs=16, backend="threading")(
        delayed(generate_umap_wrapper)(gene_id)
        for gene_id in tqdm(batch_gene_ids, desc="Generating UMAPs")
    )


In [None]:
# normalize all data and log
# sc.pp.normalize_total(adata, target_sum=1e4)
# sc.pp.log1p(adata)
# sc.pp.pca(adata)
# sc.pp.neighbors(adata)
# sc.tl.umap(adata)
# UMAP colored by gene expression
#sc.pl.umap(adata, color=gene_pair_input["ensembl_id"][0])

In [None]:
# test_genes = ["ENSG00000141510", "ENSG00000148773"]  # Replace with your own test gene IDs

# gene_id_list = test_genes  # or your full list of Ensembl IDs
# gene_expr_map, missing_genes = precompute_scaled_expression(adata, gene_id_list)
# for gene_id in test_genes:
#     gene_name = gene_label_map.get(gene_id, gene_id)
#     plot_gene_umap_expression(adata, gene_id, gene_name, output_dir, gene_expr_map)


In [None]:
X = adata.layers["scale_data"]  # shape: cells x genes
var_names_stripped = [strip_version(g) for g in adata.var_names]
gene_indices = {gene_id: i for i, gene_id in enumerate(var_names_stripped)}

In [None]:
# --- Normalize Ensembl IDs and build label map ---
def strip_version(ensembl_id):
    """Return the part of `ensembl_id` before its first "." (the whole ID if it has none)."""
    unversioned = ensembl_id.split('.', 1)[0]
    return unversioned

# Rebuild gene_label_map using stripped Ensembl IDs, pairing each ID with the
# symbol on the same row (a repeated ID keeps the symbol of its last row).
gene_label_map = {
    strip_version(raw_id): symbol
    for raw_id, symbol in zip(gene_pair_input["ensembl_id"], gene_pair_input["gene_symbol"])
}

# Strip version from gene_id_list
gene_id_list = list(map(strip_version, gene_id_list))

In [None]:
# --- Filter to valid genes ---
# Split gene_id_list into IDs that have a column in gene_indices and IDs that do not.
valid_gene_ids = [g for g in gene_id_list if g in gene_indices]
# Column positions of the valid genes, in the same order as valid_gene_ids.
rows = [gene_indices[g] for g in valid_gene_ids]
missing_genes = [g for g in gene_id_list if g not in gene_indices]
print(f"[!] Skipped {len(missing_genes)} genes not found in adata.var_names.")
len(valid_gene_ids)

In [None]:
tissues = adata.obs["tissue_in_publication"].astype(str).values
np.unique(tissues)

In [None]:
cell_type_ont = adata.obs["broad_cell_class"].astype(str).values
np.unique(cell_type_ont)

In [None]:
cell_cat = pd.read_csv("data/grouping_for_heatmap.csv")
mapping = dict(zip(cell_cat["cell_type"], cell_cat["category"]))

In [None]:
import pickle

# If you want to save the scaled data again
# with open("data/gene_expr_map_scaled.pkl", "wb") as f:
#     pickle.dump(gene_expr_map, f)

# Load the previously saved expression map, replacing any gene_expr_map already in memory.
# NOTE(review): pickle.load can execute arbitrary code from the file — only load a
# pickle that was produced locally by this notebook.
with open("data/gene_expr_map_scaled.pkl", "rb") as f:
    gene_expr_map = pickle.load(f)


In [None]:
gene_expr_map['ENSG00000175899'].max()

In [None]:
# Enable test mode
test_mode = False
max_genes = 3 if test_mode else None

In [None]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
from joblib import Parallel, delayed
from tqdm import tqdm
from plotly.subplots import make_subplots
from scipy.cluster.hierarchy import linkage, leaves_list
from scipy.spatial.distance import pdist

# In test mode only the first `max_genes` genes are plotted.
if test_mode:
    valid_gene_ids = valid_gene_ids[:max_genes]

# Diverging colour scale: blue (low) -> light gray (middle) -> red (high)
colorscale_theme = [[0.0, "blue"], [0.5, "lightgray"], [1.0, "red"]]

# Distinct cell classes, alphabetically
unique_classes = sorted(set(cell_type_ont))
# Define color palette
color_palette = px.colors.qualitative.Set2

# Map: cell_type → cell_class
cell_type_to_class = dict(zip(cell_types, cell_type_ont))


def _class_then_name(ct):
    """Sort key: cell class first, then the cell type name."""
    return (cell_type_to_class[ct], ct)


# Sort cell types by cell class and name
sorted_cell_types = sorted(set(cell_types), key=_class_then_name)

# Cell classes in order of first appearance along the sorted y-axis
# (dict.fromkeys removes repeats while keeping that order)
sorted_classes_from_yaxis = list(
    dict.fromkeys(cell_type_to_class[ct] for ct in sorted_cell_types)
)
seen_classes = set(sorted_classes_from_yaxis)

# Assign color per class using y-axis order, cycling through the palette
class_to_color = {}
for position, cls in enumerate(sorted_classes_from_yaxis):
    class_to_color[cls] = color_palette[position % len(color_palette)]

# Assign color to cell types based on class
cell_type_to_color = {
    ct: class_to_color[cell_type_to_class[ct]] for ct in set(cell_types)
}

# Counts of distinct cell types, cell classes and tissues, as strings
n_cell = str(len(np.unique(cell_types)))
n_cellGroup = str(len(np.unique(cell_type_ont)))
n_tissue = str(len(np.unique(tissues)))

# Lookup used to assign rows to heatmap panels
cell_cat = pd.read_csv("data/grouping_for_heatmap.csv")
mapping = dict(zip(cell_cat["cell_type"], cell_cat["category"]))

def _panel_y_pos(panel_number, normalized_heights):
    """Return the paper-coordinate y used to place decorations for a 1-based panel.

    The base position is 1 minus the summed relative heights of all panels
    above this one; panels 2 to 5 are then shifted down by a fixed offset.
    """
    base_y_pos = 1 - sum(normalized_heights[:panel_number - 1])
    # Fixed per-panel offsets; a panel without an entry uses the base position.
    offsets = {2: 0.010, 3: 0.035, 4: 0.023, 5: 0.026}
    return base_y_pos - offsets.get(panel_number, 0)


def generate_heatmap(gene_id, gene_label, expr):
    """Write a multi-panel tissue x cell-type heatmap of one gene's expression to HTML.

    Mean expression is computed per (tissue, cell type); tissues (columns) are
    ordered by average-linkage hierarchical clustering on Euclidean distance,
    and cell types (rows) are split into one stacked subplot per panel. All
    panels share one colour range.

    Reads these module-level names: `tissues`, `cell_types`, `cell_type_ont`,
    `cell_type_to_class`, `mapping`, `colorscale_theme`, `output_dir`.

    Parameters
    ----------
    gene_id
        Accepted for call compatibility; not used inside the function.
    gene_label : str
        Used in the figure title and as the output file name.
    expr : array-like
        Per-cell expression values, aligned with `tissues` / `cell_types` /
        `cell_type_ont`.

    Returns
    -------
    None
        The figure is saved as `<output_dir>heatmap/<gene_label>.html`.
    """
    # --- Prepare data ---
    df = pd.DataFrame({
        "expression": expr,
        "tissue": tissues,
        "cell_type": cell_types,
        "cell_class": cell_type_ont
    })

    # Panel of each cell: its class looked up in `mapping`; classes without an
    # entry keep their own name (Series.replace leaves them unchanged).
    # NOTE(review): `mapping` is built from a CSV column named "cell_type" but is
    # applied to the cell *class* here — confirm the CSV keys are class names.
    df['cell_panel'] = df['cell_class'].replace(mapping)

    # Ordered categorical cell types
    sorted_cell_types = sorted(set(cell_types), key=lambda ct: (cell_type_to_class[ct], ct))
    df["cell_type"] = pd.Categorical(df["cell_type"], categories=sorted_cell_types, ordered=True)

    # --- Global pivot ---
    # observed=False is stated explicitly: "cell_type" is categorical, and this
    # pins the current behaviour (every tissue/cell-type combination is kept,
    # unobserved ones as NaN) instead of relying on the pandas default.
    pivot_df = df.groupby(["tissue", "cell_type"], observed=False).agg(
        mean_expression=("expression", "mean"),
        cell_class=("cell_class", "first")
    ).reset_index()

    global_heatmap = pivot_df.pivot(index="cell_type", columns="tissue", values="mean_expression")

    # Global zmin/zmax so every panel uses the same colour range
    global_zmin = global_heatmap.min().min()
    global_zmax = global_heatmap.max().max()

    # Mapping for customdata: cell type -> class of its first row
    cell_type_class_map = df.drop_duplicates("cell_type").set_index("cell_type")["cell_class"].to_dict()

    # --- Tissue clustering ---
    # Missing means count as 0 for the distance computation only.
    clustered_tissues = global_heatmap.T.fillna(0)
    linkage_matrix = linkage(pdist(clustered_tissues, metric='euclidean'), method='average')
    clustered_tissue_order = clustered_tissues.index[leaves_list(linkage_matrix)]
    global_heatmap = global_heatmap[clustered_tissue_order]

    # --- Panels setup ---
    panel_names = sorted(df['cell_panel'].dropna().unique())
    n_panels = len(panel_names)

    # Panel size logic: each panel's height is proportional to its number of cell types
    row_heights = []
    for panel in panel_names:
        n = len(df[df['cell_panel'] == panel]["cell_type"].unique())
        row_heights.append(n)
    total_rows = sum(row_heights)
    normalized_heights = [h / total_rows for h in row_heights]

    # Only 1 column (heatmap), not 2
    fig = make_subplots(
        rows=n_panels, cols=1,
        shared_xaxes=False,
        shared_yaxes=True,
        vertical_spacing=0.035,
        row_heights=normalized_heights,
        specs=[[{"type": "heatmap"}]] * n_panels
    )

    for i, panel in enumerate(panel_names, start=1):
        panel_cell_types = df[df['cell_panel'] == panel]["cell_type"].unique()
        panel_cell_types = sorted(panel_cell_types, key=lambda ct: cell_type_class_map.get(ct, ""))
        heatmap_data = global_heatmap.loc[global_heatmap.index.intersection(panel_cell_types)]
        y_labels = heatmap_data.index.astype(str).tolist()

        # One class label per heatmap cell, shown in the hover text
        hover_class_data = np.array([[cell_type_class_map[ct]] * len(heatmap_data.columns) for ct in y_labels])

        # --- Heatmap only ---
        y_pos = _panel_y_pos(i, normalized_heights)
        fig.add_trace(go.Heatmap(
            z=heatmap_data.values,
            x=heatmap_data.columns,
            y=y_labels,
            customdata=hover_class_data,
            colorscale=colorscale_theme,
            zmin=global_zmin,
            zmax=global_zmax,
            showscale=True,
            colorbar=dict(
                title="Log(Gene Expression)",
                orientation="h",
                x=-0.7,
                xanchor="left",
                y=y_pos,
                len=0.9,
                thickness=10
            ),
            hovertemplate="Tissue: %{x}<br>Cell type: %{y}<br>Class: %{customdata}<br>Expr: %{z:.2f}<extra></extra>"
        ), row=i, col=1)

        # Axes
        fig.update_xaxes(#title_text=panel,
                         showticklabels=True,
                         tickangle=270,
                         tickfont=dict(size=9), row=i, col=1)
        fig.update_yaxes(
            autorange="reversed",
            tickfont=dict(size=9),
            title_text=panel,
            title_font=dict(size=13, color="black"),
            title_standoff=5,
            row=i,
            col=1
        )

        # Compute the center y-position of this panel (in data coordinates)
        mid_index = len(y_labels) // 2
        mid_cell = y_labels[mid_index] if y_labels else ""

        fig.add_annotation(
            text=f"<b>{panel}</b>",
            xref=f'x{i}',  # panel's x-axis
            yref=f'y{i}',  # panel's y-axis
            x=heatmap_data.columns[len(heatmap_data.columns) // 2],  # center tissue
            y=mid_cell,  # center cell type
            showarrow=False,
            font=dict(size=14, color='black'),
            xanchor="center",
            yanchor="middle"
        )


    # Layout
    # NOTE(review): `annotations=` is passed here after annotations were already
    # added in the loop above — confirm how this is meant to combine with them.
    fig.update_layout(
        width=585,
        height=total_rows * 20,
        plot_bgcolor="white",
        paper_bgcolor="white",
        margin=dict(l=1, r=1, t=80, b=100),
        title=dict(
            text=f"{gene_label} Expression",
            x=0.7,
            y= 0.995,
            xanchor="center",
            yanchor="top",
            font=dict(size=18, color="black")
        ),
        annotations=[
            dict(
                text=f"{gene_label} Expression",
                x=0.5,
                y=-0.03,
                xref="paper",
                yref="paper",
                xanchor="center",
                yanchor="bottom",
                showarrow=False,
                font=dict(size=18, color="black")
            )
        ]
    )
    # Add panel titles globally above each subplot, at the same y as the panel's colorbar
    for i, panel in enumerate(panel_names, start=1):
        y_pos = _panel_y_pos(i, normalized_heights)

        fig.add_annotation(
            text=f"<b>{panel}</b>",
            x=0.5,
            y=y_pos,
            xref="paper",
            yref="paper",
            showarrow=False,
            font=dict(size=14),
            xanchor="center",
            yanchor="bottom"
        )



    # Save
    output_path = f"{output_dir}heatmap/{gene_label}.html"
    fig.write_html(output_path, include_plotlyjs="cdn")
    print(f"[✓] Heatmap written: {output_path}")



def generate_heatmap_wrapper(gene_id):
    """Generate the heatmap for one gene, printing failures instead of raising.

    The gene's label is read from ``gene_label_map[gene_id]`` and its
    expression data from ``gene_expr_map[gene_id]``; both are then passed,
    together with ``gene_id``, to ``generate_heatmap``.

    Any exception (including a missing key in either lookup) is caught and
    reported with ``print`` so that one failing gene does not abort a batch.

    Parameters:
    - gene_id: key used for both lookups and forwarded to ``generate_heatmap``

    Returns:
    - None
    """
    try:
        # Tuple elements are evaluated left to right: label first, then expression.
        label, expression = gene_label_map[gene_id], gene_expr_map[gene_id]
        generate_heatmap(gene_id, label, expression)
    except Exception as err:
        print(f"[!] Error with {gene_id}: {err}")

# Queue one generate_heatmap_wrapper call per gene ID. tqdm wraps the ID
# iterable, so the progress bar advances as IDs are pulled from it.
heatmap_tasks = (
    delayed(generate_heatmap_wrapper)(gene_id)
    for gene_id in tqdm(valid_gene_ids, desc="Generating heatmaps")
)
# Run the queued calls on 16 workers using the threading backend.
Parallel(n_jobs=16, backend="threading")(heatmap_tasks)
 

In [None]:
import plotly.express as px
import pandas as pd
import os

def plot_gene_umap_expression(adata, gene_id, gene_name, output_dir, gene_expr_map=None):
    """
    Plot UMAP expression of a single gene with tissue and cell type in hover.

    The figure is written to ``<output_dir>/umap/<gene_name>.html``; the
    ``umap`` sub-folder is created if it does not exist. Any exception is
    caught and reported with ``print`` so a batch run can move on to the
    next gene.

    Parameters:
    - adata: AnnData object; this function reads ``obsm["X_umap"]``,
      ``obs["tissue"]``, ``obs["cell_type"]`` and ``var_names`` from it
    - gene_id: Ensembl gene ID
    - gene_name: gene symbol (or fallback name); used in the title and file name
    - output_dir: where to save HTML (with or without a trailing separator)
    - gene_expr_map: optional dict of {gene_id: precomputed expression array}

    Returns:
    - None. When ``gene_id`` has no precomputed expression and is not in
      ``adata.var_names`` the gene is skipped with a printed message.
    """
    try:
        # Build the output location once. os.path.join avoids the doubled
        # separator that f"{output_dir}/umap" produced when output_dir
        # already ended with one.
        umap_dir = os.path.join(output_dir, "umap")
        html_path = os.path.join(umap_dir, f"{gene_name}.html")

        # Ensure output folder exists
        os.makedirs(umap_dir, exist_ok=True)

        # UMAP coordinates
        df = pd.DataFrame(adata.obsm["X_umap"], columns=["UMAP1", "UMAP2"])

        # Expression: use precomputed if available
        if gene_expr_map and gene_id in gene_expr_map:
            expr = gene_expr_map[gene_id]
        else:
            # Check existence
            if gene_id not in adata.var_names:
                print(f"[!] Skipping {gene_id}: not found in adata.var_names.")
                return
            expr_data = adata[:, gene_id].X
            # Objects exposing .toarray() are converted through it first;
            # anything else is flattened directly.
            expr = expr_data.toarray().flatten() if hasattr(expr_data, "toarray") else expr_data.flatten()

        # Add expression and metadata
        df["expression"] = expr
        df["tissue"] = adata.obs["tissue"].astype(str).values
        df["cell_type"] = adata.obs["cell_type"].astype(str).values

        # Plot
        fig = px.scatter(
            df,
            x="UMAP1", y="UMAP2",
            color="expression",
            color_continuous_scale="viridis",
            title=f"{gene_name} Expression on UMAP",
            width=600, height=600,
            render_mode="webgl",
            custom_data=["tissue", "cell_type"]  # consumed by the hovertemplate below
        )

        fig.update_traces(
            marker=dict(size=3),
            hovertemplate="<br>".join([
                "Tissue: %{customdata[0]}",
                "Cell type: %{customdata[1]}",
                "Expression: %{marker.color:.2f}"
            ])
        )

        fig.update_layout(
            plot_bgcolor='white',
            paper_bgcolor='white',
            # A numeric colour column is rendered as a colorbar, not a legend,
            # so the short "Expr" title has to be set on the colour axis.
            coloraxis_colorbar=dict(title=dict(text="Expr")),
            # Kept from the original; only takes effect if a legend is drawn.
            legend_title="Expr",
            xaxis=dict(showgrid=False, zeroline=False, showticklabels=False, title=None),
            yaxis=dict(showgrid=False, zeroline=False, showticklabels=False, title=None)
        )

        # Save HTML
        fig.write_html(html_path, include_plotlyjs="cdn")
        print(f"[✓] UMAP saved: {html_path}")

    except Exception as e:
        # Best-effort: report and continue rather than abort a batch of genes.
        print(f"[✗] Failed for {gene_name} ({gene_id}): {e}")


In [None]:
# Preview cell: plot the first gene of gene_pair_input on the UMAP, display it
# inline, and save the figure as HTML under "<output_dir>umap/".
gene = gene_pair_input["ensembl id"][0]

# Build dataframe
df = pd.DataFrame(adata.obsm["X_umap"], columns=["UMAP1", "UMAP2"])
# Slice the AnnData object once and reuse it; previously the slice was taken
# twice (once for the hasattr check, once for the conversion).
preview_gene_X = adata[:, gene].X
df["expression"] = preview_gene_X.toarray().flatten() if hasattr(preview_gene_X, "toarray") else preview_gene_X.flatten()
df["tissue"] = adata.obs["tissue"].astype(str).values  # plain array, so assignment is positional
df["cell_type"] = adata.obs["cell_type"].astype(str).values  # plain array, so assignment is positional
# Ensure `custom_data` is passed correctly as a list of columns
fig = px.scatter(
    df,
    x="UMAP1", y="UMAP2",
    color="expression",
    color_continuous_scale="viridis",
    title=f"{gene_name[0]} Expression on UMAP",
    width=600, height=600,
    render_mode="webgl",  # <-- faster for many points
    custom_data=["tissue", "cell_type"]  # required for hovertemplate
)

# Update hover to show tissue properly
fig.update_traces(
    marker=dict(size=3),
    hovertemplate="<br>".join([
        "Tissue: %{customdata[0]}",
        "Cell type: %{customdata[1]}",
        "Expression: %{marker.color:.2f}"
    ])
)
fig.update_layout(
    plot_bgcolor='white',
    paper_bgcolor='white',
    # A numeric colour column is rendered as a colorbar, not a legend, so the
    # short "Expr" title has to be set on the colour axis to be visible.
    coloraxis_colorbar=dict(title=dict(text="Expr")),
    # Kept from the original; only takes effect if a legend is drawn.
    legend_title="Expr",
    xaxis=dict(
        showgrid=False,
        zeroline=False,
        showticklabels=False,
        title=None
    ),
    yaxis=dict(
        showgrid=False,
        zeroline=False,
        showticklabels=False,
        title=None
    )
)


fig.show()
# Create the target folder first so write_html does not fail on a fresh
# output directory.
preview_html_path = f"{output_dir}umap/{gene_name[0]}.html"
os.makedirs(os.path.dirname(preview_html_path), exist_ok=True)
fig.write_html(preview_html_path, include_plotlyjs="cdn")

In [None]:
# Static scanpy UMAP of `adata`, coloured by the `tissue_in_publication` field.
sc.pl.umap(
    adata,
    title='UMAP by Tissue',
    color='tissue_in_publication',
    frameon=False,
    size=0.1,  # small points; reduce further for large datasets
    legend_loc='right margin',  # legend placed in the right margin
)


In [None]:
# Static scanpy UMAP of `adata`, coloured by the `cell_type` field.
sc.pl.umap(
    adata,
    title='UMAP by Cell Type',
    color='cell_type',
    frameon=False,
    size=0.1,  # small points; reduce further for large datasets
    legend_loc='right margin',  # legend placed in the right margin
)
