# Python Notebook

In [1]:
## Function to prepare datatables (cleaning and hyperlinking, adding tool tips, etc) input for the database qmds
import sys, os
import pandas as pd
# Change working directory to ConnectomeDB
project_root = "/Users/sakuramaezono/Library/CloudStorage/OneDrive-YokohamaCityUniversity/Personal/05_Python_repositories/ConnectomeDB"
os.chdir(project_root)
sys.path.append(os.path.abspath("src"))

In [2]:
## Function to prepare datatables (cleaning and hyperlinking, adding tool tips, etc) input for the database qmds
import sys, os
from itables import init_notebook_mode
import pandas as pd
from itables import show
from itables import options
from IPython.display import HTML, display
import numpy as np
from bs4 import BeautifulSoup
from createDataTable import gene_pair, gene_pair000, human_columns, lrPairsCount
import warnings

# Suppress SettingWithCopyWarning
warnings.simplefilter("ignore", category=UserWarning)


# Other vertebrates
species_list = [
    "ptroglodytes", "ggallus", "sscrofa", "btaurus", 
    "clfamiliaris", "ecaballus", "oarambouillet"
]

def process_species(gene_pair_df, gene_pair000_df, species, id_prefix, ligand_index, receptor_index):
    """
    Processes ligand-receptor interactions for a given species.

    Parameters:
        gene_pair_df (pd.DataFrame): The main gene pair DataFrame.
        gene_pair000_df (pd.DataFrame): The filtered gene pair DataFrame.
        species (str): The species name (e.g., "Mouse", "Rat", "Zebrafish").
        id_prefix (str): The identifier prefix for the species (e.g., "MGI" for Mouse, "RGD" for Rat).
        ligand_index (int): Index for selecting the ligand column.
        receptor_index (int): Index for selecting the receptor column.

    Returns:
        pd.DataFrame: Processed DataFrame for the species.
    """
    # Identify relevant columns for the species
    if species in ["Mouse", "Rat", "Zebrafish"]:
        species_columns = [col for col in gene_pair_df.columns if id_prefix in col or species in col]
    else:
        species_columns = [col for col in gene_pair_df.columns if species in col]

    # Filter rows where all species-specific columns are not empty
    species_gene_pair = gene_pair000_df[(gene_pair000_df[species_columns].map(str.strip) != "").all(axis=1)]

    # Rename columns to remove species name
    species_gene_pair.columns = [
        col.replace(f"{species} ", "").strip() if "Ligand" in col or "Receptor" in col else col
        for col in species_gene_pair.columns
    ]

    # Identify ligand and receptor columns dynamically
    ligand_col = [col for col in species_gene_pair.columns if "Ligand&nbsp;" in col][ligand_index]
    receptor_col = [col for col in species_gene_pair.columns if "Receptor&nbsp;" in col][receptor_index]
    ligand_location = [col for col in species_gene_pair.columns if "Ligand location" in col][0]
    receptor_location = [col for col in species_gene_pair.columns if "Receptor location" in col][0]

    if id_prefix == "Ensembl":
        # Extract species names dynamically
        all_species = {"Chimpanzee", "Pig", "Dog", "Cow", 
                       "Chicken", "Horse", "Sheep"}
        # Function to clean column names by removing HTML tags
        def clean_column_name(col):
            # Remove HTML tags using BeautifulSoup
            cleaned_name = BeautifulSoup(col, "html.parser").get_text()
            return cleaned_name.strip()
        
        # Step 1: Identify columns that contain any species from all_species
        cleaned_columns = [clean_column_name(col) for col in species_gene_pair.columns]
        
        # Identify columns where the species from all_species is in the cleaned name
        columns_to_remove = [
            species_gene_pair.columns[i] for i, cleaned_name in enumerate(cleaned_columns)
            if any(species in cleaned_name for species in all_species)
        ]
        
        # Step 2: Remove those columns from the DataFrame
        species_gene_pair = species_gene_pair.drop(columns=columns_to_remove)
        
        
            # Function to format ligand-receptor pairs
    def format_lr_pair(row):
        if row[ligand_location] in ['secreted', '']:
            return f"{row[ligand_col]} <span style='font-size: 15px;'>○</span> <span style='font-size: 24px;'>⤚</span> {row[receptor_col]}"
        elif row[receptor_location] == 'plasma membrane':
            return f"{row[ligand_col]} <span style='font-size: 24px;'>⤙</span> <span style='font-size: 24px;'>⤚</span> {row[receptor_col]}"
        else:
            return f"{row[ligand_col]} \u2192 {row[receptor_col]}"

    # Apply the formatting function
    species_gene_pair1 = species_gene_pair.copy()
    species_gene_pair1.loc[:, f"{species} LR Pair"] = species_gene_pair1.apply(format_lr_pair, axis=1)

    # Identify relevant columns for the species
    species_columns = [col for col in species_gene_pair1.columns if id_prefix in col]
    print(species_columns)
    new_order = [human_columns[0]]+ [f"{species} LR Pair", ligand_col, receptor_col] + species_columns + human_columns[1:]
    
    print(species_gene_pair1.columns)
    species_gene_pair1 = species_gene_pair1[new_order].reset_index(drop=True)

    return species_gene_pair1


# Process each species
# Mouse
mouse_gene_pair1 = process_species(gene_pair, gene_pair000, "Mouse", "MGI", 1, 1)

  pop_up_info = pd.read_table("data/HGNC_gene_info_full.tsv")


['<span title="Genome Informatics (MGI) ID. Click on the link for more details">Ligand MGI ID</span>', '<span title="Genome Informatics (MGI) ID. Click on the link for more details">Receptor MGI ID</span>']
Index(['<span title="Double-click header of Interaction ID to ensure all values are shown">Interaction ID&nbsp;</span>',
       '<span title=" Ligand-Receptor Interacting Pair, as described in Liu et al. (PMID: XXXXXX)">Human LR Pair</span>',
       '<span title="Double-click header of Database Source to ensure all values are shown">Database Source&nbsp;</span>',
       '<span title=" Official Gene Symbol; Hover on symbols below to show gene names"">Human Ligand&nbsp;&nbsp;&nbsp;</span</span>',
       '<span title=" Official Gene Symbol; Hover on symbols below to show gene names"">Human Receptor&nbsp;&nbsp;&nbsp;</span</span>',
       '<span title="Click the logo below to run Perplexity on the Human LR pair">Perplexity&nbsp;</span>',
       '<span title=" PubMed IDs (PMID) with Lite

In [3]:
mouse_gene_pair1

Unnamed: 0,"<span title=""Double-click header of Interaction ID to ensure all values are shown"">Interaction ID&nbsp;</span>",Mouse LR Pair,"<span title=""Double-click header of Ligand to ensure all values are shown"">Ligand&nbsp;</span>","<span title=""Double-click header of Receptor to ensure all values are shown"">Receptor&nbsp;</span>","<span title=""Genome Informatics (MGI) ID. Click on the link for more details"">Ligand MGI ID</span>","<span title=""Genome Informatics (MGI) ID. Click on the link for more details"">Receptor MGI ID</span>","<span title="" Ligand-Receptor Interacting Pair, as described in Liu et al. (PMID: XXXXXX)"">Human LR Pair</span>","<span title=""Double-click header of Database Source to ensure all values are shown"">Database Source&nbsp;</span>","<span title="" Official Gene Symbol; Hover on symbols below to show gene names"""">Human Ligand&nbsp;&nbsp;&nbsp;</span</span>","<span title="" Official Gene Symbol; Hover on symbols below to show gene names"""">Human Receptor&nbsp;&nbsp;&nbsp;</span</span>",...,"<span title=""Double-click header of interaction type to ensure all values are shown"">interaction type&nbsp;</span>","<span title=""HUGO Gene Nomenclature Committee (HGNC) ID. Click on the link for more details"">Ligand HGNC ID&nbsp;&nbsp;</span>","<span title=""HUGO Gene Nomenclature Committee (HGNC) ID. Click on the link for more details"">Receptor HGNC ID&nbsp;&nbsp;</span>","<span title=""Location based on the predicted subcellular localization of the human proteome, as described in Ramilowski et al. (PMID: 26198319)"">Ligand location</span>","<span title=""Location based on the predicted subcellular localization of the human proteome, as described in Ramilowski et al. (PMID: 26198319)"">Receptor location</span>","<span title=""Double-click header of Top Pathway to ensure all values are shown"">Top Pathway&nbsp;</span>","<span title=""Double-click header of Cancer-related to ensure all values are shown"">Cancer-related&nbsp;</span>","<span title=""Double-click header of Disease Type to ensure all values are shown"">Disease Type&nbsp;</span>","<span title=""Double-click header of Ligand symbols to ensure all values are shown"">Ligand symbols&nbsp;</span>","<span title=""Double-click header of Receptor symbols to ensure all values are shown"">Receptor symbols&nbsp;</span>"
0,CDB00001,A2m → Hspa5,A2m,Hspa5,"<a href=""https://www.informatics.jax.org/marke...","<a href=""https://www.informatics.jax.org/marke...","<a href=""https://comp.med.yokohama-cu.ac.jp/co...","connectomeDB2020 x GPT,connectomeDB2025 🆕","<span title=""alpha-2-macroglobulin"">A2M</span>","<span title=""heat shock protein family A (Hsp7...",...,non-covalent binding,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",multiple,multiple,Estrogen,Yes,Cancer,A2M;FWP007;S863-7;CPAMD5,HSPA5;GRP78;BiP
1,CDB00002,A2m → Lrp1,A2m,Lrp1,"<a href=""https://www.informatics.jax.org/marke...","<a href=""https://www.informatics.jax.org/marke...","<a href=""https://comp.med.yokohama-cu.ac.jp/co...",connectomeDB2015,"<span title=""alpha-2-macroglobulin"">A2M</span>","<span title=""LDL receptor related protein 1"">L...",...,non-covalent binding,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",multiple,multiple,Hypoxia,unknown,unknown,A2M;FWP007;S863-7;CPAMD5,LRP1;APR;A2MR;LRP;CD91;LRP1A;APOER;IGFBP3R1;IG...
2,CDB00003,Ace <span style='font-size: 24px;'>⤙</span> <s...,Ace,Bdkrb2,"<a href=""https://www.informatics.jax.org/marke...","<a href=""https://www.informatics.jax.org/marke...","<a href=""https://comp.med.yokohama-cu.ac.jp/co...",connectomeDB2015,"<span title=""angiotensin I converting enzyme"">...","<span title=""bradykinin receptor B2"">BDKRB2</s...",...,cleavage,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",multiple,plasma membrane,PI3K,No,"Cardiovascular, Psychiatric",ACE;DCP1;ACE1;CD143,BDKRB2;BK-2
3,CDB00004,Ada → Dpp4,Ada,Dpp4,"<a href=""https://www.informatics.jax.org/marke...","<a href=""https://www.informatics.jax.org/marke...","<a href=""https://comp.med.yokohama-cu.ac.jp/co...",Cell TalkDB_Hs,"<span title=""adenosine deaminase"">ADA</span>","<span title=""dipeptidyl peptidase 4"">DPP4</span>",...,non-covalent binding,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",cytoplasm,multiple,VEGF,unknown,unknown,ADA;ADA1,DPP4;CD26;ADCP2;DPPIV
4,CDB00005,Adam10 → Epha3,Adam10,Epha3,"<a href=""https://www.informatics.jax.org/marke...","<a href=""https://www.informatics.jax.org/marke...","<a href=""https://comp.med.yokohama-cu.ac.jp/co...",connectomeDB2025 🆕,"<span title=""ADAM metallopeptidase domain 10"">...","<span title=""EPH receptor A3"">EPHA3</span>",...,cleavage,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",multiple,multiple,VEGF,unknown,unknown,ADAM10;kuz;MADM;HsT18717;CD156C,EPHA3;ETK;ETK1;TYRO4;HEK;HEK4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3280,CDB03360,Sbp <span style='font-size: 15px;'>○</span> <s...,Sbp,Tlr2,"<a href=""https://www.informatics.jax.org/marke...","<a href=""https://www.informatics.jax.org/marke...","<a href=""https://comp.med.yokohama-cu.ac.jp/co...",connectomeDB2015,"<span title=""zymogen granule protein 16B"">ZG16...","<span title=""toll like receptor 2"">TLR2</span>",...,non-covalent binding,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",secreted,plasma membrane,Trail,unknown,unknown,ZG16B;HRPE773;PRO1567;JCLN2,TLR2;TIL4;CD282
3281,CDB03361,Sbp <span style='font-size: 15px;'>○</span> <s...,Sbp,Tlr4,"<a href=""https://www.informatics.jax.org/marke...","<a href=""https://www.informatics.jax.org/marke...","<a href=""https://comp.med.yokohama-cu.ac.jp/co...",connectomeDB2015,"<span title=""zymogen granule protein 16B"">ZG16...","<span title=""toll like receptor 4"">TLR4</span>",...,non-covalent binding,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",secreted,multiple,unknown,unknown,unknown,ZG16B;HRPE773;PRO1567;JCLN2,TLR4;hToll;CD284;TLR-4;ARMD10
3282,CDB03362,Sbp <span style='font-size: 15px;'>○</span> <s...,Sbp,Tlr5,"<a href=""https://www.informatics.jax.org/marke...","<a href=""https://www.informatics.jax.org/marke...","<a href=""https://comp.med.yokohama-cu.ac.jp/co...",connectomeDB2015,"<span title=""zymogen granule protein 16B"">ZG16...","<span title=""toll like receptor 5"">TLR5</span>",...,non-covalent binding,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",secreted,plasma membrane,p53,unknown,unknown,ZG16B;HRPE773;PRO1567;JCLN2,TLR5;SLEB1;TIL3;FLJ10052;MGC126430;MGC126431
3283,CDB03363,Sbp <span style='font-size: 15px;'>○</span> <s...,Sbp,Tlr6,"<a href=""https://www.informatics.jax.org/marke...","<a href=""https://www.informatics.jax.org/marke...","<a href=""https://comp.med.yokohama-cu.ac.jp/co...",connectomeDB2015,"<span title=""zymogen granule protein 16B"">ZG16...","<span title=""toll like receptor 6"">TLR6</span>",...,non-covalent binding,"<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",secreted,plasma membrane,unknown,unknown,unknown,ZG16B;HRPE773;PRO1567;JCLN2,TLR6;CD286


In [None]:
# Try getting the query parameter from Quarto (web context), fallback to local test value
try:
    query_string = quarto_context['request'].query_string
    params = {k: v[0] for k, v in urllib.parse.parse_qs(query_string).items()}
    filter_id = params.get('id', '')
except Exception:
    filter_id = 'CDB02991'  # fallback or use 'CDB02991' for local testing

if filter_id:
    human_gene_pairTrip = human_gene_pairTrip.loc[human_gene_pairTrip[human_gene_pairTrip.columns[0]].str.startswith(filter_id)].copy()
human_gene_pairTrip

In [None]:
human_gene_pairTrip.columns.values[1]

In [None]:
pmids_without_period = pmids = df[~df['Title'].str.endswith(('.', '?'), na=False)]['PMID']
pmid_check= pmids_without_period.tolist()
print("These " + str(len(pmid_check)) + " titles that have to be manually checked -- possible incomplete titles")
print(pmid_check)

In [None]:
human_gene_pairTrip.rename(columns={human_gene_pairTrip.columns[0]: "Interaction ID"}, inplace=True)
human_gene_pairTrip

In [None]:
import re
from fuzzywuzzy import process, fuzz
import csv

# Output CSV file
output_file = "data/journal_abbv.csv"

pubmed_df = pd.read_csv("data/pubmed_results.csv")
journal_names = pubmed_df["Journal"].unique().tolist()

manual_abbr_dict = {
    "The Journal of biological chemistry": "J. Biol. Chem.",
    "Journal of immunology": "J. Immunol.",
    "Acta physiologica": "Acta Physiol. (Oxf.)",
    "The Biochemical journal": "Biochem. J.",
    "Hepatology": "Hepatology",
    "Chemical reviews": "Chem. Rev.",
    "Molecular endocrinology": "Mol. Endocrinol.",
    "Journal of molecular biology": "J. Mol. Biol.",
    "The Journal of experimental medicine": "J. Exp. Med.",
    "Growth factors": "Growth Factors",
    "Development": "Development",
    "Structure": "Structure",
    "Journal of neurochemistry": "J. Neurochem.",
    "Cancer research": "Cancer Res.",
    "Advanced science": "Adv. Sci.",
    "Arthritis & rheumatology": "Arthritis Rheumatol.",
    "Clinical immunology": "Clin. Immunol.",
    "Hypertension": "Hypertension",
    "Neoplasia": "Neoplasia",
    "The Journal of general physiology": "J. Gen. Physiol.",
    "Acta biochimica Polonica": "Acta Biochim. Pol.",
    "Cell cycle": "Cell Cycle",
    "Human reproduction": "Hum. Reprod.",
    "American journal of reproductive immunology": "Am. J. Reprod. Immunol.",
    "Methods": "Methods",
    "Lung cancer": "Lung Cancer",
    "Gut": "Gut",
    "Archives of surgery": "Arch. Surg.",
    "Lancet": "Lancet"
}


medline_file= "data/J_Medline.txt"
def load_journal_info(filename):
    journals = []
    with open(filename, 'r', encoding='utf-8') as file:
        content = file.read()
        # Split the content by the separator that divides journal entries
        entries = content.split('--------------------------------------------------------')
        
        for entry in entries:
            title_match = re.search(r"JournalTitle:\s*(.*?)\n", entry)
            abbr_match = re.search(r"MedAbbr:\s*(.*?)\n", entry)
            
            if title_match and abbr_match:
                title = title_match.group(1).strip()
                abbr = abbr_match.group(1).strip()
                journals.append((title, abbr))
    
    return journals


def get_abbreviations(journal_names, journal_dict, score_threshold=98):
    journal_keys = [title for title, _ in journal_dict]
    results = []

    for name in journal_names:
        cleaned_name = name.strip().lower()

        # 1. Check for exact match
        abbr = None
        for title, abbreviation in journal_dict:
            if cleaned_name == title.strip().lower():
                abbr = abbreviation
                results.append((name, abbr, "Exact"))
                break

        # 2. Fuzzy match
        if not abbr:
            matches = process.extract(cleaned_name, journal_keys, scorer=fuzz.ratio, limit=5)
            best_match = matches[0] if matches else None
            if best_match:
                matched_name, score = best_match
                if score >= score_threshold:
                    abbr = journal_dict[journal_keys.index(matched_name)][1]
                    results.append((name, abbr, f"Fuzzy (score: {score})"))

        # 3. Manual backup
        if not abbr:
            manual_match = manual_abbr_dict.get(name.strip())
            if manual_match:
                results.append((name, manual_match, "Manual Backup"))
            else:
                results.append((name, "Not Found", "No Match"))

    return results

def save_to_csv(results, output_file):
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['Journal Name', 'Abbreviation', 'Match Type']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        writer.writeheader()
        for result in results:
            writer.writerow({'Journal Name': result[0], 'Abbreviation': result[1], 'Match Type': result[2]})
# Load journal information from the file
journal_list = load_journal_info(medline_file)

#Get the matches
results = get_abbreviations(journal_names, journal_list)

# Save results to CSV
save_to_csv(results, output_file)


In [None]:
results

In [None]:
# Load journal information from the file
journal_list = load_journal_info(medline_file)

#Get the matches
results = get_abbreviations(journal_names, journal_list)

# Save results to CSV
save_to_csv(results, output_file)


In [None]:
df =mouse_gene_pair1[mouse_gene_pair1["Mouse LR Pair"].duplicated()]

In [None]:
df.to_csv("data/mouseOrth.csv")

In [None]:
## Function to prepare datatables (cleaning and hyperlinking, adding tool tips, etc) input for the database qmds
import sys, os
from itables import init_notebook_mode
import pandas as pd
from itables import show
from itables import options
from IPython.display import HTML, display
import numpy as np
import fetchGSheet 
import warnings

# Suppress SettingWithCopyWarning
warnings.simplefilter("ignore", category=UserWarning)


# Other vertebrates
species_list = [
    "ptroglodytes", "ggallus", "sscrofa", "btaurus", 
    "clfamiliaris", "ecaballus", "oarambouillet"
]

# Select only the relevant columns from pop_up_info
cols_to_keep = cols_to_keep = list(range(0, 30)) 
# Step 3: Load file using only the desired columns
df = pd.read_table("data/HGNC_gene_info_full.tsv", usecols=cols_to_keep)
pop_up_info = pd.read_table("data/HGNC_gene_info_full.tsv")
pop_up_info = pop_up_info.rename(columns={"hgnc_id": "HGNC ID", 
                                          "name": "Approved name",
                                          "symbol": "Approved symbol",
                                          "rgd_id": "RGD ID",
                                          "mgd_id": "MGI ID", 
                                          "rgd_id": "RGD ID",
                                          "alias_symbol": "Alias symbol", # add to table
                                          "prev_symbol": "Previous symbol", # add to table
                                          "date_symbol_changed": "Date symbol changed"
                                          
                                         })

# Keep only first MGI/RGD ID
pop_up_info["MGI ID"] = pop_up_info["MGI ID"].str.split("|").str[0]
pop_up_info["RGD ID"] = pop_up_info["RGD ID"].str.split("|").str[0]

pop_up_info["Alias symbol"] = pop_up_info["Alias symbol"].apply(
    lambda x: "N/A" if pd.isna(x) or str(x).strip().lower() in ["nan", "none", ""] else x
)

pop_up_info["Previous symbol"] = pop_up_info["Previous symbol"].apply(
    lambda x: "N/A" if pd.isna(x) or str(x).strip().lower() in ["nan", "none", ""] else x
)

pop_up_info["Date symbol changed"] = pop_up_info["Date symbol changed"].apply(
    lambda x: "N/A" if pd.isna(x) or str(x).strip().lower() in ["nan", "none", ""] else x
)


pop_up_info_lim = pop_up_info[["HGNC ID", "Approved name", "MGI ID", "RGD ID", "Alias symbol",
                               "Approved symbol", "Previous symbol"]] # rm "Approved symbol" for now
pop_up_info_lim = pop_up_info_lim.drop_duplicates(subset="HGNC ID", keep="first")

# Drop columns where all values are NA in gene_pair
gene_pair = fetchGSheet.gene_pair.dropna(axis=1, how='all')

# for now, keep only the following columns
gene_pair = gene_pair[['LR pair', 'Ligand', 'Ligand.HGNC', 'Receptor', 'Receptor.HGNC',
                       'perplexity link', 'PMID', 'binding location', 
                       'bind in trans?', 'bidirectional signalling?',
                       'interaction type', 'original source']]
# some PMIDs kick in with "," so replace
gene_pair["PMID"] = [value.replace(",", "") for value in gene_pair["PMID"]]

# Mapping for replacements
mapping = dict(zip(fetchGSheet.src_info['original source'], fetchGSheet.src_info['shortname']))
# Replace values in the column based on the mapping
gene_pair['original source'] = gene_pair['original source'].replace(mapping)

## add Ligand/Receptor Location
mapping_loc = dict(zip(fetchGSheet.loc_info['ApprovedSymbol'], fetchGSheet.loc_info['Localization']))
gene_pair['Ligand location'] = gene_pair['Ligand'].replace(mapping_loc)
gene_pair['Receptor location'] = gene_pair['Receptor'].replace(mapping_loc)
# Set missing mappings to 'unknown'
gene_pair.loc[gene_pair['Ligand location'] == gene_pair['Ligand'], 'Ligand location'] = 'unknown'
gene_pair.loc[gene_pair['Receptor location'] == gene_pair['Receptor'], 'Receptor location'] = 'unknown'
# Set "n/a" to unknown
gene_pair['Ligand location'] = [value.replace("n/a", "unknown") for value in gene_pair['Ligand location']]
gene_pair['Receptor location'] = [value.replace("n/a", "unknown") for value in gene_pair['Receptor location']]

# Fetch species IDs from the dataset
hgnc_id = [col for col in gene_pair.columns if "HGNC" in col]
hgnc_id = pd.concat([gene_pair[col] for col in hgnc_id]).unique()

# Rename columns for better clarity
gene_pair = gene_pair.rename(columns={
    "LR pair": "Human LR Pair",
    "Ligand.HGNC": "Ligand HGNC ID",
    "Receptor.HGNC": "Receptor HGNC ID",
    "perplexity link": "Perplexity", # will be replaced with actual link later
    "original source": "Database Source",
    "PMID": "PMID support"
})


# Merge gene_pair with pop_up_info_lim for Ligand(L)
gene_pair = gene_pair.merge(pop_up_info_lim, how='left', left_on='Ligand HGNC ID', right_on='HGNC ID')

gene_pair = gene_pair.rename(columns={"Approved name": "Ligand name", 
                                     "MGI ID": "Ligand MGI ID",
                                     "RGD ID": "Ligand RGD ID",
                                      "Alias symbol": "Ligand Aliases",
                                      "Previous symbol": "Ligand Old symbol",
                                     },
                            )
gene_pair = gene_pair.drop(columns=["HGNC ID", "Approved symbol"])
# Add top pathway per pair
LR_pairs = gene_pair["Human LR Pair"].unique()
df= pd.read_csv("data/pathway_annotations_per_pair.csv")
#df = df[df["interaction"].isin(LR_pairs)]
# Sort by absolute value of 'weight', descending (larger abs(weight) first)
df_sorted = df.reindex(df['weight'].abs().sort_values(ascending=False).index)
# Keep only the first occurrence for each unique 'interaction'
df_unique = df_sorted.drop_duplicates(subset='interaction', keep='first')
df = df_unique.reset_index(drop=True)
top_pathway_df = df[["interaction", "source"]]
top_pathway_df = top_pathway_df.rename(columns={
                                      "source": "Top Pathway"
})
top_pathway_df["interaction"] = [value.replace("^", " ") for value in top_pathway_df["interaction"]]
gene_pair = gene_pair.merge(top_pathway_df, how='left', left_on='Human LR Pair', right_on='interaction')
gene_pair = gene_pair.drop(columns=["interaction"])
# Add Disease Category per pair
df= pd.read_csv("data/disease_annotations_per_pair.csv")
df_cat=pd.read_csv("data/disease_categories.csv")
mapping = dict(zip(df_cat['Disease Name'], df_cat['Category']))
# Replace values in the column based on the mapping
df["Disease Type"] = df['disease'].replace(mapping)
df = df[["interaction", "Disease Type"]].drop_duplicates()
df['Disease Type'] = df['Disease Type'].astype(str)
df = df.sort_values(by='Disease Type', ascending=True)

In [None]:
df = df.groupby('interaction')['Disease Type'].apply(', '.join).reset_index()
# Create "Cancer-related" column based on whether "Cancers & Neoplasms" is in col2
df['Cancer-related'] = df['Disease Type'].apply(lambda x: 'Yes' if 'Cancer' in x else 'No')
disease_df = df[df["interaction"].isin(LR_pairs)]
disease_df

In [None]:
## Function to create Ligand-Receptor pair cards

import os
import jinja2
import sys
import pandas as pd
import numpy as np
import time
import base64
import re


sys.path.append(os.path.abspath("src"))  
import fetchGSheet
from createDataTable import pop_up_info, gene_pair0
gene_pair0

In [None]:

def load_template(template_path):
    """Load Jinja2 template from a file."""
    with open(template_path, 'r') as file:
        return jinja2.Template(file.read())

def encode_image(image_path):
    """Encode an image to base64."""
    try:
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')
    except FileNotFoundError:
        return None

# Function to extract the HGNC ID from the anchor tag URL
def extract_hgnc_id(col):
    # Use regular expression to extract the HGNC ID after "HGNC:"
    match = re.search(r'HGNC:(\d+)', col)
    if match:
        return match.group(1)  # Return the HGNC ID (number part)
    return None  # Return None if the format doesn't match or it's not a string

# Updated functions to convert the HGNC link (using extract_hgnc_id)
def convert_hgnc_url(col):
    hgnc_id = extract_hgnc_id(col)  # Extract the HGNC ID
    if hgnc_id:
        visible_text = "genecard.org"
        new_link = f'<a href="https://www.genecards.org/cgi-bin/carddisp.pl?id_type=hgnc&id={hgnc_id}" target="_blank">{visible_text}</a>'
        return new_link
    return None

def convert_hgnc_url_disease(col):
    hgnc_id = extract_hgnc_id(col)  # Extract the HGNC ID
    if hgnc_id:
        visible_text = "see here"
        new_link = f'<a href="https://www.genecards.org/cgi-bin/carddisp.pl?id_type=hgnc&id={hgnc_id}#diseases" target="_blank">{visible_text}</a>'
        return new_link
    return None

def convert_hgnc_url_exp(col):
    hgnc_id = extract_hgnc_id(col)  # Extract the HGNC ID
    if hgnc_id:
        visible_text = "see here"
        new_link = f'<a href="https://www.genecards.org/cgi-bin/carddisp.pl?id_type=hgnc&id={hgnc_id}#expression" target="_blank">{visible_text}</a>'
        return new_link
    return None

def prepare_dataframes(gene_pair0):
    """Prepare interaction, ligand, and receptor dataframes."""
    # DBlength = len(gene_pair0)
    # gene_pair0["Interaction ID"] = [f"CDB{str(i).zfill(4)}" for i in range(1, DBlength + 1)]
    gene_pair0["Interaction Type"] = [
        f'{ligand} {ligandLocation} ligand binds to {receptor} {receptorLocation} receptor'
        for ligand, ligandLocation, receptor, receptorLocation in zip(
            gene_pair0["Ligand"], gene_pair0["Ligand location"],
            gene_pair0["Receptor"], gene_pair0["Receptor location"]
        )
    ]
    interaction_card = gene_pair0[["Interaction ID", "Human LR Pair", "Interaction Type", "Perplexity", "PMID support", "Top Pathway", "Cancer-related", "Disease Type"]]
    interaction_card["Perplexity"] = interaction_card["Perplexity"].str.replace('size=30', 'size=80')

    pop_up_info_lim = pop_up_info[
        ["Approved symbol", "Alias symbol", "Previous symbol", "Date symbol changed"]
    ].drop_duplicates(subset="Approved symbol", keep="first")
    
    ligand_card = gene_pair0[["Human LR Pair", "Ligand", "Ligand name", "Ligand HGNC ID", "Ligand MGI ID", "Ligand RGD ID", "Ligand location"]].merge(
        pop_up_info_lim, how='left', left_on='Ligand', right_on='Approved symbol'
    ).drop_duplicates(subset='Human LR Pair', keep="first").drop(columns=["Ligand", "Approved symbol"])

    ligand_card_1 = ligand_card[["Human LR Pair", "Alias symbol", "Date symbol changed", "Ligand name"]] 
    ligand_card_2 = ligand_card[["Human LR Pair", "Ligand HGNC ID", "Ligand location"]] 
    # Convert links
    ligand_card_2["HGNC gene card"] = ligand_card_2["Ligand HGNC ID"].apply(convert_hgnc_url)
    ligand_card_2["Disease relevance"] = ligand_card_2["Ligand HGNC ID"].apply(convert_hgnc_url_disease)
    ligand_card_2["Expression Profile"] = ligand_card_2["Ligand HGNC ID"].apply(convert_hgnc_url_exp)
    ligand_card_2 = ligand_card_2[["Human LR Pair", "Ligand HGNC ID", "HGNC gene card", "Disease relevance", "Expression Profile", "Ligand location"]]       

    receptor_card = gene_pair0[["Human LR Pair", "Receptor", "Receptor name", "Receptor HGNC ID", "Receptor MGI ID", "Receptor RGD ID", "Receptor location"]].merge(
        pop_up_info_lim, how='left', left_on='Receptor', right_on='Approved symbol'
    ).drop_duplicates(subset='Human LR Pair', keep="first").drop(columns=["Receptor", "Approved symbol"])
    
    receptor_card_1 = receptor_card[["Human LR Pair", "Alias symbol", "Date symbol changed", "Receptor name"]] 
    receptor_card_2 = receptor_card[["Human LR Pair", "Receptor HGNC ID", "Receptor location"]] 
    receptor_card_2["HGNC gene card"] = receptor_card_2["Receptor HGNC ID"].apply(convert_hgnc_url)
    receptor_card_2["Disease relevance"] = receptor_card_2["Receptor HGNC ID"].apply(convert_hgnc_url_disease)
    receptor_card_2["Expression Profile"] = receptor_card_2["Receptor HGNC ID"].apply(convert_hgnc_url_exp)
    receptor_card_2 = receptor_card_2[["Human LR Pair", "Receptor HGNC ID", "HGNC gene card", "Disease relevance", "Expression Profile", "Receptor location"]]       

    return interaction_card, ligand_card_1, ligand_card_2, receptor_card_1, receptor_card_2

def generate_html_files(template, interaction_card, ligand_card_1, receptor_card_1, ligand_card_2, receptor_card_2, output_dir):
    """Generate HTML files for each Human LR Pair."""
    column_values = interaction_card["Human LR Pair"].dropna().unique()
    os.makedirs(output_dir, exist_ok=True)

    # Encode the plotlegend image to base64
    plotlegend_image_path = "data/image/plotlegend.webp"
    plotlegend_base64 = encode_image(plotlegend_image_path)  # Convert WebP to base64

    for value in column_values:
        value1, value2 = value.split()
        row0 = interaction_card[interaction_card['Human LR Pair'] == value]
        row1 = ligand_card_1[ligand_card_1['Human LR Pair'] == value]
        row2 = receptor_card_1[receptor_card_1['Human LR Pair'] == value]
        row3 = ligand_card_2[ligand_card_2['Human LR Pair'] == value]
        row4 = receptor_card_2[receptor_card_2['Human LR Pair'] == value]

        # Check if the HTML files exist
        ligand_image_path = f'data/gene_expr_plots/{value1}.html'
        receptor_image_path = f'data/gene_expr_plots/{value2}.html'
        
        if os.path.exists(ligand_image_path):
            with open(ligand_image_path, "r") as html_file:
                ligand_image = html_file.read()  # Read the HTML content
        else:
            ligand_image = "Plot does not exist"
        
        if os.path.exists(receptor_image_path):
            with open(receptor_image_path, "r") as html_file:
                receptor_image = html_file.read()  # Read the HTML content
        else:
            receptor_image = "Plot does not exist"


        table0_data = row0.drop('Human LR Pair', axis=1).to_dict(orient='records')[0] if not row0.empty else {}
        table1_data = row1.drop('Human LR Pair', axis=1).to_dict(orient='records')[0] if not row1.empty else {}
        table2_data = row2.drop('Human LR Pair', axis=1).to_dict(orient='records')[0] if not row2.empty else {}
        table3_data = row3.drop('Human LR Pair', axis=1).to_dict(orient='records')[0] if not row3.empty else {}
        table4_data = row4.drop('Human LR Pair', axis=1).to_dict(orient='records')[0] if not row4.empty else {}

        rendered_content = template.render(
            value1=value1,
            value2=value2,
            table0_data=table0_data,
            table1_data=table1_data,
            table2_data=table2_data,
            table3_data=table3_data,
            table4_data=table4_data,
            ligand_image=ligand_image,
            receptor_image=receptor_image,
            plotlegend_base64=plotlegend_base64 
        )
        
        output_file = os.path.join(output_dir, f"{value1} {value2}.html")
        with open(output_file, 'w') as file:
            #time.sleep(0.5)
            file.write(rendered_content)


In [None]:
pop_up_info_lim

In [None]:
## Function to create Ligand-Receptor pair cards

import os
import jinja2
import sys
import pandas as pd
import numpy as np
import time
import base64
import re


sys.path.append(os.path.abspath("src"))  
import fetchGSheet
from createDataTable import pop_up_info, gene_pair0

# Paths
TEMPLATE_PATH = 'HTML/cardTemplate.html'
OUTPUT_DIR = 'data/cards/'

def load_template(template_path):
    """Load Jinja2 template from a file."""
    with open(template_path, 'r') as file:
        return jinja2.Template(file.read())

def encode_image(image_path):
    """Encode an image to base64."""
    try:
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')
    except FileNotFoundError:
        return None

# Function to extract the HGNC ID from the anchor tag URL
def extract_hgnc_id(col):
    # Use regular expression to extract the HGNC ID after "HGNC:"
    match = re.search(r'HGNC:(\d+)', col)
    if match:
        return match.group(1)  # Return the HGNC ID (number part)
    return None  # Return None if the format doesn't match or it's not a string

# Updated functions to convert the HGNC link (using extract_hgnc_id)
def convert_hgnc_url(col):
    hgnc_id = extract_hgnc_id(col)  # Extract the HGNC ID
    if hgnc_id:
        visible_text = "genecard.org"
        new_link = f'<a href="https://www.genecards.org/cgi-bin/carddisp.pl?id_type=hgnc&id={hgnc_id}" target="_blank">{visible_text}</a>'
        return new_link
    return None

def convert_hgnc_url_disease(col):
    hgnc_id = extract_hgnc_id(col)  # Extract the HGNC ID
    if hgnc_id:
        visible_text = "see here"
        new_link = f'<a href="https://www.genecards.org/cgi-bin/carddisp.pl?id_type=hgnc&id={hgnc_id}#diseases" target="_blank">{visible_text}</a>'
        return new_link
    return None

def convert_hgnc_url_exp(col):
    hgnc_id = extract_hgnc_id(col)  # Extract the HGNC ID
    if hgnc_id:
        visible_text = "see here"
        new_link = f'<a href="https://www.genecards.org/cgi-bin/carddisp.pl?id_type=hgnc&id={hgnc_id}#expression" target="_blank">{visible_text}</a>'
        return new_link
    return None

def prepare_dataframes(gene_pair0):
    """Prepare interaction, ligand, and receptor dataframes."""
    # DBlength = len(gene_pair0)
    # gene_pair0["Interaction ID"] = [f"CDB{str(i).zfill(4)}" for i in range(1, DBlength + 1)]
    gene_pair0["Interaction Type"] = [
        f'{ligand} {ligandLocation} ligand binds to {receptor} {receptorLocation} receptor'
        for ligand, ligandLocation, receptor, receptorLocation in zip(
            gene_pair0["Ligand"], gene_pair0["Ligand location"],
            gene_pair0["Receptor"], gene_pair0["Receptor location"]
        )
    ]
    interaction_card = gene_pair0[["Interaction ID", "Human LR Pair", "Interaction Type", "Perplexity", "PMID support", "Top Pathway", "Cancer-related", "Disease Type"]]
    interaction_card["Perplexity"] = interaction_card["Perplexity"].str.replace('size=30', 'size=80')

    pop_up_info_lim = pop_up_info[
        ["Approved symbol", "Alias symbol", "Previous symbol", "Date symbol changed"]
    ].drop_duplicates(subset="Approved symbol", keep="first")
    
    ligand_card = gene_pair0[["Human LR Pair", "Ligand", "Ligand name", "Ligand HGNC ID", "Ligand MGI ID", "Ligand RGD ID", "Ligand location"]].merge(
        pop_up_info_lim, how='left', left_on='Ligand', right_on='Approved symbol'
    ).drop_duplicates(subset='Human LR Pair', keep="first").drop(columns=["Ligand", "Approved symbol"])

    ligand_card_1 = ligand_card[["Human LR Pair", "Alias symbol", "Date symbol changed", "Ligand name"]] 
    ligand_card_2 = ligand_card[["Human LR Pair", "Ligand HGNC ID", "Ligand location"]] 
    # Convert links
    ligand_card_2["HGNC gene card"] = ligand_card_2["Ligand HGNC ID"].apply(convert_hgnc_url)
    ligand_card_2["Disease relevance"] = ligand_card_2["Ligand HGNC ID"].apply(convert_hgnc_url_disease)
    ligand_card_2["Expression Profile"] = ligand_card_2["Ligand HGNC ID"].apply(convert_hgnc_url_exp)
    ligand_card_2 = ligand_card_2[["Human LR Pair", "Ligand HGNC ID", "HGNC gene card", "Disease relevance", "Expression Profile", "Ligand location"]]       

    receptor_card = gene_pair0[["Human LR Pair", "Receptor", "Receptor name", "Receptor HGNC ID", "Receptor MGI ID", "Receptor RGD ID", "Receptor location"]].merge(
        pop_up_info_lim, how='left', left_on='Receptor', right_on='Approved symbol'
    ).drop_duplicates(subset='Human LR Pair', keep="first").drop(columns=["Receptor", "Approved symbol"])
    
    receptor_card_1 = receptor_card[["Human LR Pair", "Alias symbol", "Date symbol changed", "Receptor name"]] 
    receptor_card_2 = receptor_card[["Human LR Pair", "Receptor HGNC ID", "Receptor location"]] 
    receptor_card_2["HGNC gene card"] = receptor_card_2["Receptor HGNC ID"].apply(convert_hgnc_url)
    receptor_card_2["Disease relevance"] = receptor_card_2["Receptor HGNC ID"].apply(convert_hgnc_url_disease)
    receptor_card_2["Expression Profile"] = receptor_card_2["Receptor HGNC ID"].apply(convert_hgnc_url_exp)
    receptor_card_2 = receptor_card_2[["Human LR Pair", "Receptor HGNC ID", "HGNC gene card", "Disease relevance", "Expression Profile", "Receptor location"]]       

    return interaction_card, ligand_card_1, ligand_card_2, receptor_card_1, receptor_card_2

def generate_html_files(template, interaction_card, ligand_card_1, receptor_card_1, ligand_card_2, receptor_card_2, output_dir):
    """Generate HTML files for each Human LR Pair."""
    column_values = interaction_card["Human LR Pair"].dropna().unique()
    os.makedirs(output_dir, exist_ok=True)

    # Encode the plotlegend image to base64
    plotlegend_image_path = "data/image/plotlegend.webp"
    plotlegend_base64 = encode_image(plotlegend_image_path)  # Convert WebP to base64

    for value in column_values:
        value1, value2 = value.split()
        row0 = interaction_card[interaction_card['Human LR Pair'] == value]
        row1 = ligand_card_1[ligand_card_1['Human LR Pair'] == value]
        row2 = receptor_card_1[receptor_card_1['Human LR Pair'] == value]
        row3 = ligand_card_2[ligand_card_2['Human LR Pair'] == value]
        row4 = receptor_card_2[receptor_card_2['Human LR Pair'] == value]

        # Check if the HTML files exist
        ligand_image_path = f'data/gene_expr_plots/{value1}.html'
        receptor_image_path = f'data/gene_expr_plots/{value2}.html'
        
        if os.path.exists(ligand_image_path):
            with open(ligand_image_path, "r") as html_file:
                ligand_image = html_file.read()  # Read the HTML content
        else:
            ligand_image = "Plot does not exist"
        
        if os.path.exists(receptor_image_path):
            with open(receptor_image_path, "r") as html_file:
                receptor_image = html_file.read()  # Read the HTML content
        else:
            receptor_image = "Plot does not exist"


        table0_data = row0.drop('Human LR Pair', axis=1).to_dict(orient='records')[0] if not row0.empty else {}
        table1_data = row1.drop('Human LR Pair', axis=1).to_dict(orient='records')[0] if not row1.empty else {}
        table2_data = row2.drop('Human LR Pair', axis=1).to_dict(orient='records')[0] if not row2.empty else {}
        table3_data = row3.drop('Human LR Pair', axis=1).to_dict(orient='records')[0] if not row3.empty else {}
        table4_data = row4.drop('Human LR Pair', axis=1).to_dict(orient='records')[0] if not row4.empty else {}

        rendered_content = template.render(
            value1=value1,
            value2=value2,
            table0_data=table0_data,
            table1_data=table1_data,
            table2_data=table2_data,
            table3_data=table3_data,
            table4_data=table4_data,
            ligand_image=ligand_image,
            receptor_image=receptor_image,
            plotlegend_base64=plotlegend_base64 
        )
        
        output_file = os.path.join(output_dir, f"{value1} {value2}.html")
        with open(output_file, 'w') as file:
            #time.sleep(0.5)
            file.write(rendered_content)

In [None]:
gene_pair0["Human LR Pair"] == "GPR15LG GPR182"

In [None]:
# Function to update and display the table
def update_table():
    show(
        human_gene_pair,
        keys=True,
        layout={"top": ["searchPanes"],
                },
        searchPanes={"layout": "rows-1", 
                     "cascadePanes": True, 
                     "columns": [2,3,4,7,13,14,15,17], 
                     "regex": True, "caseInsensitive": False, 
                     "smart": True, 
                     "initCollapsed": True, 
                     "controls": True},
        #layout={"bottom": "searchBuilder", "title": "Advanced Search"},
        fixedColumns={"start": 2, "end": 0},
        fixedHeader=True,  # Keep header fixed for alignment
        column_filters="footer", 
        scrollX=True,
        classes="display nowrap cell-border compact",
        select=True,
        selected_rows=[],
        responsive=True,  # Enable responsive behavior
        theme='grid',
        search={"regex": True, "caseInsensitive": False, "smart": True},
        lengthMenu=[ [10, 20, 50, 100, -1], ["10", "20", "50", "100", "All"] ],
        buttons=[
            "pageLength",  
            {"extend": "colvis", "text": "Display Columns"},
            {"extend": "csvHtml5", "title": "Source"},
            {"extend": "excelHtml5", "title": "Source"},
            {"extend": "copyHtml5", "title": "Source"},
        ],
        columnDefs=[
            {"className": "dt-center", "targets": [5]}, 
            # {"width": "100px", "targets": "_all"},
            {"targets": list(range(-1, -17, -1)), "visible": False} # Hide last 12 columns by default
        ],
        style="1200px;margin:auto",
           language={
            "search": "Search Any Column:",
            "searchPlaceholder": "e.g. CD24",
            "searchPanesPlaceholder": "",
           # "searchBuilder": {
           #     "title": "Advanced Search <i>Multiple filtering, AND/OR condition</i>" 
           # }
        }, 
        autoWidth=True,
        maxBytes=0,  # So it does not downsample
        initCode='''
            $(document).on('draw.dt', function () {
                $($.fn.dataTable.tables(true)).DataTable().columns.adjust();
            });
        ''',
    )
      
      

# Call the function to render the table
update_table()

In [None]:
human_gene_pair.columns[13]

In [None]:
gene_pair_annot = gene_pair0[["Human LR Pair", "Cancer-related", "Top Pathway"]]
df= pd.read_csv("data/disease_annotations_per_pair.csv") # Liana Diseases
gene_pair_annot = gene_pair_annot.merge(df, how='left', left_on='Human LR Pair', right_on='interaction')
gene_pair_annot = gene_pair_annot.drop(columns=["interaction"])
df= pd.read_csv("data/pathway_annotations_per_pair.csv") # Liana Pathway
gene_pair_annot = gene_pair_annot.merge(df, how='left', left_on='Human LR Pair', right_on='interaction')
gene_pair_annot = gene_pair_annot.drop(columns=["interaction", "weight"])

gene_pair_annot = gene_pair_annot.rename(columns={
                                     "disease": "Disease", 
                                     "source": "Related Pathway"}
                            )

In [None]:
pgene_pair0["Human LR Pair"] == "GPR15LG GPR182"

In [None]:
# Main execution
if __name__ == "__main__":
    template = load_template(TEMPLATE_PATH)
    interaction_card, ligand_card_1, receptor_card_1, ligand_card_2, receptor_card_2 = prepare_dataframes(gene_pair0)
    generate_html_files(template, interaction_card, ligand_card_1, receptor_card_1, ligand_card_2, receptor_card_2, OUTPUT_DIR)


In [None]:
fetchGSheet.gene_pair.columns

In [None]:
## Function to prepare datatables (cleaning and hyperlinking, adding tool tips, etc) input for the database qmds
import sys, os
from itables import init_notebook_mode
import pandas as pd
from itables import show
from itables import options
from IPython.display import HTML, display
import numpy as np
import fetchGSheet 
import warnings

# Suppress SettingWithCopyWarning
warnings.simplefilter("ignore", category=UserWarning)


# Other vertebrates
species_list = [
    "ptroglodytes", "ggallus", "sscrofa", "btaurus", 
    "clfamiliaris", "ecaballus", "oarambouillet"
]

# Select only the relevant columns from pop_up_info
pop_up_info = pd.read_table("data/HGNC_gene_info_full.tsv")
pop_up_info = pop_up_info.rename(columns={"hgnc_id": "HGNC ID", 
                                          "name": "Approved name",
                                          "symbol": "Approved symbol",
                                          "rgd_id": "RGD ID",
                                          "mgd_id": "MGI ID", 
                                          "rgd_id": "RGD ID",
                                          "alias_symbol": "Alias symbol",
                                          "prev_symbol": "Previous symbol",
                                          "date_symbol_changed": "Date symbol changed"
                                          
                                         })

pop_up_info_lim = pop_up_info[["HGNC ID", "Approved name", "MGI ID", "RGD ID"]] # rm "Approved symbol" for now
pop_up_info_lim = pop_up_info_lim.drop_duplicates(subset="HGNC ID", keep="first")

# Drop columns where all values are NA in gene_pair
gene_pair = fetchGSheet.gene_pair.dropna(axis=1, how='all')

# for now, rm some columns
gene_pair = gene_pair[['LR pair', 'Ligand', 'Ligand.HGNC', 'Receptor', 'Receptor.HGNC',
                       'perplexity link', 'PMID', 'binding location', 
                       'bind in trans?', 'bidirectional signalling?',
                       'interaction type', 'original source']]

# Mapping for replacements
mapping = dict(zip(fetchGSheet.src_info['original source'], fetchGSheet.src_info['shortname']))
# Replace values in the column based on the mapping
gene_pair['original source'] = gene_pair['original source'].replace(mapping)

## add Ligand/Receptor Location
mapping_loc = dict(zip(fetchGSheet.loc_info['ApprovedSymbol'], fetchGSheet.loc_info['Localization']))
gene_pair['Ligand location'] = gene_pair['Ligand'].replace(mapping_loc)
gene_pair['Receptor location'] = gene_pair['Receptor'].replace(mapping_loc)

# Fetch species IDs from the dataset
hgnc_id = [col for col in gene_pair.columns if "HGNC" in col]
hgnc_id = pd.concat([gene_pair[col] for col in hgnc_id]).unique()

# Rename columns for better clarity
gene_pair = gene_pair.rename(columns={
    "LR pair": "Human LR Pair",
    "Ligand.HGNC": "Ligand HGNC ID",
    "Receptor.HGNC": "Receptor HGNC ID",
    "perplexity link": "Perplexity", # will be replaced with actual link later
    "original source": "Database Source",
    "PMID": "PMID support"
})

# Recreate Perplexity link
# Function to generate Perplexity search link
def create_url_basic(gene_name):
    query = f"What is the primary evidence that {gene_name} bind-each-other-as-a-ligand-and-receptor-pair. Exclude reviews, uniprot, wiki, genecards, PIPS, iuphar as sources."
    encoded_query = query.replace(" ", "%20")
    return f"https://www.perplexity.ai/search?q={encoded_query}"

# Apply function to the DataFrame
gene_pair["Perplexity"] = gene_pair["Perplexity"].apply(create_url_basic)

# Merge gene_pair with pop_up_info_lim for Ligand(L)
gene_pair = gene_pair.merge(pop_up_info_lim, how='left', left_on='Ligand HGNC ID', right_on='HGNC ID')

gene_pair = gene_pair.rename(columns={"Approved name": "Ligand name", 
                                     "MGI ID": "Ligand MGI ID",
                                     "RGD ID": "Ligand RGD ID"},
                            )
gene_pair = gene_pair.drop(columns=["HGNC ID"])
# Add top pathway per pair
LR_pairs = gene_pair["Human LR Pair"].unique()
df= pd.read_csv("data/pathway_annotations_per_pair.csv")
df = df[df["interaction"].isin(LR_pairs)]
# Sort by absolute value of 'weight', descending (larger abs(weight) first)
df_sorted = df.reindex(df['weight'].abs().sort_values(ascending=False).index)
# Keep only the first occurrence for each unique 'interaction'
df_unique = df_sorted.drop_duplicates(subset='interaction', keep='first')
df = df_unique.reset_index(drop=True)
top_pathway_df = df[["interaction", "source"]]
top_pathway_df = top_pathway_df.rename(columns={
                                      "source": "Top Pathway"
})
gene_pair = gene_pair.merge(top_pathway_df, how='left', left_on='Human LR Pair', right_on='interaction')

# Add Disease Category per pair
df= pd.read_csv("data/diseaseType_per_pair.csv")
disease_df = df[df["interaction_x"].isin(LR_pairs)]

gene_pair = gene_pair.merge(disease_df, how='left', left_on='Human LR Pair', right_on='interaction_x')

# Add MGI annotation
MGI_info = pd.read_csv("data/MGI_ID_biomart.csv")
gene_pair = gene_pair.merge(MGI_info, how='left', left_on='Ligand MGI ID', right_on='MGI ID')

# Find rows where Ligand HGNC ID is missing & copy Ligand to MGI name for those rows
mask = gene_pair['Ligand HGNC ID'].astype(str).str.strip() == ''
gene_pair.loc[mask, 'MGI name'] = gene_pair.loc[mask, 'Ligand']
# Map MGI ID using the MGI_info table
gene_pair = gene_pair.merge(MGI_info, left_on='MGI name', right_on='MGI name', how='left', suffixes=('', '_from_info'))
# Fill missing 'MGI ID' only where it was previously missing
gene_pair['Ligand MGI ID'] = gene_pair['Ligand MGI ID'].combine_first(gene_pair['MGI ID_from_info'])
gene_pair = gene_pair.drop(columns=['MGI ID_from_info'])

# Add RGD annotation
RGD_info = pd.read_csv("data/RGD_ID_biomart.csv")
RGD_info['RGD ID'] = "RGD:" + RGD_info['RGD ID'].astype(str)
gene_pair = gene_pair.merge(RGD_info, how='left', left_on='Ligand RGD ID', right_on='RGD ID')

# Add ZFIN id and symbol
ZFIN_info = pd.read_csv("data/ZFIN_ID_human_orthos.txt", sep="\t", skiprows=1)
ZFIN_info = ZFIN_info[['ZFIN ID', 'ZFIN Symbol', 'ZFIN Name', 'HGNC ID']]

ZFIN_info = ZFIN_info.dropna(subset=['HGNC ID'])
ZFIN_info = ZFIN_info.drop_duplicates(subset=['HGNC ID'])
ZFIN_info['HGNC ID'] = ZFIN_info['HGNC ID'].apply(lambda x: f'HGNC:{int(x)}')
gene_pair = gene_pair.merge(ZFIN_info, how='left', left_on='Ligand HGNC ID', right_on='HGNC ID')

gene_pair = gene_pair.drop(columns=["RGD ID", "MGI ID", "HGNC ID", "interaction", "interaction_x"])

gene_pair = gene_pair.rename(columns={
                                     "MGI name": "Mouse Ligand", 
                                     "RGD name": "Rat Ligand",
                                     "ZFIN ID": "Ligand ZFIN ID",
                                     "ZFIN Symbol": "Zebrafish Ligand",
                                     "ZFIN Name": "Zebrafish Ligand name"}
                            )

gene_pair = gene_pair.merge(pop_up_info_lim, how='left', left_on='Receptor HGNC ID', right_on='HGNC ID')

gene_pair = gene_pair.rename(columns={"Approved name": "Receptor name",
                                      "MGI ID": "Receptor MGI ID",
                                      "RGD ID": "Receptor RGD ID"}
                            )


gene_pair = gene_pair.drop(columns=["HGNC ID"])

# Add MGI name
gene_pair = gene_pair.merge(MGI_info, how='left', left_on='Receptor MGI ID', right_on='MGI ID')
# Find rows where Receptor HGNC ID is missing & copy Receptor to MGI name for those rows
mask = gene_pair['Ligand HGNC ID'].astype(str).str.strip() == ''
gene_pair.loc[mask, 'MGI name'] = gene_pair.loc[mask, 'Receptor']
# Map MGI ID using the MGI_info table
gene_pair = gene_pair.merge(MGI_info, left_on='MGI name', right_on='MGI name', how='left', suffixes=('', '_from_info'))
# Fill missing 'MGI ID' only where it was previously missing
gene_pair['Receptor MGI ID'] = gene_pair['Receptor MGI ID'].combine_first(gene_pair['MGI ID_from_info'])
gene_pair = gene_pair.drop(columns=['MGI ID_from_info'])

gene_pair = gene_pair.merge(RGD_info, how='left', left_on='Receptor RGD ID', right_on='RGD ID')
gene_pair = gene_pair.merge(ZFIN_info, how='left', left_on='Receptor HGNC ID', right_on='HGNC ID')
gene_pair = gene_pair.drop(columns=["RGD ID", "MGI ID", "HGNC ID"])

gene_pair = gene_pair.rename(columns={
                                     "MGI name": "Mouse Receptor", 
                                     "RGD name": "Rat Receptor",
                                     "ZFIN ID": "Receptor ZFIN ID",
                                     "ZFIN Symbol": "Zebrafish Receptor",
                                     "ZFIN Name": "Zebrafish Receptor name"})

gene_pair.tail()

In [None]:
LR_pairs = gene_pair["Human LR Pair"].unique()
df= pd.read_csv("data/pathway_annotations_per_pair.csv")
#df = df[df["interaction"].isin(LR_pairs)]
# Sort by absolute value of 'weight', descending (larger abs(weight) first)
df_sorted = df.reindex(df['weight'].abs().sort_values(ascending=False).index)
# Keep only the first occurrence for each unique 'interaction'
df_unique = df_sorted.drop_duplicates(subset='interaction', keep='first')
df = df_unique.reset_index(drop=True)
top_pathway_df = df[["interaction", "source"]]
top_pathway_df = top_pathway_df.rename(columns={
                                      "source": "Top Pathway"
})

In [None]:
top_pathway_df

In [None]:
gene_pair.columns

In [None]:
# Find rows where Ligand HGNC ID is missing & copy Ligand to MGI name for those rows
mask = gene_pair['Ligand HGNC ID'].astype(str).str.strip() == ''
gene_pair.loc[mask, 'Ligand MGI name'] = gene_pair.loc[mask, 'Ligand']
mask
gene_pair

In [None]:
# Map MGI ID using the MGI_info table
gene_pair = gene_pair.merge(MGI_info, left_on='Ligand MGI name', right_on='MGI name', how='left', suffixes=('', '_from_info'))
gene_pair

In [None]:
# Fill missing 'MGI ID' only where it was previously missing
gene_pair['Ligand MGI ID'] = gene_pair['Ligand MGI ID'].combine_first(gene_pair['MGI ID_from_info'])
gene_pair = gene_pair.drop(columns=['MGI ID_from_info'])

In [None]:
MGI_info[MGI_info["MGI name"] == "Pcdhb17"]

In [None]:
df

In [None]:
gene_pair[["PMID support"]]

In [None]:
# Add ZFIN id and symbol
ZFIN_info = pd.read_csv("data/ZFIN_ID_human_orthos.txt", sep="\t", skiprows=1)
ZFIN_info = ZFIN_info[['ZFIN ID', 'ZFIN Symbol', 'ZFIN Name', 'HGNC ID']]

ZFIN_info = ZFIN_info.dropna(subset=['HGNC ID'])
ZFIN_info = ZFIN_info.drop_duplicates(subset=['HGNC ID'])
ZFIN_info['HGNC ID'] = ZFIN_info['HGNC ID'].apply(lambda x: f'HGNC:{int(x)}')
ZFIN_info

In [None]:
gene_pair = gene_pair.merge(ZFIN_info, how='left', left_on='Ligand HGNC ID', right_on='HGNC ID')
gene_pair

In [None]:
gene_pair.columns

In [None]:
pop_up_info_lim = pop_up_info[["HGNC ID", "Approved symbol", "Approved name", "MGI ID", "RGD ID"]]
pop_up_info_lim = pop_up_info_lim.drop_duplicates(subset="HGNC ID", keep="first")
pop_up_info_lim

In [None]:
import sys
import os
import pandas as pd
from itables import init_notebook_mode, show
from IPython.display import display, Javascript
import itables.options as opt
# Change working directory to ConnectomeDB
project_root = os.path.dirname(os.getcwd())
os.chdir(project_root)
sys.path.append(os.path.abspath("src"))

In [None]:
os.getcwd()

In [None]:
os.chdir('/Users/sakuramaezono/Library/CloudStorage/OneDrive-YokohamaCityUniversity/Personal/05_Python_repositories/ConnectomeDB')

In [None]:
import liana as li
import omnipath as op
import decoupler as dc
import pandas as pd

import sys
import os
sys.path.append(os.path.abspath("src"))  # Add src directory to path
from createDataTable import gene_pair0
gene_pair0

In [None]:
import liana as li
import omnipath as op
import decoupler as dc
import pandas as pd

import sys
import os
sys.path.append(os.path.abspath("src"))  # Add src directory to path
from createDataTable import gene_pair0

gene_pair_annot = gene_pair0[["Human LR Pair", "Cancer-related", "Top Pathway"]]
df= pd.read_csv("data/disease_annotations_per_pair.csv") # Liana Diseases
gene_pair_annot = gene_pair_annot.merge(df, how='left', left_on='Human LR Pair', right_on='interaction_x')
df= pd.read_csv("data/pathway_annotations_per_pair.csv") # Liana Pathway
gene_pair_annot = gene_pair_annot.merge(df, how='left', left_on='Human LR Pair', right_on='interaction')
gene_pair_annot = gene_pair_annot.drop(columns=["interaction_x", "interaction", "weight"])

gene_pair_annot = gene_pair_annot.rename(columns={
                                     "disease": "Disease", 
                                     "source": "Related Pathway"}
                            )
# Create the links to the HTML cards
gene_pair_annot["Human LR Pair"] = [
    f'<a href="https://comp.med.yokohama-cu.ac.jp/collab/connectomeDB/cards/{lrPairOrig}.html">{lrPair}</a>'
    for lrPairOrig, lrPair in zip(gene_pair_annot["Human LR Pair"], gene_pair_annot["Human LR Pair"])
]

# reorder
gene_pair_annot = gene_pair_annot[["Human LR Pair", "Disease", "Disease Type", "Cancer-related",  "Related Pathway", "Top Pathway"]]
gene_pair_annot["Disease"] = gene_pair_annot["Disease"].apply(
    lambda x: "unknown" if pd.isna(x) or str(x).strip().lower() in ["nan", "none", ""] else x)
gene_pair_annot["Disease Type"] = gene_pair_annot["Disease Type"].apply(
    lambda x: "unknown" if pd.isna(x) or str(x).strip().lower() in ["nan", "none", ""] else x)
gene_pair_annot["Related Pathway"] = gene_pair_annot["Related Pathway"].apply(
    lambda x: "unknown" if pd.isna(x) or str(x).strip().lower() in ["nan", "none", ""] else x)
gene_pair_annot

In [None]:
gene_pair_annot

In [None]:
# Drop columns where all values are NA in gene_pair
gene_pair = gene_pair.dropna(axis=1, how='all')

gene_pair = gene_pair.fillna(" ")
gene_pair = gene_pair[gene_pair['Human LR Pair'] != ' ']

if "PMID link" in gene_pair.columns:
    gene_pair = gene_pair.drop(columns=["PMID link"])

# Add
first_columns=['Human LR Pair', 'Ligand', 'Receptor', 'Interaction Source']

end_columns=['HGNC L R', 'sanity check', 'curator', 'secondary source?']
gene_pair = gene_pair[first_columns + [col for col in gene_pair.columns if col not in first_columns + end_columns] + end_columns]


# number of unique vars

lrPairsCount = len(gene_pair["Human LR Pair"].unique())

ligandCount = len(gene_pair["Ligand"].unique())

receptorCount = len(gene_pair["Receptor"].unique())

# Mouse Orthologue
MouseLigandCount = len(gene_pair["Ligand MGI ID"].unique())

MouseReceptorCount = len(gene_pair["Receptor MGI ID"].unique())

# Rat Orthologue
RatLigandCount = len(gene_pair["Ligand RGD ID"].unique())

RatReceptorCount = len(gene_pair["Receptor RGD ID"].unique())

gene_pair["PMID support"] = [value.replace(" ", "") for value in gene_pair["PMID support"]]

source = np.array(gene_pair["PMID support"].unique())
source = source.astype(str)
source = ",".join(sorted(set(filter(lambda x: x.lower() != 'nan', source))))

# Split the string into individual elements, filter out empty strings, and get unique values
source = sorted(
    set(filter(lambda x: x.strip() and x.strip().lower() != 'nan', source.split(',')))
)
source = [value.replace(" ", "") for value in source]
sourceCount = len(source)

# for creating PMIDs
gene_pair00 = gene_pair[['Human LR Pair', 'PMID support']]

# create URLs for the HGNC IDs

# ligand
gene_pair["Ligand HGNC ID"] = [
    '<a href="https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/{}" target="_blank">{}</a>'.format(ligand, ligand)
    for ligand in gene_pair["Ligand HGNC ID"]
]

# receptor
gene_pair["Receptor HGNC ID"] = [
    '<a href="https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/{}" target="_blank">{}</a>'.format(receptor, receptor)
    for receptor in gene_pair["Receptor HGNC ID"]
]

# Perplexity
gene_pair["Perplexity"] = [
    '<a href="{}" target="_blank"> <img src="https://img.icons8.com/?size=30&id=0NbBuNOxUwps&format=png&color=000000" alt="Perplexity AI" /></a>'.format(url)
    for url in gene_pair["Perplexity"]
]

# Function to generate hyperlinks for the "PMID support" column
# Function to generate hyperlinks for the "PMID support" column
def generate_links_with_doi(df, gene_column, pmid_column):
    def create_link(gene, sources):
        # Replace spaces with "——" in the gene name for the link
        gene_name = gene.replace(" ", "——")
        
        if len(sources) == 1:
            source = sources[0]
            if source.startswith("https://www.biorxiv.org/content/"):
                # If the value starts with "https://doi.org/", use it as the hyperlink
                return f'<a href="{source}" target="_blank">BioRxiv preprint</a>'
            else:
                # If it's a single PMID, hyperlink the PMID text
                return f'<a href="https://comp.med.yokohama-cu.ac.jp/collab/connectomeDB/pubmed/{gene_name}_pmid_details.html">{source}</a>'
        else:
            # If multiple PMIDs, show the count and hyperlink to the page
            return f'<a href="https://comp.med.yokohama-cu.ac.jp/collab/connectomeDB/pubmed/{gene_name}_pmid_details.html" target="_blank">{len(sources)} PMIDs</a>'

    # Process each row to generate the "PMID support" column
    df["PMID support"] = [
        create_link(
            gene=row[gene_column], 
            sources=[s.strip() for s in row[pmid_column].split(',') if s.strip()]
        )
        for _, row in df.iterrows()
    ]
    return df


# Generate the links for the "PMID support" column
gene_pair = generate_links_with_doi(gene_pair, gene_column="Human LR Pair", pmid_column="PMID support")

gene_pair["Ligand MGI ID"] = [
        f'<a href="https://www.informatics.jax.org/marker/{mouseOrth}" target="_blank">{mouseOrth}</a>' 
        if pd.notna(mouseOrth) and mouseOrth.strip() else "" 
        for mouseOrth in gene_pair["Ligand MGI ID"]
    ]

gene_pair["Receptor MGI ID"] = [
        f'<a href="https://www.informatics.jax.org/marker/{mouseOrth}" target="_blank">{mouseOrth}</a>' 
        if pd.notna(mouseOrth) and mouseOrth.strip() else "" 
        for mouseOrth in gene_pair["Receptor MGI ID"]
    ]

gene_pair["Ligand RGD ID"] = [
        f'<a href="https://rgd.mcw.edu/rgdweb/report/gene/main.html?id={ratOrth.replace("RGD:", "")}" target="_blank">{ratOrth}</a>' 
        if pd.notna(ratOrth) and ratOrth.strip() else "" 
        for ratOrth in gene_pair["Ligand RGD ID"]
    ]

gene_pair["Receptor RGD ID"] = [
        f'<a href="https://rgd.mcw.edu/rgdweb/report/gene/main.html?id={ratOrth.replace("RGD:", "")}" target="_blank">{ratOrth}</a>' 
        if pd.notna(ratOrth) and ratOrth.strip() else "" 
        for ratOrth in gene_pair["Receptor RGD ID"]
    ]

In [None]:
gene_pair["Source"].unique()

In [None]:
## Function to prepare datatables (cleaning and hyperlinking, adding tool tips, etc) input for the database qmds
import sys, os
from itables import init_notebook_mode
import pandas as pd
from itables import show
from itables import options
from IPython.display import HTML, display
import numpy as np
import fetchGSheet 
import warnings

# Suppress SettingWithCopyWarning
warnings.simplefilter("ignore", category=UserWarning)


# Other vertebrates
species_list = [
    "ptroglodytes", "ggallus", "sscrofa", "btaurus", 
    "clfamiliaris", "ecaballus", "oarambouillet"
]

# Select only the relevant columns from pop_up_info
pop_up_info = pd.read_table("data/HGNC_gene_info_full.tsv")
pop_up_info = pop_up_info.rename(columns={"hgnc_id": "HGNC ID", 
                                          "name": "Approved name",
                                          "symbol": "Approved symbol",
                                          "rgd_id": "RGD ID",
                                          "mgd_id": "MGI ID", 
                                          "rgd_id": "RGD ID",
                                          "alias_symbol": "Alias symbol",
                                          "prev_symbol": "Previous symbol",
                                          "date_symbol_changed": "Date symbol changed"
                                          
                                         })

pop_up_info_lim = pop_up_info[["HGNC ID", "Approved name", "MGI ID", "RGD ID"]] # rm "Approved symbol" for now
pop_up_info_lim = pop_up_info_lim.drop_duplicates(subset="HGNC ID", keep="first")

# Drop columns where all values are NA in gene_pair
gene_pair = fetchGSheet.gene_pair.dropna(axis=1, how='all')

# for now, rm some columns
gene_pair = gene_pair[['LR pair', 'Ligand', 'Ligand.HGNC', 'Receptor', 'Receptor.HGNC',
                       'perplexity link', 'PMID', 'binding location', 
                       'bind in trans?', 'bidirectional signalling?',
                       'interaction type', 'original source']]

# Mapping for replacements
mapping = dict(zip(fetchGSheet.src_info['original source'], fetchGSheet.src_info['shortname']))
# Replace values in the column based on the mapping
gene_pair['original source'] = gene_pair['original source'].replace(mapping)

## add Ligand/Receptor Location
mapping_loc = dict(zip(fetchGSheet.loc_info['ApprovedSymbol'], fetchGSheet.loc_info['Localization']))
gene_pair['Ligand location'] = gene_pair['Ligand'].replace(mapping_loc)
gene_pair['Receptor location'] = gene_pair['Receptor'].replace(mapping_loc)

# Fetch species IDs from the dataset
hgnc_id = [col for col in gene_pair.columns if "HGNC" in col]
hgnc_id = pd.concat([gene_pair[col] for col in hgnc_id]).unique()

# Rename columns for better clarity
gene_pair = gene_pair.rename(columns={
    "LR pair": "Human LR Pair",
    "Ligand.HGNC": "Ligand HGNC ID",
    "Receptor.HGNC": "Receptor HGNC ID",
    "perplexity link": "Perplexity", # will be replaced with actual link later
    "original source": "Database Source",
    "PMID": "PMID support"
})

# Recreate Perplexity link
# Function to generate Perplexity search link
def create_url_basic(gene_name):
    query = f"What is the primary evidence that {gene_name} bind-each-other-as-a-ligand-and-receptor-pair. Exclude reviews, uniprot, wiki, genecards, PIPS, iuphar as sources."
    encoded_query = query.replace(" ", "%20")
    return f"https://www.perplexity.ai/search?q={encoded_query}"

# Apply function to the DataFrame
gene_pair["Perplexity"] = gene_pair["Perplexity"].apply(create_url_basic)

# Merge gene_pair with pop_up_info_lim for Ligand(L)
gene_pair = gene_pair.merge(pop_up_info_lim, how='left', left_on='Ligand HGNC ID', right_on='HGNC ID')

gene_pair = gene_pair.rename(columns={"Approved name": "Ligand name", 
                                     "MGI ID": "Ligand MGI ID",
                                     "RGD ID": "Ligand RGD ID"},
                            )
gene_pair = gene_pair.drop(columns=["HGNC ID"])
# Add top pathway per pair
LR_pairs = gene_pair["Human LR Pair"].unique()
df= pd.read_csv("data/pathway_annotations_per_pair.csv")
#df = df[df["interaction"].isin(LR_pairs)]
# Sort by absolute value of 'weight', descending (larger abs(weight) first)
df_sorted = df.reindex(df['weight'].abs().sort_values(ascending=False).index)
# Keep only the first occurrence for each unique 'interaction'
df_unique = df_sorted.drop_duplicates(subset='interaction', keep='first')
df = df_unique.reset_index(drop=True)
top_pathway_df = df[["interaction", "source"]]
top_pathway_df = top_pathway_df.rename(columns={
                                      "source": "Top Pathway"
})
top_pathway_df["interaction"] = [value.replace("^", " ") for value in top_pathway_df["interaction"]]
gene_pair = gene_pair.merge(top_pathway_df, how='left', left_on='Human LR Pair', right_on='interaction')

In [None]:
df= pd.read_csv("data/diseaseType_per_pair.csv")

In [None]:
disease_df = df[df["interaction_x"].isin(LR_pairs)]

gene_pair = gene_pair.merge(disease_df, how='left', left_on='Human LR Pair', right_on='interaction_x')
gene_pair

In [None]:
import pandas as pd
import os, sys
import json
# Add the src directory to the path for importing modules
sys.path.append(os.path.abspath("src"))
from createPMIDpages import gene_pair00

# Load the files
file1 = pd.read_csv("data/pubmed_results.csv") 
file2 = gene_pair00

# Convert the PMIDs column in file2 to lists for easy comparison
file1['PMID'] = file1['PMID'].astype(str)
file2['PMID_List'] = file2['PMID support'].apply(lambda x: x.split(','))

# Create a dictionary for quick PMID to Abstract mapping
pmid_to_abstract = dict(zip(file1['PMID'], file1['Abstract']))
pmid_to_abstract

In [None]:
data_for_llm

In [None]:
# Function to get all abstracts for a list of PMIDs
def get_abstracts(pmids):
    return [pmid_to_abstract[pmid] for pmid in pmids if pmid in pmid_to_abstract]

# Map abstracts to LR pairs
file2['Abstracts'] = file2['PMID_List'].apply(get_abstracts)

# Convert to a list of dictionaries
data_for_llm = file2[['Human LR Pair', 'Abstracts']].to_dict(orient='records')

# Save as JSON
with open("data/data_for_llm.json", "w") as f:
    json.dump(data_for_llm, f, indent=4)

In [None]:
# Save as JSON
with open("data/data_for_llm.json", "w") as f:
    json.dump(data_for_llm, f, indent=4)

In [None]:
# Function to add species-specific species Enseml ID and symbol for all other species except for mouse, rat, and zebrafish
def appendOtherSpeciesInfo(species, origDF):
    # Load species-specific data
    species_info = pd.read_csv(f"data/{species}_ID_biomart.csv")

    # Keep relevant columns
    species_info = species_info[[f"{species}_homolog_ensembl_gene", 
                                 f"{species}_homolog_associated_gene_name", 
                                 'hgnc_id']]

    # Remove rows where 'hgnc_id' is NaN and drop duplicates
    species_info = species_info.dropna(subset=['hgnc_id'])
    species_info = species_info.drop_duplicates(subset=['hgnc_id'])

    # Merge with ligand data
    origDF = origDF.merge(species_info, how='left', 
                           left_on='Ligand HGNC ID', right_on='hgnc_id')
    
    # Rename columns for ligand info
    origDF = origDF.rename(columns={
        f"{species}_homolog_associated_gene_name": f"{species} Ligand", 
        f"{species}_homolog_ensembl_gene": f"{species} Ligand Ensembl ID"
    })

    # Drop duplicate 'hgnc_id' column
    origDF = origDF.drop(columns=['hgnc_id'])

    # Merge with receptor data
    origDF = origDF.merge(species_info, how='left', 
                           left_on='Receptor HGNC ID', right_on='hgnc_id')

    # Rename columns for receptor info
    origDF = origDF.rename(columns={
        f"{species}_homolog_associated_gene_name": f"{species} Receptor", 
        f"{species}_homolog_ensembl_gene": f"{species} Receptor Ensembl ID"
    })

        # Drop duplicate 'hgnc_id' column
    origDF = origDF.drop(columns=['hgnc_id'])

    # Drop columns where all values are NaN
    origDF = origDF.dropna(axis=1, how='all')

    return origDF

species_list = [
    "ptroglodytes", "ggallus", "sscrofa", "btaurus", 
    "clfamiliaris", "ecaballus", "oarambouillet"
]

# Loop through each species and update gene_pair
for species in species_list:
    gene_pair = appendOtherSpeciesInfo(species, gene_pair)

In [None]:
gene_pair.columns

In [None]:
def add_row(change):
    global gene_pair
    # Add a new row at the top with None values
    new_row = {col: None for col in gene_pair.columns}
    gene_pair = pd.DataFrame([new_row] + gene_pair.to_dict(orient="records"))
    update_table()

# Function to remove the last row of the dataframe
def remove_row(change):
    global gene_pair
    if len(gene_pair) > 0:
        gene_pair = gene_pair[:-1]  # Remove the last row
        update_table()

In [None]:
gene_pair.columns

In [None]:
duplicates = gene_pair00[gene_pair00["Human LR Pair"].duplicated()]
print(duplicates["Human LR Pair"])

In [None]:
## Function to create horizontal bar plots of each gene in Human Taxon --expression log(x+1) transformed with cell types as y-axis

import requests
import pandas as pd
import numpy as np
import sys
import os
import matplotlib.pyplot as plt
import plotly.graph_objects as go

sys.path.append(os.path.abspath("src"))  # Add src directory to path
from createDataTable import gene_pair0

# Input file
input_file="data/connectome_j.tsv" #"data/connectome_j.tsv" # data/ExpressionGenes.txt
# Get all unique genes
ligand_list = gene_pair0["Ligand"].tolist()
receptor_list = gene_pair0["Receptor"].tolist()
unique_genes = list(set(ligand_list + receptor_list))  # Combine and remove duplicates

connectomeDB = pd.read_table(input_file, sep="\t")
# All Taxon for now
#connectomeDB = connectomeDB[connectomeDB["Taxon"]== "Human"]
if "Taxon" in connectomeDB.columns:
    connectomeDB = connectomeDB.drop(columns=["Localization", "Taxon"] + [col for col in connectomeDB.columns if col.startswith("F5_")])

In [None]:
column_sums = connectomeDB.iloc[:, 1:].sum()

In [None]:
connectomeDB.iloc[:, 1:].sum()

In [None]:
intersection = pd.Series(list(set(connectomeDB['ApprovedSymbol']).intersection(unique_genes)))
intersection

connectomeDB = connectomeDB[connectomeDB["ApprovedSymbol"].isin(intersection)]
connectomeDB
    

In [None]:
# log(x+1) transform
connectomeDB.iloc[:, 1:] = np.log1p(connectomeDB.iloc[:, 1:])
# Reshape 
connectomeDB_long = connectomeDB.melt(id_vars=["ApprovedSymbol"], 
                                      var_name="cellTypes", value_name="expr_val")
cellCat = pd.read_csv("data/cell_categories.csv")
connectomeDB_long = connectomeDB_long.merge(cellCat, how='left', left_on='cellTypes', right_on='cellType')
connectomeDB_long = connectomeDB_long.drop(columns=["cellType"])

intersection = pd.Series(list(set(connectomeDB_long['cellTypes']).intersection(set(cellCat['cellType']))))
intersection

diff_df = pd.Series(list(set(connectomeDB_long['cellTypes']).difference(set(cellCat['cellType']))))
diff_df

def plot_gene_expression(df):
    # Define the colors for each cell category
    colors = {
        "missing": "#B0B0B0",  # Neutral gray
        "other": "#D4A76A",  # Warm gold
        "mesenchymal": "#377EB8",  # Vibrant blue
        "epithelial": "#E41A1C",  # Bold red
        "hematopoietic": "#4DAF4A",  # Fresh green
        "endothelial": "#984EA3",  # Deep purple
        "nervous system": "#FF7F00",  # Bright orange
    }

    # Define sorting order for cell categories
    category_order = {cat: i for i, cat in enumerate(colors.keys())}

    for gene, sub_df in df.groupby("ApprovedSymbol"):
        # Sort by category first, then by expression value (highest first)
        sub_df = sub_df.copy()
        sub_df["category_order"] = sub_df["cellCategory"].map(category_order).fillna(len(category_order))
        sub_df = sub_df.sort_values(["category_order", "expr_val"], ascending=[True, False])

        num_bars = len(sub_df)

        # Plotly Figure setup
        fig = go.Figure()

        # Loop through each category and create a trace for it
        for category, color in colors.items():
            # Filter data for the current category
            category_data = sub_df[sub_df["cellCategory"] == category]

            # Add the trace for the current category
            fig.add_trace(go.Bar(
                y=category_data["cellTypes"],  # Categories for y-axis
                x=category_data["expr_val"],  # Expression values for x-axis
                orientation='h',  # Horizontal bars
                marker=dict(color=color),
                hovertemplate=
                    '<b>%{y}</b><br>' +  # Cell type (y-axis value)
                    'Expression Value: %{x}',  # Expression value (x-axis value)
                    #'Category: %{text}',  # Custom text (cell category)
                #text=category_data["cellCategory"],  # Pass the cell category as custom text
                name=category,  # Use the category name for the legend
                showlegend=True,  # Ensure the legend is shown for this trace
            ))

        # Update layout settings
        fig.update_layout(
            title="",
            xaxis_title="log(x+1) Expression value",
            yaxis_title="Cell Types",
            yaxis=dict(
                tickmode='array',
                tickvals=np.arange(num_bars),
                ticktext=sub_df["cellTypes"],
                tickangle=0,  # Avoid overlapping labels by setting the angle to 0
                tickfont=dict(size=6),  # Set font size for the labels
            ),
            showlegend=True,
            legend_title="Cell Category",
            legend=dict(
                orientation="v",  # Vertical legend
                yanchor="top",
                y=1,
                xanchor="",
                x=1.05,  # Position the legend outside of the plot area
                font=dict(size=10)
            ),
            margin=dict(t=50, b=50, l=150, r=50),
            height=min(1000, max(500, num_bars * 30)),  # Adjust plot height
            plot_bgcolor='rgba(0,0,0,0)',  # Transparent background
            paper_bgcolor='rgba(0,0,0,0)',  # Transparent paper background
        )

        # Save to HTML file
        fig.write_html(f"data/gene_expr_plots/{gene}.html")


plot_gene_expression(connectomeDB_long)

In [None]:
connectomeDB_long

## Testing Liana+

In [None]:
import liana as li
import omnipath as op
import decoupler as dc
import pandas as pd

In [None]:
import sys
import os
sys.path.append(os.path.abspath("src"))  # Add src directory to path
from createDataTable import gene_pair0

### Pathway Annotations

In [None]:
# load PROGENy pathways, we use decoupler as a proxy as it formats the data in a more convenient way
progeny = dc.get_progeny(top=10000)
progeny

In [None]:
lr_pairs = gene_pair0[["Ligand", "Receptor"]]
lr_pairs.columns = lr_pairs.columns.str.lower()

In [None]:
lr_pairs

In [None]:
# generate ligand-receptor geneset
lr_progeny = li.rs.generate_lr_geneset(lr_pairs, progeny, lr_sep="^")

In [None]:
lr_progeny

In [None]:
# some of the pairs are missing
len(lr_progeny["interaction"].unique())

In [None]:
output_file="data/pathway_annotations_per_pair.csv"
lr_progeny.to_csv(output_file, index=False)

In [None]:
whichDB= 'DisGeNet'
# A database of expression profiles related to human diseases, including cancer
diseases = op.requests.Annotations.get(
    resources = [whichDB]
    )

In [None]:
diseases

In [None]:
diseases.to_csv("data/" + whichDB + ".csv")

### Disease Annotations

In [None]:
# DisGeNet
diseases = op.requests.Annotations.get(
    resources = ['DisGeNet']
    )

In [None]:
diseases = diseases[['genesymbol', 'label', 'value']]
diseases = diseases.pivot_table(index='genesymbol',
                                columns='label', values='value',
                                aggfunc=lambda x: '; '.join(x)).reset_index()
diseases = diseases[['genesymbol', 'disease']]
diseases['disease'] = diseases['disease'].str.split('; ')
diseases = diseases.explode('disease')
lr_diseases = li.rs.generate_lr_geneset(lr_pairs, diseases, source='disease', target='genesymbol', weight=None, lr_sep="^")
lr_diseases.sort_values("interaction")

In [None]:
# some of the pairs are missing
len(lr_diseases["interaction"].unique())

In [None]:
output_file="data/disease_annotations_per_pair.csv"
lr_diseases.to_csv(output_file, index=False)

In [None]:
op.requests.Annotations.resources()

### Get FASTA sequences for each gene

In [None]:
import requests
import pandas as pd
import sys
import os

sys.path.append(os.path.abspath("src"))  # Add src directory to path/
from createDataTable import gene_pair0


# Get all unique genes
ligand_list = gene_pair0["Ligand"].tolist()
receptor_list = gene_pair0["Receptor"].tolist()
unique_genes = list(set(ligand_list + receptor_list)) 
LR_pairs=gene_pair0['Human LR Pair'].unique()

In [None]:
df= pd.read_csv("data/pathway_annotations_per_pair.csv")
df = df[df["interaction"].isin(LR_pairs)]
# Step 1: Sort by absolute value of 'weight', descending (larger abs(weight) first)
df_sorted = df.reindex(df['weight'].abs().sort_values(ascending=False).index)
# Keep only the first occurrence for each unique 'interaction'
df_unique = df_sorted.drop_duplicates(subset='interaction', keep='first')
df = df_unique.reset_index(drop=True)
top_pathway_df = df[["interaction", "source"]]
top_pathway_df = top_pathway_df.rename(columns={
                                      "source": "Top Pathway"
}
                            )
top_pathway_df

In [None]:
df= pd.read_csv("data/disease_annotations_per_pair.csv")
df_cat=pd.read_csv("data/disease_categories.csv")
mapping = dict(zip(df_cat['Disease Name'], df_cat['Category']))
# Replace values in the column based on the mapping
df["Disease Type"] = df['disease'].replace(mapping)
df = df[["interaction", "Disease Type"]].drop_duplicates()
df['Disease Type'] = df['Disease Type'].astype(str)
# Group by 'col1' and combine 'col2' values with ', '
df = df.groupby('interaction')['Disease Type'].apply(', '.join).reset_index()
# Create "Cancer-related" column based on whether "Cancers & Neoplasms" is in col2
df['Cancer-related'] = df['Disease Type'].apply(lambda x: 'Yes' if 'Cancer' in x else 'No')

In [None]:
# Create "Cancer-related" column based on whether "Cancers & Neoplasms" is in col2
df_combined['Cancer-related'] = df_combined['Disease Type'].apply(lambda x: 'Yes' if 'Cancer' in x else 'No')


In [None]:
df_combined

In [None]:
df_combined.to_csv("data/diseaseType_per_pair.csv", index =False)

In [None]:
pathway_list = df['source'].unique()
len(disease_list)
pair_list = df['interaction'].unique()

In [None]:
len(pair_list)

In [None]:
uniquepairs = list(set(pair_list) & set(LR_pairs)) 
len(uniquepairs)

In [None]:
# Get pair count per Disease
pairPerDisease = disease_df.groupby('source')['interaction'].nunique().reset_index()
pairPerDisease.to_csv("data/pairPerPathwayCount.csv")

In [None]:
# Get Disease count per LRPair
pairPerDisease = disease_df.groupby('interaction')['source'].nunique().reset_index()
pairPerDisease.to_csv("data/PathwayPerLRPair.csv")

In [None]:
len(df["Gene Symbol"].unique())

In [None]:
df= pd.read_table("data/human_uniprot_isoforms.tsv", sep="\t")

In [None]:
df.columns

In [None]:
df = df[['UniProt ID', 'Gene Symbol', 'Isoform Type', 'FASTA Sequence']]

In [None]:
df

In [None]:
lim_df = gene_pair0[["Human LR Pair", "Ligand", "Receptor"]]

In [None]:
lim_df

In [None]:
lim_df = lim_df.merge(df, how='left', left_on='Ligand', right_on='Gene Symbol')
lim_df = lim_df.drop(columns=["Gene Symbol"])
lim_df = lim_df.rename(columns={"FASTA Sequence": "Ligand Sequence",
                                "UniProt ID": "Ligand Isoform Uniprot ID",
                                "Isoform Type": "Ligand Isoform Type"})
lim_df

In [None]:
lim_df = lim_df.merge(df, how='left', left_on='Receptor', right_on='Gene Symbol')
lim_df = lim_df.drop(columns=["Gene Symbol"])
lim_df = lim_df.rename(columns={"FASTA Sequence": "Receptor Sequence",
                                "UniProt ID": "Receptor Isoform Uniprot ID",
                                "Isoform Type": "Receptor Isoform Type"})
lim_df

In [None]:
lim_df.to_csv("data/LRpair_uniprot_sequences.tsv", sep="\t", index=False)

In [None]:
import gzip
import re
import pandas as pd

# Step 1: Extract Gene Symbol Mapping from GTF
gtf_file = "data/gencode.v47.annotation.gtf.gz"
gene_map = {}

# Read GTF file and extract gene_id -> gene_name mapping
with gzip.open(gtf_file, "rt") as f:
    for line in f:
        if line.startswith("#"):  # Skip comments
            continue
        
        fields = line.strip().split("\t")
        if fields[2] == "gene":  # Only extract gene entries
            info = {key.strip(): value.strip('"') for key, value in re.findall(r'(\S+) "([^"]+)"', fields[8])}
            if "gene_id" in info and "gene_name" in info:
                gene_map[info["gene_id"]] = info["gene_name"]

print(f"✅ Extracted {len(gene_map)} gene mappings from GTF.")

In [None]:
# Step 2: Parse GENCODE Protein FASTA and Add Gene Symbols
fasta_file = "data/gencode.v47.pc_translations.fa.gz"

# Store extracted data
records = []

# Open the GENCODE FASTA file and parse sequences
with gzip.open(fasta_file, "rt") as f:
    header = None
    sequence = []
    
    for line in f:
        line = line.strip()
        
        if line.startswith(">"):
            # Store previous sequence if exists
            if header and sequence:
                # Extract the Gene Symbol using the Gene ID
                gene_symbol = gene_map.get(header["gene_id"], "Unknown")
                isoform_type = "Canonical" if "-1" in header["protein_id"] else "Alternative Isoform"
                
                # Append the parsed data to records
                records.append([header["protein_id"], header["transcript_id"], header["gene_id"], gene_symbol, isoform_type, "".join(sequence)])
            
            # Split header by '|' and extract necessary fields
            fields = line[1:].split("|")  # Skip the '>' symbol and split by '|'
            if len(fields) >= 6:
                header = {
                    "protein_id": fields[0], 
                    "transcript_id": fields[1], 
                    "gene_id": fields[2]  
                }
                sequence = []
            else:
                header = None
        
        elif header:
            sequence.append(line)

    # Add the last record if needed
    if header and sequence:
        gene_symbol = gene_map.get(header["gene_id"], "Unknown")
        isoform_type = "Canonical" if "-1" in header["protein_id"] else "Alternative Isoform"
        records.append([header["protein_id"], header["transcript_id"], header["gene_id"], gene_symbol, isoform_type, "".join(sequence)])

# Step 3: Convert to pandas DataFrame and Save to TSV
df = pd.DataFrame(records, columns=["Ensembl Protein ID", "Ensembl Transcript ID", "Ensembl Gene ID", "Gene Symbol", "Isoform Type", "FASTA Sequence"])

# Save to TSV
df.to_csv("data/gencode_protein_isoforms_with_symbols.tsv", sep="\t", index=False)

# Print completion message
print(f"✅ Extracted {len(df)} protein sequences with Gene Symbols and saved to 'gencode_protein_isoforms_with_symbols.tsv'.")

In [None]:
lim_df = gene_pair0[["Human LR Pair", "Ligand", "Receptor"]]
lim_df = lim_df.merge(df, how='left', left_on='Ligand', right_on='Gene Symbol')

In [None]:
lim_df

In [None]:
lim_df = lim_df.drop(columns=["Gene Symbol"])
lim_df = lim_df.rename(columns={"FASTA Sequence": "Ligand Sequence",
                                "Ensembl Protein ID": "Ligand Ensembl Protein ID",
                                "Ensembl Transcript ID": "Ligand Ensembl Transcript ID",
                                "Ensembl Gene ID": "Ligand Ensembl Gene ID",
                                "Isoform Type": "Ligand Isoform Type"})

In [None]:
lim_df

In [None]:
lim_df = lim_df.merge(df, how='left', left_on='Receptor', right_on='Gene Symbol')
lim_df = lim_df.drop(columns=["Gene Symbol"])
lim_df = lim_df.rename(columns={"FASTA Sequence": "Receptor Sequence",
                                "Ensembl Protein ID": "Receptor Ensembl Protein ID",
                                "Ensembl Transcript ID": "Receptor Ensembl Transcript ID",
                                "Ensembl Gene ID": "Receptor Ensembl Gene ID",
                                "Isoform Type": "Receptor Isoform Type"})

In [None]:
lim_df

In [None]:
lim_df.to_csv("data/LRpair_gencode_sequences.tsv", sep="\t", index=False)

####################################################################

In [None]:
## Function to scrape data from Pubmed for Title, Abstract, Journal, and Year
### IMPORTANT: TURN OFF VPN and make sure you have the data directory (from Sakura)

import sys
import requests
import pandas as pd
import time
import os
import xml.etree.ElementTree as ET

sys.path.append(os.path.abspath("src"))  
import fetchGSheet

# Read the API key from a file
with open("data/ncbi_api_key.txt", "r") as file:
    ncbi_api_key = file.read().strip()

# File to save the results
output_file = "data/pubmed_results.csv"

# Example of fetching HGNC gene symbols (you should have the `fetchGSheet.pop_up_info` dataframe ready)
def extract_hgnc_symbols(fetchGSheet):
    # Concatenate Approved, Alias, and Previous symbols, then extract unique symbols
    hgnc_symbols = pd.concat([
        fetchGSheet['Approved symbol'],
        fetchGSheet['Alias symbol'],
        fetchGSheet['Previous symbol']
    ], axis=0).dropna().str.upper().unique()  # Remove NaNs and make uppercase for matching
     # Remove any empty strings from the list
    hgnc_symbols = [symbol for symbol in hgnc_symbols if symbol != ""]
    return set(hgnc_symbols)  # Return as a set for fast lookup
    
hgnc_symbols = extract_hgnc_symbols(fetchGSheet.pop_up_info)

In [None]:
len(hgnc_symbols)

In [None]:
# Official species names and their corresponding terms (scientific names)
# Load your list of PMIDs
pmid_list = source
species_dict = {
    "human": "Homo sapiens",
    "mouse": "Mus musculus",
    "rat": "Rattus norvegicus",
    "rabbit": "Oryctolagus cuniculus",
    "monkey": "Macaca spp.",
    "dog": "Canis lupus familiaris",
    "pig": "Sus scrofa",
    "zebra fish": "Danio rerio",
    "chicken": "Gallus gallus",
    "horse": "Equus ferus caballus",
    "cat": "Felis catus",
    "sheep": "Ovis aries",
    "cow": "Bos taurus",
    "fruit fly": "Drosophila melanogaster",
    "c. elegans": "Caenorhabditis elegans",
}

def fetch_pubmed_data(pmid_list, hgnc_symbols):
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    results = []

    # Load existing data if output file exists
    if os.path.exists(output_file):
        existing_data = pd.read_csv(output_file)
    else:
        existing_data = pd.DataFrame(columns=["PMID", "Title", "Abstract", "Journal", "Year", "Species"])

    # Split PMIDs into batches
    batch_size = 50
    pmid_batches = [pmid_list[i:i + batch_size] for i in range(0, len(pmid_list), batch_size)]

    # Iterate over the batches
    for batch in pmid_batches:
        params = {
            "db": "pubmed",
            "id": ",".join(batch),  # Join PMIDs as comma-separated
            "retmode": "xml",
            "api_key": ncbi_api_key
        }

        try:
            response = requests.get(base_url, params=params)
            response.raise_for_status()

            # Parse the XML response
            root = ET.fromstring(response.text)
            for article in root.findall(".//PubmedArticle"):
                # Extract Title and Abstract
                title = article.findtext(".//ArticleTitle", default="N/A")
                abstract = article.findtext(".//AbstractText", default="No abstract available")

                # Extract Journal Title
                journal_tag = article.find(".//Journal/Title")
                journal = journal_tag.text.strip() if journal_tag is not None and journal_tag.text else "N/A"

                # Extract Publication Year
                pub_date = article.find(".//PubDate")
                if pub_date is not None:
                    year_tag = pub_date.find("Year")
                    year = year_tag.text if year_tag is not None else "N/A"

                    # Fallback to MedlineDate if Year is missing
                    if year == "N/A":
                        medline_date_tag = pub_date.find("MedlineDate")
                        year = medline_date_tag.text.split()[0] if medline_date_tag is not None else "N/A"
                else:
                    year = "N/A"  # PubDate is completely missing

                # Initialize species as N/A
                species = "N/A"

                # Check if the word "patient" is detected in title or abstract (assume human)
                if "patient" in title.lower() or "patient" in abstract.lower():
                    species = "Homo sapiens"
                elif "human" in title.lower() or "human" in abstract.lower():
                    species = "Homo sapiens"
                else:
                    # Look for HGNC gene symbols in title or abstract (assume human if found)
                    for gene in hgnc_symbols:
                        if gene in title or gene in abstract:
                            species = "Homo sapiens"
                            break
                    else:
                        # Look for MeSH terms related to species
                        for mesh_heading in article.findall(".//MeshHeadingList/MeshHeading"):
                            descriptor_name = mesh_heading.findtext("DescriptorName")
                            if descriptor_name:
                                # Match official species names using the species_dict
                                for species_term, scientific_name in species_dict.items():
                                    if species_term in descriptor_name.lower():
                                        species = scientific_name
                                        break  # Stop after finding the first match

                # Append the result
                results.append({
                    "PMID": article.findtext(".//MedlineCitation/PMID"),
                    "Title": title,
                    "Abstract": abstract,
                    "Journal": journal,
                    "Year": year,
                    "Species": species
                })

        except Exception as e:
            print(f"Error fetching batch {batch}: {e}")
            # Optionally save the response for debugging
            with open(f"error_batch_{batch[0]}_{batch[-1]}.xml", "w") as f:
                f.write(response.text)

        # Rate limiting to avoid API overload
        time.sleep(1)  # Increase delay for better API compliance

    # Save results
    new_data = pd.DataFrame(results)
    if not new_data.empty:
        # Merge existing and new data, updating missing values
        updated_data = pd.concat([existing_data, new_data])

        # Ensure all PMIDs are strings
        updated_data["PMID"] = updated_data["PMID"].astype(str)

        # Drop rows with missing PMIDs
        updated_data = updated_data.dropna(subset=["PMID"])

        # Ensure rows are ordered and remove duplicates
        updated_data = (
            updated_data.sort_values(by="PMID")  # Ensure rows are ordered
            .drop_duplicates(subset="PMID", keep="last")  # Keep the latest data
        )
        updated_data["Journal"] = updated_data["Journal"].str.split(" (", n=1, expand=False, regex=False).str[0]
        updated_data.to_csv(output_file, index=False)
    else:
        print("No new data fetched.")

    return results

# Fetch PubMed data with your list of PMIDs, output file path, and NCBI API key
fetch_pubmed_data(pmid_list, hgnc_symbols)

In [None]:
from createDataTable import gene_pair, gene_pair000

In [None]:
gene_pair000

In [None]:
gene_pair

In [None]:
import pandas as pd
from bs4 import BeautifulSoup

# Example DataFrame with HTML tags in column names
data = {
    "<span title='Double-click header of Sheep Ligand Ensembl ID to ensure all values are shown'>Sheep Ligand Ensembl ID&nbsp;</span>": [1, 2, 3],
    "<span title='Double-click header of Cow Ligand Ensembl ID to ensure all values are shown'>Cow Ligand Ensembl ID&nbsp;</span>": [4, 5, 6],
    "<span title='Double-click header of Dog Ligand Ensembl ID to ensure all values are shown'>Dog Ligand Ensembl ID&nbsp;</span>": [7, 8, 9],
}

species_gene_pair = pd.DataFrame(data)

In [None]:
species_gene_pair

In [None]:
import liana as li
import omnipath as op
import decoupler as dc
import pandas as pd

import sys
import os
sys.path.append(os.path.abspath("src"))  # Add src directory to path
from createDataTable import gene_pair0

### Parameters
topN = 10000 #Number of top pathways to be included
pathway_output_file="data/pathway_annotations_per_pair.csv"
output_file="data/disease_annotations_per_pair.csv"

### Pathway Annotations

# load PROGENy pathways, we use decoupler as a proxy as it formats the data in a more convenient way
progeny = dc.get_progeny(top=topN)
# import connectomeDB database ligands and receptors
lr_pairs = gene_pair0[["Ligand", "Receptor"]]
lr_pairs.columns = lr_pairs.columns.str.lower()

# generate ligand-receptor geneset
lr_progeny = li.rs.generate_lr_geneset(lr_pairs, progeny, lr_sep="^")
# some of the pairs are missing
len(lr_progeny["interaction"].unique())
# Replace '^' with ' ' in the 2nd column
lr_progeny.iloc[:, 1] = lr_progeny.iloc[:, 1].str.replace(r'\^', ' ', regex=True)
lr_progeny

In [None]:
lr_progeny.iloc[:, 1] = lr_progeny.iloc[:, 1].str.replace(r'\^', ' ', regex=True)
lr_progeny.iloc[:, 1] 

In [None]:
ligand_index =4
ligand_col = [col for col in species_gene_pair.columns if "Ligand&nbsp;" in col][ligand_index]

In [None]:
ligand_col

In [None]:
    # Rename columns to remove species name
    species_gene_pair.columns = [
        col.replace(f"{species} ", "").strip() if "Ligand" in col or "Receptor" in col else col
        for col in species_gene_pair.columns
    ]

In [None]:
    ligand_col = [col for col in species_gene_pair.columns if "Ligand&nbsp;" in col][ligand_index]

In [None]:
ligand_col

In [None]:
species_gene_pair