# Python Notebook

In [1]:
import sys
import os
import pandas as pd
sys.path.append(os.path.abspath("src"))  # Add src directory to path
import fetchGSheet

In [9]:
fetchGSheet.gene_pair.columns

Index(['PMID', 'LR pair', 'Ligand', 'Ligand.HGNC', 'Receptor', 'Receptor.HGNC',
       'Both L&R match HGNC', 'triplet', 'Primary annotation',
       'Primary annotator', 'original source', 'year', 'PMID link',
       'perplexity link', 'PMId for vlookup', 'Secondary annotation',
       'Secondary annotator', 'confirmed by 2 annotators', 'review yes/no',
       'binding location', 'bind in trans?', 'bidirectional signalling?',
       'interaction type', 'issues'],
      dtype='object', name=0)

In [126]:
## Function to prepare datatables (cleaning and hyperlinking, adding tool tips, etc) input for the database qmds
import sys, os
from itables import init_notebook_mode
import pandas as pd
from itables import show
from itables import options
from IPython.display import HTML, display
import numpy as np
import fetchGSheet 
import warnings

# Suppress SettingWithCopyWarning
warnings.simplefilter("ignore", category=UserWarning)


# Other vertebrates
species_list = [
    "ptroglodytes", "ggallus", "sscrofa", "btaurus", 
    "clfamiliaris", "ecaballus", "oarambouillet"
]

# Select only the relevant columns from pop_up_info
pop_up_info = pd.read_table("data/HGNC_gene_info_full.tsv")
pop_up_info = pop_up_info.rename(columns={"hgnc_id": "HGNC ID", 
                                          "name": "Approved name",
                                          "symbol": "Approved symbol",
                                          "rgd_id": "RGD ID",
                                          "mgd_id": "MGI ID", 
                                          "rgd_id": "RGD ID",
                                          "alias_symbol": "Alias symbol",
                                          "prev_symbol": "Previous symbol",
                                          "date_symbol_changed": "Date symbol changed"
                                          
                                         })

pop_up_info_lim = pop_up_info[["HGNC ID", "Approved name", "MGI ID", "RGD ID"]] # rm "Approved symbol" for now
pop_up_info_lim = pop_up_info_lim.drop_duplicates(subset="HGNC ID", keep="first")

# Drop columns where all values are NA in gene_pair
gene_pair = fetchGSheet.gene_pair.dropna(axis=1, how='all')

# for now, rm some columns
gene_pair = gene_pair[['LR pair', 'Ligand', 'Ligand.HGNC', 'Receptor', 'Receptor.HGNC',
                       'perplexity link', 'PMID', 'binding location', 
                       'bind in trans?', 'bidirectional signalling?',
                       'interaction type', 'original source']]

# Mapping for replacements
mapping = dict(zip(fetchGSheet.src_info['original source'], fetchGSheet.src_info['shortname']))
# Replace values in the column based on the mapping
gene_pair['original source'] = gene_pair['original source'].replace(mapping)

## add Ligand/Receptor Location
mapping_loc = dict(zip(fetchGSheet.loc_info['ApprovedSymbol'], fetchGSheet.loc_info['Localization']))
gene_pair['Ligand location'] = gene_pair['Ligand'].replace(mapping_loc)
gene_pair['Receptor location'] = gene_pair['Receptor'].replace(mapping_loc)

# Fetch species IDs from the dataset
hgnc_id = [col for col in gene_pair.columns if "HGNC" in col]
hgnc_id = pd.concat([gene_pair[col] for col in hgnc_id]).unique()

# Rename columns for better clarity
gene_pair = gene_pair.rename(columns={
    "LR pair": "Human LR Pair",
    "Ligand.HGNC": "Ligand HGNC ID",
    "Receptor.HGNC": "Receptor HGNC ID",
    "perplexity link": "Perplexity", # will be replaced with actual link later
    "original source": "Database Source",
    "PMID": "PMID support"
})

# Recreate Perplexity link
# Function to generate Perplexity search link
def create_url_basic(gene_name):
    query = f"What is the primary evidence that {gene_name} bind-each-other-as-a-ligand-and-receptor-pair. Exclude reviews, uniprot, wiki, genecards, PIPS, iuphar as sources."
    encoded_query = query.replace(" ", "%20")
    return f"https://www.perplexity.ai/search?q={encoded_query}"

# Apply function to the DataFrame
gene_pair["Perplexity"] = gene_pair["Perplexity"].apply(create_url_basic)

# Merge gene_pair with pop_up_info_lim for Ligand(L)
gene_pair = gene_pair.merge(pop_up_info_lim, how='left', left_on='Ligand HGNC ID', right_on='HGNC ID')

gene_pair = gene_pair.rename(columns={"Approved name": "Ligand name", 
                                     "MGI ID": "Ligand MGI ID",
                                     "RGD ID": "Ligand RGD ID"},
                            )
gene_pair = gene_pair.drop(columns=["HGNC ID"])
# Add top pathway per pair
LR_pairs = gene_pair["Human LR Pair"].unique()
df= pd.read_csv("data/pathway_annotations_per_pair.csv")
df = df[df["interaction"].isin(LR_pairs)]
# Sort by absolute value of 'weight', descending (larger abs(weight) first)
df_sorted = df.reindex(df['weight'].abs().sort_values(ascending=False).index)
# Keep only the first occurrence for each unique 'interaction'
df_unique = df_sorted.drop_duplicates(subset='interaction', keep='first')
df = df_unique.reset_index(drop=True)
top_pathway_df = df[["interaction", "source"]]
top_pathway_df = top_pathway_df.rename(columns={
                                      "source": "Top Pathway"
})
gene_pair = gene_pair.merge(top_pathway_df, how='left', left_on='Human LR Pair', right_on='interaction')

# Add Disease Category per pair
df= pd.read_csv("data/diseaseType_per_pair.csv")
disease_df = df[df["interaction_x"].isin(LR_pairs)]

gene_pair = gene_pair.merge(disease_df, how='left', left_on='Human LR Pair', right_on='interaction_x')

# Add MGI annotation
MGI_info = pd.read_csv("data/MGI_ID_biomart.csv")
gene_pair = gene_pair.merge(MGI_info, how='left', left_on='Ligand MGI ID', right_on='MGI ID')

# Find rows where Ligand HGNC ID is missing & copy Ligand to MGI name for those rows
mask = gene_pair['Ligand HGNC ID'].astype(str).str.strip() == ''
gene_pair.loc[mask, 'MGI name'] = gene_pair.loc[mask, 'Ligand']
# Map MGI ID using the MGI_info table
gene_pair = gene_pair.merge(MGI_info, left_on='MGI name', right_on='MGI name', how='left', suffixes=('', '_from_info'))
# Fill missing 'MGI ID' only where it was previously missing
gene_pair['Ligand MGI ID'] = gene_pair['Ligand MGI ID'].combine_first(gene_pair['MGI ID_from_info'])
gene_pair = gene_pair.drop(columns=['MGI ID_from_info'])

# Add RGD annotation
RGD_info = pd.read_csv("data/RGD_ID_biomart.csv")
RGD_info['RGD ID'] = "RGD:" + RGD_info['RGD ID'].astype(str)
gene_pair = gene_pair.merge(RGD_info, how='left', left_on='Ligand RGD ID', right_on='RGD ID')

# Add ZFIN id and symbol
ZFIN_info = pd.read_csv("data/ZFIN_ID_human_orthos.txt", sep="\t", skiprows=1)
ZFIN_info = ZFIN_info[['ZFIN ID', 'ZFIN Symbol', 'ZFIN Name', 'HGNC ID']]

ZFIN_info = ZFIN_info.dropna(subset=['HGNC ID'])
ZFIN_info = ZFIN_info.drop_duplicates(subset=['HGNC ID'])
ZFIN_info['HGNC ID'] = ZFIN_info['HGNC ID'].apply(lambda x: f'HGNC:{int(x)}')
gene_pair = gene_pair.merge(ZFIN_info, how='left', left_on='Ligand HGNC ID', right_on='HGNC ID')

gene_pair = gene_pair.drop(columns=["RGD ID", "MGI ID", "HGNC ID", "interaction", "interaction_x"])

gene_pair = gene_pair.rename(columns={
                                     "MGI name": "Mouse Ligand", 
                                     "RGD name": "Rat Ligand",
                                     "ZFIN ID": "Ligand ZFIN ID",
                                     "ZFIN Symbol": "Zebrafish Ligand",
                                     "ZFIN Name": "Zebrafish Ligand name"}
                            )

gene_pair = gene_pair.merge(pop_up_info_lim, how='left', left_on='Receptor HGNC ID', right_on='HGNC ID')

gene_pair = gene_pair.rename(columns={"Approved name": "Receptor name",
                                      "MGI ID": "Receptor MGI ID",
                                      "RGD ID": "Receptor RGD ID"}
                            )


gene_pair = gene_pair.drop(columns=["HGNC ID"])

# Add MGI name
gene_pair = gene_pair.merge(MGI_info, how='left', left_on='Receptor MGI ID', right_on='MGI ID')
# Find rows where Receptor HGNC ID is missing & copy Receptor to MGI name for those rows
mask = gene_pair['Ligand HGNC ID'].astype(str).str.strip() == ''
gene_pair.loc[mask, 'MGI name'] = gene_pair.loc[mask, 'Receptor']
# Map MGI ID using the MGI_info table
gene_pair = gene_pair.merge(MGI_info, left_on='MGI name', right_on='MGI name', how='left', suffixes=('', '_from_info'))
# Fill missing 'MGI ID' only where it was previously missing
gene_pair['Receptor MGI ID'] = gene_pair['Receptor MGI ID'].combine_first(gene_pair['MGI ID_from_info'])
gene_pair = gene_pair.drop(columns=['MGI ID_from_info'])

gene_pair = gene_pair.merge(RGD_info, how='left', left_on='Receptor RGD ID', right_on='RGD ID')
gene_pair = gene_pair.merge(ZFIN_info, how='left', left_on='Receptor HGNC ID', right_on='HGNC ID')
gene_pair = gene_pair.drop(columns=["RGD ID", "MGI ID", "HGNC ID"])

gene_pair = gene_pair.rename(columns={
                                     "MGI name": "Mouse Receptor", 
                                     "RGD name": "Rat Receptor",
                                     "ZFIN ID": "Receptor ZFIN ID",
                                     "ZFIN Symbol": "Zebrafish Receptor",
                                     "ZFIN Name": "Zebrafish Receptor name"})

gene_pair.tail()

  pop_up_info = pd.read_table("data/HGNC_gene_info_full.tsv")


Unnamed: 0,Human LR Pair,Ligand,Ligand HGNC ID,Receptor,Receptor HGNC ID,Perplexity,PMID support,binding location,bind in trans?,bidirectional signalling?,...,Zebrafish Ligand,Zebrafish Ligand name,Receptor name,Receptor MGI ID,Receptor RGD ID,Mouse Receptor,Rat Receptor,Receptor ZFIN ID,Zebrafish Receptor,Zebrafish Receptor name
5002,Pcdhb20 Pcdhb20,Pcdhb20,,Pcdhb20,,https://www.perplexity.ai/search?q=What%20is%2...,25171406,extracellular,cis,unknown,...,,,,MGI:2136758,,Pcdhb20,,,,
5003,Pcdhb21 Pcdhb21,Pcdhb21,,Pcdhb21,,https://www.perplexity.ai/search?q=What%20is%2...,25171406,unknown,unknown,unknown,...,,,,MGI:2136759,,Pcdhb21,,,,
5004,Pcdhb22 Pcdhb22,Pcdhb22,,Pcdhb22,,https://www.perplexity.ai/search?q=What%20is%2...,25171406,unknown,unknown,unknown,...,,,,MGI:2136760,,Pcdhb22,,,,
5005,Pcdhgb8 Pcdhgb8,Pcdhgb8,,Pcdhgb8,,https://www.perplexity.ai/search?q=What%20is%2...,25171406,extracellular,trans,unknown,...,,,,MGI:1935200,,Pcdhgb8,,,,
5006,HMGB1 SCARA5,HMGB1,HGNC:4983,SCARA5,HGNC:28701,https://www.perplexity.ai/search?q=What%20is%2...,27647835,extracellular,trans,No,...,hmgb1a,high mobility group box 1a,scavenger receptor class A member 5,MGI:1918395,RGD:1306539,Scara5,Scara5,ZDB-GENE-041210-258,scara5,"scavenger receptor class A, member 5 (putative)"


In [131]:
LR_pairs = gene_pair["Human LR Pair"].unique()
df= pd.read_csv("data/pathway_annotations_per_pair.csv")
#df = df[df["interaction"].isin(LR_pairs)]
# Sort by absolute value of 'weight', descending (larger abs(weight) first)
df_sorted = df.reindex(df['weight'].abs().sort_values(ascending=False).index)
# Keep only the first occurrence for each unique 'interaction'
df_unique = df_sorted.drop_duplicates(subset='interaction', keep='first')
df = df_unique.reset_index(drop=True)
top_pathway_df = df[["interaction", "source"]]
top_pathway_df = top_pathway_df.rename(columns={
                                      "source": "Top Pathway"
})

In [132]:
top_pathway_df

Unnamed: 0,interaction,Top Pathway
0,CXCL8^CXCR1,TNFa
1,CCL20^CXCR3,TNFa
2,CXCL10^SDC4,NFkB
3,IL1B^IL1R2,NFkB
4,CXCL3^CXCR1,TNFa
...,...,...
2023,BGN^TLR4,Estrogen
2024,GNAI2^C5AR1,Androgen
2025,C1QA^CSPG4,WNT
2026,FN1^NT5E,WNT


In [123]:
gene_pair.columns

Index(['Human LR Pair', 'Ligand', 'Ligand HGNC ID', 'Receptor',
       'Receptor HGNC ID', 'Perplexity', 'PMID support', 'binding location',
       'bind in trans?', 'bidirectional signalling?', 'interaction type',
       'Database Source', 'Ligand location', 'Receptor location',
       'Ligand name', 'Ligand MGI ID', 'Ligand RGD ID', 'interaction',
       'Top Pathway', 'interaction_x', 'Disease Type', 'Cancer-related',
       'MGI ID', 'MGI name'],
      dtype='object')

In [119]:
# Find rows where Ligand HGNC ID is missing & copy Ligand to MGI name for those rows
mask = gene_pair['Ligand HGNC ID'].astype(str).str.strip() == ''
gene_pair.loc[mask, 'Ligand MGI name'] = gene_pair.loc[mask, 'Ligand']
mask
gene_pair

Unnamed: 0,Human LR Pair,Ligand,Ligand HGNC ID,Receptor,Receptor HGNC ID,Perplexity,PMID support,binding location,bind in trans?,bidirectional signalling?,...,Ligand MGI ID,Ligand RGD ID,interaction,Top Pathway,interaction_x,Disease Type,Cancer-related,MGI ID,MGI name,Ligand MGI name
0,A2M HSPA5,A2M,HGNC:7,HSPA5,HGNC:5238,https://www.perplexity.ai/search?q=What%20is%2...,12194978,extracellular,trans,Yes,...,MGI:2449119,RGD:2004,,,,,,MGI:2449119,A2m,
1,A2M HSPA5,A2M,HGNC:7,HSPA5,HGNC:5238,https://www.perplexity.ai/search?q=What%20is%2...,32541810,extracellular,trans,Yes,...,MGI:2449119,RGD:2004,,,,,,MGI:2449119,A2m,
2,ADAM10 EPHA5,ADAM10,HGNC:188,EPHA5,HGNC:3389,https://www.perplexity.ai/search?q=What%20is%2...,16239146,extracellular,trans,Yes,...,MGI:109548,RGD:2032,,,,,,MGI:109548,Adam10,
3,ADAM12 ITGB1,ADAM12,HGNC:190,ITGB1,HGNC:6153,https://www.perplexity.ai/search?q=What%20is%2...,10944520,extracellular,cis,Yes,...,MGI:105378,RGD:1583652,,,,,,MGI:105378,Adam12,
4,ADAM15 ITGA5,ADAM15,HGNC:193,ITGA5,HGNC:6141,https://www.perplexity.ai/search?q=What%20is%2...,9914169,extracellular,trans,Yes,...,MGI:1333882,RGD:620402,,,,,,MGI:1333882,Adam15,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5002,Pcdhb20 Pcdhb20,Pcdhb20,,Pcdhb20,,https://www.perplexity.ai/search?q=What%20is%2...,25171406,extracellular,cis,unknown,...,,,,,,,,,,Pcdhb20
5003,Pcdhb21 Pcdhb21,Pcdhb21,,Pcdhb21,,https://www.perplexity.ai/search?q=What%20is%2...,25171406,unknown,unknown,unknown,...,,,,,,,,,,Pcdhb21
5004,Pcdhb22 Pcdhb22,Pcdhb22,,Pcdhb22,,https://www.perplexity.ai/search?q=What%20is%2...,25171406,unknown,unknown,unknown,...,,,,,,,,,,Pcdhb22
5005,Pcdhgb8 Pcdhgb8,Pcdhgb8,,Pcdhgb8,,https://www.perplexity.ai/search?q=What%20is%2...,25171406,extracellular,trans,unknown,...,,,,,,,,,,Pcdhgb8


In [121]:
# Map MGI ID using the MGI_info table
gene_pair = gene_pair.merge(MGI_info, left_on='Ligand MGI name', right_on='MGI name', how='left', suffixes=('', '_from_info'))
gene_pair

Unnamed: 0,Human LR Pair,Ligand,Ligand HGNC ID,Receptor,Receptor HGNC ID,Perplexity,PMID support,binding location,bind in trans?,bidirectional signalling?,...,interaction_x,Disease Type,Cancer-related,MGI ID,MGI name,Ligand MGI name,MGI ID_from_info,MGI name_from_info,MGI ID_from_info.1,MGI name_from_info.1
0,A2M HSPA5,A2M,HGNC:7,HSPA5,HGNC:5238,https://www.perplexity.ai/search?q=What%20is%2...,12194978,extracellular,trans,Yes,...,,,,MGI:2449119,A2m,,MGI:2449119,A2m,,
1,A2M HSPA5,A2M,HGNC:7,HSPA5,HGNC:5238,https://www.perplexity.ai/search?q=What%20is%2...,32541810,extracellular,trans,Yes,...,,,,MGI:2449119,A2m,,MGI:2449119,A2m,,
2,ADAM10 EPHA5,ADAM10,HGNC:188,EPHA5,HGNC:3389,https://www.perplexity.ai/search?q=What%20is%2...,16239146,extracellular,trans,Yes,...,,,,MGI:109548,Adam10,,MGI:109548,Adam10,,
3,ADAM12 ITGB1,ADAM12,HGNC:190,ITGB1,HGNC:6153,https://www.perplexity.ai/search?q=What%20is%2...,10944520,extracellular,cis,Yes,...,,,,MGI:105378,Adam12,,MGI:105378,Adam12,,
4,ADAM15 ITGA5,ADAM15,HGNC:193,ITGA5,HGNC:6141,https://www.perplexity.ai/search?q=What%20is%2...,9914169,extracellular,trans,Yes,...,,,,MGI:1333882,Adam15,,MGI:1333882,Adam15,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5002,Pcdhb20 Pcdhb20,Pcdhb20,,Pcdhb20,,https://www.perplexity.ai/search?q=What%20is%2...,25171406,extracellular,cis,unknown,...,,,,,,Pcdhb20,,,MGI:2136758,Pcdhb20
5003,Pcdhb21 Pcdhb21,Pcdhb21,,Pcdhb21,,https://www.perplexity.ai/search?q=What%20is%2...,25171406,unknown,unknown,unknown,...,,,,,,Pcdhb21,,,MGI:2136759,Pcdhb21
5004,Pcdhb22 Pcdhb22,Pcdhb22,,Pcdhb22,,https://www.perplexity.ai/search?q=What%20is%2...,25171406,unknown,unknown,unknown,...,,,,,,Pcdhb22,,,MGI:2136760,Pcdhb22
5005,Pcdhgb8 Pcdhgb8,Pcdhgb8,,Pcdhgb8,,https://www.perplexity.ai/search?q=What%20is%2...,25171406,extracellular,trans,unknown,...,,,,,,Pcdhgb8,,,MGI:1935200,Pcdhgb8


In [None]:
# Fill missing 'MGI ID' only where it was previously missing
gene_pair['Ligand MGI ID'] = gene_pair['Ligand MGI ID'].combine_first(gene_pair['MGI ID_from_info'])
gene_pair = gene_pair.drop(columns=['MGI ID_from_info'])

In [100]:
MGI_info[MGI_info["MGI name"] == "Pcdhb17"]

Unnamed: 0,MGI ID,MGI name
11451,MGI:2136754,Pcdhb17


In [91]:
df

Unnamed: 0,source,interaction,weight
0,NFkB,IFNA13^IFNAR1,0.582020
1,TNFa,IFNA13^IFNAR1,1.219279
2,Trail,IFNA13^IFNAR1,0.339437
3,NFkB,IFNA13^IFNAR2,2.453309
4,TNFa,IFNA13^IFNAR2,4.271504
...,...,...,...
5749,PI3K,LRFN4^PTPRS,-0.742439
5750,Trail,LRFN5^PTPRD,0.367912
5751,EGFR,KIR2DL5A^PVR,0.801227
5752,MAPK,KIR2DL5A^PVR,0.933643


In [82]:
gene_pair[["PMID support"]]

Unnamed: 0,PMID support
0,"<a href=""https://comp.med.yokohama-cu.ac.jp/co..."
1,"<a href=""https://comp.med.yokohama-cu.ac.jp/co..."
2,"<a href=""https://comp.med.yokohama-cu.ac.jp/co..."
3,"<a href=""https://comp.med.yokohama-cu.ac.jp/co..."
4,"<a href=""https://comp.med.yokohama-cu.ac.jp/co..."
...,...
3359,"<a href=""https://comp.med.yokohama-cu.ac.jp/co..."
3360,"<a href=""https://comp.med.yokohama-cu.ac.jp/co..."
3361,"<a href=""https://comp.med.yokohama-cu.ac.jp/co..."
3362,"<a href=""https://comp.med.yokohama-cu.ac.jp/co..."


In [25]:
# Add ZFIN id and symbol
ZFIN_info = pd.read_csv("data/ZFIN_ID_human_orthos.txt", sep="\t", skiprows=1)
ZFIN_info = ZFIN_info[['ZFIN ID', 'ZFIN Symbol', 'ZFIN Name', 'HGNC ID']]

ZFIN_info = ZFIN_info.dropna(subset=['HGNC ID'])
ZFIN_info = ZFIN_info.drop_duplicates(subset=['HGNC ID'])
ZFIN_info['HGNC ID'] = ZFIN_info['HGNC ID'].apply(lambda x: f'HGNC:{int(x)}')
ZFIN_info

Unnamed: 0,ZFIN ID,ZFIN Symbol,ZFIN Name,HGNC ID
0,ZDB-GENE-000112-47,ppardb,peroxisome proliferator-activated receptor del...,HGNC:9235
10,ZDB-GENE-000125-12,igfbp2a,insulin-like growth factor binding protein 2a,HGNC:5471
22,ZDB-GENE-000125-4,dlc,deltaC,HGNC:2909
24,ZDB-GENE-000128-11,dbx1b,developing brain homeobox 1b,HGNC:33185
26,ZDB-GENE-000128-13,dbx2,developing brain homeobox 2,HGNC:33186
...,...,...,...,...
43708,ZDB-SNORNAG-120314-5,snord30,"small nucleolar RNA, C/D box 30",HGNC:10157
43709,ZDB-SNORNAG-120314-6,snord29,"small nucleolar RNA, C/D box 29",HGNC:10151
43711,ZDB-SNORNAG-150916-2,snord7,"small nucleolar RNA, C/D box 7",HGNC:32704
43713,ZDB-SNORNAG-200824-1,snord69,"small nucleolar RNA, C/D box 69",HGNC:32730


In [26]:
gene_pair = gene_pair.merge(ZFIN_info, how='left', left_on='Ligand HGNC ID', right_on='HGNC ID')
gene_pair

Unnamed: 0,PMID,Human LR Pair,Ligand,Ligand HGNC ID,Receptor,Receptor HGNC ID,Both L&R match HGNC,triplet,Primary annotation,Primary annotator,...,Disease Type,Cancer-related,MGI ID,MGI name,RGD ID,RGD name,ZFIN ID,ZFIN Symbol,ZFIN Name,HGNC ID_y
0,12194978,A2M HSPA5,A2M,HGNC:7,HSPA5,HGNC:5238,Match,12194978 A2M HSPA5,valid,Alistair Forrest,...,,,MGI:2449119,A2m,RGD:2004,A2m,,,,
1,32541810,A2M HSPA5,A2M,HGNC:7,HSPA5,HGNC:5238,Match,32541810 A2M HSPA5,valid new,Alistair Forrest,...,,,MGI:2449119,A2m,RGD:2004,A2m,,,,
2,16239146,ADAM10 EPHA5,ADAM10,HGNC:188,EPHA5,HGNC:3389,Match,16239146 ADAM10 EPHA5,valid,Jordan Ramilowski,...,,,MGI:109548,Adam10,RGD:2032,Adam10,ZDB-GENE-040917-2,adam10a,ADAM metallopeptidase domain 10a,HGNC:188
3,10944520,ADAM12 ITGB1,ADAM12,HGNC:190,ITGB1,HGNC:6153,Match,10944520 ADAM12 ITGB1,valid,Alistair Forrest,...,,,MGI:105378,Adam12,RGD:1583652,Adam12,ZDB-GENE-070809-1,adam12b,ADAM metallopeptidase domain 12b,HGNC:190
4,9914169,ADAM15 ITGA5,ADAM15,HGNC:193,ITGA5,HGNC:6141,Match,9914169 ADAM15 ITGA5,valid,Jordan Ramilowski,...,,,MGI:1333882,Adam15,RGD:620402,Adam15,ZDB-GENE-070809-4,adam15,ADAM metallopeptidase domain 15,HGNC:193
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5002,25171406,Pcdhb20 Pcdhb20,Pcdhb20,,Pcdhb20,,Not match,25171406 Pcdhb20 Pcdhb20,valid - Fig1d,Alistair Forrest,...,,,,,,,,,,
5003,25171406,Pcdhb21 Pcdhb21,Pcdhb21,,Pcdhb21,,Not match,25171406 Pcdhb21 Pcdhb21,valid - Fig1d,Alistair Forrest,...,,,,,,,,,,
5004,25171406,Pcdhb22 Pcdhb22,Pcdhb22,,Pcdhb22,,Not match,25171406 Pcdhb22 Pcdhb22,valid - Fig1d,Alistair Forrest,...,,,,,,,,,,
5005,25171406,Pcdhgb8 Pcdhgb8,Pcdhgb8,,Pcdhgb8,,Not match,25171406 Pcdhgb8 Pcdhgb8,valid - Fig1d,Alistair Forrest,...,,,,,,,,,,


In [27]:
gene_pair.columns

Index(['PMID', 'Human LR Pair', 'Ligand', 'Ligand HGNC ID', 'Receptor',
       'Receptor HGNC ID', 'Both L&R match HGNC', 'triplet',
       'Primary annotation', 'Primary annotator', 'Database Source', 'year',
       'PMID link', 'Perplexity', 'PMId for vlookup', 'Secondary annotation',
       'Secondary annotator', 'confirmed by 2 annotators', 'review yes/no',
       'binding location', 'bind in trans?', 'bidirectional signalling?',
       'interaction type', 'issues', 'HGNC ID_x', 'Ligand name',
       'Ligand MGI ID', 'Ligand RGD ID', 'interaction', 'Top Pathway',
       'interaction_x', 'Disease Type', 'Cancer-related', 'MGI ID', 'MGI name',
       'RGD ID', 'RGD name', 'ZFIN ID', 'ZFIN Symbol', 'ZFIN Name',
       'HGNC ID_y'],
      dtype='object')

In [26]:
pop_up_info_lim = pop_up_info[["HGNC ID", "Approved symbol", "Approved name", "MGI ID", "RGD ID"]]
pop_up_info_lim = pop_up_info_lim.drop_duplicates(subset="HGNC ID", keep="first")
pop_up_info_lim

Unnamed: 0,HGNC ID,Approved symbol,Approved name,MGI ID,RGD ID
0,HGNC:5,A1BG,alpha-1-B glycoprotein,MGI:2152878,RGD:69417
1,HGNC:37133,A1BG-AS1,A1BG antisense RNA 1,,
2,HGNC:24086,A1CF,APOBEC1 complementation factor,MGI:1917115,RGD:619834
3,HGNC:7,A2M,alpha-2-macroglobulin,MGI:2449119,RGD:2004
4,HGNC:27057,A2M-AS1,A2M antisense RNA 1,,
...,...,...,...,...,...
44079,HGNC:3562,FABP7,fatty acid binding protein 7,MGI:101916,RGD:69312
44080,HGNC:41951,FABP7P1,fatty acid binding protein 7 pseudogene 1,,
44081,HGNC:41952,FABP7P2,fatty acid binding protein 7 pseudogene 2,,
44082,HGNC:3563,FABP9,fatty acid binding protein 9,MGI:1194881,RGD:620285


In [13]:
import sys
import os
import pandas as pd
from itables import init_notebook_mode, show
from IPython.display import display, Javascript
import itables.options as opt
# Change working directory to ConnectomeDB
project_root = os.path.dirname(os.getcwd())
os.chdir(project_root)
sys.path.append(os.path.abspath("src"))

ModuleNotFoundError: No module named 'createFunctionalAnnotTable'

In [16]:
os.getcwd()

'/Users/sakuramaezono/Library/CloudStorage/OneDrive-YokohamaCityUniversity/Personal/05_Python_repositories'

In [1]:
os.chdir('/Users/sakuramaezono/Library/CloudStorage/OneDrive-YokohamaCityUniversity/Personal/05_Python_repositories/ConnectomeDB')

NameError: name 'os' is not defined

In [5]:
import liana as li
import omnipath as op
import decoupler as dc
import pandas as pd

import sys
import os
sys.path.append(os.path.abspath("src"))  # Add src directory to path
from createDataTable import gene_pair0
gene_pair0

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
Downloading data from `https://omnipathdb.org/queries/enzsub?format=json`
Downloading data from `https://omnipathdb.org/queries/interactions?format=json`
Downloading data from `https://omnipathdb.org/queries/complexes?format=json`
Downloading data from `https://omnipathdb.org/queries/annotations?format=json`
Downloading data from `https://omnipathdb.org/queries/intercell?format=json`
Downloading data from `https://omnipathdb.org/about?format=text`


ModuleNotFoundError: No module named 'createDataTable'

In [6]:
import liana as li
import omnipath as op
import decoupler as dc
import pandas as pd

import sys
import os
sys.path.append(os.path.abspath("src"))  # Add src directory to path
from createDataTable import gene_pair0

gene_pair_annot = gene_pair0[["Human LR Pair", "Cancer-related", "Top Pathway"]]
df= pd.read_csv("data/disease_annotations_per_pair.csv") # Liana Diseases
gene_pair_annot = gene_pair_annot.merge(df, how='left', left_on='Human LR Pair', right_on='interaction_x')
df= pd.read_csv("data/pathway_annotations_per_pair.csv") # Liana Pathway
gene_pair_annot = gene_pair_annot.merge(df, how='left', left_on='Human LR Pair', right_on='interaction')
gene_pair_annot = gene_pair_annot.drop(columns=["interaction_x", "interaction", "weight"])

gene_pair_annot = gene_pair_annot.rename(columns={
                                     "disease": "Disease", 
                                     "source": "Related Pathway"}
                            )
# Create the links to the HTML cards
gene_pair_annot["Human LR Pair"] = [
    f'<a href="https://comp.med.yokohama-cu.ac.jp/collab/connectomeDB/cards/{lrPairOrig}.html">{lrPair}</a>'
    for lrPairOrig, lrPair in zip(gene_pair_annot["Human LR Pair"], gene_pair_annot["Human LR Pair"])
]

# reorder
gene_pair_annot = gene_pair_annot[["Human LR Pair", "Disease", "Disease Type", "Cancer-related",  "Related Pathway", "Top Pathway"]]
gene_pair_annot["Disease"] = gene_pair_annot["Disease"].apply(
    lambda x: "unknown" if pd.isna(x) or str(x).strip().lower() in ["nan", "none", ""] else x)
gene_pair_annot["Disease Type"] = gene_pair_annot["Disease Type"].apply(
    lambda x: "unknown" if pd.isna(x) or str(x).strip().lower() in ["nan", "none", ""] else x)
gene_pair_annot["Related Pathway"] = gene_pair_annot["Related Pathway"].apply(
    lambda x: "unknown" if pd.isna(x) or str(x).strip().lower() in ["nan", "none", ""] else x)
gene_pair_annot

ModuleNotFoundError: No module named 'createDataTable'

In [None]:
gene_pair_annot

In [20]:
# Drop columns where all values are NA in gene_pair
gene_pair = gene_pair.dropna(axis=1, how='all')

gene_pair = gene_pair.fillna(" ")
gene_pair = gene_pair[gene_pair['Human LR Pair'] != ' ']

if "PMID link" in gene_pair.columns:
    gene_pair = gene_pair.drop(columns=["PMID link"])

# Add
first_columns=['Human LR Pair', 'Ligand', 'Receptor', 'Interaction Source']

end_columns=['HGNC L R', 'sanity check', 'curator', 'secondary source?']
gene_pair = gene_pair[first_columns + [col for col in gene_pair.columns if col not in first_columns + end_columns] + end_columns]


# number of unique vars

lrPairsCount = len(gene_pair["Human LR Pair"].unique())

ligandCount = len(gene_pair["Ligand"].unique())

receptorCount = len(gene_pair["Receptor"].unique())

# Mouse Orthologue
MouseLigandCount = len(gene_pair["Ligand MGI ID"].unique())

MouseReceptorCount = len(gene_pair["Receptor MGI ID"].unique())

# Rat Orthologue
RatLigandCount = len(gene_pair["Ligand RGD ID"].unique())

RatReceptorCount = len(gene_pair["Receptor RGD ID"].unique())

gene_pair["PMID support"] = [value.replace(" ", "") for value in gene_pair["PMID support"]]

source = np.array(gene_pair["PMID support"].unique())
source = source.astype(str)
source = ",".join(sorted(set(filter(lambda x: x.lower() != 'nan', source))))

# Split the string into individual elements, filter out empty strings, and get unique values
source = sorted(
    set(filter(lambda x: x.strip() and x.strip().lower() != 'nan', source.split(',')))
)
source = [value.replace(" ", "") for value in source]
sourceCount = len(source)

# for creating PMIDs
gene_pair00 = gene_pair[['Human LR Pair', 'PMID support']]

# create URLs for the HGNC IDs

# ligand
gene_pair["Ligand HGNC ID"] = [
    '<a href="https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/{}" target="_blank">{}</a>'.format(ligand, ligand)
    for ligand in gene_pair["Ligand HGNC ID"]
]

# receptor
gene_pair["Receptor HGNC ID"] = [
    '<a href="https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/{}" target="_blank">{}</a>'.format(receptor, receptor)
    for receptor in gene_pair["Receptor HGNC ID"]
]

# Perplexity
gene_pair["Perplexity"] = [
    '<a href="{}" target="_blank"> <img src="https://img.icons8.com/?size=30&id=0NbBuNOxUwps&format=png&color=000000" alt="Perplexity AI" /></a>'.format(url)
    for url in gene_pair["Perplexity"]
]

# Function to generate hyperlinks for the "PMID support" column
# Function to generate hyperlinks for the "PMID support" column
def generate_links_with_doi(df, gene_column, pmid_column):
    def create_link(gene, sources):
        # Replace spaces with "——" in the gene name for the link
        gene_name = gene.replace(" ", "——")
        
        if len(sources) == 1:
            source = sources[0]
            if source.startswith("https://www.biorxiv.org/content/"):
                # If the value starts with "https://doi.org/", use it as the hyperlink
                return f'<a href="{source}" target="_blank">BioRxiv preprint</a>'
            else:
                # If it's a single PMID, hyperlink the PMID text
                return f'<a href="https://comp.med.yokohama-cu.ac.jp/collab/connectomeDB/pubmed/{gene_name}_pmid_details.html">{source}</a>'
        else:
            # If multiple PMIDs, show the count and hyperlink to the page
            return f'<a href="https://comp.med.yokohama-cu.ac.jp/collab/connectomeDB/pubmed/{gene_name}_pmid_details.html" target="_blank">{len(sources)} PMIDs</a>'

    # Process each row to generate the "PMID support" column
    df["PMID support"] = [
        create_link(
            gene=row[gene_column], 
            sources=[s.strip() for s in row[pmid_column].split(',') if s.strip()]
        )
        for _, row in df.iterrows()
    ]
    return df


# Generate the links for the "PMID support" column
gene_pair = generate_links_with_doi(gene_pair, gene_column="Human LR Pair", pmid_column="PMID support")

gene_pair["Ligand MGI ID"] = [
        f'<a href="https://www.informatics.jax.org/marker/{mouseOrth}" target="_blank">{mouseOrth}</a>' 
        if pd.notna(mouseOrth) and mouseOrth.strip() else "" 
        for mouseOrth in gene_pair["Ligand MGI ID"]
    ]

gene_pair["Receptor MGI ID"] = [
        f'<a href="https://www.informatics.jax.org/marker/{mouseOrth}" target="_blank">{mouseOrth}</a>' 
        if pd.notna(mouseOrth) and mouseOrth.strip() else "" 
        for mouseOrth in gene_pair["Receptor MGI ID"]
    ]

gene_pair["Ligand RGD ID"] = [
        f'<a href="https://rgd.mcw.edu/rgdweb/report/gene/main.html?id={ratOrth.replace("RGD:", "")}" target="_blank">{ratOrth}</a>' 
        if pd.notna(ratOrth) and ratOrth.strip() else "" 
        for ratOrth in gene_pair["Ligand RGD ID"]
    ]

gene_pair["Receptor RGD ID"] = [
        f'<a href="https://rgd.mcw.edu/rgdweb/report/gene/main.html?id={ratOrth.replace("RGD:", "")}" target="_blank">{ratOrth}</a>' 
        if pd.notna(ratOrth) and ratOrth.strip() else "" 
        for ratOrth in gene_pair["Receptor RGD ID"]
    ]

In [65]:
gene_pair["Source"].unique()

array(['Ramilowski_2015_Literature_supported', '',
       'Noël et al. 2020 (ICELLNET)',
       'Hou et al. 2020 (connectomeDB2020)',
       'Efremova et al. 2020 (CellphoneDB)',
       'Cabello-Aguilar et al. 2020 (SingleCellSignalR)',
       'Baccin et al. 2020 (RNA-Magnet)',
       'ConnectomeDB2025 (this publication)'], dtype=object)

In [144]:
## Function to prepare datatables (cleaning and hyperlinking, adding tool tips, etc) input for the database qmds
import sys, os
from itables import init_notebook_mode
import pandas as pd
from itables import show
from itables import options
from IPython.display import HTML, display
import numpy as np
import fetchGSheet 
import warnings

# Suppress SettingWithCopyWarning
warnings.simplefilter("ignore", category=UserWarning)


# Other vertebrates
species_list = [
    "ptroglodytes", "ggallus", "sscrofa", "btaurus", 
    "clfamiliaris", "ecaballus", "oarambouillet"
]

# Select only the relevant columns from pop_up_info
pop_up_info = pd.read_table("data/HGNC_gene_info_full.tsv")
pop_up_info = pop_up_info.rename(columns={"hgnc_id": "HGNC ID", 
                                          "name": "Approved name",
                                          "symbol": "Approved symbol",
                                          "rgd_id": "RGD ID",
                                          "mgd_id": "MGI ID", 
                                          "rgd_id": "RGD ID",
                                          "alias_symbol": "Alias symbol",
                                          "prev_symbol": "Previous symbol",
                                          "date_symbol_changed": "Date symbol changed"
                                          
                                         })

pop_up_info_lim = pop_up_info[["HGNC ID", "Approved name", "MGI ID", "RGD ID"]] # rm "Approved symbol" for now
pop_up_info_lim = pop_up_info_lim.drop_duplicates(subset="HGNC ID", keep="first")

# Drop columns where all values are NA in gene_pair
gene_pair = fetchGSheet.gene_pair.dropna(axis=1, how='all')

# for now, rm some columns
gene_pair = gene_pair[['LR pair', 'Ligand', 'Ligand.HGNC', 'Receptor', 'Receptor.HGNC',
                       'perplexity link', 'PMID', 'binding location', 
                       'bind in trans?', 'bidirectional signalling?',
                       'interaction type', 'original source']]

# Mapping for replacements
mapping = dict(zip(fetchGSheet.src_info['original source'], fetchGSheet.src_info['shortname']))
# Replace values in the column based on the mapping
gene_pair['original source'] = gene_pair['original source'].replace(mapping)

## add Ligand/Receptor Location
mapping_loc = dict(zip(fetchGSheet.loc_info['ApprovedSymbol'], fetchGSheet.loc_info['Localization']))
gene_pair['Ligand location'] = gene_pair['Ligand'].replace(mapping_loc)
gene_pair['Receptor location'] = gene_pair['Receptor'].replace(mapping_loc)

# Fetch species IDs from the dataset
hgnc_id = [col for col in gene_pair.columns if "HGNC" in col]
hgnc_id = pd.concat([gene_pair[col] for col in hgnc_id]).unique()

# Rename columns for better clarity
gene_pair = gene_pair.rename(columns={
    "LR pair": "Human LR Pair",
    "Ligand.HGNC": "Ligand HGNC ID",
    "Receptor.HGNC": "Receptor HGNC ID",
    "perplexity link": "Perplexity", # will be replaced with actual link later
    "original source": "Database Source",
    "PMID": "PMID support"
})

# Recreate Perplexity link
# Function to generate Perplexity search link
def create_url_basic(gene_name):
    query = f"What is the primary evidence that {gene_name} bind-each-other-as-a-ligand-and-receptor-pair. Exclude reviews, uniprot, wiki, genecards, PIPS, iuphar as sources."
    encoded_query = query.replace(" ", "%20")
    return f"https://www.perplexity.ai/search?q={encoded_query}"

# Apply function to the DataFrame
gene_pair["Perplexity"] = gene_pair["Perplexity"].apply(create_url_basic)

# Merge gene_pair with pop_up_info_lim for Ligand(L)
gene_pair = gene_pair.merge(pop_up_info_lim, how='left', left_on='Ligand HGNC ID', right_on='HGNC ID')

gene_pair = gene_pair.rename(columns={"Approved name": "Ligand name", 
                                     "MGI ID": "Ligand MGI ID",
                                     "RGD ID": "Ligand RGD ID"},
                            )
gene_pair = gene_pair.drop(columns=["HGNC ID"])
# Add top pathway per pair
LR_pairs = gene_pair["Human LR Pair"].unique()
df= pd.read_csv("data/pathway_annotations_per_pair.csv")
#df = df[df["interaction"].isin(LR_pairs)]
# Sort by absolute value of 'weight', descending (larger abs(weight) first)
df_sorted = df.reindex(df['weight'].abs().sort_values(ascending=False).index)
# Keep only the first occurrence for each unique 'interaction'
df_unique = df_sorted.drop_duplicates(subset='interaction', keep='first')
df = df_unique.reset_index(drop=True)
top_pathway_df = df[["interaction", "source"]]
top_pathway_df = top_pathway_df.rename(columns={
                                      "source": "Top Pathway"
})
top_pathway_df["interaction"] = [value.replace("^", " ") for value in top_pathway_df["interaction"]]
gene_pair = gene_pair.merge(top_pathway_df, how='left', left_on='Human LR Pair', right_on='interaction')

  pop_up_info = pd.read_table("data/HGNC_gene_info_full.tsv")


In [149]:
df= pd.read_csv("data/diseaseType_per_pair.csv")

In [151]:
disease_df = df[df["interaction_x"].isin(LR_pairs)]

gene_pair = gene_pair.merge(disease_df, how='left', left_on='Human LR Pair', right_on='interaction_x')
gene_pair

Unnamed: 0,Human LR Pair,Ligand,Ligand HGNC ID,Receptor,Receptor HGNC ID,Perplexity,PMID support,binding location,bind in trans?,bidirectional signalling?,...,Ligand location,Receptor location,Ligand name,Ligand MGI ID,Ligand RGD ID,interaction,Top Pathway,interaction_x,Disease Type,Cancer-related
0,A2M HSPA5,A2M,HGNC:7,HSPA5,HGNC:5238,https://www.perplexity.ai/search?q=What%20is%2...,12194978,extracellular,trans,Yes,...,multiple,multiple,alpha-2-macroglobulin,MGI:2449119,RGD:2004,,,,,
1,A2M HSPA5,A2M,HGNC:7,HSPA5,HGNC:5238,https://www.perplexity.ai/search?q=What%20is%2...,32541810,extracellular,trans,Yes,...,multiple,multiple,alpha-2-macroglobulin,MGI:2449119,RGD:2004,,,,,
2,ADAM10 EPHA5,ADAM10,HGNC:188,EPHA5,HGNC:3389,https://www.perplexity.ai/search?q=What%20is%2...,16239146,extracellular,trans,Yes,...,multiple,plasma membrane,ADAM metallopeptidase domain 10,MGI:109548,RGD:2032,,,,,
3,ADAM12 ITGB1,ADAM12,HGNC:190,ITGB1,HGNC:6153,https://www.perplexity.ai/search?q=What%20is%2...,10944520,extracellular,cis,Yes,...,multiple,multiple,ADAM metallopeptidase domain 12,MGI:105378,RGD:1583652,ADAM12 ITGB1,TGFb,,,
4,ADAM15 ITGA5,ADAM15,HGNC:193,ITGA5,HGNC:6141,https://www.perplexity.ai/search?q=What%20is%2...,9914169,extracellular,trans,Yes,...,multiple,plasma membrane,ADAM metallopeptidase domain 15,MGI:1333882,RGD:620402,ADAM15 ITGA5,TNFa,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5002,Pcdhb20 Pcdhb20,Pcdhb20,,Pcdhb20,,https://www.perplexity.ai/search?q=What%20is%2...,25171406,extracellular,cis,unknown,...,Pcdhb20,Pcdhb20,,,,,,,,
5003,Pcdhb21 Pcdhb21,Pcdhb21,,Pcdhb21,,https://www.perplexity.ai/search?q=What%20is%2...,25171406,unknown,unknown,unknown,...,Pcdhb21,Pcdhb21,,,,,,,,
5004,Pcdhb22 Pcdhb22,Pcdhb22,,Pcdhb22,,https://www.perplexity.ai/search?q=What%20is%2...,25171406,unknown,unknown,unknown,...,Pcdhb22,Pcdhb22,,,,,,,,
5005,Pcdhgb8 Pcdhgb8,Pcdhgb8,,Pcdhgb8,,https://www.perplexity.ai/search?q=What%20is%2...,25171406,extracellular,trans,unknown,...,Pcdhgb8,Pcdhgb8,,,,,,,,


In [158]:
import pandas as pd
import os, sys
import json
# Add the src directory to the path for importing modules
sys.path.append(os.path.abspath("src"))
from createPMIDpages import gene_pair00

# Load the files
file1 = pd.read_csv("data/pubmed_results.csv") 
file2 = gene_pair00

# Convert the PMIDs column in file2 to lists for easy comparison
file1['PMID'] = file1['PMID'].astype(str)
file2['PMID_List'] = file2['PMID support'].apply(lambda x: x.split(','))

# Create a dictionary for quick PMID to Abstract mapping
pmid_to_abstract = dict(zip(file1['PMID'], file1['Abstract']))
pmid_to_abstract

  pop_up_info = pd.read_table("data/HGNC_gene_info_full.tsv")


{'1': 'No abstract available',
 '10': 'No abstract available',
 '10025398': 'Transforming growth factor beta (TGF beta) family members are secreted in inactive complexes with a latency-associated peptide (LAP), a protein derived from the N-terminal region of the TGF beta gene product. Extracellular activation of these complexes is a critical but incompletely understood step in regulation of TGF beta function in vivo. We show that TGF beta 1 LAP is a ligand for the integrin alpha v beta 6 and that alpha v beta 6-expressing cells induce spatially restricted activation of TGF beta 1. This finding explains why mice lacking this integrin develop exaggerated inflammation and, as we show, are protected from pulmonary fibrosis. These data identify a novel mechanism for locally regulating TGF beta 1 function in vivo by regulating expression of the alpha v beta 6 integrin.',
 '10037686': 'Among members of the tumor necrosis factor receptor (TNFR) superfamily, 4-1BB, CD27, and glucocorticoid-indu

In [160]:
data_for_llm

[{'Human LR Pair': 'A2M——HSPA5',
  'Abstracts': ['The low density lipoprotein receptor-related protein (LRP) is a scavenger receptor that binds to many proteins, some of which trigger signal transduction. Receptor-recognized forms of alpha(2)-Macroglobulin (alpha(2)M*) bind to LRP, but the pattern of signal transduction differs significantly from that observed with other LRP ligands. For example, neither Ni(2+) nor the receptor-associated protein, which blocks binding of all known ligands to LRP, block alpha(2)M*-induced signal transduction. In the current study, we employed alpha(2)-macroglobulin (alpha(2)M)-agarose column chromatography to purify cell surface membrane binding proteins from 1-LN human prostate cancer cells and murine macrophages. The predominant binding protein purified from 1-LN prostate cancer cells was Grp 78 with small amounts of LRP, a fact that is consistent with our previous observations that there is little LRP present on the surface of these cells. The ratio 

In [159]:
# Function to get all abstracts for a list of PMIDs
def get_abstracts(pmids):
    return [pmid_to_abstract[pmid] for pmid in pmids if pmid in pmid_to_abstract]

# Map abstracts to LR pairs
file2['Abstracts'] = file2['PMID_List'].apply(get_abstracts)

# Convert to a list of dictionaries
data_for_llm = file2[['Human LR Pair', 'Abstracts']].to_dict(orient='records')

# Save as JSON
with open("data/data_for_llm.json", "w") as f:
    json.dump(data_for_llm, f, indent=4)

In [15]:
# Function to add species-specific species Enseml ID and symbol for all other species except for mouse, rat, and zebrafish
def appendOtherSpeciesInfo(species, origDF):
    # Load species-specific data
    species_info = pd.read_csv(f"data/{species}_ID_biomart.csv")

    # Keep relevant columns
    species_info = species_info[[f"{species}_homolog_ensembl_gene", 
                                 f"{species}_homolog_associated_gene_name", 
                                 'hgnc_id']]

    # Remove rows where 'hgnc_id' is NaN and drop duplicates
    species_info = species_info.dropna(subset=['hgnc_id'])
    species_info = species_info.drop_duplicates(subset=['hgnc_id'])

    # Merge with ligand data
    origDF = origDF.merge(species_info, how='left', 
                           left_on='Ligand HGNC ID', right_on='hgnc_id')
    
    # Rename columns for ligand info
    origDF = origDF.rename(columns={
        f"{species}_homolog_associated_gene_name": f"{species} Ligand", 
        f"{species}_homolog_ensembl_gene": f"{species} Ligand Ensembl ID"
    })

    # Drop duplicate 'hgnc_id' column
    origDF = origDF.drop(columns=['hgnc_id'])

    # Merge with receptor data
    origDF = origDF.merge(species_info, how='left', 
                           left_on='Receptor HGNC ID', right_on='hgnc_id')

    # Rename columns for receptor info
    origDF = origDF.rename(columns={
        f"{species}_homolog_associated_gene_name": f"{species} Receptor", 
        f"{species}_homolog_ensembl_gene": f"{species} Receptor Ensembl ID"
    })

        # Drop duplicate 'hgnc_id' column
    origDF = origDF.drop(columns=['hgnc_id'])

    # Drop columns where all values are NaN
    origDF = origDF.dropna(axis=1, how='all')

    return origDF

species_list = [
    "ptroglodytes", "ggallus", "sscrofa", "btaurus", 
    "clfamiliaris", "ecaballus", "oarambouillet"
]

# Loop through each species and update gene_pair
for species in species_list:
    gene_pair = appendOtherSpeciesInfo(species, gene_pair)

In [17]:
gene_pair.columns

Index(['Human LR Pair', 'Interaction Source', 'PMID support', 'Ligand',
       'Ligand HGNC ID', 'Ligand location', 'Receptor', 'Receptor HGNC ID',
       'Receptor location', 'HGNC L R', 'secondary source?', 'PMID link',
       'Perplexity', 'sanity check', 'curator', 'Ligand name', 'Ligand MGI ID',
       'Ligand RGD ID', 'Mouse Ligand', 'Rat Ligand', 'Ligand ZFIN ID',
       'Zebrafish Ligand', 'Zebrafish Ligand name', 'Receptor name',
       'Receptor MGI ID', 'Receptor RGD ID', 'Mouse Receptor', 'Rat Receptor',
       'Receptor ZFIN ID', 'Zebrafish Receptor', 'Zebrafish Receptor name',
       'ptroglodytes Ligand Ensembl ID', 'ptroglodytes Ligand',
       'ptroglodytes Receptor Ensembl ID', 'ptroglodytes Receptor',
       'ggallus Ligand Ensembl ID', 'ggallus Ligand',
       'ggallus Receptor Ensembl ID', 'ggallus Receptor',
       'sscrofa Ligand Ensembl ID', 'sscrofa Ligand',
       'sscrofa Receptor Ensembl ID', 'sscrofa Receptor',
       'btaurus Ligand Ensembl ID', 'btaurus L

In [None]:
def add_row(change):
    global gene_pair
    # Add a new row at the top with None values
    new_row = {col: None for col in gene_pair.columns}
    gene_pair = pd.DataFrame([new_row] + gene_pair.to_dict(orient="records"))
    update_table()

# Function to remove the last row of the dataframe
def remove_row(change):
    global gene_pair
    if len(gene_pair) > 0:
        gene_pair = gene_pair[:-1]  # Remove the last row
        update_table()

In [None]:
gene_pair.columns

In [None]:
duplicates = gene_pair00[gene_pair00["Human LR Pair"].duplicated()]
print(duplicates["Human LR Pair"])

In [2]:
## Function to create horizontal bar plots of each gene in Human Taxon --expression log(x+1) transformed with cell types as y-axis

import requests
import pandas as pd
import numpy as np
import sys
import os
import matplotlib.pyplot as plt
import plotly.graph_objects as go

sys.path.append(os.path.abspath("src"))  # Add src directory to path
from createDataTable import gene_pair0

# Input file
input_file="data/connectome_j.tsv" #"data/connectome_j.tsv" # data/ExpressionGenes.txt
# Get all unique genes
ligand_list = gene_pair0["Ligand"].tolist()
receptor_list = gene_pair0["Receptor"].tolist()
unique_genes = list(set(ligand_list + receptor_list))  # Combine and remove duplicates

connectomeDB = pd.read_table(input_file, sep="\t")
# All Taxon for now
#connectomeDB = connectomeDB[connectomeDB["Taxon"]== "Human"]
if "Taxon" in connectomeDB.columns:
    connectomeDB = connectomeDB.drop(columns=["Localization", "Taxon"] + [col for col in connectomeDB.columns if col.startswith("F5_")])

In [3]:
column_sums = connectomeDB.iloc[:, 1:].sum()

In [5]:
connectomeDB.iloc[:, 1:].sum()

Adipocyte Breast           819735.859
Adipocyte Omental          883713.058
Adipocyte Perirenal        944323.351
Adipocyte Subcutaneous     805931.943
Alveolar Epithelial       1212346.313
                             ...     
Synoviocyte                769408.174
Tenocyte                   818224.667
Trabecular Meshwork       1048484.198
Tracheal Epithelial       1224157.145
Urothelial                 782699.893
Length: 144, dtype: float64

In [7]:
intersection = pd.Series(list(set(connectomeDB['ApprovedSymbol']).intersection(unique_genes)))
intersection

connectomeDB = connectomeDB[connectomeDB["ApprovedSymbol"].isin(intersection)]
connectomeDB
    

Unnamed: 0,ApprovedSymbol,Adipocyte Breast,Adipocyte Omental,Adipocyte Perirenal,Adipocyte Subcutaneous,Alveolar Epithelial,Amniotic Epithelial,Amniotic Membrane,Annulus Pulposus,Astrocyte Cerebellum,...,Smooth Muscle Subclavian Artery,Smooth Muscle Tracheal,Smooth Muscle Umbilical Artery,Smooth Muscle Umbilical Vein,Smooth Muscle Uterine,Synoviocyte,Tenocyte,Trabecular Meshwork,Tracheal Epithelial,Urothelial
2,A2M,90.272,121.423,50.596,63.397,0.000,0.000,0.819,0.000,1.364,...,0.146,37.598,2.324,0.143,3.193,2.208,5.266,0.000,0.223,0.000
16,AANAT,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
28,ABCA1,35.775,39.428,15.702,53.803,4.630,4.476,15.274,13.863,7.470,...,48.214,8.643,24.584,22.316,26.252,12.729,16.151,7.366,16.106,26.339
133,ACE,0.973,1.616,0.000,2.308,0.000,0.132,0.819,0.765,0.878,...,0.513,4.649,0.606,0.336,1.419,28.083,17.078,1.634,0.447,0.115
140,ACKR2,22.320,18.048,20.064,2.658,12.704,43.018,89.229,0.537,0.987,...,0.368,0.868,0.179,0.313,0.000,1.258,0.875,0.000,0.511,7.038
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16424,XCR1,0.000,0.000,0.000,0.000,0.000,0.000,0.656,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
16470,YBX1,58.803,54.388,58.447,73.160,76.354,80.877,43.623,85.979,113.238,...,67.607,104.832,91.272,102.638,51.795,57.435,74.201,106.607,67.368,69.804
16676,ZG16B,0.091,0.828,6.106,0.000,0.698,0.218,0.000,0.579,0.146,...,5.168,0.000,1.987,0.000,1.419,1.233,0.442,0.190,1.655,10.423
17213,ZNRF3,2.787,4.775,0.872,0.944,8.512,4.954,6.021,2.709,8.275,...,1.799,3.743,2.169,2.564,1.774,4.336,3.029,8.329,5.204,5.714


In [10]:
# log(x+1) transform
connectomeDB.iloc[:, 1:] = np.log1p(connectomeDB.iloc[:, 1:])
# Reshape 
connectomeDB_long = connectomeDB.melt(id_vars=["ApprovedSymbol"], 
                                      var_name="cellTypes", value_name="expr_val")
cellCat = pd.read_csv("data/cell_categories.csv")
connectomeDB_long = connectomeDB_long.merge(cellCat, how='left', left_on='cellTypes', right_on='cellType')
connectomeDB_long = connectomeDB_long.drop(columns=["cellType"])

intersection = pd.Series(list(set(connectomeDB_long['cellTypes']).intersection(set(cellCat['cellType']))))
intersection

diff_df = pd.Series(list(set(connectomeDB_long['cellTypes']).difference(set(cellCat['cellType']))))
diff_df

def plot_gene_expression(df):
    # Define the colors for each cell category
    colors = {
        "missing": "#B0B0B0",  # Neutral gray
        "other": "#D4A76A",  # Warm gold
        "mesenchymal": "#377EB8",  # Vibrant blue
        "epithelial": "#E41A1C",  # Bold red
        "hematopoietic": "#4DAF4A",  # Fresh green
        "endothelial": "#984EA3",  # Deep purple
        "nervous system": "#FF7F00",  # Bright orange
    }

    # Define sorting order for cell categories
    category_order = {cat: i for i, cat in enumerate(colors.keys())}

    for gene, sub_df in df.groupby("ApprovedSymbol"):
        # Sort by category first, then by expression value (highest first)
        sub_df = sub_df.copy()
        sub_df["category_order"] = sub_df["cellCategory"].map(category_order).fillna(len(category_order))
        sub_df = sub_df.sort_values(["category_order", "expr_val"], ascending=[True, False])

        num_bars = len(sub_df)

        # Plotly Figure setup
        fig = go.Figure()

        # Loop through each category and create a trace for it
        for category, color in colors.items():
            # Filter data for the current category
            category_data = sub_df[sub_df["cellCategory"] == category]

            # Add the trace for the current category
            fig.add_trace(go.Bar(
                y=category_data["cellTypes"],  # Categories for y-axis
                x=category_data["expr_val"],  # Expression values for x-axis
                orientation='h',  # Horizontal bars
                marker=dict(color=color),
                hovertemplate=
                    '<b>%{y}</b><br>' +  # Cell type (y-axis value)
                    'Expression Value: %{x}',  # Expression value (x-axis value)
                    #'Category: %{text}',  # Custom text (cell category)
                #text=category_data["cellCategory"],  # Pass the cell category as custom text
                name=category,  # Use the category name for the legend
                showlegend=True,  # Ensure the legend is shown for this trace
            ))

        # Update layout settings
        fig.update_layout(
            title="",
            xaxis_title="log(x+1) Expression value",
            yaxis_title="Cell Types",
            yaxis=dict(
                tickmode='array',
                tickvals=np.arange(num_bars),
                ticktext=sub_df["cellTypes"],
                tickangle=0,  # Avoid overlapping labels by setting the angle to 0
                tickfont=dict(size=6),  # Set font size for the labels
            ),
            showlegend=True,
            legend_title="Cell Category",
            legend=dict(
                orientation="v",  # Vertical legend
                yanchor="top",
                y=1,
                xanchor="",
                x=1.05,  # Position the legend outside of the plot area
                font=dict(size=10)
            ),
            margin=dict(t=50, b=50, l=150, r=50),
            height=min(1000, max(500, num_bars * 30)),  # Adjust plot height
            plot_bgcolor='rgba(0,0,0,0)',  # Transparent background
            paper_bgcolor='rgba(0,0,0,0)',  # Transparent paper background
        )

        # Save to HTML file
        fig.write_html(f"data/gene_expr_plots/{gene}.html")


plot_gene_expression(connectomeDB_long)

In [1]:
connectomeDB_long

NameError: name 'connectomeDB_long' is not defined

## Testing Liana+

In [3]:
import liana as li
import omnipath as op
import decoupler as dc
import pandas as pd

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
Downloading data from `https://omnipathdb.org/queries/enzsub?format=json`
Downloading data from `https://omnipathdb.org/queries/interactions?format=json`
Downloading data from `https://omnipathdb.org/queries/complexes?format=json`
Downloading data from `https://omnipathdb.org/queries/annotations?format=json`
Downloading data from `https://omnipathdb.org/queries/intercell?format=json`
Downloading data from `https://omnipathdb.org/about?format=text`


In [None]:
import sys
import os
sys.path.append(os.path.abspath("src"))  # Add src directory to path
from createDataTable import gene_pair0

### Pathway Annotations

In [None]:
# load PROGENy pathways, we use decoupler as a proxy as it formats the data in a more convenient way
progeny = dc.get_progeny(top=10000)
progeny

In [None]:
lr_pairs = gene_pair0[["Ligand", "Receptor"]]
lr_pairs.columns = lr_pairs.columns.str.lower()

In [None]:
lr_pairs

In [None]:
# generate ligand-receptor geneset
lr_progeny = li.rs.generate_lr_geneset(lr_pairs, progeny, lr_sep="^")

In [None]:
lr_progeny

In [None]:
# some of the pairs are missing
len(lr_progeny["interaction"].unique())

In [None]:
output_file="data/pathway_annotations_per_pair.csv"
lr_progeny.to_csv(output_file, index=False)

In [None]:
whichDB= 'DisGeNet'
# A database of expression profiles related to human diseases, including cancer
diseases = op.requests.Annotations.get(
    resources = [whichDB]
    )

In [None]:
diseases

In [None]:
diseases.to_csv("data/" + whichDB + ".csv")

### Disease Annotations

In [None]:
# DisGeNet
diseases = op.requests.Annotations.get(
    resources = ['DisGeNet']
    )

In [None]:
diseases = diseases[['genesymbol', 'label', 'value']]
diseases = diseases.pivot_table(index='genesymbol',
                                columns='label', values='value',
                                aggfunc=lambda x: '; '.join(x)).reset_index()
diseases = diseases[['genesymbol', 'disease']]
diseases['disease'] = diseases['disease'].str.split('; ')
diseases = diseases.explode('disease')
lr_diseases = li.rs.generate_lr_geneset(lr_pairs, diseases, source='disease', target='genesymbol', weight=None, lr_sep="^")
lr_diseases.sort_values("interaction")

In [75]:
# some of the pairs are missing
len(lr_diseases["interaction"].unique())

NameError: name 'lr_diseases' is not defined

In [None]:
output_file="data/disease_annotations_per_pair.csv"
lr_diseases.to_csv(output_file, index=False)

In [None]:
op.requests.Annotations.resources()

### Get FASTA sequences for each gene

In [18]:
import requests
import pandas as pd
import sys
import os

sys.path.append(os.path.abspath("src"))  # Add src directory to path/
from createDataTable import gene_pair0


# Get all unique genes
ligand_list = gene_pair0["Ligand"].tolist()
receptor_list = gene_pair0["Receptor"].tolist()
unique_genes = list(set(ligand_list + receptor_list)) 
LR_pairs=gene_pair0['Human LR Pair'].unique()

In [19]:
df= pd.read_csv("data/pathway_annotations_per_pair.csv")
df = df[df["interaction"].isin(LR_pairs)]
# Step 1: Sort by absolute value of 'weight', descending (larger abs(weight) first)
df_sorted = df.reindex(df['weight'].abs().sort_values(ascending=False).index)
# Keep only the first occurrence for each unique 'interaction'
df_unique = df_sorted.drop_duplicates(subset='interaction', keep='first')
df = df_unique.reset_index(drop=True)
top_pathway_df = df[["interaction", "source"]]
top_pathway_df = top_pathway_df.rename(columns={
                                      "source": "Top Pathway"
}
                            )
top_pathway_df

Unnamed: 0,interaction,Top Pathway
0,CXCL8 CXCR1,TNFa
1,CCL20 CXCR3,TNFa
2,CXCL10 SDC4,NFkB
3,IL1B IL1R2,NFkB
4,CXCL3 CXCR1,TNFa
...,...,...
2023,BGN TLR4,Estrogen
2024,GNAI2 C5AR1,Androgen
2025,C1QA CSPG4,WNT
2026,FN1 NT5E,WNT


In [20]:
df= pd.read_csv("data/disease_annotations_per_pair.csv")

In [21]:
df = df[["interaction_x", "Disease Type"]].drop_duplicates()
df['Disease Type'] = df['Disease Type'].astype(str)
# Group by 'col1' and combine 'col2' values with ', '
df_combined = df.groupby('interaction_x')['Disease Type'].apply(', '.join).reset_index()
df_combined

Unnamed: 0,interaction_x,Disease Type
0,ACE BDKRB2,"Cardiovascular Diseases, Neurological Disorder..."
1,ADAM12 ITGA9,Cancers & Neoplasms
2,ADAM17 ITGB1,Liver Diseases
3,ADCYAP1 ADCYAP1R1,Psychiatric Disorders
4,ADCYAP1 VIPR2,Psychiatric Disorders
...,...,...
588,VWF ITGA2B,Liver Diseases
589,WNT3A APCDD1,Endocrine & Metabolic Disorders
590,WNT5A FZD2,Genetic & Congenital Disorders
591,WNT5A ROR2,Genetic & Congenital Disorders


In [22]:
# Function to update the "Cancer-related" column and modify "col2" if needed

# Create "Cancer-related" column based on whether "Cancers & Neoplasms" is in col2
df_combined['Cancer-related'] = df_combined['Disease Type'].apply(lambda x: 'Yes' if 'Cancers & Neoplasms' in x else 'No')


In [23]:
df_combined

Unnamed: 0,interaction_x,Disease Type,Cancer-related
0,ACE BDKRB2,"Cardiovascular Diseases, Neurological Disorder...",No
1,ADAM12 ITGA9,Cancers & Neoplasms,Yes
2,ADAM17 ITGB1,Liver Diseases,No
3,ADCYAP1 ADCYAP1R1,Psychiatric Disorders,No
4,ADCYAP1 VIPR2,Psychiatric Disorders,No
...,...,...,...
588,VWF ITGA2B,Liver Diseases,No
589,WNT3A APCDD1,Endocrine & Metabolic Disorders,No
590,WNT5A FZD2,Genetic & Congenital Disorders,No
591,WNT5A ROR2,Genetic & Congenital Disorders,No


In [25]:
df_combined.to_csv("data/diseaseType_per_pair.csv", index =False)

In [104]:
pathway_list = df['source'].unique()
len(disease_list)
pair_list = df['interaction'].unique()

In [105]:
len(pair_list)

2028

In [106]:
uniquepairs = list(set(pair_list) & set(LR_pairs)) 
len(uniquepairs)

2028

In [108]:
# Get pair count per Disease
pairPerDisease = disease_df.groupby('source')['interaction'].nunique().reset_index()
pairPerDisease.to_csv("data/pairPerPathwayCount.csv")

In [109]:
# Get Disease count per LRPair
pairPerDisease = disease_df.groupby('interaction')['source'].nunique().reset_index()
pairPerDisease.to_csv("data/PathwayPerLRPair.csv")

In [78]:
len(df["Gene Symbol"].unique())

NameError: name 'df' is not defined

In [None]:
df= pd.read_table("data/human_uniprot_isoforms.tsv", sep="\t")

In [None]:
df.columns

In [None]:
df = df[['UniProt ID', 'Gene Symbol', 'Isoform Type', 'FASTA Sequence']]

In [None]:
df

In [None]:
lim_df = gene_pair0[["Human LR Pair", "Ligand", "Receptor"]]

In [None]:
lim_df

In [None]:
lim_df = lim_df.merge(df, how='left', left_on='Ligand', right_on='Gene Symbol')
lim_df = lim_df.drop(columns=["Gene Symbol"])
lim_df = lim_df.rename(columns={"FASTA Sequence": "Ligand Sequence",
                                "UniProt ID": "Ligand Isoform Uniprot ID",
                                "Isoform Type": "Ligand Isoform Type"})
lim_df

In [None]:
lim_df = lim_df.merge(df, how='left', left_on='Receptor', right_on='Gene Symbol')
lim_df = lim_df.drop(columns=["Gene Symbol"])
lim_df = lim_df.rename(columns={"FASTA Sequence": "Receptor Sequence",
                                "UniProt ID": "Receptor Isoform Uniprot ID",
                                "Isoform Type": "Receptor Isoform Type"})
lim_df

In [None]:
lim_df.to_csv("data/LRpair_uniprot_sequences.tsv", sep="\t", index=False)

In [None]:
import gzip
import re
import pandas as pd

# Step 1: Extract Gene Symbol Mapping from GTF
gtf_file = "data/gencode.v47.annotation.gtf.gz"
gene_map = {}

# Read GTF file and extract gene_id -> gene_name mapping
with gzip.open(gtf_file, "rt") as f:
    for line in f:
        if line.startswith("#"):  # Skip comments
            continue
        
        fields = line.strip().split("\t")
        if fields[2] == "gene":  # Only extract gene entries
            info = {key.strip(): value.strip('"') for key, value in re.findall(r'(\S+) "([^"]+)"', fields[8])}
            if "gene_id" in info and "gene_name" in info:
                gene_map[info["gene_id"]] = info["gene_name"]

print(f"✅ Extracted {len(gene_map)} gene mappings from GTF.")

In [None]:
# Step 2: Parse GENCODE Protein FASTA and Add Gene Symbols
fasta_file = "data/gencode.v47.pc_translations.fa.gz"

# Store extracted data
records = []

# Open the GENCODE FASTA file and parse sequences
with gzip.open(fasta_file, "rt") as f:
    header = None
    sequence = []
    
    for line in f:
        line = line.strip()
        
        if line.startswith(">"):
            # Store previous sequence if exists
            if header and sequence:
                # Extract the Gene Symbol using the Gene ID
                gene_symbol = gene_map.get(header["gene_id"], "Unknown")
                isoform_type = "Canonical" if "-1" in header["protein_id"] else "Alternative Isoform"
                
                # Append the parsed data to records
                records.append([header["protein_id"], header["transcript_id"], header["gene_id"], gene_symbol, isoform_type, "".join(sequence)])
            
            # Split header by '|' and extract necessary fields
            fields = line[1:].split("|")  # Skip the '>' symbol and split by '|'
            if len(fields) >= 6:
                header = {
                    "protein_id": fields[0], 
                    "transcript_id": fields[1], 
                    "gene_id": fields[2]  
                }
                sequence = []
            else:
                header = None
        
        elif header:
            sequence.append(line)

    # Add the last record if needed
    if header and sequence:
        gene_symbol = gene_map.get(header["gene_id"], "Unknown")
        isoform_type = "Canonical" if "-1" in header["protein_id"] else "Alternative Isoform"
        records.append([header["protein_id"], header["transcript_id"], header["gene_id"], gene_symbol, isoform_type, "".join(sequence)])

# Step 3: Convert to pandas DataFrame and Save to TSV
df = pd.DataFrame(records, columns=["Ensembl Protein ID", "Ensembl Transcript ID", "Ensembl Gene ID", "Gene Symbol", "Isoform Type", "FASTA Sequence"])

# Save to TSV
df.to_csv("data/gencode_protein_isoforms_with_symbols.tsv", sep="\t", index=False)

# Print completion message
print(f"✅ Extracted {len(df)} protein sequences with Gene Symbols and saved to 'gencode_protein_isoforms_with_symbols.tsv'.")

In [None]:
lim_df = gene_pair0[["Human LR Pair", "Ligand", "Receptor"]]
lim_df = lim_df.merge(df, how='left', left_on='Ligand', right_on='Gene Symbol')

In [None]:
lim_df

In [None]:
lim_df = lim_df.drop(columns=["Gene Symbol"])
lim_df = lim_df.rename(columns={"FASTA Sequence": "Ligand Sequence",
                                "Ensembl Protein ID": "Ligand Ensembl Protein ID",
                                "Ensembl Transcript ID": "Ligand Ensembl Transcript ID",
                                "Ensembl Gene ID": "Ligand Ensembl Gene ID",
                                "Isoform Type": "Ligand Isoform Type"})

In [None]:
lim_df

In [None]:
lim_df = lim_df.merge(df, how='left', left_on='Receptor', right_on='Gene Symbol')
lim_df = lim_df.drop(columns=["Gene Symbol"])
lim_df = lim_df.rename(columns={"FASTA Sequence": "Receptor Sequence",
                                "Ensembl Protein ID": "Receptor Ensembl Protein ID",
                                "Ensembl Transcript ID": "Receptor Ensembl Transcript ID",
                                "Ensembl Gene ID": "Receptor Ensembl Gene ID",
                                "Isoform Type": "Receptor Isoform Type"})

In [None]:
lim_df

In [None]:
lim_df.to_csv("data/LRpair_gencode_sequences.tsv", sep="\t", index=False)

####################################################################

In [54]:
## Function to scrape data from Pubmed for Title, Abstract, Journal, and Year
### IMPORTANT: TURN OFF VPN and make sure you have the data directory (from Sakura)

import sys
import requests
import pandas as pd
import time
import os
import xml.etree.ElementTree as ET

sys.path.append(os.path.abspath("src"))  
import fetchGSheet

# Read the API key from a file
with open("data/ncbi_api_key.txt", "r") as file:
    ncbi_api_key = file.read().strip()

# File to save the results
output_file = "data/pubmed_results.csv"

# Example of fetching HGNC gene symbols (you should have the `fetchGSheet.pop_up_info` dataframe ready)
def extract_hgnc_symbols(fetchGSheet):
    # Concatenate Approved, Alias, and Previous symbols, then extract unique symbols
    hgnc_symbols = pd.concat([
        fetchGSheet['Approved symbol'],
        fetchGSheet['Alias symbol'],
        fetchGSheet['Previous symbol']
    ], axis=0).dropna().str.upper().unique()  # Remove NaNs and make uppercase for matching
     # Remove any empty strings from the list
    hgnc_symbols = [symbol for symbol in hgnc_symbols if symbol != ""]
    return set(hgnc_symbols)  # Return as a set for fast lookup
    
hgnc_symbols = extract_hgnc_symbols(fetchGSheet.pop_up_info)

In [56]:
len(hgnc_symbols)

100941

In [None]:
# Official species names and their corresponding terms (scientific names)
# Load your list of PMIDs
pmid_list = source
species_dict = {
    "human": "Homo sapiens",
    "mouse": "Mus musculus",
    "rat": "Rattus norvegicus",
    "rabbit": "Oryctolagus cuniculus",
    "monkey": "Macaca spp.",
    "dog": "Canis lupus familiaris",
    "pig": "Sus scrofa",
    "zebra fish": "Danio rerio",
    "chicken": "Gallus gallus",
    "horse": "Equus ferus caballus",
    "cat": "Felis catus",
    "sheep": "Ovis aries",
    "cow": "Bos taurus",
    "fruit fly": "Drosophila melanogaster",
    "c. elegans": "Caenorhabditis elegans",
}

def fetch_pubmed_data(pmid_list, hgnc_symbols):
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    results = []

    # Load existing data if output file exists
    if os.path.exists(output_file):
        existing_data = pd.read_csv(output_file)
    else:
        existing_data = pd.DataFrame(columns=["PMID", "Title", "Abstract", "Journal", "Year", "Species"])

    # Split PMIDs into batches
    batch_size = 50
    pmid_batches = [pmid_list[i:i + batch_size] for i in range(0, len(pmid_list), batch_size)]

    # Iterate over the batches
    for batch in pmid_batches:
        params = {
            "db": "pubmed",
            "id": ",".join(batch),  # Join PMIDs as comma-separated
            "retmode": "xml",
            "api_key": ncbi_api_key
        }

        try:
            response = requests.get(base_url, params=params)
            response.raise_for_status()

            # Parse the XML response
            root = ET.fromstring(response.text)
            for article in root.findall(".//PubmedArticle"):
                # Extract Title and Abstract
                title = article.findtext(".//ArticleTitle", default="N/A")
                abstract = article.findtext(".//AbstractText", default="No abstract available")

                # Extract Journal Title
                journal_tag = article.find(".//Journal/Title")
                journal = journal_tag.text.strip() if journal_tag is not None and journal_tag.text else "N/A"

                # Extract Publication Year
                pub_date = article.find(".//PubDate")
                if pub_date is not None:
                    year_tag = pub_date.find("Year")
                    year = year_tag.text if year_tag is not None else "N/A"

                    # Fallback to MedlineDate if Year is missing
                    if year == "N/A":
                        medline_date_tag = pub_date.find("MedlineDate")
                        year = medline_date_tag.text.split()[0] if medline_date_tag is not None else "N/A"
                else:
                    year = "N/A"  # PubDate is completely missing

                # Initialize species as N/A
                species = "N/A"

                # Check if the word "patient" is detected in title or abstract (assume human)
                if "patient" in title.lower() or "patient" in abstract.lower():
                    species = "Homo sapiens"
                elif "human" in title.lower() or "human" in abstract.lower():
                    species = "Homo sapiens"
                else:
                    # Look for HGNC gene symbols in title or abstract (assume human if found)
                    for gene in hgnc_symbols:
                        if gene in title or gene in abstract:
                            species = "Homo sapiens"
                            break
                    else:
                        # Look for MeSH terms related to species
                        for mesh_heading in article.findall(".//MeshHeadingList/MeshHeading"):
                            descriptor_name = mesh_heading.findtext("DescriptorName")
                            if descriptor_name:
                                # Match official species names using the species_dict
                                for species_term, scientific_name in species_dict.items():
                                    if species_term in descriptor_name.lower():
                                        species = scientific_name
                                        break  # Stop after finding the first match

                # Append the result
                results.append({
                    "PMID": article.findtext(".//MedlineCitation/PMID"),
                    "Title": title,
                    "Abstract": abstract,
                    "Journal": journal,
                    "Year": year,
                    "Species": species
                })

        except Exception as e:
            print(f"Error fetching batch {batch}: {e}")
            # Optionally save the response for debugging
            with open(f"error_batch_{batch[0]}_{batch[-1]}.xml", "w") as f:
                f.write(response.text)

        # Rate limiting to avoid API overload
        time.sleep(1)  # Increase delay for better API compliance

    # Save results
    new_data = pd.DataFrame(results)
    if not new_data.empty:
        # Merge existing and new data, updating missing values
        updated_data = pd.concat([existing_data, new_data])

        # Ensure all PMIDs are strings
        updated_data["PMID"] = updated_data["PMID"].astype(str)

        # Drop rows with missing PMIDs
        updated_data = updated_data.dropna(subset=["PMID"])

        # Ensure rows are ordered and remove duplicates
        updated_data = (
            updated_data.sort_values(by="PMID")  # Ensure rows are ordered
            .drop_duplicates(subset="PMID", keep="last")  # Keep the latest data
        )
        updated_data["Journal"] = updated_data["Journal"].str.split(" (", n=1, expand=False, regex=False).str[0]
        updated_data.to_csv(output_file, index=False)
    else:
        print("No new data fetched.")

    return results

# Fetch PubMed data with your list of PMIDs, output file path, and NCBI API key
fetch_pubmed_data(pmid_list, hgnc_symbols)

In [24]:
from createDataTable import gene_pair, gene_pair000

In [25]:
gene_pair000

Unnamed: 0,"<span title="" Ligand-Receptor Interacting Pair, as described in Liu et al. (PMID: XXXXXX)"">Human LR Pair</span>","<span title="" Official Gene Symbol; Hover on symbols below to show gene names"""">Human Ligand&nbsp;&nbsp;&nbsp;</span</span>","<span title="" Official Gene Symbol; Hover on symbols below to show gene names"""">Human Receptor&nbsp;&nbsp;&nbsp;</span</span>","<span title=""Double-click header of Interaction Source to ensure all values are shown"">Interaction Source&nbsp;</span>","<span title=""Click the logo below to run Perplexity on the Human LR pair"">Perplexity&nbsp;</span>","<span title="" PubMed IDs (PMID) with Literature Evidence for LR Interaction. Click on the link for more details"">PMID support</span>","<span title=""HUGO Gene Nomenclature Committee (HGNC) ID. Click on the link for more details"">Ligand HGNC ID&nbsp;&nbsp;</span>","<span title=""HUGO Gene Nomenclature Committee (HGNC) ID. Click on the link for more details"">Receptor HGNC ID&nbsp;&nbsp;</span>","<span title=""Location based on the predicted subcellular localization of the human proteome, as described in Ramilowski et al. (PMID: 26198319)"">Ligand location</span>","<span title=""Location based on the predicted subcellular localization of the human proteome, as described in Ramilowski et al. (PMID: 26198319)"">Receptor location</span>",...,"<span title=""Double-click header of Dog Receptor Ensembl ID to ensure all values are shown"">Dog Receptor Ensembl ID&nbsp;</span>","<span title=""Double-click header of Dog Receptor to ensure all values are shown"">Dog Receptor&nbsp;</span>","<span title=""Double-click header of Horse Ligand Ensembl ID to ensure all values are shown"">Horse Ligand Ensembl ID&nbsp;</span>","<span title=""Double-click header of Horse Ligand to ensure all values are shown"">Horse Ligand&nbsp;</span>","<span title=""Double-click header of Horse Receptor Ensembl ID to ensure all values are shown"">Horse Receptor Ensembl ID&nbsp;</span>","<span title=""Double-click header of Horse Receptor to ensure all values are shown"">Horse Receptor&nbsp;</span>","<span title=""Double-click header of Sheep Ligand Ensembl ID to ensure all values are shown"">Sheep Ligand Ensembl ID&nbsp;</span>","<span title=""Double-click header of Sheep Ligand to ensure all values are shown"">Sheep Ligand&nbsp;</span>","<span title=""Double-click header of Sheep Receptor Ensembl ID to ensure all values are shown"">Sheep Receptor Ensembl ID&nbsp;</span>","<span title=""Double-click header of Sheep Receptor to ensure all values are shown"">Sheep Receptor&nbsp;</span>"
0,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...","<span title=""C-C motif chemokine ligand 3 like...","<span title=""atypical chemokine receptor 2"">AC...",Ramilowski_2015_Literature_supported,"<a href=""https://www.perplexity.ai/search?q=Do...","<a href=""https://comp.med.yokohama-cu.ac.jp/co...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",secreted,plasma membrane,...,,,ENSECAG00000024640,,ENSECAG00000003800,ACKR2,ENSOARG00020021765,,,
1,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...","<span title=""defensin beta 103B"">DEFB103B</span>","<span title=""C-C motif chemokine receptor 2"">C...",Ramilowski_2015_Literature_supported,"<a href=""https://www.perplexity.ai/search?q=Do...","<a href=""https://comp.med.yokohama-cu.ac.jp/co...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",secreted,plasma membrane,...,ENSCAFG00845012144,CCR2,ENSECAG00000007143,,ENSECAG00000001214,CCR2,ENSOARG00020035870,,ENSOARG00020027153,CCR2
2,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...","<span title=""C-C motif chemokine ligand 3 like...","<span title=""C-C motif chemokine receptor 5"">C...",Ramilowski_2015_Literature_supported,"<a href=""https://www.perplexity.ai/search?q=Do...","<a href=""https://comp.med.yokohama-cu.ac.jp/co...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",secreted,plasma membrane,...,ENSCAFG00845012125,CCR5,ENSECAG00000024640,,ENSECAG00000001114,,ENSOARG00020021765,,ENSOARG00020025170,CCR5
3,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...","<span title=""defensin beta 103B"">DEFB103B</span>","<span title=""C-C motif chemokine receptor 6"">C...",Ramilowski_2015_Literature_supported,"<a href=""https://www.perplexity.ai/search?q=Do...","<a href=""https://comp.med.yokohama-cu.ac.jp/co...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",secreted,plasma membrane,...,ENSCAFG00845004836,CCR6,ENSECAG00000007143,,ENSECAG00000005185,CCR6,ENSOARG00020035870,,ENSOARG00020026137,CCR6
4,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...","<span title=""defensin beta 4A"">DEFB4A</span>","<span title=""C-C motif chemokine receptor 6"">C...",Ramilowski_2015_Literature_supported,"<a href=""https://www.perplexity.ai/search?q=Do...","<a href=""https://comp.med.yokohama-cu.ac.jp/co...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",secreted,plasma membrane,...,ENSCAFG00845004836,CCR6,,,ENSECAG00000005185,CCR6,,,ENSOARG00020026137,CCR6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2360,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...","<span title=""killer cell immunoglobulin like r...","<span title=""PVR cell adhesion molecule"">PVR</...",ConnectomeDB2025 (this publication),"<a href=""https://www.perplexity.ai/search?q=Do...","<a href=""https://comp.med.yokohama-cu.ac.jp/co...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",,plasma membrane,...,ENSCAFG00845007353,PVR,,,ENSECAG00000049310,,,,ENSOARG00020005070,PVR
2361,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...","<span title=""serum amyloid A1"">SAA1</span>","<span title=""scavenger receptor class B member...",ConnectomeDB2025 (this publication),"<a href=""https://www.perplexity.ai/search?q=Do...","<a href=""https://comp.med.yokohama-cu.ac.jp/co...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",,plasma membrane,...,ENSCAFG00845015817,SCARB1,,,ENSECAG00000024242,SCARB1,,,ENSOARG00020004389,SCARB1
2362,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...","<span title=""serum amyloid A1"">SAA1</span>","<span title=""toll like receptor 2"">TLR2</span>",ConnectomeDB2025 (this publication),"<a href=""https://www.perplexity.ai/search?q=Do...","<a href=""https://comp.med.yokohama-cu.ac.jp/co...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",,plasma membrane,...,ENSCAFG00845009391,TLR2,,,ENSECAG00000018028,TLR2,,,ENSOARG00020017860,TLR2
2363,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...","<span title=""serum amyloid A1"">SAA1</span>","<span title=""toll like receptor 4"">TLR4</span>",ConnectomeDB2025 (this publication),"<a href=""https://www.perplexity.ai/search?q=Do...","<a href=""https://comp.med.yokohama-cu.ac.jp/co...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",,plasma membrane,...,ENSCAFG00845012026,TLR4,,,ENSECAG00000010339,TLR4,,,ENSOARG00020004476,TLR4


In [26]:
gene_pair

Unnamed: 0,"<span title="" Ligand-Receptor Interacting Pair, as described in Liu et al. (PMID: XXXXXX)"">Human LR Pair</span>","<span title="" Official Gene Symbol; Hover on symbols below to show gene names"">Ligand&nbsp;&nbsp;&nbsp;</span>","<span title="" Official Gene Symbol; Hover on symbols below to show gene names"">Receptor&nbsp;&nbsp;&nbsp;</span>","<span title=""Double-click header of Interaction Source to ensure all values are shown"">Interaction Source&nbsp;</span>","<span title=""Click the logo below to run Perplexity on the Human LR pair"">Perplexity&nbsp;</span>","<span title="" PubMed IDs (PMID) with Literature Evidence for LR Interaction. Click on the link for more details"">PMID support</span>","<span title=""HUGO Gene Nomenclature Committee (HGNC) ID. Click on the link for more details"">Ligand HGNC ID&nbsp;&nbsp;</span>","<span title=""HUGO Gene Nomenclature Committee (HGNC) ID. Click on the link for more details"">Receptor HGNC ID&nbsp;&nbsp;</span>","<span title=""Location based on the predicted subcellular localization of the human proteome, as described in Ramilowski et al. (PMID: 26198319)"">Ligand location</span>","<span title=""Location based on the predicted subcellular localization of the human proteome, as described in Ramilowski et al. (PMID: 26198319)"">Receptor location</span>",...,"<span title=""Double-click header of Dog Receptor Ensembl ID to ensure all values are shown"">Dog Receptor Ensembl ID&nbsp;</span>","<span title=""Double-click header of Dog Receptor to ensure all values are shown"">Dog Receptor&nbsp;</span>","<span title=""Double-click header of Horse Ligand Ensembl ID to ensure all values are shown"">Horse Ligand Ensembl ID&nbsp;</span>","<span title=""Double-click header of Horse Ligand to ensure all values are shown"">Horse Ligand&nbsp;</span>","<span title=""Double-click header of Horse Receptor Ensembl ID to ensure all values are shown"">Horse Receptor Ensembl ID&nbsp;</span>","<span title=""Double-click header of Horse Receptor to ensure all values are shown"">Horse Receptor&nbsp;</span>","<span title=""Double-click header of Sheep Ligand Ensembl ID to ensure all values are shown"">Sheep Ligand Ensembl ID&nbsp;</span>","<span title=""Double-click header of Sheep Ligand to ensure all values are shown"">Sheep Ligand&nbsp;</span>","<span title=""Double-click header of Sheep Receptor Ensembl ID to ensure all values are shown"">Sheep Receptor Ensembl ID&nbsp;</span>","<span title=""Double-click header of Sheep Receptor to ensure all values are shown"">Sheep Receptor&nbsp;</span>"
0,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...","<span title=""C-C motif chemokine ligand 3 like...","<span title=""atypical chemokine receptor 2"">AC...",Ramilowski_2015_Literature_supported,"<a href=""https://www.perplexity.ai/search?q=Do...","<a href=""https://comp.med.yokohama-cu.ac.jp/co...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",secreted,plasma membrane,...,,,ENSECAG00000024640,,ENSECAG00000003800,ACKR2,ENSOARG00020021765,,,
1,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...","<span title=""defensin beta 103B"">DEFB103B</span>","<span title=""C-C motif chemokine receptor 2"">C...",Ramilowski_2015_Literature_supported,"<a href=""https://www.perplexity.ai/search?q=Do...","<a href=""https://comp.med.yokohama-cu.ac.jp/co...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",secreted,plasma membrane,...,ENSCAFG00845012144,CCR2,ENSECAG00000007143,,ENSECAG00000001214,CCR2,ENSOARG00020035870,,ENSOARG00020027153,CCR2
2,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...","<span title=""C-C motif chemokine ligand 3 like...","<span title=""C-C motif chemokine receptor 5"">C...",Ramilowski_2015_Literature_supported,"<a href=""https://www.perplexity.ai/search?q=Do...","<a href=""https://comp.med.yokohama-cu.ac.jp/co...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",secreted,plasma membrane,...,ENSCAFG00845012125,CCR5,ENSECAG00000024640,,ENSECAG00000001114,,ENSOARG00020021765,,ENSOARG00020025170,CCR5
3,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...","<span title=""defensin beta 103B"">DEFB103B</span>","<span title=""C-C motif chemokine receptor 6"">C...",Ramilowski_2015_Literature_supported,"<a href=""https://www.perplexity.ai/search?q=Do...","<a href=""https://comp.med.yokohama-cu.ac.jp/co...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",secreted,plasma membrane,...,ENSCAFG00845004836,CCR6,ENSECAG00000007143,,ENSECAG00000005185,CCR6,ENSOARG00020035870,,ENSOARG00020026137,CCR6
4,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...","<span title=""defensin beta 4A"">DEFB4A</span>","<span title=""C-C motif chemokine receptor 6"">C...",Ramilowski_2015_Literature_supported,"<a href=""https://www.perplexity.ai/search?q=Do...","<a href=""https://comp.med.yokohama-cu.ac.jp/co...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",secreted,plasma membrane,...,ENSCAFG00845004836,CCR6,,,ENSECAG00000005185,CCR6,,,ENSOARG00020026137,CCR6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2360,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...","<span title=""killer cell immunoglobulin like r...","<span title=""PVR cell adhesion molecule"">PVR</...",ConnectomeDB2025 (this publication),"<a href=""https://www.perplexity.ai/search?q=Do...","<a href=""https://comp.med.yokohama-cu.ac.jp/co...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",,plasma membrane,...,ENSCAFG00845007353,PVR,,,ENSECAG00000049310,,,,ENSOARG00020005070,PVR
2361,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...","<span title=""serum amyloid A1"">SAA1</span>","<span title=""scavenger receptor class B member...",ConnectomeDB2025 (this publication),"<a href=""https://www.perplexity.ai/search?q=Do...","<a href=""https://comp.med.yokohama-cu.ac.jp/co...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",,plasma membrane,...,ENSCAFG00845015817,SCARB1,,,ENSECAG00000024242,SCARB1,,,ENSOARG00020004389,SCARB1
2362,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...","<span title=""serum amyloid A1"">SAA1</span>","<span title=""toll like receptor 2"">TLR2</span>",ConnectomeDB2025 (this publication),"<a href=""https://www.perplexity.ai/search?q=Do...","<a href=""https://comp.med.yokohama-cu.ac.jp/co...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",,plasma membrane,...,ENSCAFG00845009391,TLR2,,,ENSECAG00000018028,TLR2,,,ENSOARG00020017860,TLR2
2363,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...","<span title=""serum amyloid A1"">SAA1</span>","<span title=""toll like receptor 4"">TLR4</span>",ConnectomeDB2025 (this publication),"<a href=""https://www.perplexity.ai/search?q=Do...","<a href=""https://comp.med.yokohama-cu.ac.jp/co...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",,plasma membrane,...,ENSCAFG00845012026,TLR4,,,ENSECAG00000010339,TLR4,,,ENSOARG00020004476,TLR4


In [45]:
import pandas as pd
from bs4 import BeautifulSoup

# Example DataFrame with HTML tags in column names
data = {
    "<span title='Double-click header of Sheep Ligand Ensembl ID to ensure all values are shown'>Sheep Ligand Ensembl ID&nbsp;</span>": [1, 2, 3],
    "<span title='Double-click header of Cow Ligand Ensembl ID to ensure all values are shown'>Cow Ligand Ensembl ID&nbsp;</span>": [4, 5, 6],
    "<span title='Double-click header of Dog Ligand Ensembl ID to ensure all values are shown'>Dog Ligand Ensembl ID&nbsp;</span>": [7, 8, 9],
}

species_gene_pair = pd.DataFrame(data)

In [46]:
species_gene_pair

Unnamed: 0,<span title='Double-click header of Sheep Ligand Ensembl ID to ensure all values are shown'>Sheep Ligand Ensembl ID&nbsp;</span>,<span title='Double-click header of Cow Ligand Ensembl ID to ensure all values are shown'>Cow Ligand Ensembl ID&nbsp;</span>,<span title='Double-click header of Dog Ligand Ensembl ID to ensure all values are shown'>Dog Ligand Ensembl ID&nbsp;</span>
0,1,4,7
1,2,5,8
2,3,6,9


In [15]:
import liana as li
import omnipath as op
import decoupler as dc
import pandas as pd

import sys
import os
sys.path.append(os.path.abspath("src"))  # Add src directory to path
from createDataTable import gene_pair0

### Parameters
topN = 10000 #Number of top pathways to be included
pathway_output_file="data/pathway_annotations_per_pair.csv"
output_file="data/disease_annotations_per_pair.csv"

### Pathway Annotations

# load PROGENy pathways, we use decoupler as a proxy as it formats the data in a more convenient way
progeny = dc.get_progeny(top=topN)
# import connectomeDB database ligands and receptors
lr_pairs = gene_pair0[["Ligand", "Receptor"]]
lr_pairs.columns = lr_pairs.columns.str.lower()

# generate ligand-receptor geneset
lr_progeny = li.rs.generate_lr_geneset(lr_pairs, progeny, lr_sep="^")
# some of the pairs are missing
len(lr_progeny["interaction"].unique())
# Replace '^' with ' ' in the 2nd column
lr_progeny.iloc[:, 1] = lr_progeny.iloc[:, 1].str.replace(r'\^', ' ', regex=True)
lr_progeny

ModuleNotFoundError: No module named 'createDataTable'

In [56]:
lr_progeny.iloc[:, 1] = lr_progeny.iloc[:, 1].str.replace(r'\^', ' ', regex=True)
lr_progeny.iloc[:, 1] 

14         IFNA13 IFNAR1
46         IFNA13 IFNAR1
57         IFNA13 IFNAR1
89         IFNA13 IFNAR2
108        IFNA13 IFNAR2
               ...      
140018       LRFN4 PTPRS
140068       LRFN5 PTPRD
140157      KIR2DL5A PVR
140179      KIR2DL5A PVR
140265         SAA1 TLR2
Name: interaction, Length: 5754, dtype: object

In [35]:
ligand_index =4
ligand_col = [col for col in species_gene_pair.columns if "Ligand&nbsp;" in col][ligand_index]

In [36]:
ligand_col

'<span title="Double-click header of Chimpanzee Ligand to ensure all values are shown">Chimpanzee Ligand&nbsp;</span>'

In [37]:
    # Rename columns to remove species name
    species_gene_pair.columns = [
        col.replace(f"{species} ", "").strip() if "Ligand" in col or "Receptor" in col else col
        for col in species_gene_pair.columns
    ]

In [38]:
    ligand_col = [col for col in species_gene_pair.columns if "Ligand&nbsp;" in col][ligand_index]

In [39]:
ligand_col

'<span title="Double-click header of Ligand to ensure all values are shown">Ligand&nbsp;</span>'

In [40]:
species_gene_pair

Unnamed: 0,"<span title="" Ligand-Receptor Interacting Pair, as described in Liu et al. (PMID: XXXXXX)"">Human LR Pair</span>","<span title="" Official Gene Symbol; Hover on symbols below to show gene names"""">Human Ligand&nbsp;&nbsp;&nbsp;</span</span>","<span title="" Official Gene Symbol; Hover on symbols below to show gene names"""">Human Receptor&nbsp;&nbsp;&nbsp;</span</span>","<span title=""Double-click header of Interaction Source to ensure all values are shown"">Interaction Source&nbsp;</span>","<span title=""Click the logo below to run Perplexity on the Human LR pair"">Perplexity&nbsp;</span>","<span title="" PubMed IDs (PMID) with Literature Evidence for LR Interaction. Click on the link for more details"">PMID support</span>","<span title=""HUGO Gene Nomenclature Committee (HGNC) ID. Click on the link for more details"">Ligand HGNC ID&nbsp;&nbsp;</span>","<span title=""HUGO Gene Nomenclature Committee (HGNC) ID. Click on the link for more details"">Receptor HGNC ID&nbsp;&nbsp;</span>","<span title=""Location based on the predicted subcellular localization of the human proteome, as described in Ramilowski et al. (PMID: 26198319)"">Ligand location</span>","<span title=""Location based on the predicted subcellular localization of the human proteome, as described in Ramilowski et al. (PMID: 26198319)"">Receptor location</span>",...,"<span title=""Double-click header of Dog Receptor Ensembl ID to ensure all values are shown"">Dog Receptor Ensembl ID&nbsp;</span>","<span title=""Double-click header of Dog Receptor to ensure all values are shown"">Dog Receptor&nbsp;</span>","<span title=""Double-click header of Horse Ligand Ensembl ID to ensure all values are shown"">Horse Ligand Ensembl ID&nbsp;</span>","<span title=""Double-click header of Horse Ligand to ensure all values are shown"">Horse Ligand&nbsp;</span>","<span title=""Double-click header of Horse Receptor Ensembl ID to ensure all values are shown"">Horse Receptor Ensembl ID&nbsp;</span>","<span title=""Double-click header of Horse Receptor to ensure all values are shown"">Horse Receptor&nbsp;</span>","<span title=""Double-click header of Sheep Ligand Ensembl ID to ensure all values are shown"">Sheep Ligand Ensembl ID&nbsp;</span>","<span title=""Double-click header of Sheep Ligand to ensure all values are shown"">Sheep Ligand&nbsp;</span>","<span title=""Double-click header of Sheep Receptor Ensembl ID to ensure all values are shown"">Sheep Receptor Ensembl ID&nbsp;</span>","<span title=""Double-click header of Sheep Receptor to ensure all values are shown"">Sheep Receptor&nbsp;</span>"
2,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...","<span title=""C-C motif chemokine ligand 3 like...","<span title=""C-C motif chemokine receptor 5"">C...",Ramilowski_2015_Literature_supported,"<a href=""https://www.perplexity.ai/search?q=Do...","<a href=""https://comp.med.yokohama-cu.ac.jp/co...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",secreted,plasma membrane,...,ENSCAFG00845012125,CCR5,ENSECAG00000024640,,ENSECAG00000001114,,ENSOARG00020021765,,ENSOARG00020025170,CCR5
8,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...","<span title=""nephronectin"">NPNT</span>","<span title=""integrin subunit beta 1"">ITGB1</s...",Ramilowski_2015_Literature_supported,"<a href=""https://www.perplexity.ai/search?q=Do...","<a href=""https://comp.med.yokohama-cu.ac.jp/co...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",secreted,plasma membrane,...,ENSCAFG00845002947,ITGB1,ENSECAG00000008518,NPNT,ENSECAG00000022498,ITGB1,ENSOARG00020025758,NPNT,ENSOARG00020022585,ITGB1
10,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...","<span title=""rabphilin 3A"">RPH3A</span>","<span title=""neurexin 1"">NRXN1</span>",Ramilowski_2015_Literature_supported,"<a href=""https://www.perplexity.ai/search?q=Do...","<a href=""https://comp.med.yokohama-cu.ac.jp/co...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",secreted,plasma membrane,...,ENSCAFG00845006920,NRXN1,ENSECAG00000013526,RPH3A,ENSECAG00000009451,NRXN1,ENSOARG00020018314,RPH3A,ENSOARG00020001627,NRXN1
13,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...","<span title=""Epstein-Barr virus induced 3"">EBI...","<span title=""interleukin 27 receptor subunit a...",Ramilowski_2015_Literature_supported,"<a href=""https://www.perplexity.ai/search?q=Do...","<a href=""https://comp.med.yokohama-cu.ac.jp/co...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",secreted,plasma membrane,...,ENSCAFG00845025289,IL27RA,ENSECAG00000018667,EBI3,ENSECAG00000011964,IL27RA,ENSOARG00020016010,EBI3,ENSOARG00020001311,IL27RA
15,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...","<span title=""interleukin 17F"">IL17F</span>","<span title=""interleukin 17 receptor A"">IL17RA...",Ramilowski_2015_Literature_supported,"<a href=""https://www.perplexity.ai/search?q=Do...","<a href=""https://comp.med.yokohama-cu.ac.jp/co...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",secreted,plasma membrane,...,ENSCAFG00845030604,,ENSECAG00000031935,IL17F,ENSECAG00000017077,IL17RA,ENSOARG00020019944,,ENSOARG00020025712,IL17RA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2341,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...","<span title=""fibronectin leucine rich transmem...","<span title=""adhesion G protein-coupled recept...",ConnectomeDB2025 (this publication),"<a href=""https://www.perplexity.ai/search?q=Do...","<a href=""https://comp.med.yokohama-cu.ac.jp/co...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",,plasma membrane,...,ENSCAFG00845014364,ADGRL3,ENSECAG00000002569,FLRT3,ENSECAG00000018863,ADGRL3,ENSOARG00020025346,FLRT3,ENSOARG00020020864,ADGRL3
2342,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...","<span title=""fibronectin leucine rich transmem...","<span title=""adhesion G protein-coupled recept...",ConnectomeDB2025 (this publication),"<a href=""https://www.perplexity.ai/search?q=Do...","<a href=""https://comp.med.yokohama-cu.ac.jp/co...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",,plasma membrane,...,ENSCAFG00845014364,ADGRL3,ENSECAG00000009011,FLRT2,ENSECAG00000018863,ADGRL3,ENSOARG00020033067,FLRT2,ENSOARG00020020864,ADGRL3
2345,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...","<span title=""teneurin transmembrane protein 2""...","<span title=""adhesion G protein-coupled recept...",ConnectomeDB2025 (this publication),"<a href=""https://www.perplexity.ai/search?q=Do...","<a href=""https://comp.med.yokohama-cu.ac.jp/co...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",,plasma membrane,...,ENSCAFG00845014364,ADGRL3,ENSECAG00000024454,TENM2,ENSECAG00000018863,ADGRL3,ENSOARG00020013483,TENM2,ENSOARG00020020864,ADGRL3
2357,"<a href=""https://comp.med.yokohama-cu.ac.jp/co...","<span title=""leucine rich repeat and fibronect...","<span title=""protein tyrosine phosphatase rece...",ConnectomeDB2025 (this publication),"<a href=""https://www.perplexity.ai/search?q=Do...","<a href=""https://comp.med.yokohama-cu.ac.jp/co...","<a href=""https://www.genenames.org/data/gene-s...","<a href=""https://www.genenames.org/data/gene-s...",,plasma membrane,...,ENSCAFG00845026437,PTPRD,ENSECAG00000024060,LRFN5,ENSECAG00000009864,PTPRD,ENSOARG00020006238,LRFN5,ENSOARG00020010720,PTPRD
