# Python Notebook

In [None]:
def add_row(change):
    global gene_pair
    # Add a new row at the top with None values
    new_row = {col: None for col in gene_pair.columns}
    gene_pair = pd.DataFrame([new_row] + gene_pair.to_dict(orient="records"))
    update_table()

# Function to remove the last row of the dataframe
def remove_row(change):
    global gene_pair
    if len(gene_pair) > 0:
        gene_pair = gene_pair[:-1]  # Remove the last row
        update_table()

In [65]:
gene_pair.columns

Index(['Human LR Pair', 'Ligand', 'Receptor', 'Interaction Source',
       'PMID support', 'Ligand HGNC ID', 'Ligand location', 'Receptor HGNC ID',
       'Receptor location', 'Perplexity', 'Ligand name', 'Ligand MGI ID',
       'Ligand RGD ID', 'Mouse Ligand', 'Rat Ligand', 'Ligand ZFIN ID',
       'Ligand ZFIN ID', 'Zebrafish Ligand', 'Zebrafish Ligand name',
       'Receptor name', 'Receptor MGI ID', 'Receptor RGD ID', 'Mouse Receptor',
       'Rat Receptor', 'Ligand ZFIN ID', 'Ligand ZFIN ID',
       'Zebrafish Receptor', 'Zebrafish Receptor name', 'HGNC L R',
       'sanity check', 'curator', 'secondary source?'],
      dtype='object')

In [9]:
duplicates = gene_pair00[gene_pair00["Human LR Pair"].duplicated()]
print(duplicates["Human LR Pair"])


2312       TAFA4 FPR1
2333    SLAMF1 SLAMF1
2342     FLRT3 ADGRL3
2344     TENM2 ADGRL1
Name: Human LR Pair, dtype: object


## Testing Liana+

In [14]:
import liana as li
import omnipath as op
import decoupler as dc
import pandas as pd

In [15]:
import sys
import os
sys.path.append(os.path.abspath("src"))  # Add src directory to path
from createDataTable import gene_pair0

### Pathway Annotations

In [43]:
# load PROGENy pathways, we use decoupler as a proxy as it formats the data in a more convenient way
progeny = dc.get_progeny(top=10000)
progeny

Unnamed: 0,source,target,weight,p_value
0,Androgen,TMPRSS2,11.490631,0.0
1,Androgen,NKX3-1,10.622551,0.0
2,Androgen,MBOAT2,10.472733,0.0
3,Androgen,KLK2,10.176186,0.0
4,Androgen,SARG,11.386852,0.0
...,...,...,...,...
139455,p53,SIN3B,0.347078,0.269528
139456,p53,SS18,-0.516365,0.26954
139457,p53,CAPN3,1.14331,0.269546
139458,p53,BRICD5,-0.38892,0.269551


In [44]:
lr_pairs = gene_pair0[["Ligand", "Receptor"]]
lr_pairs.columns = lr_pairs.columns.str.lower()

In [45]:
lr_pairs

Unnamed: 0,ligand,receptor
0,CCL3L3,ACKR2
1,DEFB103B,CCR2
2,CCL3L3,CCR5
3,DEFB103B,CCR6
4,DEFB4A,CCR6
...,...,...
2361,KIR2DL5A,PVR
2362,SAA1,SCARB1
2363,SAA1,TLR2
2364,SAA1,TLR4


In [46]:
# generate ligand-receptor geneset
lr_progeny = li.rs.generate_lr_geneset(lr_pairs, progeny, lr_sep="^")

In [47]:
lr_progeny

Unnamed: 0,source,interaction,weight
14,NFkB,IFNA13^IFNAR1,0.582020
46,TNFa,IFNA13^IFNAR1,1.219279
57,Trail,IFNA13^IFNAR1,0.339437
89,NFkB,IFNA13^IFNAR2,2.453309
108,TNFa,IFNA13^IFNAR2,4.271504
...,...,...,...
140018,PI3K,LRFN4^PTPRS,-0.742439
140068,Trail,LRFN5^PTPRD,0.367912
140157,EGFR,KIR2DL5A^PVR,0.801227
140179,MAPK,KIR2DL5A^PVR,0.933643


In [48]:
# some of the pairs are missing
len(lr_progeny["interaction"].unique())

2028

In [49]:
output_file="data/pathway_annotations_per_pair.csv"
lr_progeny.to_csv(output_file, index=False)

### Disease Annotations

In [24]:
diseases = op.requests.Annotations.get(
    resources = ['DisGeNet']
    )

Downloading annotations for all proteins from the following resources: `['DisGeNet']`


In [26]:
diseases = diseases[['genesymbol', 'label', 'value']]
diseases = diseases.pivot_table(index='genesymbol',
                                columns='label', values='value',
                                aggfunc=lambda x: '; '.join(x)).reset_index()
diseases = diseases[['genesymbol', 'disease']]
diseases['disease'] = diseases['disease'].str.split('; ')
diseases = diseases.explode('disease')
lr_diseases = li.rs.generate_lr_geneset(lr_pairs, diseases, source='disease', target='genesymbol', weight=None, lr_sep="^")
lr_diseases.sort_values("interaction")



Unnamed: 0,disease,interaction
786337,Bipolar Disorder,ACE^BDKRB2
788038,Hypotension,ACE^BDKRB2
788618,Major Depressive Disorder,ACE^BDKRB2
790218,Unipolar Depression,ACE^BDKRB2
787989,Hypertensive disease,ACE^BDKRB2
...,...,...
835601,"Mammary Neoplasms, Human",YBX1^NOTCH1
835101,Colonic Neoplasms,YBX1^NOTCH1
835012,Breast Carcinoma,YBX1^NOTCH1
835283,Malignant neoplasm of breast,YBX1^NOTCH1


In [34]:
# some of the pairs are missing
len(lr_diseases["interaction"].unique())

593

In [29]:
output_file="data/disease_annotations_per_pair.csv"
lr_diseases.to_csv(output_file, index=False)

In [37]:
op.requests.Annotations.resources()

('Adhesome',
 'Almen2009',
 'Baccin2019',
 'CORUM_Funcat',
 'CORUM_GO',
 'CSPA',
 'CSPA_celltype',
 'CancerDrugsDB',
 'CancerGeneCensus',
 'CancerSEA',
 'CellCall',
 'CellCellInteractions',
 'CellChatDB',
 'CellChatDB_complex',
 'CellPhoneDB',
 'CellPhoneDB_complex',
 'CellTalkDB',
 'CellTypist',
 'Cellinker',
 'Cellinker_complex',
 'ComPPI',
 'CytoSig',
 'DGIdb',
 'DisGeNet',
 'EMBRACE',
 'Exocarta',
 'GO_Intercell',
 'GPCRdb',
 'Guide2Pharma',
 'HGNC',
 'HPA_secretome',
 'HPA_subcellular',
 'HPA_tissue',
 'HPMR',
 'HumanCellMap',
 'ICELLNET',
 'ICELLNET_complex',
 'IntOGen',
 'Integrins',
 'InterPro',
 'KEGG-PC',
 'Kirouac2010',
 'LOCATE',
 'LRdb',
 'Lambert2018',
 'MCAM',
 'MSigDB',
 'Matrisome',
 'MatrixDB',
 'Membranome',
 'NetPath',
 'OPM',
 'PROGENy',
 'PanglaoDB',
 'Phobius',
 'Phosphatome',
 'Ramilowski2015',
 'Ramilowski_location',
 'SIGNOR',
 'SignaLink_function',
 'SignaLink_pathway',
 'Surfaceome',
 'TCDB',
 'TFcensus',
 'TopDB',
 'UniProt_family',
 'UniProt_keyword',
 'Un

### Get FASTA sequences for each gene

In [150]:
import requests
import pandas as pd
import sys
import os

sys.path.append(os.path.abspath("src"))  # Add src directory to path
from createDataTable import gene_pair0

# Get all unique genes
ligand_list = gene_pair0["Ligand"].tolist()
receptor_list = gene_pair0["Receptor"].tolist()
unique_genes = list(set(ligand_list + receptor_list))  # Combine and remove duplicates

In [182]:
import re
import pandas as pd
import gzip

# Input FASTA file
fasta_file = "data/uniprotkb_proteome_UP000005640_AND_revi_2025_03_27.fasta.gz"

# Regex pattern to extract details from header
header_pattern = re.compile(r"^>sp\|(?P<uniprot_id>[A-Z0-9]+(?:-\d+)?)\|(?P<protein_name>.+?) OS=Homo sapiens OX=9606 GN=(?P<gene_name>[A-Za-z0-9-]+)")

# Store extracted data
records = []

# Read and parse the file
with gzip.open(fasta_file, "rt") as f:
    header = None
    sequence = []
    
    for line in f:
        line = line.strip()
        
        if line.startswith(">"):
            # Store previous sequence if exists
            if header and sequence:
                isoform_type = "Canonical" if "-" not in header["uniprot_id"] else "Alternative Isoform"
                records.append([header["uniprot_id"], header["gene_name"], header["protein_name"], isoform_type, "".join(sequence)])
            
            # Match new header
            match = header_pattern.match(line)
            if match:
                header = match.groupdict()
                sequence = []
            else:
                header = None
        
        elif header:
            sequence.append(line)

    # Add the last record
    if header and sequence:
        isoform_type = "Canonical" if "-" not in header["uniprot_id"] else "Alternative Isoform"
        records.append([header["uniprot_id"], header["gene_name"], header["protein_name"], isoform_type, "".join(sequence)])

# Convert to pandas DataFrame
df = pd.DataFrame(records, columns=["UniProt ID", "Gene Symbol", "Protein Name", "Isoform Type", "FASTA Sequence"])

# Save as TSV
df.to_csv("data/human_uniprot_isoforms.tsv", sep="\t", index=False)

print(f"✅ Extracted {len(df)} Homo sapiens protein sequences and saved to 'human_uniprot_isoforms.tsv'.")


✅ Extracted 42389 Homo sapiens protein sequences and saved to 'human_uniprot_isoforms.tsv'.


In [183]:
len(df["Gene Symbol"].unique())

20207

In [198]:
df= pd.read_table("data/human_uniprot_isoforms.tsv", sep="\t")

In [199]:
df.columns

Index(['UniProt ID', 'Gene Symbol', 'Protein Name', 'Isoform Type',
       'FASTA Sequence'],
      dtype='object')

In [200]:
df = df[['UniProt ID', 'Gene Symbol', 'Isoform Type', 'FASTA Sequence']]

In [201]:
df

Unnamed: 0,UniProt ID,Gene Symbol,Isoform Type,FASTA Sequence
0,A0A087X1C5,CYP2D7,Canonical,MGLEALVPLAMIVAIFLLLVDLMHRHQRWAARYPPGPLPLPGLGNL...
1,A0A0B4J2F0,PIGBOS1,Canonical,MFRRLTFAQLLFATVLGIAGGVYIFQPVFEQYAKDQKELKEKMQLV...
2,A0A0C5B5G6,MT-RNR1,Canonical,MRWQEMGYIFYPRKLR
3,A0A0K2S4Q6,CD300H,Canonical,MTQRAGAAMLPSALLLLCVPGCLTVSGPSTVMGAVGESLSVQCRYE...
4,A0A0U1RRE5,NBDY,Canonical,MGDQPCASGRSTLPPGNAREAKPPKKRCLLAPRWDYPEGTPNGGST...
...,...,...,...,...
42384,Q9Y6S9-2,RPS6KL1,Alternative Isoform,MSLVACECLPSPGLEPEPCSRARSQAHVYLEQIRNRVALGVPDMTK...
42385,Q9Y6S9-4,RPS6KL1,Alternative Isoform,MSLVACECLPSPGLEPEPCSRARSQAHVYLEQIRNRVALGVPDMTK...
42386,Q9Y6V7-2,DDX49,Alternative Isoform,MDMVAQALELSRKPHVVIATPGRLADHLRSSNTFSIKKIRFLVMDE...
42387,Q9Y6X4-2,FAM169A,Alternative Isoform,MAFPVDMLENCSHEELENSAEDYMSDLRCGDPENPECFSLLNITIP...


In [202]:
lim_df = gene_pair0[["Human LR Pair", "Ligand", "Receptor"]]

In [203]:
lim_df

Unnamed: 0,Human LR Pair,Ligand,Receptor
0,CCL3L3 ACKR2,CCL3L3,ACKR2
1,DEFB103B CCR2,DEFB103B,CCR2
2,CCL3L3 CCR5,CCL3L3,CCR5
3,DEFB103B CCR6,DEFB103B,CCR6
4,DEFB4A CCR6,DEFB4A,CCR6
...,...,...,...
2361,KIR2DL5A PVR,KIR2DL5A,PVR
2362,SAA1 SCARB1,SAA1,SCARB1
2363,SAA1 TLR2,SAA1,TLR2
2364,SAA1 TLR4,SAA1,TLR4


In [204]:
lim_df = lim_df.merge(df, how='left', left_on='Ligand', right_on='Gene Symbol')
lim_df = lim_df.drop(columns=["Gene Symbol"])
lim_df = lim_df.rename(columns={"FASTA Sequence": "Ligand Sequence",
                                "UniProt ID": "Ligand Isoform Uniprot ID",
                                "Isoform Type": "Ligand Isoform Type"})
lim_df

Unnamed: 0,Human LR Pair,Ligand,Receptor,Ligand Isoform Uniprot ID,Ligand Isoform Type,Ligand Sequence
0,CCL3L3 ACKR2,CCL3L3,ACKR2,,,
1,DEFB103B CCR2,DEFB103B,CCR2,,,
2,CCL3L3 CCR5,CCL3L3,CCR5,,,
3,DEFB103B CCR6,DEFB103B,CCR6,,,
4,DEFB4A CCR6,DEFB4A,CCR6,O15263,Canonical,MRVLYLLFSFLFIFLMPLPGVFGGIGDPVTCLKSGAICHPVFCPRR...
...,...,...,...,...,...,...
5309,KIR2DL5A PVR,KIR2DL5A,PVR,Q8N109,Canonical,MSLMVISMACVGFFLLQGAWTHEGGQDKPLLSAWPSAVVPRGGHVT...
5310,SAA1 SCARB1,SAA1,SCARB1,P0DJI8,Canonical,MKLLTGLVFCSLVLGVSSRSFFSFLGEAFDGARDMWRAYSDMREAN...
5311,SAA1 TLR2,SAA1,TLR2,P0DJI8,Canonical,MKLLTGLVFCSLVLGVSSRSFFSFLGEAFDGARDMWRAYSDMREAN...
5312,SAA1 TLR4,SAA1,TLR4,P0DJI8,Canonical,MKLLTGLVFCSLVLGVSSRSFFSFLGEAFDGARDMWRAYSDMREAN...


In [205]:
lim_df = lim_df.merge(df, how='left', left_on='Receptor', right_on='Gene Symbol')
lim_df = lim_df.drop(columns=["Gene Symbol"])
lim_df = lim_df.rename(columns={"FASTA Sequence": "Receptor Sequence",
                                "UniProt ID": "Receptor Isoform Uniprot ID",
                                "Isoform Type": "Receptor Isoform Type"})
lim_df

Unnamed: 0,Human LR Pair,Ligand,Receptor,Ligand Isoform Uniprot ID,Ligand Isoform Type,Ligand Sequence,Receptor Isoform Uniprot ID,Receptor Isoform Type,Receptor Sequence
0,CCL3L3 ACKR2,CCL3L3,ACKR2,,,,O00590,Canonical,MAATASPQPLATEDADSENSSFYYYDYLDEVAFMLCRKDAVVSFGK...
1,DEFB103B CCR2,DEFB103B,CCR2,,,,P41597,Canonical,MLSTSRSRFIRNTNESGEEVTTFFDYDYGAPCHKFDVKQIGAQLLP...
2,DEFB103B CCR2,DEFB103B,CCR2,,,,P41597-2,Alternative Isoform,MLSTSRSRFIRNTNESGEEVTTFFDYDYGAPCHKFDVKQIGAQLLP...
3,CCL3L3 CCR5,CCL3L3,CCR5,,,,P51681,Canonical,MDYQVSSPIYDINYYTSEPCQKINVKQIAARLLPPLYSLVFIFGFV...
4,DEFB103B CCR6,DEFB103B,CCR6,,,,P51684,Canonical,MSGESMNFSDVFDSSEDYFVSVNTSYYSVDSEMLLCSLQEVRQFSR...
...,...,...,...,...,...,...,...,...,...
16143,SAA1 TLR4,SAA1,TLR4,P0DJI8,Canonical,MKLLTGLVFCSLVLGVSSRSFFSFLGEAFDGARDMWRAYSDMREAN...,O00206,Canonical,MMSASRLAGTLIPAMAFLSCVRPESWEPCVEVVPNITYQCMELNFY...
16144,SAA1 TLR4,SAA1,TLR4,P0DJI8,Canonical,MKLLTGLVFCSLVLGVSSRSFFSFLGEAFDGARDMWRAYSDMREAN...,O00206-2,Alternative Isoform,MELNFYKIPDNLPFSTKNLDLSFNPLRHLGSYSFFSFPELQVLDLS...
16145,SAA1 TLR4,SAA1,TLR4,P0DJI8,Canonical,MKLLTGLVFCSLVLGVSSRSFFSFLGEAFDGARDMWRAYSDMREAN...,O00206-3,Alternative Isoform,MPLLNLSLDLSLNPMNFIQPGAFKEIRLHKLTLRNNFDSLNVMKTC...
16146,SAA3P LY96,SAA3P,LY96,,,,Q9Y6Y9,Canonical,MLPFLFFSTLFSSIFTEAQKQYWVCNSSDASISYTYCDKMQYPISI...


In [206]:
lim_df.to_csv("data/LRpair_uniprot_sequences.tsv", sep="\t", index=False)

In [124]:
import gzip
import re
import pandas as pd

# Step 1: Extract Gene Symbol Mapping from GTF
gtf_file = "data/gencode.v47.annotation.gtf.gz"
gene_map = {}

# Read GTF file and extract gene_id -> gene_name mapping
with gzip.open(gtf_file, "rt") as f:
    for line in f:
        if line.startswith("#"):  # Skip comments
            continue
        
        fields = line.strip().split("\t")
        if fields[2] == "gene":  # Only extract gene entries
            info = {key.strip(): value.strip('"') for key, value in re.findall(r'(\S+) "([^"]+)"', fields[8])}
            if "gene_id" in info and "gene_name" in info:
                gene_map[info["gene_id"]] = info["gene_name"]

print(f"✅ Extracted {len(gene_map)} gene mappings from GTF.")

✅ Extracted 78724 gene mappings from GTF.


In [127]:
# Step 2: Parse GENCODE Protein FASTA and Add Gene Symbols
fasta_file = "data/gencode.v47.pc_translations.fa.gz"

# Store extracted data
records = []

# Open the GENCODE FASTA file and parse sequences
with gzip.open(fasta_file, "rt") as f:
    header = None
    sequence = []
    
    for line in f:
        line = line.strip()
        
        if line.startswith(">"):
            # Store previous sequence if exists
            if header and sequence:
                # Extract the Gene Symbol using the Gene ID
                gene_symbol = gene_map.get(header["gene_id"], "Unknown")
                isoform_type = "Canonical" if "-1" in header["protein_id"] else "Alternative Isoform"
                
                # Append the parsed data to records
                records.append([header["protein_id"], header["transcript_id"], header["gene_id"], gene_symbol, isoform_type, "".join(sequence)])
            
            # Split header by '|' and extract necessary fields
            fields = line[1:].split("|")  # Skip the '>' symbol and split by '|'
            if len(fields) >= 6:
                header = {
                    "protein_id": fields[0], 
                    "transcript_id": fields[1], 
                    "gene_id": fields[2]  
                }
                sequence = []
            else:
                header = None
        
        elif header:
            sequence.append(line)

    # Add the last record if needed
    if header and sequence:
        gene_symbol = gene_map.get(header["gene_id"], "Unknown")
        isoform_type = "Canonical" if "-1" in header["protein_id"] else "Alternative Isoform"
        records.append([header["protein_id"], header["transcript_id"], header["gene_id"], gene_symbol, isoform_type, "".join(sequence)])

# Step 3: Convert to pandas DataFrame and Save to TSV
df = pd.DataFrame(records, columns=["Ensembl Protein ID", "Ensembl Transcript ID", "Ensembl Gene ID", "Gene Symbol", "Isoform Type", "FASTA Sequence"])

# Save to TSV
df.to_csv("data/gencode_protein_isoforms_with_symbols.tsv", sep="\t", index=False)

# Print completion message
print(f"✅ Extracted {len(df)} protein sequences with Gene Symbols and saved to 'gencode_protein_isoforms_with_symbols.tsv'.")

✅ Extracted 112218 protein sequences with Gene Symbols and saved to 'gencode_protein_isoforms_with_symbols.tsv'.


Unnamed: 0,Ensembl Protein ID,Ensembl Transcript ID,Ensembl Gene ID,Gene Symbol,Isoform Type,FASTA Sequence
0,ENSP00000493376.2,ENST00000641515.2,ENSG00000186092.7,OR4F5,Alternative Isoform,MKKVTAEAISWNESTSETNNSMVTEFIFLGLSDSQELQTFLFMLFF...
1,ENSP00000409316.1,ENST00000426406.4,ENSG00000284733.2,OR4F29,Alternative Isoform,MDGENHSVVSEFLFLGLTHSWEIQLLLLVFSSVLYVASITGNILIV...
2,ENSP00000329982.2,ENST00000332831.5,ENSG00000284662.2,OR4F16,Alternative Isoform,MDGENHSVVSEFLFLGLTHSWEIQLLLLVFSSVLYVASITGNILIV...
3,ENSP00000478421.2,ENST00000616016.5,ENSG00000187634.13,SAMD11,Alternative Isoform,MPAVKKEFPGREDLALALATFHPTLAALPLPPLPGYLAPLPAAAAL...
4,ENSP00000480678.2,ENST00000618323.5,ENSG00000187634.13,SAMD11,Alternative Isoform,MPAVKKEFPGREDLALALATFHPTLAALPLPPLPGYLAPLPAAAAL...
...,...,...,...,...,...,...
112213,ENSP00000354728.1,ENST00000361335.1,ENSG00000212907.2,MT-ND4L,Alternative Isoform,MPLIYMNIMLAFTISLLGMLVYRSHLMSSLLCLEGMMLSLFIMATL...
112214,ENSP00000354961.2,ENST00000361381.2,ENSG00000198886.2,MT-ND4,Alternative Isoform,MLKLIVPTIMLLPLTWLSKKHMIWINTTTHSLIISIIPLLFFNQIN...
112215,ENSP00000354813.2,ENST00000361567.2,ENSG00000198786.2,MT-ND5,Alternative Isoform,MTMHTTMTTLTLTSLIPPILTTLVNPNKKNSYPHYVKSIVASTFII...
112216,ENSP00000354665.2,ENST00000361681.2,ENSG00000198695.2,MT-ND6,Alternative Isoform,MMYALFLLSVGLVMGFVGFSSKPSPIYGGLVLIVSGVVGCVIILNF...


In [134]:
lim_df = gene_pair0[["Human LR Pair", "Ligand", "Receptor"]]
lim_df = lim_df.merge(df, how='left', left_on='Ligand', right_on='Gene Symbol')

In [135]:
lim_df

Unnamed: 0,Human LR Pair,Ligand,Receptor,Ensembl Protein ID,Ensembl Transcript ID,Ensembl Gene ID,Gene Symbol,Isoform Type,FASTA Sequence
0,CCL3L3 ACKR2,CCL3L3,ACKR2,ENSP00000480558.1,ENST00000619989.1,ENSG00000276085.1,CCL3L3,Alternative Isoform,MQVSTAALAVLLCTMALCNQVLSAPLAADTPTACCFSYTSRQIPQN...
1,DEFB103B CCR2,DEFB103B,CCR2,ENSP00000324633.3,ENST00000318124.3,ENSG00000177243.3,DEFB103B,Alternative Isoform,MRIHYLLFALLFLFLVPVPGHGGIINTLQKYYCRVRGGRCAVLSCL...
2,CCL3L3 CCR5,CCL3L3,CCR5,ENSP00000480558.1,ENST00000619989.1,ENSG00000276085.1,CCL3L3,Alternative Isoform,MQVSTAALAVLLCTMALCNQVLSAPLAADTPTACCFSYTSRQIPQN...
3,DEFB103B CCR6,DEFB103B,CCR6,ENSP00000324633.3,ENST00000318124.3,ENSG00000177243.3,DEFB103B,Alternative Isoform,MRIHYLLFALLFLFLVPVPGHGGIINTLQKYYCRVRGGRCAVLSCL...
4,DEFB4A CCR6,DEFB4A,CCR6,ENSP00000303532.2,ENST00000302247.3,ENSG00000171711.3,DEFB4A,Alternative Isoform,MRVLYLLFSFLFIFLMPLPGVFGGIGDPVTCLKSGAICHPVFCPRR...
...,...,...,...,...,...,...,...,...,...
11570,SAA1 TLR4,SAA1,TLR4,ENSP00000497498.1,ENST00000649195.1,ENSG00000173432.13,SAA1,Alternative Isoform,MKLLTGLVFCSLVLGVSSRSFFSFLGEAFDGARDMWRAYSDMREAN...
11571,SAA1 TLR4,SAA1,TLR4,ENSP00000509190.1,ENST00000689650.1,ENSG00000173432.13,SAA1,Alternative Isoform,MKLLTGLVFCSLVLGVSSRSFFSFLGEAFDGARDMWRAYSDMREAN...
11572,SAA1 TLR4,SAA1,TLR4,ENSP00000348918.4,ENST00000356524.9,ENSG00000173432.13,SAA1,Alternative Isoform,MKLLTGLVFCSLVLGVSSRSFFSFLGEAFDGARDMWRAYSDMREAN...
11573,SAA1 TLR4,SAA1,TLR4,ENSP00000384906.2,ENST00000405158.2,ENSG00000173432.13,SAA1,Alternative Isoform,MKLLTGLVFCSLVLGVSSRSFFSFLGEAFDGARDMWRAYSDMREAN...


In [136]:
lim_df = lim_df.drop(columns=["Gene Symbol"])
lim_df = lim_df.rename(columns={"FASTA Sequence": "Ligand Sequence",
                                "Ensembl Protein ID": "Ligand Ensembl Protein ID",
                                "Ensembl Transcript ID": "Ligand Ensembl Transcript ID",
                                "Ensembl Gene ID": "Ligand Ensembl Gene ID",
                                "Isoform Type": "Ligand Isoform Type"})

In [137]:
lim_df

Unnamed: 0,Human LR Pair,Ligand,Receptor,Ligand Ensembl Protein ID,Ligand Ensembl Transcript ID,Ligand Ensembl Gene ID,Ligand Isoform Type,Ligand Sequence
0,CCL3L3 ACKR2,CCL3L3,ACKR2,ENSP00000480558.1,ENST00000619989.1,ENSG00000276085.1,Alternative Isoform,MQVSTAALAVLLCTMALCNQVLSAPLAADTPTACCFSYTSRQIPQN...
1,DEFB103B CCR2,DEFB103B,CCR2,ENSP00000324633.3,ENST00000318124.3,ENSG00000177243.3,Alternative Isoform,MRIHYLLFALLFLFLVPVPGHGGIINTLQKYYCRVRGGRCAVLSCL...
2,CCL3L3 CCR5,CCL3L3,CCR5,ENSP00000480558.1,ENST00000619989.1,ENSG00000276085.1,Alternative Isoform,MQVSTAALAVLLCTMALCNQVLSAPLAADTPTACCFSYTSRQIPQN...
3,DEFB103B CCR6,DEFB103B,CCR6,ENSP00000324633.3,ENST00000318124.3,ENSG00000177243.3,Alternative Isoform,MRIHYLLFALLFLFLVPVPGHGGIINTLQKYYCRVRGGRCAVLSCL...
4,DEFB4A CCR6,DEFB4A,CCR6,ENSP00000303532.2,ENST00000302247.3,ENSG00000171711.3,Alternative Isoform,MRVLYLLFSFLFIFLMPLPGVFGGIGDPVTCLKSGAICHPVFCPRR...
...,...,...,...,...,...,...,...,...
11570,SAA1 TLR4,SAA1,TLR4,ENSP00000497498.1,ENST00000649195.1,ENSG00000173432.13,Alternative Isoform,MKLLTGLVFCSLVLGVSSRSFFSFLGEAFDGARDMWRAYSDMREAN...
11571,SAA1 TLR4,SAA1,TLR4,ENSP00000509190.1,ENST00000689650.1,ENSG00000173432.13,Alternative Isoform,MKLLTGLVFCSLVLGVSSRSFFSFLGEAFDGARDMWRAYSDMREAN...
11572,SAA1 TLR4,SAA1,TLR4,ENSP00000348918.4,ENST00000356524.9,ENSG00000173432.13,Alternative Isoform,MKLLTGLVFCSLVLGVSSRSFFSFLGEAFDGARDMWRAYSDMREAN...
11573,SAA1 TLR4,SAA1,TLR4,ENSP00000384906.2,ENST00000405158.2,ENSG00000173432.13,Alternative Isoform,MKLLTGLVFCSLVLGVSSRSFFSFLGEAFDGARDMWRAYSDMREAN...


In [138]:
lim_df = lim_df.merge(df, how='left', left_on='Receptor', right_on='Gene Symbol')
lim_df = lim_df.drop(columns=["Gene Symbol"])
lim_df = lim_df.rename(columns={"FASTA Sequence": "Receptor Sequence",
                                "Ensembl Protein ID": "Receptor Ensembl Protein ID",
                                "Ensembl Transcript ID": "Receptor Ensembl Transcript ID",
                                "Ensembl Gene ID": "Receptor Ensembl Gene ID",
                                "Isoform Type": "Receptor Isoform Type"})

In [139]:
lim_df

Unnamed: 0,Human LR Pair,Ligand,Receptor,Ligand Ensembl Protein ID,Ligand Ensembl Transcript ID,Ligand Ensembl Gene ID,Ligand Isoform Type,Ligand Sequence,Receptor Ensembl Protein ID,Receptor Ensembl Transcript ID,Receptor Ensembl Gene ID,Receptor Isoform Type,Receptor Sequence
0,CCL3L3 ACKR2,CCL3L3,ACKR2,ENSP00000480558.1,ENST00000619989.1,ENSG00000276085.1,Alternative Isoform,MQVSTAALAVLLCTMALCNQVLSAPLAADTPTACCFSYTSRQIPQN...,ENSP00000416996.1,ENST00000422265.6,ENSG00000144648.16,Alternative Isoform,MAATASPQPLATEDADSENSSFYYYDYLDEVAFMLCRKDAVVSFGK...
1,CCL3L3 ACKR2,CCL3L3,ACKR2,ENSP00000480558.1,ENST00000619989.1,ENSG00000276085.1,Alternative Isoform,MQVSTAALAVLLCTMALCNQVLSAPLAADTPTACCFSYTSRQIPQN...,ENSP00000396150.1,ENST00000442925.5,ENSG00000144648.16,Alternative Isoform,MAATASPQPLATEDADSENSSFYYYDYLDEVAFMLCRKDAVVSFGK...
2,CCL3L3 ACKR2,CCL3L3,ACKR2,ENSP00000480558.1,ENST00000619989.1,ENSG00000276085.1,Alternative Isoform,MQVSTAALAVLLCTMALCNQVLSAPLAADTPTACCFSYTSRQIPQN...,ENSP00000476901.1,ENST00000497921.2,ENSG00000144648.16,Alternative Isoform,MAATASPQPLATEDADSENSSFYYYDYLDEVAFMLCRKDAVVSFGK...
3,CCL3L3 ACKR2,CCL3L3,ACKR2,ENSP00000480558.1,ENST00000619989.1,ENSG00000276085.1,Alternative Isoform,MQVSTAALAVLLCTMALCNQVLSAPLAADTPTACCFSYTSRQIPQN...,ENSP00000477475.1,ENST00000492609.1,ENSG00000144648.16,Alternative Isoform,MAATASPQPLATEDADSENSSFYYYDYLDEVAFMLCRKDAVVSFGK...
4,CCL3L3 ACKR2,CCL3L3,ACKR2,ENSP00000480558.1,ENST00000619989.1,ENSG00000276085.1,Alternative Isoform,MQVSTAALAVLLCTMALCNQVLSAPLAADTPTACCFSYTSRQIPQN...,ENSP00000477157.1,ENST00000494619.1,ENSG00000144648.16,Alternative Isoform,MAATASPQPLATEDADSENSSFYYYDYLDEVAFMLCRKDAVVSFGK...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
89921,SAA1 TLR4,SAA1,TLR4,ENSP00000384906.2,ENST00000405158.2,ENSG00000173432.13,Alternative Isoform,MKLLTGLVFCSLVLGVSSRSFFSFLGEAFDGARDMWRAYSDMREAN...,ENSP00000496429.1,ENST00000472304.2,ENSG00000136869.16,Alternative Isoform,MMSASRLAGTLIPAMAFLSCVRPESWEPCVEV
89922,SAA1 TLR4,SAA1,TLR4,ENSP00000384906.2,ENST00000405158.2,ENSG00000173432.13,Alternative Isoform,MKLLTGLVFCSLVLGVSSRSFFSFLGEAFDGARDMWRAYSDMREAN...,ENSP00000377997.4,ENST00000394487.5,ENSG00000136869.16,Alternative Isoform,MELNFYKIPDNLPFSTKNLDLSFNPLRHLGSYSFFSFPELQVLDLS...
89923,SAA1 TLR4,SAA1,TLR4,ENSP00000384906.2,ENST00000405158.2,ENSG00000173432.13,Alternative Isoform,MKLLTGLVFCSLVLGVSSRSFFSFLGEAFDGARDMWRAYSDMREAN...,ENSP00000363089.5,ENST00000355622.8,ENSG00000136869.16,Alternative Isoform,MMSASRLAGTLIPAMAFLSCVRPESWEPCVEVVPNITYQCMELNFY...
89924,SAA3P LY96,SAA3P,LY96,,,,,,ENSP00000284818.2,ENST00000284818.7,ENSG00000154589.7,Alternative Isoform,MLPFLFFSTLFSSIFTEAQKQYWVCNSSDASISYTYCDKMQYPISI...


In [140]:
lim_df.to_csv("data/LRpair_gencode_sequences.tsv", sep="\t", index=False)

####################################################################

In [223]:
## Function to scrape data from Pubmed for Title, Abstract, Journal, and Year
### IMPORTANT: TURN OFF VPN and make sure you have the data directory (from Sakura)

import sys
import requests
import pandas as pd
import time
import os
import xml.etree.ElementTree as ET

sys.path.append(os.path.abspath("src"))  
import fetchGSheet

# Read the API key from a file
with open("data/ncbi_api_key.txt", "r") as file:
    ncbi_api_key = file.read().strip()

# File to save the results
output_file = "data/pubmed_results.csv"

# Load your list of PMIDs
pmid_list = source

# Example of fetching HGNC gene symbols (you should have the `fetchGSheet.pop_up_info` dataframe ready)
def extract_hgnc_symbols(fetchGSheet):
    # Concatenate Approved, Alias, and Previous symbols, then extract unique symbols
    hgnc_symbols = pd.concat([
        fetchGSheet['Approved symbol'],
        fetchGSheet['Alias symbol'],
        fetchGSheet['Previous symbol']
    ], axis=0).dropna().str.upper().unique()  # Remove NaNs and make uppercase for matching
     # Remove any empty strings from the list
    hgnc_symbols = [symbol for symbol in hgnc_symbols if symbol != ""]
    return set(hgnc_symbols)  # Return as a set for fast lookup
    
hgnc_symbols = extract_hgnc_symbols(fetchGSheet.pop_up_info)

In [224]:
# Example of fetching HGNC gene symbols (you should have the `fetchGSheet.pop_up_info` dataframe ready)
def extract_hgnc_symbols(fetchGSheet):
    # Concatenate Approved, Alias, and Previous symbols, then extract unique symbols
    hgnc_symbols = pd.concat([
        fetchGSheet['Approved symbol'],
        fetchGSheet['Alias symbol'],
        fetchGSheet['Previous symbol']
    ], axis=0).dropna().str.upper().unique()  # Remove NaNs and make uppercase for matching
     # Remove any empty strings from the list
    hgnc_symbols = [symbol for symbol in hgnc_symbols if symbol != ""]
    return set(hgnc_symbols)  # Return as a set for fast lookup
hgnc_symbols = extract_hgnc_symbols(fetchGSheet.pop_up_info)

In [226]:
# Official species names and their corresponding terms (scientific names)
species_dict = {
    "human": "Homo sapiens",
    "mouse": "Mus musculus",
    "rat": "Rattus norvegicus",
    "rabbit": "Oryctolagus cuniculus",
    "monkey": "Macaca spp.",
    "dog": "Canis lupus familiaris",
    "pig": "Sus scrofa",
    "zebra fish": "Danio rerio",
    "chicken": "Gallus gallus",
    "horse": "Equus ferus caballus",
    "cat": "Felis catus",
    "sheep": "Ovis aries",
    "cow": "Bos taurus",
    "fruit fly": "Drosophila melanogaster",
    "c. elegans": "Caenorhabditis elegans",
}

def fetch_pubmed_data(pmid_list, hgnc_symbols):
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    results = []

    # Load existing data if output file exists
    if os.path.exists(output_file):
        existing_data = pd.read_csv(output_file)
    else:
        existing_data = pd.DataFrame(columns=["PMID", "Title", "Abstract", "Journal", "Year", "Species"])

    # Split PMIDs into batches
    batch_size = 50
    pmid_batches = [pmid_list[i:i + batch_size] for i in range(0, len(pmid_list), batch_size)]

    # Iterate over the batches
    for batch in pmid_batches:
        params = {
            "db": "pubmed",
            "id": ",".join(batch),  # Join PMIDs as comma-separated
            "retmode": "xml",
            "api_key": ncbi_api_key
        }

        try:
            response = requests.get(base_url, params=params)
            response.raise_for_status()

            # Parse the XML response
            root = ET.fromstring(response.text)
            for article in root.findall(".//PubmedArticle"):
                # Extract Title and Abstract
                title = article.findtext(".//ArticleTitle", default="N/A")
                abstract = article.findtext(".//AbstractText", default="No abstract available")

                # Extract Journal Title
                journal_tag = article.find(".//Journal/Title")
                journal = journal_tag.text.strip() if journal_tag is not None and journal_tag.text else "N/A"

                # Extract Publication Year
                pub_date = article.find(".//PubDate")
                if pub_date is not None:
                    year_tag = pub_date.find("Year")
                    year = year_tag.text if year_tag is not None else "N/A"

                    # Fallback to MedlineDate if Year is missing
                    if year == "N/A":
                        medline_date_tag = pub_date.find("MedlineDate")
                        year = medline_date_tag.text.split()[0] if medline_date_tag is not None else "N/A"
                else:
                    year = "N/A"  # PubDate is completely missing

                # Initialize species as N/A
                species = "N/A"

                # Check if the word "patient" is detected in title or abstract (assume human)
                if "patient" in title.lower() or "patient" in abstract.lower():
                    species = "Homo sapiens"
                elif "human" in title.lower() or "human" in abstract.lower():
                    species = "Homo sapiens"
                else:
                    # Look for HGNC gene symbols in title or abstract (assume human if found)
                    for gene in hgnc_symbols:
                        if gene in title or gene in abstract:
                            species = "Homo sapiens"
                            break
                    else:
                        # Look for MeSH terms related to species
                        for mesh_heading in article.findall(".//MeshHeadingList/MeshHeading"):
                            descriptor_name = mesh_heading.findtext("DescriptorName")
                            if descriptor_name:
                                # Match official species names using the species_dict
                                for species_term, scientific_name in species_dict.items():
                                    if species_term in descriptor_name.lower():
                                        species = scientific_name
                                        break  # Stop after finding the first match

                # Append the result
                results.append({
                    "PMID": article.findtext(".//MedlineCitation/PMID"),
                    "Title": title,
                    "Abstract": abstract,
                    "Journal": journal,
                    "Year": year,
                    "Species": species
                })

        except Exception as e:
            print(f"Error fetching batch {batch}: {e}")
            # Optionally save the response for debugging
            with open(f"error_batch_{batch[0]}_{batch[-1]}.xml", "w") as f:
                f.write(response.text)

        # Rate limiting to avoid API overload
        time.sleep(1)  # Increase delay for better API compliance

    # Save results
    new_data = pd.DataFrame(results)
    if not new_data.empty:
        # Merge existing and new data, updating missing values
        updated_data = pd.concat([existing_data, new_data])

        # Ensure all PMIDs are strings
        updated_data["PMID"] = updated_data["PMID"].astype(str)

        # Drop rows with missing PMIDs
        updated_data = updated_data.dropna(subset=["PMID"])

        # Ensure rows are ordered and remove duplicates
        updated_data = (
            updated_data.sort_values(by="PMID")  # Ensure rows are ordered
            .drop_duplicates(subset="PMID", keep="last")  # Keep the latest data
        )
        updated_data["Journal"] = updated_data["Journal"].str.split(" (", n=1, expand=False, regex=False).str[0]
        updated_data.to_csv(output_file, index=False)
    else:
        print("No new data fetched.")

    return results

# Fetch PubMed data with your list of PMIDs, output file path, and NCBI API key
fetch_pubmed_data(pmid_list, hgnc_symbols)

[{'PMID': '10025398',
  'Title': 'The integrin alpha v beta 6 binds and activates latent TGF beta 1: a mechanism for regulating pulmonary inflammation and fibrosis.',
  'Abstract': 'Transforming growth factor beta (TGF beta) family members are secreted in inactive complexes with a latency-associated peptide (LAP), a protein derived from the N-terminal region of the TGF beta gene product. Extracellular activation of these complexes is a critical but incompletely understood step in regulation of TGF beta function in vivo. We show that TGF beta 1 LAP is a ligand for the integrin alpha v beta 6 and that alpha v beta 6-expressing cells induce spatially restricted activation of TGF beta 1. This finding explains why mice lacking this integrin develop exaggerated inflammation and, as we show, are protected from pulmonary fibrosis. These data identify a novel mechanism for locally regulating TGF beta 1 function in vivo by regulating expression of the alpha v beta 6 integrin.',
  'Journal': 'Cel