# Python Notebook

In [None]:
def add_row(change):
    global gene_pair
    # Add a new row at the top with None values
    new_row = {col: None for col in gene_pair.columns}
    gene_pair = pd.DataFrame([new_row] + gene_pair.to_dict(orient="records"))
    update_table()

# Function to remove the last row of the dataframe
def remove_row(change):
    global gene_pair
    if len(gene_pair) > 0:
        gene_pair = gene_pair[:-1]  # Remove the last row
        update_table()

In [None]:
## Function to convert Humans (HGNC symbol) to Other species ID via biomart
import os
import sys
from biomart import BiomartServer
import pandas as pd

# Add source directory to the path
sys.path.append(os.path.abspath("src"))
from createDataTable import hgnc_id

biomart_server_url = "http://www.ensembl.org/biomart"
server = BiomartServer(biomart_server_url)
# Select zebrafish dataset
dataset_name = "drerio_gene_ensembl"
dataset = server.datasets[dataset_name]
dataset

In [None]:
## Function to convert Humans (HGNC symbol) to Other species ID via biomart
import os
import sys
from biomart import BiomartServer
import pandas as pd

# Add source directory to the path
sys.path.append(os.path.abspath("src"))
from createDataTable import hgnc_id

# Species-specific parameters
species_id_prefix = "ZFIN"  # "RGD" for rat, "MGI" for mouse
dataset_name = "drerio_gene_ensembl" # "drerio_gene_ensembl" for zebra fish, "rnorvegicus_gene_ensembl" for rat, "mmusculus_gene_ensembl" for mouse
gene_id_field = "zfin_id"  # "external_gene_name" for mouse
gene_symbol_field = "zfin_symbol"  # "external_gene_name" for mouse
output_filename="data/hgnc_to_zfin_mapping.csv"

# Define function for conversion
def convert_hgnc_to_zfin(hgnc_ids, output_file=None):
    """
    Converts a list of HGNC IDs (human) to Other Species (e.g. ZFIN IDs (zebrafish)) using Ensembl Biomart.
    
    Args:
        hgnc_ids (list): A list of HGNC IDs to be converted.
        output_file (str): Path to save the conversion results (optional).
    
    Returns:
        pd.DataFrame: DataFrame containing input HGNC IDs and their corresponding ZFIN IDs.
    """
    # Biomart server configuration
    biomart_server_url = "http://www.ensembl.org/biomart"
    server = BiomartServer(biomart_server_url)
    dataset = server.datasets[dataset_name]

    # Query attributes
    attributes = [
        "ensembl_gene_id",  # Ensembl Gene ID
        "hgnc_id",          # HGNC ID (human)
         gene_id_field,      # Species to convert to
         gene_symbol_field
    ]
    
    # Build query
    response = dataset.search({
        "filters": {
            "hgnc_id": hgnc_ids
        },
        "attributes": attributes,
    })

    # Parse response to DataFrame
    results = pd.read_csv(response, sep="\t", header=None, names=attributes)

    # Save results if output_file is provided
    if output_file:
        results.to_csv(output_file, index=False)

    return results

# Example usage
if __name__ == "__main__":
    # Call function and print results
    result_df = convert_hgnc_to_zfin(hgnc_id, output_filename)
    print(result_df)


In [44]:
import pandas as pd
import re
import ijson
from transformers import AutoTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Set output file name
output_file = "data/llm_results.csv"
top_n = 8

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")

# Define your custom stop/include words
custom_stop_words = [
    "make", "significant", "activity", "made", "makes", "significantly",
    "activities", "activity", "attenuated", "induced", "enhanced", "attenuates", 
    "induces", "enhances", "available", "abstract", "rarely", "result", "results",
    "produced", "produce", "important", "prominent", "role", "11", "12", "10",
    "18", "19", "",
]
custom_include_keywords = [
    "cancer", "tumor", "tumour", "signaling", "signalling", "cell communication", "protein-coding",
    "protein coding", "non-protein coding"
    "lncrna", "lnc-rna", "differentiation", "immune response", "non-inflammatory", "inflammatory",
    "hypoxia", "TME", "microenvironment", "cell-cell", "cell-to-cell", "dendritic differentiation",
    "gene regulation", "heterogeneity", "growth factor"
]

# Combine default English stop words and custom stop words
combined_stop_words = list(ENGLISH_STOP_WORDS.union(custom_stop_words))

# Initialize CountVectorizer with the updated stop words
vectorizer = CountVectorizer(max_features=top_n, stop_words=combined_stop_words)


def count_tokens(text):
    """
    Count the number of tokens in a given text using the tokenizer.
    """
    return len(tokenizer.encode(text, truncation=False))


def chunk_abstracts(abstracts, max_tokens=16000):
    """
    Split the abstracts into chunks based on the maximum token limit.
    """
    chunks = []
    current_chunk = []
    current_tokens = 0

    for abstract in abstracts:
        abstract_tokens = count_tokens(abstract)
        if current_tokens + abstract_tokens > max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk = [abstract]
            current_tokens = abstract_tokens
        else:
            current_chunk.append(abstract)
            current_tokens += abstract_tokens

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

def extract_keywords(text, top_n=top_n):
    """
    Extract top N keywords (including multi-word keywords) from a text using CountVectorizer.
    Exclude combined stop words, ensure inclusion of custom keywords, and filter out numeric keywords.
    Handle empty vocabulary errors.
    """
    if not text.strip():
        return ""  # Return an empty string if the text is empty

    try:
        # Remove purely numeric words from the text
        filtered_text = " ".join(word for word in text.split() if not re.match(r"^\d+$", word))

        # Configure CountVectorizer to include n-grams (e.g., unigrams and bigrams)
        vectorizer = CountVectorizer(
            max_features=top_n,
            stop_words=combined_stop_words,
            ngram_range=(1, 5)  # This includes unigrams (single words), bigrams (two words) till 5 words
        )

        # Fit the vectorizer to the filtered text and extract keywords
        X = vectorizer.fit_transform([filtered_text])
        keywords = set(vectorizer.get_feature_names_out())

        # Check which custom keywords (including multi-word) are present in the original text
        found_custom_keywords = [keyword for keyword in custom_include_keywords if keyword in text.lower()]

        # Add the found custom keywords to the set of extracted keywords
        keywords.update(found_custom_keywords)

        return ", ".join(keywords)
    except ValueError as e:
        # Handle empty vocabulary error
        if "empty vocabulary" in str(e):
            return ""  # Return an empty string if no valid tokens remain
        raise  # Re-raise other unexpected errors


def process_human_lr_pair(human_lr_pair, abstracts):
    """
    Process each ligand-receptor pair and extract relevant biological keywords.
    """
    # Chunk abstracts into smaller parts
    chunks = chunk_abstracts(abstracts)
    all_keywords = []

    for chunk in chunks:
        # Extract keywords from each chunk
        keywords = extract_keywords(chunk)
        all_keywords.append(keywords)

    # Combine all the extracted keywords from each chunk
    return {"Human LR Pair": human_lr_pair, "Relevance Keywords": ", ".join(all_keywords)}


# Stream JSON and process incrementally
with open("data_for_llm.json", "r") as f:
    parser = ijson.items(f, "item")
    results = []

    for entry in parser:
        human_lr_pair = entry["Human LR Pair"]
        abstracts = entry["Abstracts"]
        result = process_human_lr_pair(human_lr_pair, abstracts)

        # Skip entries where keywords are empty or only numerical
        if not result["Relevance Keywords"] or result["Relevance Keywords"].replace(",", "").isdigit():
            continue

        results.append(result)

# Write the final results to CSV (overwrite mode)
pd.DataFrame(results).to_csv(output_file, mode="w", header=True, index=False)

print(f"Results written to {output_file}")


Results written to data/llm_results.csv


In [42]:
import pandas as pd
import ijson
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from joblib import Parallel, delayed
import json

# Load BioBERT for NER
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")
model = AutoModelForTokenClassification.from_pretrained("dmis-lab/biobert-base-cased-v1.1")
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, grouped_entities=True)

# Load summarizer
summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=0)  # Use GPU if available

output_file = "data/bio_filtered_llm_results.csv"

def count_tokens(text, tokenizer):
    """
    Count the number of tokens in the text.
    """
    return len(tokenizer.encode(text, truncation=False))

def chunk_abstracts(abstracts, tokenizer, max_tokens=16000):
    """
    Split abstracts into chunks without exceeding the token limit.
    """
    chunks = []
    current_chunk = []
    current_tokens = 0

    for abstract in abstracts:
        abstract_tokens = count_tokens(abstract, tokenizer)
        if current_tokens + abstract_tokens > max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk = [abstract]
            current_tokens = abstract_tokens
        else:
            current_chunk.append(abstract)
            current_tokens += abstract_tokens

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

# Predefined keywords for fallback mechanism
RELEVANT_KEYWORDS = {
    "cancer", "tumor", "pathway", "inflammation", "inflammatory", "protein coding", "protein-coding",
    "mouse", "mice", "zebrafish", "non-inflammatory", "HIV infection", "cytokine", "TME", "interferons",
    "apoptosis", "disease", "signaling","signalling", "metastasis", "microenvironment",
    "lncrna", "cell-cell", "cell to cell", "cell-to-cell", "cell communication",
    "disease", "disorder", "lncrna", "gene regulation", "growth factor", 
    "chemokine", "chemokines", "cell cycle", "proliferation", "immunoresponse", "immune", "toxicity", 
    "differentiation", "stem cell", "abnormalities", "in vitro", "in vivo", "in-vitro", "in-vivo",
    "heterogeneity", "hypoxia", "interleukin", "lnc-rna", "infection",
}

def extract_keywords_fallback(text):
    """
    Fallback to extract relevant keywords from the text.
    """
    found_keywords = [word for word in RELEVANT_KEYWORDS if word in text.lower()]
    print("Fallback Keywords:", found_keywords)  # Debugging
    return found_keywords

def extract_relevant_entities(summary):
    """
    Use BioBERT to extract relevant entities from the summary.
    If no entities are found, fallback to predefined keywords.
    """
    entities = ner_pipeline(summary)
    
    # Log all extracted entities for debugging
    print("Extracted Entities:", entities)
    
    # Filter entities by group
    relevant_entities = [
        entity["word"] for entity in entities
        if entity["entity_group"] in {"DISEASE", "PATHWAY", "GENE", "PROCESS"}
    ]
    
    if not relevant_entities:
        # No entities found by BioBERT, use fallback
        print("No BioBERT entities found. Using fallback mechanism.")
        relevant_entities = extract_keywords_fallback(summary)
    
    # Deduplicate and return results
    return list(set(relevant_entities))

def analyze_relevance_hf_with_chunks(abstracts, human_lr_pair, tokenizer, max_tokens=1024):
    """
    Analyze biological relevance by summarizing abstracts and extracting entities.
    Adds a fallback mechanism for keyword-based filtering.
    """
    chunks = chunk_abstracts(abstracts, tokenizer, max_tokens=max_tokens)
    filtered_entities = []

    for chunk in chunks:
        input_text = (
            f"PubMed Abstracts:\n{chunk}\n\n"
            f"Summarize the biological relevance of the interaction between the "
            f"ligand-receptor pair: {human_lr_pair}."
        )
        summary = summarizer(input_text, max_length=80, min_length=20, do_sample=False)
        summary_text = summary[0]["summary_text"]
        
        # Log summarizer output for debugging
        print("Summarizer Output:", summary_text)
        
        # Extract entities using BioBERT (with fallback)
        entities = extract_relevant_entities(summary_text)
        filtered_entities.extend(entities)

    if not filtered_entities:
        print(f"No relevant entities found for {human_lr_pair}")

    return ", ".join(set(filtered_entities))  # Combine and deduplicate

def process_human_lr_pair(human_lr_pair, abstracts, tokenizer):
    """
    Process a single ligand-receptor pair.
    """
    relevance = analyze_relevance_hf_with_chunks(abstracts, human_lr_pair, tokenizer)
    return {"Human LR Pair": human_lr_pair, "Relevance Keywords": relevance}

# Stream JSON and process incrementally
with open("data_for_llm.json", "r") as f:
    parser = ijson.items(f, "item")
    results = []

    for entry in parser:
        human_lr_pair = entry["Human LR Pair"]
        abstracts = entry["Abstracts"]
        result = process_human_lr_pair(human_lr_pair, abstracts, tokenizer)
        results.append(result)

        # Write results incrementally to CSV to save memory
        pd.DataFrame([result]).to_csv(output_file, mode='a', header=not bool(results), index=False)

print(f"Filtered results written to {output_file}")


Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use mps:0
Device set to use mps:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Summarizer Output: Chemokines play diverse roles in inflammatory and non-inflammatory situations via activation of heptahelical G-protein-coupled receptors. Many chemokine receptors can act as cofactors for cellular entry of human immunodeficiency virus (HIV) in vitro. CCR5, a receptor for chemokines MIP-1alpha (LD78alpha),
Extracted Entities: [{'entity_group': 'LABEL_1', 'score': np.float32(0.55091596), 'word': 'ch', 'start': 0, 'end': 2}, {'entity_group': 'LABEL_0', 'score': np.float32(0.5798595), 'word': '##emokines play diverse roles in inflammatory and non - inflammatory situations via activation of', 'start': 2, 'end': 95}, {'entity_group': 'LABEL_1', 'score': np.float32(0.55827105), 'word': 'heptahelical g', 'start': 96, 'end': 110}, {'entity_group': 'LABEL_0', 'score': np.float32(0.53384435), 'word': '- protein', 'start': 110, 'end': 118}, {'entity_group': 'LABEL_1', 'score': np.float32(0.5399335), 'word': '- coupled receptors', 'start': 118, 'end': 136}, {'entity_group': 'LABE

KeyboardInterrupt: 

In [45]:
## Function to create an evidence page per Human LR Pair with each tab per PMID

import sys
import os
import pandas as pd

# Paths
TEMPLATE_PATH = "HTML/pmidTemplate.html"
OUTPUT_DIR = "data/pubmed/"

# Add the src directory to the path for importing modules
sys.path.append(os.path.abspath("src"))
from createDataTable import gene_pair00

# Load PubMed data
pubmed_data = pd.read_csv("data/pubmed_results.csv")
pubmed_data["Year"] = pubmed_data["Year"].astype(str).str.replace(".0", 
                                                                  "", 
                                                                  regex=False).astype(int)

pubmed_data["PMID"] = pubmed_data["PMID"].astype(str)

# add llm results
bio_keywords = pd.read_csv("data/llm_results.csv")


# Replace spaces in "Human LR Pair" with a placeholder
gene_pair00["Human LR Pair"] = gene_pair00["Human LR Pair"].str.replace(" ", "——")

gene_pair000 = gene_pair00.merge(bio_keywords, how='left', left_on="Human LR Pair", right_on='Human LR Pair')
gene_pair000.

Unnamed: 0,Human LR Pair,PMID support,Relevance Keywords
0,CCL3L3——ACKR2,10364178,"mip, 1alphap, ccr5, non-inflammatory, mip 1alp..."
1,DEFB103B——CCR2,23390582,"dendritic, dendritic cells defb103, cells defb..."
2,CCL3L3——CCR5,1173455810364178,"human, 1alphap, ld78beta, ccr5, binding, mip, ..."
3,DEFB103B——CCR6,23390582,"dendritic, dendritic cells defb103, cells defb..."
4,DEFB4A——CCR6,1052134711714836,"chemokine, dendritic, immature, immune respons..."
...,...,...,...
2368,KIR2DL5A——PVR,36377656,"cancer, human, shp, kir2dl5 pvr, signaling, ki..."
2369,SAA1——SCARB1,15561721,"bi, saa, lipid, hdl, uptake, sr bi, selective, sr"
2370,SAA1——TLR2,18566366,"saa, tlr2, activation, expression, inflammator..."
2371,SAA1——TLR4,35247611,"phase response protein obesity, response, nafl..."


In [46]:
gene_pair000.columns

Index(['Human LR Pair', 'PMID support', 'Relevance Keywords'], dtype='object')

In [47]:
gene_pair000["Human LR Pair"]

0        CCL3L3——ACKR2
1       DEFB103B——CCR2
2         CCL3L3——CCR5
3       DEFB103B——CCR6
4         DEFB4A——CCR6
             ...      
2368     KIR2DL5A——PVR
2369      SAA1——SCARB1
2370        SAA1——TLR2
2371        SAA1——TLR4
2372       SAA3P——LY96
Name: Human LR Pair, Length: 2373, dtype: object

In [48]:
gene_pair000["Human LR Pair"]  = gene_pair000["Human LR Pair"].astype(str)
gene_pair000["Human LR Pair"]

0        CCL3L3——ACKR2
1       DEFB103B——CCR2
2         CCL3L3——CCR5
3       DEFB103B——CCR6
4         DEFB4A——CCR6
             ...      
2368     KIR2DL5A——PVR
2369      SAA1——SCARB1
2370        SAA1——TLR2
2371        SAA1——TLR4
2372       SAA3P——LY96
Name: Human LR Pair, Length: 2373, dtype: object