In [1]:
import spacy
from transformers import pipeline

# ---------- Models ----------
# SciSpacy pipeline; its entities are filtered below for gene mentions.
nlp = spacy.load("en_ner_bionlp13cg_md")

# BioBERT token-classification pipeline for disease mentions. The results
# are read below through their "entity_group" and "word" keys.
disease_ner = pipeline(
    task="ner",
    model="alvaroalon2/biobert_diseases_ner",
    aggregation_strategy="simple",
)

# ---------- Example sentence ----------
text = "BRCA1 mutations increase the risk of ovarian cancer."

# ---------- Gene extraction ----------
# Keep only the spans that the SciSpacy model labels as genes / gene products.
doc = nlp(text)
gene_label = "GENE_OR_GENE_PRODUCT"
genes = [span.text for span in doc.ents if span.label_ == gene_label]

# ---------- Disease extraction ----------
# Keep only the entity groups that the BioBERT model tags as DISEASE.
disease_hits = disease_ner(text)
diseases = [hit["word"] for hit in disease_hits if hit["entity_group"] == "DISEASE"]

# ---------- Report ----------
print("Genes:", genes)
print("Diseases:", diseases)


  from .autonotebook import tqdm as notebook_tqdm
  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]
Device set to use cpu


Genes: ['BRCA1']
Diseases: ['ovarian cancer']


In [2]:
import re

import spacy
from transformers import pipeline

# ---------- Gene NER (SciSpacy) ----------
# Entities from this pipeline are later filtered on the
# "GENE_OR_GENE_PRODUCT" label.
nlp = spacy.load("en_ner_bionlp13cg_md")

# ---------- Disease NER (BioBERT, PyTorch) ----------
# framework="pt" explicitly requests the PyTorch backend; the results are
# later read through their "entity_group" and "word" keys.
disease_ner = pipeline(
    task="ner",
    model="alvaroalon2/biobert_diseases_ner",
    aggregation_strategy="simple",
    framework="pt",
)

# ---------- Relation keywords ----------
# Maps each relation category to the lower-case trigger phrases that signal
# it. extract_relations() reports a category when at least one of its
# phrases occurs in the (lower-cased) input text.
RELATION_KEYWORDS = {
    # Statistical or observational links between two entities.
    "association": [
        "associated with",
        "linked to",
        "related to",
        "correlated with",
    ],
    # Statements about elevated likelihood or predisposition.
    "risk": [
        "increase the risk of",
        "risk of",
        "predispose to",
    ],
    # Explicit causal wording.
    "cause": [
        "cause",
        "causes",
        "lead to",
        "results in",
    ],
    # Mentions of genetic alterations.
    "mutation_effect": [
        "mutation",
        "mutations",
        "variant",
        "variants",
    ],
}

def extract_relations(text, keywords=None):
    """Return the relation categories whose trigger phrases occur in ``text``.

    Matching is case-insensitive and restricted to whole words/phrases, so a
    keyword such as "cause" does not fire inside "because" and "variant" does
    not fire inside "invariant".

    Args:
        text: Sentence or passage to scan. ``None`` or an empty string yields
            an empty result.
        keywords: Optional mapping ``{relation_name: [phrase, ...]}``.
            Defaults to the module-level ``RELATION_KEYWORDS``.

    Returns:
        Alphabetically sorted list of the relation names that matched, each
        listed once. Sorting keeps the output stable between runs, which a
        plain ``list(set(...))`` does not guarantee.
    """
    if not text:
        return []
    if keywords is None:
        keywords = RELATION_KEYWORDS

    text_lower = text.lower()
    relations_found = set()

    for relation, phrases in keywords.items():
        for phrase in phrases:
            # Lookarounds instead of \b so phrases that start or end with a
            # non-word character are still anchored correctly.
            pattern = r"(?<!\w)" + re.escape(phrase.lower()) + r"(?!\w)"
            if re.search(pattern, text_lower):
                relations_found.add(relation)
                break  # one hit is enough for this category

    return sorted(relations_found)


# ---------- Input text ----------
text = "BRCA1 mutations increase the risk of ovarian cancer."

# ---------- Gene extraction ----------
# Keep only the spans labelled as genes / gene products by the SciSpacy model.
doc = nlp(text)
gene_label = "GENE_OR_GENE_PRODUCT"
genes = [span.text for span in doc.ents if span.label_ == gene_label]

# ---------- Disease extraction ----------
# Keep only the entity groups tagged DISEASE by the BioBERT pipeline.
disease_hits = disease_ner(text)
diseases = [hit["word"] for hit in disease_hits if hit["entity_group"] == "DISEASE"]

# ---------- Relation extraction ----------
# Keyword lookup against RELATION_KEYWORDS via extract_relations().
relations = extract_relations(text)

# ---------- Report ----------
print("Genes:", genes)
print("Diseases:", diseases)
print("Relations:", relations)


Device set to use cpu


Genes: ['BRCA1']
Diseases: ['ovarian cancer']
Relations: ['mutation_effect', 'risk']
