In [None]:
import json
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict

# Initialize BioBERT
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-v1.1")
model = AutoModel.from_pretrained("dmis-lab/biobert-v1.1")

def get_biobert_embeddings(text):
    """Get contextual embeddings from BioBERT"""
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

def extract_relation(sentence, entity1, entity2):
    """Extract relationship between two entities using BioBERT embeddings"""
    # Mark entities in sentence
    marked_sentence = sentence.replace(entity1, f"[E1]{entity1}[/E1]") \
                             .replace(entity2, f"[E2]{entity2}[/E2]")
    
    # Get embeddings for the full sentence and entities
    sentence_embedding = get_biobert_embeddings(marked_sentence)
    e1_embedding = get_biobert_embeddings(entity1)
    e2_embedding = get_biobert_embeddings(entity2)
    
    # Find context between entities
    e1_pos = marked_sentence.find(f"[E1]{entity1}[/E1]")
    e2_pos = marked_sentence.find(f"[E2]{entity2}[/E2]")
    
    if e1_pos < e2_pos:
        between_text = marked_sentence[e1_pos+len(entity1)+10:e2_pos]
    else:
        between_text = marked_sentence[e2_pos+len(entity2)+10:e1_pos]
    
    # Get relation type based on similarity to known patterns
    relation_type = classify_relation(between_text.strip())
    
    return {
        "relation_phrase": between_text.strip(),
        "relation_type": relation_type,
        "similarity_score": float(cosine_similarity(
            [e1_embedding], 
            [e2_embedding]
        )[0][0])
    }

def classify_relation(text):
    """Classify relationship type based on keywords and embeddings"""
    if not text:
        return "unknown"
    
    # Predefined relation patterns and their embeddings
    relation_patterns = {
        "association": ["associated with", "linked to", "related to"],
        "location": ["located on", "found on", "positioned at"],
        "mutation": ["mutated in", "variant in", "alteration in"],
        "deletion": ["deleted in", "loss of", "missing in"],
        "expression": ["expressed in", "overexpressed in", "underexpressed in"]
    }
    
    # Check for direct keyword matches first
    text_lower = text.lower()
    for rel_type, keywords in relation_patterns.items():
        if any(keyword in text_lower for keyword in keywords):
            return rel_type
    
    # If no direct match, use embedding similarity
    text_embedding = get_biobert_embeddings(text)
    best_match = "unknown"
    highest_sim = 0
    
    for rel_type, keywords in relation_patterns.items():
        for pattern in keywords:
            pattern_embedding = get_biobert_embeddings(pattern)
            sim = cosine_similarity([text_embedding], [pattern_embedding])[0][0]
            if sim > highest_sim:
                highest_sim = sim
                best_match = rel_type
    
    return best_match if highest_sim > 0.7 else "unknown"

def process_relations(input_file, output_file, limit=None):
    """Process JSON file and extract relations using BioBERT"""
    with open(input_file) as f:
        data = json.load(f)
    
    results = []
    for i, entry in enumerate(data[:limit] if limit else data):
        try:
            disease = entry['target1_word']
            chromosome = entry['target2_word']
            sentence = entry['sentence']
            
            relation = extract_relation(sentence, disease, chromosome)
            
            result = {
                "pmid": entry['pmid'],
                "disease": {
                    "name": disease,
                    "type": entry['target1_type'],
                    "identifier": entry['target1_identifier']
                },
                "chromosome": {
                    "name": chromosome,
                    "type": entry['target2_type'],
                    "identifier": entry['target2_identifier']
                },
                "sentence": sentence,
                "relation_info": relation
            }
            results.append(result)
            
        except Exception as e:
            print(f"Error processing entry {i}: {str(e)}")
            continue
    
    with open(output_file, 'w') as f:
        json.dump(results, f, indent=2)
    
    return results

if __name__ == "__main__":
    input_json = "./data/steps data output/step-4/Relation Extraction/disease_chromosome_relations.json"
    output_json = "./data/steps data output/step-5/biobert_disease_chromosome_relations.json"
    
    print("Starting relation extraction with BioBERT...")
    results = process_relations(input_json, output_json, limit=None)  # Remove limit for full processing
    
    print("\nRelation extraction complete!")
    print(f"Results saved to {output_json}")
    
    # Print examples
    print("\nExample extracted relations:")
    for i, rel in enumerate(results[:3]):
        print(f"\nExample {i+1}:")
        print(f"PMID: {rel['pmid']}")
        print(f"Disease: {rel['disease']['name']}")
        print(f"Chromosome: {rel['chromosome']['name']}")
        print(f"Relation phrase: {rel['relation_info']['relation_phrase']}")
        print(f"Relation type: {rel['relation_info']['relation_type']}")
        print(f"Similarity score: {rel['relation_info']['similarity_score']:.2f}")
        print(f"Sentence excerpt: {rel['sentence'][:100]}...")

Starting relation extraction with BioBERT...

Relation extraction complete!
Results saved to biobert_disease_chromosome_relations.json

Example extracted relations:

Example 1:
PMID: 26892345
Disease: recessive disease
Chromosome: 14q24
Relation phrase: 2q24.3 microdeletion that are associated with
Relation type: association
Similarity score: 0.73
Sentence excerpt: We performed exome sequencing to examine other causes for the phenotype and queried genes present in...

Example 2:
PMID: 26892345
Disease: recessive disease
Chromosome: 2q24.3
Relation phrase: microdeletion that are associated with
Relation type: association
Similarity score: 0.74
Sentence excerpt: We performed exome sequencing to examine other causes for the phenotype and queried genes present in...

Example 3:
PMID: 18281524
Disease: tumors
Chromosome: 19p
Relation phrase: include amplifications of 6q, 7q, 12q, and
Relation type: mutation
Similarity score: 0.73
Sentence excerpt: Genomic aberrations in regions associated w

In [72]:
len(results)

77

In [None]:
import json
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict

# Load BioBERT
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-v1.1")
model = AutoModel.from_pretrained("dmis-lab/biobert-v1.1")

def get_biobert_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

def extract_relation(sentence, disease, gene):
    marked_sentence = sentence.replace(disease, f"[E1]{disease}[/E1]") \
                              .replace(gene, f"[E2]{gene}[/E2]")

    sentence_embedding = get_biobert_embeddings(marked_sentence)
    disease_embedding = get_biobert_embeddings(disease)
    gene_embedding = get_biobert_embeddings(gene)

    e1_pos = marked_sentence.find(f"[E1]{disease}[/E1]")
    e2_pos = marked_sentence.find(f"[E2]{gene}[/E2]")

    if e1_pos < e2_pos:
        between_text = marked_sentence[e1_pos+len(disease)+10:e2_pos]
    else:
        between_text = marked_sentence[e2_pos+len(gene)+10:e1_pos]

    relation_type = classify_relation(between_text.strip())

    return {
        "relation_phrase": between_text.strip(),
        "relation_type": relation_type,
        "similarity_score": float(cosine_similarity(
            [disease_embedding], 
            [gene_embedding]
        )[0][0])
    }

def classify_relation(text):
    if not text:
        return "unknown"

    # Updated relation patterns to reflect disease-gene associations
    relation_patterns = {
        "association": ["associated with", "linked to", "related to", "connected to"],
        "causation": ["caused by", "due to mutation in", "responsible for", "results from"],
        "mutation": ["mutation in", "mutations in", "variant of", "altered in"],
        "expression": ["expression of", "overexpressed", "underexpressed", "silenced"],
        "susceptibility": ["confers susceptibility", "predisposes to", "risk gene for"]
    }

    text_lower = text.lower()
    for rel_type, keywords in relation_patterns.items():
        if any(keyword in text_lower for keyword in keywords):
            return rel_type

    text_embedding = get_biobert_embeddings(text)
    best_match = "unknown"
    highest_sim = 0

    for rel_type, keywords in relation_patterns.items():
        for pattern in keywords:
            pattern_embedding = get_biobert_embeddings(pattern)
            sim = cosine_similarity([text_embedding], [pattern_embedding])[0][0]
            if sim > highest_sim:
                highest_sim = sim
                best_match = rel_type

    return best_match if highest_sim > 0.7 else "unknown"

def process_relations(input_file, output_file, limit=None):
    with open(input_file) as f:
        data = json.load(f)

    results = []
    for i, entry in enumerate(data[:limit] if limit else data):
        try:
            disease = entry['target1_word']
            gene = entry['target2_word']
            sentence = entry['sentence']

            relation = extract_relation(sentence, disease, gene)

            result = {
                "pmid": entry['pmid'],
                "disease": {
                    "name": disease,
                    "type": entry['target1_type'],
                    "identifier": entry['target1_identifier']
                },
                "gene": {
                    "name": gene,
                    "type": entry['target2_type'],
                    "identifier": entry['target2_identifier']
                },
                "sentence": sentence,
                "relation_info": relation
            }
            results.append(result)

        except Exception as e:
            print(f"Error processing entry {i}: {str(e)}")
            continue

    with open(output_file, 'w') as f:
        json.dump(results, f, indent=2)

    return results

if __name__ == "__main__":
    input_json = "./data/steps data output/step-4/Relation Extraction/disease_gene_relations.json"
    output_json = "./data/steps data output/step-5/biobert_disease_gene_relations.json"

    print("Starting relation extraction between disease and gene with BioBERT...")
    results = process_relations(input_json, output_json,limit=None)

    print("\nRelation extraction complete!")
    print(f"Results saved to {output_json}")

    print("\nExample extracted relations:")
    for i, rel in enumerate(results[:3]):
        print(f"\nExample {i+1}:")
        print(f"PMID: {rel['pmid']}")
        print(f"Disease: {rel['disease']['name']}")
        print(f"Gene: {rel['gene']['name']}")
        print(f"Relation phrase: {rel['relation_info']['relation_phrase']}")
        print(f"Relation type: {rel['relation_info']['relation_type']}")
        print(f"Similarity score: {rel['relation_info']['similarity_score']:.2f}")
        print(f"Sentence excerpt: {rel['sentence'][:100]}...")


In [74]:
len(results)

6063

In [None]:
import json
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
from functools import lru_cache

# Load BioBERT and move to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-v1.1")
model = AutoModel.from_pretrained("dmis-lab/biobert-v1.1").to(device)
model.eval()

# Cache to avoid re-embedding repeated text
@lru_cache(maxsize=10000)
def get_biobert_embedding_cached(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()

def get_biobert_embeddings(text):
    # Wrap the cached function for compatibility
    return get_biobert_embedding_cached(text)

def extract_relation(sentence, disease1, disease2):
    marked_sentence = sentence.replace(disease1, f"[E1]{disease1}[/E1]") \
                              .replace(disease2, f"[E2]{disease2}[/E2]")

    sentence_embedding = get_biobert_embeddings(marked_sentence)
    d1_embedding = get_biobert_embeddings(disease1)
    d2_embedding = get_biobert_embeddings(disease2)

    e1_pos = marked_sentence.find(f"[E1]{disease1}[/E1]")
    e2_pos = marked_sentence.find(f"[E2]{disease2}[/E2]")

    if e1_pos < e2_pos:
        between_text = marked_sentence[e1_pos+len(disease1)+10:e2_pos]
    else:
        between_text = marked_sentence[e2_pos+len(disease2)+10:e1_pos]

    relation_type = classify_relation(between_text.strip())
    similarity = float(cosine_similarity([d1_embedding], [d2_embedding])[0][0])

    return {
        "relation_phrase": between_text.strip(),
        "relation_type": relation_type,
        "similarity_score": similarity
    }

def classify_relation(text):
    if not text:
        return "unknown"

    relation_patterns = {
        "comorbidity": ["co-occurs with", "comorbid with", "coexisting with"],
        "differential": ["differential diagnosis", "distinguished from"],
        "shared_risk": ["share risk factors", "similar causes", "common etiology"],
        "progression": ["leads to", "develops into", "progresses to"]
    }

    text_lower = text.lower()
    for rel_type, keywords in relation_patterns.items():
        if any(keyword in text_lower for keyword in keywords):
            return rel_type

    text_embedding = get_biobert_embeddings(text)
    best_match = "unknown"
    highest_sim = 0

    for rel_type, keywords in relation_patterns.items():
        for pattern in keywords:
            pattern_embedding = get_biobert_embeddings(pattern)
            sim = cosine_similarity([text_embedding], [pattern_embedding])[0][0]
            if sim > highest_sim:
                highest_sim = sim
                best_match = rel_type

    return best_match if highest_sim > 0.7 else "unknown"

def process_relations(input_file, output_file, limit=None):
    with open(input_file) as f:
        data = json.load(f)

    results = []
    for i, entry in enumerate(data[:limit] if limit else data):
        try:
            disease1 = entry['target1_word']
            disease2 = entry['target2_word']
            sentence = entry['sentence']

            relation = extract_relation(sentence, disease1, disease2)

            result = {
                "pmid": entry['pmid'],
                "disease1": {
                    "name": disease1,
                    "type": entry['target1_type'],
                    "identifier": entry['target1_identifier']
                },
                "disease2": {
                    "name": disease2,
                    "type": entry['target2_type'],
                    "identifier": entry['target2_identifier']
                },
                "sentence": sentence,
                "relation_info": relation
            }
            results.append(result)

        except Exception as e:
            print(f"Error processing entry {i}: {str(e)}")
            continue

    with open(output_file, 'w') as f:
        json.dump(results, f, indent=2)

    return results

if __name__ == "__main__":
    input_json = "./data/steps data output/step-4/Relation Extraction/disease_disease_relations.json"
    output_json = "./data/steps data output/step-5/biobert_disease_disease_relations.json"

    print("Starting relation extraction between disease and disease with BioBERT...")
    results = process_relations(input_json, output_json)

    print("\nRelation extraction complete!")
    print(f"Results saved to {output_json}")

    print("\nExample extracted relations:")
    for i, rel in enumerate(results[:3]):
        print(f"\nExample {i+1}:")
        print(f"PMID: {rel['pmid']}")
        print(f"Disease 1: {rel['disease1']['name']}")
        print(f"Disease 2: {rel['disease2']['name']}")
        print(f"Relation phrase: {rel['relation_info']['relation_phrase']}")
        print(f"Relation type: {rel['relation_info']['relation_type']}")
        print(f"Similarity score: {rel['relation_info']['similarity_score']:.2f}")
        print(f"Sentence excerpt: {rel['sentence'][:100]}...")

Starting relation extraction between disease and disease with BioBERT...


In [None]:
len(results)

In [None]:
import json
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
from functools import lru_cache

# Load BioBERT with GPU support
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-v1.1")
model = AutoModel.from_pretrained("dmis-lab/biobert-v1.1").to(device)
model.eval()

# Cache embeddings to avoid redundant computation
@lru_cache(maxsize=10000)
def get_biobert_embedding_cached(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()

def get_biobert_embeddings(text):
    return get_biobert_embedding_cached(text)

def extract_relation(sentence, disease, variant):
    marked_sentence = sentence.replace(disease, f"[E1]{disease}[/E1]") \
                              .replace(variant, f"[E2]{variant}[/E2]")

    sentence_embedding = get_biobert_embeddings(marked_sentence)
    d_embedding = get_biobert_embeddings(disease)
    v_embedding = get_biobert_embeddings(variant)

    e1_pos = marked_sentence.find(f"[E1]{disease}[/E1]")
    e2_pos = marked_sentence.find(f"[E2]{variant}[/E2]")

    if e1_pos < e2_pos:
        between_text = marked_sentence[e1_pos+len(disease)+10:e2_pos]
    else:
        between_text = marked_sentence[e2_pos+len(variant)+10:e1_pos]

    relation_type = classify_relation(between_text.strip())

    return {
        "relation_phrase": between_text.strip(),
        "relation_type": relation_type,
        "similarity_score": float(cosine_similarity([d_embedding], [v_embedding])[0][0])
    }

def classify_relation(text):
    if not text:
        return "unknown"

    relation_patterns = {
        "causative": ["caused by", "results from", "due to"],
        "mutation": ["mutation in", "mutated", "variant of", "carrying the variant", "substitution in"],
        "risk": ["associated with", "linked to", "confers risk for", "increases susceptibility to"],
        "protective": ["protective against", "reduces risk of", "negatively associated with"]
    }

    text_lower = text.lower()
    for rel_type, keywords in relation_patterns.items():
        if any(keyword in text_lower for keyword in keywords):
            return rel_type

    text_embedding = get_biobert_embeddings(text)
    best_match = "unknown"
    highest_sim = 0

    for rel_type, keywords in relation_patterns.items():
        for pattern in keywords:
            pattern_embedding = get_biobert_embeddings(pattern)
            sim = cosine_similarity([text_embedding], [pattern_embedding])[0][0]
            if sim > highest_sim:
                highest_sim = sim
                best_match = rel_type

    return best_match if highest_sim > 0.7 else "unknown"

def process_relations(input_file, output_file, limit=None):
    with open(input_file) as f:
        data = json.load(f)

    results = []
    for i, entry in enumerate(data[:limit] if limit else data):
        try:
            disease = entry['target1_word']
            variant = entry['target2_word']
            sentence = entry['sentence']

            relation = extract_relation(sentence, disease, variant)

            result = {
                "pmid": entry['pmid'],
                "disease": {
                    "name": disease,
                    "type": entry['target1_type'],
                    "identifier": entry['target1_identifier']
                },
                "variant": {
                    "name": variant,
                    "type": entry['target2_type'],
                    "identifier": entry['target2_identifier']
                },
                "sentence": sentence,
                "relation_info": relation
            }
            results.append(result)

        except Exception as e:
            print(f"Error processing entry {i}: {str(e)}")
            continue

    with open(output_file, 'w') as f:
        json.dump(results, f, indent=2)

    return results

if __name__ == "__main__":
    input_json = "./data/steps data output/step-4/Relation Extraction/disease_variant_relations.json"
    output_json = "./data/steps data output/step-5/biobert_disease_variant_relations.json"

    print("Starting relation extraction between disease and variant with BioBERT...")
    results = process_relations(input_json, output_json)

    print("\nRelation extraction complete!")
    print(f"Results saved to {output_json}")

    print("\nExample extracted relations:")
    for i, rel in enumerate(results[:3]):
        print(f"\nExample {i+1}:")
        print(f"PMID: {rel['pmid']}")
        print(f"Disease: {rel['disease']['name']}")
        print(f"Variant: {rel['variant']['name']}")
        print(f"Relation phrase: {rel['relation_info']['relation_phrase']}")
        print(f"Relation type: {rel['relation_info']['relation_type']}")
        print(f"Similarity score: {rel['relation_info']['similarity_score']:.2f}")
        print(f"Sentence excerpt: {rel['sentence'][:100]}...")


In [3]:
len(results)

264

In [None]:
import json
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
from functools import lru_cache

# Load BioBERT with GPU acceleration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-v1.1")
model = AutoModel.from_pretrained("dmis-lab/biobert-v1.1").to(device)
model.eval()

# Embedding cache to speed up repeated computations
@lru_cache(maxsize=10000)
def get_biobert_embedding_cached(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()

def get_biobert_embeddings(text):
    return get_biobert_embedding_cached(text)

def extract_relation(sentence, disease, chemical):
    marked_sentence = sentence.replace(disease, f"[E1]{disease}[/E1]") \
                              .replace(chemical, f"[E2]{chemical}[/E2]")

    sentence_embedding = get_biobert_embeddings(marked_sentence)
    d_embedding = get_biobert_embeddings(disease)
    c_embedding = get_biobert_embeddings(chemical)

    e1_pos = marked_sentence.find(f"[E1]{disease}[/E1]")
    e2_pos = marked_sentence.find(f"[E2]{chemical}[/E2]")

    if e1_pos < e2_pos:
        between_text = marked_sentence[e1_pos + len(disease) + 10:e2_pos]
    else:
        between_text = marked_sentence[e2_pos + len(chemical) + 10:e1_pos]

    relation_type = classify_relation(between_text.strip())

    return {
        "relation_phrase": between_text.strip(),
        "relation_type": relation_type,
        "similarity_score": float(cosine_similarity([d_embedding], [c_embedding])[0][0])
    }

def classify_relation(text):
    if not text:
        return "unknown"

    # Relation types between disease and chemical
    relation_patterns = {
        "treatment": ["treated with", "alleviated by", "managed with", "therapy using"],
        "induction": ["induced by", "caused by", "triggered by"],
        "inhibition": ["inhibited by", "suppressed by"],
        "resistance": ["resistant to", "not affected by", "ineffective against"],
        "side_effect": ["adverse effect", "toxicity", "harmful to", "exacerbates"]
    }

    text_lower = text.lower()
    for rel_type, keywords in relation_patterns.items():
        if any(keyword in text_lower for keyword in keywords):
            return rel_type

    text_embedding = get_biobert_embeddings(text)
    best_match = "unknown"
    highest_sim = 0

    for rel_type, keywords in relation_patterns.items():
        for pattern in keywords:
            pattern_embedding = get_biobert_embeddings(pattern)
            sim = cosine_similarity([text_embedding], [pattern_embedding])[0][0]
            if sim > highest_sim:
                highest_sim = sim
                best_match = rel_type

    return best_match if highest_sim > 0.7 else "unknown"

def process_relations(input_file, output_file, limit=None):
    with open(input_file) as f:
        data = json.load(f)

    results = []
    for i, entry in enumerate(data[:limit] if limit else data):
        try:
            disease = entry['target1_word']
            chemical = entry['target2_word']
            sentence = entry['sentence']

            relation = extract_relation(sentence, disease, chemical)

            result = {
                "pmid": entry['pmid'],
                "disease": {
                    "name": disease,
                    "type": entry['target1_type'],
                    "identifier": entry['target1_identifier']
                },
                "chemical": {
                    "name": chemical,
                    "type": entry['target2_type'],
                    "identifier": entry['target2_identifier']
                },
                "sentence": sentence,
                "relation_info": relation
            }
            results.append(result)

        except Exception as e:
            print(f"Error processing entry {i}: {str(e)}")
            continue

    with open(output_file, 'w') as f:
        json.dump(results, f, indent=2)

    return results

if __name__ == "__main__":
    input_json = "./data/steps data output/step-4/Relation Extraction/disease_chemical_relations.json"
    output_json = "./data/steps data output/step-5/biobert_disease_chemical_relations.json"

    print("Starting relation extraction between disease and chemical with BioBERT...")
    results = process_relations(input_json, output_json)

    print("\nRelation extraction complete!")
    print(f"Results saved to {output_json}")

    print("\nExample extracted relations:")
    for i, rel in enumerate(results[:3]):
        print(f"\nExample {i+1}:")
        print(f"PMID: {rel['pmid']}")
        print(f"Disease: {rel['disease']['name']}")
        print(f"Chemical: {rel['chemical']['name']}")
        print(f"Relation phrase: {rel['relation_info']['relation_phrase']}")
        print(f"Relation type: {rel['relation_info']['relation_type']}")
        print(f"Similarity score: {rel['relation_info']['similarity_score']:.2f}")
        print(f"Sentence excerpt: {rel['sentence'][:100]}...")

Starting relation extraction between disease and chemical with BioBERT...

Relation extraction complete!
Results saved to biobert_disease_chemical_relations.json

Example extracted relations:

Example 1:
PMID: 25588595
Disease: systemic embolism
Chemical: rivaroxaban
Relation phrase: is used to prevent stroke and
Relation type: side_effect
Similarity score: 0.78
Sentence excerpt: BACKGROUND: In nonvalvular atrial fibrillation (NVAF), rivaroxaban is used to prevent stroke and sys...

Example 2:
PMID: 25588595
Disease: stroke
Chemical: rivaroxaban
Relation phrase: is used to prevent
Relation type: induction
Similarity score: 0.68
Sentence excerpt: BACKGROUND: In nonvalvular atrial fibrillation (NVAF), rivaroxaban is used to prevent stroke and sys...

Example 3:
PMID: 25588595
Disease: atrial fibrillation
Chemical: rivaroxaban
Relation phrase: (NVAF),
Relation type: treatment
Similarity score: 0.73
Sentence excerpt: BACKGROUND: In nonvalvular atrial fibrillation (NVAF), rivaroxaban is use

In [11]:
len(results)

13598

In [None]:
import json
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
from functools import lru_cache

# Load BioBERT model with GPU support if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-v1.1")
model = AutoModel.from_pretrained("dmis-lab/biobert-v1.1").to(device)
model.eval()

# Cache for embedding repeated terms (genes, keywords, etc.)
@lru_cache(maxsize=10000)
def get_biobert_embedding_cached(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()

def get_biobert_embeddings(text):
    return get_biobert_embedding_cached(text)

def extract_relation(sentence, gene1, gene2):
    marked_sentence = sentence.replace(gene1, f"[E1]{gene1}[/E1]") \
                              .replace(gene2, f"[E2]{gene2}[/E2]")

    sentence_embedding = get_biobert_embeddings(marked_sentence)
    g1_embedding = get_biobert_embeddings(gene1)
    g2_embedding = get_biobert_embeddings(gene2)

    e1_pos = marked_sentence.find(f"[E1]{gene1}[/E1]")
    e2_pos = marked_sentence.find(f"[E2]{gene2}[/E2]")

    if e1_pos < e2_pos:
        between_text = marked_sentence[e1_pos + len(gene1) + 10:e2_pos]
    else:
        between_text = marked_sentence[e2_pos + len(gene2) + 10:e1_pos]

    relation_type = classify_relation(between_text.strip())

    return {
        "relation_phrase": between_text.strip(),
        "relation_type": relation_type,
        "similarity_score": float(cosine_similarity([g1_embedding], [g2_embedding])[0][0])
    }

def classify_relation(text):
    if not text:
        return "unknown"

    relation_patterns = {
        "interaction": ["interacts with", "binds to", "forms complex with"],
        "regulation": ["regulates", "upregulates", "downregulates", "inhibits", "activates"],
        "coexpression": ["co-expressed with", "coexpression of", "correlated with"],
        "pathway": ["in same pathway", "participates in", "part of pathway"],
        "genetic_link": ["genetically linked", "functionally associated", "epistatic to"]
    }

    text_lower = text.lower()
    for rel_type, keywords in relation_patterns.items():
        if any(keyword in text_lower for keyword in keywords):
            return rel_type

    text_embedding = get_biobert_embeddings(text)
    best_match = "unknown"
    highest_sim = 0

    for rel_type, keywords in relation_patterns.items():
        for pattern in keywords:
            pattern_embedding = get_biobert_embeddings(pattern)
            sim = cosine_similarity([text_embedding], [pattern_embedding])[0][0]
            if sim > highest_sim:
                highest_sim = sim
                best_match = rel_type

    return best_match if highest_sim > 0.7 else "unknown"

def process_relations(input_file, output_file, limit=None):
    with open(input_file) as f:
        data = json.load(f)

    results = []
    for i, entry in enumerate(data[:limit] if limit else data):
        try:
            gene1 = entry['target1_word']
            gene2 = entry['target2_word']
            sentence = entry['sentence']

            relation = extract_relation(sentence, gene1, gene2)

            result = {
                "pmid": entry['pmid'],
                "gene1": {
                    "name": gene1,
                    "type": entry['target1_type'],
                    "identifier": entry['target1_identifier']
                },
                "gene2": {
                    "name": gene2,
                    "type": entry['target2_type'],
                    "identifier": entry['target2_identifier']
                },
                "sentence": sentence,
                "relation_info": relation
            }
            results.append(result)

        except Exception as e:
            print(f"Error processing entry {i}: {str(e)}")
            continue

    with open(output_file, 'w') as f:
        json.dump(results, f, indent=2)

    return results

if __name__ == "__main__":
    input_json = "./data/steps data output/step-4/Relation Extraction/gene_gene_relations.json"
    output_json = "./data/steps data output/step-5/biobert_gene_gene_relations.json"

    print("Starting relation extraction between gene and gene with BioBERT...")
    results = process_relations(input_json, output_json)

    print("\nRelation extraction complete!")
    print(f"Results saved to {output_json}")

    print("\nExample extracted relations:")
    for i, rel in enumerate(results[:3]):
        print(f"\nExample {i+1}:")
        print(f"PMID: {rel['pmid']}")
        print(f"Gene 1: {rel['gene1']['name']}")
        print(f"Gene 2: {rel['gene2']['name']}")
        print(f"Relation phrase: {rel['relation_info']['relation_phrase']}")
        print(f"Relation type: {rel['relation_info']['relation_type']}")
        print(f"Similarity score: {rel['relation_info']['similarity_score']:.2f}")
        print(f"Sentence excerpt: {rel['sentence'][:100]}...")

Starting relation extraction between gene and gene with BioBERT...

Relation extraction complete!
Results saved to biobert_gene_gene_relations.json

Example extracted relations:

Example 1:
PMID: 30173558
Gene 1: MMP13
Gene 2: COL2A1
Relation phrase: and
Relation type: pathway
Similarity score: 0.81
Sentence excerpt: MMP13 and COL2A1 were more highly expressed in medial versus lateral compartment....

Example 2:
PMID: 33041797
Gene 1: MMP13
Gene 2: NFKBIA
Relation phrase: NLRP3, TRIM21, GBP1, ADORA2A, PTAFR, TNF, MLNR, IL1B,
Relation type: coexpression
Similarity score: 0.80
Sentence excerpt: The shared genes included MMP13, NLRP3, TRIM21, GBP1, ADORA2A, PTAFR, TNF, MLNR, IL1B, NFKBIA, ADRB2...

Example 3:
PMID: 33041797
Gene 1: MMP13
Gene 2: IL6
Relation phrase: NLRP3, TRIM21, GBP1, ADORA2A, PTAFR, TNF, MLNR, IL1B, NFKBIA, ADRB2, and
Relation type: coexpression
Similarity score: 0.85
Sentence excerpt: The shared genes included MMP13, NLRP3, TRIM21, GBP1, ADORA2A, PTAFR, TNF, MLNR, IL1

In [9]:
len(results)

8724

In [None]:
import json
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
from functools import lru_cache

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load BioBERT model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-v1.1")
model = AutoModel.from_pretrained("dmis-lab/biobert-v1.1").to(device).eval()

# Cache frequently used embeddings to avoid redundant computation
@lru_cache(maxsize=10000)
def get_biobert_embedding_cached(text: str):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()

def extract_relation(sentence, gene, chemical):
    marked_sentence = sentence.replace(gene, f"[E1]{gene}[/E1]") \
                              .replace(chemical, f"[E2]{chemical}[/E2]")

    g_embedding = get_biobert_embedding_cached(gene)
    c_embedding = get_biobert_embedding_cached(chemical)

    e1_pos = marked_sentence.find(f"[E1]{gene}[/E1]")
    e2_pos = marked_sentence.find(f"[E2]{chemical}[/E2]")

    if e1_pos < e2_pos:
        between_text = marked_sentence[e1_pos + len(gene) + 10:e2_pos]
    else:
        between_text = marked_sentence[e2_pos + len(chemical) + 10:e1_pos]

    relation_phrase = between_text.strip()
    relation_type = classify_relation(relation_phrase)

    return {
        "relation_phrase": relation_phrase,
        "relation_type": relation_type,
        "similarity_score": float(cosine_similarity([g_embedding], [c_embedding])[0][0])
    }

def classify_relation(text):
    if not text:
        return "unknown"

    relation_patterns = {
        "activation": ["activated by", "induces", "stimulates", "enhances"],
        "inhibition": ["inhibited by", "suppressed by", "downregulated by"],
        "binding": ["binds to", "interacts with", "affinity for"],
        "regulation": ["regulated by", "controlled by", "modulated by"],
        "metabolism": ["metabolized by", "processed by", "biotransformed by"]
    }

    text_lower = text.lower()
    for rel_type, keywords in relation_patterns.items():
        if any(keyword in text_lower for keyword in keywords):
            return rel_type

    text_embedding = get_biobert_embedding_cached(text)
    best_match = "unknown"
    highest_sim = 0.0

    for rel_type, keywords in relation_patterns.items():
        for keyword in keywords:
            pattern_embedding = get_biobert_embedding_cached(keyword)
            sim = cosine_similarity([text_embedding], [pattern_embedding])[0][0]
            if sim > highest_sim:
                highest_sim = sim
                best_match = rel_type

    return best_match if highest_sim > 0.7 else "unknown"

def process_relations(input_file, output_file, limit=None):
    with open(input_file) as f:
        data = json.load(f)

    results = []
    for i, entry in enumerate(data[:limit] if limit else data):
        try:
            gene = entry['target1_word']
            chemical = entry['target2_word']
            sentence = entry['sentence']

            relation = extract_relation(sentence, gene, chemical)

            result = {
                "pmid": entry['pmid'],
                "gene": {
                    "name": gene,
                    "type": entry['target1_type'],
                    "identifier": entry['target1_identifier']
                },
                "chemical": {
                    "name": chemical,
                    "type": entry['target2_type'],
                    "identifier": entry['target2_identifier']
                },
                "sentence": sentence,
                "relation_info": relation
            }
            results.append(result)

        except Exception as e:
            print(f"Error processing entry {i}: {str(e)}")
            continue

    with open(output_file, 'w') as f:
        json.dump(results, f, indent=2)

    return results

if __name__ == "__main__":
    input_json = "./data/steps data output/step-4/Relation Extraction/gene_chemical_relations.json"
    output_json = "./data/steps data output/step-5/biobert_gene_chemical_relations.json"

    print("Starting relation extraction between gene and chemical with BioBERT...")
    results = process_relations(input_json, output_json)

    print("\nRelation extraction complete!")
    print(f"Results saved to {output_json}")

    print("\nExample extracted relations:")
    for i, rel in enumerate(results[:3]):
        print(f"\nExample {i+1}:")
        print(f"PMID: {rel['pmid']}")
        print(f"Gene: {rel['gene']['name']}")
        print(f"Chemical: {rel['chemical']['name']}")
        print(f"Relation phrase: {rel['relation_info']['relation_phrase']}")
        print(f"Relation type: {rel['relation_info']['relation_type']}")
        print(f"Similarity score: {rel['relation_info']['similarity_score']:.2f}")
        print(f"Sentence excerpt: {rel['sentence'][:100]}...")

Starting relation extraction between gene and chemical with BioBERT...

Relation extraction complete!
Results saved to biobert_gene_chemical_relations.json

Example extracted relations:

Example 1:
PMID: 32393603
Gene: isocitrate dehydrogenase
Chemical: ivosidenib
Relation phrase: (IDH) inhibitors
Relation type: inhibition
Similarity score: 0.76
Sentence excerpt: PURPOSE: Differentiation syndrome (DS) is a serious adverse reaction of isocitrate dehydrogenase (ID...

Example 2:
PMID: 32393603
Gene: isocitrate dehydrogenase
Chemical: enasidenib
Relation phrase: (IDH) inhibitors ivosidenib and
Relation type: inhibition
Similarity score: 0.73
Sentence excerpt: PURPOSE: Differentiation syndrome (DS) is a serious adverse reaction of isocitrate dehydrogenase (ID...

Example 3:
PMID: 32393603
Gene: IDH2
Chemical: ivosidenib
Relation phrase: and enasidenib in patients with (IDH)1- and
Relation type: regulation
Similarity score: 0.86
Sentence excerpt: PURPOSE: Differentiation syndrome (DS) is a 

In [7]:
len(results)

1908

In [None]:
import json
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
from functools import lru_cache

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load BioBERT
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-v1.1")
model = AutoModel.from_pretrained("dmis-lab/biobert-v1.1").to(device)
model.eval()

# Cached embeddings for repeated gene/chromosome/pattern terms
@lru_cache(maxsize=10000)
def get_biobert_embedding_cached(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()

def get_biobert_embeddings(text):
    return get_biobert_embedding_cached(text)

def extract_relation(sentence, gene, chromosome):
    marked_sentence = sentence.replace(gene, f"[E1]{gene}[/E1]") \
                              .replace(chromosome, f"[E2]{chromosome}[/E2]")

    sentence_embedding = get_biobert_embeddings(marked_sentence)
    g_embedding = get_biobert_embeddings(gene)
    c_embedding = get_biobert_embeddings(chromosome)

    e1_pos = marked_sentence.find(f"[E1]{gene}[/E1]")
    e2_pos = marked_sentence.find(f"[E2]{chromosome}[/E2]")

    if e1_pos < e2_pos:
        between_text = marked_sentence[e1_pos + len(gene) + 10:e2_pos]
    else:
        between_text = marked_sentence[e2_pos + len(chromosome) + 10:e1_pos]

    relation_type = classify_relation(between_text.strip())

    return {
        "relation_phrase": between_text.strip(),
        "relation_type": relation_type,
        "similarity_score": float(cosine_similarity([g_embedding], [c_embedding])[0][0])
    }

def classify_relation(text):
    if not text:
        return "unknown"

    relation_patterns = {
        "location": ["located on", "mapped to", "found on", "resides in", "positioned at"],
        "translocation": ["translocated to", "translocated from"],
        "deletion": ["deleted from", "loss at", "missing on"],
        "duplication": ["duplicated at", "gain on", "copy number increase at"]
    }

    text_lower = text.lower()
    for rel_type, keywords in relation_patterns.items():
        if any(keyword in text_lower for keyword in keywords):
            return rel_type

    text_embedding = get_biobert_embeddings(text)
    best_match = "unknown"
    highest_sim = 0

    for rel_type, keywords in relation_patterns.items():
        for pattern in keywords:
            pattern_embedding = get_biobert_embeddings(pattern)
            sim = cosine_similarity([text_embedding], [pattern_embedding])[0][0]
            if sim > highest_sim:
                highest_sim = sim
                best_match = rel_type

    return best_match if highest_sim > 0.7 else "unknown"

def process_relations(input_file, output_file, limit=None):
    with open(input_file) as f:
        data = json.load(f)

    results = []
    for i, entry in enumerate(data[:limit] if limit else data):
        try:
            gene = entry['target1_word']
            chromosome = entry['target2_word']
            sentence = entry['sentence']

            relation = extract_relation(sentence, gene, chromosome)

            result = {
                "pmid": entry['pmid'],
                "gene": {
                    "name": gene,
                    "type": entry['target1_type'],
                    "identifier": entry['target1_identifier']
                },
                "chromosome": {
                    "name": chromosome,
                    "type": entry['target2_type'],
                    "identifier": entry['target2_identifier']
                },
                "sentence": sentence,
                "relation_info": relation
            }
            results.append(result)

        except Exception as e:
            print(f"Error processing entry {i}: {str(e)}")
            continue

    with open(output_file, 'w') as f:
        json.dump(results, f, indent=2)

    return results

if __name__ == "__main__":
    input_json = "./data/steps data output/step-4/Relation Extraction/gene_chromosome_relations.json"
    output_json = "./data/steps data output/step-5/biobert_gene_chromosome_relations.json"

    print("Starting relation extraction between gene and chromosome with BioBERT...")
    results = process_relations(input_json, output_json)

    print("\nRelation extraction complete!")
    print(f"Results saved to {output_json}")

    print("\nExample extracted relations:")
    for i, rel in enumerate(results[:3]):
        print(f"\nExample {i+1}:")
        print(f"PMID: {rel['pmid']}")
        print(f"Gene: {rel['gene']['name']}")
        print(f"Chromosome: {rel['chromosome']['name']}")
        print(f"Relation phrase: {rel['relation_info']['relation_phrase']}")
        print(f"Relation type: {rel['relation_info']['relation_type']}")
        print(f"Similarity score: {rel['relation_info']['similarity_score']:.2f}")
        print(f"Sentence excerpt: {rel['sentence'][:100]}...")

Starting relation extraction between gene and chromosome with BioBERT...

Relation extraction complete!
Results saved to biobert_gene_chromosome_relations.json

Example extracted relations:

Example 1:
PMID: 31706190
Gene: MAP2K5
Chromosome: chromosome 2
Relation phrase: SKOR1, TOX3, and an intergenic region on
Relation type: duplication
Similarity score: 0.75
Sentence excerpt: This study investigated whether any of the six initially discovered genomic loci associating with RL...

Example 2:
PMID: 31706190
Gene: SKOR1
Chromosome: chromosome 2
Relation phrase: TOX3, and an intergenic region on
Relation type: duplication
Similarity score: 0.77
Sentence excerpt: This study investigated whether any of the six initially discovered genomic loci associating with RL...

Example 3:
PMID: 31706190
Gene: TOX3
Chromosome: chromosome 2
Relation phrase: and an intergenic region on
Relation type: duplication
Similarity score: 0.77
Sentence excerpt: This study investigated whether any of the six initi

In [3]:
len(results)

194

In [None]:
import json
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
from functools import lru_cache

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load BioBERT
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-v1.1")
model = AutoModel.from_pretrained("dmis-lab/biobert-v1.1").to(device)
model.eval()

# Cache BioBERT embeddings for repeated phrases (genes, variants, keywords)
@lru_cache(maxsize=10000)
def get_biobert_embedding_cached(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()

def get_biobert_embeddings(text):
    return get_biobert_embedding_cached(text)

def extract_relation(sentence, gene, variant):
    marked_sentence = sentence.replace(gene, f"[E1]{gene}[/E1]") \
                              .replace(variant, f"[E2]{variant}[/E2]")

    sentence_embedding = get_biobert_embeddings(marked_sentence)
    g_embedding = get_biobert_embeddings(gene)
    v_embedding = get_biobert_embeddings(variant)

    e1_pos = marked_sentence.find(f"[E1]{gene}[/E1]")
    e2_pos = marked_sentence.find(f"[E2]{variant}[/E2]")

    if e1_pos < e2_pos:
        between_text = marked_sentence[e1_pos + len(gene) + 10:e2_pos]
    else:
        between_text = marked_sentence[e2_pos + len(variant) + 10:e1_pos]

    relation_type = classify_relation(between_text.strip())

    return {
        "relation_phrase": between_text.strip(),
        "relation_type": relation_type,
        "similarity_score": float(cosine_similarity([g_embedding], [v_embedding])[0][0])
    }

def classify_relation(text):
    if not text:
        return "unknown"

    relation_patterns = {
        "mutation": ["mutation in", "variant of", "substitution in", "alteration of", "harboring"],
        "association": ["associated with", "linked to", "involved in", "correlated with"],
        "expression": ["affects expression of", "expressed with", "disrupts"],
        "function": ["impairs function", "enhances activity", "affects function of"]
    }

    text_lower = text.lower()
    for rel_type, keywords in relation_patterns.items():
        if any(keyword in text_lower for keyword in keywords):
            return rel_type

    text_embedding = get_biobert_embeddings(text)
    best_match = "unknown"
    highest_sim = 0

    for rel_type, keywords in relation_patterns.items():
        for pattern in keywords:
            pattern_embedding = get_biobert_embeddings(pattern)
            sim = cosine_similarity([text_embedding], [pattern_embedding])[0][0]
            if sim > highest_sim:
                highest_sim = sim
                best_match = rel_type

    return best_match if highest_sim > 0.7 else "unknown"

def process_relations(input_file, output_file, limit=None):
    with open(input_file) as f:
        data = json.load(f)

    results = []
    for i, entry in enumerate(data[:limit] if limit else data):
        try:
            gene = entry['target1_word']
            variant = entry['target2_word']
            sentence = entry['sentence']

            relation = extract_relation(sentence, gene, variant)

            result = {
                "pmid": entry['pmid'],
                "gene": {
                    "name": gene,
                    "type": entry['target1_type'],
                    "identifier": entry['target1_identifier']
                },
                "variant": {
                    "name": variant,
                    "type": entry['target2_type'],
                    "identifier": entry['target2_identifier']
                },
                "sentence": sentence,
                "relation_info": relation
            }
            results.append(result)

        except Exception as e:
            print(f"Error processing entry {i}: {str(e)}")
            continue

    with open(output_file, 'w') as f:
        json.dump(results, f, indent=2)

    return results

if __name__ == "__main__":
    input_json = "./data/steps data output/step-4/Relation Extraction/gene_variant_relations.json"
    output_json = "./data/steps data output/step-5/biobert_gene_variant_relations.json"

    print("Starting relation extraction between gene and variant with BioBERT...")
    results = process_relations(input_json, output_json)

    print("\nRelation extraction complete!")
    print(f"Results saved to {output_json}")

    print("\nExample extracted relations:")
    for i, rel in enumerate(results[:3]):
        print(f"\nExample {i+1}:")
        print(f"PMID: {rel['pmid']}")
        print(f"Gene: {rel['gene']['name']}")
        print(f"Variant: {rel['variant']['name']}")
        print(f"Relation phrase: {rel['relation_info']['relation_phrase']}")
        print(f"Relation type: {rel['relation_info']['relation_type']}")
        print(f"Similarity score: {rel['relation_info']['similarity_score']:.2f}")
        print(f"Sentence excerpt: {rel['sentence'][:100]}...")


Starting relation extraction between gene and variant with BioBERT...

Relation extraction complete!
Results saved to biobert_gene_variant_relations.json

Example extracted relations:

Example 1:
PMID: 33764904
Gene: PALLD
Variant: c.G154A
Relation phrase: gene (NM_001166108.1:
Relation type: unknown
Similarity score: 0.64
Sentence excerpt: Compartment-specific gene expression data and immunohistochemistry were also queried.RESULTSThe iden...

Example 2:
PMID: 33764904
Gene: PALLD
Variant: p.D52N
Relation phrase: gene (NM_001166108.1:c.G154A:
Relation type: unknown
Similarity score: 0.67
Sentence excerpt: Compartment-specific gene expression data and immunohistochemistry were also queried.RESULTSThe iden...

Example 3:
PMID: 33801891
Gene: ERCC6
Variant: rs2228528
Relation phrase: in
Relation type: association
Similarity score: 0.77
Sentence excerpt: This study suggests that rs2228528 in ERCC6 could be a potential predictor of response to FOLFIRINOX...


In [5]:
len(results)

172

In [None]:
import json
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
from functools import lru_cache

# Load BioBERT and move to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-v1.1")
model = AutoModel.from_pretrained("dmis-lab/biobert-v1.1").to(device)
model.eval()

# Cache to avoid re-embedding repeated text
@lru_cache(maxsize=10000)
def get_biobert_embedding_cached(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()

def get_biobert_embeddings(text):
    # Wrap the cached function for compatibility
    return get_biobert_embedding_cached(text)

def extract_relation(sentence, disease1, disease2):
    marked_sentence = sentence.replace(disease1, f"[E1]{disease1}[/E1]") \
                              .replace(disease2, f"[E2]{disease2}[/E2]")

    sentence_embedding = get_biobert_embeddings(marked_sentence)
    d1_embedding = get_biobert_embeddings(disease1)
    d2_embedding = get_biobert_embeddings(disease2)

    e1_pos = marked_sentence.find(f"[E1]{disease1}[/E1]")
    e2_pos = marked_sentence.find(f"[E2]{disease2}[/E2]")

    if e1_pos < e2_pos:
        between_text = marked_sentence[e1_pos+len(disease1)+10:e2_pos]
    else:
        between_text = marked_sentence[e2_pos+len(disease2)+10:e1_pos]

    relation_type = classify_relation(between_text.strip())
    similarity = float(cosine_similarity([d1_embedding], [d2_embedding])[0][0])

    return {
        "relation_phrase": between_text.strip(),
        "relation_type": relation_type,
        "similarity_score": similarity
    }

def classify_relation(text):
    if not text:
        return "unknown"

    relation_patterns = {
        "comorbidity": ["co-occurs with", "comorbid with", "coexisting with"],
        "differential": ["differential diagnosis", "distinguished from"],
        "shared_risk": ["share risk factors", "similar causes", "common etiology"],
        "progression": ["leads to", "develops into", "progresses to"]
    }

    text_lower = text.lower()
    for rel_type, keywords in relation_patterns.items():
        if any(keyword in text_lower for keyword in keywords):
            return rel_type

    text_embedding = get_biobert_embeddings(text)
    best_match = "unknown"
    highest_sim = 0

    for rel_type, keywords in relation_patterns.items():
        for pattern in keywords:
            pattern_embedding = get_biobert_embeddings(pattern)
            sim = cosine_similarity([text_embedding], [pattern_embedding])[0][0]
            if sim > highest_sim:
                highest_sim = sim
                best_match = rel_type

    return best_match if highest_sim > 0.7 else "unknown"

def process_relations(input_file, output_file, limit=None):
    with open(input_file) as f:
        data = json.load(f)

    results = []
    for i, entry in enumerate(data[:limit] if limit else data):
        try:
            disease1 = entry['target1_word']
            disease2 = entry['target2_word']
            sentence = entry['sentence']

            relation = extract_relation(sentence, disease1, disease2)

            result = {
                "pmid": entry['pmid'],
                "disease1": {
                    "name": disease1,
                    "type": entry['target1_type'],
                    "identifier": entry['target1_identifier']
                },
                "disease2": {
                    "name": disease2,
                    "type": entry['target2_type'],
                    "identifier": entry['target2_identifier']
                },
                "sentence": sentence,
                "relation_info": relation
            }
            results.append(result)

        except Exception as e:
            print(f"Error processing entry {i}: {str(e)}")
            continue

    with open(output_file, 'w') as f:
        json.dump(results, f, indent=2)

    return results

if __name__ == "__main__":
    input_json = "./data/steps data output/step-4/Relation Extraction/disease_disease_relations.json"
    output_json = "./data/steps data output/step-5/biobert_disease_disease_relations.json"

    print("Starting relation extraction between disease and disease with BioBERT...")
    results = process_relations(input_json, output_json)

    print("\nRelation extraction complete!")
    print(f"Results saved to {output_json}")

    print("\nExample extracted relations:")
    for i, rel in enumerate(results[:3]):
        print(f"\nExample {i+1}:")
        print(f"PMID: {rel['pmid']}")
        print(f"Disease 1: {rel['disease1']['name']}")
        print(f"Disease 2: {rel['disease2']['name']}")
        print(f"Relation phrase: {rel['relation_info']['relation_phrase']}")
        print(f"Relation type: {rel['relation_info']['relation_type']}")
        print(f"Similarity score: {rel['relation_info']['similarity_score']:.2f}")
        print(f"Sentence excerpt: {rel['sentence'][:100]}...")


Starting relation extraction between disease and disease with BioBERT...

Relation extraction complete!
Results saved to biobert_disease_disease_relations.json

Example extracted relations:

Example 1:
PMID: 17440981
Disease 1: neurofibromatosis type I
Disease 2: tumors
Relation phrase: in childhood, sometimes combined with
Relation type: shared_risk
Similarity score: 0.59
Sentence excerpt: Case reports have shown that homozygosity or compound heterozygosity for MMR gene mutations can caus...

Example 2:
PMID: 24903423
Disease 1: neurofibromatosis type 1
Disease 2: tumors
Relation phrase: may arise sporadically or be associated to various syndromes, namely multiple endocrine neoplasia type 2,
Relation type: shared_risk
Similarity score: 0.57
Sentence excerpt: These tumors may arise sporadically or be associated to various syndromes, namely multiple endocrine...

Example 3:
PMID: 24903423
Disease 1: neurofibromatosis type 1
Disease 2: Von Hippel-Lindau syndrome
Relation phrase: 
Relatio

In [2]:
len(results)

72225