In [1]:
# ====================================
# 📦 1. Load Mayo Corpus (JSON File)
# ====================================
import json

# Read the Mayo corpus from the JSON file next to the notebook.
with open("mayo_corpus.json", "r", encoding="utf-8") as f:
    mayo_corpus = json.load(f)

# Fail fast with a readable message: the cells below index mayo_corpus[0] and
# read the "title", "url" and "content" keys of every article.
if not isinstance(mayo_corpus, list) or not mayo_corpus:
    raise ValueError("mayo_corpus.json must contain a non-empty JSON list of articles")

for position, article in enumerate(mayo_corpus):
    if not isinstance(article, dict):
        raise ValueError(f"Article #{position} is not a JSON object")
    missing_keys = [key for key in ("title", "url", "content") if key not in article]
    if missing_keys:
        raise ValueError(f"Article #{position} is missing required keys: {missing_keys}")

print(f"✅ Loaded {len(mayo_corpus)} articles")
print("🧠 First title:", mayo_corpus[0]["title"])

✅ Loaded 1082 articles
🧠 First title: Autonomic neuropathy


In [2]:
# ====================================
# 🧼 2. Clean Corpus (Strip HTML Tags)
# ====================================
from bs4 import BeautifulSoup

def _html_to_text(html):
    """Return the visible text of an HTML document as a single string.

    The text of the first <article> element is used when present, otherwise
    the first <main> element, otherwise the whole document. Text fragments
    are stripped and joined with single spaces.
    """
    soup = BeautifulSoup(html, "html.parser")
    main_section = soup.find("article") or soup.find("main")
    if main_section:
        return main_section.get_text(separator=" ", strip=True)
    return soup.get_text(separator=" ", strip=True)


# Clean entire corpus.
for entry in mayo_corpus:
    # Keep the untouched markup under a separate key the first time this cell
    # runs, and always clean from it. Re-running the cell is then idempotent
    # instead of re-parsing text that has already been cleaned.
    # NOTE(review): assumes the source JSON has no "_raw_html" key of its own.
    raw_html = entry.setdefault("_raw_html", entry["content"])
    entry["content"] = _html_to_text(raw_html)

In [3]:
# ===================
# 🧼 3. Load BioBERT
# ===================
from sentence_transformers import SentenceTransformer
import torch

# Pick the GPU when torch can see one, otherwise stay on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Sentence-embedding checkpoint; per its authors it is trained on the SNLI, MNLI,
# SCINLI, SCITAIL, MEDNLI and STSB datasets to provide robust sentence embeddings.
# Alternative checkpoint that was tried: "pritamdeka/S-PubMedBert-MS-MARCO".
# NOTE(review): the recorded run failed with a 401 from the Hugging Face Hub
# ("Invalid credentials in Authorization header") - presumably a stale cached
# token in the environment rather than a problem with this code; confirm.
model = SentenceTransformer(
    "pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb"
).to(device)

  from tqdm.autonotebook import tqdm, trange
2025-07-17 18:05:57.413952: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-07-17 18:05:57.954041: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
No sentence-transformers model found with name pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb. Creating a new one with mean pooling.


OSError: There was a specific connection error when trying to load pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb:
401 Client Error: Unauthorized for url: https://huggingface.co/pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb/resolve/main/config.json (Request ID: Root=1-68791f69-23e8c2443cd1ebbb5f44cddc;32c947c5-3f19-4f14-aebc-425a18a26b41)

Invalid credentials in Authorization header

In [None]:
# ====================================
# 🚀 4. Set Up Qdrant & Upload Corpus
# ====================================
#!pip install qdrant-client
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
import numpy as np
from tqdm import tqdm

# Embed the cleaned article text with the sentence-embedding model.
# NOTE(review): each article is embedded as one string; text beyond the model's
# maximum sequence length is presumably truncated - confirm whether chunking is needed.
corpus_texts = [entry["content"] for entry in mayo_corpus]
corpus_embeddings = model.encode(corpus_texts, convert_to_tensor=False, show_progress_bar=True)

if len(corpus_embeddings) == 0:
    raise ValueError("Nothing to index: the corpus produced no embeddings")
assert len(corpus_embeddings) == len(mayo_corpus), "expected one embedding per article"

# Derive the vector size from the embeddings instead of hard-coding 768, so the
# collection schema stays correct if the model above is swapped for another one.
embedding_dim = len(corpus_embeddings[0])

# Start in-memory Qdrant. The client is created fresh every time this cell runs,
# so it holds no collections yet and a plain create_collection is sufficient
# (recreate_collection is deprecated in recent qdrant-client releases).
client = QdrantClient(":memory:")
client.create_collection(
    collection_name="mayo",
    vectors_config=VectorParams(size=embedding_dim, distance=Distance.COSINE),
)

# One point per article: the list position is the point id, and the payload
# carries the fields displayed by the search cell.
points = [
    PointStruct(
        id=i,
        vector=embedding.tolist(),
        payload={
            "title": entry["title"],
            "url": entry["url"],
            "content": entry["content"],
        },
    )
    for i, (entry, embedding) in enumerate(zip(mayo_corpus, corpus_embeddings))
]

client.upsert(collection_name="mayo", points=points)

In [None]:
# ====================================
# 🧼 5. Preprocess User Health Claim
# ====================================
import spacy
import re

# Load the spaCy pipeline that preprocess_claim uses for tokenisation,
# lemmatisation, stop-word detection and named-entity recognition.
nlp = spacy.load("en_core_web_sm")

# Marketing-style buzzwords; a claim containing any of them gets flagged.
suspicious_words = {
    "cure",
    "detox",
    "guaranteed",
    "instant",
    "magic",
    "miracle",
    "secret",
    "superfood",
}

def preprocess_claim(text, nlp_pipeline=None, flagged_words=None):
    """Normalise a health claim and extract tokens, entities and buzzwords.

    Args:
        text: The claim exactly as the user typed it.
        nlp_pipeline: Optional callable that maps a string to a spaCy-style
            document (iterable of tokens with ``text``, ``lemma_``, ``is_stop``
            and ``is_alpha``, plus an ``ents`` attribute). Defaults to the
            notebook-level ``nlp`` pipeline.
        flagged_words: Optional collection of buzzwords to flag. Defaults to
            the notebook-level ``suspicious_words`` set.

    Returns:
        A dict with the keys ``original`` (input text), ``cleaned`` (lower-cased
        text without punctuation), ``tokens`` (lemmas of alphabetic, non-stop
        tokens), ``entities`` (``(text, label)`` pairs) and ``suspicious_terms``
        (flagged words found, in order of appearance).
    """
    pipeline = nlp if nlp_pipeline is None else nlp_pipeline
    flagged = suspicious_words if flagged_words is None else flagged_words

    # Lowercase & remove punctuation
    cleaned = re.sub(r'[^\w\s]', '', text.lower())
    cleaned_doc = pipeline(cleaned)

    # Tokens: lemmatized, filtered
    content_tokens = [tok for tok in cleaned_doc if not tok.is_stop and tok.is_alpha]
    tokens = [tok.lemma_ for tok in content_tokens]

    # Named entities are taken from the ORIGINAL text: lower-casing and removing
    # punctuation strips the capitalisation cues that NER depends on.
    # NOTE(review): the general-purpose English model presumably only emits
    # ORG/PERSON from this set; the biomedical labels would need a domain model.
    wanted_labels = {"DISEASE", "DRUG", "SYMPTOM", "TREATMENT", "ORG", "PERSON"}
    entities = [
        (ent.text, ent.label_)
        for ent in pipeline(text).ents
        if ent.label_ in wanted_labels
    ]

    # Suspicious terms: check the lemma first, then the surface form, so that
    # list entries which are themselves inflected (e.g. "guaranteed", whose
    # lemma is "guarantee") are still detected.
    found_sus = []
    for tok in content_tokens:
        if tok.lemma_ in flagged:
            found_sus.append(tok.lemma_)
        elif tok.text in flagged:
            found_sus.append(tok.text)

    return {
        "original": text,
        "cleaned": cleaned,
        "tokens": tokens,
        "entities": entities,
        "suspicious_terms": found_sus
    }

# 🧠 User input (surrounding whitespace is not part of the claim)
user_claim = input("💬 Enter a health claim: ").strip()

# An empty claim would still be embedded and scored by the cells below and
# yield a meaningless verdict, so stop here instead.
if not user_claim:
    raise ValueError("Health claim must not be empty")

# Run preprocessing
result = preprocess_claim(user_claim)

# Display results
print("\n✅ Claim Analysis:")
print("🔸 Cleaned:", result["cleaned"])
print("🔸 Tokens:", result["tokens"])
print("🔸 Entities:", result["entities"])
print("⚠ Suspicious Words:", result["suspicious_terms"])

In [None]:
# ====================================
# 🧠 6. Embed Claim with BioBERT
# ====================================
# Encode the claim exactly as typed (not the cleaned variant) with the same
# model that embedded the corpus, and keep the result as a tensor.
claim_embedding = model.encode(
    result["original"],
    convert_to_tensor=True,
)

# Report the embedding's shape as a quick sanity check.
print(
    "✅ Claim embedded with shape:",
    claim_embedding.shape,
)

In [None]:
# ====================================
# 🔍 7. Semantic Search in Qdrant
# ====================================
top_k = 5  # top matches to retrieve

# Qdrant expects a plain list of floats as the query vector.
query_vector = claim_embedding.tolist()

# Recent qdrant-client releases deprecate `search` in favour of `query_points`;
# use the new API when the installed client has it and fall back otherwise.
# Both branches leave `search_results` as a list of scored points.
if hasattr(client, "query_points"):
    search_results = client.query_points(
        collection_name="mayo",
        query=query_vector,
        limit=top_k,
    ).points
else:
    search_results = client.search(
        collection_name="mayo",
        query_vector=query_vector,
        limit=top_k,
    )

# Display top results
for hit in search_results:
    print(f"\n🔹 Score: {hit.score:.3f}")
    print(f"📘 Title: {hit.payload['title']}")
    print(f"🔗 URL: {hit.payload['url']}")
    print(f"📝 Snippet: {hit.payload['content'][:200]}...")

In [None]:
# ====================================
# ✅ 8. Verdict Engine
# ====================================
# Score cut-offs for the three verdict tiers.
TRUSTED_THRESHOLD = 0.7
UNCLEAR_THRESHOLD = 0.5

# With no hits there is no supporting article at all; score it as 0.0 so the
# claim falls into the lowest tier instead of raising IndexError.
top_score = search_results[0].score if search_results else 0.0

# NOTE(review): the score is the similarity between the claim and the closest
# article. It presumably reflects topical closeness, not whether the article
# agrees with the claim - confirm this is the intended meaning of "Trusted".
if top_score >= TRUSTED_THRESHOLD:
    verdict = "✅ Trusted"
elif top_score >= UNCLEAR_THRESHOLD:
    verdict = "⚠ Unclear"
else:
    verdict = "❌ Risky"

print(f"\n🧠 Verdict: {verdict}")