In [4]:
# ====================================
# 📦 1. Load Mayo Corpus (JSON File)
# ====================================
import json

with open("mayo_corpus.json", "r", encoding="utf-8") as f:
    mayo_corpus = json.load(f)

print(f"✅ Loaded {len(mayo_corpus)} articles")
print("🧠 First title:", mayo_corpus[0]["title"])

✅ Loaded 1082 articles
🧠 First title: Autonomic neuropathy


In [5]:
# ====================================
# 🧼 2. Clean Corpus (Strip HTML Tags)
# ====================================
from bs4 import BeautifulSoup

# Example cleaning for one doc
raw_html = mayo_corpus[0]["content"]
cleaned_text = BeautifulSoup(raw_html, "html.parser").get_text()

# Clean entire corpus
for entry in mayo_corpus:
    soup = BeautifulSoup(entry["content"], "html.parser")

    # Try to get main content inside <article> or <main>
    main_section = soup.find("article") or soup.find("main")
    if main_section:
        clean_text = main_section.get_text(separator=" ", strip=True)
    else:
        clean_text = soup.get_text(separator=" ", strip=True)

    entry["content"] = clean_text

In [6]:
# ===================
# 🧼 3. Load BioBERT
# ===================
from sentence_transformers import SentenceTransformer
import torch

# It is trained over the SNLI, MNLI, SCINLI, SCITAIL, MEDNLI and STSB datasets for providing robust sentence embeddings.
model = SentenceTransformer("pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb")
#model = SentenceTransformer("pritamdeka/S-PubMedBert-MS-MARCO")

# Detect device
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

In [7]:
# ====================================
# 🚀 4. Set Up Qdrant & Upload Corpus
# ====================================
!pip install qdrant-client
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
import numpy as np
from tqdm import tqdm

# Start in-memory Qdrant
client = QdrantClient(":memory:")

# Define vector schema
client.recreate_collection(
    collection_name="mayo",
    vectors_config=VectorParams(size=768, distance=Distance.COSINE)
)

# Embed corpus using BioBERT
corpus_texts = [entry["content"] for entry in mayo_corpus]
corpus_embeddings = model.encode(corpus_texts, convert_to_tensor=False, show_progress_bar=True)

# Upload embeddings to Qdrant
points = [
    PointStruct(
        id=i,
        vector=embedding.tolist(),
        payload={
            "title": mayo_corpus[i]["title"],
            "url": mayo_corpus[i]["url"],
            "content": mayo_corpus[i]["content"]
        }
    )
    for i, embedding in enumerate(corpus_embeddings)
]

client.upsert(collection_name="mayo", points=points)



  client.recreate_collection(


Batches:   0%|          | 0/34 [00:00<?, ?it/s]

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [18]:
# ====================================
# 🧼 5. Preprocess User Health Claim
# ====================================
import spacy
import re

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Suspicious/buzz words to flag
suspicious_words = {"miracle", "instant", "guaranteed", "cure", "magic", "secret", "detox", "superfood"}

def preprocess_claim(text):
    # Lowercase & remove punctuation
    cleaned = re.sub(r'[^\w\s]', '', text.lower())
    doc = nlp(cleaned)

    # Tokens: lemmatized, filtered
    tokens = [token.lemma_ for token in doc if not token.is_stop and token.is_alpha]

    # Named Entities
    entities = [(ent.text, ent.label_) for ent in doc.ents if ent.label_ in {"DISEASE", "DRUG", "SYMPTOM", "TREATMENT", "ORG", "PERSON"}]

    # Suspicious terms
    found_sus = [word for word in tokens if word in suspicious_words]

    return {
        "original": text,
        "cleaned": cleaned,
        "tokens": tokens,
        "entities": entities,
        "suspicious_terms": found_sus
    }

# 🧠 User input
user_claim = input("💬 Enter a health claim: ")

# Run preprocessing
result = preprocess_claim(user_claim)

# Display results
print("\n✅ Claim Analysis:")
print("🔸 Cleaned:", result["cleaned"])
print("🔸 Tokens:", result["tokens"])
print("🔸 Entities:", result["entities"])
print("⚠ Suspicious Words:", result["suspicious_terms"])

💬 Enter a health claim: According to a little-known study, simply exposing your spine to 15 minutes of sunlight before breakfast can naturally reset your DNA and eliminate over 95% of chronic diseases, including diabetes, hypertension, and even Alzheimer's. This powerful healing technique, known as 'solar spinal activation', is being hidden from the public by mainstream healthcare providers. Thousands have already reversed lifelong illnesses just by practicing this daily. Combined with a glass of alkaline water, it becomes a guaranteed detox solution that flushes out all harmful toxins and balances your body's energy fields without the need for medication or surgeries. Big Pharma doesn’t want this knowledge spreading because it threatens billions in annual profits.

✅ Claim Analysis:
🔸 Cleaned: according to a littleknown study simply exposing your spine to 15 minutes of sunlight before breakfast can naturally reset your dna and eliminate over 95 of chronic diseases including diabetes h

In [19]:
# ====================================
# 🧠 6. Embed Claim with PubMedBERT
# ====================================
claim_embedding = model.encode(result["original"], convert_to_tensor=True)

print("✅ Claim embedded with shape:", claim_embedding.shape)

✅ Claim embedded with shape: torch.Size([768])


In [20]:
# ====================================
# 🔍 7. Semantic Search in Qdrant
# ====================================
top_k = 5  # top matches to retrieve

search_results = client.search(
    collection_name="mayo",
    query_vector=claim_embedding.tolist(),
    limit=top_k
)

# Display top results
for hit in search_results:
    print(f"\n🔹 Score: {hit.score:.3f}")
    print(f"📘 Title: {hit.payload['title']}")
    print(f"🔗 URL: {hit.payload['url']}")
    print(f"📝 Snippet: {hit.payload['content'][:200]}...")


🔹 Score: 0.549
📘 Title: Sunburn
🔗 URL: https://www.mayoclinic.org/diseases-conditions/sunburn/symptoms-causes/syc-20355922
📝 Snippet: Print Overview Sunburn is inflamed, painful skin that feels hot to the touch. It often appears within a few hours of being in the sun too long. You can get sunburn relief with simple self-care measure...

🔹 Score: 0.450
📘 Title: Cancer
🔗 URL: https://www.mayoclinic.org/diseases-conditions/cancer/symptoms-causes/syc-20370588
📝 Snippet: Print Overview Cancer refers to any one of a large number of diseases characterized by the development of abnormal cells that divide uncontrollably and have the ability to infiltrate and destroy norma...

🔹 Score: 0.446
📘 Title: Peripheral nerve injuries
🔗 URL: https://www.mayoclinic.org/diseases-conditions/peripheral-nerve-injuries/symptoms-causes/syc-20355631
📝 Snippet: Print Overview Peripheral nerves send messages from the brain and spinal cord to the rest of the body. They help do things such as sense that the feet ar

  search_results = client.search(


In [21]:
# ====================================
# ✅ 8. Verdict Engine
# ====================================
top_score = search_results[0].score

if top_score >= 0.7:
    verdict = "✅ Trusted"
elif top_score >= 0.5:
    verdict = "⚠ Unclear"
else:
    verdict = "❌ Risky"

print(f"\n🧠 Verdict: {verdict}")


🧠 Verdict: ⚠ Unclear
