In [1]:
# ====================================
# 📦 1. Load Mayo Corpus (JSON File)
# ====================================
import json

with open("mayo_corpus.json", "r", encoding="utf-8") as f:
    mayo_corpus = json.load(f)

print(f"✅ Loaded {len(mayo_corpus)} articles")
print("🧠 First title:", mayo_corpus[0]["title"])

✅ Loaded 1082 articles
🧠 First title: Autonomic neuropathy


In [2]:
# ====================================
# 🧼 2. Clean Corpus (Strip HTML Tags)
# ====================================
from bs4 import BeautifulSoup

# Example cleaning for one doc
raw_html = mayo_corpus[0]["content"]
cleaned_text = BeautifulSoup(raw_html, "html.parser").get_text()

# Clean entire corpus
for entry in mayo_corpus:
    soup = BeautifulSoup(entry["content"], "html.parser")

    # Try to get main content inside <article> or <main>
    main_section = soup.find("article") or soup.find("main")
    if main_section:
        clean_text = main_section.get_text(separator=" ", strip=True)
    else:
        clean_text = soup.get_text(separator=" ", strip=True)

    entry["content"] = clean_text

In [3]:
# ===================
# 🧼 3. Load BioBERT
# ===================
from sentence_transformers import SentenceTransformer
import torch

# It is trained over the SNLI, MNLI, SCINLI, SCITAIL, MEDNLI and STSB datasets for providing robust sentence embeddings.
model = SentenceTransformer("pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb")
#model = SentenceTransformer("pritamdeka/S-PubMedBert-MS-MARCO")

# Detect device
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/691 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/412 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [4]:
# ====================================
# 🚀 4. Set Up Qdrant & Upload Corpus
# ====================================
!pip install qdrant-client
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
import numpy as np
from tqdm import tqdm

# Start in-memory Qdrant
client = QdrantClient(":memory:")

# Define vector schema
client.recreate_collection(
    collection_name="mayo",
    vectors_config=VectorParams(size=768, distance=Distance.COSINE)
)

# Embed corpus using BioBERT
corpus_texts = [entry["content"] for entry in mayo_corpus]
corpus_embeddings = model.encode(corpus_texts, convert_to_tensor=False, show_progress_bar=True)

# Upload embeddings to Qdrant
points = [
    PointStruct(
        id=i,
        vector=embedding.tolist(),
        payload={
            "title": mayo_corpus[i]["title"],
            "url": mayo_corpus[i]["url"],
            "content": mayo_corpus[i]["content"]
        }
    )
    for i, embedding in enumerate(corpus_embeddings)
]

client.upsert(collection_name="mayo", points=points)

Collecting qdrant-client
  Downloading qdrant_client-1.14.3-py3-none-any.whl.metadata (10 kB)
Collecting portalocker<3.0.0,>=2.7.0 (from qdrant-client)
  Downloading portalocker-2.10.1-py3-none-any.whl.metadata (8.5 kB)
Downloading qdrant_client-1.14.3-py3-none-any.whl (328 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m329.0/329.0 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading portalocker-2.10.1-py3-none-any.whl (18 kB)
Installing collected packages: portalocker, qdrant-client
Successfully installed portalocker-2.10.1 qdrant-client-1.14.3


  client.recreate_collection(


Batches:   0%|          | 0/34 [00:00<?, ?it/s]

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [5]:
# ====================================
# 🧼 5. Preprocess User Health Claim
# ====================================
import spacy
import re

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Suspicious/buzz words to flag
suspicious_words = {"miracle", "instant", "guaranteed", "cure", "magic", "secret", "detox", "superfood"}

def preprocess_claim(text):
    # Lowercase & remove punctuation
    cleaned = re.sub(r'[^\w\s]', '', text.lower())
    doc = nlp(cleaned)

    # Tokens: lemmatized, filtered
    tokens = [token.lemma_ for token in doc if not token.is_stop and token.is_alpha]

    # Named Entities
    entities = [(ent.text, ent.label_) for ent in doc.ents if ent.label_ in {"DISEASE", "DRUG", "SYMPTOM", "TREATMENT", "ORG", "PERSON"}]

    # Suspicious terms
    found_sus = [word for word in tokens if word in suspicious_words]

    return {
        "original": text,
        "cleaned": cleaned,
        "tokens": tokens,
        "entities": entities,
        "suspicious_terms": found_sus
    }

# 🧠 User input
user_claim = input("💬 Enter a health claim: ")

# Run preprocessing
result = preprocess_claim(user_claim)

# Display results
print("\n✅ Claim Analysis:")
print("🔸 Cleaned:", result["cleaned"])
print("🔸 Tokens:", result["tokens"])
print("🔸 Entities:", result["entities"])
print("⚠ Suspicious Words:", result["suspicious_terms"])

💬 Enter a health claim: Childhood schizophrenia is an uncommon but severe mental disorder in which children and teenagers interpret reality abnormally. Schizophrenia involves a range of problems with thinking (cognitive), behavior or emotions. It may result in some combination of hallucinations, delusions, and extremely disordered thinking and behavior that impairs your child's ability to function.  Childhood schizophrenia is essentially the same as schizophrenia in adults, but it starts early in life — generally in the teenage years — and has a profound impact on a child's behavior and development. With childhood schizophrenia, the early age of onset presents special challenges for diagnosis, treatment, education, and emotional and social development.  Schizophrenia is a chronic condition that requires lifelong treatment. Identifying and starting treatment for childhood schizophrenia as early as possible may significantly improve your child's long-term outcome.  Symptoms Schizophrenia

In [6]:
# ====================================
# 🧠 6. Embed Claim with PubMedBERT
# ====================================
claim_embedding = model.encode(result["original"], convert_to_tensor=True)

print("✅ Claim embedded with shape:", claim_embedding.shape)

✅ Claim embedded with shape: torch.Size([768])


In [7]:
# ====================================
# 🔍 7. Semantic Search in Qdrant
# ====================================
top_k = 5  # top matches to retrieve

search_results = client.search(
    collection_name="mayo",
    query_vector=claim_embedding.tolist(),
    limit=top_k
)

# Display top results
for hit in search_results:
    print(f"\n🔹 Score: {hit.score:.3f}")
    print(f"📘 Title: {hit.payload['title']}")
    print(f"🔗 URL: {hit.payload['url']}")
    print(f"📝 Snippet: {hit.payload['content'][:200]}...")


🔹 Score: 0.986
📘 Title: Childhood schizophrenia
🔗 URL: https://www.mayoclinic.org/diseases-conditions/childhood-schizophrenia/symptoms-causes/syc-20354483
📝 Snippet: Print Overview Childhood schizophrenia is an uncommon but severe mental disorder in which children and teenagers interpret reality abnormally. Schizophrenia involves a range of problems with thinking ...

🔹 Score: 0.589
📘 Title: Separation anxiety disorder
🔗 URL: https://www.mayoclinic.org/diseases-conditions/separation-anxiety-disorder/symptoms-causes/syc-20377455
📝 Snippet: Print Overview Separation anxiety is a normal stage of development for infants and toddlers. Young children often experience a period of separation anxiety, but most children outgrow separation anxiet...

🔹 Score: 0.560
📘 Title: Pediatric obstructive sleep apnea
🔗 URL: https://www.mayoclinic.org/diseases-conditions/pediatric-sleep-apnea/symptoms-causes/syc-20376196
📝 Snippet: Print Overview Pediatric obstructive sleep apnea is a sleep disorder in whi

  search_results = client.search(


In [8]:
# ====================================
# ✅ 8. Verdict Engine
# ====================================
top_score = search_results[0].score

if top_score >= 0.7:
    verdict = "✅ Trusted"
elif top_score >= 0.5:
    verdict = "⚠ Unclear"
else:
    verdict = "❌ Risky"

print(f"\n🧠 Verdict: {verdict}")


🧠 Verdict: ✅ Trusted
