In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch.nn.functional as F
from duckduckgo_search import DDGS
import os
import re # Import regex for query refinement

# --- Device Setup ---
# Prioritize CUDA if available, otherwise use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Verifier Device set to use {device}")

# --- NLI Model Loading ---
MODEL_NAME_NLI = "roberta-large-mnli"
try:
    tokenizer_nli = AutoTokenizer.from_pretrained(MODEL_NAME_NLI)
    model_nli = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME_NLI).to(device)
    print(f"Verifier: Successfully loaded NLI model '{MODEL_NAME_NLI}'.")
except Exception as e:
    print(f"Verifier: Error loading NLI model '{MODEL_NAME_NLI}': {e}")
    print("Verifier: Attempting to load NLI model with an explicit cache directory.")
    cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "huggingface")
    os.makedirs(cache_dir, exist_ok=True)
    tokenizer_nli = AutoTokenizer.from_pretrained(MODEL_NAME_NLI, cache_dir=cache_dir)
    model_nli = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME_NLI, cache_dir=cache_dir).to(device)
    print(f"Verifier: Successfully loaded NLI model '{MODEL_NAME_NLI}' using cache directory: {cache_dir}.")

label_map = {0: "CONTRADICTION", 1: "NEUTRAL", 2: "ENTAILMENT"}

# --- NER Model for Query Generation ---
MODEL_NAME_NER = "dslim/bert-base-NER" # Using the same NER as claim_extractor for consistency
try:
    # Use device.index for pipeline if CUDA is available, otherwise -1 for CPU
    ner_pipeline = pipeline("ner", model=MODEL_NAME_NER, aggregation_strategy="simple",
                            device=device.index if device.type == 'cuda' else -1)
    print(f"Verifier: Successfully loaded NER pipeline '{MODEL_NAME_NER}' for query generation.")
except Exception as e:
    print(f"Verifier: Error loading NER pipeline '{MODEL_NAME_NER}': {e}")
    print("Verifier: Attempting to load NER pipeline with an explicit cache directory.")
    cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "huggingface")
    os.makedirs(cache_dir, exist_ok=True)
    ner_pipeline = pipeline("ner", model=MODEL_NAME_NER, aggregation_strategy="simple",
                            device=device.index if device.type == 'cuda' else -1, cache_dir=cache_dir)
    print(f"Verifier: Successfully loaded NER pipeline '{MODEL_NAME_NER}' using cache directory: {cache_dir}.")


import spacy

# Load spaCy model once
nlp = spacy.load("en_core_web_sm")

def generate_search_query(claim):
    """
    Generates a refined search query from a claim.
    - Uses NER to extract named entities.
    - Uses noun phrase extraction to get important keywords (like "prime minister").
    - Quotes multi-word entities/phrases.
    - Adds heuristics (like "discovered").
    """
    claim_lower = claim.lower()

    # --- Step 1: Extract NER entities and merge subwords ---
    entities = ner_pipeline(claim)
    merged_tokens = []
    current = ""
    for e in entities:
        word = e["word"]
        if word.startswith("##"):
            current += word[2:]
        else:
            if current:
                merged_tokens.append(current)
            current = word
    if current:
        merged_tokens.append(current)

    ner_phrases = []
    for token in merged_tokens:
        if " " in token:
            ner_phrases.append(f'"{token}"')
        else:
            ner_phrases.append(token)

    # --- Step 2: Extract noun phrases with spaCy ---
    doc = nlp(claim)
    noun_phrases = []
    for chunk in doc.noun_chunks:
        # Avoid duplicates from NER
        if chunk.text not in merged_tokens:
            if len(chunk.text.split()) > 1:
                noun_phrases.append(f'"{chunk.text}"')
            else:
                noun_phrases.append(chunk.text)

    # --- Step 3: Heuristic for "discovered" ---
    if "discovered" in claim_lower:
        match = re.search(r'discovered ([\w\s]+)', claim_lower)
        if match:
            noun_phrases.append(f'who discovered "{match.group(1).strip()}"')
        else:
            noun_phrases.append("who discovered")

    # --- Step 4: Combine NER + noun phrases, dedupe while keeping order ---
    all_phrases = ner_phrases + noun_phrases
    query_parts = list(dict.fromkeys(all_phrases))  # removes duplicates, preserves order

    if query_parts:
        return " AND ".join(query_parts) + " facts"

    # --- Step 5: Fallback ---
    return f"{claim} facts"


def search_snippets(claim_original, num_results=10):
    """
    Searches DuckDuckGo for snippets relevant to the query.
    Takes the original claim, generates a search query from it, then searches.
    Returns a list of snippet bodies.
    """
    results = []
    search_query = generate_search_query(claim_original)
    
    print(f"\n[DEBUG verifier.py] Original claim for search: '{claim_original}'")
    print(f"  [DEBUG verifier.py] Generated search query: '{search_query}' with {num_results} results...")

    try:
        with DDGS() as ddgs:
            ddgs_results = list(ddgs.text(search_query, max_results=num_results))
            
            # Fallback if the generated query yields no results, try original claim
            if not ddgs_results:
                print(f"  [DEBUG verifier.py] No results for generated query, falling back to original claim search.")
                ddgs_results = list(ddgs.text(claim_original, max_results=num_results))

            for i, r in enumerate(ddgs_results):
                if "body" in r and r["body"].strip(): # Ensure snippet body is not empty
                    results.append(r["body"])
                    print(f"  [DEBUG verifier.py] Snippet {i+1} (first 100 chars): {r['body'][:100]}...")
                else:
                    print(f"  [DEBUG verifier.py] Snippet {i+1} had no or empty 'body' key: {r}")
            if not results:
                print("[DEBUG verifier.py] No substantial 'body' content found in any search results.")
    except Exception as e:
        print(f"[ERROR verifier.py] DuckDuckGo search failed for query '{search_query}' or '{claim_original}': {e}")
        results = []
    return results

def classify_nli(premise, hypothesis):
    """
    Runs NLI on (premise, hypothesis) and returns the best label and its score.
    Uses the pre-loaded tokenizer and model.
    """
    # max_length for RoBERTa is typically 512
    inputs = tokenizer_nli(premise, hypothesis, return_tensors="pt", truncation=True, max_length=512).to(device)
    with torch.no_grad():
        logits = model_nli(**inputs).logits
        probs = F.softmax(logits, dim=-1)[0].cpu().numpy()

    label_id = int(probs.argmax())
    return label_map[label_id], float(probs[label_id])

def verify_claim(claim, top_k=10):
    """
    Verifies a factual claim against search snippets using an NLI model.
    Returns dict: {claim, status, evidence: list of snippets}
    """
    print(f"\n[DEBUG verifier.py] Verifying claim: '{claim}' (top_k={top_k})")
    snippets = search_snippets(claim, num_results=top_k)

    if not snippets:
        print(f"[DEBUG verifier.py] No snippets available to verify the claim '{claim}'. Returning 'uncertain'.")
        return {"claim": claim, "status": "uncertain", "evidence": []}

    print(f"[DEBUG verifier.py] Type of 'snippets' before iteration: {type(snippets)}")
    print(f"[DEBUG verifier.py] Number of snippets received: {len(snippets)}")
    if snippets:
        print(f"[DEBUG verifier.py] First snippet (first 100 chars): {snippets[0][:100]}...")

    best_status = "uncertain"
    all_evidence_snippets = []

    # Tunable confidence threshold for strong classifications
    CONFIDENCE_THRESHOLD = 0.75 # Good balance, adjust if needed

    # Track highest entailment and contradiction scores separately
    max_entailment_score = 0.0
    max_contradiction_score = 0.0
    
    # Store the snippet that gave the highest entailment/contradiction score
    best_entailment_snippet = None
    best_contradiction_snippet = None
  
    print("\n--- NLI Classification Results for Each Snippet ---")
    for i, snippet in enumerate(snippets):
        all_evidence_snippets.append(snippet)
        try:
            label, score = classify_nli(snippet, claim)
            print(f"  [DEBUG verifier.py] Snippet {i+1} vs Claim:")
            print(f"    Premise (Snippet): {snippet[:150]}...")
            print(f"    Hypothesis (Claim): {claim}")
            print(f"    NLI Result: Label='{label}', Score={score:.4f}")

            if label == "ENTAILMENT":
                if score > max_entailment_score:
                    max_entailment_score = score
                    best_entailment_snippet = snippet
            elif label == "CONTRADICTION":
                if score > max_contradiction_score:
                    max_contradiction_score = score
                    best_contradiction_snippet = snippet

        except Exception as e:
            print(f"  [ERROR verifier.py] Error classifying snippet {i+1}: {e}")
            continue
    
    # --- Final Decision Logic after reviewing all snippets ---
    # Prioritize contradiction if it's very strong, to highlight potential hallucinations more aggressively
    if max_contradiction_score >= CONFIDENCE_THRESHOLD + 0.05: # Give contradiction a slight edge
        best_status = "hallucination"
    elif max_entailment_score >= CONFIDENCE_THRESHOLD:
        best_status = "verified"
    elif max_contradiction_score >= CONFIDENCE_THRESHOLD: # If contradiction is just at threshold
        best_status = "hallucination"
    else:
        best_status = "uncertain"

    # Decide which snippet to return as 'best_snippet' for context in UI
    if best_status == "verified" and best_entailment_snippet:
        final_context_snippet = best_entailment_snippet
    elif best_status == "hallucination" and best_contradiction_snippet:
        final_context_snippet = best_contradiction_snippet
    elif all_evidence_snippets: # If uncertain, just return the first snippet found for some context
        final_context_snippet = all_evidence_snippets[0]
    else:
        final_context_snippet = None # Should not happen if all_evidence_snippets is empty

    print(f"\n[DEBUG verifier.py] Final decision for claim '{claim}': Status='{best_status}', Max Entailment={max_entailment_score:.4f}, Max Contradiction={max_contradiction_score:.4f}")
    
    # Return all snippets for detailed view, but the best_snippet for the main 'evidence' in UI
    return {"claim": claim, "status": best_status, "evidence": all_evidence_snippets, "best_snippet": final_context_snippet}




  from .autonotebook import tqdm as notebook_tqdm


Verifier Device set to use cpu


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Verifier: Successfully loaded NLI model 'roberta-large-mnli'.


Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


Verifier: Successfully loaded NER pipeline 'dslim/bert-base-NER' for query generation.


In [3]:
verify_claim("Taj Mahal is in India")


[DEBUG verifier.py] Verifying claim: 'Taj Mahal is in India' (top_k=10)

[DEBUG verifier.py] Original claim for search: 'Taj Mahal is in India'
  [DEBUG verifier.py] Generated search query: '"Taj Mahal" AND India facts' with 10 results...


  with DDGS() as ddgs:


  [DEBUG verifier.py] No results for generated query, falling back to original claim search.
[DEBUG verifier.py] No substantial 'body' content found in any search results.
[DEBUG verifier.py] No snippets available to verify the claim 'Taj Mahal is in India'. Returning 'uncertain'.


{'claim': 'Taj Mahal is in India', 'status': 'uncertain', 'evidence': []}

In [5]:
verify_claim("Taj Mahal is in India")


[DEBUG verifier.py] Verifying claim: 'Taj Mahal is in India' (top_k=10)

[DEBUG verifier.py] Original claim for search: 'Taj Mahal is in India'
  [DEBUG verifier.py] Generated search query: '"Taj Mahal" AND India facts' with 10 results...


  with DDGS() as ddgs:


  [DEBUG verifier.py] Snippet 1 (first 100 chars): Immerse yourself in the cultural and natural diversity of one of the world's most vibrant nations wi...
  [DEBUG verifier.py] Snippet 2 (first 100 chars): Explore The Indian Hotels Company Limited, home to iconic hotel brands like Taj, SeleQtions, Vivanta...
  [DEBUG verifier.py] Snippet 3 (first 100 chars): A curation of authentic living palaces and landmark hotels, Taj is the hallmark of iconic hospitalit...
  [DEBUG verifier.py] Snippet 4 (first 100 chars): If you're looking for a luxurious and unforgettable stay in India, then Taj Hotels is the perfect ch...
  [DEBUG verifier.py] Snippet 5 (first 100 chars): Aug 7, 2025 · Book your stay with exciting hotel offers and holiday packages across a host of partic...
  [DEBUG verifier.py] Snippet 6 (first 100 chars): Discover the world's most luxurious destinations with Taj Hotels. Immerse yourself in elegance and c...
  [DEBUG verifier.py] Snippet 7 (first 100 chars): 8 hours ago · Exper

{'claim': 'Taj Mahal is in India',
 'status': 'uncertain',
 'evidence': ["Immerse yourself in the cultural and natural diversity of one of the world's most vibrant nations with Taj Hotels. Relaxing beach paradises, thrilling urban getaways, exotic hill stations and …",
  'Explore The Indian Hotels Company Limited, home to iconic hotel brands like Taj, SeleQtions, Vivanta, Gateway, amã Stays & Trails, Ginger, Qmin and TajSATS. Experience world-class …',
  'A curation of authentic living palaces and landmark hotels, Taj is the hallmark of iconic hospitality across the globe. The brand is recognised for its warm and intuitive service and is the …',
  "If you're looking for a luxurious and unforgettable stay in India, then Taj Hotels is the perfect choice for you. Book your stay today and experience the Taj difference!",
  'Aug 7, 2025 · Book your stay with exciting hotel offers and holiday packages across a host of participating Taj Hotels. Keep checking-in to discover more member exclusi

In [16]:
verify_claim("Taj Mahal is in India")


[DEBUG verifier.py] Verifying claim: 'Taj Mahal is in India' (top_k=10)

[DEBUG verifier.py] Original claim for search: 'Taj Mahal is in India'
  [DEBUG verifier.py] Generated search query: '"Taj Mahal" AND India facts' with 10 results...


  with DDGS() as ddgs:


  [DEBUG verifier.py] Snippet 1 (first 100 chars): The Taj Mahal  is an ivory-white marble mausoleum on the right bank of the river Yamuna in Agra, Utt...
  [DEBUG verifier.py] Snippet 2 (first 100 chars): Jun 19, 2025 · Discover 45 fascinating facts about the Taj Mahal, from its history and architecture ...
  [DEBUG verifier.py] Snippet 3 (first 100 chars): The Taj Mahal is one of the most beautiful monuments in India. Let’s have a look at its history, arc...
  [DEBUG verifier.py] Snippet 4 (first 100 chars): Aug 23, 2025 · The Taj Mahal is a mausoleum complex in Agra, Uttar Pradesh, in northern India, built...
  [DEBUG verifier.py] Snippet 5 (first 100 chars): Sep 19, 2024 · Here are 15 fascinating facts that shed light on the Taj Mahal’s history, architectur...
  [DEBUG verifier.py] Snippet 6 (first 100 chars): Globally renowned as the city of the Taj Mahal, this royal Mughal city has many other monuments too ...
  [DEBUG verifier.py] Snippet 7 (first 100 chars): Jul 20, 2025 · In t

{'claim': 'Taj Mahal is in India',
 'status': 'verified',
 'evidence': ['The Taj Mahal  is an ivory-white marble mausoleum on the right bank of the river Yamuna in Agra, Uttar Pradesh, India. It was commissioned in 1631 by the fifth Mughal emperor, Shah Jahan (r.\u20091628–1658), to house the tomb of his beloved wife, Mumtaz Mahal; it also houses the tomb of Shah Jahan himself. The tomb is the centrepiece of a 17-hectare (42-acre) complex, which includes a mosque an…',
  'Jun 19, 2025 · Discover 45 fascinating facts about the Taj Mahal, from its history and architecture to its cultural significance and enduring allure. Uncover the secrets of this iconic monument.',
  'The Taj Mahal is one of the most beautiful monuments in India. Let’s have a look at its history, architecture, location, opening hours, entry fee, myths, legends and other interesting facts.',
  'Aug 23, 2025 · The Taj Mahal is a mausoleum complex in Agra, Uttar Pradesh, in northern India, built by the Mughal emperor Shah

In [18]:
!pip install sentence-transformers


  pid, fd = os.forkpty()


Collecting sentence-transformers
  Downloading sentence_transformers-5.1.1-py3-none-any.whl.metadata (16 kB)
Downloading sentence_transformers-5.1.1-py3-none-any.whl (486 kB)
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-5.1.1


In [32]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch.nn.functional as F
from duckduckgo_search import DDGS
import os
import re  # regex for query refinement
import spacy
from sentence_transformers import SentenceTransformer, util  # NEW for cosine similarity

# --- Device Setup ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Verifier Device set to use {device}")

# --- NLI Model Loading ---
MODEL_NAME_NLI = "roberta-large-mnli"
try:
    tokenizer_nli = AutoTokenizer.from_pretrained(MODEL_NAME_NLI)
    model_nli = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME_NLI).to(device)
    print(f"Verifier: Successfully loaded NLI model '{MODEL_NAME_NLI}'.")
except Exception as e:
    print(f"Verifier: Error loading NLI model '{MODEL_NAME_NLI}': {e}")
    cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "huggingface")
    os.makedirs(cache_dir, exist_ok=True)
    tokenizer_nli = AutoTokenizer.from_pretrained(MODEL_NAME_NLI, cache_dir=cache_dir)
    model_nli = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME_NLI, cache_dir=cache_dir).to(device)
    print(f"Verifier: Successfully loaded NLI model '{MODEL_NAME_NLI}' using cache directory: {cache_dir}.")

label_map = {0: "CONTRADICTION", 1: "NEUTRAL", 2: "ENTAILMENT"}

# --- NER Model for Query Generation ---
MODEL_NAME_NER = "dslim/bert-base-NER"
try:
    ner_pipeline = pipeline("ner", model=MODEL_NAME_NER, aggregation_strategy="simple",
                            device=device.index if device.type == 'cuda' else -1)
    print(f"Verifier: Successfully loaded NER pipeline '{MODEL_NAME_NER}' for query generation.")
except Exception as e:
    print(f"Verifier: Error loading NER pipeline '{MODEL_NAME_NER}': {e}")
    cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "huggingface")
    os.makedirs(cache_dir, exist_ok=True)
    ner_pipeline = pipeline("ner", model=MODEL_NAME_NER, aggregation_strategy="simple",
                            device=device.index if device.type == 'cuda' else -1, cache_dir=cache_dir)
    print(f"Verifier: Successfully loaded NER pipeline '{MODEL_NAME_NER}' using cache directory: {cache_dir}.")

# --- spaCy model ---
nlp = spacy.load("en_core_web_sm")

# --- Sentence-Transformer Embedding Model (NEW) ---
embedder = SentenceTransformer('all-MiniLM-L6-v2')

def generate_search_query(claim):
    claim_lower = claim.lower()

    # Step 1: Extract NER entities and merge subwords
    entities = ner_pipeline(claim)
    merged_tokens = []
    current = ""
    for e in entities:
        word = e["word"]
        if word.startswith("##"):
            current += word[2:]
        else:
            if current:
                merged_tokens.append(current)
            current = word
    if current:
        merged_tokens.append(current)

    ner_phrases = []
    for token in merged_tokens:
        if " " in token:
            ner_phrases.append(f'"{token}"')
        else:
            ner_phrases.append(token)

    # Step 2: Extract noun phrases with spaCy
    doc = nlp(claim)
    noun_phrases = []
    for chunk in doc.noun_chunks:
        if chunk.text not in merged_tokens:
            if len(chunk.text.split()) > 1:
                noun_phrases.append(f'"{chunk.text}"')
            else:
                noun_phrases.append(chunk.text)

    # Step 3: Heuristic for "discovered"
    if "discovered" in claim_lower:
        match = re.search(r'discovered ([\w\s]+)', claim_lower)
        if match:
            noun_phrases.append(f'who discovered "{match.group(1).strip()}"')
        else:
            noun_phrases.append("who discovered")

    # Step 4: Combine NER + noun phrases, dedupe while keeping order
    all_phrases = ner_phrases + noun_phrases
    query_parts = list(dict.fromkeys(all_phrases))

    if query_parts:
        return " AND ".join(query_parts) + " facts"
    return f"{claim} facts"

def search_snippets(claim_original, num_results=10):
    results = []
    search_query = generate_search_query(claim_original)

    print(f"\n[DEBUG verifier.py] Original claim for search: '{claim_original}'")
    print(f"  [DEBUG verifier.py] Generated search query: '{search_query}' with {num_results} results...")

    try:
        with DDGS() as ddgs:
            ddgs_results = list(ddgs.text(search_query, max_results=num_results))

            if not ddgs_results:
                print(f"  [DEBUG verifier.py] No results for generated query, falling back to original claim search.")
                ddgs_results = list(ddgs.text(claim_original, max_results=num_results))

            for i, r in enumerate(ddgs_results):
                if "body" in r and r["body"].strip():
                    results.append(r["body"])
                    print(f"  [DEBUG verifier.py] Snippet {i+1} (first 100 chars): {r['body'][:100]}...")
                else:
                    print(f"  [DEBUG verifier.py] Snippet {i+1} had no or empty 'body' key: {r}")
            if not results:
                print("[DEBUG verifier.py] No substantial 'body' content found in any search results.")
    except Exception as e:
        print(f"[ERROR verifier.py] DuckDuckGo search failed for query '{search_query}' or '{claim_original}': {e}")
        results = []
    return results

# --- NEW: Filter snippets by cosine similarity ---
def filter_snippets_by_similarity(claim, snippets, threshold=0.7):
    if not snippets:
        return []
    claim_emb = embedder.encode(claim, convert_to_tensor=True)
    snippet_embs = embedder.encode(snippets, convert_to_tensor=True)
    cosine_scores = util.cos_sim(claim_emb, snippet_embs)[0]
    filtered = [snippet for snippet, score in zip(snippets, cosine_scores) if score >= threshold]
    print(f"[DEBUG verifier.py] Filtered {len(filtered)}/{len(snippets)} snippets above similarity threshold {threshold}")
    return filtered

def classify_nli(premise, hypothesis):
    inputs = tokenizer_nli(premise, hypothesis, return_tensors="pt", truncation=True, max_length=512).to(device)
    with torch.no_grad():
        logits = model_nli(**inputs).logits
        probs = F.softmax(logits, dim=-1)[0].cpu().numpy()
    label_id = int(probs.argmax())
    return label_map[label_id], float(probs[label_id])

def verify_claim(claim, top_k=100, sim_threshold=0.7):
    print(f"\n[DEBUG verifier.py] Verifying claim: '{claim}' (top_k={top_k}, sim_threshold={sim_threshold})")
    snippets = search_snippets(claim, num_results=top_k)

    # NEW: Filter by cosine similarity
    snippets = filter_snippets_by_similarity(claim, snippets, threshold=sim_threshold)

    if not snippets:
        print(f"[DEBUG verifier.py] No snippets available after filtering for the claim '{claim}'. Returning 'uncertain'.")
        return {"claim": claim, "status": "uncertain", "evidence": []}

    print(f"[DEBUG verifier.py] Number of snippets after filtering: {len(snippets)}")
    if snippets:
        print(f"[DEBUG verifier.py] First snippet (first 100 chars): {snippets[0][:100]}...")

    best_status = "uncertain"
    all_evidence_snippets = []

    CONFIDENCE_THRESHOLD = 0.75
    max_entailment_score = 0.0
    max_contradiction_score = 0.0
    best_entailment_snippet = None
    best_contradiction_snippet = None

    print("\n--- NLI Classification Results for Each Snippet ---")
    for i, snippet in enumerate(snippets):
        all_evidence_snippets.append(snippet)
        try:
            label, score = classify_nli(snippet, claim)
            print(f"  [DEBUG verifier.py] Snippet {i+1} vs Claim:")
            print(f"    Premise (Snippet): {snippet[:150]}...")
            print(f"    Hypothesis (Claim): {claim}")
            print(f"    NLI Result: Label='{label}', Score={score:.4f}")

            if label == "ENTAILMENT":
                if score > max_entailment_score:
                    max_entailment_score = score
                    best_entailment_snippet = snippet
            elif label == "CONTRADICTION":
                if score > max_contradiction_score:
                    max_contradiction_score = score
                    best_contradiction_snippet = snippet
        except Exception as e:
            print(f"  [ERROR verifier.py] Error classifying snippet {i+1}: {e}")
            continue

    # Decision logic
    if max_contradiction_score >= CONFIDENCE_THRESHOLD + 0.05:
        best_status = "hallucination"
    elif max_entailment_score >= CONFIDENCE_THRESHOLD:
        best_status = "verified"
    elif max_contradiction_score >= CONFIDENCE_THRESHOLD:
        best_status = "hallucination"
    else:
        best_status = "uncertain"

    if best_status == "verified" and best_entailment_snippet:
        final_context_snippet = best_entailment_snippet
    elif best_status == "hallucination" and best_contradiction_snippet:
        final_context_snippet = best_contradiction_snippet
    elif all_evidence_snippets:
        final_context_snippet = all_evidence_snippets[0]
    else:
        final_context_snippet = None

    print(f"\n[DEBUG verifier.py] Final decision for claim '{claim}': "
          f"Status='{best_status}', Max Entailment={max_entailment_score:.4f}, "
          f"Max Contradiction={max_contradiction_score:.4f}")

    return {
        "claim": claim,
        "status": best_status,
        "evidence": all_evidence_snippets,
        "best_snippet": final_context_snippet
    }


Verifier Device set to use cpu


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Verifier: Successfully loaded NLI model 'roberta-large-mnli'.


Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


Verifier: Successfully loaded NER pipeline 'dslim/bert-base-NER' for query generation.


In [8]:
result = verify_claim("Barack Obama was the 44th President of the United States.")
print(result)


[DEBUG verifier.py] Verifying claim: 'Barack Obama was the 44th President of the United States.' (top_k=100, sim_threshold=0.7)

[DEBUG verifier.py] Original claim for search: 'Barack Obama was the 44th President of the United States.'
  [DEBUG verifier.py] Generated search query: '"Barack Obama" AND "United States" AND "the 44th President" AND "the United States" facts' with 100 results...


  with DDGS() as ddgs:


  [DEBUG verifier.py] Snippet 1 had no or empty 'body' key: {'title': 'Barack Obama | Biography, Parents, Education, Presidency, Books ...', 'href': 'https://www.britannica.com/biography/Barack-Obama', 'body': ''}
  [DEBUG verifier.py] Snippet 2 (first 100 chars): Barack Obama was the 44th president of the United States, elected in November 2008 and holding offic...
  [DEBUG verifier.py] Snippet 3 (first 100 chars): Nov 29, 2024 · The statement that Barack Obama was the 44th President of the United States is suppor...
  [DEBUG verifier.py] Snippet 4 (first 100 chars): In this study are thoughts from Barack Obama about America and the American people; democracy and in...
  [DEBUG verifier.py] Snippet 5 (first 100 chars): Barack Hussein Obama II[a] (born August 4, 1961) is an American politician who served as the 44th pr...
  [DEBUG verifier.py] Snippet 6 (first 100 chars): When Barack Obama was elected president in 2008, he became the first African American to hold the of...
  [DEBUG ve

In [10]:
result = verify_claim("Barack Obama was the 44th President of the United States.")
print(result)


[DEBUG verifier.py] Verifying claim: 'Barack Obama was the 44th President of the United States.' (top_k=100, sim_threshold=0.7)

[DEBUG verifier.py] Original claim for search: 'Barack Obama was the 44th President of the United States.'
  [DEBUG verifier.py] Generated search query: '"Barack Obama" AND "United States" AND "the 44th President" AND "the United States" facts' with 100 results...


  with DDGS() as ddgs:


  [DEBUG verifier.py] Snippet 1 (first 100 chars): Barack Hussein Obama II[a] (born August 4, 1961) is an American politician who served as the 44th pr...
  [DEBUG verifier.py] Snippet 2 (first 100 chars): Sep 19, 2025 · Barack Obama (born August 4, 1961, Honolulu, Hawaii, U.S.) is the 44th president of t...
  [DEBUG verifier.py] Snippet 3 (first 100 chars): Barack Obama was the 44th president of the United States, elected in November 2008 and holding offic...
  [DEBUG verifier.py] Snippet 4 (first 100 chars): When Barack Obama was elected president in 2008, he became the first African American to hold the of...
  [DEBUG verifier.py] Snippet 5 (first 100 chars): On November 4, 2008, Barack Obama was elected the 44th President of the United States, winning more ...
  [DEBUG verifier.py] Snippet 6 (first 100 chars): Nov 29, 2024 · The statement that Barack Obama was the 44th President of the United States is suppor...
  [DEBUG verifier.py] Snippet 7 (first 100 chars): On May 2, 2011, Pre

In [11]:
result = verify_claim("Barack Obama was the 44th President of the United States.")
print(result)


[DEBUG verifier.py] Verifying claim: 'Barack Obama was the 44th President of the United States.' (top_k=100, sim_threshold=0.7)

[DEBUG verifier.py] Original claim for search: 'Barack Obama was the 44th President of the United States.'
  [DEBUG verifier.py] Generated search query: '"Barack Obama" AND "United States" AND "the 44th President" AND "the United States" facts' with 100 results...


  with DDGS() as ddgs:


  [DEBUG verifier.py] Snippet 1 (first 100 chars): Barack Hussein Obama II[a] (born August 4, 1961) is an American politician who served as the 44th pr...
  [DEBUG verifier.py] Snippet 2 (first 100 chars): Sep 19, 2025 · Barack Obama (born August 4, 1961, Honolulu, Hawaii, U.S.) is the 44th president of t...
  [DEBUG verifier.py] Snippet 3 (first 100 chars): Barack Obama was the 44th president of the United States, elected in November 2008 and holding offic...
  [DEBUG verifier.py] Snippet 4 (first 100 chars): When Barack Obama was elected president in 2008, he became the first African American to hold the of...
  [DEBUG verifier.py] Snippet 5 (first 100 chars): On November 4, 2008, Barack Obama was elected the 44th President of the United States, winning more ...
  [DEBUG verifier.py] Snippet 6 (first 100 chars): Nov 29, 2024 · The statement that Barack Obama was the 44th President of the United States is suppor...
  [DEBUG verifier.py] Snippet 7 (first 100 chars): On May 2, 2011, Pre

In [22]:
result = verify_claim("OpenAI released GPT-5 Turbo in September 2025.")
print(result)


[DEBUG verifier.py] Verifying claim: 'OpenAI released GPT-5 Turbo in September 2025.' (top_k=100, sim_threshold=0.7)

[DEBUG verifier.py] Original claim for search: 'OpenAI released GPT-5 Turbo in September 2025.'
  [DEBUG verifier.py] Generated search query: 'OpenAI AND "GPT - 5 Turbo" AND "GPT-5 Turbo" AND September facts' with 100 results...


  with DDGS() as ddgs:


  [DEBUG verifier.py] Snippet 1 (first 100 chars): Learn best practices, features, and migration guidance for GPT-5. GPT-5 is our most intelligent mode...
  [DEBUG verifier.py] Snippet 2 (first 100 chars): Jun 27, 2025 · The launch of OpenAI's GPT-5 Turbo model has brought significant advancements that ca...
  [DEBUG verifier.py] Snippet 3 (first 100 chars): Jul 13, 2025 · In the rapidly changing landscape of artificial intelligence, one name is dominating ...
  [DEBUG verifier.py] Snippet 4 (first 100 chars): May 13, 2025 · OpenAI has officially rolled out GPT-5 Turbo, its most powerful and efficient languag...
  [DEBUG verifier.py] Snippet 5 (first 100 chars): Feb 13, 2025 · OpenAI's upcoming GPT models mark significant advancements in artificial intelligence...
  [DEBUG verifier.py] Snippet 6 (first 100 chars): Aug 13, 2025 · GPT-5 is OpenAI’s latest-generation large language model, officially released on Augu...
  [DEBUG verifier.py] Snippet 7 (first 100 chars): Jun 16, 2025 · Open

In [24]:
result = verify_claim("Barack Obama was not the 44th President of the United States.")
print(result) 


[DEBUG verifier.py] Verifying claim: 'Barack Obama was not the 44th President of the United States.' (top_k=100, sim_threshold=0.7)

[DEBUG verifier.py] Original claim for search: 'Barack Obama was not the 44th President of the United States.'
  [DEBUG verifier.py] Generated search query: '"Barack Obama" AND "United States" AND "the 44th President" AND "the United States" facts' with 100 results...


  with DDGS() as ddgs:


  [DEBUG verifier.py] Snippet 1 (first 100 chars): Barack Hussein Obama II[a] (born August 4, 1961) is an American politician who served as the 44th pr...
  [DEBUG verifier.py] Snippet 2 (first 100 chars): Sep 19, 2025 · Barack Obama (born August 4, 1961, Honolulu, Hawaii, U.S.) is the 44th president of t...
  [DEBUG verifier.py] Snippet 3 (first 100 chars): Barack Obama was the 44th president of the United States, elected in November 2008 and holding offic...
  [DEBUG verifier.py] Snippet 4 (first 100 chars): When Barack Obama was elected president in 2008, he became the first African American to hold the of...
  [DEBUG verifier.py] Snippet 5 (first 100 chars): On November 4, 2008, Barack Obama was elected the 44th President of the United States, winning more ...
  [DEBUG verifier.py] Snippet 6 (first 100 chars): Nov 29, 2024 · The statement that Barack Obama was the 44th President of the United States is suppor...
  [DEBUG verifier.py] Snippet 7 (first 100 chars): On May 2, 2011, Pre

In [30]:

result = verify_claim("He was the first to discover it.")
print(result) 


[DEBUG verifier.py] Verifying claim: 'He was the first to discover it.' (top_k=100, sim_threshold=0.7)

[DEBUG verifier.py] Original claim for search: 'He was the first to discover it.'
  [DEBUG verifier.py] Generated search query: 'He AND it facts' with 100 results...


  with DDGS() as ddgs:


  [DEBUG verifier.py] Snippet 1 (first 100 chars): HE definition: 1. used as the subject of a verb to refer to a man, boy, or male animal that has alre...
  [DEBUG verifier.py] Snippet 2 (first 100 chars): The meaning of HE is that male one who is neither speaker nor hearer. How to use he in a sentence....
  [DEBUG verifier.py] Snippet 3 (first 100 chars): Today, he is the only masculine pronoun in English. In the 18th century, it was suggested as a gende...
  [DEBUG verifier.py] Snippet 4 (first 100 chars): Another is to use the masculine and feminine singular pronouns together: he or she, she or he; he/sh...
  [DEBUG verifier.py] Snippet 5 (first 100 chars): The use of he and other masculine pronouns to refer to an unspecified person or to people in general...
[DEBUG verifier.py] Filtered 0/5 snippets above similarity threshold 0.7
[DEBUG verifier.py] No snippets available after filtering for the claim 'He was the first to discover it.'. Returning 'uncertain'.
{'claim': 'He was the f