In [1]:
# The purpose of this notebook is to rank the evidence for each claim.
# - Each claim is decomposed into a set of queries.
# - Each query is used to fetch evidence from the web. (10 links per query)
# - This module shall find the top k links per query. 
# - Each evidence must be assigned two scores. 
#     * Use title + snippet text of evidence.
#     * Semantic relevance score [Set a threshold for this score]
#     * Keyword matching score [Set a threshold for this score]
#     * If the score is above the threshold, this evidence will be shortlisted.

import json
import os
import random
import numpy as np
from sentence_transformers import util
import gc
import pandas as pd

# Project root: the relative paths used below (data/, secrets/) are resolved against it,
# and the project-local imports are executed only after changing into it.
# Set the FND_PROJECT_ROOT environment variable to run the notebook from another checkout;
# when it is unset the original location is used, so existing runs behave the same.
PROJECT_ROOT = os.environ.get(
    "FND_PROJECT_ROOT",
    "/Users/abz/Desktop/UNB/Thesis/Code/Thesis-Code/FNDdataset",
)
os.chdir(PROJECT_ROOT)

from utils.gemini_interface import GeminiAPI
from crawl4evidence import should_filter_link
from utils.webpage_crawler import WebpageProcessor

# Search results are indexed as claim -> query -> page key -> list of result dicts
# (this is how the later cells traverse the structure).
with open("data/search_results_v2.json", "r", encoding="utf-8") as fp:
    search_results = json.load(fp)

# Claims table; the later cells read its 'claim' and 'label' columns.
dataset = pd.read_csv("data/fnd_politifact_claims.csv")

# Clients used by the later cells: text embeddings and web page extraction.
gemini_api = GeminiAPI(secrets_file="secrets/gemini_keys_new.json")
processor = WebpageProcessor()

  from tqdm.autonotebook import tqdm, trange


Using key: ****Luo_s


In [21]:
# Draw one claim at random and fetch the search results stored for it
(claim,) = random.sample(list(search_results), 1)
results = search_results[claim]

# Show the claim next to the label of its first matching row in the claims table
claim_rows = dataset[dataset["claim"] == claim]
print(claim, claim_rows.iloc[0]["label"])

# The keys of `results` are the queries derived from the claim
print("Queries:\n", "\n".join(results))

# Embed the claim text and keep the embedding as a numpy array
claim_embedding = gemini_api.get_text_embeddings([claim], task="semantic_similarity")
claim_embedding_np = np.array(claim_embedding)

Queries:
 Japan dropped all vaccine mandates
Vaccine mandates in Japan in 2022


In [22]:
all_top_n_evidences = {}
threshold = 0.7

# For every query, shortlist the evidences whose title + snippet is semantically similar to it
for claim_query, page_results in results.items():
    # Collect (link, "title snippet") pairs for the usable search results of this query
    evidences_descriptions = []
    for evidence_list in page_results.values():
        for evidence in evidence_list:
            if should_filter_link(evidence["link"]):
                continue
            if "title" not in evidence or "snippet" not in evidence:
                continue
            title_snippet = evidence["title"] + " " + evidence["snippet"]
            evidences_descriptions.append((evidence["link"], title_snippet.strip()))

    # Every result of a query can be filtered out. With nothing to embed, the similarity
    # computation below has no 2-D evidence matrix to work on (the axis=1 norm and the
    # [1] index both need one), so record an empty shortlist and skip the embedding calls.
    if not evidences_descriptions:
        all_top_n_evidences[claim_query] = []
        continue

    # Generate claim_query embedding
    claim_query_embedding = gemini_api.get_text_embeddings([claim_query], task="semantic_similarity")
    claim_query_embedding_np = np.array(claim_query_embedding)

    # Generate embeddings for each evidence description
    evidences_embeddings = gemini_api.get_text_embeddings([v[1] for v in evidences_descriptions], task="semantic_similarity")
    evidences_embeddings_np = np.array(evidences_embeddings)

    # Calculate cosine similarity between claim_query and evidences
    # f(x, y) = x.y / (||x|| * ||y||)
    # Assumes get_text_embeddings returns one vector per input text, which makes this a
    # (1, n_evidences) array - TODO confirm against the GeminiAPI wrapper.
    similarity_scores = np.dot(claim_query_embedding_np, evidences_embeddings_np.T) / (np.linalg.norm(claim_query_embedding_np) * np.linalg.norm(evidences_embeddings_np, axis=1))

    # np.where on a 2-D array returns (row indices, column indices); the columns index the evidences
    pick_indices = np.where(similarity_scores >= threshold)[1]

    all_top_n_evidences[claim_query] = [evidences_descriptions[idx] for idx in pick_indices.flatten()]

# Merge the per-query shortlists into one link -> snippet mapping; the first occurrence of a link wins
combined_semantic_evidences = {}
for query_evidences in all_top_n_evidences.values():
    for (link, snippet) in query_evidences:
        if link not in combined_semantic_evidences:
            combined_semantic_evidences[link] = snippet
combined_semantic_evidences

Blacklisted file type:  https://www.healthdata.org/sites/default/files/covid_briefs/67_briefing_Japan.pdf


{'https://fullfact.org/online/covid-19-vaccine-mandate-japan/': 'Japan has never enforced a Covid-19 vaccine mandate - Full Fact Our verdict. Japan never enforced a Covid-19 vaccine mandate. Its government websites clearly state vaccination is encouraged but optional.',
 'https://africacheck.org/fact-checks/meta-programme-fact-checks/japan-has-never-had-covid-vaccine-mandate-warns-rare-heart': "Japan has never had a Covid vaccine mandate – but warns of 'rare ... The claim that Japan has “dropped all vaccine mandates” is false – the country never had them. But Japan's government has warned that ...",
 'https://www.bloomberg.com/news/articles/2021-11-16/japan-leads-the-g-7-in-covid-shots-without-a-mandate-in-sight': 'Japan Leads the G-7 in Covid Shots Without a Mandate in Sight The country -- which started doling out immunizations months after the US -- has the highest inoculation rate among the Group of Seven, and it did it without ...',
 'https://www.asiapacific.ca/publication/japans-v

In [23]:
failed_links = []
for link in list(combined_semantic_evidences.keys()):  # snapshot of the keys; the values are replaced below
    stored_value = combined_semantic_evidences[link]
    # On a re-run of this cell the value is already a [full_evidence, snippet] pair;
    # unwrap the snippet so the entry is not nested a second time.
    snippet_evidence = stored_value[1] if isinstance(stored_value, list) else stored_value

    full_evidence = ""
    try:
        # Get page content using the processor
        page_json = processor.url2lines(link, method="trafilatura")

        if page_json == {}:
            print(f"Error processing {link}")
            failed_links.append(link)
        else:
            full_evidence = page_json['text']
    except Exception as e:
        # Report the failure instead of swallowing it silently, and fall through so the
        # entry still gets its [full_evidence, snippet] shape.
        print(f"Error retrieving content for {link}: {e}")
        failed_links.append(link)
        full_evidence = ""

    # Always store a pair: the BM25 step unpacks every value as (content, snippet),
    # which fails if a value is left as the bare snippet string.
    combined_semantic_evidences[link] = [full_evidence, snippet_evidence]
    gc.collect()

No page found
Error processing https://www.reuters.com/article/fact-check/japan-has-not-authorized-ivermectin-to-treat-covid-19-or-revoked-a-vaccine-manda-idUSL1N2TX1GK/
No page found
Error processing https://www.sciencedirect.com/science/article/abs/pii/S1341321X24002095
No page found
Error processing https://www.ahajournals.org/doi/10.1161/CIRCRESAHA.122.321881
No page found
Error processing https://www.sciencedirect.com/science/article/pii/S2666606522000578
No page found
Error processing https://www.reuters.com/business/healthcare-pharmaceuticals/us-panel-review-heart-inflammation-cases-after-pfizer-moderna-vaccines-2021-06-23/
No page found
Error processing https://www.mofa.go.jp/ca/fna/page24e_000317.html
No page found
Error processing https://www.thelancet.com/journals/lanwpc/article/PIIS2666-6065(23)00203-1/fulltext
No page found
Error processing https://www.sciencedirect.com/science/article/pii/S0264410X23002827


In [24]:
from rank_bm25 import BM25Okapi

# Extract the evidence content (full_evidence) from the combined_semantic_evidences dictionary
evidence_contents = [content for (content, _) in combined_semantic_evidences.values()]

# Tokenize the evidence content and the claim on single spaces.
# The tokenization is left unchanged so the scores stay comparable with the
# threshold applied in the filtering cell.
tokenized_evidence_contents = [content.split(" ") for content in evidence_contents]
tokenized_claim = claim.split(" ")

if tokenized_evidence_contents:
    # Create a BM25Okapi object and score every evidence against the claim
    bm25 = BM25Okapi(tokenized_evidence_contents)
    doc_scores = bm25.get_scores(tokenized_claim)
else:
    # No evidence passed the semantic filter. BM25Okapi is not built on an empty corpus
    # (its average document length would divide by a corpus size of 0), so produce an
    # empty score array that the following cells can still zip over.
    bm25 = None
    doc_scores = np.array([])

In [25]:
# List every link with its BM25 score; doc_scores is paired with the dictionary keys by position
for link, score in zip(combined_semantic_evidences, doc_scores):
    print(score, link)

13.399389016377933 https://fullfact.org/online/covid-19-vaccine-mandate-japan/
13.70206144536439 https://africacheck.org/fact-checks/meta-programme-fact-checks/japan-has-never-had-covid-vaccine-mandate-warns-rare-heart
0.0 https://www.bloomberg.com/news/articles/2021-11-16/japan-leads-the-g-7-in-covid-shots-without-a-mandate-in-sight
0.7720576435694275 https://www.asiapacific.ca/publication/japans-vaccine-hesitancy-implications-covid-olympics
0.0 https://www.reuters.com/article/fact-check/japan-has-not-authorized-ivermectin-to-treat-covid-19-or-revoked-a-vaccine-manda-idUSL1N2TX1GK/
0.0 https://www.japantimes.co.jp/news/2021/11/17/national/japan-g7-coronavirus-vaccinations/
0.0 https://www.sciencedirect.com/science/article/abs/pii/S1341321X24002095
4.913923467202693 https://pmc.ncbi.nlm.nih.gov/articles/PMC9618301/
0.0 https://www.ahajournals.org/doi/10.1161/CIRCRESAHA.122.321881
0.0 https://www.sciencedirect.com/science/article/pii/S2666606522000578
0.0 https://www.reuters.com/busines

In [None]:
# Sites whose page content could not be crawled:
# bloomberg.com/news/articles
# reuters.com
# sciencedirect.com
# thelancet.com

In [26]:
# Keep the evidences whose BM25 score is strictly above 2.0 (scores and dictionary
# entries are paired by position), then list the surviving links
filtered_evidences = dict(
    entry
    for score, entry in zip(doc_scores, combined_semantic_evidences.items())
    if score > 2.0
)
for k in filtered_evidences:
    print(k)

https://fullfact.org/online/covid-19-vaccine-mandate-japan/
https://africacheck.org/fact-checks/meta-programme-fact-checks/japan-has-never-had-covid-vaccine-mandate-warns-rare-heart
https://pmc.ncbi.nlm.nih.gov/articles/PMC9618301/
https://japan.kantei.go.jp/ongoingtopics/vaccine.html


In [2]:
import json
import os
import random
import numpy as np
# from sentence_transformers import util
import gc
import pandas as pd
from rank_bm25 import BM25Okapi

# Configuration (can be moved to a separate config file)
SEARCH_RESULTS_FILE = "data/search_results_v2.json"
CLAIMS_DATA_FILE = "data/fnd_politifact_claims.csv"
GEMINI_SECRETS_FILE = "secrets/gemini_keys_new.json"
SEMANTIC_SIMILARITY_THRESHOLD = 0.7
BM25_SCORE_THRESHOLD = 2.0
# Set the FND_PROJECT_ROOT environment variable to run from another checkout;
# when it is unset the original location is used.
PROJECT_ROOT = os.environ.get(
    "FND_PROJECT_ROOT",
    "/Users/abz/Desktop/UNB/Thesis/Code/Thesis-Code/FNDdataset",
)

# The relative file paths above are resolved against the project root, and the
# project-local imports that follow are executed only after changing into it.
os.chdir(PROJECT_ROOT)

# Imports (consider moving these to the top if they are used globally)
from utils.gemini_interface import GeminiAPI
from crawl4evidence import should_filter_link
from utils.webpage_crawler import WebpageProcessor

# --- Helper Functions ---
def load_data(search_results_file, claims_data_file):
    """Read the stored search results and the claims table from disk.

    Args:
        search_results_file: Path of the JSON file with the search results (read as UTF-8).
        claims_data_file: Path of the CSV file with the claims.

    Returns:
        A (search_results, dataset) tuple: the parsed JSON content and a pandas DataFrame.
    """
    with open(search_results_file, "r", encoding="utf-8") as handle:
        raw_json = handle.read()
    # The JSON is parsed before the CSV is read, matching the order of the two loads.
    return json.loads(raw_json), pd.read_csv(claims_data_file)

def initialize_apis(gemini_secrets_file):
    """Create the two clients the pipeline works with.

    Args:
        gemini_secrets_file: Path handed to GeminiAPI as its secrets_file argument.

    Returns:
        A (gemini_api, processor) tuple: a GeminiAPI instance and a WebpageProcessor instance.
    """
    # Tuple elements are evaluated left to right, so GeminiAPI is constructed first.
    return GeminiAPI(secrets_file=gemini_secrets_file), WebpageProcessor()

def select_claim(search_results):
    """Return one randomly chosen key (claim) of the search_results mapping."""
    (picked_claim,) = random.sample(list(search_results), 1)
    return picked_claim

def generate_embeddings(api, texts, task="semantic_similarity"):
    """Embed texts through the given API object and return the result as a numpy array.

    Args:
        api: Object exposing get_text_embeddings(texts, task=...).
        texts: The texts to embed.
        task: Task name forwarded to the API call.

    Returns:
        np.ndarray built from whatever the API call returns.
    """
    return np.array(api.get_text_embeddings(texts, task=task))

def filter_evidence_by_semantic_similarity(embed_api, claim_query, claim_results, threshold):
    """Filters evidence based on semantic similarity to the claim query.

    Args:
        embed_api: The embedding API object, passed through to generate_embeddings.
        claim_query: The claim query (str).
        claim_results: A dictionary whose values are lists of search-result dicts.
            Each result is read through its "link", "title" and "snippet" keys.
        threshold: The semantic similarity threshold (inclusive).

    Returns:
        A list of (link, "title snippet") tuples whose cosine similarity to the query is
        at least the threshold. The list is empty when no result is usable.
    """
    # Collect the usable results first: links rejected by should_filter_link and results
    # without both a title and a snippet are skipped.
    evidences_descriptions = []
    for page_results in claim_results.values():
        for evidence in page_results:
            if should_filter_link(evidence["link"]):
                continue
            if "title" not in evidence or "snippet" not in evidence:
                continue
            title_snippet = evidence["title"] + " " + evidence["snippet"]
            evidences_descriptions.append((evidence["link"], title_snippet.strip()))

    # With nothing to compare there is no 2-D evidence matrix for the similarity
    # computation below, so return early and skip both embedding calls.
    if not evidences_descriptions:
        return []

    claim_query_embedding = generate_embeddings(embed_api, [claim_query])
    evidences_embeddings = generate_embeddings(
        embed_api, [description for _, description in evidences_descriptions]
    )

    # Cosine similarity: x.y / (||x|| * ||y||), one column per evidence.
    similarity_scores = np.dot(claim_query_embedding, evidences_embeddings.T) / (
        np.linalg.norm(claim_query_embedding) * np.linalg.norm(evidences_embeddings, axis=1)
    )

    # np.where on the 2-D score array returns (row indices, column indices);
    # the column indices are the evidence positions.
    pick_indices = np.where(similarity_scores >= threshold)[1]
    return [evidences_descriptions[idx] for idx in pick_indices.flatten()]

def get_full_page_content(processor, link):
    """Fetch the text of a web page through the processor.

    Args:
        processor: Object exposing url2lines(link, method=...).
        link: The URL of the web page.

    Returns:
        The "text" entry of the processor's result (str). An empty string is returned
        when the result is an empty dict or when any exception is raised; both cases
        print a message.
    """
    try:
        page_json = processor.url2lines(link, method="trafilatura")
        if page_json != {}:
            return page_json["text"]
        # An empty dict is the processor's "nothing extracted" result.
        print(f"Error processing {link}")
    except Exception as err:
        print(f"Error retrieving content for {link}: {err}")
    return ""

def combine_semantic_evidences(all_top_n_evidences, processor):
    """Attach the full page content to every shortlisted evidence link.

    Args:
        all_top_n_evidences: A dictionary mapping each query to a list of (link, snippet) tuples.
        processor: The web page processor handed to get_full_page_content.

    Returns:
        A (combined_semantic_evidences, failed_links) tuple. The first element maps each
        link to [full_evidence, snippet_evidence]; a link seen under several queries is
        fetched once and keeps its first snippet. The second element lists the links
        whose content came back as an empty string.
    """
    combined = {}
    failures = []
    shortlisted = (
        pair for per_query in all_top_n_evidences.values() for pair in per_query
    )
    for link, snippet in shortlisted:
        if link in combined:
            continue
        page_text = get_full_page_content(processor, link)
        if page_text == "":
            failures.append(link)
        combined[link] = [page_text, snippet]
        gc.collect()
    return combined, failures

def calculate_bm25_scores(evidence_contents, claim):
    """Calculates BM25 scores for a list of evidence contents against a claim.

    Both the evidence contents and the claim are tokenized by splitting on single spaces.

    Args:
        evidence_contents: A list of evidence contents (str).
        claim: The claim (str).

    Returns:
        The BM25 scores as returned by BM25Okapi.get_scores, one per evidence content.
        An empty numpy array is returned when evidence_contents is empty.
    """
    # BM25Okapi is not built on an empty corpus (its average document length would
    # divide by a corpus size of 0); with no documents there is nothing to score.
    if len(evidence_contents) == 0:
        return np.array([])

    tokenized_evidence_contents = [content.split(" ") for content in evidence_contents]
    bm25 = BM25Okapi(tokenized_evidence_contents)
    tokenized_claim = claim.split(" ")
    return bm25.get_scores(tokenized_claim)

def filter_evidence_by_bm25_score(combined_semantic_evidences, doc_scores, threshold):
    """Keep the evidence entries whose BM25 score is strictly above the threshold.

    Scores and dictionary entries are paired by position; pairing stops at the
    shorter of the two.

    Args:
        combined_semantic_evidences: A dictionary of combined evidence, keyed by link.
        doc_scores: The BM25 scores, in the same order as the dictionary entries.
        threshold: The BM25 score threshold (exclusive).

    Returns:
        A new dictionary with the entries that passed, in their original order.
    """
    kept = {}
    for score, (link, evidence) in zip(doc_scores, combined_semantic_evidences.items()):
        if score > threshold:
            kept[link] = evidence
    return kept

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Read the stored search results / claims table and create the API clients
search_results, dataset = load_data(SEARCH_RESULTS_FILE, CLAIMS_DATA_FILE)
gemini_api, processor = initialize_apis(GEMINI_SECRETS_FILE)

# Pick the claim to work on and the search results stored for it
claim = select_claim(search_results)
results = search_results[claim]

# Show the claim with the label of its first matching row, then its queries
claim_rows = dataset[dataset["claim"] == claim]
print("Claim Label: ", claim, claim_rows.iloc[0]["label"])
print("Queries:\n", "\n".join(results))

# Embed the claim text
claim_embedding = generate_embeddings(gemini_api, [claim])

# Stage 1 - semantic filter: shortlist the evidences of every query
all_top_n_evidences = {}
for claim_query, page_results in results.items():
    top_n_evidences = filter_evidence_by_semantic_similarity(
        gemini_api,
        claim_query,
        page_results,
        SEMANTIC_SIMILARITY_THRESHOLD,
    )
    all_top_n_evidences[claim_query] = top_n_evidences

# Fetch the full page content for every shortlisted link
combined_semantic_evidences, failed_links = combine_semantic_evidences(
    all_top_n_evidences, processor
)
print("Failed links:", failed_links)

# Stage 2 - keyword filter: BM25 score of each page's full text against the claim
evidence_contents = [
    full_text for (full_text, _snippet) in combined_semantic_evidences.values()
]
doc_scores = calculate_bm25_scores(evidence_contents, claim)
filtered_evidences = filter_evidence_by_bm25_score(
    combined_semantic_evidences, doc_scores, BM25_SCORE_THRESHOLD
)

print("Filtered Evidences (BM25):")
for k in filtered_evidences:
    print(k)

Using key: ****Luo_s
Claim Label:  President Joe Biden said, “The idea that we’re going to send in tanks to Ukraine, that’s called World War III.” False
Queries:
 Joe Biden believes sending tanks to Ukraine would cause World War III
President Joe Biden said, 'The idea that we’re going to send in tanks to Ukraine, that’s called World War III.'
What is the US policy regarding sending tanks to Ukraine in 2023
No page found
Error processing https://www.reuters.com/article/fact-check/comparison-of-biden-remarks-on-sending-tanks-to-ukraine-are-missing-context-idUSL1N34J2NI/
Failed links: ['https://www.reuters.com/article/fact-check/comparison-of-biden-remarks-on-sending-tanks-to-ukraine-are-missing-context-idUSL1N34J2NI/']
Filtered Evidences (BM25):
https://www.cnn.com/2023/02/02/politics/fact-check-biden-world-war-iii/index.html
https://www.newsweek.com/fact-check-did-biden-say-sending-tanks-ukraine-would-cause-world-war-3-1778466
https://www.usatoday.com/story/news/factcheck/2023/02/17/fac