In [1]:
import pandas as pd
import json
import os
from tqdm import tqdm

# Load the question-generation prompt template. The encoding is pinned to UTF-8,
# like the JSON caches opened elsewhere in this notebook, so the template text
# does not depend on the platform's default locale encoding.
with open("prompts/questions_template_v2.txt", "r", encoding="utf-8") as f:
    question_prompt_template = f.read()

# Folder where generated queries and search results are cached
save_folder = "data"

# Load the claims table
df = pd.read_csv('data/fnd_politifact_claims.csv')

# Optional: work on a random subset of the claims (uncomment to enable)
# df = df.sample(1500).reset_index(drop=True)

# Display the available columns and the number of claims
df.columns, len(df)

(Index(['claim', 'claim_factcheck_url', 'claim_author', 'claim_source',
        'claim_date', 'fact_check_date', 'justification',
        'fact_checking_sources', 'issue', 'label'],
       dtype='object'),
 3377)

In [2]:
from datetime import datetime

# Input format: Month Day, Year
# Output format: YYYY-MM-DD
def extract_and_format_date(check_date, default_date="January 1, 2024"):
    """Convert a "Month Day, Year" date string to ISO "YYYY-MM-DD" format.

    Args:
        check_date: Date string such as "March 5, 2021". The sentinel
            "UNKNOWN", a blank string, or any non-string value (e.g. None or
            the NaN pandas uses for an empty CSV cell) means that no date was
            provided. Surrounding whitespace is ignored.
        default_date: Date string in the same input format, used whenever
            check_date is not provided.

    Returns:
        The selected date formatted as "YYYY-MM-DD".

    Raises:
        ValueError: If the selected date string does not match "%B %d, %Y".
    """
    # Non-string inputs (None, NaN, ...) cannot be parsed, so treat them as missing
    date_str = check_date.strip() if isinstance(check_date, str) else ""

    # Fall back to the default when the date is missing or explicitly unknown
    if not date_str or date_str == "UNKNOWN":
        date_str = default_date

    # Parse the "Month Day, Year" string and re-emit it as YYYY-MM-DD
    return datetime.strptime(date_str, "%B %d, %Y").strftime("%Y-%m-%d")

def parse_gemini_response(response_text):
    """Extract and decode the JSON payload from a model response.

    The payload is expected inside a Markdown fence opened with "```json".
    When no such fence is present the whole text is decoded as JSON instead
    of failing on the missing fence, and any line-ending style after the
    opening fence is accepted.

    Args:
        response_text: Raw text of the model response.

    Returns:
        The decoded JSON value (whatever ``json.loads`` produces for it).

    Raises:
        json.JSONDecodeError: If the extracted text is not valid JSON.
    """
    fence_open = "```json"
    start = response_text.find(fence_open)
    if start == -1:
        # No fenced block at all: assume the response is bare JSON
        payload = response_text
    else:
        body_start = start + len(fence_open)
        # The closing fence sits at the start of a line; an unterminated
        # block simply runs to the end of the text
        body_end = response_text.find("\n```", body_start)
        if body_end == -1:
            payload = response_text[body_start:]
        else:
            payload = response_text[body_start:body_end]

    # strip() drops the newline (or "\r\n") that follows the opening fence
    return json.loads(payload.strip())

In [3]:
from utils.gemini_interface_parallel import batch_process

# Gemini access and sampling settings
secrets_file = "secrets/gemini_keys_new.json"
model_name = "gemini-2.0-flash-exp"
# model_name = "gemini-1.5-flash-latest"
temperature, top_p = 0.75, 0.95

# Step 1: generate queries for all claims.
# Resume from the queries cached by a previous run, or start empty when no
# cache file exists yet.
claim_queries_filename = f"{save_folder}/claim_queries_v2.json"
try:
    cache_file = open(claim_queries_filename, "r", encoding="utf-8")
except FileNotFoundError:
    claim_queries = {}
else:
    with cache_file:
        claim_queries = json.load(cache_file)

print(f"{len(claim_queries)} claims already processed")

2684 claims already processed


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
def _field_or_default(row, column, default):
    """Return ``row[column]`` as a stripped string, or ``default`` if unusable.

    A column that is absent, empty, blank, or NaN (how pandas represents an
    empty CSV cell) yields ``default``. NaN is truthy and has no ``.strip()``,
    so a plain truthiness check is not enough to detect a missing value.
    """
    value = row[column] if column in row else None
    if value is None or (not isinstance(value, str) and pd.isna(value)):
        return default
    text = str(value).strip()
    return text if text else default


# Build one prompt per claim that has no cached queries yet
prompts = []
queued_claims = set()
for index, row in tqdm(df.iterrows(), total=len(df)):
    claim = row["claim"]
    # A missing claim (NaN) cannot be turned into a prompt. Claims that are
    # already cached, or already queued in this run, would only repeat an API
    # call whose result is stored under the same claim text.
    if not isinstance(claim, str) or claim in claim_queries or claim in queued_claims:
        continue
    queued_claims.add(claim)

    author = _field_or_default(row, "claim_author", "UNKNOWN")
    claim_date = _field_or_default(row, "claim_date", "UNKNOWN")
    location_ISO_code = _field_or_default(row, "location_ISO_code", "US")

    # Fill the placeholders of the question-generation template
    prompt = (
        question_prompt_template
        .replace("[Insert the claim here]", claim)
        .replace("[Insert the claim speaker here]", author)
        .replace("[Insert the claim date here]", claim_date)
        .replace("[Insert the location ISO code here]", location_ISO_code)
    )

    # The claim travels with the prompt so each response can be matched back to it
    prompts.append(("chat", prompt, claim))

# Skip the API round-trip entirely when every claim is already cached
responses = batch_process(prompts, secrets_file, model_name, temperature, top_p) if prompts else []

# Claims for which no response came back
failed_row_ids = []
for response in responses:
    claim = response[1]
    if response[0] is None:
        failed_row_ids.append(claim)
        continue

    try:
        llm_decompostions = parse_gemini_response(response[0].text)
        # De-duplicate while keeping the generated order, so the saved file is
        # deterministic (a set would shuffle the queries between runs)
        llm_decompostions = list(dict.fromkeys(d.strip() for d in llm_decompostions))
        claim_queries[claim] = [(search_string, "generated_decomposition")
                                for search_string in llm_decompostions]
    except Exception:
        # Keep going on malformed responses, but let KeyboardInterrupt through
        print("Failed to parse response for claim:", claim)
        print(response)
        continue

if failed_row_ids:
    # These claims are still absent from claim_queries, so re-running this cell retries them
    print(f"{len(failed_row_ids)} claims received no response")

# Save claim_queries once, after all responses have been handled
with open(claim_queries_filename, "w", encoding="utf-8") as fp:
    json.dump(claim_queries, fp, indent=4)

In [5]:
from utils.serper_customsearch import SerperCustomSearch
# Search client used by the result-fetching cell below. It is constructed from
# the path of a secrets file (presumably holding the Serper API keys, given the
# key-rotation messages it prints - TODO confirm against SerperCustomSearch).
serper_search = SerperCustomSearch("secrets/serper_secrets.json")

Skipping depleted API key
Changed Serper search SECRET,  1180  calls remaining


In [13]:
def _write_json_atomically(payload, path):
    """Write ``payload`` as JSON to ``path`` without exposing a partial file.

    The data is first written to a sibling temporary file and then moved over
    ``path`` with ``os.replace``. An interruption during the (long-running)
    fetch loop can therefore never truncate the existing checkpoint.
    """
    tmp_path = f"{path}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as fp:
        json.dump(payload, fp, indent=4)
    os.replace(tmp_path, path)


# Load existing search results so that a re-run resumes where the last one stopped
results_filename = f"{save_folder}/search_results_v2.json"
if os.path.exists(results_filename):
    with open(results_filename, "r", encoding="utf-8") as fp:
        search_results = json.load(fp)
else:
    search_results = {}

# Fetch links for each claim
# NOTE(review): the location is fixed to "US" and no date cut-off is passed,
# even though per-claim values exist in the claims table - confirm this is intended.
pages = 1
for claim, queries in tqdm(claim_queries.items(), total=len(claim_queries)):
    if claim in search_results:
        continue

    # Collect into a local dict first: the claim is only registered once all of
    # its queries succeeded, so a failure cannot leave a half-filled entry behind
    claim_results = {}
    for query in queries:
        # query is a (search_string, origin_tag) pair; only the string is searched
        claim_results[query[0]] = serper_search.fetch_results(search_string=query[0],
                                                              pages_before_date="",
                                                              location_ISO_code="US",
                                                              n_pages=pages)
    search_results[claim] = claim_results

    # Checkpoint after every claim so paid search calls are never lost
    _write_json_atomically(search_results, results_filename)


  0%|          | 0/2684 [00:00<?, ?it/s]

 28%|██▊       | 740/2684 [21:45<1:48:29,  3.35s/it]

Changed Serper search SECRET,  2500  calls remaining


 53%|█████▎    | 1422/2684 [1:10:13<1:12:22,  3.44s/it]

Changed Serper search SECRET,  2500  calls remaining


 79%|███████▉  | 2116/2684 [2:03:00<47:16,  4.99s/it]  

Changed Serper search SECRET,  2500  calls remaining


100%|██████████| 2684/2684 [2:50:00<00:00,  3.80s/it]
