In [1]:
import pandas as pd
import ast
from ftlangdetect import detect
import json

# Load the dataset
df = pd.read_csv('data/fnd_politifact_claims_final.csv')

# The JSON side files are opened with an explicit UTF-8 encoding so that the
# decoded text does not depend on the platform's default locale encoding.
# This matters because the loaded dictionaries are looked up by claim text
# later in the notebook, and claim text contains non-ASCII characters.

# Load impact scores
with open('data/claim_impact_scores.json', 'r', encoding='utf-8') as f:
    impact_scores = json.load(f)

# Load claim credibility scores
with open('data/claim_credibility.json', 'r', encoding='utf-8') as f:
    claim_credibility_scores = json.load(f)

# Load claim queries
with open('data/claim_queries.json', 'r', encoding='utf-8') as f:
    claim_queries = json.load(f)

### Remove all non-English claims

In [None]:
# Flag every claim that the language detector classifies as English.
# Newlines are replaced with spaces before the text is passed to `detect`.
is_english = [
    detect(claim_text.replace("\n", " "))["lang"] == "en"
    for claim_text in df["claim"]
]

# Select all English rows in one step instead of concatenating row by row
# (repeated pd.concat in a loop is quadratic and turns every column into
# object dtype). The index is reset so it runs 0..n-1 over the kept rows.
df_english = df.loc[is_english].reset_index(drop=True)

english_claim_count = len(df_english)
print("Filtered Dataset size: ", len(df_english))

# Convert claim_date to the string format "Month Day, Year".
# The column is parsed with pd.to_datetime first: the CSV is read without
# `parse_dates`, so the column is not datetime64 and calling `.dt` on it
# directly raises AttributeError.
df_english['claim_date'] = pd.to_datetime(df_english['claim_date']).dt.strftime('%B %d, %Y')

Filtered Dataset size:  2557


### Each claim must be converted to a JSON object

In [24]:
# Each claim is JSON object of the format:

# 0. claim_id
# 1. claim
# 2. claim_factcheck_url
# 3. claim_author
# 4. claim_source
# 5. claim_date
# 6. fact_check_date
# 7. justification
# 8. fact_checking_sources
# 9. issue
# 10. label
# 11. queries
# 12. domain_authority_metrics => {
#     "domain_age",
#     "tld_score",
#     "page_rank",
#     "bias_rating_score",
#     "factual_rating_score",
#     "credibility_rating_score"
# }
# 13. impact_analysis => {
#     "Impact Score",
#     "Justification"
# }

In [None]:
# Columns copied unchanged from each DataFrame row into its JSON record,
# listed in the order they appear in the output object.
passthrough_columns = [
    "claim",
    "claim_factcheck_url",
    "claim_author",
    "claim_source",
    "claim_date",
    "fact_check_date",
    "justification",
    "fact_checking_sources",
    "issue",
    "label",
]

dataset_json = []
for claim_id, record in df_english.iterrows():
    # The row's index label serves as the claim identifier.
    entry = {"claim_id": claim_id}
    for column in passthrough_columns:
        entry[column] = record[column]

    # The auxiliary dictionaries are keyed by the claim text itself.
    claim_text = record["claim"]
    # Only the first element of each stored query item is kept.
    entry["queries"] = [query[0] for query in claim_queries[claim_text]]
    entry["domain_authority_metrics"] = claim_credibility_scores[claim_text]
    entry["impact_analysis"] = impact_scores[claim_text]
    dataset_json.append(entry)

# Persist the assembled records as pretty-printed JSON.
with open("data/dataset_politifact.json", "w") as out_file:
    json.dump(dataset_json, out_file, indent=4)

In [2]:
# Display the first assembled record to sanity-check the JSON schema.
dataset_json[0]

{'claim_id': 0,
 'claim': '“JD Vance actually sent a letter last year to the Department of Justice saying, ‘enforce the Comstock Act.’”',
 'claim_factcheck_url': 'https://www.politifact.com/factchecks/2024/aug/30/elizabeth-warren/jd-vance-joined-republican-letter-urging-doj-to-en/',
 'claim_author': 'Elizabeth Warren',
 'claim_source': 'an interview',
 'claim_date': 'August 25, 2024',
 'fact_check_date': 'August 30, 2024',
 'justification': 'Warren said Vance sent a letter to the Justice Department asking it to enforce the Comstock Act.\n\nVance in 2023 signed onto a letter to the department with about 40 other Republicans. The letter challenged the department’s interpretation of the law and demanded that it shut down all mailing of abortion pills.\nSince becoming the Republican vice presidential candidate, Vance has said he supports a Supreme Court decision that left abortion medication on the market.\n\nWarren’s statement is accurate. We rate it True.',
 'fact_checking_sources': "['h