In [3]:
# Load the claims file into `claims_by_issue`.
# NOTE(review): the path is relative to the current working directory; this
# cell does not change directories itself, so run the notebook from the
# directory that contains `data/`.
import json

with open('data/claims_by_issue.json', encoding='utf-8') as claims_file:
    claims_by_issue = json.load(claims_file)

In [2]:
import trafilatura
from trafilatura.settings import DEFAULT_CONFIG
from bs4 import BeautifulSoup
from time import sleep
import sys
from tqdm import tqdm
from time import sleep

def get_page(url, max_attempts=3, base_delay=1):
    """Download ``url`` with trafilatura, retrying when the fetch fails.

    Args:
        url: Address of the page to download.
        max_attempts: Number of attempts before giving up (default 3).
        base_delay: Base wait in seconds; after the k-th failed attempt the
            function sleeps ``k * base_delay`` seconds before retrying.

    Returns:
        The value returned by ``trafilatura.fetch_url`` on the first attempt
        that yields something other than ``None``, or ``None`` when every
        attempt failed.
    """
    for attempt in range(1, max_attempts + 1):
        try:
            page = trafilatura.fetch_url(url, config=DEFAULT_CONFIG)
            reason = "no content returned"
        except Exception as e:
            # Deliberate best-effort: any download error just triggers a retry.
            page = None
            reason = str(e) or type(e).__name__

        if page is not None:
            return page

        print(f"Attempt {attempt} with trafilatura failed for {url}: {reason}", file=sys.stderr)
        # No point waiting once the last attempt has already failed.
        if attempt < max_attempts:
            sleep(attempt * base_delay)
    return None

def html2lines(page):
    """Run trafilatura's extractor on an HTML document and return it as a dict.

    Args:
        page: HTML document as a string.

    Returns:
        The JSON output of ``trafilatura.extract`` parsed into a Python
        object. An empty dict is returned when ``page`` is empty or blank,
        when the extractor yields nothing, or when its output cannot be
        parsed as JSON.
    """
    if not page or len(page.strip()) == 0:
        print("No page found")
        return {}

    text = trafilatura.extract(
        page,
        favor_precision=True,
        with_metadata=True,
        no_fallback=False,
        include_comments=True,
        output_format="json",
    )
    # Handle "nothing extracted" explicitly instead of letting json.loads
    # raise a confusing TypeError on None.
    if text is None:
        print("No content could be extracted from page")
        return {}

    try:
        return json.loads(text)
    except (TypeError, ValueError) as e:
        # json.JSONDecodeError is a ValueError; TypeError covers non-text output.
        print(e)
        return {}

In [3]:
def get_ruling(soup):
    """Return the text of the paragraphs that follow the "Our ruling" heading.

    Args:
        soup: Parsed page exposing the BeautifulSoup ``find`` API.

    Returns:
        The stripped text of every ``<p>`` that appears (in document order)
        after a ``<div>`` whose stripped text is exactly "Our ruling", joined
        with newlines. Returns an empty string when the
        ``article.m-textblock`` container or the heading is missing.
    """
    article = soup.find('article', class_='m-textblock')
    if article is None:
        # Page layout differs from what we expect; don't abort the whole scrape.
        return ''

    ruling_text = []
    found_ruling = False
    for element in article.find_all(['p', 'div']):
        if element.name == 'div' and element.text.strip() == 'Our ruling':
            found_ruling = True
        elif found_ruling and element.name == 'p':
            # Only paragraphs after the heading belong to the ruling.
            ruling_text.append(element.text.strip())

    return '\n'.join(ruling_text).strip()

def get_sources(soup):
    """Collect the absolute http(s) links listed in the page's sources box.

    Args:
        soup: Parsed page exposing the BeautifulSoup ``find`` API.

    Returns:
        List of ``href`` values, in document order, for every ``<a>`` inside
        ``article.m-superbox__content`` whose URL starts with ``http://`` or
        ``https://``. Returns an empty list when that container is missing.
    """
    article = soup.find('article', class_='m-superbox__content')
    if article is None:
        # No sources box on this page; don't abort the whole scrape.
        return []

    # Keep absolute web links only; plain http:// sources count too, not just https://.
    return [
        link['href']
        for link in article.find_all('a', href=True)
        if link['href'].startswith(('http://', 'https://'))
    ]

def get_statement_source(soup):
    """Extract where a statement was made from the statement description.

    The description is read from ``div.m-statement__desc``; the returned
    value is the text between the first "in " and the following ":".

    Args:
        soup: Parsed page exposing the BeautifulSoup ``find`` API.

    Returns:
        The stripped source text, or an empty string when the description
        element is missing or contains no "in ".
    """
    desc_div = soup.find('div', class_="m-statement__desc")
    if not desc_div:
        return ""

    desc_text = desc_div.text.strip()
    # Split on the FIRST "in " only, so a source that itself contains "in "
    # (e.g. "a speech in Milwaukee") is not cut short.
    _, marker, remainder = desc_text.partition("in ")
    if not marker:
        return ""
    return remainder.split(":", 1)[0].strip()

# Build an enriched copy of `claims_by_issue`: every claim gets the ruling
# text, the fact-checking source links and the claim source scraped from its
# fact-check page. Claims whose page cannot be downloaded are left out.
# Each claim dict is copied, so `claims_by_issue` itself is not modified and
# this cell can be re-run safely.
claims_by_issue_clone = {}

# The same statement URL can be listed under more than one issue, so scraped
# metadata is cached per URL to avoid downloading a page twice.
scraped_by_url = {}

for issue, categories in claims_by_issue.items():
    print(f"Processing issue: {issue}")

    claims_by_issue_clone[issue] = {}
    for category, claim_objects in tqdm(categories.items(), total=len(categories)):
        enriched_claims = []
        claims_by_issue_clone[issue][category] = enriched_claims
        for claim_object in claim_objects:
            statement_url = claim_object['statement_url']

            metadata = scraped_by_url.get(statement_url)
            if metadata is None:
                # Get page
                page = get_page(statement_url)
                sleep(1)  # pause between consecutive downloads

                if page is None:
                    # Not cached, so the URL is retried if it shows up again.
                    print(f"No page found for {statement_url}")
                    continue

                soup = BeautifulSoup(page, 'html.parser')
                metadata = {
                    # Ruling / justification
                    'justification': get_ruling(soup),
                    # Links cited by the fact-checkers
                    'fact_checking_sources': get_sources(soup),
                    # Where the claim was made
                    'claim_source': get_statement_source(soup),
                }
                scraped_by_url[statement_url] = metadata

            enriched_claims.append({
                **claim_object,
                'justification': metadata['justification'],
                # Copy so claims sharing a URL do not share one list object.
                'fact_checking_sources': list(metadata['fact_checking_sources']),
                'claim_source': metadata['claim_source'],
            })

with open('data/claims_by_issue_with_metadata.json', 'w', encoding='utf-8') as f:
    json.dump(claims_by_issue_clone, f, indent=4)

Processing abortion


100%|██████████| 6/6 [02:41<00:00, 26.95s/it]


Processing animals


100%|██████████| 6/6 [00:44<00:00,  7.43s/it]


Processing border-security


100%|██████████| 6/6 [01:38<00:00, 16.34s/it]


Processing climate-change


100%|██████████| 6/6 [02:47<00:00, 27.92s/it]


Processing coronavirus


100%|██████████| 6/6 [07:52<00:00, 78.75s/it] 


Processing crime


100%|██████████| 6/6 [04:02<00:00, 40.40s/it]


Processing corporations


100%|██████████| 6/6 [00:37<00:00,  6.30s/it]


Processing children


100%|██████████| 6/6 [02:30<00:00, 25.04s/it]


Processing drugs


100%|██████████| 6/6 [01:21<00:00, 13.56s/it]


Processing economy


 50%|█████     | 3/6 [02:07<02:24, 48.12s/it]Attempt 1 with trafilatura failed for https://www.politifact.com/factchecks/2022/apr/13/rebecca-kleefisch/kleefisch-didnt-have-power-cut-taxes-her-own-lieut/: 
Attempt 2 with trafilatura failed for https://www.politifact.com/factchecks/2022/apr/13/rebecca-kleefisch/kleefisch-didnt-have-power-cut-taxes-her-own-lieut/: 
Attempt 3 with trafilatura failed for https://www.politifact.com/factchecks/2022/apr/13/rebecca-kleefisch/kleefisch-didnt-have-power-cut-taxes-her-own-lieut/: 


No page found for https://www.politifact.com/factchecks/2022/apr/13/rebecca-kleefisch/kleefisch-didnt-have-power-cut-taxes-her-own-lieut/


100%|██████████| 6/6 [05:56<00:00, 59.35s/it]


Processing education


100%|██████████| 6/6 [03:06<00:00, 31.10s/it]


Processing energy


100%|██████████| 6/6 [02:00<00:00, 20.12s/it]


Processing environment


100%|██████████| 6/6 [02:09<00:00, 21.59s/it]


Processing ethics


100%|██████████| 6/6 [00:05<00:00,  1.07it/s]


Processing food


100%|██████████| 6/6 [01:37<00:00, 16.24s/it]


Processing guns


100%|██████████| 6/6 [02:34<00:00, 25.74s/it]


Processing health-care


100%|██████████| 6/6 [03:38<00:00, 36.36s/it]


Processing housing


100%|██████████| 6/6 [00:24<00:00,  4.11s/it]


Processing human-rights


100%|██████████| 6/6 [00:10<00:00,  1.68s/it]


Processing history


100%|██████████| 6/6 [01:02<00:00, 10.45s/it]


Processing military


100%|██████████| 6/6 [02:45<00:00, 27.61s/it]


Processing natural-disasters


100%|██████████| 6/6 [01:07<00:00, 11.31s/it]


Processing welfare


100%|██████████| 6/6 [00:07<00:00,  1.31s/it]


Processing weather


100%|██████████| 6/6 [01:31<00:00, 15.23s/it]


Processing taxes


 50%|█████     | 3/6 [00:54<01:00, 20.06s/it]Attempt 1 with trafilatura failed for https://www.politifact.com/factchecks/2022/apr/13/rebecca-kleefisch/kleefisch-didnt-have-power-cut-taxes-her-own-lieut/: 
Attempt 2 with trafilatura failed for https://www.politifact.com/factchecks/2022/apr/13/rebecca-kleefisch/kleefisch-didnt-have-power-cut-taxes-her-own-lieut/: 
Attempt 3 with trafilatura failed for https://www.politifact.com/factchecks/2022/apr/13/rebecca-kleefisch/kleefisch-didnt-have-power-cut-taxes-her-own-lieut/: 


No page found for https://www.politifact.com/factchecks/2022/apr/13/rebecca-kleefisch/kleefisch-didnt-have-power-cut-taxes-her-own-lieut/


100%|██████████| 6/6 [04:06<00:00, 41.13s/it]


Processing technology


100%|██████████| 6/6 [01:59<00:00, 19.84s/it]


Processing sports


100%|██████████| 6/6 [02:32<00:00, 25.48s/it]


Processing science


100%|██████████| 6/6 [01:49<00:00, 18.21s/it]


Processing religion


100%|██████████| 6/6 [01:11<00:00, 11.89s/it]


Processing lgbtq


100%|██████████| 6/6 [02:09<00:00, 21.56s/it]


In [4]:
import pandas as pd

# Flatten the nested issue -> label -> claims mapping into one record per
# claim, then write the resulting table to CSV.
rows = [
    {
        'claim': claim['statement_text'],
        'claim_factcheck_url': claim['statement_url'],
        'claim_author': claim['author_name'],
        'claim_source': claim['claim_source'],
        'claim_date': claim['date_of_statement'],
        'fact_check_date': claim['fact_check_date'],
        'justification': claim['justification'],
        'fact_checking_sources': claim['fact_checking_sources'],
        'issue': issue_name,
        'label': label,
    }
    for issue_name, claims_by_label in claims_by_issue_clone.items()
    for label, claims in claims_by_label.items()
    for claim in claims
]

df = pd.DataFrame(rows)
df.to_csv('data/fnd_politifact_claims.csv', index=False, encoding='utf-8')
print(len(df))