In [3]:
!python3 -m pip install dataclasses-json
!python3 -m pip install requests
!python3 -m pip install beautifulsoup4
!python3 -m pip install html5lib
!python3 -m pip install pandas

You should consider upgrading via the '/Users/ancasarb/.pyenv/versions/3.9.13/bin/python3 -m pip install --upgrade pip' command.[0m[33m
[0mCollecting requests
  Downloading requests-2.31.0-py3-none-any.whl (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.6/62.6 KB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hCollecting certifi>=2017.4.17
  Downloading certifi-2023.5.7-py3-none-any.whl (156 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m157.0/157.0 KB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hCollecting urllib3<3,>=1.21.1
  Downloading urllib3-2.0.3-py3-none-any.whl (123 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m123.6/123.6 KB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting idna<4,>=2.5
  Using cached idna-3.4-py3-none-any.whl (61 kB)
Collecting charset-normalizer<4,>=2
  Downloading charset_normalizer-3.1.0-cp39-cp39-macosx_11_0_arm64.whl (122 kB)
[2K     [90m━━━━━━━━━━━

In [4]:
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import List, Optional

from bs4 import BeautifulSoup
from dataclasses_json import dataclass_json
from pandas import DataFrame
from requests import get

In [5]:
@dataclass_json
@dataclass
class ArticleURL:
    """Location of one fact-check article discovered on an archive listing page."""

    page_number: int  # archive page the link was found on (numbering starts at 1)
    url: str  # absolute article URL ("https://www.reuters.com" + the link's href)

In [6]:
@dataclass_json
@dataclass
class ArticleMetadata:
    """Metadata scraped from a single fact-check article page."""

    page: int  # archive page number the article was listed on
    url: str  # article URL the metadata was fetched from
    creation_date: datetime  # parsed from the "analyticsAttributes.articleDate" meta tag
    author: str  # content of the "analyticsAttributes.author" meta tag
    title: str  # content of the "analyticsAttributes.title" meta tag
    keywords: str  # content of the "keywords" meta tag
    verdict: str  # verdict text up to (not including) its first "."

In [7]:
def _is_fact_check_href(value: Optional[str]) -> bool:
    """Return True when an ``href`` value looks like a link to a fact-check article."""
    if not value:
        # Anchors without an href are passed in as None.
        return False
    return value.startswith(
        ("/article/factcheck", "/article/fact-check", "/article/id")
    ) or ("/fact-check" in value and value != "/fact-check")


def retrieve_article_urls(timeout: float = 30.0) -> List[ArticleURL]:
    """Walk the Reuters fact-check archive and collect every article link.

    Archive pages are requested one after another, starting at page 1, until a
    page contributes no link that has not been seen before (which includes the
    case of a page with no matching links at all).

    Args:
        timeout: Seconds to wait for each HTTP request before giving up, so a
            stalled connection cannot hang the scrape indefinitely.

    Returns:
        One ``ArticleURL`` per link found, tagged with the archive page number
        it was found on, in page order.

    Raises:
        requests.exceptions.RequestException: If a request fails or times out.
    """
    article_urls: List[ArticleURL] = []
    seen_hrefs = set()
    page_number = 0

    while True:
        page_number += 1
        response = get(
            f"https://www.reuters.com/news/archive/factchecknew?view=page&page={page_number}&pageSize=10",
            timeout=timeout,
        )
        soup = BeautifulSoup(response.content, "html5lib")

        anchors = soup.find_all("a", href=_is_fact_check_href)

        # dict.fromkeys drops duplicate links within a page while keeping order.
        hrefs = list(dict.fromkeys(anchor["href"] for anchor in anchors))

        # Stop on an empty page, and also when every link was already collected
        # from an earlier page: otherwise a site that keeps serving the same
        # links for out-of-range page numbers would make this loop endless.
        if not any(href not in seen_hrefs for href in hrefs):
            break

        seen_hrefs.update(hrefs)
        article_urls.extend(
            ArticleURL(page_number, f"https://www.reuters.com{href}") for href in hrefs
        )
    return article_urls

In [8]:
def _extract_verdict(soup: BeautifulSoup) -> Optional[str]:
    """Return the article's verdict text up to its first ".", or None if absent."""

    def starts_with(prefix: str):
        # The matcher receives None for tags without a single string child.
        return lambda value: value and value.startswith(prefix)

    # Layout 1: an <h2> "VERDICT..." heading followed by the verdict paragraph.
    heading = soup.find("h2", string=starts_with("VERDICT"))
    if heading is not None:
        paragraph = heading.find_next("p")
        return None if paragraph is None else paragraph.get_text().split(".")[0]

    # Layout 2: a single <p> that starts with "VERDICT" and holds the verdict.
    inline = soup.find("p", string=starts_with("VERDICT"))
    if inline is not None:
        return inline.get_text().replace("VERDICT", "").split(".")[0]

    # Layout 3: a <p> "Verdict..." label followed by the verdict paragraph.
    label = soup.find("p", string=starts_with("Verdict"))
    if label is not None:
        paragraph = label.find_next("p")
        return None if paragraph is None else paragraph.get_text().split(".")[0]

    return None


def retrieve_article_metadata(
    article: ArticleURL, year: int = 2022, timeout: float = 30.0
) -> Optional[ArticleMetadata]:
    """Download one article and extract its metadata and verdict.

    Args:
        article: The article to fetch, as produced by ``retrieve_article_urls``.
        year: Only articles whose publication date falls in this year are kept.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The article's ``ArticleMetadata``, or ``None`` when the article is from
        a different year, has no readable publication date, or has no verdict
        (a message is printed for the last two cases).

    Raises:
        requests.exceptions.RequestException: If the request fails or times out.
    """
    response = get(article.url, timeout=timeout)
    soup = BeautifulSoup(response.content, "html5lib")

    try:
        article_date = find_metadata_tag(soup, "analyticsAttributes.articleDate")
        date_time_obj = datetime.strptime(article_date, "%Y-%m-%dT%H:%M:%SZ")
    except (TypeError, KeyError, ValueError):
        # TypeError/KeyError: the meta tag or its content attribute is missing;
        # ValueError: the date is not in the expected format. Skip the article
        # instead of aborting the whole scrape.
        print(
            "Ignoring article "
            + article.url
            + " as its publication date could not be read"
        )
        return None

    if date_time_obj.year != year:
        return None

    keywords = find_metadata_tag(soup, "keywords")
    author = find_metadata_tag(soup, "analyticsAttributes.author")
    title = find_metadata_tag(soup, "analyticsAttributes.title")

    verdict_type = _extract_verdict(soup)
    if verdict_type is None:
        print("Ignoring article " + article.url + " as it doesn't have a verdict")
        return None

    return ArticleMetadata(
        page=article.page_number,
        url=article.url,
        creation_date=date_time_obj,
        title=title,
        author=author,
        keywords=keywords,
        verdict=verdict_type,
    )

In [9]:
def find_metadata_tag(element: BeautifulSoup, key: str) -> str:
    """Return the ``content`` attribute of the ``<meta>`` tag whose ``name`` is ``key``.

    The result of ``element.find`` is subscripted directly, so a ``None``
    result (no matching tag) raises ``TypeError``.
    """
    name_filter = {"name": key}
    return element.find("meta", attrs=name_filter)["content"]

In [None]:
# Make sure the output directory exists *before* the slow scrape starts, so a
# missing folder cannot discard the results at the final write.
output_path = Path("../data/fact_check.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)

# Collect every article link from the archive listing pages.
article_urls = retrieve_article_urls()

# Fetch each article; retrieve_article_metadata returns None for the ones to skip.
articles_metadata = []
for article_url in article_urls:
    metadata = retrieve_article_metadata(article_url)
    if metadata is not None:
        articles_metadata.append(metadata)

# One row per kept article, one column per ArticleMetadata field.
df = DataFrame(articles_metadata)
df.to_csv(output_path, sep=",", index=False)
