In [8]:
import os
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from urllib.parse import urljoin

def _build_headless_driver():
    """Create the headless Chrome driver used by the scraper."""
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")
    # NOTE(review): presumably limits Chrome's own logging to fatal messages -- confirm.
    chrome_options.add_argument("log-level=3")
    return webdriver.Chrome(options=chrome_options)


def _collect_article_urls(driver, list_page_url, base_url):
    """Load one list page and return the article URLs linked from its statement quotes.

    Raises:
        TimeoutException: if no quote link is present within 20 seconds.
    """
    driver.get(list_page_url)
    WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "div.m-statement__quote a"))
    )
    article_urls = []
    for link in driver.find_elements(By.CSS_SELECTOR, "div.m-statement__quote a"):
        # Read the attribute once: each get_attribute call is a separate
        # request to the browser, so asking twice doubles the traffic.
        href = link.get_attribute("href")
        if href:
            article_urls.append(urljoin(base_url, href))
    return article_urls


def _article_has_video(driver, article_url):
    """Load an article and report whether it has an iframe inside a div.artembed."""
    driver.get(article_url)
    return bool(driver.find_elements(By.CSS_SELECTOR, "div.artembed iframe"))


def scrape_articles_with_videos(start_page, end_page):
    """
    Scrapes Politifact's 'true'-ruling fact-check lists for articles that embed a video.

    Each list page in the inclusive range is loaded in headless Chrome, every
    linked article is opened, and an article counts as having a video when it
    contains an iframe inside a ``div.artembed`` element. Progress is printed
    as the scrape runs, and the browser is always closed before returning.

    Args:
        start_page (int): The page number to start scraping from.
        end_page (int): The page number to end scraping on (inclusive).

    Returns:
        list[str]: URLs of the articles in which a video was found, in the
        order they were visited.

    Notes:
        - A list page that times out ends the whole scrape (it is treated as
          the probable end of the results); any other list-page error skips
          to the next page.
        - An error on a single article is printed and that article is skipped.
        - Errors raised while starting Chrome are not caught here.
    """
    driver = _build_headless_driver()

    base_url = "https://www.politifact.com"
    articles_with_videos = []

    try:
        for page_num in range(start_page, end_page + 1):
            list_page_url = f"{base_url}/factchecks/list/?page={page_num}&ruling=true"
            print(f"\n--- Scraping list page {page_num} of {end_page} ---")
            print(f"URL: {list_page_url}")

            try:
                article_urls_on_page = _collect_article_urls(driver, list_page_url, base_url)

                print(f"Found {len(article_urls_on_page)} articles on page {page_num}. Now checking each for videos...")

                for i, article_url in enumerate(article_urls_on_page):
                    print(f"  ({i+1}/{len(article_urls_on_page)}) Checking: {article_url}")

                    try:
                        if _article_has_video(driver, article_url):
                            print("    --> Video FOUND. Adding URL to list.")
                            articles_with_videos.append(article_url)
                        else:
                            print("    --> No video found.")

                    except Exception as e:
                        # Best effort: one broken article must not abort the page.
                        print(f"    --> Could not process article {article_url}: {e}")
                    # Short pause between article requests.
                    time.sleep(0.5)

            except TimeoutException:
                print(f"Timed out waiting for content on page {page_num}. It might be the last page or an error.")
                break  # Exit loop if a page times out (likely end of results)
            except Exception as e:
                print(f"An error occurred while processing page {page_num}: {e}")
                continue  # Continue to the next page

    finally:
        # Runs on success, error, or interruption so the browser is never leaked.
        print("\nScraping complete. Closing browser.")
        driver.quit()

    return articles_with_videos

if __name__ == '__main__':
    # Inclusive range of list pages to scrape in this run.
    START_PAGE = 29
    END_PAGE = 89

    all_urls_with_videos = scrape_articles_with_videos(START_PAGE, END_PAGE)

    # Report what was collected, or say so explicitly when nothing was found.
    print("\n--- All collected URLs with videos ---")
    if not all_urls_with_videos:
        print("No articles with videos were found in the specified page range.")
    else:
        for url in all_urls_with_videos:
            print(url)

    # Write the URLs to input.txt in the current directory, one per line,
    # replacing any existing file of that name.
    filepath = os.path.join(".", "input.txt")
    try:
        print(f"\nSaving {len(all_urls_with_videos)} URLs to {filepath}...")
        with open(filepath, "w") as f:
            f.writelines(url + "\n" for url in all_urls_with_videos)
        print(f"Successfully saved URLs to {filepath}")
    except Exception as e:
        # A failed save is reported rather than raised.
        print(f"Could not save URLs to file: {e}")


--- Scraping list page 29 of 89 ---
URL: https://www.politifact.com/factchecks/list/?page=29&ruling=true
Found 30 articles on page 29. Now checking each for videos...
  (1/30) Checking: https://www.politifact.com/factchecks/2015/oct/29/marco-rubio/when-attacked-missed-votes-marco-rubio-calls-out-b/
    --> No video found.
  (2/30) Checking: https://www.politifact.com/factchecks/2015/oct/28/peter-kinder/lt-gov-kinder-right-about-nixons-veto-overrides/
    --> No video found.
  (3/30) Checking: https://www.politifact.com/factchecks/2015/oct/23/slice-pint-brewpub/brewpub-growling-about-state-craft-beer-sales/
    --> No video found.
  (4/30) Checking: https://www.politifact.com/factchecks/2015/oct/14/gwen-moore/six-10-americans-back-federal-money-planned-parent/
    --> No video found.
  (5/30) Checking: https://www.politifact.com/factchecks/2015/oct/14/hillary-clinton/hillary-clinton-bernie-sanders-voted-against-brady/
    --> No video found.
  (6/30) Checking: https://www.politifact.co

KeyboardInterrupt: 

In [None]:
# Save the URLs held in `all_urls_with_videos` to input.txt, one per line.
# This cell relies on `os` and `all_urls_with_videos` already existing in the
# kernel from an earlier cell.
filepath = os.path.join(".", "input.txt")
try:
    print(f"\nSaving {len(all_urls_with_videos)} URLs to {filepath}...")
    with open(filepath, "w") as f:
        # Build the newline-terminated text first, then write it in one call.
        f.write("".join(url + "\n" for url in all_urls_with_videos))
    print(f"Successfully saved URLs to {filepath}")
except Exception as e:
    # A failed save is reported rather than raised.
    print(f"Could not save URLs to file: {e}")


Saving 12 URLs to ./input.txt...
Successfully saved URLs to ./input.txt
