In [7]:
import requests
from bs4 import BeautifulSoup

def scrape_nyt_article(url):
    try:
        # Headers to simulate a browser visit
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"
        }
        
        # Send a GET request
        response = requests.get(url, headers=headers)
        response.raise_for_status()  # Raise an error for HTTP error codes
        
        # Parse the HTML content
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Extract the headline
        headline = soup.find('h1').get_text(strip=True) if soup.find('h1') else "No headline found"
        
        # Extract the main content
        paragraphs = soup.find_all('p')
        content = "\n".join([p.get_text(strip=True) for p in paragraphs])
        
        return {
            "headline": headline,
            "content": content
        }
    except Exception as e:
        return {"error": str(e)}

# Example usage
if __name__ == "__main__":
    article_url = "https://www.nytimes.com/2024/12/02/us/politics/biden-pardon-criticized-democrats.html"
    result = scrape_nyt_article(article_url)
    print(result['content'])


HunterBiden Pardon
Advertisement
Supported by
News Analysis
President Biden is facing criticism for absolving his son after insisting he would not and, according to some critics in his own party, paving the way for Donald Trump’s return to office.
ByPeter Baker
Reporting from Luanda, Angola, where he is traveling with President Biden this week
There was a time, not that long ago, when President Biden imagined he would etch his place in history as the leader who ended the chaotic reign of Donald J. Trump, passed a raft of “Build Back Better” laws to transform the country and reestablished America’s place in the world.
Now, in the desultory final days of his administration, Mr. Biden finds himself repudiated, even by some of his fellow Democrats, as the president who refused to step aside until it was too late, paved the way for Mr. Trump’s return to power and, in a final gesture of personal grievance over stated principle,pardoned his own sonfor multiple felony convictions.
The disappoi

In [8]:
import requests
from bs4 import BeautifulSoup

def scrape_reuters_article(url):
    try:
        # Headers to simulate a browser visit
        headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
    "Referer": "https://www.reuters.com/",
    "Accept-Language": "en-US,en;q=0.9",
}
        
        # Send a GET request
        response = requests.get(url, headers=headers)
        response.raise_for_status()  # Raise an error for HTTP error codes
        
        # Parse the HTML content
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Extract the headline
        headline = soup.find('h1').get_text(strip=True) if soup.find('h1') else "No headline found"
        
        # Extract the main content
        article_body = soup.find('div', class_='article-body__content')
        if article_body:
            paragraphs = article_body.find_all('p')
            content = "\n".join([p.get_text(strip=True) for p in paragraphs])
        else:
            content = "No content found"
        
        return {
            "headline": headline,
            "content": content
        }
    except Exception as e:
        return {"error": str(e)}

# Example usage
if __name__ == "__main__":
    article_url = "https://www.reuters.com/world/middle-east/iraqi-militias-enter-syria-reinforce-government-forces-military-sources-say-2024-12-02/"
    result = scrape_reuters_article(article_url)
    print(result)


{'error': '401 Client Error: HTTP Forbidden for url: https://www.reuters.com/world/middle-east/iraqi-militias-enter-syria-reinforce-government-forces-military-sources-say-2024-12-02/'}


In [11]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options

def scrape_reuters_with_selenium(url):
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    
    driver = webdriver.Chrome(options=options)
    driver.get(url)

    try:
        headline = driver.find_element(By.TAG_NAME, "h1").text
        paragraphs = driver.find_elements(By.TAG_NAME, "p")
        content = "\n".join([p.text for p in paragraphs])
    finally:
        driver.quit()

    return {"headline": headline, "content": content}

# Example usage
url = "https://www.reuters.com/world/middle-east/iraqi-militias-enter-syria-reinforce-government-forces-military-sources-say-2024-12-02/"
print(scrape_reuters_with_selenium(url))

NoSuchElementException: Message: no such element: Unable to locate element: {"method":"tag name","selector":"h1"}
  (Session info: chrome=131.0.6778.86); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x00007FF600EB6CB5+28821]
	(No symbol) [0x00007FF600E23840]
	(No symbol) [0x00007FF600CC578A]
	(No symbol) [0x00007FF600D191BE]
	(No symbol) [0x00007FF600D194AC]
	(No symbol) [0x00007FF600D62647]
	(No symbol) [0x00007FF600D3F33F]
	(No symbol) [0x00007FF600D5F412]
	(No symbol) [0x00007FF600D3F0A3]
	(No symbol) [0x00007FF600D0A778]
	(No symbol) [0x00007FF600D0B8E1]
	GetHandleVerifier [0x00007FF6011EFCAD+3408013]
	GetHandleVerifier [0x00007FF60120741F+3504127]
	GetHandleVerifier [0x00007FF6011FB5FD+3455453]
	GetHandleVerifier [0x00007FF600F7BDBB+835995]
	(No symbol) [0x00007FF600E2EB5F]
	(No symbol) [0x00007FF600E2A814]
	(No symbol) [0x00007FF600E2A9AD]
	(No symbol) [0x00007FF600E1A199]
	BaseThreadInitThunk [0x00007FF83CED259D+29]
	RtlUserThreadStart [0x00007FF83E4AAF38+40]
