## Scrape a website's title and a leading snippet of its text

In [1]:
import requests
from bs4 import BeautifulSoup

In [3]:


def scrape_website(url, max_chars=250):
    """
    Fetch a web page and return its title plus a short text snippet.

    Parameters:
        url (str): The URL of the website to scrape.
        max_chars (int): Maximum length of the returned snippet, in
            characters. Expected to be non-negative. Defaults to 250.

    Returns:
        dict: Contains 'title' (the <title> text, or "No title found" when
              the page has no <title> tag) and 'snippet' (the first
              `max_chars` characters of the page text) on success;
              or None if an error occurs (the reason is printed).
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Raises an HTTPError if the status is 4xx, 5xx

        # Parse the raw bytes rather than response.text: when the server
        # sends no charset, Requests falls back to ISO-8859-1 for text/*
        # responses, which garbles UTF-8 pages. Only trust the header
        # encoding when a charset was actually declared; otherwise let
        # BeautifulSoup detect it from the document itself.
        content_type = response.headers.get("Content-Type", "")
        declared_encoding = (
            response.encoding if "charset" in content_type.lower() else None
        )
        soup = BeautifulSoup(
            response.content, "html.parser", from_encoding=declared_encoding
        )

        # Get <title>
        title_tag = soup.find('title')
        if title_tag:
            title = title_tag.get_text().strip()
        else:
            title = "No title found"

        # Drop script/style elements so their source never leaks into the
        # snippet (whether get_text() skips them depends on the bs4 version).
        for hidden_tag in soup(["script", "style"]):
            hidden_tag.decompose()

        # Using separator=' ' to ensure words are separated when joined
        full_text = soup.get_text(separator=' ', strip=True)

        return {
            "title": title,
            "snippet": full_text[:max_chars]
        }

    except Exception as e:
        # Deliberately broad: callers rely on "None on any failure"
        # (network error, bad URL, HTTP error status, parse problem).
        print(f"Failed to scrape {url}. Reason: {e}")
        return None

if __name__ == "__main__":
    # Interactive demo: ask for a URL, scrape it, and report the outcome.
    # test_url is only a sample address; the URL that is scraped is the one
    # typed at the prompt.
    test_url = "https://www.nytimes.com"
    url = input("Enter URL:")
    result = scrape_website(url)
    if not result:
        print("Scraping failed.")
    else:
        print(f"Title: {result['title']}")
        print(f"Snippet: {result['snippet']}")


Title: America’s border crisis, in charts | Vox
Snippet: America’s border crisis, in charts | Vox Skip to main content The homepage Vox Vox logo Explainers Politics Culture Advice Listen Audio Watch Video Menu The homepage Vox Vox logo Navigation Drawer close Close Search Video Watch Audio Listen Crossword


In [8]:
import requests
from bs4 import BeautifulSoup

def scrape_website(url, max_chars=1000):
    """
    Fetch a web page and return its title, meta description and a text snippet.

    Parameters:
        url (str): The URL of the website to scrape.
        max_chars (int): Maximum length of the returned snippet, in
            characters. Expected to be non-negative. Defaults to 1000.

    Returns:
        dict: Contains 'title' (the <title> text, or "No title found" when
              the page has no <title> tag), 'description' (the content of
              <meta name="description">, or "No description found") and
              'snippet' (the first `max_chars` characters of the page
              text) on success; or None if an error occurs (the reason is
              printed).
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Raises an HTTPError if the status is 4xx/5xx

        # Parse the raw bytes rather than response.text: when the server
        # sends no charset, Requests falls back to ISO-8859-1 for text/*
        # responses, which garbles UTF-8 pages. Only trust the header
        # encoding when a charset was actually declared; otherwise let
        # BeautifulSoup detect it from the document itself.
        content_type = response.headers.get("Content-Type", "")
        declared_encoding = (
            response.encoding if "charset" in content_type.lower() else None
        )
        soup = BeautifulSoup(
            response.content, "html.parser", from_encoding=declared_encoding
        )

        # Get <title>
        title_tag = soup.find('title')
        if title_tag:
            title = title_tag.get_text().strip()
        else:
            title = "No title found"

        # Get <meta name="description" ... >
        description_tag = soup.find('meta', attrs={'name': 'description'})
        if description_tag and 'content' in description_tag.attrs:
            # `or ""` guards against a valueless content attribute.
            description = (description_tag['content'] or "").strip()
        else:
            description = "No description found"

        # Drop script/style elements so their source never leaks into the
        # snippet (whether get_text() skips them depends on the bs4 version).
        for hidden_tag in soup(["script", "style"]):
            hidden_tag.decompose()

        # Using separator=' ' to ensure words are separated when joined
        full_text = soup.get_text(separator=' ', strip=True)

        return {
            "title": title,
            "description": description,
            "snippet": full_text[:max_chars]
        }

    except Exception as e:
        # Deliberately broad: callers rely on "None on any failure"
        # (network error, bad URL, HTTP error status, parse problem).
        print(f"Failed to scrape {url}. Reason: {e}")
        return None

if __name__ == "__main__":
    # Interactive demo: ask for a URL, scrape it, and report the outcome.
    url = input("Enter URL:")
    result = scrape_website(url)
    if not result:
        print("Scraping failed.")
    else:
        print(f"Title: {result['title']}")
        print(f"Description: {result['description']}")
        print(f"Snippet: {result['snippet']}")
        


Title: RFK Jr. hearing live updates: 2nd day of questions expected on vaccines, abortion - ABC News
Description: He's appearing before the Senate Health, Education, Labor and Pension Committee.
Snippet: RFK Jr. hearing live updates: 2nd day of questions expected on vaccines, abortion - ABC News ABC News Video Live Shows 538 Shop Log In Stream on Live RFK Jr. hearing live updates: 2nd day of questions expected on vaccines, abortion He's appearing before the Senate Health, Education, Labor and Pension Committee. 4:29 Robert F. Kennedy Jr., U.S. President Trump's nominee to be Secretary of Health and Human Services, testifies before a Senate Finance Committee confirmation hearing on Capitol Hill in Washington, Jan. 29, 2025. Nathan Howard/Reuters By Alexandra Hutzler , Ivan Pereira , and Mary Kekatos Last Updated: January 29, 2025, 3:56 PM EST President Donald Trump has promised he'd let Robert F. Kennedy Jr. "go wild" on health, food and medicine as head of the Department of Health and H