In [2]:
import requests
from bs4 import BeautifulSoup

# Fetch the CNN homepage and print the text of every headline link found.
url = 'https://www.cnn.com/'
response = None
try:
    # A timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get(url, timeout=10)
except requests.RequestException as exc:
    # DNS failures, refused connections and timeouts land here instead of
    # aborting the cell with a traceback.
    print("Request error:", exc)

if response is not None and response.status_code == 200:
    soup = BeautifulSoup(response.text, 'html.parser')
    # Selector for the anchors inside headline <h3> elements.
    # NOTE(review): site markup changes over time; an empty result most
    # likely means this selector is stale rather than that there is no news.
    headlines = soup.select('h3.cd__headline a')
    if not headlines:
        print("No headlines found.")
    # Each match is a BeautifulSoup Tag; print its visible text.
    for h in headlines:
        print(h.get_text(strip=True))
else:
    print("Failed to retrieve the page.")


In [4]:
from selenium import webdriver
from selenium.webdriver.common.by import By

# Open CNN in a real Chrome session and print the text of each headline link.
driver = webdriver.Chrome()
try:
    driver.get('https://www.cnn.com/')

    # No explicit wait is performed here. If content is injected after the
    # initial page load and this comes back empty, add a WebDriverWait on the
    # selector before reading the elements. TODO(review): confirm whether
    # that is needed for this page.
    elements = driver.find_elements(By.CSS_SELECTOR, 'h3.cd__headline a')
    for el in elements:
        print(el.text)
finally:
    # Always shut the browser down, even when navigation or lookup raises,
    # so no orphaned Chrome/chromedriver process is left behind.
    driver.quit()


In [5]:
import requests
from bs4 import BeautifulSoup
import urllib.parse

def scrape_cnn_top_news():
    """Print the first CNN homepage headline, its URL and the article text.

    Steps:
      1. Download the CNN homepage.
      2. Take the first anchor matching ``h3.cd__headline a``.
      3. Resolve its ``href`` against the homepage URL.
      4. Download that article and print the text of its paragraphs.

    Every failure (network error, non-200 status, no matching headline,
    headline without a link) is reported with ``print`` and ends the function
    early.

    Returns:
        None. All results are written to stdout.
    """
    url = 'https://www.cnn.com/'
    request_timeout = 10  # seconds; without it requests may block indefinitely

    try:
        response = requests.get(url, timeout=request_timeout)
    except requests.RequestException:
        print("Failed to retrieve CNN homepage.")
        return

    if response.status_code != 200:
        print("Failed to retrieve CNN homepage.")
        return

    soup = BeautifulSoup(response.text, 'html.parser')

    # 1. Select the headline anchors.
    #    NOTE(review): this selector depends on the site's current markup and
    #    may need updating when it yields no matches.
    headlines = soup.select('h3.cd__headline a')

    if not headlines:
        print("No headlines found.")
        return

    # 2. For demonstration, only the first matching headline is used.
    top_headline_element = headlines[0]

    # 3. Extract the headline text and link.
    headline_text = top_headline_element.get_text(strip=True)
    headline_link = top_headline_element.get('href')

    # An <a> without an href yields None; there is nothing to follow then.
    if not headline_link:
        print("Top headline has no link.")
        return

    # urljoin leaves absolute URLs untouched and resolves every relative form
    # ("/path", "path", "//host/path") against the homepage URL, so it is safe
    # to apply unconditionally.
    headline_link = urllib.parse.urljoin(url, headline_link)

    print("Top Headline:", headline_text)
    print("URL:", headline_link)

    # 4. Request the article page to get the full content.
    try:
        article_response = requests.get(headline_link, timeout=request_timeout)
    except requests.RequestException:
        print("Failed to retrieve the article content.")
        return

    if article_response.status_code != 200:
        print("Failed to retrieve the article content.")
        return

    article_soup = BeautifulSoup(article_response.text, 'html.parser')

    # 5. Collect the article paragraphs. Two candidate containers are tried in
    #    one CSS selector; update it if the output comes back empty.
    article_paragraphs = article_soup.select('section.zn-body-text p, div.l-container p')

    # 6. Join all paragraph texts into one newline-separated string.
    article_text = "\n".join(p.get_text(strip=True) for p in article_paragraphs)

    # 7. Print the collected article text.
    print("Article Content:\n", article_text)

# Entry point: run the scraper when this cell/script executes as __main__
# (the recorded cell output shows the guard passes in this notebook).
if __name__ == "__main__":
    scrape_cnn_top_news()


No headlines found.


In [6]:
import requests

def get_top_headlines(api_key, country='us', timeout=10):
    """Fetch top headlines from NewsAPI for the specified country.

    Args:
        api_key: NewsAPI key, sent as the ``apiKey`` query parameter.
        country: Country code sent as the ``country`` query parameter.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The list stored under ``"articles"`` in the JSON response. Per the
        original description each article is a dict with keys such as
        'source', 'author', 'title', 'description', 'url', 'content'.
        An empty list is returned (after printing a message) when the request
        fails, the status is not 200, the body is not valid JSON, or the
        response carries no articles.
    """
    url = "https://newsapi.org/v2/top-headlines"
    params = {
        "country": country,
        "apiKey": api_key
    }

    try:
        # Without a timeout a stalled connection would block forever.
        response = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException as exc:
        print("Error fetching news:", exc)
        return []

    # Check for errors
    if response.status_code != 200:
        print("Error fetching news:", response.text)
        return []

    try:
        data = response.json()
    except ValueError as exc:
        # A 200 response with a non-JSON body (e.g. a proxy error page).
        print("Error fetching news:", exc)
        return []

    if not isinstance(data, dict):
        return []

    # `or []` also covers "articles": null, which dict.get's default does not.
    return data.get("articles") or []

def main():
    """Fetch US top headlines from NewsAPI and print each title and content.

    The API key is read from the ``NEWSAPI_KEY`` environment variable so that
    no credential is stored in the notebook. If the variable is unset or
    empty, a hint is printed and nothing is fetched.

    Returns:
        None. Output goes to stdout.
    """
    import os  # local import: only this function needs it

    # SECURITY: never hard-code the key in the notebook. Any key that was
    # previously committed here should be treated as leaked and revoked.
    api_key = os.environ.get("NEWSAPI_KEY")
    if not api_key:
        print("Set the NEWSAPI_KEY environment variable to your NewsAPI key.")
        return

    # Grab top headlines (breaking news) for the US
    articles = get_top_headlines(api_key, country='us')

    if not articles:
        print("No articles returned.")
        return

    # Print the title and content for each article
    for idx, article in enumerate(articles, start=1):
        # `or` also covers keys that are present but null, which the
        # dict.get default alone would print as "None".
        title = article.get("title") or "No title"
        content = article.get("content") or "No content"

        print(f"Article #{idx}")
        print("Title:", title)
        print("Content:", content)
        print("-" * 50)

# Entry point: run main() when this cell/script executes as __main__
# (the recorded cell output shows the guard passes in this notebook).
if __name__ == "__main__":
    main()


Article #1
Title: [Removed]
Content: [Removed]
--------------------------------------------------
Article #2
Title: NBA coaches react with dismay over firing of 2-time coach of the year Mike Brown - The Associated Press
Content: ORLANDO, Fla. (AP) Not even two years ago, Rick Carlisle publicly lauded Mike Brown for the job he did on the way to winning the NBAs coach of the year award.
And on Friday, Carlisle was among a sle… [+4032 chars]
--------------------------------------------------
Article #3
Title: Scientists Just Proved Your Brain Operates at the Speed of a Snail—and It’s Shockingly Low - The Daily Galaxy --Great Discoveries Channel
Content: Recent research by Caltech has uncovered a striking limitation in human cognition: the brain processes conscious thought at a mere 10 bits per second. In contrast, our sensory systems collect a billi… [+4228 chars]
--------------------------------------------------
Article #4
Title: Greg Gumbel, Longtime CBS Sports Studio Host and Play-by-

In [None]:
def filter_top_5_valid_articles(articles, limit=5):
    """Return the first ``limit`` articles that still have usable content.

    An article is skipped when its title is "[Removed]", or when its content
    is missing/None, the string "None", or "[Removed]".

    Args:
        articles: Iterable of article dicts; ``None`` is treated as empty.
        limit: Maximum number of articles to return (default 5, matching the
            function name). Values <= 0 yield an empty list.

    Returns:
        A new list with at most ``limit`` articles, in their original order.
    """
    valid_articles = []
    if limit <= 0:
        return valid_articles

    for article in articles or []:
        if article.get("title") == "[Removed]":
            continue

        content = article.get("content")
        if content is None or content in ("[Removed]", "None"):
            continue

        valid_articles.append(article)

        # Stop scanning as soon as enough valid articles were collected.
        if len(valid_articles) >= limit:
            break

    return valid_articles


def main():
    """Fetch US top headlines and print the first five valid ones.

    The API key is read from the ``NEWSAPI_KEY`` environment variable so that
    no credential is stored in the notebook. If the variable is unset or
    empty, a hint is printed and nothing is fetched. Articles are fetched
    with ``get_top_headlines`` and then narrowed by
    ``filter_top_5_valid_articles``.

    Returns:
        None. Output goes to stdout.
    """
    import os  # local import: only this function needs it

    # SECURITY: never hard-code the key in the notebook. Any key that was
    # previously committed here should be treated as leaked and revoked.
    api_key = os.environ.get("NEWSAPI_KEY")
    if not api_key:
        print("Set the NEWSAPI_KEY environment variable to your NewsAPI key.")
        return

    # Grab top headlines (breaking news) for the US
    articles = get_top_headlines(api_key, country='us')

    if not articles:
        print("No articles returned.")
        return

    # Keep only the first 5 articles that pass the validity filter
    top_5_valid = filter_top_5_valid_articles(articles)

    # Print the title and content for each
    for idx, article in enumerate(top_5_valid, start=1):
        # `or` also covers keys that are present but null, which the
        # dict.get default alone would print as "None".
        title = article.get("title") or "No title"
        content = article.get("content") or "No content"

        print(f"Article #{idx}")
        print("Title:", title)
        print("Content:", content)
        print("-" * 50)

# Entry point: run main() when this cell/script executes as __main__
# (the recorded cell output shows the guard passes in this notebook).
if __name__ == "__main__":
    main()


Article #1
Title: NBA coaches react with dismay over firing of 2-time coach of the year Mike Brown - The Associated Press
Content: ORLANDO, Fla. (AP) Not even two years ago, Rick Carlisle publicly lauded Mike Brown for the job he did on the way to winning the NBAs coach of the year award.
And on Friday, Carlisle was among a sle… [+4032 chars]
--------------------------------------------------
Article #2
Title: Scientists Just Proved Your Brain Operates at the Speed of a Snail—and It’s Shockingly Low - The Daily Galaxy --Great Discoveries Channel
Content: Recent research by Caltech has uncovered a striking limitation in human cognition: the brain processes conscious thought at a mere 10 bits per second. In contrast, our sensory systems collect a billi… [+4228 chars]
--------------------------------------------------
Article #3
Title: Greg Gumbel, Longtime CBS Sports Studio Host and Play-by-Play Man, Dies at 78 - Hollywood Reporter
Content: Greg Gumbel, the sure-handed CBS sportscaster w

: 