In [None]:
!pip install newspaper3k

Collecting newspaper3k
  Downloading newspaper3k-0.2.8-py3-none-any.whl.metadata (11 kB)
Collecting cssselect>=0.9.2 (from newspaper3k)
  Downloading cssselect-1.2.0-py2.py3-none-any.whl.metadata (2.2 kB)
Collecting feedparser>=5.2.1 (from newspaper3k)
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting tldextract>=2.0.1 (from newspaper3k)
  Downloading tldextract-5.1.3-py3-none-any.whl.metadata (11 kB)
Collecting feedfinder2>=0.0.4 (from newspaper3k)
  Downloading feedfinder2-0.0.4.tar.gz (3.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting jieba3k>=0.35.1 (from newspaper3k)
  Downloading jieba3k-0.35.1.zip (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m68.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tinysegmenter==0.3 (from newspaper3k)
  Downloading tinysegmenter-0.3.tar.gz (16 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Co

In [None]:
!pip install lxml_html_clean

Collecting lxml_html_clean
  Downloading lxml_html_clean-0.4.1-py3-none-any.whl.metadata (2.4 kB)
Downloading lxml_html_clean-0.4.1-py3-none-any.whl (14 kB)
Installing collected packages: lxml_html_clean
Successfully installed lxml_html_clean-0.4.1


In [None]:
import getpass
import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from newspaper import Article


In [None]:
import requests
import json

def fetch_articles(api_key, language='en', domainurl='bbc.com', timeout=10):
    """
    Fetch latest articles from NewsData.io API

    Args:
    api_key (str): Your NewsData.io API key
    language (str): Value sent as the 'language' query parameter
    domainurl (str): Value sent as the 'domainurl' query parameter
    timeout (float): Seconds to wait for the API before giving up

    Returns:
    list: List of articles; empty if the request fails or the response
          does not contain a list under 'results'
    """
    # API endpoint for latest news
    url = "https://newsdata.io/api/1/latest"

    # Parameters for the API request
    params = {
        'apikey': api_key,
        'language': language,
        'domainurl': domainurl
    }

    try:
        # A timeout keeps the cell from hanging forever if the API stalls
        response = requests.get(url, params=params, timeout=timeout)

        # Raise an exception for bad status codes
        response.raise_for_status()

        # Parse the JSON response
        data = response.json()

    except (requests.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose
        # JSON decode error is not a RequestException subclass
        print(f"Error fetching articles: {e}")
        return []

    # Only hand back 'results' when it really is a list, so callers can
    # always iterate over the return value
    results = data.get('results') if isinstance(data, dict) else None
    if isinstance(results, list):
        return results

    print("No articles found or unexpected response format")
    return []

def print_articles(articles):
    """
    Print how many articles were fetched and return a list of their links

    Articles without a usable 'link' value are skipped, so every element of
    the returned list can be passed to a scraper as a URL.

    Args:
    articles (list): List of article dictionaries

    Returns:
    list: List of article links (empty if there are no articles)
    """
    if not articles:
        print("No articles to display")
        return []

    print(f"Total articles fetched: {len(articles)}")

    links = []
    for article in articles:
        # Skip entries that are not dicts or have a missing/empty link
        # rather than inserting a placeholder that is not a URL
        link = article.get('link') if isinstance(article, dict) else None
        if link:
            links.append(link)

    return links

# NewsData.io API key: read from the NEWSDATA_API_KEY environment variable, or
# prompt for it, so the secret is never stored in the notebook itself.
# NOTE(review): the key previously committed in this cell should be rotated.
API_KEY = os.environ.get("NEWSDATA_API_KEY") or getpass.getpass("NewsData.io API key: ")

# Fetch articles and collect their links
articles = fetch_articles(API_KEY)
links = print_articles(articles)




Total articles fetched: 10


In [None]:
# Show the collected article URLs
print(str(links))

['https://www.bbc.com/news/videos/cx2yjlvvxk4o', 'https://www.bbc.com/news/articles/c938892g294o', 'https://www.bbc.com/news/videos/cwypw11qnppo', 'https://www.bbc.com/sport/cricket/articles/c140n8j8zp3o', 'https://www.bbc.com/sport/cricket/articles/cgj62vwpll9o', 'https://www.bbc.com/news/articles/c4gw7qedn15o', 'https://www.bbc.com/sport/athletics/articles/cm2e38ny7rlo', 'https://www.bbc.com/news/articles/cgl9yk42rz7o', 'https://www.bbc.com/news/articles/c791dnjglq5o', 'https://www.bbc.com/sport/football/articles/cy8yy2krj1mo']


In [None]:
import requests
from bs4 import BeautifulSoup
from transformers import pipeline



In [None]:

def scrape_bbc_article(url, max_chars=3000, timeout=10):
    """
    Scrape text from BBC News articles

    Args:
    url (str): BBC News article URL
    max_chars (int): Maximum number of characters of article text to return
    timeout (float): Seconds to wait for the page before giving up

    Returns:
    str: Extracted article text, truncated to max_chars; empty string if
         the page could not be fetched or parsed
    """
    try:
        # Set up headers to mimic a browser request
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

        # Fetch the webpage
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()

        # Parse the HTML
        soup = BeautifulSoup(response.text, 'html.parser')

        # Try to find article body using BBC's typical article structure
        article_body = soup.find('div', class_=lambda x: x and ('article-body' in x or 'story-body' in x))

        # Use paragraphs inside the article body when one was found,
        # otherwise fall back to every paragraph on the page
        container = article_body if article_body else soup
        paragraphs = container.find_all('p')

        # Collapse whitespace per paragraph instead of get_text(strip=True):
        # strip=True joins text nodes with no separator, which glues words
        # together around inline tags such as links
        paragraph_texts = []
        for p in paragraphs:
            text = ' '.join(p.get_text().split())
            if text:
                paragraph_texts.append(text)

        article_text = ' '.join(paragraph_texts)

        # Limit text length to prevent overwhelming the summarization model
        return article_text[:max_chars]

    except Exception as e:
        # Best-effort scraping: report the failure and let the caller skip this URL
        print(f"Error scraping {url}: {e}")
        return ""

# Summarization pipelines already loaded in this kernel, keyed by model name,
# so the model weights are loaded once rather than on every call.
_SUMMARIZER_CACHE = {}


def _get_summarizer(model_name="facebook/bart-large-cnn"):
    """Return the cached summarization pipeline for model_name, creating it on first use."""
    if model_name not in _SUMMARIZER_CACHE:
        _SUMMARIZER_CACHE[model_name] = pipeline("summarization", model=model_name)
    return _SUMMARIZER_CACHE[model_name]


def summarize_text(text):
    """
    Generate a summary of the given text using Hugging Face's summarization pipeline

    Args:
    text (str): Text to summarize

    Returns:
    str: Generated summary, or a fixed message when the text is too short
         (50 characters or fewer) or summarization fails
    """
    try:
        # Check the length first so short inputs never trigger a model load
        if len(text) <= 50:
            return "Text too short for summarization."

        summarizer = _get_summarizer()

        # truncation=True clips inputs longer than the model's maximum
        # input length instead of failing on them
        summary = summarizer(text, max_length=150, min_length=50, do_sample=False, truncation=True)
        return summary[0]['summary_text'] if summary else "Unable to generate summary."

    except Exception as e:
        # Best-effort: report the failure and return a placeholder so a
        # batch run over many articles keeps going
        print(f"Error generating summary: {e}")
        return "Summary generation failed."


# Process each link

In [None]:
# Map each link to the title reported by the API so results are labelled with
# a headline rather than a URL.
# NOTE(review): assumes NewsData.io result dicts carry a 'title' key — confirm;
# the link itself is used whenever no title is found.
titles_by_link = {
    item.get('link'): item.get('title')
    for item in articles
    if isinstance(item, dict)
}

articles_data = []  # One {"title", "summary"} dict per successfully scraped article

for link in links:
    print(f"\n--- Scraping Article: {link} ---")

    # Scrape article text
    article_text = scrape_bbc_article(link)

    if not article_text:
        print("Could not extract article text.")
        continue

    # Generate summary
    summary = summarize_text(article_text)

    articles_data.append({
        "title": titles_by_link.get(link) or link,
        "summary": summary
    })

# Print the result
print("\nExtracted Articles Data:")
for entry in articles_data:
    print(f"Title: {entry['title']}")
    print(f"Summary: {entry['summary']}\n")



--- Scraping Article: https://www.bbc.com/news/videos/cx2yjlvvxk4o ---


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]


--- Scraping Article: https://www.bbc.com/news/articles/c938892g294o ---

--- Scraping Article: https://www.bbc.com/news/videos/cwypw11qnppo ---

--- Scraping Article: https://www.bbc.com/sport/cricket/articles/c140n8j8zp3o ---

--- Scraping Article: https://www.bbc.com/sport/cricket/articles/cgj62vwpll9o ---

--- Scraping Article: https://www.bbc.com/news/articles/c4gw7qedn15o ---

--- Scraping Article: https://www.bbc.com/sport/athletics/articles/cm2e38ny7rlo ---

--- Scraping Article: https://www.bbc.com/news/articles/cgl9yk42rz7o ---

--- Scraping Article: https://www.bbc.com/news/articles/c791dnjglq5o ---

--- Scraping Article: https://www.bbc.com/sport/football/articles/cy8yy2krj1mo ---

Extracted Articles Data:
Title: https://www.bbc.com/news/videos/cx2yjlvvxk4o
Summary: Rebel fighters and members of the public have gathered in Damascus to celebrate the end of President Bashar Al-Assad's regime. The BBC's Lina Sinjab was drowned out by the sounds of celebratory gunfire as she r