In [None]:
pip install newspaper3k feedparser


Collecting newspaper3k
  Downloading newspaper3k-0.2.8-py3-none-any.whl.metadata (11 kB)
Collecting feedparser
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting cssselect>=0.9.2 (from newspaper3k)
  Downloading cssselect-1.2.0-py2.py3-none-any.whl.metadata (2.2 kB)
Collecting tldextract>=2.0.1 (from newspaper3k)
  Downloading tldextract-5.1.2-py3-none-any.whl.metadata (11 kB)
Collecting feedfinder2>=0.0.4 (from newspaper3k)
  Downloading feedfinder2-0.0.4.tar.gz (3.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting jieba3k>=0.35.1 (from newspaper3k)
  Downloading jieba3k-0.35.1.zip (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m42.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tinysegmenter==0.3 (from newspaper3k)
  Downloading tinysegmenter-0.3.tar.gz (16 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sgmllib3k (from f

In [None]:
import feedparser
from newspaper import Article
import pandas as pd

# Function to parse RSS feeds and extract article URLs
def parse_rss_feed(rss_url):
    """Parse a feed and return the article links it lists.

    Args:
        rss_url: Feed location passed straight to ``feedparser.parse``.

    Returns:
        list: The ``link`` value of every entry that has a non-empty one, in
        feed order. Empty when the feed yields no usable entries.
    """
    feed = feedparser.parse(rss_url)
    entries = feed.get('entries', [])

    # feedparser flags feeds it could not parse cleanly via `bozo` rather than
    # raising; surface that when nothing usable came back so an empty result
    # is not mistaken for a feed that simply has no items.
    if feed.get('bozo') and not entries:
        print(f"Warning: no entries parsed from {rss_url}: {feed.get('bozo_exception')}")

    # Entries are not guaranteed to carry a link; skip those instead of
    # letting a KeyError abort processing of every remaining feed.
    article_urls = [entry['link'] for entry in entries if entry.get('link')]
    return article_urls

# Function to download and parse articles using newspaper3k
def extract_article_info(article_url):
    """Download and parse one article, returning its fields as a flat record.

    Args:
        article_url: URL of the article handed to ``newspaper.Article``.

    Returns:
        dict: Keys ``'URL'``, ``'Title'``, ``'Authors'`` (author names joined
        with ``', '``), ``'Publish Date'`` and ``'Content'``.

    Any exception raised by ``download()`` or ``parse()`` propagates to the
    caller.
    """
    page = Article(article_url)
    page.download()
    page.parse()

    # Build the record in one step; attribute reads happen in the same order
    # as the keys are listed.
    return {
        'URL': article_url,
        'Title': page.title,
        'Authors': ', '.join(page.authors),
        'Publish Date': page.publish_date,
        'Content': page.text,
    }

# Function to parse RSS feeds and extract information from articles
def extract_articles_from_rss(rss_urls):
    """Collect article records from every feed in ``rss_urls``.

    Args:
        rss_urls: Iterable of feed URLs, processed in the given order.

    Returns:
        list: One record (as produced by ``extract_article_info``) per article
        that was extracted without error, in processing order.

    Progress and per-article failures are reported with ``print``; an article
    that raises is logged and skipped, and the run continues.
    """
    collected = []

    for feed_url in rss_urls:
        print(f"Processing RSS feed: {feed_url}")

        # Resolve the feed to its article links, then fetch each one.
        for link in parse_rss_feed(feed_url):
            try:
                print(f"Extracting article from: {link}")
                collected.append(extract_article_info(link))
            except Exception as err:
                # Best-effort: one bad article must not stop the rest.
                print(f"Failed to process {link}: {err}")

    return collected

# Driver: scrape a fixed set of news feeds, tabulate the results, and persist them.
#
# List of RSS feeds to parse (processed in this order).
# NOTE(review): in the recorded run the CNN feed was followed immediately by
# the BBC feed with no "Extracting article" lines in between, i.e. it yielded
# no article URLs — confirm that this feed URL is still reachable/valid.
rss_feed_urls = [
    "https://rss.cnn.com/rss/cnn_topstories.rss",
    "https://feeds.bbci.co.uk/news/rss.xml",
    "https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml"
]

# Step 3: Extract articles from the given RSS feeds.
# This performs one network download per article, so it dominates the cell's runtime.
articles = extract_articles_from_rss(rss_feed_urls)

# Step 4: Convert the extracted data into a DataFrame and save it as a CSV file.
# The path is relative, so the file lands in the current working directory;
# index=False keeps the DataFrame's row index out of the CSV.
df = pd.DataFrame(articles)
df.to_csv('extracted_news_articles.csv', index=False)

# Display the first rows of the extracted data as a quick sanity check
print(df.head())


Processing RSS feed: https://rss.cnn.com/rss/cnn_topstories.rss
Processing RSS feed: https://feeds.bbci.co.uk/news/rss.xml
Extracting article from: https://www.bbc.com/news/articles/cq5eewvy3nlo
Extracting article from: https://www.bbc.com/news/articles/c2kddp5x5zno
Extracting article from: https://www.bbc.com/news/articles/c98486dzxnzo
Extracting article from: https://www.bbc.com/news/articles/cvg3g8wpwwqo
Extracting article from: https://www.bbc.com/news/articles/cqlvv2xwd9po
Extracting article from: https://www.bbc.com/news/articles/c4g003lnkm9o
Extracting article from: https://www.bbc.com/sport/football/articles/c1d7drg10nwo
Extracting article from: https://www.bbc.com/news/articles/cy0gg9k76w9o
Extracting article from: https://www.bbc.com/news/articles/c5y00x52d2vo
Extracting article from: https://www.bbc.com/news/articles/c5y3y79llndo
Extracting article from: https://www.bbc.com/news/videos/cevyy2k1z88o
Extracting article from: https://www.bbc.com/news/articles/cq6449gy87jo
Extra