<a href="https://colab.research.google.com/github/aeleraqi/GoogleNewsScraper./blob/main/GoogleNewsScraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install feedparser

Collecting feedparser
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting sgmllib3k (from feedparser)
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading feedparser-6.0.11-py3-none-any.whl (81 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.3/81.3 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: sgmllib3k
  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone
  Created wheel for sgmllib3k: filename=sgmllib3k-1.0.0-py3-none-any.whl size=6047 sha256=cb2ee45d483a6fa4a9b55fa3e2fa1d57b62b63298d497d3b0ee09205d877747c
  Stored in directory: /root/.cache/pip/wheels/f0/69/93/a47e9d621be168e9e33c7ce60524393c0b92ae83cf6c6e89c5
Successfully built sgmllib3k
Installing collected packages: sgmllib3k, feedparser
Successfully installed feedparser-6.0.11 sgmllib3k-1.0.0


In [None]:
import feedparser
from datetime import datetime, timedelta
import logging
import pandas as pd
from urllib.parse import quote

# Configure logging.
# force=True replaces any handlers already attached to the root logger; without it
# basicConfig is a no-op when handlers exist (e.g. on re-running this cell), and the
# INFO progress messages emitted below would not be shown in this format.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True,
)

# Define the Google News Feed Scraper class
class GoogleNewsFeedScraper:
    """Collect Google News RSS search results for one query within a date range.

    Args:
        query: Search terms; URL-encoded before being placed in the feed URL.
        start_date: ``datetime`` marking the earliest publication time to keep
            (inclusive).
        end_date: ``datetime`` marking the latest publication time to keep
            (inclusive). A value at exactly midnight (what parsing a bare
            ``YYYY-MM-DD`` produces) is treated as the whole calendar day.
        language: Language code such as ``'en'``; sent as ``hl`` and, truncated
            to two characters, as the language part of ``ceid``.
    """

    # strptime pattern for the ``published`` string of a feed entry.
    _PUBLISHED_FORMAT = '%a, %d %b %Y %H:%M:%S %Z'

    def __init__(self, query, start_date, end_date, language):
        self.query = query
        self.start_date = start_date  # Already a datetime object
        self.end_date = end_date      # Already a datetime object
        self.language = language

    def _build_rss_url(self, day):
        """Return the RSS search URL restricted to a window around ``day``."""
        # The window is padded by one day on each side so that ``day`` is covered
        # whether the operators are inclusive or exclusive; overlaps between
        # neighbouring windows are removed by the de-duplication in the caller.
        # NOTE(review): relies on Google News honouring the after:/before: search
        # operators in RSS queries — confirm against live results.
        window_start = (day - timedelta(days=1)).strftime('%Y-%m-%d')
        window_end = (day + timedelta(days=1)).strftime('%Y-%m-%d')
        encoded_query = quote(f'{self.query} after:{window_start} before:{window_end}')
        return (
            f'https://news.google.com/rss/search?q={encoded_query}'
            f'&hl={self.language}&gl=US&ceid=US:{self.language[:2]}'
        )

    def _inclusive_end(self):
        """Return the last instant that still counts as inside the range."""
        midnight = self.end_date.replace(hour=0, minute=0, second=0, microsecond=0)
        if self.end_date == midnight:
            # A date-only end bound means "through the end of that day".
            return midnight + timedelta(days=1) - timedelta(microseconds=1)
        return self.end_date

    def scrape_google_news_feed(self):
        """Fetch the feed once per day in the range and return matching articles.

        Returns:
            list[dict]: One dict per unique (title, link) pair with the keys
            ``Title``, ``Link``, ``Description``, ``Published`` (a naive
            ``datetime`` parsed from the entry) and ``Source`` (``'Unknown'``
            when the entry carries no source title).
        """
        articles = []
        seen = set()  # (title, link) pairs already collected
        range_end = self._inclusive_end()
        current_date = self.start_date

        while current_date <= range_end:
            # Each iteration requests a different date window; previously the same
            # URL was fetched on every pass, yielding only duplicates.
            feed = feedparser.parse(self._build_rss_url(current_date))

            if not feed.entries:
                day_label = current_date.strftime('%Y-%m-%d')
                if getattr(feed, 'bozo', False):
                    # feedparser flags malformed/unfetchable feeds via ``bozo``.
                    reason = getattr(feed, 'bozo_exception', 'unknown error')
                    logging.warning(f"Feed could not be parsed for date {day_label}: {reason}")
                else:
                    logging.info(f"No articles found for date: {day_label}")

            for entry in feed.entries:
                title = getattr(entry, 'title', '')
                try:
                    # Try to parse the published date from the entry
                    pubdate = datetime.strptime(entry.published, self._PUBLISHED_FORMAT)
                except (AttributeError, ValueError):
                    logging.warning(f"Failed to parse date for article: {title}")
                    continue

                # Keep only articles published inside the requested range.
                if not (self.start_date <= pubdate <= range_end):
                    continue

                link = getattr(entry, 'link', '')
                if (title, link) in seen:
                    continue
                seen.add((title, link))

                articles.append({
                    'Title': title,
                    'Link': link,
                    'Description': getattr(entry, 'summary', getattr(entry, 'description', '')),
                    'Published': pubdate,
                    'Source': getattr(getattr(entry, 'source', None), 'title', 'Unknown'),
                })

            current_date += timedelta(days=1)

        return articles

# Function to fetch articles for multiple queries
def fetch_articles(queries, start_date, end_date, language):
    """Scrape Google News for each query and return all articles in one list.

    Args:
        queries: Iterable of query strings, or a single query string. Surrounding
            whitespace is stripped and blank entries are skipped.
        start_date: Start of the date range, passed through to the scraper.
        end_date: End of the date range, passed through to the scraper.
        language: Language code, passed through to the scraper.

    Returns:
        list: The article dicts from every query, concatenated in query order.
        Duplicates across queries are not removed here.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(queries, str):
        queries = [queries]

    all_articles = []

    for raw_query in queries:
        query = raw_query.strip()
        if not query:
            # e.g. the empty piece produced by a trailing comma in user input
            logging.warning("Skipping empty query")
            continue

        logging.info(f"Fetching news for query: {query}")
        scraper = GoogleNewsFeedScraper(query, start_date, end_date, language)
        articles = scraper.scrape_google_news_feed()
        all_articles.extend(articles)
        logging.info(f"Fetched {len(articles)} articles for query: {query}")
        logging.info("="*80)

    return all_articles

# User inputs
# Split the comma-separated keywords, trimming whitespace and dropping blank pieces
# (such as the one left by a trailing comma).
raw_keywords = input("Enter keywords (comma separated for multiple): ")
keywords = [keyword.strip() for keyword in raw_keywords.split(',') if keyword.strip()]
if not keywords:
    raise ValueError("At least one non-empty keyword is required.")

start_date_str = input("Enter start date (YYYY-MM-DD): ")
end_date_str = input("Enter end date (YYYY-MM-DD): ")
# Stray whitespace would otherwise end up inside the feed URL.
language = input("Enter language (e.g., en for English, ar for Arabic): ").strip()

# Convert the date strings to datetime objects
try:
    start_date = datetime.strptime(start_date_str.strip(), '%Y-%m-%d')
    end_date = datetime.strptime(end_date_str.strip(), '%Y-%m-%d')
except ValueError as e:
    print(f"Error: {e}. Please ensure dates are in the format YYYY-MM-DD.")
    raise

# A reversed range can never match anything; fail with a clear message instead.
if start_date > end_date:
    raise ValueError(
        f"Start date {start_date_str} is after end date {end_date_str}."
    )

# Fetch and display the news articles
all_articles = fetch_articles(keywords, start_date, end_date, language)

# Convert the list of articles to a DataFrame and remove duplicates.
# The columns are named explicitly so that an empty result still yields a frame with
# 'Title' and 'Link'; otherwise drop_duplicates raises KeyError when nothing was found.
df = pd.DataFrame(
    all_articles,
    columns=['Title', 'Link', 'Description', 'Published', 'Source'],
).drop_duplicates(subset=['Title', 'Link'])

Enter keywords (comma separated for multiple): Gaza
Enter start date (YYYY-MM-DD): 2024-10-01
Enter end date (YYYY-MM-DD): 2024-10-13
Enter language (e.g., en for English, ar for Arabic): en


In [None]:
# Summarise the de-duplicated results: row count, column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 63 entries, 0 to 62
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   Title        63 non-null     object        
 1   Link         63 non-null     object        
 2   Description  63 non-null     object        
 3   Published    63 non-null     datetime64[ns]
 4   Source       63 non-null     object        
dtypes: datetime64[ns](1), object(4)
memory usage: 3.0+ KB


In [None]:
# Preview the first five rows of the results.
df.head(5)

Unnamed: 0,Title,Link,Description,Published,Source
0,France's Macron calls for an end to arms expor...,https://news.google.com/rss/articles/CBMisgFBV...,"<a href=""https://news.google.com/rss/articles/...",2024-10-11 19:03:36,Reuters
1,Bill Maher targets LGBTQ singer Chappell Roan ...,https://news.google.com/rss/articles/CBMiqgFBV...,"<a href=""https://news.google.com/rss/articles/...",2024-10-12 22:35:00,The Hill
2,UN inquiry accuses Israel of ‘crime of extermi...,https://news.google.com/rss/articles/CBMingFBV...,"<a href=""https://news.google.com/rss/articles/...",2024-10-11 09:49:00,CNN
3,UN inquiry accuses Israel of seeking to destro...,https://news.google.com/rss/articles/CBMiygFBV...,"<a href=""https://news.google.com/rss/articles/...",2024-10-11 07:42:02,Reuters
4,UN inquiry accuses Israel of crime of ‘extermi...,https://news.google.com/rss/articles/CBMitAFBV...,"<a href=""https://news.google.com/rss/articles/...",2024-10-10 16:18:45,Al Jazeera English


In [None]:
# Save the results to 'data.xlsx' in the current working directory, leaving out the
# index column.
# NOTE(review): writing .xlsx needs an Excel writer engine such as openpyxl — confirm
# it is installed in the target runtime.
df.to_excel('data.xlsx', index=False)