In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from googlenewsdecoder import gnewsdecoder
from urllib.parse import urlparse
from fake_useragent import UserAgent

In [2]:
import feedparser
from newspaper import Article
from urllib.parse import unquote

In [3]:
# Fetch the top stories from the Google News RSS feed (each link is a Google News redirect URL)

def get_news(rss_url="https://news.google.com/rss?hl=en-US&gl=US&ceid=US:en", timeout=10):
    """Fetch an RSS feed and return its items as a list of dicts.

    Args:
        rss_url: URL of the RSS feed. Defaults to the Google News
            US/English feed.
        timeout: Seconds to wait for the server, passed to ``requests.get``
            so a stalled connection cannot hang the notebook.

    Returns:
        One dict per ``<item>`` element with the keys ``title``, ``source``,
        ``time`` (text of the ``pubDate`` tag) and ``link``. A field whose
        tag is missing from an item is returned as an empty string.

    Raises:
        requests.exceptions.RequestException: on connection failure, timeout,
            or an HTTP error status.
    """
    response = requests.get(rss_url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as a feed.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'xml')

    def tag_text(item, tag_name):
        # Not every feed item carries every tag; avoid AttributeError on None.
        node = item.find(tag_name)
        return node.text if node is not None else ''

    return [{
        'title': tag_text(item, 'title'),
        'source': tag_text(item, 'source'),
        'time': tag_text(item, 'pubDate'),
        'link': tag_text(item, 'link')
    } for item in soup.find_all('item')]

In [4]:
# Decode the Google News links to their original source URLs

def decode_google_url(google_url, interval=2, proxy=None):
    """Resolve a Google News redirect URL to the publisher's URL.

    Args:
        google_url: URL taken from the Google News feed.
        interval: Forwarded to ``gnewsdecoder`` as ``interval`` (used there as
            a delay in seconds between requests -- confirm against the
            package docs).
        proxy: Forwarded to ``gnewsdecoder`` as ``proxy``; ``None`` means no
            proxy.

    Returns:
        The decoded URL when the decoder reports success. Otherwise the
        input URL is returned unchanged: when the host is not
        ``news.google.com``, when the decoder reports failure, or when any
        exception is raised. This function never raises.
    """
    try:
        # Anything not hosted on news.google.com is treated as already
        # decoded. This skips a slow decoder round trip and makes it safe to
        # run the decoding loop again on links that were rewritten earlier.
        if urlparse(google_url).hostname != 'news.google.com':
            return google_url

        result = gnewsdecoder(google_url, interval=interval, proxy=proxy)

        if result.get('status'):
            return result['decoded_url']

        # Report the decoder's own failure instead of falling back silently.
        print(f"Decoding failed: {result.get('message', 'decoder reported failure')}")
        return google_url  # Fallback to original URL

    except Exception as e:
        print(f"Decoding failed: {str(e)}")
        return google_url


In [5]:
# Pull the feed with the default URL and show the parsed items
news = get_news()
news

[{'title': 'The Memo: Democrats, lawyers left reeling from Paul, Weiss firm’s Trump deal - The Hill',
  'source': 'The Hill',
  'time': 'Sat, 22 Mar 2025 10:16:58 GMT',
  'link': 'https://news.google.com/rss/articles/CBMihwFBVV95cUxPS3VXaTF1S1RoN1dZTXQ4elZIQk1sY18xdjVvaTR1U0dhV1hWWUtyb2dFYi04VFpGZnEzYW5wQnNFcHB2RWp4eWRfc1JxWGV0Tlp1RFNnS2VYS0Z2Y0V2RFpoSEdnQm5rcDh1b2x3NENpaXNWUm9JU2RsZlltMVdZM0otSFlMREXSAYwBQVVfeXFMTnlNXzZVTlk0NFU5dnBRZ2tsRW9kLTItaXJHRVFXNllaTW92V2stRGlEb0czbXZ6UnBGeVdCTzlQcGV0NGhlZkEweFNzSjBMTXBndmozVUg4aWdTNWtTR1RzTE5WUW1kQWdsOWpsQXE0Y0d2a0Q1S0puZDJTc2E3ZFpJVzVmTkVvWEtiOXo?oc=5'},
 {'title': 'Sanders-AOC rallies setting stage for protest movement: Activist - WANE',
  'source': 'WANE',
  'time': 'Sat, 22 Mar 2025 01:10:27 GMT',
  'link': 'https://news.google.com/rss/articles/CBMilAFBVV95cUxQd0Y3QW8wY3d5YlRjVVJheVNvRU1sQTVISVRjVUN2cEZXLWV0V2lHSGdjTVpIMjRJMER1Q0twbEtpUThlR0gwb1hTdmpTM3NJVGpFVFdSblpXVEdFamQtU1dpaks2clAzVlVrQ280Wklra0FJRGR2d0VIVTFUcGpRbW1XVE5vREl3bXAwU0hNNU

In [6]:
# Swap each Google News redirect link for its decoded URL (updates the dicts
# in `news` in place) and echo the result as it is resolved.
for item in news:
    item.update(link=decode_google_url(item['link']))
    print(item['link'])

https://thehill.com/homenews/administration/5208538-trump-intimidates-law-firms/
https://www.wane.com/news/sanders-aoc-rallies-setting-stage-for-protest-movement-activist/
https://www.pbs.org/newshour/world/u-s-detention-of-european-and-canadian-tourists-creates-fear-over-traveling-to-america
https://www.bbc.com/news/articles/cn4ynpzk8d8o
https://www.aljazeera.com/news/2025/3/22/trump-revokes-security-clearances-of-biden-harris-and-clinton
https://abcnews.go.com/Politics/acting-social-security-chief-now-shut-agency-after/story?id=120046608
https://www.city-journal.org/article/mahmoud-khalil-columbia-hamas-speech-ramzi-kassem
https://apnews.com/article/oak-hill-virginia-james-monroe-estate-20bf992b46a750959a9fa6a6e0a0ee83
https://www.politico.com/news/2025/03/21/james-boasberg-trump-administration-deportations-00003815
https://www.reuters.com/world/uk/global-travel-chaos-has-airlines-scrambling-after-fire-forces-heathrow-shutdown-2025-03-22/
https://www.vaticannews.va/en/pope/news/2025-

In [7]:
# Show the news list again; the loop in the previous cell rewrote each 'link' in place
news

[{'title': 'The Memo: Democrats, lawyers left reeling from Paul, Weiss firm’s Trump deal - The Hill',
  'source': 'The Hill',
  'time': 'Sat, 22 Mar 2025 10:16:58 GMT',
  'link': 'https://thehill.com/homenews/administration/5208538-trump-intimidates-law-firms/'},
 {'title': 'Sanders-AOC rallies setting stage for protest movement: Activist - WANE',
  'source': 'WANE',
  'time': 'Sat, 22 Mar 2025 01:10:27 GMT',
  'link': 'https://www.wane.com/news/sanders-aoc-rallies-setting-stage-for-protest-movement-activist/'},
 {'title': 'U.S. detention of European and Canadian tourists creates fear over traveling to America - PBS NewsHour',
  'source': 'PBS NewsHour',
  'time': 'Fri, 21 Mar 2025 22:41:07 GMT',
  'link': 'https://www.pbs.org/newshour/world/u-s-detention-of-european-and-canadian-tourists-creates-fear-over-traveling-to-america'},
 {'title': 'Israel strikes Lebanon after first rocket attack since ceasefire - BBC.com',
  'source': 'BBC.com',
  'time': 'Sat, 22 Mar 2025 12:56:59 GMT',
  '

In [8]:
import requests
from newspaper import Article
from fake_useragent import UserAgent
from requests.adapters import HTTPAdapter
from urllib3.util import Retry
import time

# Shared HTTP session: GET requests that hit a transient server error
# (500/502/503/504) are retried up to 3 times with backoff.
session = requests.Session()
retries = Retry(
    total=3,
    backoff_factor=0.5,
    status_forcelist=[500, 502, 503, 504],
    allowed_methods=["GET"]
)
# Mount the adapter on both schemes so plain-http article URLs get the same
# retry policy as https ones.
retry_adapter = HTTPAdapter(max_retries=retries)
session.mount('https://', retry_adapter)
session.mount('http://', retry_adapter)

def get_full_article_content(article_url):
    """Download one article and extract its main fields.

    The page is fetched through the module-level ``session`` with
    browser-like headers, then handed to ``newspaper.Article`` for parsing.

    Args:
        article_url: URL of the article page.

    Returns:
        A dict with the keys ``title``, ``publish_date``, ``text``,
        ``top_image`` and ``source_url``, or ``None`` if the download or
        parsing failed for any reason (a message is printed in that case).
    """
    headers = {
        'User-Agent': UserAgent().random,
        'Accept-Language': 'en-US,en;q=0.9',
        'Referer': 'https://news.google.com/',
        'DNT': '1'
    }

    try:
        # timeout is (connect, read) in seconds
        response = session.get(article_url, headers=headers, timeout=(3.05, 10))
        response.raise_for_status()

        # When the server declares no charset, requests decodes text/* bodies
        # as ISO-8859-1, which turns UTF-8 punctuation into mojibake in the
        # extracted text. Detect the encoding from the body in that case.
        content_type = response.headers.get('Content-Type', '')
        if 'charset' not in content_type.lower():
            response.encoding = response.apparent_encoding

        article = Article(article_url)
        # Reuse the HTML we already fetched instead of downloading it again.
        article.download(input_html=response.text)
        article.parse()

        return {
            'title': article.title,
            'publish_date': article.publish_date,
            'text': article.text,
            'top_image': article.top_image,
            'source_url': article_url,
        }

    except requests.exceptions.Timeout:
        print(f"⏰ Timeout skipped: {article_url}")
        return None
    except Exception as e:
        print(f"⚠️ Error processing {article_url}: {str(e)}")
        return None

# Batch processing with progress tracking
def process_news_items(news_items, delay=1):
    """Download every article in ``news_items``, pausing between requests.

    Args:
        news_items: Sequence of dicts, each with a ``'link'`` key holding the
            article URL.
        delay: Seconds to sleep between consecutive downloads.

    Returns:
        A list of the article dicts returned by ``get_full_article_content``;
        items whose download failed (falsy result) are left out.
    """
    results = []
    total = len(news_items)
    for idx, item in enumerate(news_items, 1):
        print(f"Processing article {idx}/{total}...")
        article = get_full_article_content(item['link'])
        if article:
            results.append(article)
        # The pause only spaces out requests, so there is nothing to wait
        # for after the final item.
        if idx < total:
            time.sleep(delay)
    return results

# Usage: download every article referenced in `news` (default delay=1 between
# requests), then display the collected article dicts.
newslist = process_news_items(news)
newslist

Processing article 1/37...
Processing article 2/37...
Processing article 3/37...
Processing article 4/37...
Processing article 5/37...
Processing article 6/37...
Processing article 7/37...
Processing article 8/37...
Processing article 9/37...
⚠️ Error processing https://www.politico.com/news/2025/03/21/james-boasberg-trump-administration-deportations-00003815: 403 Client Error: Forbidden for url: https://www.politico.com/news/2025/03/21/james-boasberg-trump-administration-deportations-00003815
Processing article 10/37...
Processing article 11/37...
Processing article 12/37...
Processing article 13/37...
Processing article 14/37...
Processing article 15/37...
Processing article 16/37...
Processing article 17/37...
Processing article 18/37...
Processing article 19/37...
Processing article 20/37...
Processing article 21/37...
Processing article 22/37...
Processing article 23/37...
Processing article 24/37...
Processing article 25/37...
Processing article 26/37...
Processing article 27/37.

[{'title': 'The Memo: Democrats, lawyers left reeling from Paul, Weiss firm’s Trump deal',
  'publish_date': datetime.datetime(2025, 3, 22, 10, 0, tzinfo=tzutc()),
  'text': 'A Washington Post editorial appeared earlier this week headlined “Trump’s efforts to intimidate the legal profession cannot stand.”\n\nWithin 48 hours later, those efforts were not merely standing. They had worked — at least in one case.\n\nThe major law firm of Paul, Weiss, Rifkind, Wharton & Garrison (called Paul, Weiss), targeted by President Trump in an executive order, opted to mollify rather than confront the president. In short, it backed down.\n\nDemocrats, as well as many lawyers, are grappling with the implications.\n\nPaul, Weiss agreed to provide $40 million in pro bono work on causes backed by the administration and to hire an outside expert to audit its hiring and employment practices.\n\nAfter the firm had made those concessions, and some others, Trump agreed to lift an executive order that would ha

In [9]:
# Build a DataFrame from the list of article dicts
df = pd.DataFrame(newslist)

In [10]:
# Display the article table
df

Unnamed: 0,title,publish_date,text,top_image,source_url
0,"The Memo: Democrats, lawyers left reeling from...",2025-03-22 10:00:00+00:00,A Washington Post editorial appeared earlier t...,https://thehill.com/wp-content/uploads/sites/2...,https://thehill.com/homenews/administration/52...
1,Sanders-AOC rallies setting stage for protest ...,2025-03-22 01:10:27+00:00,(NewsNation) — There are signs to be read in t...,https://www.wane.com/wp-content/uploads/sites/...,https://www.wane.com/news/sanders-aoc-rallies-...
2,U.S. detention of European and Canadian touris...,2025-03-21 18:41:07-04:00,SAN DIEGO (AP) — Lennon Tyler and her German f...,https://d3i6fh83elv35t.cloudfront.net/static/2...,https://www.pbs.org/newshour/world/u-s-detenti...
3,Israel strikes Lebanon after first rocket atta...,,Israel strikes Lebanon after first rocket atta...,https://ichef.bbci.co.uk/news/1024/branded_new...,https://www.bbc.com/news/articles/cn4ynpzk8d8o
4,"Trump targets Biden, Harris in US security cle...",2025-03-22 00:00:00,The executive order also targets Hillary Clint...,https://www.aljazeera.com/wp-content/uploads/2...,https://www.aljazeera.com/news/2025/3/22/trump...
5,Acting Social Security chief now says he won't...,,Acting Social Security chief now says he won't...,https://i.abcnewsfe.com/a/d0e49d12-8d2b-459a-a...,https://abcnews.go.com/Politics/acting-social-...
6,Who Are the Shadowy Figures Defending Mahmoud ...,,"As it unfurls, the saga of Mahmoud Khalilâth...",https://media4.manhattan-institute.org/wp-cont...,https://www.city-journal.org/article/mahmoud-k...
7,James Monroe’s Oak Hill estate has historic ro...,2025-03-22 11:00:43,"ALDIE, Va. (AP) — The room where President Jam...",https://dims.apnews.com/dims4/default/e0c40ca/...,https://apnews.com/article/oak-hill-virginia-j...
8,Heathrow resumes operations as global airlines...,2025-03-22 00:00:00,Summary\n\nCompanies Operations normal on Satu...,https://www.reuters.com/resizer/v2/4XH4L74NIJN...,https://www.reuters.com/world/uk/global-travel...
9,Pope to greet crowd at Gemelli on Sunday,2025-03-22 00:00:00,People pray next the statue of Pope John Paul ...,https://www.vaticannews.va/content/dam/vatican...,https://www.vaticannews.va/en/pope/news/2025-0...


In [16]:
# Source URL of the row labelled 0
df.loc[0, "source_url"]

'https://thehill.com/homenews/administration/5208538-trump-intimidates-law-firms/'