# News Scraper with Advanced Content Cleaning

This notebook extracts news articles and applies comprehensive cleaning to get only the article content.

## Requirements
```bash
pip install newspaper4k pygooglenews lxml_html_clean googlenewsdecoder pandas
```

## Cleaning Features
- Remove email addresses (contact information), URLs, and LaTeX markup
- Remove social media links and CTAs (call-to-action)
- Remove navigation text and headers
- Remove advertisements and boilerplate
- Clean and format into proper paragraphs
- Remove excessive whitespace
- Filter out short non-content sentences

In [1]:
# Import required libraries
from pygooglenews import GoogleNews
from newspaper import Article
import pandas as pd
from datetime import datetime
import time
from googlenewsdecoder import new_decoderv1
import re

In [2]:
def clean_article_text(text):
    """
    Comprehensive cleaning of article text to remove boilerplate, ads, and noise.

    The text goes through these stages, in order:
      1. email addresses are deleted;
      2. http(s):// and www. URLs are deleted;
      3. LaTeX markup (display math, inline math, environments, commands) is deleted;
      4. known boilerplate phrases are deleted together with the rest of their sentence;
      5. sentences with fewer than 10 words, with more than 30% upper-case
         characters, or mentioning social-media / newsletter keywords are dropped;
      6. every character other than word characters, whitespace and basic
         punctuation is replaced by a space (this includes currency symbols);
      7. whitespace runs are collapsed to a single space;
      8. the remaining sentences are grouped four per paragraph.

    Args:
        text: Raw article text. ``None`` and blank strings are accepted.

    Returns:
        The cleaned text with paragraphs separated by a blank line, or ``""``
        when there is nothing to clean or nothing survives the filters.
    """
    if not text or not text.strip():
        return ""

    # Step 1: Remove email addresses.
    # The TLD class is [A-Za-z]; a '|' inside a character class is a literal
    # pipe, not an alternation, so it must not appear there.
    text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', '', text)

    # Step 2: Remove URLs.
    # NOTE: '$-_' inside the class is a character *range* ('$' through '_'), so
    # these patterns also consume trailing punctuation such as '.', ',' and ')'.
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
    text = re.sub(r'www\.(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)

    # Step 3: Remove LaTeX patterns.
    # Display math must be handled BEFORE inline math: otherwise the inline rule
    # matches each '$$' delimiter as an empty formula and leaves the formula
    # body behind. The match may span lines but never a blank line, so two
    # unrelated '$$' in different paragraphs cannot delete the text between them.
    text = re.sub(r'\$\$(?:(?!\n[ \t]*\n).)+?\$\$', '', text, flags=re.DOTALL)  # Display math
    # Inline math: the opening '$' must be followed by a non-space character,
    # the closing '$' must be preceded by a non-space character and must not be
    # followed by a digit. This keeps prices such as "$5 million and $10 million"
    # from being read as a formula and deleted.
    text = re.sub(r'\$(?!\s)[^$\n]+?(?<!\s)\$(?!\d)', '', text)  # Inline math
    text = re.sub(r'\\\[.*?\\\]', '', text, flags=re.DOTALL)  # Display math
    text = re.sub(r'\\\(.*?\\\)', '', text, flags=re.DOTALL)  # Inline math
    text = re.sub(r'\\begin\{[a-z]+\*?\}.*?\\end\{[a-z]+\*?\}', '', text, flags=re.DOTALL)  # Environments
    text = re.sub(r'\\[a-zA-Z]+\*?(?:\[[^\]]*\])?(?:\{[^\}]*\})?', '', text)  # Commands

    # Step 4: Remove common boilerplate patterns
    boilerplate_patterns = [
        r'(?i)subscribe to our newsletter',
        r'(?i)sign up for our newsletter',
        r'(?i)follow us on',
        r'(?i)share this article',
        r'(?i)read more:',
        r'(?i)advertisement',
        r'(?i)click here',
        r'(?i)related articles',
        r'(?i)you may also like',
        r'(?i)recommended for you',
        r'(?i)terms of service',
        r'(?i)privacy policy',
        r'(?i)cookie policy',
        r'(?i)all rights reserved',
        r'(?i)copyright ©',
        r'©\s*\d{4}',
        r'(?i)join our community',
        r'(?i)get the latest',
        r'(?i)breaking news',
        r'(?i)trending now',
    ]

    # Each phrase is removed together with everything up to and including the
    # next sentence terminator; a phrase with no terminator after it is left
    # for the sentence filter below.
    for pattern in boilerplate_patterns:
        text = re.sub(pattern + r'[^.!?]*[.!?]', '', text)

    # Step 5: Split into sentences and filter
    sentences = re.split(r'(?<=[.!?])\s+', text)

    # Substrings that mark a sentence as navigation / promotion rather than content
    nav_keywords = ('facebook', 'twitter', 'instagram', 'linkedin', 'youtube',
                    'subscribe', 'newsletter', 'advertisement', 'sponsored')

    cleaned_sentences = []
    for sentence in sentences:
        sentence = sentence.strip()

        # Skip if too short (fewer than 10 words is likely navigation/ads)
        if len(sentence.split()) < 10:
            continue

        # Skip if more than 30% of the characters are capitals (likely navigation).
        # The sentence is non-empty here because it has at least 10 words.
        capitals = sum(1 for c in sentence if c.isupper())
        if capitals / len(sentence) > 0.3:
            continue

        # Skip sentences with common navigation patterns
        lowered = sentence.lower()
        if any(keyword in lowered for keyword in nav_keywords):
            continue

        cleaned_sentences.append(sentence)

    # Step 6: Replace special characters with spaces
    cleaned_text = ' '.join(cleaned_sentences)
    cleaned_text = re.sub(r'[^\w\s.,!?;:\'\"\-()]', ' ', cleaned_text)

    # Step 7: Remove excessive whitespace (this includes newlines like \n)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
    cleaned_text = cleaned_text.strip()

    # Step 8: Format into paragraphs of four sentences each (the last one may be shorter)
    sentences = re.split(r'(?<=[.!?])\s+', cleaned_text)
    paragraphs = [
        ' '.join(sentences[start:start + 4])
        for start in range(0, len(sentences), 4)
    ]

    # Join paragraphs with double line breaks
    return '\n\n'.join(paragraphs)

In [3]:
def decode_google_news_url(google_url, max_retries=3):
    """
    Resolve a Google News redirect link to the publisher's article URL.

    Calls ``new_decoderv1(google_url, interval=2)`` up to ``max_retries`` times.
    An attempt succeeds when the result has a truthy ``status`` and a
    ``decoded_url`` containing ``'http'``. Between attempts the function
    sleeps for two seconds. If the final attempt raises, the error is printed.

    Returns:
        The decoded URL, or ``None`` when no attempt succeeded.
    """
    final_attempt = max_retries - 1

    for attempt_no in range(max_retries):
        can_retry = attempt_no < final_attempt
        try:
            outcome = new_decoderv1(google_url, interval=2)

            if outcome.get('status'):
                candidate = outcome.get('decoded_url')
                if candidate and 'http' in candidate:
                    return candidate

            # Unusable answer: pause before the next attempt, if there is one
            if can_retry:
                time.sleep(2)

        except Exception as e:
            if not can_retry:
                # Only the last failure is reported
                print(f"   Error decoding URL: {str(e)}")
            else:
                time.sleep(2)

    return None

In [4]:
def _failed_article_record(url, resolved_url, reason):
    """Build the placeholder record returned when an article cannot be extracted."""
    return {
        'url': url,
        'resolved_url': resolved_url,
        'title': 'Error',
        'authors': '',
        'publish_date': None,
        'raw_content': '',
        'cleaned_content': '',
        'top_image': '',
        'keywords': '',
        'raw_length': 0,
        'cleaned_length': 0,
        'extraction_status': f'Failed: {reason}'
    }


def extract_full_article(url):
    """
    Extract and clean full article content.

    Google News links (URLs containing ``news.google.com``) are first resolved
    with ``decode_google_news_url``. The article is then downloaded and parsed
    with ``newspaper.Article`` and its text is passed through
    ``clean_article_text``.

    Args:
        url: Article URL, or a Google News redirect URL.

    Returns:
        A dict with the keys ``url``, ``resolved_url``, ``title``, ``authors``,
        ``publish_date``, ``raw_content``, ``cleaned_content``, ``top_image``,
        ``keywords``, ``raw_length``, ``cleaned_length`` and
        ``extraction_status``. ``extraction_status`` is ``'Success'`` or
        ``'Failed: <reason>'``; on failure the content fields are empty. Errors
        deriving from ``Exception`` are reported this way and never raised.
    """
    original_url = url

    try:
        # Decode Google News URL if needed
        if 'news.google.com' in url:
            print("   Decoding Google News URL...")
            url = decode_google_news_url(url)

            if not url:
                return _failed_article_record(
                    original_url, None, 'Could not decode Google News URL'
                )

            print(f"   ✓ Decoded to: {url[:80]}...")

        # Extract article content
        article = Article(url)
        article.download()
        article.parse()

        # NLP features (keywords) are optional, so an ordinary failure here is
        # ignored. Catch Exception rather than using a bare except so that
        # KeyboardInterrupt / SystemExit still stop the run.
        try:
            article.nlp()
        except Exception:
            pass

        raw_text = article.text

        # Apply comprehensive cleaning
        print("   Cleaning content...")
        cleaned_text = clean_article_text(raw_text)

        return {
            'url': original_url,
            'resolved_url': url,
            'title': article.title,
            'authors': ', '.join(article.authors) if article.authors else 'Unknown',
            'publish_date': str(article.publish_date) if article.publish_date else None,
            'raw_content': raw_text,
            'cleaned_content': cleaned_text,
            'top_image': article.top_image,
            'keywords': ', '.join(article.keywords) if hasattr(article, 'keywords') and article.keywords else '',
            'raw_length': len(raw_text),
            'cleaned_length': len(cleaned_text),
            'extraction_status': 'Success'
        }

    except Exception as e:
        # `url` is whatever had been resolved when the failure happened
        return _failed_article_record(original_url, url, str(e))

In [5]:
def search_news(query=None, topic=None, max_results=10):
    """
    Fetch Google News entries and extract every article found.

    The feed is chosen as follows: a ``query`` runs a search (a ``topic``
    given alongside it is ignored with a warning); a ``topic`` alone loads
    that topic's headlines; with neither, the top stories are used. At most
    ``max_results`` entries are processed, with a one-second pause after each.

    Returns:
        A list with one ``extract_full_article`` record per entry, extended
        with ``search_query``, ``topic`` and ``search_mode`` keys.
    """
    client = GoogleNews(lang='en', country='US')

    # Pick the feed
    if query:
        if topic:
            print(f"⚠️  WARNING: PyGoogleNews cannot combine query + topic filters.")
            print(f"   Using QUERY ONLY: '{query}' (ignoring topic filter)\n")
            mode_label = f"Query: '{query}' (across all topics)"
        else:
            print(f"Searching for: '{query}' (across all topics)")
            mode_label = f"Query: '{query}'"
        feed = client.search(query)
    elif topic:
        print(f"Getting latest headlines for topic: {topic}")
        feed = client.topic_headlines(topic)
        mode_label = f"Topic: {topic}"
    else:
        print("Getting top news stories")
        feed = client.top_news()
        mode_label = "Top News"

    print(f"Max results: {max_results}\n")

    entries = feed.get('entries', [])[:max_results]
    total = len(entries)

    print(f"Found {total} articles. Extracting and cleaning content...\n")

    collected = []
    for position, entry in enumerate(entries, start=1):
        link = entry.link
        print(f"[{position}/{total}] Processing: {entry.title[:60]}...")

        record = extract_full_article(link)
        # Tag the record with how it was found
        record.update(
            search_query=query or 'N/A',
            topic=topic or 'N/A',
            search_mode=mode_label,
        )
        collected.append(record)

        if record['extraction_status'] != 'Success':
            print(f"   ✗ {record['extraction_status']}\n")
        else:
            raw_len = record['raw_length']
            clean_len = record['cleaned_length']
            print(f"   ✓ Raw: {raw_len:,} chars | Cleaned: {clean_len:,} chars")
            print(f"   Removed: {raw_len - clean_len:,} chars of noise\n")

        # Be polite to the remote servers
        time.sleep(1)

    return collected

## Search and Extract Articles

In [6]:
# Search parameters -- edit these two values before running the cell
query = "Parking occupancy monitoring software"  # Your search query
max_results = 5  # Number of articles

# Run the search; every hit is downloaded and cleaned along the way
articles = search_news(query=query, max_results=max_results)

# One row per article record
df = pd.DataFrame(articles)

# Summary banner
banner = '=' * 80
success_count = len(df[df['extraction_status'] == 'Success'])
print(f"\n{banner}")
print(f"✓ Successfully extracted {success_count} out of {len(df)} articles")
print(f"{banner}\n")

# Overview table (kept as the cell's last expression so it renders as a table)
overview_columns = ['title', 'authors', 'raw_length', 'cleaned_length', 'extraction_status']
df[overview_columns]

Searching for: 'Parking occupancy monitoring software' (across all topics)
Max results: 5

Found 5 articles. Extracting and cleaning content...

[1/5] Processing: A digital twin framework for urban parking management and mo...
   Decoding Google News URL...
   ✓ Decoded to: https://www.nature.com/articles/s41467-025-65306-w...
   Cleaning content...
   ✓ Raw: 77,191 chars | Cleaned: 74,278 chars
   Removed: 2,913 chars of noise

[2/5] Processing: Europe Parking Management Market Size, Share & Growth, 2033 ...
   Decoding Google News URL...
   ✓ Decoded to: https://www.marketdataforecast.com/market-reports/europe-parking-management-mark...
   Cleaning content...
   ✓ Raw: 20,735 chars | Cleaned: 20,464 chars
   Removed: 271 chars of noise

[3/5] Processing: 5 ways police departments are using RTCCs beyond crime fight...
   Decoding Google News URL...
   ✓ Decoded to: https://www.police1.com/real-time-crime-center/5-ways-police-departments-are-usi...
   Cleaning content...
   ✓ Raw: 9,25

Unnamed: 0,title,authors,raw_length,cleaned_length,extraction_status
0,A digital twin framework for urban parking man...,Unknown,77191,74278,Success
1,"Europe Parking Management Market Size, Share &...","Market Data Forecast, Market Data Forecast ltd",20735,20464,Success
2,5 ways police departments are using RTCCs beyo...,"Sarah Calams, Sarah Calams Sarah Calams, Sarah...",9255,9108,Success
3,Parking Reservation System Market Size,"Satyam Jaiswal, Preeti Wadhwani",32760,32613,Success
4,Parking Guidance Systems Announces Global Merg...,Felicia Perez,4012,3824,Success


## View Cleaned Article (Before & After)

In [7]:
# Compare raw vs cleaned content
article_index = 0  # Change this to view different articles

# Accept any index that list indexing accepts (negative values count from the
# end); anything else is reported as "not found" instead of raising IndexError.
if -len(articles) <= article_index < len(articles):
    article = articles[article_index]

    removed_chars = article['raw_length'] - article['cleaned_length']
    # Failed extractions have raw_length == 0; report 0% rather than dividing by zero
    noise_pct = removed_chars / article['raw_length'] * 100 if article['raw_length'] else 0.0

    print("="*80)
    print(f"TITLE: {article['title']}")
    print(f"AUTHOR(S): {article['authors']}")
    print(f"PUBLISHED: {article['publish_date']}")
    print(f"URL: {article['resolved_url']}")
    print("="*80)

    print(f"\nRAW CONTENT LENGTH: {article['raw_length']:,} characters")
    print(f"CLEANED CONTENT LENGTH: {article['cleaned_length']:,} characters")
    print(f"REMOVED: {removed_chars:,} characters ({noise_pct:.1f}% noise)")

    print("\n" + "="*80)
    print("BEFORE CLEANING (First 500 chars):")
    print("="*80)
    print(article['raw_content'][:500] + "...\n")

    print("="*80)
    print("AFTER CLEANING (Full cleaned article in paragraphs):")
    print("="*80)
    print(article['cleaned_content'])
    print("\n" + "="*80)
else:
    print(f"Article {article_index} not found")

TITLE: A digital twin framework for urban parking management and mobility forecasting
AUTHOR(S): Unknown
PUBLISHED: 2025-10-23 00:00:00
URL: https://www.nature.com/articles/s41467-025-65306-w

RAW CONTENT LENGTH: 77,191 characters
CLEANED CONTENT LENGTH: 74,278 characters
REMOVED: 2,913 characters (3.8% noise)

BEFORE CLEANING (First 500 chars):
Statistical outputs and distributions

This section presents the key findings from the three components of our DT framework. Statistical analysis has played a crucial role in understanding urban mobility dynamics, providing a detailed insight into parking resource usage and user behavior. By examining transaction distributions, parking occupancy rates, and recorded violations, recurring patterns and specific issues within the analyzed urban context were identified. See Fig. 2d, e for weekday vs....

AFTER CLEANING (Full cleaned article in paragraphs):
Statistical outputs and distributions This section presents the key findings from the three co

## Export Results

In [8]:
# Export with both raw and cleaned content.
# The query becomes part of the file name, so replace characters that are path
# separators or are not allowed in file names (\ / : * ? " < > | and control
# characters). Queries without such characters produce the same names as before.
safe_query = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', str(query))
export_stem = f'news_articles_with_cleaning_q={safe_query}'

csv_path = f'{export_stem}.csv'
df.to_csv(csv_path, index=False, encoding='utf-8')
# Report the file that was actually written
print(f"✓ Exported to {csv_path}")

# Export JSON
json_path = f'{export_stem}.json'
df.to_json(json_path, orient='records', indent=2, force_ascii=False)
print(f"✓ Exported to {json_path}")

# Export cleaned content only (for analysis)
df_clean = df[df['extraction_status'] == 'Success'][['title', 'authors', 'publish_date', 'cleaned_content', 'keywords']]
if len(df_clean) > 0:
    df_clean.to_csv('news_articles_cleaned_only.csv', index=False, encoding='utf-8')
    print(f"✓ Exported {len(df_clean)} cleaned articles to news_articles_cleaned_only.csv")

✓ Exported to news_articles_with_cleaning.csv
✓ Exported to news_articles_with_cleaning.json
✓ Exported 5 cleaned articles to news_articles_cleaned_only.csv


## Statistics

In [9]:
# Show cleaning statistics
success_df = df[df['extraction_status'] == 'Success']

print("="*80)
print("EXTRACTION & CLEANING STATISTICS")
print("="*80)
print(f"Total articles: {len(df)}")
print(f"Successful extractions: {len(success_df)}")
print(f"Failed extractions: {len(df) - len(success_df)}")

if len(success_df) > 0:
    print(f"\n--- RAW CONTENT ---")
    print(f"Average raw length: {success_df['raw_length'].mean():.0f} characters")
    print(f"Total raw content: {success_df['raw_length'].sum():,} characters")
    
    print(f"\n--- CLEANED CONTENT ---")
    print(f"Average cleaned length: {success_df['cleaned_length'].mean():.0f} characters")
    print(f"Total cleaned content: {success_df['cleaned_length'].sum():,} characters")
    
    print(f"\n--- CLEANING EFFICIENCY ---")
    total_removed = success_df['raw_length'].sum() - success_df['cleaned_length'].sum()
    avg_noise_pct = ((success_df['raw_length'] - success_df['cleaned_length']) / success_df['raw_length'] * 100).mean()
    print(f"Total noise removed: {total_removed:,} characters")
    print(f"Average noise percentage: {avg_noise_pct:.1f}%")
    print(f"Shortest cleaned article: {success_df['cleaned_length'].min():,} characters")
    print(f"Longest cleaned article: {success_df['cleaned_length'].max():,} characters")

print("="*80)

EXTRACTION & CLEANING STATISTICS
Total articles: 5
Successful extractions: 5
Failed extractions: 0

--- RAW CONTENT ---
Average raw length: 28791 characters
Total raw content: 143,953 characters

--- CLEANED CONTENT ---
Average cleaned length: 28057 characters
Total cleaned content: 140,287 characters

--- CLEANING EFFICIENCY ---
Total noise removed: 3,666 characters
Average noise percentage: 2.4%
Shortest cleaned article: 3,824 characters
Longest cleaned article: 74,278 characters
