# Web Scraping Module

This notebook contains utilities for scraping news articles using DuckDuckGo Search and newspaper4k.
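Scraped data is saved under `data/raw/` (resolved as `../data/raw` relative to the working directory): URLs and article metadata as Parquet files, and article texts as individual `.txt` files in `data/raw/articles/`.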

In [None]:
from ddgs import DDGS
from newspaper import Article
from typing import List, Optional
from pathlib import Path
import pandas as pd
import hashlib
import logging

# Configure the root logger to emit records at INFO level and above.
logging.basicConfig(level=logging.INFO)
# Module-level logger used by the scraping and saving helpers in this file.
logger = logging.getLogger(__name__)

# Data directories
# Raw scraped output is written here. The path is relative, so it resolves
# against the current working directory at run time.
RAW_DATA_DIR = Path('../data/raw')
# Create the directory (and any missing parents) up front; no error if it exists.
RAW_DATA_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
def search_news(query: str, count: int = 50) -> List[str]:
    """
    Search for news articles using DuckDuckGo.

    Each URL is returned at most once, in the order the search reported it,
    so later download/save steps do not process the same article twice.

    Args:
        query: Search query string
        count: Target number of articles to collect (passed to the search
            as ``max_results``; fewer URLs may come back)

    Returns:
        List of unique news article URLs. Empty if the search fails; the
        failure is logged instead of raised.
    """
    urls: List[str] = []
    try:
        with DDGS() as ddgs:
            results = ddgs.news(query, max_results=count, backend='auto')
            # dict.fromkeys drops repeated URLs while keeping first-seen order;
            # results without a usable 'url' value are skipped.
            urls = list(dict.fromkeys(r['url'] for r in results if r.get('url')))
        logger.info(f"Found {len(urls)} URLs for query: {query}")
    except Exception as e:
        # Best-effort: any search failure yields an empty list, not a crash.
        logger.error(f"Search failed: {e}")
    return urls

In [None]:
def download_article(url: str) -> Optional[dict]:
    """
    Fetch one article with newspaper4k and extract its title and body text.

    Args:
        url: URL of the article to download

    Returns:
        Dictionary with keys 'url', 'title' and 'text', or None if the
        download or parse step raised (the failure is logged as a warning)
    """
    try:
        page = Article(url)
        page.download()
        page.parse()
        # Attribute reads stay inside the try so any failure is handled below.
        record = {'url': url, 'title': page.title, 'text': page.text}
    except Exception as err:
        logger.warning(f"Failed to download article {url}: {err}")
        record = None
    return record

In [None]:
def download_articles(urls: List[str]) -> List[dict]:
    """
    Download every URL in turn and keep the articles that yielded text.

    Args:
        urls: List of article URLs

    Returns:
        List of article data dictionaries; failed downloads and articles
        with empty text are left out
    """
    # Lazy generator: URLs are still fetched one at a time, in order.
    fetched = (download_article(link) for link in urls)
    articles = [item for item in fetched if item and item.get('text')]

    logger.info(f"Successfully downloaded {len(articles)}/{len(urls)} articles")
    return articles

In [None]:
def save_urls_to_raw(urls: List[str], query: str) -> Path:
    """
    Write the scraped URLs to 'urls.parquet' inside RAW_DATA_DIR.

    The file has one row per URL, with the search query repeated in a
    'query' column.

    Args:
        urls: List of article URLs
        query: Search query used

    Returns:
        Path to saved file
    """
    target = RAW_DATA_DIR / 'urls.parquet'
    # The scalar query is broadcast to every row by the DataFrame constructor.
    frame = pd.DataFrame({'url': urls, 'query': query})
    frame.to_parquet(target, index=False)
    logger.info(f"Saved {len(urls)} URLs to {target}")
    return target

In [None]:
def save_articles_to_raw(articles: List[dict]) -> Path:
    """
    Save article texts to data/raw as individual txt files.
    Also saves metadata as parquet.

    Each text file is named after the first 12 hex characters of the MD5 of
    the article URL. If a URL appears more than once, the last entry is the
    one written and it gets a single metadata row. The metadata file
    ('articles_metadata.parquet') is rewritten on every call and always has
    the columns 'url', 'title' and 'filename', even when `articles` is empty.

    Args:
        articles: List of article dictionaries with 'url', 'title' and
            'text' keys

    Returns:
        Path to articles directory

    Raises:
        KeyError: If an article dictionary lacks 'url', 'title' or 'text'
    """
    articles_dir = RAW_DATA_DIR / 'articles'
    # parents=True so a missing RAW_DATA_DIR does not abort the save.
    articles_dir.mkdir(parents=True, exist_ok=True)

    # Collapse repeated URLs. Later entries replace earlier ones (matching the
    # file that would end up on disk anyway) while first-seen order is kept.
    unique_articles = {article['url']: article for article in articles}

    metadata = []
    for url, article in unique_articles.items():
        # Create unique filename from URL hash
        url_hash = hashlib.md5(url.encode()).hexdigest()[:12]
        filename = f"{url_hash}.txt"

        # Save text
        (articles_dir / filename).write_text(article['text'], encoding='utf-8')

        # Track metadata
        metadata.append({
            'url': url,
            'title': article['title'],
            'filename': filename,
        })

    # Save metadata; explicit columns keep the schema stable for empty input.
    meta_df = pd.DataFrame(metadata, columns=['url', 'title', 'filename'])
    meta_df.to_parquet(RAW_DATA_DIR / 'articles_metadata.parquet', index=False)

    logger.info(f"Saved {len(metadata)} articles to {articles_dir}")
    return articles_dir

## Example Usage

In [None]:
# Example: Search for news, download articles, and save to data/raw
# urls = search_news("artificial intelligence", count=20)
# save_urls_to_raw(urls, "artificial intelligence")
# articles = download_articles(urls)
# save_articles_to_raw(articles)
# print(f"Downloaded and saved {len(articles)} articles")