## 1.3 Retrieve Article Data

This notebook reads previously scraped article links and retrieves the text of each article.

In [16]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import json
import time
from urllib.parse import urljoin, urlparse
import logging
import random
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import urllib3
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException
import cloudscraper
import httpx
import asyncio
from concurrent.futures import ThreadPoolExecutor, as_completed

#### Suppress SSL warning and set up logging

In [17]:
# Silence urllib3's InsecureRequestWarning. The fetch helpers in this notebook
# pass verify=False, so without this the warning would presumably be emitted
# for every HTTPS request and drown out the progress log.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Route INFO-and-above messages through the root logger, formatted as
# "<timestamp> - <level> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

#### List extreme domains and proxy list

In [18]:
# Hosts that are always scraped with the full chain of fallback methods
# (see is_extreme_domain / scrape_article_extreme) instead of a single
# plain request.
EXTREME_DOMAINS = ['www.eetimes.com', 'www.edn.com']

# Optional proxy URLs. The list is empty and is not referenced by any of the
# visible cells, so it currently has no effect on scraping.
PROXY_LIST = [
    # Add working proxies here if available
    # 'http://proxy1:port',
    # 'http://proxy2:port',
]

#### Cloud scraper

In [19]:
def create_cloudscraper_session():
    """Build a cloudscraper session configured as desktop Chrome on Windows.

    Returns:
        The object returned by ``cloudscraper.create_scraper``, or ``None``
        when construction fails for any reason (the failure is logged, not
        raised).
    """
    browser_profile = {'browser': 'chrome', 'platform': 'windows', 'desktop': True}
    try:
        return cloudscraper.create_scraper(browser=browser_profile)
    except Exception as exc:
        logging.error(f"Failed to create cloudscraper: {exc}")
        return None

#### Selenium driver

In [20]:
def create_selenium_driver():
    """Start a headless Chrome WebDriver with automation hints suppressed.

    Returns:
        The Chrome driver, or ``None`` if it could not be created or
        configured. A browser that was started but failed during
        configuration is quit before returning, so no Chrome process is
        leaked.
    """
    driver = None
    try:
        chrome_options = Options()
        chrome_options.add_argument('--headless')  # Run in background
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--disable-blink-features=AutomationControlled')
        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
        chrome_options.add_experimental_option('useAutomationExtension', False)
        chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
        driver = webdriver.Chrome(options=chrome_options)
        # execute_script() would only patch the blank document that is open
        # right now, and the override would be gone after the first
        # driver.get(). Registering the script through CDP makes Chrome run
        # it at the start of every new document instead.
        driver.execute_cdp_cmd(
            "Page.addScriptToEvaluateOnNewDocument",
            {"source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"},
        )
        return driver
    except Exception as e:
        logging.error(f"Failed to create Selenium driver: {e}")
        if driver is not None:
            # The browser process is already running; shut it down so a
            # half-configured driver does not linger in the background.
            try:
                driver.quit()
            except Exception as quit_error:
                logging.warning(f"Failed to quit Selenium driver after setup error: {quit_error}")
        return None

#### Automatic retries

In [21]:
def create_session_with_retries():
    """Create a ``requests`` session whose HTTP(S) adapter retries failed calls.

    The retry policy allows 3 retries with a backoff factor of 1 for the
    status codes 429, 500, 502, 503 and 504. It is first built with an
    explicit ``allowed_methods`` list; if this ``Retry`` implementation
    rejects that keyword with ``TypeError``, it is rebuilt without it.

    Returns:
        The session, with one shared adapter mounted for both ``http://``
        and ``https://``.
    """
    http_session = requests.Session()

    shared_retry_kwargs = dict(
        total=3,
        status_forcelist=[429, 500, 502, 503, 504],
        backoff_factor=1,
    )
    try:
        retry_policy = Retry(allowed_methods=["HEAD", "GET", "OPTIONS"], **shared_retry_kwargs)
    except TypeError:
        retry_policy = Retry(**shared_retry_kwargs)

    pooled_adapter = HTTPAdapter(max_retries=retry_policy, pool_connections=10, pool_maxsize=20)
    for scheme_prefix in ("http://", "https://"):
        http_session.mount(scheme_prefix, pooled_adapter)
    return http_session

#### Extract article text 

In [22]:
def extract_article_text(html_content, url):
    """Pull the readable article text out of an HTML document.

    Extraction is attempted in three stages, stopping at the first one that
    yields non-blank text:

    1. the elements matched by the first CSS selector (from a fixed priority
       list) that matches anything at all;
    2. every ``<p>`` whose stripped text is longer than 30 characters;
    3. the full text of ``<body>``.

    Args:
        html_content: HTML markup to parse.
        url: Source URL of the page; accepted but not used by this function.

    Returns:
        The extracted text with surrounding whitespace removed (possibly an
        empty string).
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Drop markup that never carries article prose before extracting text.
    noise_tags = ["script", "style", "nav", "header", "footer", "aside", "form", "iframe", "noscript"]
    for unwanted in soup(noise_tags):
        unwanted.decompose()

    candidate_selectors = (
        'article',
        '[role="main"]',
        '.article-content',
        '.post-content',
        '.entry-content',
        '.content',
        '.main-content',
        '#content',
        '.article-body',
        '.story-body',
        '.post-body',
        '.article-text',
        '.body-content',
        '.article-wrapper',
    )

    # Only the first selector with any match is used; later selectors are
    # not consulted even if that match turns out to contain no text.
    matched_nodes = next(
        (found for found in map(soup.select, candidate_selectors) if found),
        None,
    )
    text_content = ""
    if matched_nodes:
        text_content = ' '.join(
            node.get_text(separator=' ', strip=True) for node in matched_nodes
        )

    # Fallback to paragraphs
    if not text_content.strip():
        long_paragraphs = []
        for paragraph in soup.find_all('p'):
            paragraph_text = paragraph.get_text(strip=True)
            if len(paragraph_text) > 30:
                long_paragraphs.append(paragraph_text)
        text_content = ' '.join(long_paragraphs)

    # Last resort - get body text
    if not text_content.strip():
        body = soup.find('body')
        if body:
            text_content = body.get_text(separator=' ', strip=True)

    return text_content.strip()

#### Scrape with requests (session with automatic retries)

In [23]:

def scrape_with_requests(url, session):
    """Fetch a page through a ``requests``-style session using browser-like headers.

    Args:
        url: Address of the page to download.
        session: Object exposing ``get(url, **kwargs)`` in the style of
            ``requests.Session``.

    Returns:
        The decoded response body (``response.text``).

    Raises:
        Whatever ``session.get`` or ``response.raise_for_status()`` raises,
        e.g. on connection failures or HTTP error statuses.
    """
    user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/119.0'
    ]
    # No explicit Accept-Encoding: advertising "br" by hand makes servers
    # send Brotli even when no Brotli decoder is installed, leaving
    # response.text undecoded. The session's own default only lists
    # encodings it can actually decode.
    headers = {
        'User-Agent': random.choice(user_agents),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Cache-Control': 'no-cache',
        'Pragma': 'no-cache'
    }
    # NOTE(review): verify=False disables TLS certificate checks; kept to
    # preserve existing behaviour, but it should be revisited.
    # timeout=(15, 45) is (connect, read) in seconds.
    response = session.get(url, headers=headers, timeout=(15, 45), verify=False, allow_redirects=True)
    response.raise_for_status()
    return response.text

#### Scrape with cloudscraper and async httpx

In [24]:
def scrape_with_cloudscraper(url, scraper):
    """Download a page through a cloudscraper session.

    Args:
        url: Address of the page to download.
        scraper: Session-like object with ``get(url, timeout=...)``, or
            ``None`` when no scraper could be created.

    Returns:
        The response body as text.

    Raises:
        Exception: If ``scraper`` is ``None``; also propagates whatever
            ``scraper.get`` or ``raise_for_status()`` raises.
    """
    if scraper is not None:
        page = scraper.get(url, timeout=45)
        page.raise_for_status()
        return page.text
    raise Exception("CloudScraper not available")
async def scrape_with_httpx(url):
    """Asynchronously download a page with httpx and return its body text.

    A fresh ``httpx.AsyncClient`` (45 s timeout, ``verify=False``) is opened
    for the single request, which follows redirects and sends a fixed Chrome
    User-Agent.

    Raises:
        Whatever the client or ``raise_for_status()`` raises.
    """
    client_settings = {'timeout': 45.0, 'verify': False}
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }
    async with httpx.AsyncClient(**client_settings) as http_client:
        reply = await http_client.get(url, headers=request_headers, follow_redirects=True)
        reply.raise_for_status()
        return reply.text

#### Scrape with selenium driver

In [25]:
def scrape_with_selenium(url, driver):
    """Render a page in a Selenium-driven browser and return its HTML.

    After navigating, the function waits up to 10 s for an ``<article>``
    element; if that times out it waits up to 5 s for a ``<p>`` element,
    and if that also times out it carries on regardless. It then scrolls to
    the middle of the page, pauses 2 s, and reads the page source.

    Args:
        url: Address of the page to load.
        driver: Selenium WebDriver, or ``None`` when no driver is available.

    Returns:
        ``driver.page_source`` after the waits and scroll.

    Raises:
        Exception: If ``driver`` is ``None``; driver errors other than the
            handled wait timeouts propagate.
    """
    if driver is None:
        raise Exception("Selenium driver not available")

    driver.set_page_load_timeout(60)
    driver.get(url)

    # Try each (tag, timeout) in order and stop at the first that appears.
    for tag_name, max_wait in (("article", 10), ("p", 5)):
        locator = (By.TAG_NAME, tag_name)
        try:
            WebDriverWait(driver, max_wait).until(EC.presence_of_element_located(locator))
        except TimeoutException:
            continue  # Fall through to the next tag, or carry on anyway
        break

    driver.execute_script("window.scrollTo(0, document.body.scrollHeight/2);")
    time.sleep(2)
    return driver.page_source

#### Scrape with httpx sync

In [26]:
def scrape_with_httpx_sync(url):
    """Download a page with a blocking httpx client and return its body text.

    A fresh ``httpx.Client`` (45 s timeout, ``verify=False``) is opened for
    the single request, which follows redirects and sends a fixed Chrome
    User-Agent.

    Raises:
        Whatever the client or ``raise_for_status()`` raises.
    """
    chrome_user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    request_headers = {'User-Agent': chrome_user_agent}
    with httpx.Client(timeout=45.0, verify=False) as http_client:
        reply = http_client.get(url, headers=request_headers, follow_redirects=True)
        reply.raise_for_status()
        return reply.text

#### Extreme method to scrape an article (Uses 4 different methods)

In [27]:
def scrape_article_extreme(url, session=None, scraper=None, driver=None):
    """Try every available fetch method in turn until one yields article text.

    Methods are attempted in this order, skipping those whose tool was not
    supplied: Requests (needs ``session``), CloudScraper (needs
    ``scraper``), HTTPX (always), Selenium (needs ``driver``). A method is
    accepted when it returns more than 1000 characters of HTML from which
    more than 100 characters of text can be extracted.

    Args:
        url: Address of the article.
        session: Optional requests-style session.
        scraper: Optional cloudscraper session.
        driver: Optional Selenium WebDriver.

    Returns:
        The extracted article text from the first method that succeeds, or
        ``None`` if every method fails. Failures are logged as warnings and
        never raised.
    """
    methods = []
    if session:
        methods.append(("Requests", lambda: scrape_with_requests(url, session)))

    if scraper:
        methods.append(("CloudScraper", lambda: scrape_with_cloudscraper(url, scraper)))
    methods.append(("HTTPX", lambda: scrape_with_httpx_sync(url)))

    if driver:
        methods.append(("Selenium", lambda: scrape_with_selenium(url, driver)))

    for method_name, method_func in methods:
        try:
            html_content = method_func()
            if html_content and len(html_content) > 1000:
                article_text = extract_article_text(html_content, url)
                if len(article_text) > 100:
                    return article_text
                else:
                    logging.warning(f"{method_name} got content but extraction failed")
            else:
                logging.warning(f"{method_name} returned insufficient content")

        except Exception as e:
            # Record why this method failed instead of discarding the error,
            # then move on to the next method.
            logging.warning(f"{method_name} failed for {url}: {e}")
            continue
    return None

def is_extreme_domain(url):
    """Report whether a URL's host is one of the ``EXTREME_DOMAINS``.

    The comparison is made on the parsed hostname (case-insensitively, also
    accepting subdomains of a listed domain) rather than on the raw URL
    string, so a listed domain appearing only in the path or query string
    does not count as a match.

    Args:
        url: URL to classify; a missing scheme is tolerated.

    Returns:
        ``True`` if the host matches a listed domain, otherwise ``False``.
        Non-string, blank, or unparseable input yields ``False``.
    """
    if not isinstance(url, str):
        # e.g. NaN produced by pandas for an empty CSV cell
        return False
    candidate = url.strip()
    if not candidate:
        return False
    if "://" not in candidate:
        # urlparse only recognises a host after "//"; add it for bare
        # "www.example.com/path" style input.
        candidate = "//" + candidate.lstrip("/")
    try:
        host = urlparse(candidate).hostname
    except ValueError:
        return False
    if not host:
        return False
    host = host.lower()
    return any(
        host == domain.lower() or host.endswith("." + domain.lower())
        for domain in EXTREME_DOMAINS
    )

#### Scrape a single article, falling back to extreme methods

In [28]:
def scrape_article(url, session=None, scraper=None, driver=None):
    """Return the text of one article, or a "Content unavailable" message.

    URLs on an extreme domain go straight to ``scrape_article_extreme``.
    All other URLs are first fetched with ``scrape_with_requests``; if that
    raises, or yields 100 characters of text or fewer,
    ``scrape_article_extreme`` is used as a fallback.

    Args:
        url: Address of the article.
        session: Optional requests-style session; one is created when
            omitted and the URL is not on an extreme domain.
        scraper: Optional cloudscraper session passed to the fallback.
        driver: Optional Selenium WebDriver passed to the fallback.

    Returns:
        The article text, or a string starting with "Content unavailable"
        describing why nothing could be retrieved.
    """
    if is_extreme_domain(url):
        extreme_text = scrape_article_extreme(url, session, scraper, driver)
        if not extreme_text:
            logging.error(f"All extreme methods failed for {url}")
            return "Content unavailable - all methods failed"
        return extreme_text

    # Ordinary domains: one plain request first, heavy methods only if needed.
    try:
        if session is None:
            session = create_session_with_retries()

        page_html = scrape_with_requests(url, session)
        body_text = extract_article_text(page_html, url)
        if len(body_text) > 100:
            return body_text

        fallback_text = scrape_article_extreme(url, session, scraper, driver)
        return fallback_text or "Content unavailable - extraction failed"
    except Exception as err:
        fallback_text = scrape_article_extreme(url, session, scraper, driver)
        return fallback_text or f"Content unavailable - {str(err)}"

#### Collect and organize article data

In [29]:

def collect_article_data(csv_path):
    """Scrape the text of every article listed in a CSV of links.

    Args:
        csv_path: Path to a CSV containing the columns 'title', 'url',
            'date' and 'source'.

    Returns:
        A list with exactly one dict per CSV row (keys 'title', 'source',
        'date', 'link', 'text'), or ``None`` if the CSV cannot be read or a
        required column is missing. When a row cannot be scraped, its
        'text' holds a "Content unavailable ..." or "Processing error: ..."
        message instead of article text.
    """
    # Read the CSV file
    try:
        df = pd.read_csv(csv_path)
        logging.info(f"Loaded {len(df)} articles from CSV")
    except Exception as e:
        logging.error(f"Error reading CSV: {str(e)}")
        return None

    # Validate required columns
    required_columns = ['title', 'url', 'date', 'source']
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        logging.error(f"Missing required columns: {missing_columns}")
        return None

    # Initialize all scraping tools
    session = create_session_with_retries()
    scraper = create_cloudscraper_session()
    driver = create_selenium_driver()

    total = len(df)
    articles_data = []
    successful_scrapes = 0
    failed_scrapes = 0
    try:
        # Count rows by position so the progress numbers are right even if
        # the frame's index is not 0..n-1.
        for position, (_, row) in enumerate(df.iterrows(), start=1):
            # str() keeps a missing (NaN) or non-string title from aborting
            # the whole run before the row's own error handling starts.
            logging.info(f"Processing article {position}/{total}: {str(row['title'])[:50]}...")

            try:
                article_text = scrape_article(row['url'], session, scraper, driver)
                scraped_ok = len(article_text) > 100 and not article_text.startswith("Content unavailable")
                raised = False
            except Exception as e:
                article_text = f"Processing error: {str(e)}"
                scraped_ok = False
                raised = True

            # Exactly one record per row, whether scraping worked or not.
            articles_data.append({
                'title': row['title'],
                'source': row['source'],
                'date': row['date'],
                'link': row['url'],
                'text': article_text,
            })

            if scraped_ok:
                successful_scrapes += 1
            else:
                failed_scrapes += 1

            # Progress update
            if position % 5 == 0:
                logging.info(f"Progress: {position}/{total} - Success: {successful_scrapes}, Failed: {failed_scrapes}")

            # Respectful delay (skipped when the row raised, as before)
            if not raised:
                if is_extreme_domain(row['url']):
                    time.sleep(random.uniform(2, 4))  # Longer for extreme domains
                else:
                    time.sleep(random.uniform(0.5, 1.5))
    finally:
        # Release the browser and both HTTP sessions. A cleanup failure is
        # logged but must not hide the scraping result or an earlier error.
        cleanup_steps = (
            ("Selenium driver", driver, "quit"),
            ("requests session", session, "close"),
            ("cloudscraper session", scraper, "close"),
        )
        for label, resource, method_name in cleanup_steps:
            if resource is None:
                continue
            try:
                getattr(resource, method_name)()
            except Exception as cleanup_error:
                logging.warning(f"Failed to release {label}: {cleanup_error}")

    # Final summary
    logging.info(f"Total articles processed: {len(articles_data)}")
    logging.info(f"Success: {successful_scrapes}, Failed: {failed_scrapes}")

    return articles_data

### Run and Save raw article data

In [31]:
# Input: CSV of article links; output: JSON with the scraped article text.
LINKS_CSV_PATH = './intermediate_data/01-Collect/Scraped_Products_Article_Links.csv'
ARTICLE_JSON_PATH = './intermediate_data/01-Collect/Scraped_Products_Article_Data.json'

collected_data = collect_article_data(LINKS_CSV_PATH)

# collect_article_data returns None when the CSV is unreadable or lacks a
# required column. Stop here rather than overwrite the output with "null"
# and report a success that did not happen.
if collected_data is None:
    raise RuntimeError(f"No article data collected from {LINKS_CSV_PATH}; nothing was saved")

with open(ARTICLE_JSON_PATH, 'w', encoding='utf-8') as f:
    json.dump(collected_data, f, indent=2, ensure_ascii=False)
print("Data saved successfully")
   

2025-06-13 12:11:31,877 - INFO - Loaded 30 articles from CSV
2025-06-13 12:11:34,841 - INFO - Processing article 1/30: EnCharge Picks The PC For Its First Analog AI Chip...
2025-06-13 12:11:39,855 - INFO - Processing article 2/30: IMS2025: Cross-correlation spectrum analyser from ...
2025-06-13 12:11:41,270 - INFO - Processing article 3/30: Dragonwing modules support varied OS...
2025-06-13 12:11:42,293 - INFO - Processing article 4/30: Tiger Lake-H Xeon-W SOSA single-board computer...
2025-06-13 12:11:43,052 - INFO - Processing article 5/30: The 90nm Leakage Issue...
2025-06-13 12:11:43,523 - INFO - Progress: 5/30 - Success: 5, Failed: 0
2025-06-13 12:11:44,472 - INFO - Processing article 6/30: AMS Technologies to distribute Singular Photonics’...
2025-06-13 12:11:45,670 - INFO - Processing article 7/30: Most Read – Qualcomm, Big Beautiful Bill, Semi sal...
2025-06-13 12:11:47,022 - INFO - Processing article 8/30: Rugged circular connectors are latching, threaded ...
2025-06-13 12:11:

Data saved successfully
