<a href="https://colab.research.google.com/github/ahteshamsalamatansari/colabcodes/blob/main/kff.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
#!/usr/bin/env python3
"""
KFF Health News Article Scraper
High-performance scraper designed to extract clean article content from KFF Health News URLs
Optimized for processing 10,000+ URLs in ~40 minutes with full transparency
"""

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
from urllib.parse import urlparse
import re
import json
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

class KFFScraper:
    def __init__(self, max_workers=20, timeout=15):
        """
        Initialize the scraper with optimized settings
        max_workers: Number of concurrent threads (20 for ~40min target)
        timeout: Request timeout in seconds
        """
        self.max_workers = max_workers
        self.timeout = timeout
        self.session = self._create_session()
        self.results = []
        self.failed_urls = []
        self.lock = threading.Lock()

        # Progress tracking
        self.processed = 0
        self.total_urls = 0
        self.start_time = None

        # Unwanted text patterns to filter out
        self.unwanted_patterns = [
            r'Click to share on.*?(?=Click|$)',
            r'@\w+',
            r'Related Topics.*',
            r'Contact Us.*',
            r'Submit a Story Tip.*',
            r'Republish This Story.*',
            r'\(.*?/KHN\)',  # Author attribution like (Hannah Norman/KHN)
            r'Share This Story:.*',
            r'We want to hear from you:.*',
        ]

        print("🚀 KFF Health News Scraper Initialized")
        print(f"⚙️ Settings: {max_workers} workers, {timeout}s timeout")
        print("="*60)

    def _create_session(self):
        """Create optimized requests session"""
        session = requests.Session()
        session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        })
        return session

    def clean_text(self, text):
        """Clean extracted text from unwanted patterns"""
        if not text:
            return ""

        # Remove unwanted patterns
        for pattern in self.unwanted_patterns:
            text = re.sub(pattern, '', text, flags=re.IGNORECASE | re.DOTALL)

        # Clean up whitespace
        text = re.sub(r'\s+', ' ', text)  # Multiple spaces to single
        text = re.sub(r'\n\s*\n', '\n', text)  # Multiple newlines to single
        text = text.strip()

        return text

    def extract_article_content(self, soup):
        """Extract clean article content focusing on main text"""
        content_parts = []

        # Find the main article content container
        article_body = soup.find('div', class_='article-body')
        if not article_body:
            # Fallback to entry-content
            article_body = soup.find('div', class_='entry-content')

        if article_body:
            # Remove unwanted elements
            unwanted_selectors = [
                'aside',  # Sidebar content
                'figure',  # Images and captions
                'figcaption',  # Image captions
                '.wp-block',  # WordPress blocks (newsletters, etc.)
                '.sharedaddy',  # Sharing buttons
                '.sd-sharing',  # Social sharing
                '.meta-authors',  # Author info
                '.category-tag-list',  # Related topics
                '.share',  # Share buttons
                '.newsletter',  # Newsletter signup
                '.partner',  # Partner content
                'form',  # Forms
                'button',  # Buttons
                '.screen-reader-text'  # Hidden text
            ]

            for selector in unwanted_selectors:
                for element in article_body.select(selector):
                    element.decompose()

            # Extract text from remaining paragraphs
            paragraphs = article_body.find_all(['p', 'div'], recursive=True)

            for p in paragraphs:
                # Skip if paragraph contains unwanted classes
                if p.get('class') and any(cls in str(p.get('class')) for cls in
                                        ['share', 'meta', 'social', 'republish', 'feedback']):
                    continue

                text = p.get_text(strip=True)
                if text and len(text) > 30:  # Only meaningful content
                    # Additional filtering for sharing text
                    if not any(phrase in text.lower() for phrase in
                              ['click to share', 'opens in new window', 'related topics',
                               'contact us', 'submit a story', '@', 'republish']):
                        content_parts.append(text)

        return ' '.join(content_parts)

    def scrape_single_url(self, url):
        """Scrape a single URL and extract title + content"""
        try:
            # Make request
            response = self.session.get(url, timeout=self.timeout)
            response.raise_for_status()

            # Parse HTML
            soup = BeautifulSoup(response.content, 'html.parser')

            # Extract title
            title = ""
            title_element = soup.find('h1', class_='entry-title')
            if title_element:
                title = title_element.get_text(strip=True)
            else:
                # Fallback to page title
                title_tag = soup.find('title')
                if title_tag:
                    title = title_tag.get_text(strip=True)

            # Extract content
            content = self.extract_article_content(soup)
            content = self.clean_text(content)

            return {
                'url': url,
                'title': title,
                'content': content,
                'status': 'success',
                'content_length': len(content)
            }

        except requests.exceptions.RequestException as e:
            return {
                'url': url,
                'title': '',
                'content': '',
                'status': f'request_error: {str(e)}',
                'content_length': 0
            }
        except Exception as e:
            return {
                'url': url,
                'title': '',
                'content': '',
                'status': f'parsing_error: {str(e)}',
                'content_length': 0
            }

    def update_progress(self, result):
        """Update progress tracking with thread safety"""
        with self.lock:
            self.processed += 1

            if result['status'] == 'success':
                self.results.append(result)
            else:
                self.failed_urls.append(result)

            # Calculate stats
            elapsed = time.time() - self.start_time
            progress = (self.processed / self.total_urls) * 100
            avg_time_per_url = elapsed / self.processed
            estimated_remaining = (self.total_urls - self.processed) * avg_time_per_url

            # Print progress every 50 URLs
            if self.processed % 50 == 0 or self.processed == self.total_urls:
                print(f"📊 Progress: {self.processed}/{self.total_urls} ({progress:.1f}%)")
                print(f"⏱️  Elapsed: {elapsed/60:.1f}min | ETA: {estimated_remaining/60:.1f}min")
                print(f"✅ Success: {len(self.results)} | ❌ Failed: {len(self.failed_urls)}")
                if len(self.results) > 0:
                    avg_content_length = sum(r['content_length'] for r in self.results) / len(self.results)
                    print(f"📝 Avg content length: {avg_content_length:.0f} chars")
                print("-" * 60)

    def scrape_urls(self, urls):
        """Main scraping function with concurrent processing"""
        self.total_urls = len(urls)
        self.start_time = time.time()

        print(f"🎯 Starting to scrape {self.total_urls} URLs")
        print(f"🔧 Using {self.max_workers} concurrent workers")
        print(f"🎯 Target: Complete in ~40 minutes")
        print("=" * 60)

        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            # Submit all tasks
            future_to_url = {
                executor.submit(self.scrape_single_url, url): url
                for url in urls
            }

            # Process completed tasks
            for future in as_completed(future_to_url):
                result = future.result()
                self.update_progress(result)

        # Final summary
        total_time = time.time() - self.start_time
        print("\n" + "=" * 60)
        print("🎉 SCRAPING COMPLETED!")
        print(f"⏱️  Total time: {total_time/60:.1f} minutes")
        print(f"✅ Successfully scraped: {len(self.results)}")
        print(f"❌ Failed: {len(self.failed_urls)}")
        print(f"📈 Success rate: {(len(self.results)/self.total_urls)*100:.1f}%")

        if len(self.results) > 0:
            avg_content_length = sum(r['content_length'] for r in self.results) / len(self.results)
            total_content = sum(r['content_length'] for r in self.results)
            print(f"📝 Total content extracted: {total_content:,} characters")
            print(f"📊 Average content per article: {avg_content_length:.0f} characters")
        print("=" * 60)

    def save_results(self, output_filename='kff_articles.csv', failed_filename='failed_urls.csv'):
        """Save results to CSV files"""
        if self.results:
            df = pd.DataFrame(self.results)
            df.to_csv(output_filename, index=False, encoding='utf-8')
            print(f"💾 Successful results saved to: {output_filename}")

        if self.failed_urls:
            df_failed = pd.DataFrame(self.failed_urls)
            df_failed.to_csv(failed_filename, index=False, encoding='utf-8')
            print(f"💾 Failed URLs saved to: {failed_filename}")

def load_urls_from_file(filename):
    """Load URLs from a text file"""
    urls = []
    try:
        with open(filename, 'r', encoding='utf-8') as file:
            for line in file:
                url = line.strip()
                if url and url.startswith('http'):
                    urls.append(url)
        print(f"📁 Loaded {len(urls)} URLs from {filename}")
        return urls
    except FileNotFoundError:
        print(f"❌ File {filename} not found!")
        return []

# Example usage and main execution
if __name__ == "__main__":
    print("🌟 KFF Health News Article Scraper")
    print("=" * 60)

    # Option 1: Load URLs from file
    urls_file = input("📁 Enter the path to your URLs file (or press Enter for test mode): ").strip()

    if urls_file:
        urls = load_urls_from_file(urls_file)
        if not urls:
            print("❌ No valid URLs found. Exiting.")
            exit(1)
    else:
        # Test mode with sample URLs
        print("🧪 Test mode: Using sample URLs")
        urls = [
            "https://kffhealthnews.org/news/article/artificial-intelligence-algorithms-software-health-care/",
            # Add more test URLs here if needed
        ]

    # Initialize scraper (optimized for 10K URLs in 40 minutes)
    scraper = KFFScraper(max_workers=20, timeout=15)

    # Start scraping
    scraper.scrape_urls(urls)

    # Save results
    output_file = input("💾 Enter output filename (default: kff_articles.csv): ").strip()
    if not output_file:
        output_file = "kff_articles.csv"

    scraper.save_results(output_file)

    print("✨ Scraping completed! Check your output files.")

🌟 KFF Health News Article Scraper
📁 Enter the path to your URLs file (or press Enter for test mode): /content/urls.txt.txt
📁 Loaded 4057 URLs from /content/urls.txt.txt
🚀 KFF Health News Scraper Initialized
⚙️ Settings: 20 workers, 15s timeout
🎯 Starting to scrape 4057 URLs
🔧 Using 20 concurrent workers
🎯 Target: Complete in ~40 minutes
📊 Progress: 50/4057 (1.2%)
⏱️  Elapsed: 0.1min | ETA: 4.5min
✅ Success: 50 | ❌ Failed: 0
📝 Avg content length: 8333 chars
------------------------------------------------------------




📊 Progress: 100/4057 (2.5%)
⏱️  Elapsed: 0.1min | ETA: 5.1min
✅ Success: 100 | ❌ Failed: 0
📝 Avg content length: 7955 chars
------------------------------------------------------------
📊 Progress: 150/4057 (3.7%)
⏱️  Elapsed: 0.2min | ETA: 4.7min
✅ Success: 150 | ❌ Failed: 0
📝 Avg content length: 8114 chars
------------------------------------------------------------
📊 Progress: 200/4057 (4.9%)
⏱️  Elapsed: 0.2min | ETA: 4.4min
✅ Success: 200 | ❌ Failed: 0
📝 Avg content length: 8210 chars
------------------------------------------------------------
📊 Progress: 250/4057 (6.2%)
⏱️  Elapsed: 0.3min | ETA: 4.2min
✅ Success: 250 | ❌ Failed: 0
📝 Avg content length: 8193 chars
------------------------------------------------------------




📊 Progress: 300/4057 (7.4%)
⏱️  Elapsed: 0.3min | ETA: 4.3min
✅ Success: 300 | ❌ Failed: 0
📝 Avg content length: 8109 chars
------------------------------------------------------------
📊 Progress: 350/4057 (8.6%)
⏱️  Elapsed: 0.4min | ETA: 4.2min
✅ Success: 350 | ❌ Failed: 0
📝 Avg content length: 8289 chars
------------------------------------------------------------
📊 Progress: 400/4057 (9.9%)
⏱️  Elapsed: 0.4min | ETA: 4.1min
✅ Success: 400 | ❌ Failed: 0
📝 Avg content length: 8103 chars
------------------------------------------------------------




📊 Progress: 450/4057 (11.1%)
⏱️  Elapsed: 0.5min | ETA: 4.0min
✅ Success: 450 | ❌ Failed: 0
📝 Avg content length: 7942 chars
------------------------------------------------------------




📊 Progress: 500/4057 (12.3%)
⏱️  Elapsed: 0.6min | ETA: 4.0min
✅ Success: 500 | ❌ Failed: 0
📝 Avg content length: 7883 chars
------------------------------------------------------------
📊 Progress: 550/4057 (13.6%)
⏱️  Elapsed: 0.6min | ETA: 3.9min
✅ Success: 550 | ❌ Failed: 0
📝 Avg content length: 7922 chars
------------------------------------------------------------
📊 Progress: 600/4057 (14.8%)
⏱️  Elapsed: 0.7min | ETA: 3.8min
✅ Success: 600 | ❌ Failed: 0
📝 Avg content length: 7894 chars
------------------------------------------------------------
📊 Progress: 650/4057 (16.0%)
⏱️  Elapsed: 0.7min | ETA: 3.7min
✅ Success: 650 | ❌ Failed: 0
📝 Avg content length: 7954 chars
------------------------------------------------------------




📊 Progress: 700/4057 (17.3%)
⏱️  Elapsed: 0.8min | ETA: 3.7min
✅ Success: 700 | ❌ Failed: 0
📝 Avg content length: 7905 chars
------------------------------------------------------------
📊 Progress: 750/4057 (18.5%)
⏱️  Elapsed: 0.8min | ETA: 3.7min
✅ Success: 750 | ❌ Failed: 0
📝 Avg content length: 7903 chars
------------------------------------------------------------
📊 Progress: 800/4057 (19.7%)
⏱️  Elapsed: 0.9min | ETA: 3.6min
✅ Success: 800 | ❌ Failed: 0
📝 Avg content length: 7899 chars
------------------------------------------------------------
📊 Progress: 850/4057 (21.0%)
⏱️  Elapsed: 0.9min | ETA: 3.5min
✅ Success: 850 | ❌ Failed: 0
📝 Avg content length: 7910 chars
------------------------------------------------------------




📊 Progress: 900/4057 (22.2%)
⏱️  Elapsed: 1.0min | ETA: 3.5min
✅ Success: 900 | ❌ Failed: 0
📝 Avg content length: 7966 chars
------------------------------------------------------------
📊 Progress: 950/4057 (23.4%)
⏱️  Elapsed: 1.1min | ETA: 3.4min
✅ Success: 950 | ❌ Failed: 0
📝 Avg content length: 7941 chars
------------------------------------------------------------




📊 Progress: 1000/4057 (24.6%)
⏱️  Elapsed: 1.1min | ETA: 3.4min
✅ Success: 1000 | ❌ Failed: 0
📝 Avg content length: 7927 chars
------------------------------------------------------------
📊 Progress: 1050/4057 (25.9%)
⏱️  Elapsed: 1.1min | ETA: 3.3min
✅ Success: 1050 | ❌ Failed: 0
📝 Avg content length: 7895 chars
------------------------------------------------------------




📊 Progress: 1100/4057 (27.1%)
⏱️  Elapsed: 1.2min | ETA: 3.3min
✅ Success: 1100 | ❌ Failed: 0
📝 Avg content length: 7903 chars
------------------------------------------------------------
📊 Progress: 1150/4057 (28.3%)
⏱️  Elapsed: 1.3min | ETA: 3.2min
✅ Success: 1150 | ❌ Failed: 0
📝 Avg content length: 7860 chars
------------------------------------------------------------
📊 Progress: 1200/4057 (29.6%)
⏱️  Elapsed: 1.3min | ETA: 3.1min
✅ Success: 1200 | ❌ Failed: 0
📝 Avg content length: 7880 chars
------------------------------------------------------------
📊 Progress: 1250/4057 (30.8%)
⏱️  Elapsed: 1.4min | ETA: 3.1min
✅ Success: 1250 | ❌ Failed: 0
📝 Avg content length: 7855 chars
------------------------------------------------------------




📊 Progress: 1300/4057 (32.0%)
⏱️  Elapsed: 1.4min | ETA: 3.0min
✅ Success: 1300 | ❌ Failed: 0
📝 Avg content length: 7917 chars
------------------------------------------------------------
📊 Progress: 1350/4057 (33.3%)
⏱️  Elapsed: 1.5min | ETA: 3.0min
✅ Success: 1350 | ❌ Failed: 0
📝 Avg content length: 7908 chars
------------------------------------------------------------
📊 Progress: 1400/4057 (34.5%)
⏱️  Elapsed: 1.5min | ETA: 2.9min
✅ Success: 1400 | ❌ Failed: 0
📝 Avg content length: 7919 chars
------------------------------------------------------------
📊 Progress: 1450/4057 (35.7%)
⏱️  Elapsed: 1.6min | ETA: 2.8min
✅ Success: 1450 | ❌ Failed: 0
📝 Avg content length: 7917 chars
------------------------------------------------------------




📊 Progress: 1500/4057 (37.0%)
⏱️  Elapsed: 1.6min | ETA: 2.8min
✅ Success: 1500 | ❌ Failed: 0
📝 Avg content length: 7931 chars
------------------------------------------------------------




📊 Progress: 1550/4057 (38.2%)
⏱️  Elapsed: 1.7min | ETA: 2.7min
✅ Success: 1550 | ❌ Failed: 0
📝 Avg content length: 7954 chars
------------------------------------------------------------
📊 Progress: 1600/4057 (39.4%)
⏱️  Elapsed: 1.7min | ETA: 2.7min
✅ Success: 1600 | ❌ Failed: 0
📝 Avg content length: 7947 chars
------------------------------------------------------------
📊 Progress: 1650/4057 (40.7%)
⏱️  Elapsed: 1.8min | ETA: 2.6min
✅ Success: 1650 | ❌ Failed: 0
📝 Avg content length: 7939 chars
------------------------------------------------------------
📊 Progress: 1700/4057 (41.9%)
⏱️  Elapsed: 1.8min | ETA: 2.6min
✅ Success: 1700 | ❌ Failed: 0
📝 Avg content length: 7959 chars
------------------------------------------------------------




📊 Progress: 1750/4057 (43.1%)
⏱️  Elapsed: 1.9min | ETA: 2.5min
✅ Success: 1750 | ❌ Failed: 0
📝 Avg content length: 7990 chars
------------------------------------------------------------
📊 Progress: 1800/4057 (44.4%)
⏱️  Elapsed: 2.0min | ETA: 2.5min
✅ Success: 1800 | ❌ Failed: 0
📝 Avg content length: 7986 chars
------------------------------------------------------------
📊 Progress: 1850/4057 (45.6%)
⏱️  Elapsed: 2.0min | ETA: 2.4min
✅ Success: 1850 | ❌ Failed: 0
📝 Avg content length: 7990 chars
------------------------------------------------------------
📊 Progress: 1900/4057 (46.8%)
⏱️  Elapsed: 2.1min | ETA: 2.3min
✅ Success: 1900 | ❌ Failed: 0
📝 Avg content length: 7993 chars
------------------------------------------------------------




📊 Progress: 1950/4057 (48.1%)
⏱️  Elapsed: 2.1min | ETA: 2.3min
✅ Success: 1950 | ❌ Failed: 0
📝 Avg content length: 7991 chars
------------------------------------------------------------
📊 Progress: 2000/4057 (49.3%)
⏱️  Elapsed: 2.2min | ETA: 2.2min
✅ Success: 2000 | ❌ Failed: 0
📝 Avg content length: 8002 chars
------------------------------------------------------------




📊 Progress: 2050/4057 (50.5%)
⏱️  Elapsed: 2.2min | ETA: 2.2min
✅ Success: 2050 | ❌ Failed: 0
📝 Avg content length: 7989 chars
------------------------------------------------------------
📊 Progress: 2100/4057 (51.8%)
⏱️  Elapsed: 2.3min | ETA: 2.1min
✅ Success: 2100 | ❌ Failed: 0
📝 Avg content length: 7998 chars
------------------------------------------------------------




📊 Progress: 2150/4057 (53.0%)
⏱️  Elapsed: 2.4min | ETA: 2.1min
✅ Success: 2150 | ❌ Failed: 0
📝 Avg content length: 7990 chars
------------------------------------------------------------
📊 Progress: 2200/4057 (54.2%)
⏱️  Elapsed: 2.4min | ETA: 2.0min
✅ Success: 2200 | ❌ Failed: 0
📝 Avg content length: 7971 chars
------------------------------------------------------------
📊 Progress: 2250/4057 (55.5%)
⏱️  Elapsed: 2.4min | ETA: 2.0min
✅ Success: 2250 | ❌ Failed: 0
📝 Avg content length: 7952 chars
------------------------------------------------------------




📊 Progress: 2300/4057 (56.7%)
⏱️  Elapsed: 2.5min | ETA: 1.9min
✅ Success: 2300 | ❌ Failed: 0
📝 Avg content length: 7945 chars
------------------------------------------------------------




📊 Progress: 2350/4057 (57.9%)
⏱️  Elapsed: 2.6min | ETA: 1.9min
✅ Success: 2350 | ❌ Failed: 0
📝 Avg content length: 7944 chars
------------------------------------------------------------
📊 Progress: 2400/4057 (59.2%)
⏱️  Elapsed: 2.6min | ETA: 1.8min
✅ Success: 2400 | ❌ Failed: 0
📝 Avg content length: 7937 chars
------------------------------------------------------------
📊 Progress: 2450/4057 (60.4%)
⏱️  Elapsed: 2.7min | ETA: 1.7min
✅ Success: 2450 | ❌ Failed: 0
📝 Avg content length: 7936 chars
------------------------------------------------------------
📊 Progress: 2500/4057 (61.6%)
⏱️  Elapsed: 2.7min | ETA: 1.7min
✅ Success: 2500 | ❌ Failed: 0
📝 Avg content length: 7907 chars
------------------------------------------------------------




📊 Progress: 2550/4057 (62.9%)
⏱️  Elapsed: 2.8min | ETA: 1.6min
✅ Success: 2550 | ❌ Failed: 0
📝 Avg content length: 7885 chars
------------------------------------------------------------
📊 Progress: 2600/4057 (64.1%)
⏱️  Elapsed: 2.8min | ETA: 1.6min
✅ Success: 2600 | ❌ Failed: 0
📝 Avg content length: 7881 chars
------------------------------------------------------------
📊 Progress: 2650/4057 (65.3%)
⏱️  Elapsed: 2.9min | ETA: 1.5min
✅ Success: 2650 | ❌ Failed: 0
📝 Avg content length: 7864 chars
------------------------------------------------------------




📊 Progress: 2700/4057 (66.6%)
⏱️  Elapsed: 2.9min | ETA: 1.5min
✅ Success: 2700 | ❌ Failed: 0
📝 Avg content length: 7838 chars
------------------------------------------------------------




📊 Progress: 2750/4057 (67.8%)
⏱️  Elapsed: 3.0min | ETA: 1.4min
✅ Success: 2750 | ❌ Failed: 0
📝 Avg content length: 7825 chars
------------------------------------------------------------
📊 Progress: 2800/4057 (69.0%)
⏱️  Elapsed: 3.0min | ETA: 1.4min
✅ Success: 2800 | ❌ Failed: 0
📝 Avg content length: 7817 chars
------------------------------------------------------------
📊 Progress: 2850/4057 (70.2%)
⏱️  Elapsed: 3.1min | ETA: 1.3min
✅ Success: 2850 | ❌ Failed: 0
📝 Avg content length: 7815 chars
------------------------------------------------------------
📊 Progress: 2900/4057 (71.5%)
⏱️  Elapsed: 3.1min | ETA: 1.3min
✅ Success: 2900 | ❌ Failed: 0
📝 Avg content length: 7811 chars
------------------------------------------------------------




📊 Progress: 2950/4057 (72.7%)
⏱️  Elapsed: 3.2min | ETA: 1.2min
✅ Success: 2950 | ❌ Failed: 0
📝 Avg content length: 7810 chars
------------------------------------------------------------
📊 Progress: 3000/4057 (73.9%)
⏱️  Elapsed: 3.3min | ETA: 1.2min
✅ Success: 3000 | ❌ Failed: 0
📝 Avg content length: 7793 chars
------------------------------------------------------------
📊 Progress: 3050/4057 (75.2%)
⏱️  Elapsed: 3.3min | ETA: 1.1min
✅ Success: 3050 | ❌ Failed: 0
📝 Avg content length: 7786 chars
------------------------------------------------------------
📊 Progress: 3100/4057 (76.4%)
⏱️  Elapsed: 3.4min | ETA: 1.0min
✅ Success: 3100 | ❌ Failed: 0
📝 Avg content length: 7777 chars
------------------------------------------------------------




📊 Progress: 3150/4057 (77.6%)
⏱️  Elapsed: 3.4min | ETA: 1.0min
✅ Success: 3150 | ❌ Failed: 0
📝 Avg content length: 7764 chars
------------------------------------------------------------




📊 Progress: 3200/4057 (78.9%)
⏱️  Elapsed: 3.5min | ETA: 0.9min
✅ Success: 3200 | ❌ Failed: 0
📝 Avg content length: 7742 chars
------------------------------------------------------------
📊 Progress: 3250/4057 (80.1%)
⏱️  Elapsed: 3.5min | ETA: 0.9min
✅ Success: 3250 | ❌ Failed: 0
📝 Avg content length: 7718 chars
------------------------------------------------------------
📊 Progress: 3300/4057 (81.3%)
⏱️  Elapsed: 3.6min | ETA: 0.8min
✅ Success: 3300 | ❌ Failed: 0
📝 Avg content length: 7696 chars
------------------------------------------------------------




📊 Progress: 3350/4057 (82.6%)
⏱️  Elapsed: 3.6min | ETA: 0.8min
✅ Success: 3350 | ❌ Failed: 0
📝 Avg content length: 7667 chars
------------------------------------------------------------




📊 Progress: 3400/4057 (83.8%)
⏱️  Elapsed: 3.7min | ETA: 0.7min
✅ Success: 3400 | ❌ Failed: 0
📝 Avg content length: 7644 chars
------------------------------------------------------------
📊 Progress: 3450/4057 (85.0%)
⏱️  Elapsed: 3.7min | ETA: 0.7min
✅ Success: 3450 | ❌ Failed: 0
📝 Avg content length: 7628 chars
------------------------------------------------------------




📊 Progress: 3500/4057 (86.3%)
⏱️  Elapsed: 3.8min | ETA: 0.6min
✅ Success: 3500 | ❌ Failed: 0
📝 Avg content length: 7613 chars
------------------------------------------------------------
📊 Progress: 3550/4057 (87.5%)
⏱️  Elapsed: 3.8min | ETA: 0.5min
✅ Success: 3550 | ❌ Failed: 0
📝 Avg content length: 7590 chars
------------------------------------------------------------




📊 Progress: 3600/4057 (88.7%)
⏱️  Elapsed: 3.9min | ETA: 0.5min
✅ Success: 3600 | ❌ Failed: 0
📝 Avg content length: 7571 chars
------------------------------------------------------------
📊 Progress: 3650/4057 (90.0%)
⏱️  Elapsed: 4.0min | ETA: 0.4min
✅ Success: 3650 | ❌ Failed: 0
📝 Avg content length: 7560 chars
------------------------------------------------------------
📊 Progress: 3700/4057 (91.2%)
⏱️  Elapsed: 4.0min | ETA: 0.4min
✅ Success: 3700 | ❌ Failed: 0
📝 Avg content length: 7551 chars
------------------------------------------------------------
📊 Progress: 3750/4057 (92.4%)
⏱️  Elapsed: 4.0min | ETA: 0.3min
✅ Success: 3750 | ❌ Failed: 0
📝 Avg content length: 7536 chars
------------------------------------------------------------




📊 Progress: 3800/4057 (93.7%)
⏱️  Elapsed: 4.1min | ETA: 0.3min
✅ Success: 3800 | ❌ Failed: 0
📝 Avg content length: 7517 chars
------------------------------------------------------------




📊 Progress: 3850/4057 (94.9%)
⏱️  Elapsed: 4.1min | ETA: 0.2min
✅ Success: 3850 | ❌ Failed: 0
📝 Avg content length: 7495 chars
------------------------------------------------------------
📊 Progress: 3900/4057 (96.1%)
⏱️  Elapsed: 4.2min | ETA: 0.2min
✅ Success: 3900 | ❌ Failed: 0
📝 Avg content length: 7488 chars
------------------------------------------------------------
📊 Progress: 3950/4057 (97.4%)
⏱️  Elapsed: 4.2min | ETA: 0.1min
✅ Success: 3950 | ❌ Failed: 0
📝 Avg content length: 7471 chars
------------------------------------------------------------
📊 Progress: 4000/4057 (98.6%)
⏱️  Elapsed: 4.3min | ETA: 0.1min
✅ Success: 4000 | ❌ Failed: 0
📝 Avg content length: 7456 chars
------------------------------------------------------------




📊 Progress: 4050/4057 (99.8%)
⏱️  Elapsed: 4.3min | ETA: 0.0min
✅ Success: 4050 | ❌ Failed: 0
📝 Avg content length: 7436 chars
------------------------------------------------------------
📊 Progress: 4057/4057 (100.0%)
⏱️  Elapsed: 4.3min | ETA: 0.0min
✅ Success: 4057 | ❌ Failed: 0
📝 Avg content length: 7432 chars
------------------------------------------------------------

🎉 SCRAPING COMPLETED!
⏱️  Total time: 4.3 minutes
✅ Successfully scraped: 4057
❌ Failed: 0
📈 Success rate: 100.0%
📝 Total content extracted: 30,152,161 characters
📊 Average content per article: 7432 characters
💾 Enter output filename (default: kff_articles.csv): kff4000.csv
💾 Successful results saved to: kff4000.csv
✨ Scraping completed! Check your output files.
