In [3]:
!pip install -r requirements.txt



In [5]:
# *******
# IMPORTS
# *******
# Load the libraries the scraper needs and build the default request headers.
# A missing critical library is reported with an install hint instead of a
# raw traceback; note that the names below stay undefined in that case.
try:
    import requests
    from bs4 import BeautifulSoup
    import pandas as pd
    import time

    # User agent rotation (optional: fake_useragent is not required)
    try:
        from fake_useragent import UserAgent
        ua = UserAgent()
        headers = {'User-Agent': ua.chrome}
    except Exception:
        # Fallback to a fixed User-Agent if fake_useragent is missing or fails.
        # `except Exception` (not a bare `except:`) so that KeyboardInterrupt
        # and SystemExit are not swallowed here.
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}

    print("All libraries loaded successfully!")

except ImportError as e:
    print(f"Critical import missing: {e}")
    print("Run: pip install requests beautifulsoup4 pandas fake-useragent")

# ***************************
# PRODUCTION SCRAPER FUNCTION
# ***************************
def scraper(url, timeout=10, retries=3):
    """Download a page and return its visible text, retrying on request errors.

    Each attempt issues a GET with the module-level ``headers``, raises on
    4XX/5XX responses, strips ``script``/``style``/``iframe``/``nav``/``footer``
    elements and collects the remaining text. Only
    ``requests.exceptions.RequestException`` is caught and retried; any other
    exception propagates to the caller.

    Args:
        url: Address of the page to fetch.
        timeout: Timeout value handed to ``requests.get``.
        retries: Maximum number of attempts. With a value below 1 no request
            is made and a failure result is returned immediately.

    Returns:
        dict: On success, the keys ``status`` ('success'), ``url``, ``text``,
        ``chars``, ``words`` and ``attempt`` (1-based attempt that succeeded).
        On failure, the keys ``status`` ('failed'), ``url``, ``error`` and
        ``last_error`` (text of the last caught exception, or ``None`` when no
        attempt was made).
    """
    # Pre-initialised so the failure result can always be built, even when
    # the loop body never runs (retries < 1).
    last_error = None

    for attempt in range(retries):
        try:
            # Make HTTP request (with timeout to prevent hanging)
            response = requests.get(
                url,
                headers=headers,
                timeout=timeout
            )

            # Verify successful response (raises HTTPError for 4XX/5XX)
            response.raise_for_status()

            # Parse HTML with BeautifulSoup
            soup = BeautifulSoup(response.text, 'html.parser')

            # Remove unwanted elements
            for element in soup(['script', 'style', 'iframe', 'nav', 'footer']):
                element.decompose()

            # Extract clean text
            text = soup.get_text(separator=' ', strip=True)

            # Add delay (avoids getting blocked)
            time.sleep(1.5)

            # Return structured success result
            return {
                'status': 'success',
                'url': url,
                'text': text,
                'chars': len(text),
                'words': len(text.split()),
                'attempt': attempt + 1
            }

        # ERROR HANDLING
        except requests.exceptions.RequestException as e:
            last_error = str(e)  # Store the error
            print(f"Attempt {attempt + 1} failed: {type(e).__name__}")
            # Exponential back-off between attempts only: sleeping after the
            # final attempt would just delay the failure result.
            if attempt < retries - 1:
                time.sleep(2 ** attempt)

    # Return structured failure if all retries exhausted
    return {
        'status': 'failed',
        'url': url,
        'error': f"All {retries} attempts failed",
        'last_error': last_error
    }

# ******************
# TESTING FRAMEWORK
# ******************
# Test URLs covering different scenarios: a normal article page, an endpoint
# that always answers 404, and a large news front page.
test_urls = [
    "https://en.wikipedia.org/wiki/Web_scraping",
    "https://httpbin.org/status/404",
    "https://www.nytimes.com",
]

results = []
# Only hit the network when run directly (always true in a notebook kernel).
# The guard wraps the scraping loop rather than just the URL list, so nothing
# below depends on a name that exists only under the guard.
if __name__ == "__main__":
    for url in test_urls:
        print(f"\nScraping: {url}")
        result = scraper(url)
        results.append(result)

        if result['status'] == 'success':
            print(f"Scraped {result['chars']} chars")
        else:
            print(f"Failed: {result['last_error']}")

# reindex (instead of [[...]] selection) so the summary still renders when a
# column is absent, e.g. no 'last_error' if every URL succeeded or no 'chars'
# if every URL failed; absent columns are shown as NaN.
# Kept as the last top-level expression so the notebook displays the table.
pd.DataFrame(results).reindex(columns=['status', 'url', 'chars', 'last_error'])

All libraries loaded successfully!

Scraping: https://en.wikipedia.org/wiki/Web_scraping
Scraped 26551 chars

Scraping: https://httpbin.org/status/404
Attempt 1 failed: HTTPError
Attempt 2 failed: HTTPError
Attempt 3 failed: HTTPError
Failed: 404 Client Error: NOT FOUND for url: https://httpbin.org/status/404

Scraping: https://www.nytimes.com
Scraped 8845 chars


Unnamed: 0,status,url,chars,last_error
0,success,https://en.wikipedia.org/wiki/Web_scraping,26551.0,
1,failed,https://httpbin.org/status/404,,404 Client Error: NOT FOUND for url: https://h...
2,success,https://www.nytimes.com,8845.0,
