## Import Libraries

In [2]:
import requests
from bs4 import BeautifulSoup
import re
import html
from urllib.parse import urljoin, urlparse

In [3]:
# Sample HTML content: a small mock article page used as the input document.
# Besides the article paragraphs inside <div class="main-content"> it embeds
# typical boilerplate (<nav> menu, an "infobox" table, a "sidebar" link list,
# a <footer> and an inline <script>).
SAMPLE_HTML = '''
<!DOCTYPE html>
<html>
<head>
    <title>BBC News - Wikipedia</title>
    <meta name="description" content="BBC News article">
</head>
<body>
    <nav>
        <ul>
            <li><a href="/home">Home</a></li>
            <li><a href="/news">News</a></li>
        </ul>
    </nav>
    
    <div class="main-content">
        <h1>BBC News</h1>
        
        <p>BBC News is an operational business division of the British Broadcasting Corporation (BBC) responsible for the gathering and broadcasting of news and current affairs in the UK and around the world. The department is the world's largest broadcast news organisation and generates about 120 hours of radio and television output each day, as well as online news coverage.</p>
        
        <div class="infobox">
            <table>
                <tr><td>Founded</td><td>1922</td></tr>
                <tr><td>Headquarters</td><td>Broadcasting House, London</td></tr>
            </table>
        </div>
        
        <h2>History</h2>
        
        <p>The British Broadcasting Company broadcast its first radio bulletin from radio station 2LO on 14 November 1922. Wishing to avoid competition, newspaper publishers persuaded the government to ban the BBC from broadcasting news before 7 p.m., and to force it to use wire service copy instead of reporting on its own.</p>
        
        <p>The BBC gradually gained the right to edit the copy and, in 1934, created its own news operation. However, it could not broadcast news before 6 p.m. until World War II.</p>
        
        <div class="sidebar">
            <h3>Related Articles</h3>
            <ul>
                <li><a href="/bbc">BBC</a></li>
                <li><a href="/news">News Broadcasting</a></li>
            </ul>
        </div>
        
        <h2>Television News</h2>
        
        <p>Television news, although physically separate from its radio counterpart, was still firmly under radio news' control in the 1950s. Correspondents provided reports for both outlets, and the first televised bulletin, shown on 5 July 1954 on the then BBC television service and presented by Richard Baker, involved his providing narration off-screen while stills were shown.</p>
        
    </div>
    
    <footer>
        <p>© 2025 Wikipedia. All rights reserved.</p>
        <div class="footer-links">
            <a href="/privacy">Privacy Policy</a>
            <a href="/terms">Terms of Service</a>
        </div>
    </footer>
    
    <script>
        // Analytics tracking code
        console.log("Page loaded");
    </script>
</body>
</html>
'''

In [4]:
class TextExtractor:
    """Demonstrates five strategies for pulling article text out of an HTML page.

    Every strategy prints a labelled report to stdout and returns what it
    extracted. All strategies except the basic one re-parse ``html_content``
    into a private tree, so the ``decompose()`` calls made by one strategy
    never affect another.
    """

    def __init__(self, html_content):
        """Keep the raw markup and parse it once for the basic strategy.

        Args:
            html_content: The HTML document as a string.
        """
        self.html_content = html_content
        self.soup = BeautifulSoup(html_content, 'html.parser')

    @staticmethod
    def _collapse_whitespace(text):
        """Return ``text`` with each whitespace run reduced to one space, ends trimmed.

        Used instead of ``get_text(strip=True)``, which strips every text node
        and then joins them with an empty separator, gluing words together
        around inline tags such as ``<a>`` or ``<b>``.
        """
        return ' '.join(text.split())

    @staticmethod
    def _keyword_hits(label, keywords):
        """Count how many ``keywords`` occur in a class/id ``label`` as words.

        The label is split into lowercase alphanumeric words at spaces,
        hyphens, underscores and camelCase boundaries. A keyword counts when a
        word starts with it ("navbar" hits "nav"); keywords of one or two
        characters must equal a whole word, optionally pluralised, because a
        plain substring test makes "ad" match "header" or "shadow".

        Args:
            label: Concatenated class and id attribute text.
            keywords: Iterable of lowercase keywords.

        Returns:
            Number of keywords that matched at least one word.
        """
        # Break camelCase before lowercasing so "mainContent" yields two words.
        spaced = re.sub(r'(?<=[a-z0-9])(?=[A-Z])', ' ', label)
        words = re.findall(r'[a-z0-9]+', spaced.lower())

        hits = 0
        for keyword in keywords:
            if len(keyword) <= 2:
                matched = any(word in (keyword, keyword + 's') for word in words)
            else:
                matched = any(word.startswith(keyword) for word in words)
            if matched:
                hits += 1
        return hits

    def basic_beautifulsoup_extraction(self):
        """Dump every text node of the document with a plain ``get_text()``.

        Prints at most the first 500 characters as a preview.

        Returns:
            The complete, unfiltered text of the page.
        """
        print("=" * 60)
        print("1. BASIC BEAUTIFULSOUP EXTRACTION")
        print("=" * 60)

        # Simple get_text() approach
        raw_text = self.soup.get_text()
        print("Raw text extraction:")
        print("-" * 30)
        print((raw_text[:500] + "...") if len(raw_text) > 500 else raw_text)
        print("\nISSUES IDENTIFIED:")
        print("- Includes navigation menu text")
        print("- Includes footer content")
        print("- Includes sidebar content")
        print("- Poor spacing and formatting")
        print("- No content prioritization")

        return raw_text

    def improved_beautifulsoup_extraction(self):
        """Strip known boilerplate tags/classes, then read the main content area.

        Falls back to the whole remaining document when no
        ``<div class="main-content">`` exists.

        Returns:
            The extracted text with one text fragment per line.
        """
        print("\n" + "=" * 60)
        print("2. IMPROVED BEAUTIFULSOUP EXTRACTION")
        print("=" * 60)

        # Work on a private tree because decompose() is destructive.
        soup_copy = BeautifulSoup(self.html_content, 'html.parser')

        # Remove navigation, footer, sidebar, scripts, style
        unwanted_tags = ['nav', 'footer', 'script', 'style', 'aside']
        unwanted_classes = ['sidebar', 'footer-links', 'infobox']

        for tag in unwanted_tags:
            for element in soup_copy.find_all(tag):
                element.decompose()

        for class_name in unwanted_classes:
            for element in soup_copy.find_all(class_=class_name):
                element.decompose()

        # Focus on main content
        main_content = soup_copy.find('div', class_='main-content')
        if main_content:
            # Extract text with better spacing
            text = main_content.get_text(separator='\n', strip=True)
        else:
            text = soup_copy.get_text(separator='\n', strip=True)

        # Clean up excessive whitespace
        text = re.sub(r'\n\s*\n', '\n\n', text)
        text = re.sub(r' +', ' ', text)

        print("Improved extraction:")
        print("-" * 30)
        print(text)
        print("\nIMPROVEMENTS:")
        print("✓ Removed navigation and footer")
        print("✓ Removed sidebar content")
        print("✓ Better text spacing")
        print("✓ Focused on main content area")

        return text

    def justext_like_approach(self):
        """Score leaf-level ``<p>``/``<div>`` blocks and keep the positive ones.

        A block earns points for medium length, low link density, containing
        sentence punctuation, having more than ten words and carrying a class
        containing "main"; it loses points when nested in ``<nav>`` or
        ``<footer>``. Blocks that themselves contain a ``<p>`` or ``<div>``
        are not scored, so text is never emitted once for a container and
        again for each paragraph inside it (text placed directly in such a
        container, outside any nested block, is therefore ignored).

        Returns:
            The selected blocks joined by blank lines.
        """
        print("\n" + "=" * 60)
        print("3. JUSTEXT-LIKE HEURISTIC APPROACH")
        print("=" * 60)

        # Justext uses language models and statistical features
        # We'll simulate some basic heuristics

        soup_copy = BeautifulSoup(self.html_content, 'html.parser')

        # Score paragraphs based on various features
        paragraphs = []

        for block in soup_copy.find_all(['p', 'div']):
            # Skip wrappers: their nested blocks are scored on their own.
            if block.find(['p', 'div']) is not None:
                continue

            text = self._collapse_whitespace(block.get_text())
            if not text:
                continue

            # Calculate heuristic scores
            score = 0

            # Length score (prefer medium-length paragraphs)
            length = len(text)
            if 50 <= length <= 500:
                score += 2
            elif length > 500:
                score += 1

            # Link density (prefer low link density); length is > 0 here
            # because empty blocks were skipped above.
            link_text_length = sum(
                len(self._collapse_whitespace(a.get_text()))
                for a in block.find_all('a')
            )
            link_density = link_text_length / length

            if link_density < 0.3:
                score += 2
            elif link_density < 0.5:
                score += 1

            # Position score (prefer content in main areas)
            if block.find_parent('nav') or block.find_parent('footer'):
                score -= 3
            if 'main' in str(block.get('class', [])).lower():
                score += 2

            # Punctuation score (real sentences have punctuation)
            if re.search(r'[.!?]', text):
                score += 1

            # Word count (prefer substantial content)
            word_count = len(text.split())
            if word_count > 10:
                score += 1

            paragraphs.append((score, text, block))

        # Select paragraphs with positive scores
        good_paragraphs = [text for score, text, elem in paragraphs if score > 0]

        result = '\n\n'.join(good_paragraphs)

        print("Heuristic-based extraction:")
        print("-" * 30)
        print(result)
        print("\nHEURISTICS APPLIED:")
        print(f"- Scored {len(paragraphs)} text blocks")
        print(f"- Selected {len(good_paragraphs)} high-quality paragraphs")
        print("- Length-based filtering")
        print("- Link density analysis")
        print("- Position-based scoring")
        print("- Punctuation analysis")

        return result

    def readability_like_approach(self):
        """Pick the single best container by a Readability-style score.

        Candidates are ``div``/``article``/``section``/``main`` elements. The
        score combines tag name, class/id keywords (matched as words, see
        ``_keyword_hits``), paragraph count (capped at 50 points), text length
        and a link-density penalty.

        Returns:
            Text of the best-scoring element, or "No suitable content found"
            when the page has no non-empty candidate.
        """
        print("\n" + "=" * 60)
        print("4. READABILITY-LIKE ALGORITHM")
        print("=" * 60)

        soup_copy = BeautifulSoup(self.html_content, 'html.parser')

        positive_keywords = ['content', 'main', 'article', 'entry', 'post']
        negative_keywords = ['nav', 'sidebar', 'footer', 'ad', 'comment', 'widget']

        # Score elements based on Readability-like heuristics
        scored_elements = []

        for elem in soup_copy.find_all(['div', 'article', 'section', 'main']):
            text = elem.get_text(strip=True)
            if not text:
                continue

            score = 0

            # Positive scoring
            if elem.name in ['article', 'main']:
                score += 25

            # Class and ID based scoring: 25 points per matching keyword.
            class_id_text = ' '.join([
                ' '.join(elem.get('class', [])),
                elem.get('id', '')
            ])
            score += 25 * self._keyword_hits(class_id_text, positive_keywords)
            score -= 25 * self._keyword_hits(class_id_text, negative_keywords)

            # Content-based scoring
            p_count = len(elem.find_all('p'))
            score += min(p_count * 5, 50)  # Cap at 50

            # Text length scoring
            if len(text) > 200:
                score += 20

            # Link density penalty
            links = elem.find_all('a')
            link_text = ' '.join(a.get_text() for a in links)
            if len(text) > 0:
                link_density = len(link_text) / len(text)
                score -= int(link_density * 50)

            scored_elements.append((score, elem, text))

        # Get the highest scoring element
        if scored_elements:
            best_score, best_elem, best_text = max(scored_elements, key=lambda x: x[0])

            # Extract clean text from the best element
            for unwanted in best_elem.find_all(['script', 'style', 'nav', 'footer']):
                unwanted.decompose()

            result = best_elem.get_text(separator='\n', strip=True)
            result = re.sub(r'\n\s*\n', '\n\n', result)

            print("Readability-style extraction:")
            print("-" * 30)
            print(result)
            print("\nALGORITHM RESULTS:")
            print(f"- Best element score: {best_score}")
            print(f"- Selected element: {best_elem.name}")
            print(f"- Class/ID: {best_elem.get('class', [])} / {best_elem.get('id', 'None')}")
            print("- Applied content scoring heuristics")

            return result

        return "No suitable content found"

    def newspaper_like_approach(self):
        """Extract a title and the longest multi-paragraph article candidate.

        Containers are found by class name (content/main/article/entry); when
        none match, every ``<div>`` with more than 200 characters of text is
        tried. A container qualifies only if it holds at least two ``<p>``
        elements.

        Returns:
            Dict with keys ``"title"`` and ``"text"``; each holds a
            placeholder message when nothing was found.
        """
        print("\n" + "=" * 60)
        print("5. NEWSPAPER3K-LIKE APPROACH")
        print("=" * 60)

        soup_copy = BeautifulSoup(self.html_content, 'html.parser')

        # Extract article metadata
        title = soup_copy.find('h1')
        title_text = self._collapse_whitespace(title.get_text()) if title else "No title found"

        # Look for article-like content structures
        article_candidates = []

        # Find content containers
        containers = soup_copy.find_all(['article', 'div', 'section'],
                                        class_=re.compile(r'(content|main|article|entry)', re.I))

        if not containers:
            # Fallback to divs with substantial content
            containers = [div for div in soup_copy.find_all('div')
                          if len(div.get_text(strip=True)) > 200]

        for container in containers:
            # Remove noise
            for noise in container.find_all(['nav', 'footer', 'aside', 'script', 'style']):
                noise.decompose()

            # Extract paragraphs
            paragraphs = container.find_all('p')
            if len(paragraphs) >= 2:  # Articles typically have multiple paragraphs
                # Collapse whitespace per paragraph so words around inline
                # tags keep their spacing; drop paragraphs that end up empty.
                text_blocks = [
                    cleaned
                    for cleaned in (self._collapse_whitespace(p.get_text()) for p in paragraphs)
                    if cleaned
                ]
                if text_blocks:
                    article_candidates.append('\n\n'.join(text_blocks))

        # Select the longest article candidate
        if article_candidates:
            article_text = max(article_candidates, key=len)
        else:
            article_text = "No article content identified"

        print(f"Title: {title_text}")
        print("\nArticle text:")
        print("-" * 30)
        print(article_text)
        print("\nNEWSPAPER3K-STYLE FEATURES:")
        print(f"- Title extraction: {'✓' if title else '✗'}")
        print(f"- Found {len(article_candidates)} article candidates")
        print("- Paragraph-based content extraction")
        print("- Container detection by class names")

        return {"title": title_text, "text": article_text}


In [None]:
def additional_heuristics_needed():
    """Print a catalogue of extra heuristics a production extractor would need.

    Each entry is shown as a category heading, the reason it is needed and a
    bulleted list of techniques. Output goes to stdout; nothing is returned.
    """
    rule = "=" * 60
    print("\n" + rule, "ADDITIONAL HEURISTICS NEEDED FOR PRODUCTION", rule, sep="\n")

    catalogue = [
        {
            "category": "Language Detection",
            "techniques": [
                "Language model-based text classification",
                "Character frequency analysis",
                "Stop word detection",
                "N-gram analysis",
            ],
            "why": "To filter content in the target language and improve accuracy",
        },
        {
            "category": "Content Structure Analysis",
            "techniques": [
                "DOM tree depth analysis",
                "Sibling element similarity",
                "CSS class pattern matching",
                "HTML5 semantic element detection",
            ],
            "why": "Modern websites use consistent CSS patterns for content",
        },
        {
            "category": "Text Quality Metrics",
            "techniques": [
                "Sentence boundary detection",
                "Punctuation density analysis",
                "Capitalization patterns",
                "Word frequency distribution",
                "Text coherence scoring",
            ],
            "why": "To distinguish real content from navigation/boilerplate text",
        },
        {
            "category": "Visual Layout Heuristics",
            "techniques": [
                "Font size analysis (via CSS)",
                "Text block positioning",
                "White space analysis",
                "Reading flow patterns",
            ],
            "why": "Important content typically has distinct visual styling",
        },
        {
            "category": "Site-Specific Rules",
            "techniques": [
                "Domain-specific extraction rules",
                "CMS pattern recognition",
                "Publisher-specific templates",
                "Microdata/JSON-LD parsing",
            ],
            "why": "Many sites follow consistent patterns that can be learned",
        },
        {
            "category": "Machine Learning Approaches",
            "techniques": [
                "Content vs. boilerplate classification",
                "Feature engineering from DOM properties",
                "Neural content extraction models",
                "Transfer learning from labeled data",
            ],
            "why": "Data-driven approaches can generalize better than hand-crafted rules",
        },
    ]

    for entry in catalogue:
        # One print per entry: heading, rationale, then one bullet per technique.
        bullets = [f"    - {name}" for name in entry['techniques']]
        print(
            f"\n{entry['category']}:",
            f"  Why needed: {entry['why']}",
            "  Techniques:",
            *bullets,
            sep="\n",
        )

In [None]:
def compare_library_performance():
    """Print a side-by-side summary of text-extraction libraries.

    For every library the report lists what it is best for, followed by its
    comma-separated pros and cons. Output goes to stdout; nothing is returned.
    """
    rule = "=" * 60
    print("\n" + rule, "LIBRARY COMPARISON & RECOMMENDATIONS", rule, sep="\n")

    profiles = [
        {
            "name": "Justext",
            "pros": ["Language-aware", "Statistical approach", "Good at removing boilerplate", "Handles multiple languages"],
            "cons": ["Requires language specification", "May be too aggressive", "Less control over rules"],
            "best_for": "Multi-language news sites, blogs with lots of navigation",
        },
        {
            "name": "newspaper3k",
            "pros": ["Article-focused", "Extracts metadata", "Handles news sites well", "Easy to use"],
            "cons": ["News-specific", "Less customizable", "May miss non-article content"],
            "best_for": "News articles, blog posts, journalistic content",
        },
        {
            "name": "Trafilatura",
            "pros": ["Fast", "High precision", "Handles many site types", "Good recall"],
            "cons": ["Less metadata extraction", "Newer library", "May need fine-tuning"],
            "best_for": "General web scraping, academic content, diverse site types",
        },
        {
            "name": "Readability/readabilipy",
            "pros": ["Based on Mozilla algorithm", "Handles complex layouts", "Good for articles"],
            "cons": ["May be slower", "Less control", "Originally designed for reading view"],
            "best_for": "Article reading, content curation, reading applications",
        },
        {
            "name": "BeautifulSoup + Custom Rules",
            "pros": ["Full control", "Site-specific optimization", "Highly customizable"],
            "cons": ["Requires manual tuning", "Site-specific", "Time-intensive"],
            "best_for": "Specific sites, custom requirements, maximum control needed",
        },
    ]

    for profile in profiles:
        strengths = ', '.join(profile['pros'])
        weaknesses = ', '.join(profile['cons'])
        print(
            f"\n{profile['name']}:",
            f"  Best for: {profile['best_for']}",
            f"  Pros: {strengths}",
            f"  Cons: {weaknesses}",
            sep="\n",
        )


In [None]:
def real_world_example_analysis():
    """Print a worked example based on the BBC News Wikipedia page.

    The excerpt is hard-coded; this function performs no extraction itself.
    It prints the excerpt followed by notes on why the extraction worked and
    which challenges were encountered.

    Returns:
        The excerpt string that was printed.
    """
    # Raw content from the fetched BBC News Wikipedia page
    raw_sample = '''BBC News is an operational business division of the British Broadcasting Corporation (BBC) responsible for the gathering and broadcasting of news and current affairs in the UK and around the world. The department is the world's largest broadcast news organisation and generates about 120 hours of radio and television output each day, as well as online news coverage. The service has over 5,500 journalists working across its output including in 50 foreign news bureaus where more than 250 foreign correspondents are stationed.'''

    rule = "=" * 60
    successes = (
        "✓ Focused on main article content",
        "✓ Removed navigation and metadata",
        "✓ Preserved paragraph structure",
        "✓ Filtered out Wikipedia-specific elements",
    )
    challenges = (
        "• Complex nested HTML structure",
        "• Multiple content sections mixed with metadata",
        "• Navigation links embedded within content",
        "• Table data mixed with article text",
        "• Citation markers and edit links throughout",
    )

    # Emit the whole report in one call; each item lands on its own line.
    print(
        "\n" + rule,
        "REAL-WORLD EXAMPLE: BBC NEWS WIKIPEDIA PAGE",
        rule,
        "EXTRACTED CONTENT (after processing):",
        "-" * 40,
        raw_sample,
        "\nWHY THIS EXTRACTION WORKED:",
        *successes,
        "\nCHALLENGES ENCOUNTERED:",
        *challenges,
        sep="\n",
    )

    return raw_sample


In [None]:
def production_recommendations():
    """Print recommended extraction strategies for production systems.

    Each recommendation shows the approach name, a description, an example
    and the situation it suits best. Output goes to stdout; nothing is
    returned.
    """
    rule = "=" * 60
    print("\n" + rule, "PRODUCTION RECOMMENDATIONS", rule, sep="\n")

    strategies = [
        {
            "approach": "Multi-Library Pipeline",
            "description": "Use 2-3 libraries in sequence for better results",
            "example": "Trafilatura → newspaper3k → custom cleanup",
            "when": "High-volume, diverse content sources",
        },
        {
            "approach": "Site-Specific Rules",
            "description": "Develop custom extraction rules for major sources",
            "example": "BBC: target .story-content, CNN: target .article-body",
            "when": "Limited number of high-priority sources",
        },
        {
            "approach": "Machine Learning Classification",
            "description": "Train models to classify content vs. boilerplate",
            "example": "Features: text length, link density, CSS classes",
            "when": "Large-scale, automated processing",
        },
        {
            "approach": "Hybrid Heuristic System",
            "description": "Combine rule-based and statistical approaches",
            "example": "Content scoring + language detection + DOM analysis",
            "when": "Balance between accuracy and maintainability",
        },
    ]

    for strategy in strategies:
        print(
            f"\n{strategy['approach']}:",
            f"  Description: {strategy['description']}",
            f"  Example: {strategy['example']}",
            f"  Best for: {strategy['when']}",
            sep="\n",
        )


In [9]:
if __name__ == "__main__":
    # Driver cell: run every extraction strategy on the sample page, then
    # print the analysis sections and the closing summary.
    print(
        "WEB TEXT EXTRACTION ANALYSIS",
        "=" * 60,
        "Analysis of text extraction techniques using real-world examples",
        sep="\n",
    )

    extractor = TextExtractor(SAMPLE_HTML)

    # Run all extraction methods; results are kept under module-level names
    # so they can be inspected afterwards.
    basic_result = extractor.basic_beautifulsoup_extraction()
    improved_result = extractor.improved_beautifulsoup_extraction()
    justext_result = extractor.justext_like_approach()
    readability_result = extractor.readability_like_approach()
    newspaper_result = extractor.newspaper_like_approach()

    # Real-world analysis
    real_world_example_analysis()

    # Analysis and recommendations
    additional_heuristics_needed()
    compare_library_performance()
    production_recommendations()

    # Closing summary, emitted in one call with one line per item.
    print(
        "\n" + "=" * 60,
        "KEY FINDINGS & CONCLUSION",
        "=" * 60,
        "✓ No single library handles all cases perfectly",
        "✓ Basic extraction is rarely sufficient for production",
        "✓ Heuristic approaches provide significant improvement",
        "✓ Site-specific rules often yield the best results",
        "✓ Content quality varies dramatically across the web",
        "✓ Language detection is crucial for multilingual sites",
        "\nRECOMMENDED APPROACH:",
        "1. Start with Trafilatura or newspaper3k for baseline",
        "2. Add custom heuristics for specific content types",
        "3. Implement fallback strategies for edge cases",
        "4. Monitor extraction quality and iterate",
        sep="\n",
    )

WEB TEXT EXTRACTION ANALYSIS
Analysis of text extraction techniques using real-world examples
1. BASIC BEAUTIFULSOUP EXTRACTION
Raw text extraction:
------------------------------




BBC News - Wikipedia





Home
News



BBC News
BBC News is an operational business division of the British Broadcasting Corporation (BBC) responsible for the gathering and broadcasting of news and current affairs in the UK and around the world. The department is the world's largest broadcast news organisation and generates about 120 hours of radio and television output each day, as well as online news coverage.


Founded1922
HeadquartersBroadcasting House, London


History
The British Broad...

ISSUES IDENTIFIED:
- Includes navigation menu text
- Includes footer content
- Includes sidebar content
- Poor spacing and formatting
- No content prioritization

2. IMPROVED BEAUTIFULSOUP EXTRACTION
Improved extraction:
------------------------------
BBC News
BBC News is an operational business division of the Br