In [2]:
import os
from concurrent.futures import ThreadPoolExecutor

import pandas as pd
import requests
from bs4 import BeautifulSoup
from newspaper import Article
from readability import Document
from serpapi import GoogleSearch

In [3]:
# Collect up to MAX_RESULTS links to works citing `article_citation_id`,
# paging through SerpAPI's Google Scholar engine.
#
# The SerpAPI key is read from the environment so the secret never lives in
# the notebook. A key that has ever been saved in a notebook or repository
# should be treated as compromised and rotated.
api_key = os.environ.get("SERPAPI_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the SERPAPI_API_KEY environment variable to your SerpAPI key."
    )
article_citation_id = "1NtVbf1efHoJ"  # Replace with your citation ID

# List to store URLs
urls = []
# Paginate through results, stopping after retrieving MAX_RESULTS URLs
start = 0  # Pagination index
MAX_RESULTS = 100

while len(urls) < MAX_RESULTS:
    params = {
        "engine": "google_scholar",
        "cites": article_citation_id,
        "api_key": api_key,
        "start": start
    }

    search = GoogleSearch(params)
    results = search.get_dict()
    organic_results = results.get("organic_results", [])

    if not organic_results:
        break  # No more results, exit the loop

    for result in organic_results:
        # Some results may carry no "link"; skip them instead of letting a
        # KeyError abort the whole collection run.
        link = result.get("link")
        if not link:
            continue
        urls.append(link)
        if len(urls) >= MAX_RESULTS:
            break  # Stop once we reach the desired number of articles

    # Move to the next page. NOTE(review): assumes 10 results per page since no
    # page size is requested above — confirm against the SerpAPI docs.
    start += 10
print(f"Retrieved {len(urls)} URLs.")

Retrieved 100 URLs.


In [4]:
# Function to extract content using Readability
def extract_readability(url):
    """Fetch `url` and return the Readability summary of the page.

    Args:
        url: Address of the page to download (10 second request timeout).

    Returns:
        The value of `Document(...).summary()` for the response text, or
        None if the request, the HTTP status check, or the extraction raised.
    """
    try:
        page = requests.get(url, timeout=10)
        page.raise_for_status()
        return Document(page.text).summary()
    except Exception:
        # Best-effort by design: any failure counts as "nothing extracted".
        return None

# Function to extract content using Newspaper3k
def extract_newspaper(url):
    """Download and parse `url` with Newspaper3k and return the article text.

    Args:
        url: Address of the article to download.

    Returns:
        The parsed article's `text` attribute, or None if constructing,
        downloading, or parsing the article raised.
    """
    try:
        story = Article(url)
        story.download()
        story.parse()
        return story.text
    except Exception:
        # Best-effort by design: any failure counts as "nothing extracted".
        return None

# Function to extract content using BeautifulSoup
def extract_beautifulsoup(url):
    """Fetch `url` and return the text of its <p> elements joined by newlines.

    Args:
        url: Address of the page to download (10 second request timeout).

    Returns:
        The newline-joined paragraph text, or None when that text is empty or
        whitespace-only, or when the request, status check, or parsing raised.
    """
    try:
        page = requests.get(url, timeout=10)
        page.raise_for_status()
        parsed = BeautifulSoup(page.text, "html.parser")
        text = "\n".join(tag.get_text() for tag in parsed.find_all("p"))
        if text.strip():
            return text
        return None
    except Exception:
        # Best-effort by design: any failure counts as "nothing extracted".
        return None

# Compare all extractors on a single URL
def process_url(url):
    """Run every extractor against `url` and record which ones produced content.

    Args:
        url: Address to hand to each extractor.

    Returns:
        A dict with the key "URL" (the input) and one boolean per extractor
        ("Readability", "Newspaper3k", "BeautifulSoup"), True when that
        extractor returned a truthy value.
    """
    # Entries are evaluated top to bottom, so the extractors run in this order.
    return {
        "URL": url,
        "Readability": bool(extract_readability(url)),
        "Newspaper3k": bool(extract_newspaper(url)),
        "BeautifulSoup": bool(extract_beautifulsoup(url)),
    }

# Run comparison on all URLs
def compare_extractors(urls, max_workers=10):
    """Run `process_url` over every URL using a thread pool.

    Args:
        urls: Iterable of URLs to process.
        max_workers: Size of the thread pool; defaults to 10, the value that
            was previously hard-coded.

    Returns:
        A list with one `process_url` result per input URL, in input order
        (`Executor.map` yields results in the order of its inputs).

    Raises:
        Any exception raised by `process_url` for a URL propagates when its
        result is collected.
    """
    # Use multithreading to process URLs in parallel
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        return list(executor.map(process_url, urls))

# Main script
if __name__ == "__main__":
    # Benchmark the extractors over `urls`, report per-extractor success
    # rates, list the URLs none of them could handle, and save the raw
    # results as CSV.
    print("Comparing content extraction libraries...")
    comparison_results = compare_extractors(urls)

    # Fix the columns up front so the frame has the expected schema even when
    # there are no rows (an empty result list would otherwise give a frame
    # with no columns and the lookups below would raise KeyError).
    extractor_columns = ["Readability", "Newspaper3k", "BeautifulSoup"]
    df = pd.DataFrame(comparison_results, columns=["URL"] + extractor_columns)

    if df.empty:
        print("\nNo URLs were provided; skipping comparison.")
    else:
        # Determine overall success rates (mean of booleans = fraction True)
        success_rates = df.drop(columns=["URL"]).mean() * 100

        print("\nSuccess Rates (Percentage of URLs with content extracted):")
        print(success_rates)

        # Find common URLs that failed across all extractors: rows where no
        # extractor column is True.
        failed_all = df[~df[extractor_columns].any(axis=1)]

        print(f"\nNumber of URLs failed by all extractors: {len(failed_all)}")
        print("URLs failed by all extractors:")
        print(failed_all["URL"])

        # Save results to a CSV file for further analysis
        df.to_csv("content_extraction_comparison.csv", index=False)
        print("\nResults saved to 'content_extraction_comparison.csv'.")


Comparing content extraction libraries...

Success Rates (Percentage of URLs with content extracted):
Readability      49.0
Newspaper3k      55.0
BeautifulSoup    50.0
dtype: float64

Number of URLs failed by all extractors: 45
URLs failed by all extractors:
0     https://www.cell.com/cell-systems/fulltext/S24...
1     https://ascopubs.org/doi/abs/10.1200/CCI.18.00069
2     https://www.cell.com/trends/cell-biology/fullt...
4     https://www.tandfonline.com/doi/abs/10.1080/19...
6     https://www.cell.com/trends/cancer/fulltext/S2...
11    https://www.embopress.org/doi/abs/10.15252/msb...
12    https://pubs.acs.org/doi/abs/10.1021/acs.chemr...
15    https://onlinelibrary.wiley.com/doi/abs/10.100...
16    https://ieeexplore.ieee.org/abstract/document/...
17    https://academic.oup.com/bioinformatics/articl...
19    https://www.sciencedirect.com/science/article/...
21    https://royalsocietypublishing.org/doi/abs/10....
22    https://wires.onlinelibrary.wiley.com/doi/abs/...
29    https:/