
# 📄 Web extraction comparison: BeautifulSoup/spaCy vs newspaper3k

This notebook compares two methods for extracting text, summaries, keywords, and titles from a list of URLs:

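- **Method 1**: Fetches each page with `requests`, parses the HTML with `BeautifulSoup`, and uses `spaCy` to build the summary and keywords
- **Method 2**: Uses the `newspaper3k` library to download, parse, and analyze each article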

We evaluate each method by:
- Processing a list of URLs
- Measuring execution time
- Visualizing text lengths
- Saving outputs for comparison


In [None]:

import time
import requests
from bs4 import BeautifulSoup
import spacy
import pandas as pd
from newspaper import Article
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import os


In [None]:

# Shared state for the rest of the notebook.
# The spaCy pipeline is loaded once because the helper functions below call `nlp` directly.
nlp = spacy.load("en_core_web_sm")

# Input table of pages to process; later cells read its 'url' column.
df = pd.read_csv("NLS_alive_urls.csv")

# One accumulator per extraction method, filled by the processing cell.
results_bs_spacy, results_newspaper = [], []


In [None]:

def get_title(soup):
    """Return the page title, or "No title found" when none is usable.

    Args:
        soup: Parsed document exposing a ``title`` attribute (a tag or None).

    Returns:
        The whitespace-stripped text of the ``<title>`` tag. The placeholder
        "No title found" is returned when the tag is missing or its text is
        empty after stripping.
    """
    title_tag = soup.title
    if title_tag is None:
        return "No title found"
    # `.string` is None for an empty <title> or one containing nested markup,
    # which would make `.string.strip()` raise; `get_text()` always yields a str.
    title_text = title_tag.get_text().strip()
    return title_text or "No title found"

def get_text_and_summary(soup, max_sentences=5):
    """Extract paragraph text and a leading-sentences summary from a page.

    Args:
        soup: Parsed document supporting ``find_all('p')``.
        max_sentences: Number of leading sentences kept in the summary
            (defaults to 5).

    Returns:
        A ``(text, summary)`` tuple. ``text`` is the stripped content of every
        non-empty ``<p>`` joined by single spaces; ``summary`` is the first
        ``max_sentences`` sentences found by the module-level ``nlp`` pipeline,
        joined by spaces. When the page has no paragraph text the result is
        ``("", "No summary available")``.
    """
    stripped = (para.get_text().strip() for para in soup.find_all('p'))
    # Drop empty paragraphs: joining them would produce a whitespace-only string
    # that is truthy and would bypass the "No summary available" fallback.
    text = ' '.join(chunk for chunk in stripped if chunk)
    if not text:
        return "", "No summary available"
    doc = nlp(text)
    sentences = [sent.text for sent in doc.sents]
    return text, ' '.join(sentences[:max_sentences])

def get_full_text(soup):
    """Return the text of all non-empty ``<p>`` elements joined by spaces.

    Args:
        soup: Parsed document supporting ``find_all('p')``.

    Returns:
        The stripped paragraph texts joined by single spaces, or the
        placeholder "No text available" when no paragraph contains text.
    """
    stripped = (para.get_text().strip() for para in soup.find_all('p'))
    # Empty paragraphs are skipped so that a page consisting only of blank <p>
    # tags falls through to the placeholder instead of returning bare spaces.
    return ' '.join(chunk for chunk in stripped if chunk) or "No text available"

def get_keywords(text):
    """Return up to ten lemmas taken from the start of ``text``.

    The text is run through the module-level ``nlp`` pipeline; tokens that are
    not purely alphabetic or that are stop words are ignored. Lemmas are kept
    in document order and duplicates are not removed.
    """
    lemmas = []
    for token in nlp(text):
        if not token.is_alpha or token.is_stop:
            continue
        lemmas.append(token.lemma_)
    return lemmas[:10]

def process_with_bs_spacy(url, timeout=10):
    """Fetch ``url`` and extract title, summary, keywords and text from it.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on an unresponsive
            host, stalling the whole run.

    Returns:
        A dict with the keys "url", "title", "summary", "keywords"
        (comma-separated string) and "full_text", or None when the request
        does not return HTTP 200 or any step raises. Failures are reported
        with ``print`` rather than propagated, so one bad URL does not abort
        the batch.
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            # Report the skip so missing rows in the results can be explained.
            print(f"Skipping {url} with BeautifulSoup/spaCy: HTTP status {response.status_code}")
            return None
        soup = BeautifulSoup(response.content, 'html.parser')
        title = get_title(soup)
        text, summary = get_text_and_summary(soup)
        keywords = get_keywords(text)
        full_text = get_full_text(soup)
        return {
            "url": url,
            "title": title,
            "summary": summary,
            "keywords": ', '.join(keywords),
            "full_text": full_text
        }
    except Exception as e:
        print(f"Error processing {url} with BeautifulSoup/spaCy: {e}")
    return None

def process_with_newspaper(url):
    """Run the newspaper3k pipeline on ``url`` and collect its outputs.

    Returns:
        A dict with the keys "url", "title", "summary", "keywords"
        (comma-separated string) and "full_text", or None when any step
        raises. Errors are printed instead of propagated.
    """
    try:
        page = Article(url)
        # The stages are strictly ordered: download, then parse, then the NLP step.
        for stage in (page.download, page.parse, page.nlp):
            stage()
        record = {
            "url": url,
            "title": page.title,
            "summary": page.summary,
            "keywords": ', '.join(page.keywords),
            "full_text": page.text
        }
        return record
    except Exception as e:
        print(f"Error processing {url} with newspaper3k: {e}")
    return None


In [None]:

def _collect_results(urls, extractor, description):
    """Apply ``extractor`` to every URL and return the truthy results as a list.

    Args:
        urls: Sized iterable of URLs (its length drives the progress bar).
        extractor: Callable taking one URL and returning a record or None.
        description: Label shown on the progress bar.
    """
    collected = []
    for url in tqdm(urls, desc=description, unit="url"):
        outcome = extractor(url)
        if outcome:
            collected.append(outcome)
    return collected


# The result lists are rebuilt from scratch on every run of this cell, so
# re-executing it does not append a second copy of every row.
start_time_bs_spacy = time.time()
results_bs_spacy = _collect_results(
    df['url'], process_with_bs_spacy, "Processing with BeautifulSoup/spaCy"
)
end_time_bs_spacy = time.time()

start_time_newspaper = time.time()
results_newspaper = _collect_results(
    df['url'], process_with_newspaper, "Processing with newspaper3k"
)
end_time_newspaper = time.time()


In [None]:

# Pin the schema so that a method which produced no results still yields a
# frame with the expected columns; otherwise later cells that read 'full_text'
# fail with KeyError on an empty, column-less frame.
RESULT_COLUMNS = ["url", "title", "summary", "keywords", "full_text"]

df_bs_spacy = pd.DataFrame(results_bs_spacy, columns=RESULT_COLUMNS)
df_newspaper = pd.DataFrame(results_newspaper, columns=RESULT_COLUMNS)


In [None]:

# Report the wall-clock duration of each extraction loop, one line per method.
timing_lines = (
    f"Execution time for BeautifulSoup/spaCy: {end_time_bs_spacy - start_time_bs_spacy} seconds",
    f"Execution time for newspaper3k: {end_time_newspaper - start_time_newspaper} seconds",
)
for line in timing_lines:
    print(line)


In [None]:

def ensure_dir(directory):
    """Create ``directory`` (and any missing parents) if it does not exist.

    Calling this for a directory that already exists is a no-op.

    Raises:
        FileExistsError: If ``directory`` exists but is not a directory.
    """
    # exist_ok avoids the check-then-create race of testing os.path.exists first.
    os.makedirs(directory, exist_ok=True)

# Folder (relative to the working directory) that receives the saved figures;
# it is created up front so saving a plot does not fail on a missing directory.
save_dir = "plots"
ensure_dir(directory=save_dir)

def plot_combined_text_length_distribution(df1, df2, method1, method2):
    """Overlay the word-count histograms of two result frames and save the figure.

    Args:
        df1, df2: DataFrames with a 'full_text' column of strings.
        method1, method2: Legend labels for ``df1`` and ``df2`` respectively.

    Side effects:
        Adds (or overwrites) a 'text_length' column on both input frames,
        writes 'text_length_distribution_comparison.png' into the module-level
        ``save_dir`` folder, and displays the figure.
    """
    def count_words(text):
        # Word count = number of whitespace-separated chunks.
        return len(text.split())

    df1['text_length'] = df1['full_text'].apply(count_words)
    df2['text_length'] = df2['full_text'].apply(count_words)

    fig, ax = plt.subplots(figsize=(12, 6))
    series_specs = ((df1, 'blue', method1), (df2, 'orange', method2))
    for frame, colour, legend_label in series_specs:
        sns.histplot(
            frame['text_length'],
            kde=True,
            bins=30,
            color=colour,
            label=legend_label,
            alpha=0.5,
            ax=ax,
        )

    ax.set_xlabel('Text Length (number of words)')
    ax.set_ylabel('Frequency')
    ax.set_title('Text Length Distribution Comparison')
    ax.legend()

    output_path = os.path.join(save_dir, 'text_length_distribution_comparison.png')
    fig.savefig(output_path)
    plt.show()

# Draw both methods on one set of axes so their word-count distributions can be compared directly.
plot_combined_text_length_distribution(
    df1=df_bs_spacy,
    df2=df_newspaper,
    method1='BeautifulSoup/spaCy',
    method2='newspaper3k',
)


In [None]:

# Persist each method's results table next to the notebook (row index omitted).
csv_targets = (
    (df_bs_spacy, 'results_bs_spacy.csv'),
    (df_newspaper, 'results_newspaper.csv'),
)
for frame, csv_path in csv_targets:
    frame.to_csv(csv_path, index=False)

# Keep the measured durations in a small text file alongside the CSVs.
with open('execution_times.txt', 'w') as f:
    f.writelines([
        f"Execution time for BeautifulSoup/spaCy: {end_time_bs_spacy - start_time_bs_spacy} seconds\n",
        f"Execution time for newspaper3k: {end_time_newspaper - start_time_newspaper} seconds\n",
    ])
