# Scraping Blog Posts off `nutritionfacts.org`

## Libraries

In [1]:
import json
import time
from pathlib import Path

from bs4 import BeautifulSoup
from tqdm import tqdm

from src.web_scraping import (
    extract_all_urls,
    extract_blog_data,
    filter_links,
    get_meta_data,
    get_paragraphs,
    get_webpage_content,
    replace_strange_chars,
)

## Functions

# Parameters

In [None]:
# Resolve the data directories relative to the parent of the current working directory.
data_path = Path(".").resolve().parent / "data"
blog_posts_root: Path = data_path / "blog_posts"
post_path_raw: Path = blog_posts_root / "raw_txt"
post_path_json: Path = blog_posts_root / "json"

# `Path.is_dir()` only returns a bool and never raises, so check explicitly
# and fail fast when one of the expected directories is missing.
for _required_dir in (data_path, post_path_raw, post_path_json):
    if not _required_dir.is_dir():
        raise NotADirectoryError(f"Expected directory does not exist: {_required_dir}")

In [None]:
# Blog landing page used as the crawl root, and the CSV file holding the post URLs.
root_url: str = "https://nutritionfacts.org/blog/"
file_url_list: Path = blog_posts_root.joinpath("blog_posts_urls.csv")

# Code

## Testing connection

In [None]:
# Request the blog landing page; the response is parsed in the next cell.
response = get_webpage_content(root_url)

In [None]:
# Parse the HTML content of the landing page with the stdlib "html.parser" backend
soup = BeautifulSoup(response.content, "html.parser")

In [None]:
# Find all links on the page.
# The set comprehension removes duplicate hrefs; `sorted` then returns a *list*,
# so the variable is annotated as list[str] rather than set[str].
links: list[str] = sorted({link["href"] for link in soup.find_all("a", href=True)})
print("Number of links:", len(links))

In [None]:
# Filter the links found on the landing page against the blog root URL
# (the exact filtering rules live in `filter_links`).
blog_posts_of_page: list[str] = filter_links(links, root_url)
n_posts: int = len(blog_posts_of_page)
print(f"Number of blog posts: {n_posts}")

## Extract urls of all blog posts

In [None]:
# Collect the post URLs starting from the blog root.
# NOTE(review): page_stop=None presumably means "no page limit" — confirm in extract_all_urls.
urls_list: list[str] = extract_all_urls(root=root_url, page_stop=None)

In [None]:
# Deduplicate the collected URLs
blog_post_urls_set = set(urls_list)
print("Number of unique blog posts:", len(blog_post_urls_set))
# Recorded output of a previous run:
# Number of blog posts: 1285

In [None]:
# Post-processing: URLs whose tail (root and slashes removed) is purely numeric
# are not blog posts, so they are reported and dropped from the set.
non_post_urls = [
    candidate
    for candidate in blog_post_urls_set
    if candidate.replace(root_url, "").replace("/", "").isdigit()
]
for url in non_post_urls:
    print(url)
    blog_post_urls_set.remove(url)
print("Number of unique blog posts:", len(blog_post_urls_set))
# Recorded output of a previous run:
# Number of unique blog posts: 1281

In [None]:
# Export the URLs to the csv file, one URL per line (sorted for stable diffs).
# `file_url_list` already contains `blog_posts_root`, so it is opened directly
# instead of being joined with the root a second time; the encoding is explicit
# so the file does not depend on the platform default.
with open(file_url_list, "w", encoding="utf-8") as f:
    f.writelines(f"{url}\n" for url in sorted(blog_post_urls_set))

## Extract content of each blog post

In [None]:
# Read the URLs back from the csv file (one URL per line).
# `file_url_list` already contains `blog_posts_root`, so it is opened directly;
# the encoding is explicit so reading does not depend on the platform default.
with open(file_url_list, encoding="utf-8") as f:
    urls_list: list[str] = f.read().splitlines()

### Testing

In [None]:
# Use the post at index 1111 as the sample for the experiments below
# (raises IndexError if the list holds fewer than 1112 URLs).
blog_post_url = urls_list[1111]
# Tail of the URL: root URL and all slashes removed; used below as a file name stem
url_tail = blog_post_url.replace(root_url, "").replace("/", "")
url_tail

In [None]:
# Display the full URL of the sample post
blog_post_url

In [None]:
# Request the sample blog post
response = get_webpage_content(blog_post_url)
# Parse the HTML content with the stdlib "html.parser" backend
soup = BeautifulSoup(response.content, "html.parser")

In [None]:
# Write the parsed page to "<url_tail>.html" in the working directory.
# The encoding is explicit (as in the JSON/text export cells) so characters
# outside the platform's default encoding cannot raise UnicodeEncodeError.
with open(f"{url_tail}.html", "w", encoding="utf-8") as f:
    f.write(str(soup))

#### pure content

In [None]:
# Collect every <p class="p1"> element and join their texts,
# separating paragraphs with a blank line.
paragraphs_raw = soup.find_all("p", class_="p1")
paragraph_texts = [paragraph.get_text() for paragraph in paragraphs_raw]
content = "\n\n".join(paragraph_texts)
paragraphs_raw

In [None]:
# Write the joined paragraph text to "<url_tail>.txt" in the working directory.
# The encoding is explicit (as in the extraction loops) so characters outside
# the platform's default encoding cannot raise UnicodeEncodeError.
with open(f"{url_tail}.txt", "w", encoding="utf-8") as f:
    f.write(content)

#### meta data

In [None]:
# Run the project's meta-data helper on the sample page and display the result
meta_data = get_meta_data(soup)
meta_data

In [None]:
# Text of the <h1 class="entry-title"> element
# (an AttributeError is raised if the page has no such element).
title_tag = soup.find("h1", class_="entry-title")
title_text = title_tag.get_text()
title_text

In [None]:
# `datetime` attribute of the first <time> element carrying the "updated" class
# NOTE(review): a tag with class "updated" feeds `date_created` — confirm the
# two variable names are not swapped.
updated_time_tag = soup.find("time", class_="updated")
date_created = updated_time_tag["datetime"]

# `datetime` attribute of the second <time> element in document order
all_time_tags = soup.find_all("time")
date_last_update = all_time_tags[1]["datetime"]

print("Datetime 01:", date_created)
print("Datetime 02:", date_last_update)

#### paragraphs

In [None]:
# Run the project's paragraph helper on the sample page and display the result;
# the following cells rebuild the same kind of list step by step.
paragraphs_clean = get_paragraphs(soup)
paragraphs_clean

In [None]:
# Prefer <p class="p1"> elements; when the page has none (empty result),
# fall back to every <p> element.
paragraphs_html: list = soup.find_all("p", class_="p1") or soup.find_all("p")

In [None]:
# Plain text of every selected paragraph element (not yet stripped or filtered)
paragraphs_raw: list[str] = [para.get_text() for para in paragraphs_html]
paragraphs_raw

In [None]:
# Text of every paragraph element with surrounding whitespace removed
paragraphs_raw: list[str] = [para_html.get_text().strip() for para_html in paragraphs_html]

# Paragraphs starting with any of these phrases are boilerplate and get dropped.
# NOTE(review): the double space in "We  our volunteers!" suggests a character
# may have been lost — verify against the live page.
exclude_startswith: list[str] = [
    "Written By",
    "Image Credit",
    "In health",
    "Michael Greger",
    "PS:",
    "A founding member",
    "Subscribe",
    "Catch up",
    "Charity ID",
    "We  our volunteers!",
    "Interested in learning more about",
    "Check out:",
]
# `str.startswith` accepts a tuple of prefixes, so one call covers the whole list
excluded_prefixes = tuple(exclude_startswith)

# Keep non-empty paragraphs that do not start with an excluded phrase,
# normalising their characters with `replace_strange_chars`.
paragraphs_clean: list[str] = [
    replace_strange_chars(text)
    for text in paragraphs_raw
    if text and not text.startswith(excluded_prefixes)
]
paragraphs_clean

#### Extract key takeaways

In [None]:
# Locate the <p> whose text is exactly "KEY TAKEAWAYS"; None when the post has none
key_takeaways_heading = soup.find("p", string="KEY TAKEAWAYS")

# Find the next <ul> element after the heading (None without a heading or a list)
key_takeaways_list = (
    key_takeaways_heading.find_next("ul") if key_takeaways_heading is not None else None
)

if key_takeaways_list is None:
    key_takeaways: list[str] = []
else:
    # Extract the text from each <li> in the list.
    # `str.strip` (not the non-existent `stripe`) removes surrounding whitespace.
    key_takeaways = [
        replace_strange_chars(li.get_text().strip())
        for li in key_takeaways_list.find_all("li")
    ]

# Print or use the extracted key takeaways
for takeaway in key_takeaways:
    print(takeaway)

#### article tags

In [None]:
# CSS classes of the <article> element; tags and categories are encoded there
# as "tag-<slug>" and "category-<slug>". A page without <article> yields None.
article = soup.find("article")
tags_raw = article.get("class") if article is not None else None
if tags_raw:
    # Split only on the first hyphen so multi-word slugs (e.g. "tag-heart-disease")
    # keep their full value instead of being cut at the second hyphen.
    tags_blog = [tag.split("-", 1)[1] for tag in tags_raw if tag.startswith("tag-")]
    print(tags_blog)
    cats = [cat.split("-", 1)[1] for cat in tags_raw if cat.startswith("category-")]
    print(cats)

#### export to json

In [None]:
# Run the project's extraction helper on the sample page; the result is
# serialised to JSON in the next cell.
blog_data = extract_blog_data(soup)

In [None]:
# Serialise the extracted data (ASCII-escaped, 4-space indent) and write it
# to "<url_tail>.json" in the working directory.
serialized = json.dumps(blog_data, ensure_ascii=True, indent=4)
with open(f"{url_tail}.json", "w", encoding="utf-8") as json_file:
    json_file.write(serialized)

### Real extraction loop

#### pure text

In [None]:
# Pure text: store the text of all <p> elements of every post as
# "<url_tail>.txt" under `post_path_raw`. Posts whose file already exists are
# skipped, so an interrupted run can simply be restarted.
for url in tqdm(urls_list):
    url_tail = url.replace(root_url, "").replace("/", "")
    file_out = post_path_raw / f"{url_tail}.txt"
    if file_out.exists():
        continue

    # throttle the requests to avoid being blocked
    time.sleep(0.5)

    # download and parse the page
    response = get_webpage_content(url)
    soup = BeautifulSoup(response.content, "html.parser")

    # paragraphs are separated by a blank line in the output
    paragraphs = soup.find_all("p")
    content = "\n\n".join(para.get_text() for para in paragraphs)

    file_out.write_text(content, encoding="utf-8")

# 100%|██████████| 1281/1281 [28:03<00:00,  1.31s/it]

#### meta data and text chunks

In [None]:
# Meta data and text chunks: store the extracted data of every post as
# "<url_tail>.json" under `post_path_json`. Posts whose file already exists
# are skipped, so an interrupted run can simply be restarted.
for url in tqdm(urls_list):
    url_tail = url.replace(root_url, "").replace("/", "")
    file_out = post_path_json / f"{url_tail}.json"
    if file_out.exists():
        continue

    # throttle the requests to avoid being blocked
    time.sleep(0.1)

    # download and parse the page
    response = get_webpage_content(url)
    soup = BeautifulSoup(response.content, "html.parser")

    # the source URL comes first, followed by the extracted fields
    blog_data: dict = {"url": url}
    blog_data.update(extract_blog_data(soup))

    # ASCII-escaped JSON with a 4-space indent
    file_out.write_text(
        json.dumps(blog_data, ensure_ascii=True, indent=4),
        encoding="utf-8",
    )
# 100%|██████████| 1281/1281 [22:06<00:00,  1.04s/it]