# Scraping Blog Posts from `nutritionfacts.org`

## Libraries

In [123]:
import json
from bs4 import BeautifulSoup
import requests
import time
from pathlib import Path
from tqdm import tqdm

## Functions

In [169]:

def get_webpage_content(url, timeout: float = 30.0) -> requests.Response | None:
    """Download *url* and return the HTTP response, or ``None`` on failure.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single stalled
            connection would hang the whole crawl.

    Returns:
        The response when the request succeeded with a non-error status code,
        otherwise ``None`` (the error is printed, not raised).
    """
    try:
        response = requests.get(
            url,
            headers={"User-Agent": "Mozilla/5.0"},
            timeout=timeout,
        )
        # turn 4xx/5xx status codes into an exception handled below
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        # timeouts are RequestException subclasses, so they end up here too
        print(f"Error: {e}")
        return None

    return response

def filter_links(links: list[str], root: str) -> list[str]:
    """Keep only the links that point below *root* and are not index pages.

    A link is kept when it starts with ``root`` and the remainder (``root``
    removed) is non-empty and does not start with ``"page"``. Note that this
    also drops any link whose remainder merely begins with "page".

    Args:
        links: Candidate URLs.
        root: URL prefix every kept link must start with.

    Returns:
        The kept links, in their original order.
    """

    def _is_post(href: str) -> bool:
        # links outside the blog root are never posts
        if not href.startswith(root):
            return False
        remainder = href.replace(root, "")
        # an empty remainder is the root itself; "page..." is pagination
        return bool(remainder) and not remainder.startswith("page")

    return [href for href in links if _is_post(href)]

def extract_all_urls(root: str, page_stop: int | None = None ) -> list[str]:
    """Walk the paginated blog index and collect the post URLs it links to.

    The first index page is ``root`` itself; page *n* (n >= 2) is
    ``{root}page/{n}/``. Crawling ends when ``page_stop`` pages have been
    visited, when a page cannot be downloaded, or when a page yields fewer
    than two post links (the links of that last page are not collected).

    Args:
        root: URL of the blog index; also the prefix used to recognise posts.
        page_stop: Optional maximum number of index pages to visit (debug aid).

    Returns:
        Post URLs in crawl order. Links are de-duplicated within a page but
        not across pages.
    """
    collected: list[str] = []
    page_no: int = 1
    while True:
        time.sleep(0.2)  # wait a bit to avoid being blocked

        # for debug only
        if page_stop is not None and page_no > page_stop:
            return collected

        page_url = root if page_no == 1 else f"{root}page/{page_no}/"
        print(f"{page_no}. Page URL: {page_url}")

        # download the index page; a failed request ends the crawl
        response = get_webpage_content(page_url)
        if response is None:
            return collected

        # parse the page and gather the unique targets of all anchors
        soup = BeautifulSoup(response.content, "html.parser")
        hrefs = {anchor["href"] for anchor in soup.find_all("a", href=True)}

        # keep only links that look like blog posts
        page_posts: list[str] = filter_links(sorted(hrefs), root)
        print(f"\t Number of blog posts: {len(page_posts)}")

        # page needs at least 2 posts to be considered, otherwise it's at the last page
        if len(page_posts) < 2:
            return collected

        collected += page_posts
        page_no += 1

def replace_strange_chars(text: str) -> str:
    """Return *text* with typographic punctuation replaced by plain ASCII.

    Curly double and single quotes all become a straight apostrophe, the
    ellipsis character becomes three dots, and the long dash becomes a hyphen.
    """
    ascii_equivalents = {
        "“": "'",
        "”": "'",
        "’": "'",
        "‘": "'",
        "…": "...",
        "—": "-",
    }
    # str.translate expects a table keyed by Unicode code point
    table = {ord(fancy): plain for fancy, plain in ascii_equivalents.items()}
    return text.translate(table)

def get_meta_data(soup: BeautifulSoup) -> dict:
    """Collect the title and the two timestamps of a parsed blog post.

    Returns:
        A dict with the keys ``"title"`` (text of the ``h1.entry-title``
        element), ``"created"`` (``datetime`` attribute of the first
        ``<time>`` tag with class ``updated``) and ``"updated"``
        (``datetime`` attribute of the second ``<time>`` tag on the page).

    Raises:
        AttributeError, TypeError, IndexError: when the page lacks one of
            the expected elements.
    """
    # NOTE(review): "created" is read from the tag whose class is "updated";
    # presumably this mirrors the site's markup - confirm against a live page.
    title_tag = soup.find('h1', class_='entry-title')
    created_tag = soup.find('time', class_='updated')
    all_time_tags = soup.find_all('time')
    return {
        "title": title_tag.get_text(),
        "created": created_tag['datetime'],
        "updated": all_time_tags[1]['datetime'],
    }

def get_paragraphs(soup: BeautifulSoup) -> list[str]:
    """Return the cleaned text paragraphs of a parsed blog post.

    ``<p>`` tags with class ``p1`` are preferred; when the page has none,
    every ``<p>`` tag is used instead. Each paragraph's text is stripped;
    empty paragraphs and those starting with a known boilerplate prefix are
    dropped, and the rest are passed through ``replace_strange_chars``.
    """
    # fall back to all paragraphs when no "p1" paragraphs exist
    candidates = soup.find_all("p", class_='p1') or soup.find_all("p")

    # Prefixes that mark boilerplate rather than article content.
    # NOTE(review): "We  our volunteers!" looks like it lost a character
    # (possibly an emoji) between the two spaces - confirm against the site.
    boilerplate_prefixes: tuple[str, ...] = (
        "Written By",
        "Image Credit",
        "In health",
        "Michael Greger",
        "PS:",
        "A founding member",
        "Subscribe",
        "Catch up",
        "Charity ID",
        "We  our volunteers!",
        "Interested in learning more about",
        "Check out:",
    )

    cleaned: list[str] = []
    for tag in candidates:
        text = tag.get_text().strip()
        # skip empty paragraphs and boilerplate
        if not text or text.startswith(boilerplate_prefixes):
            continue
        cleaned.append(replace_strange_chars(text))
    return cleaned

def get_key_takeaways(soup: BeautifulSoup) -> list[str]:
    """Return the bullet points listed under the "KEY TAKEAWAYS" heading.

    The heading is a ``<p>`` whose text is exactly ``"KEY TAKEAWAYS"``; the
    takeaways are the ``<li>`` items of the first ``<ul>`` that follows it.

    Returns:
        The stripped, character-normalised text of each list item, or an
        empty list when the page has no such heading or no list after it.
    """
    key_takeaways_heading = soup.find('p', string="KEY TAKEAWAYS")
    if key_takeaways_heading is None:
        return []

    # Find the next <ul> element after the "KEY TAKEAWAYS" heading
    key_takeaways_list = key_takeaways_heading.find_next('ul')
    if key_takeaways_list is None:
        # heading without a following list: nothing to extract
        return []

    # Extract the text from each <li> in the list
    return [
        replace_strange_chars(li.get_text().strip())
        for li in key_takeaways_list.find_all('li')
    ]

# Parameters

In [174]:
# Directory layout: <parent of the working directory>/data/blog_posts/{raw_txt,json}
data_path = Path(".").resolve().parent / "data"
blog_posts_root: Path = data_path / "blog_posts"
post_path_raw: Path = blog_posts_root / "raw_txt"
post_path_json: Path = blog_posts_root / "json"

# Path.is_dir() only returns False for a missing directory, it never raises,
# so check explicitly and fail loudly before any scraping starts.
for required_dir in (data_path, post_path_raw, post_path_json):
    if not required_dir.is_dir():
        raise NotADirectoryError(f"Required directory does not exist: {required_dir}")

post_path_json.is_dir()

True

In [54]:
# Blog index page; also the prefix used to recognise blog post URLs.
root_url: str = "https://nutritionfacts.org/blog/"
# File holding the collected blog post URLs, one per line.
file_url_list: Path = blog_posts_root.joinpath("blog_posts.csv")

# Code

## Testing connection

In [None]:
# Fetch the blog index page; get_webpage_content returns None if the request fails
response = get_webpage_content(root_url)

In [None]:
# Parse the HTML content of the index page with the built-in "html.parser" backend
soup = BeautifulSoup(response.content, "html.parser")

In [None]:
# Find all links on the page: unique href values of the anchors, sorted into a list
links: list[str] = sorted({link["href"] for link in soup.find_all("a", href=True)})
print("Number of links:", len(links))

In [None]:
# filter the links: keep only those below root_url that are not pagination links
blog_posts_of_page: list[str] = filter_links(links, root_url)
n_posts: int = len(blog_posts_of_page)
print(f"Number of blog posts: {n_posts}")

## Extract urls of all blog posts

In [None]:
# Crawl every index page; page_stop=None means no page limit, so the crawl
# runs until a request fails or a page yields fewer than two post links
urls_list: list[str] = extract_all_urls(root=root_url, page_stop = None )

In [None]:
# De-duplicate the collected URLs (the same link can be collected from several index pages)
blog_post_urls_set = set(urls_list)
print("Number of unique blog posts:", len(blog_post_urls_set))
# Output recorded from an earlier run:
# Number of blog posts: 1285

In [None]:
# post processing: drop URLs whose path below root_url consists of digits only
for url in tuple(blog_post_urls_set):  # iterate over a snapshot, the set is mutated below
    link_tail: str = url.replace(root_url, "").replace("/", "")
    if not link_tail.isdigit():
        continue
    # purely numeric tails are not blog posts; report and remove them
    print(url)
    blog_post_urls_set.discard(url)
print("Number of unique blog posts:", len(blog_post_urls_set))
# Number of unique blog posts: 1281

In [None]:
# export to csv file (one URL per line, sorted)
# file_url_list already contains blog_posts_root, so it is opened directly;
# the encoding is explicit so the file does not depend on the platform default
with open(file_url_list, "w", encoding="utf-8") as f:
    f.writelines(f"{url}\n" for url in sorted(blog_post_urls_set))

## Extract content of each blog post

In [55]:
# read from csv file (one URL per line)
# file_url_list already contains blog_posts_root, so it is opened directly;
# the encoding is explicit so reading does not depend on the platform default
with open(file_url_list, "r", encoding="utf-8") as f:
    urls_list: list[str] = f.read().splitlines()

### Testing

In [144]:
# Pick one post as a test case and derive a file-name stem from its URL
# (the part after root_url with all slashes removed)
blog_post_url = urls_list[25]
url_tail = blog_post_url.replace(root_url, "").replace("/", "")
url_tail

'after-marijuana-legalization-did-opioid-overdoses-go-up-stay-the-same-or-go-down'

In [145]:
# Display the full URL of the selected test post
blog_post_url

'https://nutritionfacts.org/blog/after-marijuana-legalization-did-opioid-overdoses-go-up-stay-the-same-or-go-down/'

In [146]:
# Download the test post; get_webpage_content returns None if the request fails
response = get_webpage_content(blog_post_url)
# Parse the HTML content
soup = BeautifulSoup(response.content, "html.parser")

In [147]:
# write the parsed page to an .html file in the working directory
# the page contains non-ASCII characters (curly quotes etc.), so the encoding
# is fixed to UTF-8 instead of relying on the platform default
with open(f"{url_tail}.html", "w", encoding="utf-8") as f:
    f.write(str(soup))

#### pure content

In [148]:
# Extract the content you are interested in: the <p> tags with class "p1"
paragraphs_raw = soup.find_all("p", class_='p1')
# Join the paragraph texts, separated by blank lines
content = "\n\n".join(para.get_text() for para in paragraphs_raw)
paragraphs_raw

[<p class="p1">What happened in states after medical marijuana laws were passed? Did opioid overdoses go up, stay the same, or go down?</p>,
 <p class="p1">Millions of people in the United States have been <a href="https://www.ncbi.nlm.nih.gov/pubmed/28162799" rel="noopener noreferrer" target="_blank"><span class="s1">diagnosed</span></a> with an opioid use disorder, and more than 80 Americans <span class="s2">die</span> each day from opioid overdose. Where is this coming from? Most “new heroin users started out misusing opioid prescription painkillers.” This is important because more than 200 million opioid painkiller prescriptions are still written every year. Did you catch that number? Two hundred million prescriptions every year, “a number closely approximating the entire adult population in the United States.” That’s incredible.</p>,
 <p class="p1">“‘When you <a href="https://nutritionfacts.org/video/do-potatoes-increase-the-risk-of-high-blood-pressure-and-death/" rel="noopener no

In [149]:
# write the joined paragraph text to a .txt file in the working directory
# the text contains non-ASCII characters (curly quotes etc.), so the encoding
# is fixed to UTF-8 instead of relying on the platform default
with open(f"{url_tail}.txt", "w", encoding="utf-8") as f:
    f.write(content)


#### meta data

In [166]:
# Collect title and timestamps of the test post with the helper function
meta_data= get_meta_data(soup)
meta_data

{'title': 'After Marijuana Legalization Did Opioid Overdoses Go Up, Stay the Same, or Go Down?',
 'created': '2022-05-31T12:00:33+00:00',
 'updated': '2022-12-13T13:17:33-05:00'}

In [150]:
# Step-by-step version of get_meta_data: the title is the text of the h1.entry-title element
title_text = soup.find('h1', class_='entry-title').get_text()
title_text

'After Marijuana Legalization Did Opioid Overdoses Go Up, Stay the Same, or Go Down?'

In [151]:
# Extract the first datetime value (from the first <time> tag with class "updated")
date_created = soup.find('time', class_='updated')['datetime']

# Extract the second datetime value (using the second <time> tag)
date_last_update = soup.find_all('time')[1]['datetime']

# Show both values for a visual check
print("Datetime 01:", date_created)
print("Datetime 02:", date_last_update)

Datetime 01: 2022-05-31T12:00:33+00:00
Datetime 02: 2022-12-13T13:17:33-05:00


#### paragraphs

In [163]:
# Extract the cleaned paragraphs of the test post with the helper function
paragraphs_clean = get_paragraphs(soup)
paragraphs_clean

['What happened in states after medical marijuana laws were passed? Did opioid overdoses go up, stay the same, or go down?',
 "Millions of people in the United States have been diagnosed with an opioid use disorder, and more than 80 Americans die each day from opioid overdose. Where is this coming from? Most 'new heroin users started out misusing opioid prescription painkillers.' This is important because more than 200 million opioid painkiller prescriptions are still written every year. Did you catch that number? Two hundred million prescriptions every year, 'a number closely approximating the entire adult population in the United States.' That's incredible.",
 "''When you see something like the opioid addiction crisis blossoming in so many states around this country, the last thing we should be doing is encouraging people' to smoke cannabis, [White House Spokesperson Sean] Spicer told reporters.' But, if opioid addiction starts with people taking prescription pain pills, maybe cannab

In [153]:
# Step-by-step version of get_paragraphs: prefer <p class="p1"> tags ...
paragraphs_html: list = soup.find_all("p", class_='p1')
# ... and fall back to every <p> tag when the page has none
if not paragraphs_html:
    paragraphs_html = soup.find_all("p")

In [154]:
# Plain text of each paragraph, before stripping, filtering and character replacement
paragraphs_raw: list[str] = [para.get_text() for para in paragraphs_html]
paragraphs_raw

['What happened in states after medical marijuana laws were passed? Did opioid overdoses go up, stay the same, or go down?',
 'Millions of people in the United States have been diagnosed with an opioid use disorder, and more than 80 Americans die each day from opioid overdose. Where is this coming from? Most “new heroin users started out misusing opioid prescription painkillers.” This is important because more than 200 million opioid painkiller prescriptions are still written every year. Did you catch that number? Two hundred million prescriptions every year, “a number closely approximating the entire adult population in the United States.” That’s incredible.',
 '“‘When you see something like the opioid addiction crisis blossoming in so many states around this country, the last thing we should be doing is encouraging people’ to smoke cannabis, [White House Spokesperson Sean] Spicer told reporters.” But, if opioid addiction starts with people taking prescription pain pills, maybe cannab

In [156]:
# Extract and clean paragraphs while excluding those that start with certain phrases
paragraphs_raw: list[str] = [para_html.get_text().strip() for para_html in paragraphs_html]
# NOTE(review): "We  our volunteers!" looks like it lost a character (possibly an emoji) - confirm against the site
exclude_startswith: list[str] = ["Written By","Image Credit","In health","Michael Greger","PS:","A founding member","Subscribe","Catch up","Charity ID","We  our volunteers!","Interested in learning more about","Check out:"]
# Create clean list: drop empty paragraphs and boilerplate, normalise typographic characters
paragraphs_clean: list[str] = [
    replace_strange_chars(para_raw)
    for para_raw in paragraphs_raw
    if para_raw and not any(para_raw.startswith(prefix) for prefix in exclude_startswith)
]
# The result is intentionally not stored in blog_content here: that dict is
# only created in the "export to json" step further down, so assigning into
# it at this point raises NameError when the notebook is run top to bottom.
paragraphs_clean

['What happened in states after medical marijuana laws were passed? Did opioid overdoses go up, stay the same, or go down?',
 "Millions of people in the United States have been diagnosed with an opioid use disorder, and more than 80 Americans die each day from opioid overdose. Where is this coming from? Most 'new heroin users started out misusing opioid prescription painkillers.' This is important because more than 200 million opioid painkiller prescriptions are still written every year. Did you catch that number? Two hundred million prescriptions every year, 'a number closely approximating the entire adult population in the United States.' That's incredible.",
 "''When you see something like the opioid addiction crisis blossoming in so many states around this country, the last thing we should be doing is encouraging people' to smoke cannabis, [White House Spokesperson Sean] Spicer told reporters.' But, if opioid addiction starts with people taking prescription pain pills, maybe cannab

#### Extract key takeaways

In [159]:
# Step-by-step version of get_key_takeaways
key_takeaways_heading = soup.find('p', string="KEY TAKEAWAYS")

# Find the next <ul> element after the "KEY TAKEAWAYS" heading
# (None when the heading or the list is missing)
key_takeaways_list = None if key_takeaways_heading is None else key_takeaways_heading.find_next('ul')

if key_takeaways_list is None:
    key_takeaways = []
else:
    # Extract the text from each <li> in the list
    # (str.strip, not "stripe", which does not exist and raised AttributeError)
    key_takeaways = [replace_strange_chars(li.get_text().strip()) for li in key_takeaways_list.find_all('li')]

# Print or use the extracted key takeaways
for takeaway in key_takeaways:
    print(takeaway)



More than 200 million opioid painkiller prescriptions are written annually despite the diagnosis of millions in the United States with an opioid use disorder and more than 80 Americans dying every day from opioid overdose.
Might cannabis act as a gateway to harder drugs, like opioids, or might it reduce opioid addiction by offering a substitute painkiller to prescription pills?
The American Medical Association's official position is that marijuana 'has no scientifically proven, currently accepted medical use for preventing or treating any disease,' but studies have found that cannabis compounds produce pain relief 'equivalent to moderate doses of codeine,' an opioid used to treat mild to moderate pain.
At the end of life, cannabis may allow patients to reduce opiate doses without compromising pain relief such that they may not be in such a drug-induced stupor that they cannot say goodbye.
Most New Englanders taking opioids claimed they reduced their opioid use after starting medical ca

#### export to json

In [170]:
# Assemble everything extracted from the test post into one dict:
# the meta data entries first, then the paragraphs and the key takeaways
blog_content = {
    **get_meta_data(soup),
    "paragraphs": get_paragraphs(soup),
    "key_takeaways": get_key_takeaways(soup),
}

In [171]:
# write to json file in the working directory
# explicit UTF-8, consistent with the export in the real extraction loop
with open(f"{url_tail}.json", "w", encoding="utf-8") as f:
    json.dump(blog_content, f)


### Real extraction loop

#### pure text

In [57]:
# pure text: store the text of all <p> tags of every post as a .txt file
for url in tqdm(urls_list):
    url_tail = url.replace(root_url, "").replace("/", "")
    file_out = post_path_raw / f"{url_tail}.txt"
    # posts that were already exported are skipped, so the loop can be re-run
    if file_out.exists():
        continue

    time.sleep(0.5) # wait a bit to avoid being blocked

    # get the HTML content
    response = get_webpage_content(url)
    if response is None:
        # download failed (the error was already printed); skip this post
        # instead of crashing on response.content and aborting the whole run
        continue

    # Parse the HTML content
    soup = BeautifulSoup(response.content, "html.parser")

    # Extract the content
    paragraphs = soup.find_all("p")
    content = "\n\n".join(para.get_text() for para in paragraphs)

    # export to file
    with open(file_out, "w", encoding="utf-8") as f:
        f.write(content)

# 100%|██████████| 1281/1281 [28:03<00:00,  1.31s/it]

100%|██████████| 1281/1281 [00:00<00:00, 34354.04it/s]


#### meta data and text chunks

In [None]:

# meta data, cleaned paragraphs and key takeaways of every post as a .json file
for url in tqdm(urls_list):
    url_tail = url.replace(root_url, "").replace("/", "")
    file_out = post_path_json / f"{url_tail}.json"
    # posts that were already exported are skipped, so the loop can be re-run
    if file_out.exists():
        continue

    time.sleep(0.1) # wait a bit to avoid being blocked

    # get the HTML content
    response = get_webpage_content(url)
    if response is None:
        # download failed (the error was already printed); skip this post
        # instead of crashing on response.content and aborting the whole run
        continue

    # Parse the HTML content
    soup = BeautifulSoup(response.content, "html.parser")

    blog_content = { }
    blog_content.update(get_meta_data(soup))
    blog_content["paragraphs"] = get_paragraphs(soup)
    blog_content["key_takeaways"] = get_key_takeaways(soup)

    # export to file
    with open(file_out, "w", encoding="utf-8") as f:
        json.dump(blog_content, f)