Data Exploration:

TechBlogs are HTML pages from a WordPress blog. Each post has a leading title and is often further divided into sections of one or more paragraphs. The blogs contain a mix of regular text, code, and images.

If we "embed" the text of each post, we get a floating-point vector with hundreds or even thousands of dimensions quantifying the "meaning" of each blog post. To visualize this, we can reduce the dimensionality of these vectors so that each blog is represented by a 3D point, which we can easily plot. Note how the embeddings naturally cluster the blogs; we have color-coded the clusters. Orange might represent blogs in the realm of healthcare and life sciences, magenta might capture blogs in the realm of robotics, and so on.


In [None]:
# Download the data

import math
import os
import json
import asyncio
import httpx
import time
import shutil
from datetime import datetime

POSTS_PER_PAGE = 25  # using 100 can make the HTML response so long that the text gets cut off
MAX_PAGE = 8  # 8 pages x 25 posts = up to 200 posts in total. Increase to download more articles


def download(session, headers, wp, data_dir, *, max_page=None, posts_per_page=None):
    """Download blog posts page by page from a WordPress REST API into JSON files.

    Pages are requested from ``https://{wp}/wp-json/wp/v2/posts`` starting at
    page 1. Each successful response is dumped to
    ``<data_dir>/<timecode>_<page>.json``, where ``<timecode>`` is the local
    start time as ``YYYYMMDDHHMMSS`` and ``<page>`` is zero-padded to the
    number of digits of ``max_page``, so the files sort in download order.

    The loop stops when a page holds fewer than ``posts_per_page`` posts, when
    a request returns a status code other than 200 (the failure is printed,
    not raised), or after ``max_page`` pages.

    Args:
        session: HTTP client exposing ``get(url, headers=...)`` whose responses
            provide ``status_code``, ``json()`` and ``text``.
        headers: Headers forwarded with every request.
        wp: Host name of the WordPress site (no scheme).
        data_dir: Directory receiving the JSON dumps; created if missing.
        max_page: Highest page number to request. Defaults to ``MAX_PAGE``.
        posts_per_page: Page size to request. Defaults to ``POSTS_PER_PAGE``.
    """
    # Resolve the defaults at call time so reassigning the module constants
    # (e.g. in a later notebook cell) is still honoured.
    if max_page is None:
        max_page = MAX_PAGE
    if posts_per_page is None:
        posts_per_page = POSTS_PER_PAGE

    # Fixed-width fields keep the prefix unambiguous and lexicographically
    # sortable (unpadded, 2024-1-12 and 2024-11-2 would both start "2024112").
    start_timecode = datetime.now().strftime("%Y%m%d%H%M%S")
    # Digit count of the largest page number; ceil(log10(n)) is one short for
    # exact powers of ten (10 -> 1) and yields 0 for a single page.
    padding_width = len(str(max_page))

    os.makedirs(data_dir, exist_ok=True)

    print(f"Downloading up to {max_page * posts_per_page} posts...")
    for current_page in range(1, max_page + 1):  # pages are 1-indexed
        response = session.get(
            f"https://{wp}/wp-json/wp/v2/posts?page={current_page}&per_page={posts_per_page}",
            headers=headers,
        )
        if response.status_code != 200:
            print(
                f"Download of page {current_page} failed with status code {response.status_code}. {response.text}"
            )
            break

        response_json = response.json()
        page_path = os.path.join(
            data_dir,
            f"{start_timecode}_{str(current_page).zfill(padding_width)}.json",
        )
        with open(page_path, "w", encoding="utf-8") as dump_file:
            json.dump(response_json, dump_file)

        print(f"Page {current_page}. Downloaded {len(response_json)} posts")

        # A short page means there is nothing left to fetch.
        if len(response_json) < posts_per_page:
            print(
                f"Downloaded all ({posts_per_page * (current_page - 1) + len(response_json)} posts)"
            )
            break

In [None]:
# Directory that holds the downloaded tech-blog JSON dumps: <cwd>/data/techblogs
data_dir = os.path.join(os.getcwd(), "data", "techblogs")

In [None]:
# Organize the tech blogs

# Only files ending in ".json" are page dumps; a substring test would also
# pick up names such as "page.json.bak".
file_list = [x for x in sorted(os.listdir(data_dir)) if x.endswith('.json')]

# Maps post URL -> raw post dict as stored in the JSON dumps.
techblogs_dict = {}

for filename in file_list:
    with open(os.path.join(data_dir, filename), 'r', encoding='utf-8') as in_file:
        data = json.load(in_file)
    for item in data:
        # keep only posts hosted under developer.nvidia.com/blog; everything
        # else (including blogs.nvidia.com) is skipped
        if not item['link'].startswith("https://developer.nvidia.com/blog"):
            continue
        # keyed by URL, so a post present in several dumps is stored once and
        # the entry from the last file in sorted order wins
        techblogs_dict[item['link']] = item


Data Chunking Using FastAPI

In [None]:
# Shared HTTP client passed to every chunk_request(client, ...) call below.
client = httpx.Client()

chunking_url = "http://chunking:5005/api/chunking"


def chunk_request(client, request_body):
    """POST ``request_body`` to the chunking service and return the decoded reply.

    Args:
        client: HTTP client exposing ``post(url, json=..., timeout=...)``,
            e.g. an ``httpx.Client``.
        request_body: JSON-serialisable dict describing the chunking job.

    Returns:
        The JSON-decoded response body.

    Raises:
        Whatever ``raise_for_status()`` of the client's response raises on a
        4xx/5xx answer (``httpx.HTTPStatusError`` for an httpx client).
    """
    chunking_resp = client.post(chunking_url, json=request_body, timeout=30)
    # Fail here on an error status instead of handing an error payload to
    # callers that expect a list of chunks.
    chunking_resp.raise_for_status()
    return chunking_resp.json()


# Sentence-by-sentence chunking of the first example post.
item = example1

# Rendered HTML to chunk, plus the fields sent along as additional metadata.
document_title = item["title"]["rendered"]
document_url = item["link"]
document_html = item["content"]["rendered"]
document_date = item["date_gmt"]
document_date_modified = item["modified_gmt"]

# No size or overlap settings are sent here. The return value is not stored;
# in a notebook it is shown as the cell output.
chunk_request(
    client,
    dict(
        strategy="sentence",
        input_type="html",
        input_str=document_html,
        additional_metadata=dict(
            document_title=document_title,
            document_url=document_url,
            document_date=document_date,
            document_date_modified=document_date_modified,
        ),
    ),
)

# Same document and strategy, now also sending chunk_min_words / chunk_overlap_words
# (presumably a minimum chunk size and an overlap, both in words).
chunks = chunk_request(
    client,
    dict(
        strategy="sentence",
        chunk_min_words=250,
        chunk_overlap_words=50,
        input_type="html",
        input_str=document_html,
        additional_metadata=dict(
            document_title=document_title,
            document_url=document_url,
            document_date=document_date,
            document_date_modified=document_date_modified,
        ),
    ),
)
chunks

# Per-chunk summary: index and total word count (word_count is summed as a list).
for chunk_no, chunk in enumerate(chunks):
    print(f"Chunk #{chunk_no}")
    print(f"Word Count: {sum(chunk['word_count'])}")
    # the chunk text itself is deliberately not printed here
    print("==========")

This chunk is so small because, by default, the chunking microservice enforces boundaries between code and non-code sections in HTML: it will not combine a section of the article written in natural language with a section that contains only code. This short sentence comes right before a large code section in the article, so the chunking service ends the chunk there.

Our use case therefore requires us to handle the presence of large code sections within our HTML.

Lots of text documents at NVIDIA contain a mix of code and natural language, whether it's blog posts like these, SDK documentation, Git repository README markdown files, etc.

These sections of code are very different syntactically and grammatically from regular natural language text, and so an embedding model that has not been trained on code may not perform well with code present. For embedding models that are trained on code and natural language, it's also going to be important to delimit the code with the characters the embedding model was trained on. The chunking service uses triple backticks (```) to indicate a section of code.

The chunking service as written supports three strategies to deal with code.
1. The default (`"code_behavior": "enforce_code_boundaries"`) is to enforce hard boundaries between code and non-code. This has the benefit of separation, but has the drawback that sometimes you will end up with awkward small chunks because of these boundaries.
2. The second option (`"code_behavior": "ignore_code_boundaries"`) is to just ignore the boundaries and lump code and non-code together, while still keeping the backticks as delimiters. This is a good option if your embedding model supports both code and non-code.
3. The third option (`"code_behavior": "remove_code_sections"`) is to remove the long only-code sections from the actual text that will be embedded, but store the code as metadata which can later be used. For example, the code can be supplied to an LLM that is generating a response based on the retrieval results it found by matching on the accompanying natural language.

In [None]:
# Re-chunk the same document, this time asking the service to remove code sections.
chunks = chunk_request(
    client,
    dict(
        strategy="sentence",
        code_behavior="remove_code_sections",
        chunk_min_words=250,
        chunk_overlap_words=50,
        input_type="html",
        input_str=document_html,
        additional_metadata=dict(
            document_title=document_title,
            document_url=document_url,
            document_date=document_date,
            document_date_modified=document_date_modified,
        ),
    ),
)

print(len(chunks))

for chunk_no, chunk in enumerate(chunks):
    print(f"Chunk #{chunk_no}")
    # word_count is a list with one count per text component, including the
    # components that are only code; those are left out of the final text, so
    # only the entries whose only_code flag is falsy are summed.
    text_words = sum(
        count
        for count, is_code in zip(chunk["word_count"], chunk["only_code"])
        if not is_code
    )
    print(f"Word Count: {text_words}")
    print(chunk["text"])
    print("==========")


In [None]:
# Repeat the code-removing sentence chunking on the second example post.
item = example2

# Rendered HTML to chunk, plus the fields sent along as additional metadata.
document_title = item["title"]["rendered"]
document_url = item["link"]
document_html = item["content"]["rendered"]
document_date = item["date_gmt"]
document_date_modified = item["modified_gmt"]

chunks = chunk_request(
    client,
    dict(
        strategy="sentence",
        code_behavior="remove_code_sections",
        chunk_min_words=250,
        chunk_overlap_words=50,
        input_type="html",
        input_str=document_html,
        additional_metadata=dict(
            document_title=document_title,
            document_url=document_url,
            document_date=document_date,
            document_date_modified=document_date_modified,
        ),
    ),
)

for chunk_no, chunk in enumerate(chunks):
    print(f"Chunk #{chunk_no}")
    # word_count is a list with one count per text component, including the
    # components that are only code; those are left out of the final text, so
    # only the entries whose only_code flag is falsy are summed.
    text_words = sum(
        count
        for count, is_code in zip(chunk["word_count"], chunk["only_code"])
        if not is_code
    )
    print(f"Word Count: {text_words}")
    print(chunk["text"])
    print("==========")

Handling Heading Sections:



In [None]:
# Chunk the second example post with the "heading_section_sentence" strategy.
item = example2

# Rendered HTML to chunk, plus the fields sent along as additional metadata.
document_title = item["title"]["rendered"]
document_url = item["link"]
document_html = item["content"]["rendered"]
document_date = item["date_gmt"]
document_date_modified = item["modified_gmt"]

chunks = chunk_request(
    client,
    dict(
        strategy="heading_section_sentence",
        code_behavior="remove_code_sections",
        chunk_min_words=250,
        chunk_overlap_words=50,
        input_type="html",
        input_str=document_html,
        additional_metadata=dict(
            document_title=document_title,
            document_url=document_url,
            document_date=document_date,
            document_date_modified=document_date_modified,
        ),
    ),
)

for chunk_no, chunk in enumerate(chunks):
    print(f"Chunk #{chunk_no}")
    # word_count is a list with one count per text component, including the
    # components that are only code; those are left out of the final text, so
    # only the entries whose only_code flag is falsy are summed.
    text_words = sum(
        count
        for count, is_code in zip(chunk["word_count"], chunk["only_code"])
        if not is_code
    )
    print(f"Word Count: {text_words}")
    print(chunk["text"])
    print("==========")

In [None]:
# Split by heading and remove code sections (first example post).

item = example1

# Rendered HTML to chunk, plus the fields sent along as additional metadata.
document_title = item["title"]["rendered"]
document_url = item["link"]
document_html = item["content"]["rendered"]
document_date = item["date_gmt"]
document_date_modified = item["modified_gmt"]

# No chunk_min_words / chunk_overlap_words are sent with this strategy.
chunks = chunk_request(
    client,
    dict(
        strategy="heading_section",
        code_behavior="remove_code_sections",
        input_type="html",
        input_str=document_html,
        additional_metadata=dict(
            document_title=document_title,
            document_url=document_url,
            document_date=document_date,
            document_date_modified=document_date_modified,
        ),
    ),
)

for chunk_no, chunk in enumerate(chunks):
    print(f"Chunk #{chunk_no}")
    # word_count is a list with one count per text component, including the
    # components that are only code; those are left out of the final text, so
    # only the entries whose only_code flag is falsy are summed.
    text_words = sum(
        count
        for count, is_code in zip(chunk["word_count"], chunk["only_code"])
        if not is_code
    )
    print(f"Word Count: {text_words}")
    print(chunk["text"])
    print("==========")

# Non-code view: the "text" of every chunk, one chunk per line group.
clean_text_no_code = "\n".join(section["text"] for section in chunks)
print(clean_text_no_code)

# Code + non-code view: for each chunk, the first entry of its
# "heading_section_title" followed by all of its "text_components".
clean_text_with_code = "\n".join(
    section["heading_section_title"][0] + "\n" + "\n".join(section["text_components"])
    for section in chunks
)
print(clean_text_with_code)