# RAG Project 2n Phase - General Handbook / Changing Chunck Structure

## Parsing HTML

I found that links for all pages in the General Handbook are found in the source code of the main page. I used BeautifulSoup to parse all links in the main page of the General Handbook page. I saved them into a list. Then, I checked which links I wouldn't need, and cut them out of the list.

In [1]:
from bs4 import BeautifulSoup

with open("documents/study_manual_general-handbook.html", "r") as f:
    soup = BeautifulSoup(f, "html.parser")
    links = [a['href'] for a in soup.find_all('a', class_="html-external-link", href=True)]

print(len(links))

#for link in links[8:-1]:
#    print(link)    

links = links[8:-1]

# Found that many links where only references to the same pages, so I just filtered them and kept only one link for each page (39 links total for each page in the General Handbook). 
chapter_links = []

for link in links:

    try:
        link_i = link.index("#")

        if link[:link_i] in chapter_links:
            continue
        chapter_links.append(link[:link_i])
    except:
        if link in chapter_links:
            continue
        chapter_links.append(link)

print(len(chapter_links))
for url in chapter_links:
    print(url)

1015
39
https://www.churchofjesuschrist.org/study/manual/general-handbook/0-introductory-overview?lang=eng
https://www.churchofjesuschrist.org/study/manual/general-handbook/1-work-of-salvation-and-exaltation?lang=eng
https://www.churchofjesuschrist.org/study/manual/general-handbook/2-supporting-individuals-and-families?lang=eng
https://www.churchofjesuschrist.org/study/manual/general-handbook/3-priesthood-principles?lang=eng
https://www.churchofjesuschrist.org/study/manual/general-handbook/4-leadership-in-the-church-of-jesus-christ?lang=eng
https://www.churchofjesuschrist.org/study/manual/general-handbook/5-general-and-area-leadership?lang=eng
https://www.churchofjesuschrist.org/study/manual/general-handbook/6-stake-leadership?lang=eng
https://www.churchofjesuschrist.org/study/manual/general-handbook/7?lang=eng
https://www.churchofjesuschrist.org/study/manual/general-handbook/8-elders-quorum?lang=eng
https://www.churchofjesuschrist.org/study/manual/general-handbook/9-relief-society?lan

I used Selenium to use the browser and wait for the pages to load before getting their content. Then, I saved all pages in a json document for further processing.

In [2]:
import time
import json
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()  # Replace with desired browser


with open('general-handbook-pages.json', 'w', encoding='utf-8') as f:
    
    manual_dict = {}

    for url in chapter_links:
        driver.get(url)

        # Wait for a specific element to be present (customizable)
        wait = WebDriverWait(driver, 10)  # 10 second timeout
        wait.until(EC.presence_of_element_located((By.ID, "page")))

        time.sleep(3)

        content = driver.page_source
        manual_dict[f"{url}"] = str(content)

    json.dump(manual_dict, f, ensure_ascii=False, indent=4)

driver.quit()


## Preparing Chuncks with BeautifulSoup
I divided the content using their subheadings and regex to get smaller portions of text.

In [30]:
import json
from bs4 import BeautifulSoup
import re

main_content = {}

# I started by getting all chapter titles and main content from each page.
with open('processed-docs/general-handbook-pages.json', 'r', encoding='utf-8') as f:

    gen_hand_dict = json.loads(f.read())
    
    for key, value in gen_hand_dict.items():
        soup = BeautifulSoup(value, "html.parser")
        
        main_content[key] = {"chapter": soup.find("h1", attrs={'id':'title1'}).get_text(), "body": soup.find("div", class_="body-block")}

Now, I will try to see if I can keep tables and maybe lists with their respective tags.

In [42]:
def extract_content_with_html(element):
    # Keep <ul>, <ol>, and <table> tags as HTML
    allowed_tags = ['ul', 'ol', 'li', 'table', 'thead', 'tbody', 'tr', 'th', 'td']
    
    # Replace \xa0 (non-breaking space) with a regular space in the text content
    return ''.join(str(child).replace('\xa0', ' ') if child.name in allowed_tags else child.get_text().replace('\xa0', ' ') for child in element.children)


def parse_section(element, chapter_title, chapter_url):
    sections = []
    
    # Handle both section and figure tags
    for section in element.find_all(['section', 'figure'], recursive=False):
        header = section.find('header')
        if header:
            # Safely extract the section number and title
            section_number = header.find_all('p')[0].get_text(strip=True) if header.find_all('p') else 'No section number'
            
            if section.name == 'figure':
                title = header.find_all('p')[1].get_text(strip=True) if len(header.find_all('p')) > 1 else 'No title'
            else:
                title_tag = header.find('h2') or header.find('h3') or header.find('h4') or header.find('h5') or header.find('h6')
                title = title_tag.get_text(strip=True) if title_tag else 'No title'
            
            # Collect all direct children that are not sections or headers
            content = ''.join([extract_content_with_html(child) for child in section.find_all(recursive=False) if child.name not in ['header', 'section', 'figure']])
            
            # Append the section to the list
            sections.append({
                'chapter': chapter_title,
                'chapter_url': chapter_url,
                'section': section_number,
                'title': title.replace('\xa0', ' '),
                'content': content.strip()
            })
        
        # Recursively process nested sections or figures
        sections.extend(parse_section(section, chapter_title, chapter_url))
    
    return sections

In [43]:
# i = 0
for key, value in main_content.items():
    # Add a number to the chapter title, only once in Jupyter
    # value["chapter"] = f"{i}. {value['chapter']}"
    
    for element in value["body"].find_all():
        element.attrs = {}

    section_list = []
    
    section_list = parse_section(value["body"], value["chapter"], chapter_url=key)
    print(section_list)        
    break

[{'chapter': '0. Introductory Overview', 'chapter_url': 'https://www.churchofjesuschrist.org/study/manual/general-handbook/0-introductory-overview?lang=eng', 'section': '0.0', 'title': 'Introduction', 'content': 'The Lord taught, “Let every man learn his duty, and to act in the office in which he is appointed, in all diligence” (Doctrine and Covenants 107:99). As a leader in The Church of Jesus Christ of Latter-day Saints, you should seek personal revelation to help you learn and fulfill the duties of your calling.Studying the scriptures and the teachings of latter-day prophets will help you understand and fulfill your duties. As you study the words of God, you will be more receptive to the influence of the Spirit (see Doctrine and Covenants 84:85).You also learn your duties by studying the instructions in this handbook. These instructions can invite revelation if they are used to provide an understanding of principles, policies, and procedures to apply while seeking the guidance of th

In [44]:
final_chunks = []

for key, value in main_content.items():
        
    for element in value["body"].find_all():
        element.attrs = {}    
    
    section_list = parse_section(value["body"], value["chapter"], chapter_url=key)

    final_chunks.extend(section_list)

1380
{'chapter': '38. Church Policies and Guidelines', 'chapter_url': 'https://www.churchofjesuschrist.org/study/manual/general-handbook/38-church-policies-and-guidelines?lang=eng', 'section': '38.9.10', 'title': 'Other Information', 'content': 'For information about membership records of service members, see 33.6.9.For information about patriarchal blessings for service members, see 38.2.10.3.For information about ordaining service members in isolated locations, see 38.2.9.6.For information about issuing temple recommends in isolated locations, see 26.3.2.If Church leaders have questions about military relations, they may contact:Military Relations and Chaplain Services Division50 East North Temple Street, Room 2411Salt Lake City, UT 84150-0024Telephone: 1-801-240-2286Email: pst-military@ChurchofJesusChrist.org'}


In [52]:
print(len(final_chunks))
print("Content length: ", len(final_chunks[844]["content"]))
print(final_chunks[844])

1380
Content length:  11656
{'chapter': '30. Callings in the Church', 'chapter_url': 'https://www.churchofjesuschrist.org/study/manual/general-handbook/30-callings-in-the-church?lang=eng', 'section': '30.8.1', 'title': 'Ward Callings', 'content': '<thead><tr><th>\n<p>Calling</p>\n</th><th>\n<p>Recommended by</p>\n</th><th>\n<p>Approved by</p>\n</th><th>\n<p>Sustained by<a><sup></sup></a></p>\n</th><th>\n<p>Called and set apart by</p>\n</th></tr></thead><tbody><tr><th><div>\n<p>Calling</p>\n</div>\n<p><a>Bishop</a><a><sup></sup></a></p>\n</th><td><div>\n<p>Recommended by</p>\n</div>\n<p>Stake presidency, using <a>LCR</a></p>\n</td><td><div>\n<p>Approved by</p>\n</div>\n<p>First Presidency and Quorum of the Twelve</p>\n</td><td><div>\n<p>Sustained by<a><sup></sup></a></p>\n</div>\n<p>Ward members</p>\n</td><td><div>\n<p>Called and set apart by</p>\n</div>\n<p>Stake president after receiving approval from the First Presidency (see <a>30.7</a>)</p>\n</td></tr><tr><th><div>\n<p>Calling</p>\

## Creating Text Embeddings with Gemini
Next, I will use the Gemini API to create text embeddings.

In [45]:
## Importing and initializing the library
import google.generativeai as genai
import os

google_api_key=os.environ.get("GOOGLE_API_KEY")

genai.configure(api_key=google_api_key)



  from .autonotebook import tqdm as notebook_tqdm


I will try to generate embeddings for 100 chuncks first.

In [53]:
final_chunks_embed = []

for i, value in enumerate(final_chunks):
    try:
        result = genai.embed_content(
            model = "models/embedding-001",
            content = f"{final_chunks[i]['chapter']}. {final_chunks[i]['title']}. {final_chunks[i]['content']}",
            task_type = "retrieval_document",
            title = "Embedding of single string")
        
        final_chunks_embed.append({
            "chapter": final_chunks[i]["chapter"],
            "chapter_url": final_chunks[i]["chapter_url"],
            "section": final_chunks[i]["section"],
            "title": final_chunks[i]["title"],
            "content": final_chunks[i]["content"],
            "embedding": result["embedding"]
        })
        

    except:
        result = genai.embed_content(
            model = "models/embedding-001",
            content = f"{final_chunks[i]['chapter']}. {final_chunks[i]['title']}. {final_chunks[i]['content'][:len(final_chunks[i]['content'])//2]}",
            task_type = "retrieval_document",
            title = "Embedding of single string")
        
        final_chunks_embed.append({
            "chapter": final_chunks[i]["chapter"],
            "chapter_url": final_chunks[i]["chapter_url"],
            "section": final_chunks[i]["section"],
            "title": final_chunks[i]["title"],
            "content": final_chunks[i]["content"],
            "embedding": result["embedding"]
        })


The above process took 3m 36.4s

Now, I will just check that data is in te desired format.

In [54]:
print(len(final_chunks_embed))
print(final_chunks_embed[0])

1380
{'chapter': '0. Introductory Overview', 'chapter_url': 'https://www.churchofjesuschrist.org/study/manual/general-handbook/0-introductory-overview?lang=eng', 'section': '0.0', 'title': 'Introduction', 'content': 'The Lord taught, “Let every man learn his duty, and to act in the office in which he is appointed, in all diligence” (Doctrine and Covenants 107:99). As a leader in The Church of Jesus Christ of Latter-day Saints, you should seek personal revelation to help you learn and fulfill the duties of your calling.Studying the scriptures and the teachings of latter-day prophets will help you understand and fulfill your duties. As you study the words of God, you will be more receptive to the influence of the Spirit (see Doctrine and Covenants 84:85).You also learn your duties by studying the instructions in this handbook. These instructions can invite revelation if they are used to provide an understanding of principles, policies, and procedures to apply while seeking the guidance o

Here I save the file to free memory from my variables.

In [55]:
import json

with open('chunks-data-2.json', 'w', encoding='utf-8') as f:
    
    chunks_dictionary = {"data" : final_chunks_embed}    

    json.dump(chunks_dictionary, f, ensure_ascii=False, indent=4)

In [56]:
## Clearing the memory

final_chunks = []
final_chunks_embed = []

## Uploading embeddings to Pinecone vector database
Finally I will upload all embeddings and metadata to Pinecone

In [57]:
import json
data = []
with open("processed-docs/chunks-data-2.json", "r") as f:
    data = json.loads(f.read())
    data = data["data"]

I will check that I have loaded data correctly

In [58]:
print(len(data))

1380


Time to initialize Pinecone

In [59]:
from pinecone import Pinecone, ServerlessSpec
import os

pinecone_api_key = os.environ.get("PINECONE_API_KEY")

pc = Pinecone(api_key=pinecone_api_key)

Now, I create an index to upsert my embeddings.

In [60]:
pc.create_index(
    name="general-handbook-2",
    dimension=768, # Replace with your model dimensions
    metric="cosine", # Replace with your model metric
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    ) 
)

I realized I have to follow a specific format to upsert my vectors to Pinecone, so I will create a new list of dictionaries with the new format.

Update: Later I found that each upsert has a limit, so I divide all data into 3 lists

In [61]:
# This is an example of the new format for the chunks data
# {
#     "id": "1", 
#     "values": [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
#     "metadata": {"chapter": "example", "chapter_url": "example", "section": "example", "title": "example", "content": "example"}
# }

new_data_1 = []
new_data_2 = []
new_data_3 = []

vector_id = 1

for chunk in data:
    if vector_id < 500:
        new_data_1.append({
            "id": str(vector_id),
            "values": chunk["embedding"],            
            "metadata" : {"chapter": chunk["chapter"], "chapter_url": chunk["chapter_url"], "section": chunk["section"], "title": chunk["title"], "content": chunk["content"]}
        })
    elif vector_id < 1000:
        new_data_2.append({
            "id": str(vector_id),
            "values": chunk["embedding"],            
            "metadata" : {"chapter": chunk["chapter"], "chapter_url": chunk["chapter_url"], "section": chunk["section"], "title": chunk["title"], "content": chunk["content"]}
        })
    else:
        new_data_3.append({
            "id": str(vector_id),
            "values": chunk["embedding"],            
            "metadata" : {"chapter": chunk["chapter"], "chapter_url": chunk["chapter_url"], "section": chunk["section"], "title": chunk["title"], "content": chunk["content"]}
        })
    
    vector_id += 1
    

In [62]:
largest = 0

for data in new_data_1:
    if len(data["metadata"]["content"]) > largest:
        largest = len(data["metadata"]["content"])

print("new_data_1 length:", len(new_data_1))
print("Largest:", largest)

largest = 0
for data in new_data_2:
    if len(data["metadata"]["content"]) > largest:
        largest = len(data["metadata"]["content"])

print("new_data_2 length:", len(new_data_2))
print("Largest:", largest)

largest = 0
for data in new_data_3:
    if len(data["metadata"]["content"]) > largest:
        largest = len(data["metadata"]["content"])

print("new_data_3 length:", len(new_data_3))
print("Largest:", largest)

new_data_1 length: 499
Largest: 4220
new_data_2 length: 500
Largest: 11656
new_data_3 length: 381
Largest: 4999


Finally, I will upsert all vectors to Pinecone

In [69]:
index = pc.Index("general-handbook-2")

index.upsert(
    vectors = new_data_1,
    namespace = "general-handbook-vectors-2"
)
print("Completed 1")

index.upsert(
    vectors = new_data_2,
    namespace = "general-handbook-vectors-2"
)
print("Completed 2")

index.upsert(
    vectors = new_data_3,
    namespace = "general-handbook-vectors-2"
)
print("Completed 3")

Completed 1
Completed 2
Completed 3


I deleted the index vectors a few times to try different approaches to upsert.

In [None]:
# index = pc.Index("general-handbook")
# index.delete(delete_all=True, namespace="general-handbook-vectors")