In [1]:
import requests, re
import tiktoken
from pathlib import Path
from typing import Any

In [2]:
def fetch_book(*, book_download_url="https://www.gutenberg.org/cache/epub/2701/pg2701.txt") -> str:
    """
    Fetches book. Default url is gutenberg book link to Moby Dick
    """
    r = requests.get(book_download_url, timeout=60)
    r.raise_for_status()
    return r.text

In [3]:
def strip_gutenberg_header_footer(*, book:str) -> str:
    start = re.search(r"^CHAPTER 1\.", book, re.M)
    end = re.search(r"End of the Project Gutenberg EBook of", book)

    book = book[start.start(): end.start()] if start and end else book
    return book.strip()

In [6]:
def download_or_load_from_cache(*, book_path:Path) -> str:
    if book_path.exists():
        with open(book_path, 'r', encoding="utf-8") as f:
            print(f"Loaded: {book_path.name}")
            moby_book = f.read()
        
    else:
        moby_book = fetch_book()
        moby_book = strip_gutenberg_header_footer(book=moby_book)

        with open(book_path, "w", encoding="utf-8") as f:
            f.write(moby_book)

    return moby_book

In [None]:
from bs4 import BeautifulSoup, ResultSet, Tag, AttributeValueList
from pathlib import Path

bp = Path(r"books\alices-adventures-in-wonderland_carroll-lewis_11_en.html")

with open(bp, 'r', encoding="utf-8") as f:
    b = BeautifulSoup(f)

In [None]:
#chapters = b.find_all(lambda tag: tag.has_attr("chapter"))
chapters:ResultSet[Tag] = b.find_all(attrs={"class": "chapter"})

In [27]:
for ch in chapters:
    txt = ch.get_text(separator="", strip=True)
    tag_name = ch.name
    attrs = ch.attrs
    print(tag_name, attrs)
# chapters[0].contents

div {'class': ['chapter']}
div {'class': ['chapter']}
div {'class': ['chapter']}
div {'class': ['chapter']}
div {'class': ['chapter']}
div {'class': ['chapter']}
div {'class': ['chapter']}
div {'class': ['chapter']}
div {'class': ['chapter']}
div {'class': ['chapter']}
div {'class': ['chapter']}
div {'class': ['chapter']}


In [28]:
inner_html = "".join(str(x) for x in ch.contents)
inner_html

'\n<h2><a id="chap12"></a>CHAPTER XII.<br/>\n\nAlice’s Evidence</h2>\n<p>\n\n“Here!” cried Alice, quite forgetting in the flurry of the moment how large she\n\nhad grown in the last few minutes, and she jumped up in such a hurry that she\n\ntipped over the jury-box with the edge of her skirt, upsetting all the jurymen\n\non to the heads of the crowd below, and there they lay sprawling about,\n\nreminding her very much of a globe of goldfish she had accidentally upset the\n\nweek before.\n\n</p>\n<p>\n\n“Oh, I <i>beg</i> your pardon!” she exclaimed in a tone of great dismay, and\n\nbegan picking them up again as quickly as she could, for the accident of the\n\ngoldfish kept running in her head, and she had a vague sort of idea that they\n\nmust be collected at once and put back into the jury-box, or they would die.\n\n</p>\n<p>\n\n“The trial cannot proceed,” said the King in a very grave voice, “until all the\n\njurymen are back in their proper places—<i>all</i>,” he repeated with great

In [17]:
chaps = b.find_all("div", attrs="chapter")
type(chaps)

bs4.element.ResultSet

In [7]:
moby_book_p = Path("books", "moby.txt")
moby_book = download_or_load_from_cache(book_path=moby_book_p)

Loaded: moby.txt


In [1]:
import os

os.getenv("AZURE_SEARCH_ENDPOINT"), os.environ["AZURE_SEARCH_KEY"]

('https://ai-search-free-plan.search.windows.net#https://ai-search-bax.search.windows.net',
 'XEdNAWPfLCJSExkpOua9n4FiV1i5Ubpshc0WSRZXgRAzSeCIf8PD')

In [None]:
def extract_chapters(*, book_txt: str) -> dict[int, dict[str, str]]:
    """
    Extracts chapters from a book string. Exptects chapters in format, e.g. : CHAPTER 3. <chapter title> \n <content...>
    NB - Overwrites the initial "empty" chapters from table of contents, with the actual chapter content as the book is iterated over.

    Returns a dictionary of the form:
    {
        chapter_number: {
            "title": <chapter title>,
            "content": <chapter content>
        }
    }
    """
    
    # Regex to capture chapter headings 
    pattern = re.compile(r"^CHAPTER\s+(\d+)\.?\s*([^\n]*)", re.MULTILINE)
    matches = list(pattern.finditer(book_txt))
    
    chapters = {}
    
    for chapter_i, match in enumerate(matches):
        chapter_num = int(match.group(1))
        chapter_title = match.group(2).strip()
        
        start_idx = match.end()                                                                     
        end_idx = matches[chapter_i+1].start() if chapter_i+1 < len(matches) else len(book_txt)     # End index is the start of next chapter or the end of text
        
        content = book_txt[start_idx:end_idx].strip()
        
        chapters[chapter_num] = {                                   
            "title": chapter_title,
            "content": content
        }
    
    return chapters

In [11]:
chapters = extract_chapters(book_txt=moby_book)
chapters

{1: {'title': 'Loomings.',
  'content': 'Call me Ishmael. Some years ago—never mind how long precisely—having\n\nlittle or no money in my purse, and nothing particular to interest me\n\non shore, I thought I would sail about a little and see the watery part\n\nof the world. It is a way I have of driving off the spleen and\n\nregulating the circulation. Whenever I find myself growing grim about\n\nthe mouth; whenever it is a damp, drizzly November in my soul; whenever\n\nI find myself involuntarily pausing before coffin warehouses, and\n\nbringing up the rear of every funeral I meet; and especially whenever\n\nmy hypos get such an upper hand of me, that it requires a strong moral\n\nprinciple to prevent me from deliberately stepping into the street, and\n\nmethodically knocking people’s hats off—then, I account it high time to\n\nget to sea as soon as I can. This is my substitute for pistol and ball.\n\nWith a philosophical flourish Cato throws himself upon his sword; I\n\nquietly take 

In [None]:
def tiktoken_chunks(*, txt:str, max_tokens=600, overlap=60, encoding="cl100k_base"):
    enc = tiktoken.get_encoding(encoding)
    token_ids = enc.encode(txt)
    step = max_tokens - overlap
    out = []

    for i in range(0, len(token_ids), step):
        out.append(enc.decode(token_ids[i:i+max_tokens]))
    
    return out, token_ids

In [22]:
moby_out, token_ids = tiktoken_chunks(txt=chapters[1]['content'])

In [23]:
token_ids

[7368,
 757,
 57704,
 1764,
 301,
 13,
 4427,
 1667,
 4227,
 2345,
 37593,
 4059,
 1268,
 1317,
 24559,
 2345,
 69666,
 271,
 56492,
 477,
 912,
 3300,
 304,
 856,
 53101,
 11,
 323,
 4400,
 4040,
 311,
 2802,
 757,
 271,
 263,
 31284,
 11,
 358,
 3463,
 358,
 1053,
 30503,
 922,
 264,
 2697,
 323,
 1518,
 279,
 30125,
 727,
 961,
 271,
 1073,
 279,
 1917,
 13,
 1102,
 374,
 264,
 1648,
 358,
 617,
 315,
 10043,
 1022,
 279,
 87450,
 268,
 323,
 271,
 1610,
 15853,
 279,
 35855,
 13,
 43633,
 358,
 1505,
 7182,
 7982,
 44517,
 922,
 271,
 1820,
 11013,
 26,
 15716,
 433,
 374,
 264,
 41369,
 11,
 1377,
 73825,
 6841,
 304,
 856,
 13836,
 26,
 15716,
 271,
 40,
 1505,
 7182,
 4457,
 3935,
 6751,
 7251,
 985,
 1603,
 78766,
 83273,
 11,
 323,
 271,
 81088,
 709,
 279,
 14981,
 315,
 1475,
 32079,
 358,
 3449,
 26,
 323,
 5423,
 15716,
 271,
 2465,
 6409,
 981,
 636,
 1778,
 459,
 8582,
 1450,
 315,
 757,
 11,
 430,
 433,
 7612,
 264,
 3831,
 16033,
 271,
 652,
 16379,
 311,
 5471,
 757,


In [24]:
moby_out

['Call me Ishmael. Some years ago—never mind how long precisely—having\n\nlittle or no money in my purse, and nothing particular to interest me\n\non shore, I thought I would sail about a little and see the watery part\n\nof the world. It is a way I have of driving off the spleen and\n\nregulating the circulation. Whenever I find myself growing grim about\n\nthe mouth; whenever it is a damp, drizzly November in my soul; whenever\n\nI find myself involuntarily pausing before coffin warehouses, and\n\nbringing up the rear of every funeral I meet; and especially whenever\n\nmy hypos get such an upper hand of me, that it requires a strong moral\n\nprinciple to prevent me from deliberately stepping into the street, and\n\nmethodically knocking people’s hats off—then, I account it high time to\n\nget to sea as soon as I can. This is my substitute for pistol and ball.\n\nWith a philosophical flourish Cato throws himself upon his sword; I\n\nquietly take to the ship. There is nothing surprisin