In [3]:
# Load the whole speech into memory as a single string.
# The encoding is pinned to UTF-8 because the text contains non-ASCII punctuation
# (curly quotes); without it, decoding depends on the platform's default encoding.
with open('../data/state_of_the_union.md', 'r', encoding='utf-8') as file:
    data = file.read()

In [1]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings

# Semantic text splitter driven by OpenAI embeddings; both objects use their default arguments.
# NOTE(review): OpenAIEmbeddings presumably needs an OpenAI API key in the environment — confirm.
splitter = SemanticChunker(OpenAIEmbeddings())

In [4]:
# Split the loaded text into chunks; the cells below index and iterate `results` as strings.
results = splitter.split_text(data)

In [6]:
import tiktoken

# Count the number of tokens in each page_content
def num_tokens_from_string(string: str) -> int:
    """Count the tokens `string` encodes to under tiktoken's ``cl100k_base`` encoding.

    ``disallowed_special=()`` turns off tiktoken's special-token check, so text
    that happens to contain special-token markers is encoded as ordinary text.

    Args:
        string: Text to tokenize.

    Returns:
        Length of the encoded token sequence.
    """
    tokenizer = tiktoken.get_encoding("cl100k_base")
    return len(tokenizer.encode(string, disallowed_special=()))

In [10]:
import numpy as np

# Token count of every chunk, collected into an array for the summary statistics below.
sizes = np.array([num_tokens_from_string(chunk) for chunk in results])

# Summary of the chunk-size distribution (in tokens).
print(f"Mean: {np.mean(sizes)}")
print(f"Median: {np.median(sizes)}")
print(f"Std: {np.std(sizes)}")
print(f"Max: {np.max(sizes)}")
print(f"Min: {np.min(sizes)}")

Mean: 316.06060606060606
Median: 148.0
Std: 336.54185559797713
Max: 1260
Min: 3


In [12]:
# Display the first chunk returned by the splitter.
results[0]

'Good evening. Good evening. If I were smart, I’d go home now. Mr.'

In [15]:
# Locate every chunk in the source text and print the (text, start, end) result, or None.
# NOTE(review): find_target_in_document is defined in a cell that appears later in this
# notebook, so a top-to-bottom run raises NameError here unless that cell is run first.
for x in results:
    print(find_target_in_document(data, x))

('Good evening. Good evening. If I were smart, I’d go home now.\n\nMr', 0, 65)
('Speaker, Madam Vice President, members of Congress, my fellow Americans.\n\nIn January 1941, Franklin Roosevelt came to this chamber to speak to the nation. And he said, “I address you at a moment unprecedented in the history of the Union”. Hitler was on the march. War was raging in Europe.\n\nPresident Roosevelt’s purpose was to wake up Congress and alert the American people that this was no ordinary time. Freedom and democracy were under assault in the world.\n\nTonight, I come to the same chamber to address the nation. Now it’s we who face an unprecedented moment in the history of the Union.\n\nAnd, yes, my purpose tonight is to wake up the Congress and alert the American people that this is no ordinary moment either. Not since President Lincoln and the Civil War have freedom and democracy been under assault at home as they are today.\n\nWhat makes our moment rare is that freedom and democracy are under

In [13]:
import re
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

def find_query_despite_whitespace(document, query):
    """Locate `query` in `document`, ignoring letter case and whitespace differences.

    The query is broken into whitespace-separated words, and those words are
    matched in order with any run of whitespace (including none at all)
    allowed between consecutive words.

    Args:
        document: Text to search.
        query: Text to look for.

    Returns:
        ``(matched_text, start, end)`` for the first match, where
        ``matched_text == document[start:end]``; ``None`` when nothing matches
        or when the query contains no non-whitespace characters.
    """
    # str.split() with no argument already collapses every whitespace run and
    # drops leading/trailing whitespace, so no separate normalisation is needed.
    words = query.split()
    if not words:
        # An empty pattern would "match" at offset 0 of any document; report
        # that as no match rather than returning a meaningless ('', 0, 0).
        return None

    # Escape each word so regex metacharacters in the query are taken literally.
    pattern = r'\s*'.join(re.escape(word) for word in words)
    match = re.search(pattern, document, re.IGNORECASE)
    if match is None:
        return None
    return match.group(0), match.start(), match.end()

def find_target_in_document(document, target):
    """Find where `target` occurs in `document`, trying progressively looser matches.

    A single trailing period is removed from `target` first. Three strategies
    are then tried in order, and the first that succeeds wins:

    1. exact substring search;
    2. case- and whitespace-insensitive search via
       ``find_query_despite_whitespace``;
    3. fuzzy comparison (``fuzz.token_sort_ratio``) against the document split
       into sentences, accepted only when the best score is at least 98.

    Args:
        document: Text to search.
        target: Text to locate.

    Returns:
        ``(matched_text, start, end)`` on success, otherwise ``None``.
    """
    needle = target[:-1] if target.endswith('.') else target

    # 1) Exact substring match.
    position = document.find(needle)
    if position != -1:
        return needle, position, position + len(needle)

    # 2) Same words, but tolerant of case and whitespace differences.
    loose_hit = find_query_despite_whitespace(document, needle)
    if loose_hit is not None:
        return loose_hit

    # 3) Fuzzy match against individual sentences of the document.
    candidates = re.split(r'[.!?]\s*|\n', document)
    best = process.extractOne(needle, candidates, scorer=fuzz.token_sort_ratio)
    if best[1] >= 98:
        sentence = best[0]
        offset = document.find(sentence)
        return sentence, offset, offset + len(sentence)

    return None