In [3]:
import re
import openai
from typing import List, Union, Dict, Any, Optional

In [None]:
# DOC PREPROCESSING
# Step 1: Parse the data (url/pdf/pptx) and extract the text
#  - make sure to preserve LaTeX and images from the pdfs
#  - make sure the text chunks are large enough to be meaningful
# Step 2: Embed the text
# Step 3: Save in a dataframe with (doc_id, doc_title, doc_category, doc_text, doc_embedding)
# Step 4: Save metadata (actual doc file, images, citation, etc.)

# DOC SEARCH
# Step 1: Embed the query
# Step 2: Find the closest embedding
# Step 3: Return the top_k results with some range (e.g. previous 1, next 1)
# Step 4: Create a prompt with the query and the context
# Step 5: Return the prompt stream to the user
#  - I also want to return the metadata for the doc (e.g. doc_id, file, citation, images, category, etc.)
#  - GET THE REGULAR EMBEDDED SEARCH WORKING FIRST
#  - then figure out a way to return the metadata and visualizations like pdf file, images, diagrams, etc.

In [None]:
# Step 1: Parse the data (url/pdf/pptx) and extract the text
#  - make sure to preserve LaTeX and images from the pdfs
#  - make sure the text chunks are large enough to be meaningful

In [None]:
def parse_pdf():
    """Placeholder for the PDF parser; not implemented yet, returns None."""
    return None

def parse_pptx():
    """Placeholder for the PPTX parser; not implemented yet, returns None."""
    return None

def parse_url():
    """Placeholder for the URL parser; not implemented yet, returns None."""
    return None

In [1]:
def sanitize_text(text):
    """
    Strip non-ASCII characters and surrounding whitespace from a string.

    Args:
        text (str): Raw text to clean.

    Returns:
        str: The cleaned text, or None when nothing remains after cleaning.
    """
    # Keep only code points in the ASCII range (0x00-0x7F). Every run of
    # anything else is deleted outright rather than replaced, so neighbouring
    # characters end up adjacent. Adjust the pattern if the tokenizer in use
    # accepts a wider character set.
    ascii_only = re.sub(r"[^\x00-\x7F]+", "", text)

    # Leading/trailing whitespace carries no content.
    trimmed = ascii_only.strip()

    # Signal "nothing left" with None instead of an empty string.
    return trimmed if trimmed else None

In [None]:
def embed_ada(text: str):
    """
    Embed a single text string using the ADA model.

    The text is cleaned with sanitize_text(), its newlines are replaced by
    spaces, and the result is sent to the "text-embedding-ada-002" model.

    Args:
        text (str): The text to embed.

    Returns:
        The "embedding" field of the first item in the API response.

    Raises:
        TypeError: If text is not a string.
        ValueError: If nothing is left to embed (blank input, or input that
            becomes empty after sanitization).
    """
    if not isinstance(text, str):
        raise TypeError(
            "Text must be a string. Use embed_ada_list() to embed a list of strings.")

    # Blank input can be rejected up front, before any sanitization work.
    if not text.strip():
        raise ValueError("Empty text passed to embed_ada()")

    # sanitize_text() returns None (not "") when nothing survives cleaning,
    # e.g. for text made up only of non-ASCII characters, so it must be
    # checked before any str method is called on the result.
    sanitized_text = sanitize_text(text)
    if not sanitized_text:
        raise ValueError("Empty text passed to embed_ada()")
    sanitized_text = sanitized_text.replace("\n", " ").strip()

    # Embed the text
    response = openai.Embedding.create(
        input=sanitized_text,
        model="text-embedding-ada-002",
    )
    embedding = response["data"][0]["embedding"]
    return embedding


def embed_ada_list(text_list: List[str]):
    """
    Embed a list of text strings using the ADA model in a single request.

    Each entry is cleaned with sanitize_text() and has its newlines replaced
    by spaces. Entries that are blank, or that become empty after
    sanitization, are dropped, so the returned list can be shorter than
    text_list.

    Args:
        text_list (List[str]): The texts to embed.

    Returns:
        list: The "embedding" field of each item in the API response.

    Raises:
        TypeError: If text_list is not a list, or an entry is not a string.
        ValueError: If no entry is left to embed after sanitization.
    """
    if not isinstance(text_list, list):
        raise TypeError(
            "Text must be a list. Use embed_ada() to embed a single string.")

    sanitized_list = []
    for entry in text_list:
        if not isinstance(entry, str):
            raise TypeError("Every item in the list must be a string.")
        # Skip blank entries (including whitespace-only ones) outright.
        if not entry.strip():
            continue
        # sanitize_text() returns None when nothing survives cleaning (e.g.
        # an entry made up only of non-ASCII characters); drop those too
        # instead of calling str methods on None.
        cleaned = sanitize_text(entry)
        if cleaned:
            sanitized_list.append(cleaned.replace("\n", " ").strip())

    if not sanitized_list:
        raise ValueError("Empty list passed to embed_ada_list()")

    # Embed the text
    response = openai.Embedding.create(
        input=sanitized_list,
        model="text-embedding-ada-002",
    )
    embeddings = [item["embedding"] for item in response["data"]]
    return embeddings