# Table to LLM
- Will rip HTML from websites
- Raw HTML will be:
    - Processed to remove most tags using some Python
    - Porcessed directly by the LLM
- Chunk, embed and upsert the final data into a vector database
- Query it with user input

In [1]:
# Setup

import os
import re
import json
import uuid
import yaml
import tiktoken
import requests
import pandas as pd
from dotenv import load_dotenv
from bs4 import BeautifulSoup
import markdown
import chromadb
from chromadb.config import Settings

In [2]:
# Vars

load_dotenv()
openai_api_key = os.getenv('OPENAI_API_KEY')
together_api_key = os.getenv('TOGETHER_API_KEY')

In [3]:
# Data

data_types = ['txt', 'html', 'json', 'md', 'yaml']
file_df = pd.read_csv(r"..\Data\Other\html_data.csv")
file_df.head(3)

Unnamed: 0,Index,URL,Num of Tables,Pictures of Tables,Dynamic Dropdown,Note
0,1,https://support.microsoft.com/en-us/office/int...,1,0,No,-
1,2,https://support.microsoft.com/en-us/office/for...,1,0,No,-
2,3,https://support.microsoft.com/en-us/office/vid...,1,0,No,-


In [4]:
# ChromaDB

chroma_client = chromadb.Client(settings=Settings(allow_reset=True))
chroma_client.reset()
collection = chroma_client.create_collection(name="copilot")

## Functions

In [5]:
def fetch_html(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        else:
            print(f"Failed to retrieve HTML. Status code: {response.status_code}")
            return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

def get_url_from_index(search_index):
    row = file_df.loc[file_df['Index'] == search_index]
    if not row.empty:
        url_value = row['URL'].iloc[0]
        print("URL:", url_value)
        return url_value
    else:
        print("No URL found for the given condition.")
        return None

def get_all_urls():
    urls = file_df['URL'].tolist()
    return urls

In [6]:
# LLM API Functions

def make_openai_call(payload):
    headers = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {openai_api_key}',
    }
    response = requests.post('https://api.openai.com/v1/chat/completions', json=payload, headers=headers)
    
    result = response.json()

    input_tokens = result['usage']['prompt_tokens']
    output_tokens = result['usage']['completion_tokens']
    result_text = result['choices'][0]['message']['content']

    return input_tokens, output_tokens, result_text
    
def make_together_call(payload):
    headers = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {together_api_key}',
    }
    print(headers)
    response = requests.post('https://api.together.xyz/v1/chat/completions', json=payload, headers=headers)
    print(response)
    
    result = response.json()
    print(result)

    result_text = result['choices'][0]['message']['content']
    print(result_text)

    return result_text

In [7]:
# Chunking Related

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

def chunk_text(text, encoding_name, max_tokens=2000):
    """
    Splits text into chunks with each chunk having a maximum of max_tokens tokens.
    """
    chunks = []
    current_chunk = ""
    words = text.split()
    for word in words:
        # Simulate adding the word to the current chunk and check the token count
        test_chunk = current_chunk + " " + word if current_chunk else word
        if num_tokens_from_string(test_chunk, encoding_name) <= max_tokens:
            current_chunk = test_chunk
        else:
            # Current chunk is full, start a new one
            chunks.append(current_chunk)
            current_chunk = word
    # Add the last chunk if it's not empty
    if current_chunk:
        chunks.append(current_chunk)
    return chunks

In [8]:
def extract_tables_from_html(html_content: str) -> list[pd.DataFrame]:
    """Extract tables from HTML content and return them as a list of pandas DataFrames."""
    soup = BeautifulSoup(html_content, 'html.parser')
    return [pd.read_html(str(table))[0] for table in soup.find_all('table')]

def extract_tables_from_json(json_content: str) -> list[pd.DataFrame]:
    """Extract tables from JSON content and return them as a list of pandas DataFrames."""
    data = json.loads(json_content)
    return [pd.DataFrame(table_data) for table_data in data.values()]

def extract_tables_from_md(md_content: str) -> list[pd.DataFrame]:
    """Convert Markdown content to HTML and extract tables from it."""
    html_content = markdown.markdown(md_content)
    return extract_tables_from_html(html_content)

def extract_tables_from_file(html_text: str, source: str) -> list[pd.DataFrame]:
    """Extract tables from HTML content string based on its extension."""

    if source == "html":
        return extract_tables_from_html(html_text)
    elif source.endswith('.json'):
        return extract_tables_from_json(html_text)
    elif source.endswith('.md'):
        return extract_tables_from_md(html_text)
    else:
        raise ValueError("Unsupported file format")

In [9]:
def dataframe_to_text(dataframe: pd.DataFrame) -> str:
    """Convert a pandas DataFrame to a text format."""
    return dataframe.to_string(index=False)

def dataframe_to_html(dataframe: pd.DataFrame) -> str:
    """Convert a pandas DataFrame to HTML format."""
    return dataframe.to_html(index=False)

def dataframe_to_json(dataframe: pd.DataFrame) -> str:
    """Convert a pandas DataFrame to JSON format."""
    return dataframe.to_json(orient='records', indent=4)

def dataframe_to_md(dataframe: pd.DataFrame) -> str:
    """Convert a pandas DataFrame to Markdown format by first converting to HTML."""
    return markdown.markdown(dataframe.to_html(index=False))

def dataframe_to_yaml(dataframe: pd.DataFrame) -> str:
    """Convert a pandas DataFrame to YAML format."""
    return yaml.dump(dataframe.to_dict(orient='records'), default_flow_style=False)

def print_format_from_table(print_format: str, dataframe: pd.DataFrame) -> None:
    """Print or save a pandas DataFrame in the specified format."""
    format_functions = {
        'html': dataframe_to_html,
        'json': dataframe_to_json,
        'md': dataframe_to_md,
        'txt': dataframe_to_text,
        'yaml': dataframe_to_yaml
    }
    
    if print_format not in format_functions:
        raise ValueError("Unsupported save format")

    print_content = format_functions[print_format](dataframe)
    return print_content

In [10]:
# HTML Cleaning with Python or LLM

def python_strip_most_tags(html_content: str) -> str:
    # Remove most HTML tags and convert the table tag content into whatever format needed
    
    # Parse HTML content
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # Remove all tags except for <table> tags
    for tag in soup.find_all(True):
        if tag.name != 'table':
            tag.unwrap()

    # Get text
    cleaned_html = soup.get_text(separator=' ')
    cleaned_html = re.sub(r'\s+', ' ', cleaned_html).strip()

    return cleaned_html

def llm_strip_most_tags(html_content: str) -> str:
    # Remove most HTML tags and convert the table tag content into whatever format needed

    payload = {
        'messages': [
            {
                'role': 'system',
                'content': f"You are a helpful assistant that takes as input raw HTML and returns it cleaned with no HTML tags "
            },
            {
                'role': 'user', 
                'content': f"{html_content}"
            }
        ],
        'model': "openchat/openchat-3.5-1210",
        'max_tokens': 2048,
        "temperature": 0.0
    }
    print(payload)
    cleaned_html = make_together_call(payload)
    
    return cleaned_html

## Single HTML Cleaning

In [11]:
# Removes every new line or indent and strips a table's tags too

try:
    fetched_url = get_url_from_index(1)
    if fetched_url:
        html = fetch_html(fetched_url)
        if html:
            print("Got HTML dump!\n")
            # print(f"{html}\n")

            cleaned_html = python_strip_most_tags(html)
            print(f"{cleaned_html}\n")

except ValueError as e:
    print(e)

URL: https://support.microsoft.com/en-us/office/introduction-to-tables-78ff21ea-2f76-4fb0-8af6-c318d1ee0ea7
Got HTML dump!

Introduction to tables - Microsoft Support Skip to main content Microsoft Support Support Support Home Microsoft 365 Office Products Microsoft 365 Outlook Microsoft Teams OneDrive OneNote Windows Microsoft Edge more ... Devices Surface PC Accessories Mobile Xbox PC Gaming HoloLens Hardware warranties Account & billing Account Microsoft Store & billing Resources Install Microsoft 365 Community forums Microsoft 365 Admins Small Business Portal Developer Education Report a support scam More Buy Microsoft 365 All Microsoft Global Microsoft 365 Teams Copilot Windows Surface Xbox Deals Small Business Support Software Software Windows Apps AI Outlook OneDrive Microsoft Teams OneNote Microsoft Edge Skype PCs & Devices PCs & Devices Computers Shop Xbox Accessories VR & mixed reality Certified Refurbished Trade-in for cash Entertainment Entertainment Xbox Game Pass Ultimate

In [12]:
# Small HTML page is ~60k tokens which is beyond LLM's context window length

try:
    fetched_url = get_url_from_index(1)
    if fetched_url:
        html = fetch_html(fetched_url)
        if html:
            print("Got HTML dump!\n")
            # print(f"{html}\n")

            cleaned_html = llm_strip_most_tags(html)
            print(f"{cleaned_html}\n")

except ValueError as e:
    print(f"Error\n{e}\n")

URL: https://support.microsoft.com/en-us/office/introduction-to-tables-78ff21ea-2f76-4fb0-8af6-c318d1ee0ea7
Got HTML dump!

{'messages': [{'role': 'system', 'content': 'You are a helpful assistant that takes as input raw HTML and returns it cleaned with no HTML tags '}, {'role': 'user', 'content': '\r\n<!DOCTYPE html>\r\n<html lang="en-US" dir="ltr">\r\n<head>\r\n\t<meta charset="utf-8" />\r\n\t<meta name="viewport" content="width=device-width, initial-scale=1.0" />\r\n\t<title>Introduction to tables - Microsoft Support</title>\r\n\t\r\n\t\r\n\t\t<link rel="canonical" href="https://support.microsoft.com/en-us/office/introduction-to-tables-78ff21ea-2f76-4fb0-8af6-c318d1ee0ea7" />\r\n\t\t\r\n\t\t\t<link rel="alternate" hreflang="ar-SA" href="https://support.microsoft.com/ar-sa/topic/%D9%85%D9%82%D8%AF%D9%85%D8%A9-%D8%AD%D9%88%D9%84-%D8%A7%D9%84%D8%AC%D8%AF%D8%A7%D9%88%D9%84-78ff21ea-2f76-4fb0-8af6-c318d1ee0ea7" />\r\n\t\t\t<link rel="alternate" hreflang="bg-BG" href="https://support.micr

KeyError: 'choices'

## Multi URL Testing

In [16]:
# Retrieve all URLs, clean and append content, metadata and IDs

master_list = []

all_urls = get_all_urls()
for url in all_urls:
    html = fetch_html(url)
    if html:
        print(f"Got HTML dump!")

        cleaned_html = python_strip_most_tags(html)
        document_id = str(uuid.uuid4())
        meta = {"source": url}
        
        item = {
            "document": cleaned_html,
            "metadata": meta,
            "id": document_id
        }
        master_list.append(item)

        print(f"item: {item}\n")

Got HTML dump!
item: {'document': "Introduction to tables - Microsoft Support Skip to main content Microsoft Support Support Support Home Microsoft 365 Office Products Microsoft 365 Outlook Microsoft Teams OneDrive OneNote Windows Microsoft Edge more ... Devices Surface PC Accessories Mobile Xbox PC Gaming HoloLens Hardware warranties Account & billing Account Microsoft Store & billing Resources Install Microsoft 365 Community forums Microsoft 365 Admins Small Business Portal Developer Education Report a support scam More Buy Microsoft 365 All Microsoft Global Microsoft 365 Teams Copilot Windows Surface Xbox Deals Small Business Support Software Software Windows Apps AI Outlook OneDrive Microsoft Teams OneNote Microsoft Edge Skype PCs & Devices PCs & Devices Computers Shop Xbox Accessories VR & mixed reality Certified Refurbished Trade-in for cash Entertainment Entertainment Xbox Game Pass Ultimate PC Game Pass Xbox games PC and Windows games Movies & TV Business Business Microsoft Clo

In [17]:
# Split documents into chunks

encoding_name = "cl100k_base"
chunked_documents = []
for item in master_list:
    print(f"\nOriginal Item: {item}")
    chunks = chunk_text(item['document'], encoding_name, max_tokens=500)
    for i, chunk in enumerate(chunks):
        new_id = item['id']
        new_id = f"{new_id}_{i+1}"

        new_meta = item['metadata'].copy()
        new_meta["ind"] = i

        new_item = {
            "document": chunk,
            "metadata": new_meta,
            "id": new_id
        }
        print(f"New Item: {new_item}")

        chunked_documents.append(new_item)


Original Item: {'document': "Introduction to tables - Microsoft Support Skip to main content Microsoft Support Support Support Home Microsoft 365 Office Products Microsoft 365 Outlook Microsoft Teams OneDrive OneNote Windows Microsoft Edge more ... Devices Surface PC Accessories Mobile Xbox PC Gaming HoloLens Hardware warranties Account & billing Account Microsoft Store & billing Resources Install Microsoft 365 Community forums Microsoft 365 Admins Small Business Portal Developer Education Report a support scam More Buy Microsoft 365 All Microsoft Global Microsoft 365 Teams Copilot Windows Surface Xbox Deals Small Business Support Software Software Windows Apps AI Outlook OneDrive Microsoft Teams OneNote Microsoft Edge Skype PCs & Devices PCs & Devices Computers Shop Xbox Accessories VR & mixed reality Certified Refurbished Trade-in for cash Entertainment Entertainment Xbox Game Pass Ultimate PC Game Pass Xbox games PC and Windows games Movies & TV Business Business Microsoft Cloud Mi

In [18]:
documents_list = []
metadata_list = []
ids_list = []

for doc_info in chunked_documents:
    documents_list.append(doc_info['document'])
    metadata_list.append(doc_info['metadata'])
    ids_list.append(doc_info['id'])

print(f"Documents List: {documents_list}")
print(f"Metadata List: {metadata_list}")
print(f"IDs List: {ids_list}")

Documents List: ['Introduction to tables - Microsoft Support Skip to main content Microsoft Support Support Support Home Microsoft 365 Office Products Microsoft 365 Outlook Microsoft Teams OneDrive OneNote Windows Microsoft Edge more ... Devices Surface PC Accessories Mobile Xbox PC Gaming HoloLens Hardware warranties Account & billing Account Microsoft Store & billing Resources Install Microsoft 365 Community forums Microsoft 365 Admins Small Business Portal Developer Education Report a support scam More Buy Microsoft 365 All Microsoft Global Microsoft 365 Teams Copilot Windows Surface Xbox Deals Small Business Support Software Software Windows Apps AI Outlook OneDrive Microsoft Teams OneNote Microsoft Edge Skype PCs & Devices PCs & Devices Computers Shop Xbox Accessories VR & mixed reality Certified Refurbished Trade-in for cash Entertainment Entertainment Xbox Game Pass Ultimate PC Game Pass Xbox games PC and Windows games Movies & TV Business Business Microsoft Cloud Microsoft Secu

In [19]:
# Add data to ChromaDB

collection.add(
    documents=documents_list,
    metadatas=metadata_list,
    ids=ids_list
)

In [20]:
# Query ChromaDB

results = collection.query(
    query_texts=["table field"],
    n_results=3
)
results

{'ids': [['967f8557-1c00-4020-ad86-626bacc3d3c0_4',
   '0fe42cb1-2389-44be-bbf4-938308b5f077_2',
   '95d94e7e-41f5-46a5-a027-1a68c74f1f54_2']],
 'distances': [[0.919323742389679, 0.985133707523346, 1.0933334827423096]],
 'metadatas': [[{'ind': 3,
    'source': 'https://support.microsoft.com/en-us/office/compare-two-tables-in-access-and-find-only-matching-data-16f301ac-40c1-43bc-80db-263f9a51eb4f'},
   {'ind': 1,
    'source': 'https://support.microsoft.com/en-us/office/introduction-to-tables-78ff21ea-2f76-4fb0-8af6-c318d1ee0ea7'},
   {'ind': 1,
    'source': 'https://support.microsoft.com/en-us/office/format-a-table-e6e77bc6-1f4e-467e-b818-2e2acc488006'}]],
 'embeddings': None,
 'documents': [['Open a new or existing database. On the Create tab, in the Tables group, click Table . Access adds a new, blank table to your database. Note: You do not need to follow this step if you open a new, blank database, but you will need to follow it whenever you need to add a table to the database. Do

In [22]:
payload = {
    'messages': [
        {
            'role': 'system', 
            'content': f"You are a helpful assistant. You will be provided context to the question the user is asking. Without any prior knowledge assumptions ONLY answer their qustion."
        },
        {
            'role': 'user',
            'content': f"What type of field would be created if i enter orange?\n{results}"
        }
    ],
    'model': "gpt-3.5-turbo",
    'max_tokens': 128,
    "temperature": 0.0,
    'seed': 48
}
input_tokens, output_tokens, response = make_openai_call(payload)
print(f"input tokens: {input_tokens}")
print(f"output tokens: {output_tokens}")
print(f"response: {response}")

input tokens: 1855
output tokens: 33
response: A text field with a data type of "Short Text" would be created if you enter "orange" in a new column in Datasheet view in Access.

