# Table to LLM
- Will rip HTML from websites
- Raw HTML will be either:
    - Processed to remove most tags using some Python
    - Porcessed directly by the LLM
- Chunk, embed and upsert the final data into a vector database
- Query it with user input

In [35]:
# Setup

import os
import re
import json
import uuid
import yaml
import random
import tiktoken
import requests
import pandas as pd
from io import StringIO
from dotenv import load_dotenv
from bs4 import BeautifulSoup
import markdown
import chromadb
from chromadb.config import Settings

In [36]:
# Vars

load_dotenv()
azure_openai_api_key = os.getenv('AZURE_OPENAI_API_KEY')
openai_api_key = os.getenv('OPENAI_API_KEY')
together_api_key = os.getenv('TOGETHER_API_KEY')

In [37]:
# Data

data_types = ['txt', 'html', 'json', 'md', 'yaml']
file_df = pd.read_csv(r"..\Data\Other\html_data.csv")
file_df.head(3)

Unnamed: 0,Index,URL,Num of Tables,Pictures of Tables,Dynamic Dropdown,Note
0,1,https://support.microsoft.com/en-us/office/int...,1,0,No,-
1,2,https://support.microsoft.com/en-us/office/for...,1,0,No,-
2,3,https://support.microsoft.com/en-us/office/vid...,1,0,No,-


## Functions

In [39]:
# Data File Related

def fetch_html(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        else:
            print(f"Failed to retrieve HTML. Status code: {response.status_code}")
            return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

In [40]:
# LLM API Functions

def make_azure_openai_call(payload):
    headers = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {azure_openai_api_key}',
    }
    # TODO: Update the endpoint by deploying model resource on Azure
    response = requests.post('<endpoint>', json=payload, headers=headers)
    
    result = response.json()

    input_tokens = result['usage']['prompt_tokens']
    output_tokens = result['usage']['completion_tokens']
    result_text = result['choices'][0]['message']['content']

    return input_tokens, output_tokens, result_text

def make_openai_call(payload):
    headers = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {openai_api_key}',
    }
    response = requests.post('https://api.openai.com/v1/chat/completions', json=payload, headers=headers)
    
    result = response.json()

    input_tokens = result['usage']['prompt_tokens']
    output_tokens = result['usage']['completion_tokens']
    result_text = result['choices'][0]['message']['content']

    return input_tokens, output_tokens, result_text
    
def make_together_call(payload):
    headers = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {together_api_key}',
    }
    # print(headers)
    response = requests.post('https://api.together.xyz/v1/chat/completions', json=payload, headers=headers)
    # print(response)
    
    result = response.json()
    # print(result)

    result_text = result['choices'][0]['message']['content']
    # print(result_text)

    return result_text

In [41]:
# CUSTOM Chunking Related

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

def chunk_text(text, encoding_name, max_tokens=2000):
    """
    Splits text into chunks with each chunk having a maximum of max_tokens tokens.
    """
    chunks = []
    current_chunk = ""
    words = text.split()
    for word in words:
        # Simulate adding the word to the current chunk and check the token count
        test_chunk = current_chunk + " " + word if current_chunk else word
        if num_tokens_from_string(test_chunk, encoding_name) <= max_tokens:
            current_chunk = test_chunk
        else:
            # Current chunk is full, start a new one
            chunks.append(current_chunk)
            current_chunk = word
    # Add the last chunk if it's not empty
    if current_chunk:
        chunks.append(current_chunk)
    return chunks

def chunk_text_with_overlap(text, overlap_percentage=10, chunk_size=1024):
    # Example chunking logic with overlap, adjust as necessary
    chunks = []
    overlap_size = int(chunk_size * (overlap_percentage / 100))
    start = 0
    while start < len(text):
        end = min(start + chunk_size, len(text))
        chunks.append(text[start:end])
        start += (chunk_size - overlap_size)
    return chunks

In [42]:
# Extract DF from Formats

def extract_tables_from_html(html_content: str) -> pd.DataFrame:
    """Extract tables from HTML content and return them as a list of pandas DataFrames."""
    soup = BeautifulSoup(html_content, 'html.parser')
    html_io = StringIO(html_content)
    return pd.read_html(html_io)[0]
    # return [pd.read_html(str(table))[0] for table in soup.find_all('table')]

def extract_tables_from_json(json_content: str) -> pd.DataFrame:
    """Extract tables from JSON content and return them as a list of pandas DataFrames."""
    data = json.loads(json_content)
    return pd.DataFrame(list(data.values())[0])
    # return [pd.DataFrame(table_data) for table_data in data.values()]

def extract_tables_from_md(md_content: str) -> pd.DataFrame:
    """Convert Markdown content to HTML and extract tables from it."""
    html_content = markdown.markdown(md_content)
    return extract_tables_from_html(html_content)
    # return extract_tables_from_html(html_content)

def extract_tables_from_file(html_text: str, source: str) -> pd.DataFrame:
    """Extract tables from HTML content string based on its extension."""

    if source == "html":
        return extract_tables_from_html(html_text)
    elif source.endswith('.json'):
        return extract_tables_from_json(html_text)
    elif source.endswith('.md'):
        return extract_tables_from_md(html_text)
    else:
        raise ValueError("Unsupported file format")

In [43]:
# Convert DF to Formats

def dataframe_to_text(dataframe: pd.DataFrame) -> str:
    """Convert a pandas DataFrame to a text format."""
    return dataframe.to_string(index=False)

def dataframe_to_html(dataframe: pd.DataFrame) -> str:
    """Convert a pandas DataFrame to HTML format."""
    return dataframe.to_html(index=False)

def dataframe_to_json(dataframe: pd.DataFrame) -> str:
    """Convert a pandas DataFrame to JSON format."""
    return dataframe.to_json(orient='records', indent=4)

def dataframe_to_md(dataframe: pd.DataFrame) -> str:
    """Convert a pandas DataFrame to Markdown format by first converting to HTML."""
    return markdown.markdown(dataframe.to_html(index=False))

def dataframe_to_yaml(dataframe: pd.DataFrame) -> str:
    """Convert a pandas DataFrame to YAML format."""
    return yaml.dump(dataframe.to_dict(orient='records'), default_flow_style=False)

def print_format_from_table(print_format: str, dataframe: pd.DataFrame) -> None:
    """Print or save a pandas DataFrame in the specified format."""
    format_functions = {
        'html': dataframe_to_html,
        'json': dataframe_to_json,
        'md': dataframe_to_md,
        'txt': dataframe_to_text,
        'yaml': dataframe_to_yaml
    }
    
    if print_format not in format_functions:
        raise ValueError("Unsupported save format")

    print_content = format_functions[print_format](dataframe)
    return print_content

In [44]:
# HTML Processing with Python/LLM

def python_strip_most_tags(html_content: str) -> str:
    # Remove most HTML tags and convert the table tag content into whatever format needed
    
    # Parse HTML content
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # Remove all tags except for <table> tags
    for tag in soup.find_all(True):
        if tag.name != 'table':
            tag.unwrap()

    # Get text
    cleaned_html = soup.get_text(separator=' ')
    cleaned_html = re.sub(r'\s+', ' ', cleaned_html).strip()

    return cleaned_html

def llm_strip_most_tags(html_content: str) -> str:
    # Remove most HTML tags and convert the table tag content into whatever format needed

    num_tokens_in_html = num_tokens_from_string(html_content, "cl100k_base")
    print(F"HTML contains {num_tokens_in_html} tokens")

    # Call split_html function to split html_content
    html_parts = chunk_text(html_content, "cl100k_base", 2048)
    
    # Initialize cleaned_html variable to store concatenated results
    cleaned_html = ''
    
    # Loop through each part of the html_content
    for part in html_parts:
        payload = {
            'messages': [
                {
                    'role': 'system',
                    'content': f"You are a system that takes in raw HTML and processes it to remove all tags and format the data in plain text. Write the headers correctly and then just print the row values."
                },
                {
                    'role': 'user', 
                    'content': f"{part}"
                }
            ],
            'model': "gpt-3.5-turbo",
            'max_tokens': 2048,
            "temperature": 0.0,
            "seed": 48
        }
        print(payload)
        cleaned_html += make_openai_call(payload)
    
    return cleaned_html    

## Saving HTML Data

In [45]:
# Pull all URLs from file
file_df = pd.read_csv(r"..\Data\Other\html_data.csv")
print(f"CSV Data\n{file_df.head(3)}\n")

all_urls = file_df['URL'].tolist()
print(f"All URLs\n{all_urls}")

CSV Data
   Index                                                URL  Num of Tables  \
0      1  https://support.microsoft.com/en-us/office/int...              1   
1      2  https://support.microsoft.com/en-us/office/for...              1   
2      3  https://support.microsoft.com/en-us/office/vid...              1   

   Pictures of Tables Dynamic Dropdown Note  
0                   0               No    -  
1                   0               No    -  
2                   0               No    -  

All URLs
['https://support.microsoft.com/en-us/office/introduction-to-tables-78ff21ea-2f76-4fb0-8af6-c318d1ee0ea7', 'https://support.microsoft.com/en-us/office/format-a-table-e6e77bc6-1f4e-467e-b818-2e2acc488006', 'https://support.microsoft.com/en-us/office/video-get-started-with-table-relationships-728d53ff-f332-4ac6-9382-574ee271500a', 'https://support.microsoft.com/en-us/office/resize-a-table-column-or-row-9340d478-21be-4392-81cf-488f7bbd6715', 'https://support.microsoft.com/en-us/offi

In [46]:
# Save HTML content from each URL to a file
html_output_path = r"..\Data\HTML"
for index, single_url in enumerate(all_urls, start=1):
    html_content = fetch_html(single_url)
    if html_content:
        file_name = f"page_{index}.html"
        file_path = os.path.join(html_output_path, file_name)
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(html_content)
        print(f"Index #{index} - URL {single_url} - HTML content saved to {file_path}")

Index #1 - URL https://support.microsoft.com/en-us/office/introduction-to-tables-78ff21ea-2f76-4fb0-8af6-c318d1ee0ea7 - HTML content saved to ..\Data\HTML\page_1.html
Index #2 - URL https://support.microsoft.com/en-us/office/format-a-table-e6e77bc6-1f4e-467e-b818-2e2acc488006 - HTML content saved to ..\Data\HTML\page_2.html
Index #3 - URL https://support.microsoft.com/en-us/office/video-get-started-with-table-relationships-728d53ff-f332-4ac6-9382-574ee271500a - HTML content saved to ..\Data\HTML\page_3.html
Index #4 - URL https://support.microsoft.com/en-us/office/resize-a-table-column-or-row-9340d478-21be-4392-81cf-488f7bbd6715 - HTML content saved to ..\Data\HTML\page_4.html
Index #5 - URL https://support.microsoft.com/en-us/office/using-structured-references-with-excel-tables-f5ed2452-2337-4f71-bed3-c8ae6d2b276e - HTML content saved to ..\Data\HTML\page_5.html
Index #6 - URL https://support.microsoft.com/en-us/office/how-can-i-merge-two-or-more-tables-c80a9fce-c1ab-4425-bb96-497dd90

## RAG - Default Chunking Across All Articles

In [52]:
def process_html_default(raw_html, output_format, summarize=False):
    soup = BeautifulSoup(raw_html, 'html.parser')
    
    # Remove all tags from the soup and keep only text
    text_only = soup.get_text(separator=' ', strip=True)

    return text_only

In [53]:
url_index = 0
master_list = []
html_folder_path = r"..\Data\HTML"
for file_name in os.listdir(html_folder_path):
    file_path = os.path.join(html_folder_path, file_name)
    with open(file_path, 'r', encoding='utf-8') as file:
        html = file.read()
    
    # Get full HTML article formatted
    cleaned_html = process_html_default(html, "txt", False)

    # Chunk the remaining HTML with overlap
    remaining_chunks = chunk_text_with_overlap(cleaned_html, overlap_percentage=0, chunk_size=256)
    for i, chunk in enumerate(remaining_chunks):
        new_id = str(uuid.uuid4())
        meta = {
            "source": file_path,
            "url": all_urls[url_index],
            "chunk_index": i,
            "index": url_index
        }

        new_item = {
            "document": chunk,
            "metadata": meta,
            "id": new_id
        }
        master_list.append(new_item)
    
    url_index += 1 

## RAG - Custom Chunking For Tables

In [57]:
def process_html(raw_html, output_format, summarize=False):
    soup = BeautifulSoup(raw_html, 'html.parser')

    # Find all table elements
    tables = soup.find_all('table')
    print(f"Num of Table Elements: {len(tables)}")
    
    top_level_tables = [table for table in tables if not table.find_parent('table')]
    print(f"Num of Parent Table Elements: {len(top_level_tables)}")

    # Filter out tables that are just wrappers for images or have insufficient content
    tables_with_content = []
    for table in top_level_tables:
        # Count all the elements within each table
        all_elements = table.find_all(True)
        # Count all the img tags within each table
        img_elements = table.find_all('img')
        
        # Count non-empty cells and img wrappers
        non_empty_cells = [cell for cell in table.find_all('td') if cell.get_text(strip=True)]
        img_wrappers = [cell for cell in table.find_all('td') if cell.find('img')]
        
        # Adjust condition to filter out tables that are primarily image wrappers or have insufficient content
        if len(non_empty_cells) > 0 and len(all_elements) - len(img_elements) > len(img_wrappers):
            tables_with_content.append(table)
    print(f"Num of Filtered Parent Table Elements: {len(tables_with_content)}")

    table_formatted = []
    for i, table in enumerate(tables_with_content):
        print(f"Table {i}")
        
        # Using table's HTML format instead of summaryu
        soup_table_df = extract_tables_from_file(str(table), "html")
        # print(soup_table_df)
        
        soup_table_format = print_format_from_table(output_format, soup_table_df)
        # print(soup_table_format)

        if not summarize:
            table_formatted.append(soup_table_format)
        else:
            payload = {
                'messages': [
                    {
                        'role': 'system',
                        'content': f"You are a helpful assistant that takes as input raw dump of a table and summarizes it. Create a title for the table and explain each row in a line of text."
                    },
                    {
                        'role': 'user', 
                        'content': f"{soup_table_format}"
                    }
                ],
                'model': "openchat/openchat-3.5-1210",
                'max_tokens': 2048,
                "temperature": 0.0
            }
            soup_table_summary = make_together_call(payload)
            # print(f"soup_table_summary\n{soup_table_summary}")

            table_formatted.append(soup_table_summary)

        # You might want to remove the table or replace it with a placeholder
        table.extract()

    # Remove all tags from the soup and keep only text
    text_only = soup.get_text(separator=' ', strip=True)

    return text_only, table_formatted

def insert_random(string, fixed_string):
    # Generate a random index within the range of the existing string
    index = random.randint(0, len(string))
    
    # Insert the fixed string at the random index
    new_string = string[:index] + fixed_string + string[index:]
    
    return new_string

In [58]:
url_index = 0
master_list = []
html_folder_path = r"..\Data\HTML"
for file_name in os.listdir(html_folder_path):
    file_path = os.path.join(html_folder_path, file_name)
    with open(file_path, 'r', encoding='utf-8') as file:
        html = file.read()
    
    # Update to change summarized vs raw table
    cleaned_html, table_summaries = process_html(html, "txt", False)
    document_id = str(uuid.uuid4())
    meta = {
        "source": file_path,
        "url": all_urls[url_index]
    }

    # Handle table summaries as separate chunks
    for i, summary in enumerate(table_summaries):
        item = {
            "document": summary,
            "metadata": {
                "source": file_path, 
                "url": all_urls[url_index],
                "type": "table_summary", 
                "table_index": i
            },
            "id": f"{document_id}_table_{i}"
        }
        print(f"item\n{item}")
        master_list.append(item)

    # Chunk the remaining HTML with overlap
    remaining_chunks = chunk_text_with_overlap(cleaned_html, overlap_percentage=10)
    for i, chunk in enumerate(remaining_chunks):
        new_id = f"{document_id}_{i+len(table_summaries)}"  # Adjust index based on table summaries
        new_meta = meta.copy()
        new_meta["ind"] = i
        
        # Use insert_random to update the chunk
        distractor_string = f"""
        Genghis's senior nökod were appointed to the highest ranks and received the greatest honours. 
        Bo'orchu and Muqali were each given ten thousand men to lead as commanders of the right and left wings of the army respectively
        """
        distractor_insertion_perc = 0
        if random.randint(1, 100) <= distractor_insertion_perc:
            chunk = insert_random(chunk, distractor_string.strip())  

        new_item = {
            "document": chunk,
            "metadata": new_meta,
            "id": new_id
        }
        master_list.append(new_item)
    
    url_index += 1 

Num of Table Elements: 2
Num of Parent Table Elements: 1
Num of Filtered Parent Table Elements: 1
Table 0
item
{'document': '                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        If you enter: Access creates a field with a data type of:\nIf you enter: Access creates a field with a data type of: John Short Text http://www.contoso.com You can use any valid Internet protocol prefix. For example, http://, https://, and mailto: are valid p

## Data Querying

In [60]:
# Split into 3 separate lists
documents_list = []
metadata_list = []
ids_list = []
for doc_info in master_list:
    documents_list.append(doc_info['document'])
    metadata_list.append(doc_info['metadata'])
    ids_list.append(doc_info['id'])
print(f"Documents List: {documents_list}")
print(f"Metadata List: {metadata_list}")
print(f"IDs List: {ids_list}")

Metadata List: [{'source': '..\\Data\\HTML\\page_1.html', 'url': 'https://support.microsoft.com/en-us/office/introduction-to-tables-78ff21ea-2f76-4fb0-8af6-c318d1ee0ea7', 'type': 'table_summary', 'table_index': 0}, {'source': '..\\Data\\HTML\\page_1.html', 'url': 'https://support.microsoft.com/en-us/office/introduction-to-tables-78ff21ea-2f76-4fb0-8af6-c318d1ee0ea7', 'ind': 0}, {'source': '..\\Data\\HTML\\page_1.html', 'url': 'https://support.microsoft.com/en-us/office/introduction-to-tables-78ff21ea-2f76-4fb0-8af6-c318d1ee0ea7', 'ind': 1}, {'source': '..\\Data\\HTML\\page_1.html', 'url': 'https://support.microsoft.com/en-us/office/introduction-to-tables-78ff21ea-2f76-4fb0-8af6-c318d1ee0ea7', 'ind': 2}, {'source': '..\\Data\\HTML\\page_1.html', 'url': 'https://support.microsoft.com/en-us/office/introduction-to-tables-78ff21ea-2f76-4fb0-8af6-c318d1ee0ea7', 'ind': 3}, {'source': '..\\Data\\HTML\\page_1.html', 'url': 'https://support.microsoft.com/en-us/office/introduction-to-tables-78ff2

In [61]:
# Add data to ChromaDB
chroma_client = chromadb.Client(settings=Settings(allow_reset=True))
chroma_client.reset()
collection = chroma_client.create_collection(name="copilot")
collection.add(
    documents=documents_list,
    metadatas=metadata_list,
    ids=ids_list
)
print(f"Inserted into vector database!")

Inserted into vector database!


In [63]:
# Final Query and Result

# query = f"what is the fat rate for Louisiana state"
# query = f"how many states have fat rate below 40"
# query = f"what is the % of population between 14-21 in the year 2020?"
query =f"details of Article Property and its search query"

# TODO: Have a way to flter by metadata for cases where we want to answer something like how many tables in <url>
results = collection.query(
    query_texts=query,
    n_results=5
)
print(f"results:\n{results}\n")

model_name = "gpt-3.5-turbo"
payload = {
    'messages': [
        {
            'role': 'system', 
            'content': f"You are a helpful assistant. Without any prior context answer the user's qustion based on the context provided only."
        },
        {
            'role': 'user',
            'content': f"{query}\nContext\n{results}"
        }
    ],
    'model': model_name,
    'max_tokens': 256,
    "temperature": 0.0,
    'seed': 48
}
input_tokens, output_tokens, response = make_openai_call(payload)
print(f"input tokens: {input_tokens}")
print(f"output tokens: {output_tokens}")
print(f"response: {response}")

results:
{'ids': [['580bd5a0-c319-405f-8d6d-881ec33a2a22_table_5', '580bd5a0-c319-405f-8d6d-881ec33a2a22_23', '580bd5a0-c319-405f-8d6d-881ec33a2a22_table_1', '580bd5a0-c319-405f-8d6d-881ec33a2a22_16', '580bd5a0-c319-405f-8d6d-881ec33a2a22_17']], 'distances': [[0.8677704930305481, 0.9683190584182739, 1.0506821870803833, 1.0932114124298096, 1.0996675491333008]], 'metadatas': [[{'source': '..\\Data\\HTML\\page_23.html', 'table_index': 5, 'type': 'table_summary', 'url': 'https://support.microsoft.com/en-us/office/what-is-the-relationships-window-20c66348-854f-47f8-819a-5cce2518ee1d'}, {'ind': 15, 'source': '..\\Data\\HTML\\page_23.html', 'url': 'https://support.microsoft.com/en-us/office/what-is-the-relationships-window-20c66348-854f-47f8-819a-5cce2518ee1d'}, {'source': '..\\Data\\HTML\\page_23.html', 'table_index': 1, 'type': 'table_summary', 'url': 'https://support.microsoft.com/en-us/office/what-is-the-relationships-window-20c66348-854f-47f8-819a-5cce2518ee1d'}, {'ind': 8, 'source': '..