# Overall

## Setup

In [12]:
import os
import re
import glob
import json
import uuid
import yaml
import tiktoken
import requests
import pandas as pd
from io import StringIO
from dotenv import load_dotenv
from openai import AzureOpenAI
from bs4 import BeautifulSoup
import markdown
import chromadb
from chromadb.config import Settings

In [13]:
# Vars

load_dotenv()
azure_api_key = os.getenv('AZURE_API_KEY')
openai_api_key = os.getenv('OPENAI_API_KEY')
together_api_key = os.getenv('TOGETHER_API_KEY')

## Data

In [14]:
def fetch_html(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        else:
            print(f"Failed to retrieve HTML. Status code: {response.status_code}")
            return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

In [15]:
# Pull all URLs from file
file_df = pd.read_csv(r"..\Data\Other\html_data.csv")
print(f"CSV Data\n{file_df.head(3)}\n")

all_urls = file_df['URL'].tolist()
print(f"All URLs\n{all_urls}")

CSV Data
   Index                                                URL  Num of Tables  \
0      1  https://support.microsoft.com/en-us/office/int...              1   
1      2  https://support.microsoft.com/en-us/office/for...              1   
2      3  https://support.microsoft.com/en-us/office/vid...              1   

   Pictures of Tables Dynamic Dropdown Note  
0                   0               No    -  
1                   0               No    -  
2                   0               No    -  

All URLs
['https://support.microsoft.com/en-us/office/introduction-to-tables-78ff21ea-2f76-4fb0-8af6-c318d1ee0ea7', 'https://support.microsoft.com/en-us/office/format-a-table-e6e77bc6-1f4e-467e-b818-2e2acc488006', 'https://support.microsoft.com/en-us/office/video-get-started-with-table-relationships-728d53ff-f332-4ac6-9382-574ee271500a', 'https://support.microsoft.com/en-us/office/resize-a-table-column-or-row-9340d478-21be-4392-81cf-488f7bbd6715', 'https://support.microsoft.com/en-us/offi

In [16]:
# Save HTML content from each URL to a file
html_output_path = r"..\Data\HTML"
for index, single_url in enumerate(all_urls, start=1):
    html_content = fetch_html(single_url)
    if html_content:
        file_name = f"page_{index}.html"
        file_path = os.path.join(html_output_path, file_name)
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(html_content)
        print(f"HTML content saved to {file_path}")

HTML content saved to ..\Data\HTML\page_1.html
HTML content saved to ..\Data\HTML\page_2.html
HTML content saved to ..\Data\HTML\page_3.html
HTML content saved to ..\Data\HTML\page_4.html
HTML content saved to ..\Data\HTML\page_5.html
HTML content saved to ..\Data\HTML\page_6.html
HTML content saved to ..\Data\HTML\page_7.html
HTML content saved to ..\Data\HTML\page_8.html
HTML content saved to ..\Data\HTML\page_9.html
HTML content saved to ..\Data\HTML\page_10.html
HTML content saved to ..\Data\HTML\page_11.html
HTML content saved to ..\Data\HTML\page_12.html
HTML content saved to ..\Data\HTML\page_13.html
HTML content saved to ..\Data\HTML\page_14.html
HTML content saved to ..\Data\HTML\page_15.html
HTML content saved to ..\Data\HTML\page_16.html
HTML content saved to ..\Data\HTML\page_17.html
HTML content saved to ..\Data\HTML\page_18.html
Failed to retrieve HTML. Status code: 403
HTML content saved to ..\Data\HTML\page_20.html
HTML content saved to ..\Data\HTML\page_21.html
Failed 

## Functions

In [17]:
# LLM API Functions

def make_azure_call(payload):
    client = AzureOpenAI(
        azure_endpoint = "https://copilotcapstone-team11.openai.azure.com/", 
        api_key=azure_api_key,  
        api_version="2024-02-15-preview"
    )

    message_text = [
        {"role":"system","content":"You are an AI assistant that helps people find information."},
        {"role":"user","content":"Hello"}
    ]

    response = client.chat.completions.create(
        model = payload["model"],
        messages = payload["messages"],
        temperature = payload["temperature"],
        max_tokens = payload["max_tokens"]
    )
    # print(f"response: {response}")

    input_tokens = response.usage.prompt_tokens
    # print(f"input_tokens: {response}")
    output_tokens = response.usage.completion_tokens
    # print(f"output_tokens: {output_tokens}")
    result_text = response.choices[0].message.content
    # print(f"result_text: {result_text}")

    return input_tokens, output_tokens, result_text

def make_openai_call(payload):
    headers = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {openai_api_key}',
    }
    response = requests.post('https://api.openai.com/v1/chat/completions', json=payload, headers=headers)
    
    result = response.json()

    input_tokens = result['usage']['prompt_tokens']
    output_tokens = result['usage']['completion_tokens']
    result_text = result['choices'][0]['message']['content']

    return input_tokens, output_tokens, result_text
    
def make_together_call(payload):
    headers = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {together_api_key}',
    }
    # print(headers)
    response = requests.post('https://api.together.xyz/v1/chat/completions', json=payload, headers=headers)
    # print(response)
    
    result = response.json()
    # print(result)

    result_text = result['choices'][0]['message']['content']
    # print(result_text)

    return result_text

In [59]:
# Table Related

def clean_html(raw_html):
    soup = BeautifulSoup(raw_html, 'html.parser')
    
    # Get the text without tags
    clean_text = soup.get_text(separator=' ')
    
    return clean_text

def process_html_for_tables(raw_html):
    soup = BeautifulSoup(raw_html, 'html.parser')

    # Find all table elements
    tables = soup.find_all('table')
    print(f"Num of Table Elements: {len(tables)}")
    
    top_level_tables = [table for table in tables if not table.find_parent('table')]
    print(f"Num of Parent Table Elements: {len(top_level_tables)}")

    # Filter out tables that are just wrappers for images or have insufficient content
    tables_with_content = []
    for table in top_level_tables:
        # Count all the elements within each table
        all_elements = table.find_all(True)
        # Count all the img tags within each table
        img_elements = table.find_all('img')
        
        # Count non-empty cells and img wrappers
        non_empty_cells = [cell for cell in table.find_all('td') if cell.get_text(strip=True)]
        img_wrappers = [cell for cell in table.find_all('td') if cell.find('img')]
        
        # Adjust condition to filter out tables that are primarily image wrappers or have insufficient content
        if len(non_empty_cells) > 0 and len(all_elements) - len(img_elements) > len(img_wrappers):
            tables_with_content.append(table)
    print(f"Num of Filtered Parent Table Elements: {len(tables_with_content)}\n")

    return tables_with_content

def process_html(raw_html, output_format, summarize=False):
    soup = BeautifulSoup(raw_html, 'html.parser')

    # Find all table elements
    tables = soup.find_all('table')
    print(f"Num of Table Elements: {len(tables)}")
    
    top_level_tables = [table for table in tables if not table.find_parent('table')]
    print(f"Num of Parent Table Elements: {len(top_level_tables)}")

    # Filter out tables that are just wrappers for images or have insufficient content
    tables_with_content = []
    for table in top_level_tables:
        # Count all the elements within each table
        all_elements = table.find_all(True)
        # Count all the img tags within each table
        img_elements = table.find_all('img')
        
        # Count non-empty cells and img wrappers
        non_empty_cells = [cell for cell in table.find_all('td') if cell.get_text(strip=True)]
        img_wrappers = [cell for cell in table.find_all('td') if cell.find('img')]
        
        # Adjust condition to filter out tables that are primarily image wrappers or have insufficient content
        if len(non_empty_cells) > 0 and len(all_elements) - len(img_elements) > len(img_wrappers):
            tables_with_content.append(table)
    print(f"Num of Filtered Parent Table Elements: {len(tables_with_content)}")

    table_formatted = []
    for i, table in enumerate(tables_with_content):
        print(f"Table {i}")
        
        # Using table's HTML format instead of summaryu
        soup_table_df = extract_tables_from_file(str(table), "html")
        # print(soup_table_df)
        
        soup_table_format = print_format_from_table(output_format, soup_table_df)
        # print(soup_table_format)

        if not summarize:
            table_formatted.append(soup_table_format)
        else:
            payload = {
                'messages': [
                    {
                        'role': 'system',
                        'content': f"You are a helpful assistant that takes as input raw dump of a table and summarizes it. Create a title for the table and explain each row in a line of text."
                    },
                    {
                        'role': 'user', 
                        'content': f"{soup_table_format}"
                    }
                ],
                'model': "openchat/openchat-3.5-1210",
                'max_tokens': 2048,
                "temperature": 0.0
            }
            soup_table_summary = make_together_call(payload)
            # print(f"soup_table_summary\n{soup_table_summary}")

            table_formatted.append(soup_table_summary)

        # You might want to remove the table or replace it with a placeholder
        table.extract()

    # Remove all tags from the soup and keep only text
    text_only = soup.get_text(separator=' ', strip=True)

    return text_only, table_formatted

def extract_tables_from_file(html_text: str, source: str) -> pd.DataFrame:
    """Extract tables from HTML content string based on its extension."""

    if source == "html":
        return extract_tables_from_html(html_text)
    elif source.endswith('.json'):
        return extract_tables_from_json(html_text)
    elif source.endswith('.md'):
        return extract_tables_from_md(html_text)
    else:
        raise ValueError("Unsupported file format")
def extract_tables_from_html(html_content: str) -> pd.DataFrame:
    """Extract tables from HTML content and return them as a list of pandas DataFrames."""
    soup = BeautifulSoup(html_content, 'html.parser')
    html_io = StringIO(html_content)
    return pd.read_html(html_io)[0]
    # return [pd.read_html(str(table))[0] for table in soup.find_all('table')]
def extract_tables_from_json(json_content: str) -> pd.DataFrame:
    """Extract tables from JSON content and return them as a list of pandas DataFrames."""
    data = json.loads(json_content)
    return pd.DataFrame(list(data.values())[0])
    # return [pd.DataFrame(table_data) for table_data in data.values()]
def extract_tables_from_md(md_content: str) -> pd.DataFrame:
    """Convert Markdown content to HTML and extract tables from it."""
    html_content = markdown.markdown(md_content)
    return extract_tables_from_html(html_content)
    # return extract_tables_from_html(html_content)

def print_format_from_table(print_format: str, dataframe: pd.DataFrame) -> None:
    """Print or save a pandas DataFrame in the specified format."""
    format_functions = {
        'html': dataframe_to_html,
        'json': dataframe_to_json,
        'md': dataframe_to_md,
        'txt': dataframe_to_text,
        'yaml': dataframe_to_yaml
    }
    
    if print_format not in format_functions:
        raise ValueError("Unsupported save format")

    print_content = format_functions[print_format](dataframe)
    return print_content
def dataframe_to_text(dataframe: pd.DataFrame) -> str:
    """Convert a pandas DataFrame to a text format."""
    return dataframe.to_string(index=False)
def dataframe_to_html(dataframe: pd.DataFrame) -> str:
    """Convert a pandas DataFrame to HTML format."""
    return dataframe.to_html(index=False)
def dataframe_to_json(dataframe: pd.DataFrame) -> str:
    """Convert a pandas DataFrame to JSON format."""
    return dataframe.to_json(orient='records', indent=4)
def dataframe_to_md(dataframe: pd.DataFrame) -> str:
    """Convert a pandas DataFrame to Markdown format by first converting to HTML."""
    return markdown.markdown(dataframe.to_html(index=False))
def dataframe_to_yaml(dataframe: pd.DataFrame) -> str:
    """Convert a pandas DataFrame to YAML format."""
    return yaml.dump(dataframe.to_dict(orient='records'), default_flow_style=False)

def extract_table_data(table):
    headers = [th.get_text(strip=True) for th in table.find_all('th')]
    rows = [[td.get_text(strip=True) for td in tr.find_all('td')] for tr in table.find_all('tr')]
    
    # Clean rows from empty values or rows
    cleaned_rows = [row for row in rows if any(row)]
    
    # Adjusting for tables without header rows, using column indexes as headers
    if not headers and cleaned_rows:
        headers = [f"Column {i+1}" for i in range(len(cleaned_rows[0]))]
    
    return headers, cleaned_rows
def convert_to_json(headers, rows):
    data_list = []
    for row in rows:
        row_data = {headers[i]: cell for i, cell in enumerate(row)}
        data_list.append(row_data)
    return json.dumps(data_list, indent=2)
def convert_to_markdown(headers, rows):
    md_string = "| " + " | ".join(headers) + " |\n"
    md_string += "| " + " | ".join(["---"] * len(headers)) + " |\n"
    for row in rows:
        md_string += "| " + " | ".join(row) + " |\n"
    return md_string
def convert_to_yaml(headers, rows):
    data_list = []
    for row in rows:
        row_data = {headers[i]: cell for i, cell in enumerate(row)}
        data_list.append(row_data)
    return yaml.dump(data_list)
def convert_to_txt(headers, rows):
    max_lengths = [max(len(headers[i]), max(len(row[i]) for row in rows)) for i in range(len(headers))]
    txt_string = " | ".join(headers[i].ljust(max_lengths[i]) for i in range(len(headers))) + "\n"
    txt_string += "-+-".join("-" * max_lengths[i] for i in range(len(headers))) + "\n"
    for row in rows:
        txt_string += " | ".join(row[i].ljust(max_lengths[i]) for i in range(len(row))) + "\n"
    return txt_string

def table_to_format(table_text: str, source_format: str, output_format: str) -> pd.DataFrame:
    """Extract tables from HTML content string based on its extension."""

    # print(f"source_format: {source_format} | output_format: {output_format}\ntable_text\n{table_text}\n")

    headers, rows = extract_table_data(table_text)  # Using the first suitable table

    if output_format == 'json':
        return convert_to_json(headers, rows)
    elif output_format == 'md' or output_format == 'markdown':
        return convert_to_markdown(headers, rows)
    elif output_format == 'yaml':
        return convert_to_yaml(headers, rows)
    elif output_format == 'txt':
        return convert_to_txt(headers, rows)
    else:
        return "Unsupported format requested."
    

In [19]:
# Token Related

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

In [20]:
# Chunking Related

def chunk_text_default(text, chunk_size=1024):
    chunks = []
    start = 0
    while start < len(text):
        end = min(start + chunk_size, len(text))
        chunks.append(text[start:end])
        start += chunk_size
    return chunks

def chunk_text_with_overlap(text, chunk_size=1024, overlap_percentage=10):
    # Example chunking logic with overlap, adjust as necessary
    chunks = []
    overlap_size = int(chunk_size * (overlap_percentage / 100))
    start = 0
    while start < len(text):
        end = min(start + chunk_size, len(text))
        chunks.append(text[start:end])
        start += (chunk_size - overlap_size)
    return chunks

## Approaches

Single Table - NonRAG - Raw HTML

In [40]:
# Vars
file_name = "page_19.html"
table_num = 0
user_query = f"""
what is the fat rate for Louisiana state
"""
model_choice = "Copilotteam11"
system_prompt = f"""
You are a helpful assistant. Without any prior context answer the user's qustion based on the context provided only.
"""


# Loading Table from File
file_path = os.path.join(r"..\Data\HTML", file_name)
with open(file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()
all_table_contents = process_html_for_tables(html_content)
table_contents = all_table_contents[table_num]
print(f"Selected Raw Table")
# print(f"{table_contents}\n")
num_tokens_in_html = num_tokens_from_string(str(table_contents), "cl100k_base")
print(F"Selected table contains: {num_tokens_in_html} tokens\n")


# Result
payload = {
    'messages': [
        {
            'role': 'system', 
            'content': system_prompt.strip()
        },
        {
            'role': 'user',
            'content': f"""
                {user_query}
                Context
                {table_contents}
            """.strip()
        }
    ],
    'model': model_choice,
    'max_tokens': 256,
    "temperature": 0.0,
    'seed': 48
}
input_tokens, output_tokens, response = make_azure_call(payload)
print(f"input tokens: {input_tokens}\noutput tokens: {output_tokens}\n")
print(f"response: {response}")

Num of Table Elements: 1
Num of Parent Table Elements: 1
Num of Filtered Parent Table Elements: 1

Selected Raw Table
Selected table contains: 388 tokens

input tokens: 439
output tokens: 12

response: The obesity rate for Louisiana state is 35.9%.


Single Table - NonRAG - Formatted HTML to MD/JSON/YAML

In [41]:
# Vars
file_name = "page_19.html"
table_num = 0
input_format_type = f"html"
print_format_type = f"md"
user_query = f"""
what is the fat rate for Louisiana state
"""
model_choice = "Copilotteam11"
system_prompt = f"""
You are a helpful assistant. Without any prior context answer the user's qustion based on the context provided only.
"""


# Loading Table from File
file_path = os.path.join(r"..\Data\HTML", file_name)
with open(file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()
all_table_contents = process_html_for_tables(html_content)
table_contents = all_table_contents[table_num]
print(f"Selected Raw {input_format_type} Table")
# print(f"{table_contents}\n")
num_tokens_in_html = num_tokens_from_string(str(table_contents), "cl100k_base")
print(F"Selected Raw {input_format_type} table contains: {num_tokens_in_html} tokens\n")


# Format Table
formmated_table_contents = table_to_format(table_contents, input_format_type, print_format_type)
print(f"Selected Raw {input_format_type} Table in {print_format_type}")
# print(f"{formmated_table_contents}\n")
num_tokens_in_formatted = num_tokens_from_string(str(formmated_table_contents), "cl100k_base")
print(F"Selected Formatted {print_format_type} table contains: {num_tokens_in_formatted} tokens\n")


# Result
payload = {
    'messages': [
        {
            'role': 'system', 
            'content': system_prompt.strip()
        },
        {
            'role': 'user',
            'content': f"""
                {user_query}
                Context
                {table_contents}
            """.strip()
        }
    ],
    'model': model_choice,
    'max_tokens': 256,
    "temperature": 0.0,
    'seed': 48
}
input_tokens, output_tokens, response = make_azure_call(payload)
print(f"input tokens: {input_tokens}\noutput tokens: {output_tokens}\n")
print(f"response: {response}")

Num of Table Elements: 1
Num of Parent Table Elements: 1
Num of Filtered Parent Table Elements: 1

Selected Raw html Table
Selected Raw html table contains: 388 tokens

Selected Raw html Table in md
Selected Formatted md table contains: 155 tokens

input tokens: 439
output tokens: 12

response: The obesity rate for Louisiana state is 35.9%.


Single Table - NonRAG - Summarize HTML

In [42]:
# Vars
file_name = "page_19.html"
table_num = 0
input_format_type = f"html"
user_query = f"""
what is the fat rate for Louisiana state
"""
model_choice = "Copilotteam11"
system_prompt = f"""
You are a helpful assistant. Without any prior context answer the user's qustion based on the context provided only.
"""


# Loading Table from File
file_path = os.path.join(r"..\Data\HTML", file_name)
with open(file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()
all_table_contents = process_html_for_tables(html_content)
table_contents = all_table_contents[table_num]
print(f"Selected Raw Table")
# print(f"{table_contents}\n")
num_tokens_in_html = num_tokens_from_string(str(table_contents), "cl100k_base")
print(F"Selected Raw HTML table contains: {num_tokens_in_html} tokens\n")


# Format Table to Summary
payload = {
    'messages': [
        {
            'role': 'system',
            'content': f"You are a helpful assistant that takes as input raw dump of a table and summarizes it. Create a title for the table and explain each row in a line of text."
        },
        {
            'role': 'user', 
            'content': f"{table_contents}"
        }
    ],
    'model': model_choice,
    'max_tokens': 2048,
    "temperature": 0.0
}
soup_table_summary = make_azure_call(payload)
print(f"Selected Raw HTML Table to Summary")
# print(f"{soup_table_summary}\n")
num_tokens_in_summary = num_tokens_from_string(str(soup_table_summary), "cl100k_base")
print(F"Selected Summary Table contains: {num_tokens_in_summary} tokens\n")


# Result
payload = {
    'messages': [
        {
            'role': 'system', 
            'content': system_prompt.strip()
        },
        {
            'role': 'user',
            'content': f"""
                {user_query}
                Context
                {formmated_table_contents}
            """.strip()
        }
    ],
    'model': model_choice,
    'max_tokens': 256,
    "temperature": 0.0,
    'seed': 48
}
input_tokens, output_tokens, response = make_azure_call(payload)
print(f"input tokens: {input_tokens}\noutput tokens: {output_tokens}\n")
print(f"response: {response}")

Num of Table Elements: 1
Num of Parent Table Elements: 1
Num of Filtered Parent Table Elements: 1

Selected Raw Table
Selected Raw HTML table contains: 388 tokens

Selected Raw HTML Table to Summary
Selected Summary Table contains: 91 tokens

input tokens: 205
output tokens: 12

response: The obesity rate for Louisiana state is 35.9%.


Single Table - RAG - Raw HTML

In [24]:
# Vars
file_name = "page_19.html"
table_num = 0
user_query = f"""
what is the fat rate for Louisiana state
"""
model_choice = "Copilotteam11"
system_prompt = f"""
You are a helpful assistant. Without any prior context answer the user's qustion based on the context provided only.
"""
n_results = 3
chunk_size = 512


# Loading Table from File
file_path = os.path.join(r"..\Data\HTML", file_name)
with open(file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()
all_table_contents = process_html_for_tables(html_content)
table_contents = all_table_contents[table_num]
print(f"Selected Raw Table")
# print(f"{table_contents}\n")
num_tokens_in_html = num_tokens_from_string(str(table_contents), "cl100k_base")
print(F"Selected table contains: {num_tokens_in_html} tokens\n")


# ChromaDB Setup
chroma_client = chromadb.Client(settings=Settings(allow_reset=True))
chroma_client.reset()
collection = chroma_client.create_collection(name="copilot")
# Chunk the single table
all_chunks = chunk_text_default(str(table_contents), chunk_size)
print(f"all_chunks\n{all_chunks}\n")
# Add other data
documents_list = []
metadata_list = []
ids_list = []
for chunk_num, chunk in enumerate(all_chunks):
    documents_list.append(chunk)
    metadata_list.append({
        'file_name': file_name,
        'table_num': table_num,
        'chunk_num': chunk_num,
    })
    ids_list.append(str(uuid.uuid4()))
print(f"documents_list: {documents_list}\nmetadata_list: {metadata_list}\nids_list: {ids_list}\n")
# Insert into collection
collection.add(
    documents=documents_list,
    metadatas=metadata_list,
    ids=ids_list
)


# Retrieval
results = collection.query(
    query_texts=user_query,
    n_results=n_results
)
print(f"results:\n{results}")


# Result
payload = {
    'messages': [
        {
            'role': 'system', 
            'content': system_prompt.strip()
        },
        {
            'role': 'user',
            'content': f"""
                {user_query}
                Context
                {results}
            """.strip()
        }
    ],
    'model': model_choice,
    'max_tokens': 256,
    "temperature": 0.0,
    'seed': 48
}
input_tokens, output_tokens, response = make_azure_call(payload)
print(f"input tokens: {input_tokens}\noutput tokens: {output_tokens}\n")
print(f"response: {response}")

Num of Table Elements: 1
Num of Parent Table Elements: 1
Num of Filtered Parent Table Elements: 1

Selected Raw Table
Selected table contains: 388 tokens

all_chunks
['<table class="table"> <thead> <tr> <th scope="col">Rank</th> <th scope="col">State</th> <th scope="col">Obesity Rate</th> </tr> </thead> <tbody> <tr> <th scope="row">1</th> <td>Mississippi</td> <td>40.8%</td> </tr> <tr> <th scope="row">2</th> <td>West Virginia</td> <td>39.7%</td> </tr> <tr> <th scope="row">3</th> <td>Arkansas</td> <td>37.4%</td> </tr> <tr> <th scope="row">4</th> <td>Oklahoma</td> <td>36.8%</td> </tr> <tr> <th scope="row">5</th> <td>Kentucky</td> <td>36.5%</td> </tr> <tr> <th scope="row">6</', 'th> <td>Tennessee</td> <td>36.5%</td> </tr> <tr> <th scope="row">7</th> <td>Alabama</td> <td>36.1%</td> </tr> <tr> <th scope="row">8</th> <td>Michigan</td> <td>36%</td> </tr> <tr> <th scope="row">9</th> <td>Louisiana</td> <td>35.9%</td> </tr> <tr> <th scope="row">10</th> <td>South Carolina</td> <td>35.4%</td> </tr>

Number of requested results 3 is greater than number of elements in index 2, updating n_results = 2


results:
{'ids': [['95d85399-5808-48b7-9980-6576c665b44e', '79b598c5-b307-4931-abfa-f03f66097c3e']], 'distances': [[1.4169235229492188, 1.5717264413833618]], 'metadatas': [[{'chunk_num': 0, 'file_name': 'page_19.html', 'table_num': 0}, {'chunk_num': 1, 'file_name': 'page_19.html', 'table_num': 0}]], 'embeddings': None, 'documents': [['<table class="table"> <thead> <tr> <th scope="col">Rank</th> <th scope="col">State</th> <th scope="col">Obesity Rate</th> </tr> </thead> <tbody> <tr> <th scope="row">1</th> <td>Mississippi</td> <td>40.8%</td> </tr> <tr> <th scope="row">2</th> <td>West Virginia</td> <td>39.7%</td> </tr> <tr> <th scope="row">3</th> <td>Arkansas</td> <td>37.4%</td> </tr> <tr> <th scope="row">4</th> <td>Oklahoma</td> <td>36.8%</td> </tr> <tr> <th scope="row">5</th> <td>Kentucky</td> <td>36.5%</td> </tr> <tr> <th scope="row">6</', 'th> <td>Tennessee</td> <td>36.5%</td> </tr> <tr> <th scope="row">7</th> <td>Alabama</td> <td>36.1%</td> </tr> <tr> <th scope="row">8</th> <td>Michi

Single Table - RAG - Formatted HTML to MD/JSON/YAML

In [25]:
# Vars
file_name = "page_19.html"
table_num = 0
input_format_type = f"html"
print_format_type = f"md"
user_query = f"""
what is the fat rate for Louisiana state
"""
model_choice = "Copilotteam11"
system_prompt = f"""
You are a helpful assistant. Without any prior context answer the user's qustion based on the context provided only.
"""
n_results = 3
chunk_size = 512


# Loading Table from File
file_path = os.path.join(r"..\Data\HTML", file_name)
with open(file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()
all_table_contents = process_html_for_tables(html_content)
table_contents = all_table_contents[table_num]
print(f"Selected Raw Table")
# print(f"{table_contents}\n")
num_tokens_in_html = num_tokens_from_string(str(table_contents), "cl100k_base")
print(F"Selected table contains: {num_tokens_in_html} tokens\n")


# Format Table
formmated_table_contents = table_to_format(table_contents, input_format_type, print_format_type)
print(f"Selected Raw {input_format_type} Table in {print_format_type}")
# print(f"{formmated_table_contents}\n")
num_tokens_in_formatted = num_tokens_from_string(str(formmated_table_contents), "cl100k_base")
print(F"Selected Formatted {print_format_type} table contains: {num_tokens_in_formatted} tokens\n")


# ChromaDB Setup
chroma_client = chromadb.Client(settings=Settings(allow_reset=True))
chroma_client.reset()
collection = chroma_client.create_collection(name="copilot")
# Chunk the single table
all_chunks = chunk_text_default(str(formmated_table_contents), chunk_size)
print(f"all_chunks\n{all_chunks}\n")
# Add other data
documents_list = []
metadata_list = []
ids_list = []
for chunk_num, chunk in enumerate(all_chunks):
    documents_list.append(chunk)
    metadata_list.append({
        'file_name': file_name,
        'table_num': table_num,
        'chunk_num': chunk_num,
    })
    ids_list.append(str(uuid.uuid4()))
print(f"documents_list: {documents_list}\nmetadata_list: {metadata_list}\nids_list: {ids_list}\n")
# Insert into collection
collection.add(
    documents=documents_list,
    metadatas=metadata_list,
    ids=ids_list
)


# Retrieval
results = collection.query(
    query_texts=user_query,
    n_results=n_results
)
print(f"results:\n{results}")


# Result
payload = {
    'messages': [
        {
            'role': 'system', 
            'content': system_prompt.strip()
        },
        {
            'role': 'user',
            'content': f"""
                {user_query}
                Context
                {results}
            """.strip()
        }
    ],
    'model': model_choice,
    'max_tokens': 256,
    "temperature": 0.0,
    'seed': 48
}
input_tokens, output_tokens, response = make_azure_call(payload)
print(f"input tokens: {input_tokens}\noutput tokens: {output_tokens}\n")
print(f"response: {response}")

Num of Table Elements: 1
Num of Parent Table Elements: 1
Num of Filtered Parent Table Elements: 1

Selected Raw Table
Selected table contains: 388 tokens

Selected Raw html Table in md
Selected Formatted md table contains: 155 tokens

all_chunks
['| Rank | State | Obesity Rate | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 |\n| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |\n| Mississippi | 40.8% |\n| West Virginia | 39.7% |\n| Arkansas | 37.4% |\n| Oklahoma | 36.8% |\n| Kentucky | 36.5% |\n| Tennessee | 36.5% |\n| Alabama | 36.1% |\n| Michigan | 36% |\n| Louisiana | 35.9% |\n| South Carolina | 35.4% |\n']

documents_list: ['| Rank | State | Obesity Rate | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 |\n| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |\n| Mississippi | 40.8% |\n| West Virginia | 39.7% |\n| Arkansas | 37.4% |\n| Oklahoma | 36.8% |\n| Kentucky | 36.5% |\n| Tennessee | 36.5% |\n| Alabama | 36.1% |\n| Michigan | 36% |\n| Lou

Number of requested results 3 is greater than number of elements in index 1, updating n_results = 1


results:
{'ids': [['8274548d-a85c-4801-b1a7-bf5904494bf0']], 'distances': [[1.1144390106201172]], 'metadatas': [[{'chunk_num': 0, 'file_name': 'page_19.html', 'table_num': 0}]], 'embeddings': None, 'documents': [['| Rank | State | Obesity Rate | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 |\n| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |\n| Mississippi | 40.8% |\n| West Virginia | 39.7% |\n| Arkansas | 37.4% |\n| Oklahoma | 36.8% |\n| Kentucky | 36.5% |\n| Tennessee | 36.5% |\n| Alabama | 36.1% |\n| Michigan | 36% |\n| Louisiana | 35.9% |\n| South Carolina | 35.4% |\n']], 'uris': None, 'data': None}
input tokens: 308
output tokens: 12

response: The obesity rate for Louisiana state is 35.9%.


Single Table - RAG - Summarize HTML

In [26]:
# Vars
file_name = "page_19.html"
table_num = 0
input_format_type = f"html"
print_format_type = f"md"
user_query = f"""
what is the fat rate for Louisiana state
"""
model_choice = "Copilotteam11"
system_prompt = f"""
You are a helpful assistant. Without any prior context answer the user's qustion based on the context provided only.
"""
n_results = 3
chunk_size = 512


# Loading Table from File
file_path = os.path.join(r"..\Data\HTML", file_name)
with open(file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()
all_table_contents = process_html_for_tables(html_content)
table_contents = all_table_contents[table_num]
print(f"Selected Raw Table")
# print(f"{table_contents}\n")
num_tokens_in_html = num_tokens_from_string(str(table_contents), "cl100k_base")
print(F"Selected table contains: {num_tokens_in_html} tokens\n")


# Format Table
payload = {
    'messages': [
        {
            'role': 'system',
            'content': f"You are a helpful assistant that takes as input raw dump of a table and summarizes it. Create a title for the table and explain each row in a line of text."
        },
        {
            'role': 'user', 
            'content': f"{table_contents}"
        }
    ],
    'model': model_choice,
    'max_tokens': 2048,
    "temperature": 0.0
}
soup_table_summary = make_azure_call(payload)
print(f"Selected Raw HTML Table to Summary")
# print(f"{soup_table_summary}\n")
num_tokens_in_summary = num_tokens_from_string(str(soup_table_summary), "cl100k_base")
print(F"Selected Summary Table contains: {num_tokens_in_summary} tokens\n")


# ChromaDB Setup
chroma_client = chromadb.Client(settings=Settings(allow_reset=True))
chroma_client.reset()
collection = chroma_client.create_collection(name="copilot")
# Chunk the single table
all_chunks = chunk_text_default(str(soup_table_summary), chunk_size)
print(f"all_chunks\n{all_chunks}\n")
# Add other data
documents_list = []
metadata_list = []
ids_list = []
for chunk_num, chunk in enumerate(all_chunks):
    documents_list.append(chunk)
    metadata_list.append({
        'file_name': file_name,
        'table_num': table_num,
        'chunk_num': chunk_num,
    })
    ids_list.append(str(uuid.uuid4()))
print(f"documents_list: {documents_list}\nmetadata_list: {metadata_list}\nids_list: {ids_list}\n")
# Insert into collection
collection.add(
    documents=documents_list,
    metadatas=metadata_list,
    ids=ids_list
)


# Retrieval
results = collection.query(
    query_texts=user_query,
    n_results=n_results
)
print(f"results:\n{results}")


# Result
payload = {
    'messages': [
        {
            'role': 'system', 
            'content': system_prompt.strip()
        },
        {
            'role': 'user',
            'content': f"""
                {user_query}
                Context
                {results}
            """.strip()
        }
    ],
    'model': model_choice,
    'max_tokens': 256,
    "temperature": 0.0,
    'seed': 48
}
input_tokens, output_tokens, response = make_azure_call(payload)
print(f"input tokens: {input_tokens}\noutput tokens: {output_tokens}\n")
print(f"response: {response}")

Num of Table Elements: 1
Num of Parent Table Elements: 1
Num of Filtered Parent Table Elements: 1

Selected Raw Table
Selected table contains: 388 tokens

Selected Raw HTML Table to Summary
Selected Summary Table contains: 91 tokens

all_chunks
["(435, 80, 'Title: Obesity Rates by State in the United States\\n\\nRow 1: Rank - The ranking of each state based on their obesity rate, with 1 being the highest.\\nRow 2: State - The name of each state in the United States.\\nRow 3: Obesity Rate - The percentage of the population in each state that is considered obese, based on their body mass index (BMI).')"]

documents_list: ["(435, 80, 'Title: Obesity Rates by State in the United States\\n\\nRow 1: Rank - The ranking of each state based on their obesity rate, with 1 being the highest.\\nRow 2: State - The name of each state in the United States.\\nRow 3: Obesity Rate - The percentage of the population in each state that is considered obese, based on their body mass index (BMI).')"]
metadata

Number of requested results 3 is greater than number of elements in index 1, updating n_results = 1


results:
{'ids': [['da161e3a-b789-417c-94ee-4112451f28b0']], 'distances': [[1.1910569667816162]], 'metadatas': [[{'chunk_num': 0, 'file_name': 'page_19.html', 'table_num': 0}]], 'embeddings': None, 'documents': [["(435, 80, 'Title: Obesity Rates by State in the United States\\n\\nRow 1: Rank - The ranking of each state based on their obesity rate, with 1 being the highest.\\nRow 2: State - The name of each state in the United States.\\nRow 3: Obesity Rate - The percentage of the population in each state that is considered obese, based on their body mass index (BMI).')"]], 'uris': None, 'data': None}
input tokens: 237
output tokens: 72

response: According to the provided context, the table shows the obesity rates by state in the United States, including the ranking of each state based on their obesity rate, the name of each state, and the percentage of the population in each state that is considered obese based on their body mass index (BMI). There is no information about the fat rate 

Multiple Tables - NonRAG - Raw HTML

In [43]:
# Vars
file_name = "page_19.html"
user_query = f"""
what is the fat rate for Louisiana state
"""
model_choice = "Copilotteam11"
system_prompt = f"""
You are a helpful assistant. Without any prior context answer the user's qustion based on the context provided only.
"""


# Loading Tables from File
file_path = os.path.join(r"..\Data\HTML", file_name)
with open(file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()
all_table_contents = process_html_for_tables(html_content)
print(f"Selected ALL Raw Tables")
# print(f"{all_table_contents}\n")
num_tokens_in_html = num_tokens_from_string(str(all_table_contents), "cl100k_base")
print(F"ALL tables contain: {num_tokens_in_html} tokens\n")


# Result
payload = {
    'messages': [
        {
            'role': 'system', 
            'content': system_prompt.strip()
        },
        {
            'role': 'user',
            'content': f"""
                {user_query}
                Context
                {all_table_contents}
            """.strip()
        }
    ],
    'model': model_choice,
    'max_tokens': 256,
    "temperature": 0.0,
    'seed': 48
}
input_tokens, output_tokens, response = make_azure_call(payload)
print(f"input tokens: {input_tokens}\noutput tokens: {output_tokens}\n")
print(f"response: {response}")

Num of Table Elements: 1
Num of Parent Table Elements: 1
Num of Filtered Parent Table Elements: 1

Selected ALL Raw Tables
ALL tables contain: 390 tokens

input tokens: 439
output tokens: 12

response: The obesity rate for Louisiana state is 35.9%.


Multiple Table - NonRAG - Formatted HTML to MD/JSON/YAML

In [28]:
# Vars
file_name = "page_19.html"
input_format_type = f"html"
print_format_type = f"md"
user_query = f"""
what is the fat rate for Louisiana state
"""
model_choice = "Copilotteam11"
system_prompt = f"""
You are a helpful assistant. Without any prior context answer the user's qustion based on the context provided only.
"""


# Loading Tables from File
file_path = os.path.join(r"..\Data\HTML", file_name)
with open(file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()
all_table_contents = process_html_for_tables(html_content)
print(f"Selected ALL Raw {input_format_type.upper()} Table")
# print(f"{all_table_contents}\n")
num_tokens_in_html = num_tokens_from_string(str(all_table_contents), "cl100k_base")
print(F"Selected Raw {input_format_type} table contains: {num_tokens_in_html} tokens\n")


# Format Table
formmated_tables_contents = []
for html_table in all_table_contents:
    formmated_table_content = table_to_format(table_contents, input_format_type, print_format_type)
    formmated_tables_contents.append(formmated_table_content)
print(f"Selected Raw {input_format_type} Table in {print_format_type}")
# print(f"{formmated_tables_contents}\n")
num_tokens_in_formatted = num_tokens_from_string(str(formmated_tables_contents), "cl100k_base")
print(F"Selected Formatted {print_format_type} table contains: {num_tokens_in_formatted} tokens\n")


# rRsult
payload = {
    'messages': [
        {
            'role': 'system', 
            'content': system_prompt.strip()
        },
        {
            'role': 'user',
            'content': f"""
                {user_query}
                Context
                {formmated_tables_contents}
            """.strip()
        }
    ],
    'model': model_choice,
    'max_tokens': 256,
    "temperature": 0.0,
    'seed': 48
}
input_tokens, output_tokens, response = make_azure_call(payload)
print(f"input tokens: {input_tokens}\noutput tokens: {output_tokens}\n")
print(f"response: {response}")

Num of Table Elements: 1
Num of Parent Table Elements: 1
Num of Filtered Parent Table Elements: 1

Selected ALL Raw HTML Table
Selected Raw html table contains: 390 tokens

Selected Raw html Table in md
Selected Formatted md table contains: 169 tokens

input tokens: 217
output tokens: 12

response: The obesity rate for Louisiana state is 35.9%.


Multiple Table - NonRAG - Summarize HTML

In [29]:
# Vars
file_name = "page_19.html"
input_format_type = f"html"
user_query = f"""
what is the fat rate for Louisiana state
"""
model_choice = "Copilotteam11"
system_prompt = f"""
You are a helpful assistant. Without any prior context answer the user's qustion based on the context provided only.
"""


# Loading Tables from File
file_path = os.path.join(r"..\Data\HTML", file_name)
with open(file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()
all_table_contents = process_html_for_tables(html_content)
print(f"Selected Raw Table")
# print(f"{all_table_contents}\n")
num_tokens_in_html = num_tokens_from_string(str(all_table_contents), "cl100k_base")
print(F"Selected Raw HTML table contains: {num_tokens_in_html} tokens\n")


# Format Tables
summarized_tables_contents = []
for html_table in all_table_contents:
    payload = {
        'messages': [
            {
                'role': 'system',
                'content': f"You are a helpful assistant that takes as input raw dump of a table and summarizes it. Create a title for the table and explain each row in a line of text."
            },
            {
                'role': 'user', 
                'content': f"{html_table}"
            }
        ],
        'model': model_choice,
        'max_tokens': 2048,
        "temperature": 0.0
    }
    soup_table_summary = make_azure_call(payload)
    summarized_tables_contents.append(soup_table_summary)
print(f"Selected Raw HTML Table to Summary")
# print(f"{summarized_tables_contents}\n")
num_tokens_in_summary = num_tokens_from_string(str(summarized_tables_contents), "cl100k_base")
print(F"Selected Summary Table contains: {num_tokens_in_summary} tokens\n")


# Result
payload = {
    'messages': [
        {
            'role': 'system', 
            'content': system_prompt.strip()
        },
        {
            'role': 'user',
            'content': f"""
                {user_query}
                Context
                {summarized_tables_contents}
            """.strip()
        }
    ],
    'model': model_choice,
    'max_tokens': 256,
    "temperature": 0.0,
    'seed': 48
}
input_tokens, output_tokens, response = make_azure_call(payload)
print(f"input tokens: {input_tokens}\noutput tokens: {output_tokens}\n")
print(f"response: {response}")

Num of Table Elements: 1
Num of Parent Table Elements: 1
Num of Filtered Parent Table Elements: 1

Selected Raw Table
Selected Raw HTML table contains: 390 tokens

Selected Raw HTML Table to Summary
Selected Summary Table contains: 91 tokens

input tokens: 139
output tokens: 57

response: The provided context shows a list of obesity rates by state in the United States, including the ranking of each state based on their obesity rate and the percentage of the population in each state that is considered obese. However, there is no information provided specifically about the fat rate for Louisiana state.


Multiple Table - RAG - Raw HTML

In [30]:
# Vars
file_name = "page_19.html"
user_query = f"""
what is the fat rate for Louisiana state
"""
model_choice = "Copilotteam11"
system_prompt = f"""
You are a helpful assistant. Without any prior context answer the user's qustion based on the context provided only.
"""
n_results = 3
chunk_size = 512


# Loading Tables from File
file_path = os.path.join(r"..\Data\HTML", file_name)
with open(file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()
all_table_contents = process_html_for_tables(html_content)
print(f"Selected ALL Raw Tables")
# print(f"{all_table_contents}\n")
num_tokens_in_html = num_tokens_from_string(str(all_table_contents), "cl100k_base")
print(F"ALL tables contain: {num_tokens_in_html} tokens\n")


# ChromaDB Setup
chroma_client = chromadb.Client(settings=Settings(allow_reset=True))
chroma_client.reset()
collection = chroma_client.create_collection(name="copilot")
# Chunk the multiple tables
all_chunks = chunk_text_default(str(all_table_contents), chunk_size)
print(f"all_chunks\n{all_chunks}\n")
# Add other data
documents_list = []
metadata_list = []
ids_list = []
for chunk_num, chunk in enumerate(all_chunks):
    documents_list.append(chunk)
    metadata_list.append({
        'file_name': file_name,
        # 'table_num': table_num,
        'chunk_num': chunk_num,
    })
    ids_list.append(str(uuid.uuid4()))
print(f"documents_list: {documents_list}\nmetadata_list: {metadata_list}\nids_list: {ids_list}\n")
# Insert into collection
collection.add(
    documents=documents_list,
    metadatas=metadata_list,
    ids=ids_list
)


# Retrieval
results = collection.query(
    query_texts=user_query,
    n_results=n_results
)
print(f"results:\n{results}")


# Result
payload = {
    'messages': [
        {
            'role': 'system', 
            'content': system_prompt.strip()
        },
        {
            'role': 'user',
            'content': f"""
                {user_query}
                Context
                {results}
            """.strip()
        }
    ],
    'model': model_choice,
    'max_tokens': 256,
    "temperature": 0.0,
    'seed': 48
}
input_tokens, output_tokens, response = make_azure_call(payload)
print(f"input tokens: {input_tokens}\noutput tokens: {output_tokens}\n")
print(f"response: {response}")

Num of Table Elements: 1
Num of Parent Table Elements: 1
Num of Filtered Parent Table Elements: 1

Selected ALL Raw Tables
ALL tables contain: 390 tokens

all_chunks
['[<table class="table"> <thead> <tr> <th scope="col">Rank</th> <th scope="col">State</th> <th scope="col">Obesity Rate</th> </tr> </thead> <tbody> <tr> <th scope="row">1</th> <td>Mississippi</td> <td>40.8%</td> </tr> <tr> <th scope="row">2</th> <td>West Virginia</td> <td>39.7%</td> </tr> <tr> <th scope="row">3</th> <td>Arkansas</td> <td>37.4%</td> </tr> <tr> <th scope="row">4</th> <td>Oklahoma</td> <td>36.8%</td> </tr> <tr> <th scope="row">5</th> <td>Kentucky</td> <td>36.5%</td> </tr> <tr> <th scope="row">6<', '/th> <td>Tennessee</td> <td>36.5%</td> </tr> <tr> <th scope="row">7</th> <td>Alabama</td> <td>36.1%</td> </tr> <tr> <th scope="row">8</th> <td>Michigan</td> <td>36%</td> </tr> <tr> <th scope="row">9</th> <td>Louisiana</td> <td>35.9%</td> </tr> <tr> <th scope="row">10</th> <td>South Carolina</td> <td>35.4%</td> </tr

Number of requested results 3 is greater than number of elements in index 2, updating n_results = 2


results:
{'ids': [['c4508d2f-ceab-410c-abb7-31c3f74a5a52', '031d7009-990f-4e4d-b9ac-9d34f86f6f98']], 'distances': [[1.4295642375946045, 1.5662596225738525]], 'metadatas': [[{'chunk_num': 0, 'file_name': 'page_19.html'}, {'chunk_num': 1, 'file_name': 'page_19.html'}]], 'embeddings': None, 'documents': [['[<table class="table"> <thead> <tr> <th scope="col">Rank</th> <th scope="col">State</th> <th scope="col">Obesity Rate</th> </tr> </thead> <tbody> <tr> <th scope="row">1</th> <td>Mississippi</td> <td>40.8%</td> </tr> <tr> <th scope="row">2</th> <td>West Virginia</td> <td>39.7%</td> </tr> <tr> <th scope="row">3</th> <td>Arkansas</td> <td>37.4%</td> </tr> <tr> <th scope="row">4</th> <td>Oklahoma</td> <td>36.8%</td> </tr> <tr> <th scope="row">5</th> <td>Kentucky</td> <td>36.5%</td> </tr> <tr> <th scope="row">6<', '/th> <td>Tennessee</td> <td>36.5%</td> </tr> <tr> <th scope="row">7</th> <td>Alabama</td> <td>36.1%</td> </tr> <tr> <th scope="row">8</th> <td>Michigan</td> <td>36%</td> </tr> <tr

Multiple Table - RAG - Formatted HTML to MD/JSON/YAML

In [31]:
# Vars
file_name = "page_19.html"
input_format_type = f"html"
print_format_type = f"md"
user_query = f"""
what is the fat rate for Louisiana state
"""
model_choice = "Copilotteam11"
system_prompt = f"""
You are a helpful assistant. Without any prior context answer the user's qustion based on the context provided only.
"""


# Loading Tables from File
file_path = os.path.join(r"..\Data\HTML", file_name)
with open(file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()
all_table_contents = process_html_for_tables(html_content)
print(f"Selected ALL Raw {input_format_type.upper()} Table")
# print(f"{all_table_contents}\n")
num_tokens_in_html = num_tokens_from_string(str(all_table_contents), "cl100k_base")
print(F"Selected Raw {input_format_type} table contains: {num_tokens_in_html} tokens\n")


# Format Tables
formmated_tables_contents = []
for html_table in all_table_contents:
    formmated_table_content = table_to_format(table_contents, input_format_type, print_format_type)
    formmated_tables_contents.append(formmated_table_content)
print(f"Selected Raw {input_format_type} Table in {print_format_type}")
# print(f"{formmated_tables_contents}\n")
num_tokens_in_formatted = num_tokens_from_string(str(formmated_tables_contents), "cl100k_base")
print(F"Selected Formatted {print_format_type} table contains: {num_tokens_in_formatted} tokens\n")


# ChromaDB Setup
chroma_client = chromadb.Client(settings=Settings(allow_reset=True))
chroma_client.reset()
collection = chroma_client.create_collection(name="copilot")
# Chunk the single table
all_chunks = chunk_text_default(str(formmated_tables_contents), 512)
print(f"all_chunks\n{all_chunks}\n")
# Add other data
documents_list = []
metadata_list = []
ids_list = []
for chunk_num, chunk in enumerate(all_chunks):
    documents_list.append(chunk)
    metadata_list.append({
        'file_name': file_name,
        'chunk_num': chunk_num,
    })
    ids_list.append(str(uuid.uuid4()))
print(f"documents_list: {documents_list}\nmetadata_list: {metadata_list}\nids_list: {ids_list}\n")
# Insert into collection
collection.add(
    documents=documents_list,
    metadatas=metadata_list,
    ids=ids_list
)


# Retrieval
results = collection.query(
    query_texts=user_query,
    n_results=3
)
print(f"results:\n{results}")


# Result
payload = {
    'messages': [
        {
            'role': 'system', 
            'content': system_prompt.strip()
        },
        {
            'role': 'user',
            'content': f"""
                {user_query}
                Context
                {results}
            """.strip()
        }
    ],
    'model': model_choice,
    'max_tokens': 256,
    "temperature": 0.0,
    'seed': 48
}
input_tokens, output_tokens, response = make_azure_call(payload)
print(f"input tokens: {input_tokens}\noutput tokens: {output_tokens}\n")
print(f"response: {response}")

Number of requested results 3 is greater than number of elements in index 1, updating n_results = 1


Num of Table Elements: 1
Num of Parent Table Elements: 1
Num of Filtered Parent Table Elements: 1

Selected ALL Raw HTML Table
Selected Raw html table contains: 390 tokens

Selected Raw html Table in md
Selected Formatted md table contains: 169 tokens

all_chunks
["['| Rank | State | Obesity Rate | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 |\\n| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |\\n| Mississippi | 40.8% |\\n| West Virginia | 39.7% |\\n| Arkansas | 37.4% |\\n| Oklahoma | 36.8% |\\n| Kentucky | 36.5% |\\n| Tennessee | 36.5% |\\n| Alabama | 36.1% |\\n| Michigan | 36% |\\n| Louisiana | 35.9% |\\n| South Carolina | 35.4% |\\n']"]

documents_list: ["['| Rank | State | Obesity Rate | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 |\\n| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |\\n| Mississippi | 40.8% |\\n| West Virginia | 39.7% |\\n| Arkansas | 37.4% |\\n| Oklahoma | 36.8% |\\n| Kentucky | 36.5% |\\n| Tennessee | 36.5% |\\n| 

Multiple Table - RAG - Summarize HTML

In [32]:
# Vars
file_name = "page_19.html"
input_format_type = f"html"
user_query = f"""
what is the fat rate for Louisiana state
"""
model_choice = "Copilotteam11"
system_prompt = f"""
You are a helpful assistant. Without any prior context answer the user's qustion based on the context provided only.
"""
n_results = 3
chunk_size = 512


# Loading Tables from File
file_path = os.path.join(r"..\Data\HTML", file_name)
with open(file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()
all_table_contents = process_html_for_tables(html_content)
print(f"Selected Raw Table")
# print(f"{all_table_contents}\n")
num_tokens_in_html = num_tokens_from_string(str(all_table_contents), "cl100k_base")
print(F"Selected Raw HTML table contains: {num_tokens_in_html} tokens\n")


# Format Tables
summarized_tables_contents = []
for html_table in all_table_contents:
    payload = {
        'messages': [
            {
                'role': 'system',
                'content': f"You are a helpful assistant that takes as input raw dump of a table and summarizes it. Create a title for the table and explain each row in a line of text."
            },
            {
                'role': 'user', 
                'content': f"{html_table}"
            }
        ],
        'model': "Copilotteam11",
        'max_tokens': 2048,
        "temperature": 0.0
    }
    soup_table_summary = make_azure_call(payload)
    summarized_tables_contents.append(soup_table_summary)
print(f"Selected Raw HTML Table to Summary")
# print(f"{summarized_tables_contents}\n")
num_tokens_in_summary = num_tokens_from_string(str(summarized_tables_contents), "cl100k_base")
print(F"Selected Summary Table contains: {num_tokens_in_summary} tokens\n")


# ChromaDB Setup
chroma_client = chromadb.Client(settings=Settings(allow_reset=True))
chroma_client.reset()
collection = chroma_client.create_collection(name="copilot")
# Chunk the single table
all_chunks = chunk_text_default(str(summarized_tables_contents), chunk_size)
print(f"all_chunks\n{all_chunks}\n")
# Add other data
documents_list = []
metadata_list = []
ids_list = []
for chunk_num, chunk in enumerate(all_chunks):
    documents_list.append(chunk)
    metadata_list.append({
        'file_name': file_name,
        'table_num': table_num,
        'chunk_num': chunk_num,
    })
    ids_list.append(str(uuid.uuid4()))
print(f"documents_list: {documents_list}\nmetadata_list: {metadata_list}\nids_list: {ids_list}\n")
# Insert into collection
collection.add(
    documents=documents_list,
    metadatas=metadata_list,
    ids=ids_list
)


# Retrieval
results = collection.query(
    query_texts=user_query,
    n_results=n_results
)
print(f"results:\n{results}")


# Result
payload = {
    'messages': [
        {
            'role': 'system', 
            'content': system_prompt.strip()
        },
        {
            'role': 'user',
            'content': f"""
                {user_query}
                Context
                {results}
            """.strip()
        }
    ],
    'model': model_choice,
    'max_tokens': 256,
    "temperature": 0.0,
    'seed': 48
}
input_tokens, output_tokens, response = make_azure_call(payload)
print(f"input tokens: {input_tokens}\noutput tokens: {output_tokens}\n")
print(f"response: {response}")

Num of Table Elements: 1
Num of Parent Table Elements: 1
Num of Filtered Parent Table Elements: 1

Selected Raw Table
Selected Raw HTML table contains: 390 tokens

Selected Raw HTML Table to Summary
Selected Summary Table contains: 89 tokens

all_chunks
['[(435, 78, "Title: Obesity Rates by State in the United States\\n\\nRow 1: Rank - The ranking of each state based on their obesity rate, with 1 being the highest.\\nRow 2: State - The name of the state being ranked.\\nRow 3: Obesity Rate - The percentage of the state\'s population that is considered obese, based on BMI (Body Mass Index) calculations.")]']

documents_list: ['[(435, 78, "Title: Obesity Rates by State in the United States\\n\\nRow 1: Rank - The ranking of each state based on their obesity rate, with 1 being the highest.\\nRow 2: State - The name of the state being ranked.\\nRow 3: Obesity Rate - The percentage of the state\'s population that is considered obese, based on BMI (Body Mass Index) calculations.")]']
metadata_

Number of requested results 3 is greater than number of elements in index 1, updating n_results = 1


results:
{'ids': [['7110a699-9871-464f-82f0-8ca4066014c0']], 'distances': [[1.1959799528121948]], 'metadatas': [[{'chunk_num': 0, 'file_name': 'page_19.html', 'table_num': 0}]], 'embeddings': None, 'documents': [['[(435, 78, "Title: Obesity Rates by State in the United States\\n\\nRow 1: Rank - The ranking of each state based on their obesity rate, with 1 being the highest.\\nRow 2: State - The name of the state being ranked.\\nRow 3: Obesity Rate - The percentage of the state\'s population that is considered obese, based on BMI (Body Mass Index) calculations.")]']], 'uris': None, 'data': None}
input tokens: 236
output tokens: 33

response: According to the provided context, there is information about obesity rates by state in the United States, but there is no specific information about the fat rate for Louisiana state.


Single Article - NonRAG - Raw HTML

In [52]:
# Vars
file_name = "page_19.html"
system_prompt = f"""
You are a helpful assistant. Without any prior context answer the user's qustion based on the context provided only.
"""
user_query = f"""
what is the fat rate for Louisiana state
"""
model_choice = "Copilotteam11"


# Loading Tables from File
file_path = os.path.join(r"..\Data\HTML", file_name)
with open(file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()
print(f"Selected Single Raw HTML Article")
# print(f"{html_content}\n")
num_tokens_in_html = num_tokens_from_string(str(html_content), "cl100k_base")
print(F"Selected Single Raw HTML Article contains: {num_tokens_in_html} tokens\n")


# Result
payload = {
    'messages': [
        {
            'role': 'system', 
            'content': system_prompt.strip()
        },
        {
            'role': 'user',
            'content': f"""
                {user_query}
                Context
                {html_content}
            """.strip()
        }
    ],
    'model': model_choice,
    'max_tokens': 256,
    "temperature": 0.0,
    'seed': 48
}
input_tokens, output_tokens, response = make_azure_call(payload)
print(f"input tokens: {input_tokens}\noutput tokens: {output_tokens}\n")
print(f"response: {response}")

Selected Single Raw HTML Article
Selected Single Raw HTML Article contains: 376136 tokens



BadRequestError: Error code: 400 - {'error': {'message': "This model's maximum context length is 8192 tokens. However, your messages resulted in 376187 tokens. Please reduce the length of the messages.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}

Single Article - NonRAG - Cleaned HTML

In [53]:
# Vars
file_name = "page_19.html"
system_prompt = f"""
You are a helpful assistant. Without any prior context answer the user's qustion based on the context provided only.
"""
user_query = f"""
what is the fat rate for Louisiana state
"""
model_choice = "Copilotteam11"


# Loading Article from File
file_path = os.path.join(r"..\Data\HTML", file_name)
with open(file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()
print(f"Selected Single Raw HTML Article")
# print(f"{html_content}\n")
num_tokens_in_html = num_tokens_from_string(str(html_content), "cl100k_base")
print(F"Selected Single Raw Article contains: {num_tokens_in_html} tokens\n")


# Clean HTML Article
html_content = clean_html(html_content)
print(f"Cleaned Single Raw HTML Content")
# print(f"{html_content}\n")
num_tokens_in_html = num_tokens_from_string(str(html_content), "cl100k_base")
print(F"Selected Single Cleaned Article contains: {num_tokens_in_html} tokens\n")


# Result
payload = {
    'messages': [
        {
            'role': 'system', 
            'content': system_prompt.strip()
        },
        {
            'role': 'user',
            'content': user_query.strip() + "\nContext\n" + html_content
        }
    ],
    'model': model_choice,
    'max_tokens': 256,
    "temperature": 0.0,
    'seed': 48
}
input_tokens, output_tokens, response = make_azure_call(payload)
print(f"input tokens: {input_tokens}\noutput tokens: {output_tokens}\n")
print(f"response: {response}")

Selected Single Raw HTML Article
Selected Single Raw Article contains: 376136 tokens

Cleaned Single Raw HTML Content
Selected Single Cleaned Article contains: 3060 tokens

input tokens: 3108
output tokens: 29

response: The obesity rate for Louisiana state is 35.9%, making it the 9th state with the highest rates of obesity in the US.


Single Article - NonRAG - Formatted HTML to MD/JSON/YAML

In [64]:
# Vars
file_name = "page_19.html"
print_format_type = f"md"
user_query = f"""
what is the fat rate for Louisiana state
"""
model_choice = "Copilotteam11"
system_prompt = f"""
You are a helpful assistant. Without any prior context answer the user's qustion based on the context provided only.
"""


# Loading Article from File
file_path = os.path.join(r"..\Data\HTML", file_name)
with open(file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()
print(f"Selected Single Raw HTML Article")
# print(f"{html_content}\n")
num_tokens_in_html = num_tokens_from_string(str(html_content), "cl100k_base")
print(F"Selected Single Raw HTML Article contains: {num_tokens_in_html} tokens\n")


# Fromatting Article's Tables
article_text_only, article_tables_formatted = process_html(html_content, print_format_type, False)
print(f"Formatted Single Raw HTML Article to {print_format_type}")
# print(f"{article_text_only}\n{article_tables_formatted}\n")
num_tokens_in_html_formatted = num_tokens_from_string(article_text_only+str(article_tables_formatted), "cl100k_base")
print(F"Formatted Single Raw HTML Article to {print_format_type} contains: {num_tokens_in_html_formatted} tokens\n")


# Result
payload = {
    'messages': [
        {
            'role': 'system', 
            'content': system_prompt.strip()
        },
        {
            'role': 'user',
            'content': user_query.strip() + "\nContext\n" + article_text_only+str(article_tables_formatted)
        }
    ],
    'model': model_choice,
    'max_tokens': 256,
    "temperature": 0.0,
    'seed': 48
}
input_tokens, output_tokens, response = make_azure_call(payload)
print(f"input tokens: {input_tokens}\noutput tokens: {output_tokens}\n")
print(f"response: {response}")

Selected Single Raw HTML Article
Selected Single Raw HTML Article contains: 376136 tokens

Num of Table Elements: 1
Num of Parent Table Elements: 1
Num of Filtered Parent Table Elements: 1
Table 0
Formatted Single Raw HTML Article to md
Formatted Single Raw HTML Article to md contains: 3248 tokens

input tokens: 3296
output tokens: 12

response: The obesity rate for Louisiana state is 35.9%.


Single Article - NonRAG - Summarize HTML

In [66]:
# Vars
file_name = "page_19.html"
user_query = f"""
what is the fat rate for Louisiana state
"""
model_choice = "Copilotteam11"
system_prompt = f"""
You are a helpful assistant. Without any prior context answer the user's qustion based on the context provided only.
"""


# Loading Article from File
file_path = os.path.join(r"..\Data\HTML", file_name)
with open(file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()
print(f"Selected Single Raw HTML Article")
# print(f"{html_content}\n")
num_tokens_in_html = num_tokens_from_string(str(html_content), "cl100k_base")
print(F"Selected Single Raw HTML Article contains: {num_tokens_in_html} tokens\n")


# Formatting Article's Tables
article_text_only, article_tables_formatted = process_html(html_content, print_format_type, True)
print(f"Formatted Single Raw HTML Article to {print_format_type}")
# print(f"{article_text_only}\n")
# print(f"{article_tables_formatted}\n")
num_tokens_in_html_formatted = num_tokens_from_string(article_text_only+str(article_tables_formatted), "cl100k_base")
print(F"Formatted Single Raw HTML Article to {print_format_type} contains: {num_tokens_in_html_formatted} tokens\n")


# Result
payload = {
    'messages': [
        {
            'role': 'system', 
            'content': system_prompt.strip()
        },
        {
            'role': 'user',
            'content': user_query.strip() + "\nContext\n" + article_text_only+str(article_tables_formatted)
        }
    ],
    'model': model_choice,
    'max_tokens': 256,
    "temperature": 0.0,
    'seed': 48
}
input_tokens, output_tokens, response = make_azure_call(payload)
print(f"input tokens: {input_tokens}\noutput tokens: {output_tokens}\n")
print(f"response: {response}")

Selected Single Raw HTML Article
Selected Single Raw HTML Article contains: 376136 tokens

Num of Table Elements: 1
Num of Parent Table Elements: 1
Num of Filtered Parent Table Elements: 1
Table 0
Formatted Single Raw HTML Article to md
[' Title: "Top 10 U.S. States with the Highest Obesity Rates"\n\n1. Mississippi: 40.8% obesity rate\n2. West Virginia: 39.7% obesity rate\n3. Arkansas: 37.4% obesity rate\n4. Oklahoma: 36.8% obesity rate\n5. Kentucky: 36.5% obesity rate\n6. Tennessee: 36.5% obesity rate\n7. Alabama: 36.1% obesity rate\n8. Michigan: 36% obesity rate\n9. Louisiana: 35.9% obesity rate\n10. South Carolina: 35.4% obesity rate']

Formatted Single Raw HTML Article to md contains: 2852 tokens

input tokens: 2900
output tokens: 28

response: The context provided is about weight loss statistics and the weight loss service industry. There is no information provided about the fat rate for Louisiana state.


Multiple Articles - RAG - Raw HTML

In [36]:
# Vars
user_query = f"""
what is the fat rate for Louisiana state
"""
model_choice = "Copilotteam11"
system_prompt = f"""
You are a helpful assistant. Without any prior context answer the user's qustion based on the context provided only.
"""
n_results= 3
chunk_size = 1024


# Loading HTML from files
html_contents = []
directory = r"..\Data\HTML"
html_files = glob.glob(os.path.join(directory, '*.html'))
for file_path in html_files:
    with open(file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()
        html_contents.append(html_content)
print(f"Num of Articles: {len(html_contents)}")
# print(f"{html_contents}\n")
num_tokens_in_html = num_tokens_from_string(str(html_contents), "cl100k_base")
print(F"All Articles contain: {num_tokens_in_html} tokens\n")


# ChromaDB Setup
chroma_client = chromadb.Client(settings=Settings(allow_reset=True))
chroma_client.reset()
collection = chroma_client.create_collection(name="copilot")
# Chunk the single table
all_chunks = []
for article_html_content in html_contents:
    chunk = chunk_text_default(str(article_html_content), chunk_size)
    all_chunks.append(chunk)
all_chunks_flat = [chunk for chunk_list in all_chunks for chunk in chunk_list]
print(f"all_chunks_flat: {len(all_chunks_flat)}\n{all_chunks_flat[:3]}\n")
# Add other data
documents_list = []
metadata_list = []
ids_list = []
for chunk_num, chunk in enumerate(all_chunks_flat):
    documents_list.append(chunk)
    metadata_list.append({
        'chunk_num': chunk_num,
    })
    ids_list.append(str(uuid.uuid4()))
print(f"documents_list: {documents_list[:3]}\nmetadata_list: {metadata_list[:3]}\nids_list: {ids_list[:3]}\n")
# Insert into collection
collection.add(
    documents=documents_list,
    metadatas=metadata_list,
    ids=ids_list
)


# Retrieval
results = collection.query(
    query_texts=user_query,
    n_results=n_results
)
print(f"results:\n{results}\n")


# Result
payload = {
    'messages': [
        {
            'role': 'system', 
            'content': system_prompt.strip()
        },
        {
            'role': 'user',
            'content': f"""
                {user_query}
                Context
                {results}
            """.strip()
        }
    ],
    'model': model_choice,
    'max_tokens': 256,
    "temperature": 0.0,
    'seed': 48
}
input_tokens, output_tokens, response = make_azure_call(payload)
print(f"input tokens: {input_tokens}\noutput tokens: {output_tokens}\n")
print(f"response: {response}")

Selected ALL Articles: 23
ALL tables contain: 1662510 tokens

all_chunks_flat: 4788
['\n\n<!DOCTYPE html>\n\n<html lang="en-US" dir="ltr">\n\n<head>\n\n\t<meta charset="utf-8" />\n\n\t<meta name="viewport" content="width=device-width, initial-scale=1.0" />\n\n\t<title>Introduction to tables - Microsoft Support</title>\n\n\t\n\n\t\n\n\t\t<link rel="canonical" href="https://support.microsoft.com/en-us/office/introduction-to-tables-78ff21ea-2f76-4fb0-8af6-c318d1ee0ea7" />\n\n\t\t\n\n\t\t\t<link rel="alternate" hreflang="ar-SA" href="https://support.microsoft.com/ar-sa/topic/%D9%85%D9%82%D8%AF%D9%85%D8%A9-%D8%AD%D9%88%D9%84-%D8%A7%D9%84%D8%AC%D8%AF%D8%A7%D9%88%D9%84-78ff21ea-2f76-4fb0-8af6-c318d1ee0ea7" />\n\n\t\t\t<link rel="alternate" hreflang="bg-BG" href="https://support.microsoft.com/bg-bg/topic/%D0%B2%D1%8A%D0%B2%D0%B5%D0%B4%D0%B5%D0%BD%D0%B8%D0%B5-%D0%B2-%D1%82%D0%B0%D0%B1%D0%BB%D0%B8%D1%86%D0%B8%D1%82%D0%B5-78ff21ea-2f76-4fb0-8af6-c318d1ee0ea7" />\n\n\t\t\t<link rel="alternate" hre

Multiple Articles - RAG - Cleaned HTML

In [49]:
# Vars
user_query = f"""
what is the fat rate for Louisiana state
"""
model_choice = "Copilotteam11"
system_prompt = f"""
You are a helpful assistant. Without any prior context answer the user's qustion based on the context provided only.
"""
n_results= 3
chunk_size = 1024


# Loading HTML from files
html_contents = []
directory = r"..\Data\HTML"
html_files = glob.glob(os.path.join(directory, '*.html'))
for file_path in html_files:
    with open(file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()
        html_contents.append(html_content)
print(f"Num of Raw HTML Articles: {len(html_contents)}")
# print(f"{html_contents}\n")
num_tokens_in_html = num_tokens_from_string(str(html_contents), "cl100k_base")
print(F"All Raw HTML Articles contain: {num_tokens_in_html} tokens\n")


# Cleaning HTML contents
cleaned_html_contents = []
for single_html_content in html_contents:
    cleaned_html_content = clean_html(single_html_content)
    cleaned_html_contents.append(cleaned_html_content)
print(f"Num of Cleaned HTML Articles: {len(cleaned_html_contents)}")
# print(f"{html_contents}\n")
num_tokens_in_cleaned_html = num_tokens_from_string(str(cleaned_html_contents), "cl100k_base")
print(F"All Cleaned HTML Articles contain: {num_tokens_in_cleaned_html} tokens\n")


# ChromaDB Setup
chroma_client = chromadb.Client(settings=Settings(allow_reset=True))
chroma_client.reset()
collection = chroma_client.create_collection(name="copilot")
# Chunk the single table
all_chunks = []
for article_html_content in cleaned_html_contents:
    chunk = chunk_text_default(str(article_html_content), chunk_size)
    all_chunks.append(chunk)
all_chunks_flat = [chunk for chunk_list in all_chunks for chunk in chunk_list]
print(f"all_chunks_flat: {len(all_chunks_flat)}\n{all_chunks_flat[:3]}\n")
# Add other data
documents_list = []
metadata_list = []
ids_list = []
for chunk_num, chunk in enumerate(all_chunks_flat):
    documents_list.append(chunk)
    metadata_list.append({
        'chunk_num': chunk_num,
    })
    ids_list.append(str(uuid.uuid4()))
print(f"documents_list: {documents_list[:3]}\nmetadata_list: {metadata_list[:3]}\nids_list: {ids_list[:3]}\n")
# Insert into collection
collection.add(
    documents=documents_list,
    metadatas=metadata_list,
    ids=ids_list
)


# Retrieval
results = collection.query(
    query_texts=user_query,
    n_results=n_results
)
print(f"results:\n{results}\n")


# Result
payload = {
    'messages': [
        {
            'role': 'system', 
            'content': system_prompt.strip()
        },
        {
            'role': 'user',
            'content': f"""
                {user_query}
                Context
                {results}
            """.strip()
        }
    ],
    'model': model_choice,
    'max_tokens': 256,
    "temperature": 0.0,
    'seed': 48
}
input_tokens, output_tokens, response = make_azure_call(payload)
print(f"input tokens: {input_tokens}\noutput tokens: {output_tokens}\n")
print(f"response: {response}")

Num of Raw HTML Articles: 23
All Raw HTML Articles contain: 1662510 tokens

Num of Cleaned HTML Articles: 23
All Cleaned HTML Articles contain: 150387 tokens

all_chunks_flat: 472
['\n \n \n \n \n \n Introduction to tables - Microsoft Support \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n Skip to main content \n \n \n \n \n \n \n \n Microsoft \n \n \n \n Support \n \n \n \n \n Support \n \n \n \n \n\n                            Support\n\n                         \n \n \n \n \n  Home  \n \n \n Microsoft 365 \n \n \n Office \n \n \n \n Products \n \n \n Microsoft 365 \n \n \n Outlook \n \n \n Microsoft Teams \n \n \n OneDrive \n \n \n OneNote \n \n \n Windows \n \n \n Microsoft Edge

Multiple Articles - RAG - Formatted HTML to MD/JSON/YAML

In [69]:
# Vars
user_query = f"""
what is the fat rate for Louisiana state
"""
model_choice = "Copilotteam11"
system_prompt = f"""
You are a helpful assistant. Without any prior context answer the user's qustion based on the context provided only.
"""
print_format_type = f"md"
n_results= 3
chunk_size = 1024


# Loading HTML from files
html_contents = []
directory = r"..\Data\HTML"
html_files = glob.glob(os.path.join(directory, '*.html'))
for file_path in html_files:
    with open(file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()
        html_contents.append(html_content)
print(f"Num of Raw HTML Articles: {len(html_contents)}")
# print(f"{html_contents}\n")
num_tokens_in_html = num_tokens_from_string(str(html_contents), "cl100k_base")
print(F"All Raw HTML Articles contain: {num_tokens_in_cleaned_html} tokens\n")


# Formatting Article's Tables
formatted_html_contents = []
for single_html_content in html_contents:
    article_text_only, article_tables_formatted = process_html(single_html_content, print_format_type, False)
    formatted_html_contents.append(article_text_only + str(article_tables_formatted))
print(f"\nNum of Formatted HTML Articles: {len(formatted_html_contents)}")
# print(f"{formatted_html_contents}\n")
num_tokens_in_formatted_html = num_tokens_from_string(str(formatted_html_contents), "cl100k_base")
print(F"All Formatted HTML Articles contain: {num_tokens_in_formatted_html} tokens\n")


# ChromaDB Setup
chroma_client = chromadb.Client(settings=Settings(allow_reset=True))
chroma_client.reset()
collection = chroma_client.create_collection(name="copilot")
# Chunk the single table
all_chunks = []
for article_html_content in formatted_html_contents:
    chunk = chunk_text_default(str(article_html_content), chunk_size)
    all_chunks.append(chunk)
all_chunks_flat = [chunk for chunk_list in all_chunks for chunk in chunk_list]
print(f"all_chunks_flat: {len(all_chunks_flat)}\n{all_chunks_flat[:3]}\n")
# Add other data
documents_list = []
metadata_list = []
ids_list = []
for chunk_num, chunk in enumerate(all_chunks_flat):
    documents_list.append(chunk)
    metadata_list.append({
        'chunk_num': chunk_num,
    })
    ids_list.append(str(uuid.uuid4()))
print(f"documents_list: {documents_list[:3]}\nmetadata_list: {metadata_list[:3]}\nids_list: {ids_list[:3]}\n")
# Insert into collection
collection.add(
    documents=documents_list,
    metadatas=metadata_list,
    ids=ids_list
)


# Retrieval
results = collection.query(
    query_texts=user_query,
    n_results=n_results
)
print(f"results:\n{results}\n")


# Result
payload = {
    'messages': [
        {
            'role': 'system', 
            'content': system_prompt.strip()
        },
        {
            'role': 'user',
            'content': f"""
                {user_query}
                Context
                {results}
            """.strip()
        }
    ],
    'model': model_choice,
    'max_tokens': 256,
    "temperature": 0.0,
    'seed': 48
}
input_tokens, output_tokens, response = make_azure_call(payload)
print(f"input tokens: {input_tokens}\noutput tokens: {output_tokens}\n")
print(f"response: {response}")

Num of Raw HTML Articles: 23
All Raw HTML Articles contain: 150387 tokens

Num of Table Elements: 2
Num of Parent Table Elements: 1
Num of Filtered Parent Table Elements: 1
Table 0
Num of Table Elements: 0
Num of Parent Table Elements: 0
Num of Filtered Parent Table Elements: 0
Num of Table Elements: 3
Num of Parent Table Elements: 3
Num of Filtered Parent Table Elements: 3
Table 0
Table 1
Table 2
Num of Table Elements: 6
Num of Parent Table Elements: 6
Num of Filtered Parent Table Elements: 4
Table 0
Table 1
Table 2
Table 3
Num of Table Elements: 2
Num of Parent Table Elements: 2
Num of Filtered Parent Table Elements: 2
Table 0
Table 1
Num of Table Elements: 2
Num of Parent Table Elements: 2
Num of Filtered Parent Table Elements: 2
Table 0
Table 1
Num of Table Elements: 1
Num of Parent Table Elements: 1
Num of Filtered Parent Table Elements: 1
Table 0
Num of Table Elements: 1
Num of Parent Table Elements: 1
Num of Filtered Parent Table Elements: 1
Table 0
Num of Table Elements: 2
Num 

Multiple Articles - RAG - Summarize HTML

In [70]:
# Vars
user_query = f"""
what is the fat rate for Louisiana state
"""
model_choice = "Copilotteam11"
system_prompt = f"""
You are a helpful assistant. Without any prior context answer the user's qustion based on the context provided only.
"""
n_results= 3
chunk_size = 1024


# Loading HTML from files
html_contents = []
directory = r"..\Data\HTML"
html_files = glob.glob(os.path.join(directory, '*.html'))
for file_path in html_files:
    with open(file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()
        html_contents.append(html_content)
print(f"Num of Raw HTML Articles: {len(html_contents)}")
# print(f"{html_contents}\n")
num_tokens_in_html = num_tokens_from_string(str(html_contents), "cl100k_base")
print(F"All Raw HTML Articles contain: {num_tokens_in_cleaned_html} tokens\n")


# Formatting Article's Tables
formatted_html_contents = []
for single_html_content in html_contents:
    article_text_only, article_tables_formatted = process_html(single_html_content, "html", True)
    formatted_html_contents.append(article_text_only + str(article_tables_formatted))
print(f"\nNum of Formatted HTML Articles: {len(formatted_html_contents)}")
# print(f"{formatted_html_contents}\n")
num_tokens_in_formatted_html = num_tokens_from_string(str(formatted_html_contents), "cl100k_base")
print(F"All Formatted HTML Articles contain: {num_tokens_in_formatted_html} tokens\n")


# ChromaDB Setup
chroma_client = chromadb.Client(settings=Settings(allow_reset=True))
chroma_client.reset()
collection = chroma_client.create_collection(name="copilot")
# Chunk the single table
all_chunks = []
for article_html_content in formatted_html_contents:
    chunk = chunk_text_default(str(article_html_content), chunk_size)
    all_chunks.append(chunk)
all_chunks_flat = [chunk for chunk_list in all_chunks for chunk in chunk_list]
print(f"all_chunks_flat: {len(all_chunks_flat)}\n{all_chunks_flat[:3]}\n")
# Add other data
documents_list = []
metadata_list = []
ids_list = []
for chunk_num, chunk in enumerate(all_chunks_flat):
    documents_list.append(chunk)
    metadata_list.append({
        'chunk_num': chunk_num,
    })
    ids_list.append(str(uuid.uuid4()))
print(f"documents_list: {documents_list[:3]}\nmetadata_list: {metadata_list[:3]}\nids_list: {ids_list[:3]}\n")
# Insert into collection
collection.add(
    documents=documents_list,
    metadatas=metadata_list,
    ids=ids_list
)


# Retrieval
results = collection.query(
    query_texts=user_query,
    n_results=n_results
)
print(f"results:\n{results}\n")


# Result
payload = {
    'messages': [
        {
            'role': 'system', 
            'content': system_prompt.strip()
        },
        {
            'role': 'user',
            'content': f"""
                {user_query}
                Context
                {results}
            """.strip()
        }
    ],
    'model': model_choice,
    'max_tokens': 256,
    "temperature": 0.0,
    'seed': 48
}
input_tokens, output_tokens, response = make_azure_call(payload)
print(f"input tokens: {input_tokens}\noutput tokens: {output_tokens}\n")
print(f"response: {response}")

Num of Raw HTML Articles: 23
All Raw HTML Articles contain: 150387 tokens

Num of Table Elements: 2
Num of Parent Table Elements: 1
Num of Filtered Parent Table Elements: 1
Table 0
Num of Table Elements: 0
Num of Parent Table Elements: 0
Num of Filtered Parent Table Elements: 0
Num of Table Elements: 3
Num of Parent Table Elements: 3
Num of Filtered Parent Table Elements: 3
Table 0
Table 1
Table 2
Num of Table Elements: 6
Num of Parent Table Elements: 6
Num of Filtered Parent Table Elements: 4
Table 0
Table 1
Table 2
Table 3
Num of Table Elements: 2
Num of Parent Table Elements: 2
Num of Filtered Parent Table Elements: 2
Table 0
Table 1
Num of Table Elements: 2
Num of Parent Table Elements: 2
Num of Filtered Parent Table Elements: 2
Table 0
Table 1
Num of Table Elements: 1
Num of Parent Table Elements: 1
Num of Filtered Parent Table Elements: 1
Table 0
Num of Table Elements: 1
Num of Parent Table Elements: 1
Num of Filtered Parent Table Elements: 1
Table 0
Num of Table Elements: 2
Num 